def variativity_replacement(sentence, verb_token, object_token, object_index, verb_index, parsed_tokens, related_verb, lvc_phrase, catvar_file, adjective_adverb_dict, no_adverb_set, object_preposition_dict, no_preposition_objects, subject_person, subject_number):
    """Rewrite a light-verb construction (LVC) in `sentence` using `related_verb`.

    Reads tense/aspect/person/negation cues from the LVC verb's auxiliaries and
    its POS tag, conjugates `related_verb` in both active and passive voice via
    the nodebox conjugator helpers, re-attaches adjectival / numerical / dative
    material found between the light verb and its object, optionally drops or
    adds a preposition, and finally substitutes the conjugated phrase for the
    original LVC phrase in the sentence.

    Returns:
        '' on failure (LVC phrase not found contiguously in the sentence, or
        passive conjugation unavailable); otherwise a two-element list
        [sentence with active replacement, sentence with passive replacement].

    NOTE(review): `parsed_tokens` appears to be a spaCy-style parse (the code
    uses .tag_, .dep_, .head, .orth_, .lemma_) -- confirm against the caller.
    NOTE(review): the 'A' branch of the preposition logic performs a live HTTP
    request to the Microsoft Web LM service; callers should expect latency and
    possible network exceptions from this function.
    """
    verb_tag = verb_token.tag_
    final_phrase = u''  # never read again below; active/passive variants are built separately
    # Feature flags passed to the (now disabled) SimpleNLG generator...
    set_negation = False
    set_perfect = False
    set_progressive = False
    set_present_tense = False
    set_past_tense = False
    set_future_tense = False
    set_third_person = False
    set_modal = False
    set_gerund_only = False
    modal_verb = ''
    set_having_participle = False
    # ...and the flags actually consumed by the nodebox conjugator calls below.
    nodebox_negate = False
    nodebox_participle = False
    nodebox_gerund = False
    nodebox_simple_present_third_person = False
    nodebox_simple_present_other_person = False
    nodebox_simple_past = False
    # Collect (lowercased) auxiliary and negation tokens attached to the light verb.
    list_of_auxiliary_verbs = []
    for token in parsed_tokens:
        if token.head is verb_token and token.dep_ in ['aux', 'neg']:
            list_of_auxiliary_verbs.append(token.orth_.lower())
    # Derive tense / person / aspect / modality from the auxiliaries.
    if 'has' in list_of_auxiliary_verbs:
        set_present_tense = True
        set_third_person = True
    elif 'have' in list_of_auxiliary_verbs:
        set_present_tense = True
    elif 'had' in list_of_auxiliary_verbs:
        set_past_tense = True
    if 'been' in list_of_auxiliary_verbs:
        set_perfect = True
        set_progressive = True
    if set(['will', '\'ll', 'shall']) & set(list_of_auxiliary_verbs):
        set_future_tense = True
    if 'did' in list_of_auxiliary_verbs:
        set_past_tense = True
    if 'does' in list_of_auxiliary_verbs:
        set_present_tense = True
        set_third_person = True
    if 'do' in list_of_auxiliary_verbs:
        set_present_tense = True
    if 'can' in list_of_auxiliary_verbs:
        set_modal = True
        modal_verb = 'can'
        set_present_tense = True
    if 'could' in list_of_auxiliary_verbs:
        set_modal = True
        modal_verb = 'could'
    if 'should' in list_of_auxiliary_verbs:
        set_modal = True
        modal_verb = 'should'
    if 'was' in list_of_auxiliary_verbs:
        set_past_tense = True
        set_third_person = True
    if 'were' in list_of_auxiliary_verbs:
        set_past_tense = True
    # Refine the flags from the light verb's own POS tag.
    #third person singular form, present tense
    if verb_tag == u'VBZ':
        set_third_person = True
        set_present_tense = True
        nodebox_simple_present_third_person = True
    #present tense, plural subject
    elif verb_tag == u'VBP':
        set_present_tense = True
        nodebox_simple_present_other_person = True
    #gerund
    elif verb_tag == u'VBG':
        nodebox_gerund = True
        set_progressive = True
        #gerund without auxiliary
        if not list_of_auxiliary_verbs:
            set_gerund_only = True
    #participle. 'having' + participle is handled separately from the regular perfect tenses
    elif verb_tag == u'VBN' and (set(['has', '\'s', 'have', '\'d', 'had', '\'d', 'having']) & set(list_of_auxiliary_verbs)):
        nodebox_participle = True
        if 'having' in list_of_auxiliary_verbs:
            set_having_participle = True
        else:
            set_perfect = True
    #a simple past tense (VBD) is commonly mistagged as a perfect (VBN)
    elif verb_tag == u'VBD' or verb_tag == u'VBN':
        set_past_tense = True
        nodebox_simple_past = True
    # the exact phrase to replace in the sentence: the LVC phrase preceded by
    # its auxiliaries in original (case-preserving) surface order.
    phrase_to_replace = lvc_phrase
    # NOTE(review): this loop duplicates the auxiliary collection above, but
    # keeps original casing (no .lower()) -- needed for exact string matching.
    aux_verb_buffer = []
    for token in parsed_tokens:
        if token.head is verb_token and token.dep_ in ['aux', 'neg']:
            aux_verb_buffer.append(token.orth_)
    if aux_verb_buffer:
        for aux_verb in reversed(aux_verb_buffer):
            phrase_to_replace = aux_verb + ' ' + phrase_to_replace
    #currently omitting cases like 'had HE been going to the station', where all the auxiliary verbs and the main verb are not contiguous
    if phrase_to_replace not in sentence:
        return ''
    # Modifiers sitting between the light verb and the object (excluding determiners):
    # first children of the object, then children of the verb itself.
    list_of_modifiers = [token for token in parsed_tokens if token.head is object_token and token.dep_ != "det" and get_index_in_list(parsed_tokens, token) in range(verb_index, object_index)]
    list_of_modifiers.extend([token for token in parsed_tokens if token.head is verb_token and token is not object_token and token.dep_ != "det" and get_index_in_list(parsed_tokens, token) in range(verb_index, object_index)])
    #handle separately numerical and adjectival modifiers, and also look for clues where the verb must occur in its negated form
    #adjectival modifiers
    list_of_amod_modifiers = [token for token in list_of_modifiers if token.dep_ == "amod" and token.head is object_token]
    #numerical modifiers
    list_of_nummod_modifiers = [token for token in list_of_modifiers if token.dep_ == "nummod" and token.head is object_token and token.orth_.lower() not in ['0', 'zero']]
    #clues for negation- a numerical modifier that says 'zero', or a determiner for the object which says 'no'. More can, of course, be added
    list_of_negative_modifiers = [token for token in parsed_tokens if token.head is object_token and ((token.dep_ == 'nummod' and token.orth_.lower() in ['0', 'zero']) or (token.dep_ == 'det' and token.orth_.lower() == 'no'))]
    adjectival_modification = ''
    numerical_modification = ''
    dative_object_string = ''
    #handle negation generation
    if list_of_negative_modifiers:
        #print list_of_negative_modifiers[0].orth_, list_of_negative_modifiers[0].dep_, list_of_negative_modifiers[0].head.orth_, list_of_negative_modifiers[0].tag_
        set_negation = True
        nodebox_negate = True
    # Disabled alternative negation detection, kept for reference.
    """negation_list = [token for token in parsed_tokens if token.dep_ == 'neg' and token.head is verb_token and get_index_in_list(parsed_tokens, token) < object_index]
    if negation_list:
        print 'Negation case 2'
        print negation_list[0].orth_, negation_list[0].dep_, negation_list[0].head.orth_, negation_list[0].tag_
        set_negation = True
        nodebox_negate = True"""
    #convert adjectives to adverbs
    if list_of_amod_modifiers:
        main_amod_modifier_token = list_of_amod_modifiers[0]
        amod_modification = adjectival_modifier(parsed_tokens, main_amod_modifier_token, catvar_file, adjective_adverb_dict, no_adverb_set)
        if amod_modification:
            adjectival_modification = amod_modification
    #convert numerical modifiers by post-appending 'times'
    if list_of_nummod_modifiers:
        main_nummod_modifier_token = list_of_nummod_modifiers[0]
        nummod_modification = numerical_modifier(parsed_tokens, main_nummod_modifier_token)
        if nummod_modification:
            numerical_modification = nummod_modification
    #call to simplenlg program to generate verb form (disabled; superseded by the nodebox conjugators below)
    """simplenlg_command = 'java -jar SimpleNLGPhraseGenerator.jar ' + str(set_negation) + ' ' + str(set_present_tense) + ' ' + str(set_past_tense) + ' ' + str(set_future_tense) + ' ' + str(set_third_person) + ' ' + str(set_perfect) + ' ' + str(set_progressive) + ' ' + str(set_modal) + ' ' + str(set_having_participle) + ' ' + str(subject_person) + ' ' + str(subject_number) + ' ' + related_verb + ' ' + modal_verb
    status = os.system(simplenlg_command + ' > verb_phrase_output')
    print status
    print simplenlg_command
    print sentence
    temp_file = open('verb_phrase_output', 'r')
    #content = check_output(simplenlg_command)
    content = temp_file.readline()
    phrases = content.split(' ||| ')
    temp_file.close()
    os.system('rm verb_phrase_output')
    #if there was an error in the simplenlg programme
    if len(phrases) != 2:
        return ''"""
    # Conjugate the related verb in active, then passive, voice.
    final_phrase_active = nodebox_verb_conjugator(related_verb, aux_verb_buffer, subject_person, subject_number, nodebox_negate, nodebox_participle, nodebox_gerund, nodebox_simple_present_third_person, nodebox_simple_present_other_person, nodebox_simple_past) #phrases[0]
    final_phrase_passive = nodebox_verb_conjugator_passive(related_verb, final_phrase_active, aux_verb_buffer, subject_person, subject_number, nodebox_negate, nodebox_participle, nodebox_gerund, nodebox_simple_present_third_person, nodebox_simple_present_other_person, nodebox_simple_past) #phrases[1]
    if final_phrase_passive is None:
        return ''
    #if the verb was only a gerund (disabled; kept for reference)
    """if (set_gerund_only):
        print "Sentence is- " + sentence
        print "Active phrase is- " + final_phrase_active
        final_phrase_active = final_phrase_active.split(' ')[1]
        final_phrase_passive = ' '.join(final_phrase_passive.split(' ')[1:])
    #'having' + participle
    if (set_having_participle):
        final_phrase_active = 'having ' + final_phrase_active
        final_phrase_passive = 'having ' + final_phrase_passive"""
    #WAIT, THIS IS PROBLEMATIC
    # Dative objects ('gave HIM a kiss' -> 'kissed him' / 'was kissed by him')
    # found between the verb and the object; only the unambiguous single-dative
    # case is handled.
    list_of_dative_objects_within_phrase = [token for token in parsed_tokens if token.dep_ == 'dative' and token.head is verb_token and get_index_in_list(parsed_tokens, token) in range(verb_index, object_index)]
    if len(list_of_dative_objects_within_phrase) == 1:
        dative_token = list_of_dative_objects_within_phrase[0]
        dative_object_string = dative_token.orth_
        # Prepend the dative's own modifiers, preserving their order.
        dative_object_buffer = []
        for token in parsed_tokens:
            if token.head is dative_token:
                dative_object_buffer.append(token.orth_)
        if dative_object_buffer:
            for dative_object_modifier in reversed(dative_object_buffer):
                dative_object_string = dative_object_modifier + ' ' + dative_object_string
    # A derived '-ly' adverb goes before the last word of the conjugated phrase.
    if adjectival_modification and adjectival_modification.strip()[-2:] == 'ly':
        final_phrase_active = ' '.join(final_phrase_active.split()[:-1]) + ' ' + adjectival_modification + ' ' + final_phrase_active.split()[-1]
        final_phrase_passive = ' '.join(final_phrase_passive.split()[:-1]) + ' ' + adjectival_modification + ' ' + final_phrase_passive.split()[-1]
    if dative_object_string:
        final_phrase_active += ' ' + dative_object_string
        final_phrase_passive += ' by ' + dative_object_string
    # Non-adverbial modification is appended after the phrase instead.
    # NOTE(review): this branch tests the UNstripped string while the branch
    # above tests .strip() -- an '-ly' adverb with trailing whitespace would
    # match both and be inserted twice. Confirm whether adjectival_modifier()
    # can return trailing whitespace.
    if adjectival_modification and not (adjectival_modification[-2:] == 'ly'):
        final_phrase_active += ' ' + adjectival_modification
        final_phrase_passive += ' ' + adjectival_modification
    if numerical_modification:
        final_phrase_active += ' ' + numerical_modification
        final_phrase_passive += ' ' + numerical_modification
    final_phrase_active = final_phrase_active.strip()
    final_phrase_passive = final_phrase_passive.strip()
    #now to wonder whether to omit the preposition or not
    setup_verb_prep_combination_dict()
    # Candidate preposition: an IN-tagged dative/prep child of the verb or object,
    # located between the verb and just past the object.
    list_of_preposition_tokens = [token for token in parsed_tokens if token.tag_ == u'IN' and (token.dep_ == 'dative' or token.dep_ == 'prep') and token.orth_.lower() in common_prepositions and (token.head is verb_token or token.head is object_token) and get_index_in_list(parsed_tokens, token) in range(verb_index + 1, object_index + 2)]
    if list_of_preposition_tokens:
        preposition_token = list_of_preposition_tokens[0]
        # Lookup key format: VERBLEMMA_PREPOSITION, both upper-cased.
        verb_prep_caps = verb_token.lemma_.upper() + '_' + preposition_token.orth_.upper()
        # Dict values: '+' = always drop the preposition, 'A' = ambiguous
        # (decide via language-model probabilities), '-' = keep it.
        if verb_prep_caps in verb_prep_combination_dict and verb_prep_combination_dict[verb_prep_caps] != '-':
            if verb_prep_combination_dict[verb_prep_caps] == '+':
                #reconstruct sentence without that preposition
                sentence = ''
                for token in parsed_tokens:
                    if token is not preposition_token:
                        sentence += token.orth_ + ' '
                sentence = sentence.strip()
                #check if preposition to be added
                additional_preposition = select_preposition_for_object(related_verb, object_preposition_dict, no_preposition_objects)
                if additional_preposition:
                    print 'Adding preposition ' + additional_preposition
                    final_phrase_active = final_phrase_active.strip() + ' ' + additional_preposition
                    final_phrase_passive = final_phrase_passive.strip() + ' ' + additional_preposition
            elif verb_prep_combination_dict[verb_prep_caps] == 'A':
                # Query the Microsoft Web LM for log-probabilities of the bare
                # past-tense verb vs. verb + preposition (two lines in one request).
                probabilities = urllib2.urlopen(urllib2.Request('http://weblm.research.microsoft.com/rest.svc/bing-body/2013-12/5/jp?u=' + microsoft_weblm_api_key + '&format=json', en.verb.past(related_verb) + '\n' + en.verb.past(related_verb) + ' ' + preposition_token.orth_)).read().split(',')
                # Strip the surrounding JSON brackets from the first/last fields.
                verb_probability = float(probabilities[0][1:])
                verb_prep_probability = float(probabilities[1][:-1])
                #print en.verb.past(related_verb), verb_probability
                #print en.verb.past(related_verb) + ' ' + preposition_token.orth_, verb_prep_probability
                #major changes may be needed here
                # Empirical threshold: drop the preposition when the bare verb
                # is sufficiently more probable than verb + preposition.
                if verb_probability - verb_prep_probability > 1.75:
                    sentence = ''
                    for token in parsed_tokens:
                        if token is not preposition_token:
                            sentence += token.orth_ + ' '
                    sentence = sentence.strip()
                    #check if preposition to be added
                    additional_preposition = select_preposition_for_object(related_verb, object_preposition_dict, no_preposition_objects)
                    if additional_preposition:
                        print 'Adding preposition ' + additional_preposition
                        final_phrase_active = final_phrase_active.strip() + ' ' + additional_preposition
                        final_phrase_passive = final_phrase_passive.strip() + ' ' + additional_preposition
        elif verb_prep_caps not in verb_prep_combination_dict:
            #check if preposition to be added
            additional_preposition = select_preposition_for_object(related_verb, object_preposition_dict, no_preposition_objects)
            if additional_preposition:
                print 'Adding preposition ' + additional_preposition
                final_phrase_active = final_phrase_active.strip() + ' ' + additional_preposition
                final_phrase_passive = final_phrase_passive.strip() + ' ' + additional_preposition
    #if the object is coordinated in a zeugma, replicate the verb to feature with the coordinated object
    zeugma_heads = [token for token in parsed_tokens if token.dep_ == 'conj' and token.head is object_token and [other_token for other_token in parsed_tokens if other_token.orth_.lower() in ['and', 'or'] and other_token.dep_ == 'cc' and other_token.head is object_token]]
    if zeugma_heads:
        zeugma_conjunction = [other_token for other_token in parsed_tokens if other_token.orth_.lower() in ['and', 'or'] and other_token.dep_ == 'cc' and other_token.head is object_token][0]
        conjunction_index = get_index_in_list(parsed_tokens, zeugma_conjunction)
        # Insert a copy of the original verb after the conjunction:
        # '... X and Y' -> '... X and VERB Y'.
        sentence = sentence.replace(parsed_tokens[conjunction_index-1].orth_ + ' ' + zeugma_conjunction.orth_ + ' ' + parsed_tokens[conjunction_index+1].orth_, parsed_tokens[conjunction_index-1].orth_ + ' ' + zeugma_conjunction.orth_ + ' ' + verb_token.orth_ + ' ' + parsed_tokens[conjunction_index+1].orth_)
    # One sentence per voice, with the LVC phrase swapped for the conjugated verb phrase.
    return [sentence.replace(phrase_to_replace, final_phrase_active), sentence.replace(phrase_to_replace, final_phrase_passive)]
if len(sentence) > 250: sentence = u'' candidate_object_token_numbers[:] = [] continue #parsing the sentence parsed_tokens = nlp_pipeline(sentence) #add all probable direct object candidates to a list for index in range(len(parsed_tokens)): #if there's a direct object, to start with if (parsed_tokens[index].dep_.strip() == "dobj"): #if the verb belongs to the list of allowed verbs if parsed_tokens[index].head.lemma_ in allowed_verbs: #if the object's head verb is not more than 6 tokens behind the direct object poss_head_index = get_index_in_list(parsed_tokens, parsed_tokens[index].head) if poss_head_index < index and index - poss_head_index < 7: #if the object is not a punctuation mark or number-like or url-like if not(parsed_tokens[index].check_flag(IS_PUNCT) or parsed_tokens[index].check_flag(LIKE_URL) or parsed_tokens[index].check_flag(LIKE_NUM)): #if the noun has a verb in the same cluster from catvar if noun_to_verb_in_catvar(catvar_file, parsed_tokens[index].lemma_.lower(), catvar_noun_dict, catvar_no_verb_set): #check if the verb has a conjugation in our verb conjugator if [verb for verb in catvar_noun_dict[parsed_tokens[index].lemma_.lower()] if verb in conjugated_verbs]: #check whether the nominal form of this noun appears much more frequently compared to that of the verbal form in the BNC if compare_lemma_verb_noun_frequencies(parsed_tokens[index].lemma_.lower(), catvar_noun_dict, bnc_noun_frequencies, bnc_verb_frequencies): #add this token to the list of possible light verb objects for this sentence: there may be multiple candidate_object_token_numbers.append(index) #now consider each candidate, construct verb phrase, noun phrase, lvc phrase for object_index in candidate_object_token_numbers: #some initializations