Esempio n. 1
0
 def test_tenses(self):
     """Assert that en.tenses() matches both tense constants and aliases.

     NOTE(review): en.PRESENT_1ST_PERSON_SINGULAR and the "g" alias only
     exist in older pattern.en releases -- confirm against the installed
     version before relying on this test.
     """
     # Assert tense of "am".
     self.assertTrue(en.PRESENT_1ST_PERSON_SINGULAR in en.tenses("am"))
     self.assertTrue("1sg"  in en.tenses("am"))
     self.assertTrue("1sg"  in en.tenses("will"))
     self.assertTrue("2sg-" in en.tenses("won't"))
     self.assertTrue("g"    in en.tenses("imaginarifying"))
     # Python 2 print statement: marks the test as executed on stdout.
     print "pattern.en.tenses()"
Esempio n. 2
0
 def tenseses(self, strs):
     """Classify the verb form *strs* into a coarse tense bucket.

     :param strs: verb form to inspect with pattern's tenses()
     :returns: 'PRESENT', 'PAST', or 'FUTURE' (the fallback)
     """
     # tenses() returns tuples such as ('past', 3, 'singular', ...);
     # stringify so a plain substring test works, matching the approach
     # already used for the 'present' branch.  (Bug fix: the original
     # tested 'past' by membership against the raw tuple list, which at
     # best matches a single canonical tuple, so past forms were usually
     # misreported as FUTURE.)
     if 'present' in str(tenses(strs)):
         ten = 'PRESENT'
     elif 'past' in str(tenses(strs)):
         ten = "PAST"
     else:
         ten = "FUTURE"
     return ten
Esempio n. 3
0
 def test_tenses(self):
     """Assert that en.tenses() accepts tuple constants and string aliases."""
     # Assert tense of "am".
     self.assertTrue((en.PRESENT, 1, en.SINGULAR) in en.tenses("am"))
     self.assertTrue("1sg" in en.tenses("am"))
     self.assertTrue("1sg" in en.tenses("will"))
     self.assertTrue("2sg-" in en.tenses("won't"))
     self.assertTrue("part" in en.tenses("imaginarifying"))
     # Python 2 print statement: marks the test as executed on stdout.
     print "pattern.en.tenses()"
Esempio n. 4
0
 def test_tenses(self):
     """Duplicate tenses test: tuple constants and string aliases."""
     # Assert tense of "am".
     self.assertTrue((en.PRESENT, 1, en.SINGULAR) in en.tenses("am"))
     self.assertTrue("1sg"  in en.tenses("am"))
     self.assertTrue("1sg"  in en.tenses("will"))
     self.assertTrue("2sg-" in en.tenses("won't"))
     self.assertTrue("part" in en.tenses("imaginarifying"))
     # Python 2 print statement: marks the test as executed on stdout.
     print "pattern.en.tenses()"
Esempio n. 5
0
def mangle_agreement(correct_sentence):
    """Given a grammatically correct sentence, return a list of variants in
    which exactly one verb has been swapped for a different inflection,
    producing (probable) subject-verb agreement errors.

    Method: any change of a single verb's form is assumed to create an
    error.  That is not always true -- "Jack and Jill went up the hill"
    also admits "go", and multi-verb sentences can shift tense legally --
    so downstream models should treat these as noisy labels and rely on a
    confidence threshold.  A rule-based learner could handle the simple
    single-verb sentences exactly, leaving only complex ones to a model.
    """
    doc = nlp(correct_sentence)
    mangled = []
    for idx, token in enumerate(doc):
        if not token.tag_.startswith('VB'):
            continue  # only verb tokens get mangled
        for candidate in lexeme(doc[idx].text):
            if candidate == doc[idx].text:
                continue  # identical to the original form, skip it
            same_tense = tenses(candidate) == tenses(token.text)
            negated_twin = (candidate.startswith(token.text)
                            and candidate.endswith("n't"))
            if same_tense or negated_twin:
                continue  # negated version of the original, skip it
            rebuilt = str(doc[:idx]) + " " + candidate + " " + str(doc[idx + 1:])
            # fix the space spaCy leaves before a comma
            mangled.append(rebuilt.replace(' ,', ','))
    return mangled
Esempio n. 6
0
def _interface(sentence,edblist):
	"""Build substitution candidates for each word of *sentence*.

	Returns a list whose items are either plain words (kept as-is) or
	dicts mapping a word to its list of replacement candidates.
	NOTE(review): indentation below mixes tabs and spaces and only
	parses under Python 2's tab==8 rule; do not reformat casually.
	"""
	target_words, word_pre, person_taggers, org_taggers =  _Stem(sentence, edblist)
	token_list =[]
 
        #import pdb; pdb.set_trace()
        #print "word_pre:", word_pre
        # Capitalize the first word of the preprocessed sentence.
        if len(word_pre) > 0:
            word_pre[0] = word_pre[0][0].upper() + word_pre[0][1:]
        #import pdb; pdb.set_trace()
	for word in word_pre:

                #import pdb; pdb.set_trace()
	        tokens = {}
                #if word == "He": # is a person, subject?
                #        tokens[word] = ["He", "She"]
                # Person entities may be replaced by personal pronouns.
                if word.strip().lower() == person_taggers.strip().lower():
                        tokens[word] = [word, "He", "She"]
                        #tokens[word] = [ "They"]
                        
                # Organization entities: choose They/It by plurality of the last token.
                elif word.strip().lower() == org_taggers.strip().lower():
                        if _isplural(org_taggers.strip().split()[-1]) or (org_taggers.strip().split()[-1] == 'they'):
                                tokens[word] = [word, "They"]
                        else:
                                tokens[word] = [word, "It"]
                        #tokens[word] = [ "It"]
                #        pass
                else:
                        # Non-target words pass through unchanged.
                        if lmtzr.lemmatize(word) not in target_words:
                                token_list.append(word)
                        else:
                                r_sent = []
			        candidates = Generate_candidates_topN(word,sentence,19,edblist)
			        for i in range(len(candidates)):
				        r_sent.append(candidates[i] + "@" + sentence.replace(word,candidates[i]))
			        sub_top10 = kenlm_topn(r_sent,9,sentence)
			        if lmtzr.lemmatize(word) not in sub_top10:
			        	sub_top10.insert(0,word)

                                # Conjugate candidates to the word's first detected tense.
                                if len(tenses(word)) > 0:
                    	                _sub_top10 = []
			                for w in sub_top10:
			                    _sub_top10.append(conjugate(w, tenses(word)[0][0], 3))
			                tokens[word] = _sub_top10
		                else:
                                        tokens[word] = sub_top10
                                
                if tokens: token_list.append(tokens)
                
	return token_list
Esempio n. 7
0
    def _transform_word(self, word, pos, less, more):
        """transforms a word to be less less and more more

        :param word: word to transform
        :type word: str

        :param pos: part of speech of the word
        :type pos: str

        :param less: list of 'less' words
        :type less: list

        :param more: list of 'more' words
        :type more: list

        :returns: transformed word
        :rtype: str
        """

        new_word = self._get_similar_word(word, less, more)
        new_pos = en.tag(new_word)[0][1]

        # Reject the replacement if its coarse POS differs or nothing changed.
        if (pos[:2] != new_pos[:2]) or word == new_word:
            return word

        # handle noun
        if pos.startswith('NN'):

            # pluralization
            if pos.endswith('S') and not new_pos.endswith('S'):
                new_word = en.pluralize(new_word)

            elif not pos.endswith('S') and new_pos.endswith('S'):
                new_word = en.singularize(new_word)

            # capitalization
            if word[0].isupper():
                new_word = new_word[0].upper() + new_word[1:]
            else:
                new_word = new_word.lower()

        # handle verb
        elif pos.startswith('VB'):

            # Bug fix: en.tenses() returns an empty list for unrecognized
            # forms, which previously raised IndexError on [0]; skip the
            # conjugation step in that case and keep the replacement as-is.
            word_tenses = en.tenses(word)
            if word_tenses:
                tense, person, number = word_tenses[0][:3]

                # conjugation
                conjugated = en.conjugate(new_word,
                                        tense=tense,
                                        person=person,
                                        number=number,
                                        parse=False)

                if conjugated is not None:
                    new_word = conjugated

        # remove underscores for joint words
        new_word = new_word.replace('_', ' ')

        return new_word
Esempio n. 8
0
def run_postprocessing(s, rules, all_args):
    """Apply a comma-separated list of postprocessing rules to string *s*.

    Supported rules:
      - 'lower': lowercase the whole string.
      - 'tense-<i>': conjugate (the first word of) *s* to match the tense
        of all_args[<i>], preferring tenses in PATTERN_TENSES order.
      - any key of POS_TO_PATTERN: conjugate *s* to that pattern form.

    :param s: string to transform
    :param rules: comma-separated rule names
    :param all_args: argument list indexed by the 'tense-<i>' rules
    :returns: the transformed string
    """
    rule_list = rules.split(',')
    for rule in rule_list:
        if rule == 'lower':
            s = s.lower()
        elif rule.startswith('tense-'):
            ind = int(rule[6:])
            orig_vb = all_args[ind]
            if " " in orig_vb:
                # Only the head verb carries tense.  (Bug fix: the split
                # result was previously computed and then discarded.)
                orig_vb = orig_vb.split()[0]
            tenses = patten.tenses(orig_vb)
            for tense in PATTERN_TENSES:  # Prioritize by PATTERN_TENSES
                if tense in tenses:
                    break
            else:  # Default to first tense
                tense = PATTERN_TENSES[0]
            if " " in s:
                # Conjugate only the first word of a multi-word phrase.
                s_verb = s.split()[0]
                s_verb_conj = patten.conjugate(s_verb, tense)
                s = " ".join([s_verb_conj] + s.split()[1:])
            else:
                s = patten.conjugate(s, tense)
        elif rule in POS_TO_PATTERN:
            s = patten.conjugate(s, POS_TO_PATTERN[rule])
    return s
Esempio n. 9
0
    def verb_fom(word: str) -> str:
        """Flip *word* between present and past tense.

        Returns *word* unchanged when no tense can be detected, when
        neither a present nor a past reading exists, or when conjugation
        fails.
        """
        # Step 1: check if the word is in present tense or past.
        tense_list = tenses(word)

        if tense_list is None or len(tense_list) == 0:
            return word

        present_tense = True
        # Bug fix: final_tense was previously unbound (NameError) when no
        # present/past reading was found in the loop below.
        final_tense = None

        for i in tense_list:
            if i[0] == 'present':
                present_tense = True
                final_tense = i
                break
            if i[0] == 'past':
                present_tense = False
                final_tense = i
                break

        if final_tense is None:
            # e.g. infinitive/participle-only readings: nothing to flip.
            return word

        # Step 2: conjugate into the opposite tense, keeping person/number.
        tense_string = "past" if present_tense else "present"
        new_word = conjugate(word,
                             tense=tense_string,
                             person=final_tense[1],
                             number=final_tense[2],
                             negated=False)

        # Step 3: Return the word
        return new_word if new_word is not None else word
Esempio n. 10
0
def make_thesaurus(file_path):
    """
    Returns dict of counters 'thesaurus', where
    thesaurus[word] = { synonym1: 4, syn2: 8, syn3: 1, ... }

    NOTE(review): word.decode("ascii") implies Python 2 str input; under
    Python 3 this raises AttributeError -- confirm the target runtime.
    """
    thesaurus = defaultdict(lambda: Counter())

    with open(file_path, "r") as f:
        for line in f:

            # Ignore repeated book title headers
            if _is_title(line):
                continue

            parsed = parse(line)

            # parsed.split()[0]: the first sentence as [word, tag, ...] chunks.
            for tagged_word in parsed.split()[0]:
                word = tagged_word[0].strip().lower()
                pos = tagged_word[1][0]  # get pos for word

                # Reject non-ASCII characters
                try:
                    word = word.decode("ascii")
                except (UnicodeDecodeError, UnicodeEncodeError):
                    continue

                # Reject whitespace character
                if re.match("^[\s]*$", word):
                    continue

                # Increment word count of word w
                thesaurus[word].update([word])

                # Retrieve syn = synonym[w], add to thesaurus[syn]
                for syn in wn.get_synonyms(word):
                    syn = syn.name().split(".")[0]

                    # if noun, add plural form if word is plural, else add singular
                    if pos == "N":
                        if word == pluralize(word):
                            thesaurus[pluralize(syn)].update([word])
                        else:
                            thesaurus[syn].update([word])
                    # if verb, conjugate synonyms to the right form before adding them to thes
                    elif pos == "V":
                        # tenses() may be empty for unknown forms; fall back to raw syn.
                        word_tenses = tenses(word)
                        if word_tenses:
                            thesaurus[conjugate(syn, tense=word_tenses[0][0])].update([word])
                        else:
                            thesaurus[syn].update([word])
                    else:
                        thesaurus[syn].update([word])

    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)
    thesaurus = _add_mappings(map_file, thesaurus)

    return thesaurus
Esempio n. 11
0
def find_verb_form(original_form, original_lemma, new_lemma):
    """
    Figure out original tense of the verb, then apply that tense to new_lemma
    There might be more than one, let's keep it simple and just apply
    the first one

    (original_lemma is accepted for interface compatibility but unused.)
    """
    possible_conjugations = tenses(original_form)
    # Bug fix: tenses() returns an empty list for unrecognized forms,
    # which previously raised IndexError; fall back to the bare lemma.
    if not possible_conjugations:
        return new_lemma
    if len(possible_conjugations) > 1:
        return conjugate(new_lemma, possible_conjugations[1])
    return conjugate(new_lemma, possible_conjugations[0])
def get_verb_reduction(verb, tag):
    """Given string of existing verb, returns its corresponding reduction:
    the verb itself (upper-cased) when its lemma appears in the literal or
    top-100 verb lists, otherwise a tag-prefixed hash of its tense set."""
    base = lemma(verb.lower())
    if base in literals.verbs or base in top100.verbs:
        return verb.upper()
    digest = sha256(str(tenses(verb)).encode('utf_8')).hexdigest()
    return tag + '_' + digest
Esempio n. 13
0
def make_thesaurus_lesk(file_path):
    """
    Returns dict of counters 'thesaurus', where
    thesaurus[synset] = { word1: 4, word2: 8, word3: 1, ... }

    NOTE(review): word.decode("ascii") implies Python 2 str input; under
    Python 3 this raises AttributeError -- confirm the target runtime.
    """
    thesaurus = defaultdict(lambda: Counter())

    with open(file_path, "r") as f:

        # Whole file as a flat list of word_TAG tokens.
        f = f.read().split()
        for i, word_and_tag in enumerate(f):

            word, tag = word_and_tag.rsplit("_", 1)

            # Reject non-ASCII characters
            try:
                word = word.decode("ascii")
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue

            # look at a window of 9 words each time lesk is called
            window = [i - WINDOW, i + WINDOW]
            if i < WINDOW:
                window = [i, i + 2 * WINDOW]
            elif i >= len(f) - WINDOW:
                window = [i - 2 * WINDOW, i]

            synset = lesk.my_lesk(f[window[0] : window[1]], word)

            # if lesk can decide on a meaning for that word, add
            # that meaning, i.e., that synset, to thesaurus
            if not synset:
                continue

            # if word is verb, only add present tense to thesaurus
            if tag[0] == "V":
                word_tenses = tenses(word.lower())
                if "inf" in word_tenses or "1sg" in word_tenses or "2sg" in word_tenses or "3sg" in word_tenses:
                    thesaurus[str(synset)].update([word.lower()])
            elif tag[0] == "N":
                # Nouns: match the plurality of the synset's lemma name.
                synset_name = synset.name().split(".")[0]
                if synset_name == pluralize(synset_name):
                    thesaurus[str(synset)].update([pluralize(word.lower())])
                else:
                    thesaurus[str(synset)].update([singularize(word.lower())])
            else:
                thesaurus[str(synset)].update([word.lower()])
    # Update thesaurus with mappings, if map_file exists
    file_path = file_path.replace(config.CORPUS_FOLDER, config.MAPPING_FOLDER)
    map_file = file_path.replace(config.CORP_TAG, config.MAP_TAG)

    thesaurus = _add_mappings(map_file, thesaurus)
    return thesaurus
Esempio n. 14
0
def Bullet_Replace(old_word, new_word, bullet, POS_tag):
    '''
    This function replaces the old_word in bullet with new_word using POS_tag to make 
    the forms of the words match
    '''
    if POS_tag == 'VBD':
        # Past tense: 3rd-person-singular past when applicable, plural past otherwise.
        alias = '3sgp' if '3sgp' in tenses(old_word) else 'ppl'
        new_word = conjugate(new_word, alias)
    elif POS_tag == 'VBG':
        new_word = conjugate(new_word, 'part')   # gerund / present participle
    elif POS_tag == 'VBN':
        new_word = conjugate(new_word, 'ppart')  # past participle
    elif POS_tag == 'VBP':
        # Non-3rd-person present: 1st singular when applicable, else 2nd singular.
        alias = '1sg' if '1sg' in tenses(old_word) else '2sg'
        new_word = conjugate(new_word, alias)
    elif POS_tag == 'VBZ':
        alias = '3sg' if '3sg' in tenses(old_word) else 'pl'
        new_word = conjugate(new_word, alias)
    elif POS_tag in ('NNS', 'NNPS'):
        # Plural noun: pluralize the replacement.
        new_word = pluralize(new_word)

    # Mirror the original word's leading capitalization.
    if old_word[0] != old_word[0].lower():
        new_word = new_word[0].upper() + new_word[1:]

    return bullet.replace(old_word, new_word)
Esempio n. 15
0
def filtrar_conjugaciones(verbo, conjugaciones):
    """Filter candidate conjugations for a multiple-choice item: drop
    negated forms, restrict by the verb's POS tag (present vs past), and
    when enough options remain, keep 3 random distractors plus the
    original token."""
    opciones = [c for c in conjugaciones if "n't" not in c and "not" not in c]
    if len(opciones) <= 4:
        return opciones
    tag = verbo['pos_tag']
    if tag in POS_TAGS_PRESENTE:
        # Present-tense verbs: exclude past-tense candidates.
        opciones = [c for c in opciones if PAST not in tenses(c)]
    elif tag in POS_TAGS_PASADO:
        # Past-tense verbs: exclude present-tense candidates.
        opciones = [c for c in opciones if is_not_present(c)]
    if len(opciones) >= 4:
        opciones.remove(omitir_contraccion(verbo['token']))
        opciones = random.sample(opciones, 3)
        opciones.append(verbo['token'])
    return opciones
Esempio n. 16
0
	def get_questions(self):
		"""Generate a Who/When/Where or auxiliary-inverted question from
		the sentence's NP and VP.

		NOTE(review): tab/space indentation here only parses under
		Python 2 (tab==8); the method returns None when no rule matches.
		"""
		z = self.getText()
		(subj,vp) = (z['NP'][0], z['VP'][0])
                from pattern.en import lexeme, lemma, tenses
		import nltk, re
		tagged = nltk.pos_tag(nltk.word_tokenize(subj + " " + vp))
                verb = ""
		sense = supersense(subj)
		# Who/When/Where questions chosen by the subject's supersense.
		# NOTE(review): '.lower' below lacks '()', so it is a bound method
		# and the membership test always fails -- confirm intent.
		if(sense[0][2][-6:] == 'person' or sense[0][1] == 'PRP'): return ("Who " + vp + "?")
		elif(sense[0][2][-4:] == 'time' or re.match("[1|2]\d\d\d", subj)): return ("When " + vp + "?")
		elif(sense[0][2][-8:] == 'location' and
		('PP' in z and z['PP'].split()[0].lower in ["on", "in", "at", "over", "to"])):
			return ("Where " + vp + "?")
                aux = ["Will","Shall","May","Might","Can","Could","Must","Should","Would","Do","Does","Did"]
                # Walk backwards; the last assignment leaves the FIRST verb in 'verb'.
                for i in reversed(tagged):
                        if(i[1][0] == 'V'):
                                verb = i[0]
                # Copular verbs invert directly; otherwise pick an auxiliary
                # whose first tense reading matches the verb's.
                if((u'' + verb) in lexeme("is")):
                        return (verb.capitalize() + " " + subj.lower() + vp[len(verb):] + "?")
                else:
                        for x in aux:
                                if(tenses(x)[0] == tenses(verb)[0]):
                                        return (x + " " + subj.lower() + " " + lemma(verb) + vp[len(verb):] + "?")
Esempio n. 17
0
 def do_q(self, parse_by_structure, verb_index, np_index):
     """Turn a declarative token list into a do-support question string.

     :param parse_by_structure: list of sentence tokens
     :param verb_index: index of the main verb in the list
     :param np_index: index where the auxiliary (do/does/did) is inserted
     :returns: the question as a single space-joined string
     """
     verb = parse_by_structure[verb_index]
     (tense, person, a, b, c) = tenses(verb)[0]
     present_verb = str(conjugate(verb, tense="present", person=1))
     # Bug fix: work on a copy -- the original aliased parse_by_structure
     # and mutated the caller's list in place.
     sent = list(parse_by_structure)
     sent[verb_index] = present_verb
     # Choose the auxiliary matching the original verb's tense/person.
     if tense == 'past':
         sent.insert(np_index, "did")
     elif tense == 'present' and person == 3:
         sent.insert(np_index, "does")
     else:
         sent.insert(np_index, "do")
     sent[-1] = "?"
     return " ".join(sent)
Esempio n. 18
0
def transform_word(word, pos, word_original):
    """Inflect the head token of a (possibly multi-word) replacement so its
    form agrees with POS tag *pos*; remaining tokens pass through unchanged.
    Falls back to the untouched token when pattern's conjugation tables
    raise KeyError."""
    tokens = word.split(' ')
    transformed = []
    for position, token in enumerate(tokens):
        if position > 0:
            transformed.append(token)
            continue
        try:
            if pos == 'JJR' or pos == 'RBR':
                # Already comparative? keep it; otherwise build the comparative.
                retagged = nltk.pos_tag([token])[0][1]
                if retagged == 'JJR' or retagged == 'RBR':
                    transformed.append(token)
                else:
                    transformed.append(comparative(token))
            elif pos == 'JJS' or pos == 'RBS':
                retagged = nltk.pos_tag([token])[0][1]
                if retagged == 'JJS' or retagged == 'RBS':
                    transformed.append(token)
                else:
                    transformed.append(superlative(token))
            elif pos == 'NNS' or pos == 'NNPS':
                retagged = nltk.pos_tag([token])[0][1]
                if retagged == 'NNS' or retagged == 'NNPS':
                    transformed.append(token)
                else:
                    transformed.append(pluralize(token))
            elif pos == 'VBD':
                transformed.append(conjugate(token, 'p'))
            elif pos == 'VBG':
                transformed.append(conjugate(token, 'part'))
            elif pos == 'VBN':
                transformed.append(conjugate(token, 'ppart'))
            elif pos == 'VBP':
                # Match the original word's person for present non-3rd forms.
                alias = '1sg' if (PRESENT, 1, SG) in tenses(word_original) else '2sg'
                transformed.append(conjugate(token, alias))
            elif pos == 'VBZ':
                transformed.append(conjugate(token, '3sg'))
            else:
                transformed.append(token)
        except KeyError:
            transformed.append(token)
    return ' '.join(transformed)
Esempio n. 19
0
def match_all_inflections(source_word, target_words, pos):
    """Inflect every word in *target_words* to match the forms found for
    *source_word*: a set of conjugations for verbs, singular+plural lists
    for nouns, and the words unchanged for anything else."""
    if pos == wn.VERB:
        inflected = set()
        for tense, person, number, mood, aspect in tenses(source_word):
            for candidate in target_words:
                inflected.add(conjugate(candidate,
                                        tense=tense,
                                        person=person,
                                        number=number,
                                        mood=mood,
                                        aspect=aspect))
        return inflected
    if pos == wn.NOUN:
        singulars = [singularize(candidate) for candidate in target_words]
        plurals = [pluralize(candidate) for candidate in target_words]
        return singulars + plurals
    # Adjectives/adverbs are returned as-is.
    return target_words
Esempio n. 20
0
def change_tense(token):
    """Return a conjugation of *token* in its own tense but for a different
    person (first of persons 1-3 that yields a new form).  Returns *token*
    unchanged when no tense is detected; returns the person-1 conjugation
    when every person produces the same form."""
    detected = tenses(token)
    if not detected:
        return token
    current_tense = detected[0][0]
    first_candidate = None
    for person in (1, 2, 3):
        candidate = conjugate(token, tense=current_tense, person=person)
        if candidate != token:
            return candidate
        if first_candidate is None:
            first_candidate = candidate
    return first_candidate
Esempio n. 21
0
def fix_vp(np, vp):
    """Re-conjugate the verb phrase for subjects 'i' and 'you'; any other
    subject gets the phrase back untouched.

    NOTE(review): the `2 in a` / `1 in a` filters select a tense tuple by
    a field equal to 2 or 1 -- presumably the person slot; verify intent.
    """
    verb = detokenizer.detokenize(vp)
    candidates = tenses(verb)
    if np == ['i']:
        chosen = [t for t in candidates if 2 in t][0]
        return [
            conjugate(verb,
                      tense=chosen[0],
                      person=1,
                      number=chosen[2],
                      mood=chosen[3],
                      aspect=chosen[4])
        ]
    if np == ['you']:
        chosen = [t for t in candidates if 1 in t][0]
        return [
            conjugate(verb,
                      tense=chosen[0],
                      person=2,
                      number=chosen[2],
                      mood=chosen[3],
                      aspect=chosen[4])
        ]
    return vp
# pattern.en demo script (Python 2 print statements): articles,
# pluralization, conjugation, quantification, n-grams, and parsing.
print article('hour')

print referenced('university')
print referenced('hour')


#singularity
print pluralize('child')
print singularize('wolves')

# verb conjugation
print 
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred') # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred') 

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)


#parse
s = parse('I eat pizza with a fork.')
pprint(s)
    def loop(self, debug=False):
        """Iterate over the MNLI training file and emit transformed
        hypothesis rows (subject/object inversion and passivization) to
        four TSV writers.

        :param debug: stop after 10000 input lines when True.
        """
        w_inv_orig = self.tsv('inv_orig.tsv')
        w_inv_trsf = self.tsv('inv_trsf.tsv')
        w_pass_orig = self.tsv('pass_orig.tsv')
        w_pass_trsf = self.tsv('pass_trsf.tsv')

        self.lines = open(mnli_train).readlines()
        already_seen = set()
        self.dicts = []
        n = 0
        for i, line in enumerate(self.lines):
            j = json.loads(line)
            self.dicts.append(j)
            if i % 10000 == 0:
                print('%d out of %d' % (i, len(self.lines)))
            if debug and i == 10000:
                break
            if j['genre'] == 'telephone':
                continue

            # Constituency parse of the hypothesis sentence.
            tree = j['hyptree'] = nltk.tree.Tree.fromstring(j['sentence2_parse'])

            ss = [x for x in tree.subtrees() if x.label() == 'S']

            # Only the first S subtree is considered.
            for s in ss[:1]:
                if len(s) < 2:  # Not a full NP + VP sentence
                    continue

                subj_head = self.get_np_head(s[0])
                if subj_head is None:
                    continue
                subject_number = self.get_np_number(s[0])


                k = 1
                
                while (s[k].label() not in (u'VP', u'SBAR', u'ADJP')) and (k < len(s) - 1):
                    k+=1

                if k == len(s) - 1:
                    continue
		#iterate through top level branches to find VP

                vp_head = self.get_vp_head(s[k])


                if vp_head[0] is None:
                    continue

                subj = ' '.join(s[0].flatten())
                # NOTE(review): arguments/direct_object index s[1], not the
                # s[k] VP located above -- confirm which was intended.
                arguments = tuple(x.label() for x in s[1][1:])

                if (arguments != ('NP',) or 
                        en.lemma(vp_head[0]) in ['be', 'have']):
                    continue		

                direct_object = ' '.join(s[1][1].flatten())

                object_number = self.get_np_number(s[1][1])

                if object_number is None:
                    # Personal pronoun, very complex NP, or parse error
                    continue

                lookup = en.tenses(vp_head[0])

                if len(lookup) == 0:
                    # NOTE(review): `vp_head[0][-2:]` is truthy for any
                    # non-empty string, so this branch always picks PAST;
                    # an `== 'ed'` suffix test was probably intended.
                    if vp_head[0][-2:]:
                        tense = en.PAST
                    else:
                        tense = en.PRESENT
                else:
                    if en.tenses(vp_head[0])[0][0] == u'past':
                        tense = en.PAST
                    else:
                        tense = en.PRESENT

                subjobj_rev_hyp = ' '.join([
                    upper_first(direct_object),
                    #keep tense
                    en.conjugate(vp_head[0], number=object_number, tense = tense),
                    lower_first(subj)]) + '.'

                passive_hyp_same_meaning = ' '.join([
                    upper_first(direct_object),
                    self.passivize_vp(s[k], object_number),
                    lower_first(subj)]) + '.'

                passive_hyp_inverted = ' '.join([
                    subj,
                    self.passivize_vp(s[k], subject_number),
                    direct_object]) + '.'


                # Inverted hypothesis rows are labeled 'neutral'; the
                # same-meaning passive keeps the gold label / 'entailment'.
                if j['gold_label'] == 'entailment':
                    self.mnli_row(w_inv_orig, 1000000 + n,
                            j['sentence1'], subjobj_rev_hyp, 'neutral')

                self.mnli_row(w_inv_trsf, 1000000 + n,
                        j['sentence2'], subjobj_rev_hyp, 'neutral')

                self.mnli_row(w_pass_orig, 1000000 + n,
                        j['sentence1'], passive_hyp_same_meaning, 
                        j['gold_label'])

                self.mnli_row(w_pass_trsf, 1000000 + n,
                        j['sentence2'], passive_hyp_inverted, 'neutral')
                self.mnli_row(w_pass_trsf, 2000000 + n,
                        j['sentence2'], passive_hyp_same_meaning, 'entailment')

                n += 1
Esempio n. 24
0
# pattern.en demo script (Python 2 print statements): articles,
# pluralization, conjugation, quantification, n-grams, and parsing.
print article('university')
print article('hour')

print referenced('university')
print referenced('hour')

#singularity
print pluralize('child')
print singularize('wolves')

# verb conjugation
print
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print(PAST, 1, PL) in tenses('purred')

print 'Quantification'

print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

#parse
s = parse('I eat pizza with a fork.')
pprint(s)
Esempio n. 25
0
 def test_tenses(self):
     """Assert en.tenses() recognizes the PRESENT_1ST_PERSON_SINGULAR
     constant and the "1sg" alias (older pattern.en API)."""
     # Assert tense of "am".
     self.assertTrue(en.PRESENT_1ST_PERSON_SINGULAR in en.tenses("am"))
     self.assertTrue("1sg" in en.tenses("am"))
     # Python 2 print statement: marks the test as executed on stdout.
     print "pattern.en.tenses()"
Esempio n. 26
0
# pattern.en tutorial snippet (Python 2 print statements).
# The comparative() and superlative() commands give the comparative/superlative form of an adjective.
# Words with three or more syllables are simply preceded by "more" or "most".
for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]:
    print word, "=>", comparative(word), "=>", superlative(word)
print
print

# VERB CONJUGATION
# ----------------
# The lexeme() command returns a list of all possible verb inflections.
# The lemma() command returns the base form (infinitive) of a verb.
print "lexeme:", lexeme("be")
print "lemma:", lemma("was")

# The conjugate() command inflects a verb to another tense.
# The tense can be given as a constant, e.g. 
# INFINITIVE, PRESENT_1ST_PERSON_SINGULAR PRESENT_PLURAL, PAST_PARTICIPLE, ...
# or as an abbreviated alias: inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
print conjugate("being", tense="1sg", negated=False)

# Prefer the full constants for code that will be reused/shared.

# The tenses() command returns a list of all tenses for the given verb form.
# For example: tenses("are") => ['present 2nd person singular', 'present plural']
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in the list.
from pattern.en import PRESENT_PLURAL
print tenses("are")
print PRESENT_PLURAL in tenses("are")
print "pl" in tenses("are")
Esempio n. 27
0
def is_not_present(conjugacion):
    """True when *conjugacion* is not a present-tense form for any
    grammatical person (1st, 2nd, or 3rd)."""
    return all((PRESENT, person) not in tenses(conjugacion) for person in (1, 2, 3))
Esempio n. 28
0
 def makeSameTense(self, w1, w2):
     """Conjugate w1 into the dominant tense of w2 (tense names counted,
     then sorted by the count tuple's third field)."""
     tense_counts = count([reading[0] for reading in tenses(w2)], stopwords=True)
     ranked = sorted(tense_counts, key=operator.itemgetter(2))
     return verbs.conjugate(w1, ranked[0])
def process_text(text):
    """
    Uses NLP to get passive voice sentences, sentences with adverbs, and progressive tense sentences.
    Works ok so far, but not great -- lots of false positives.
    """
    doc = nlp(text)

    sents = list(doc.sents)
    print("Number of Sentences = ", len(sents))

    # gives match_id, start, end
    # (passive-voice spans come from the module-level spaCy `matcher`)
    matches = matcher(doc)
    df = pd.DataFrame(matches, columns=['id', 'start', 'end'])
    df.drop_duplicates(inplace=True)
    print(df.shape[0], 'passive sentences detected')

    # print out passive phrases
    for i, r in df.iterrows():
        print(doc[r['start']:r['end']])

    # progressive tense detection
    # get verb phrase from sentences and check if progressive


    verb_clause_pattern = r'<VERB>+<ADV>*<PART>*<VERB>*<PART>*'

    progressive_sentences, progressive_verb_clauses = [], []
    adverb_sentences, adverbs = [], []
    for s in sents:
        verb_clauses = list(textacy.extract.pos_regex_matches(s, verb_clause_pattern))
        for v in verb_clauses:
            if len(v) > 1:  # need to have some helper verbs to have a problem
                # Each tenses() tuple's last field holds the aspect
                # (e.g. 'progressive'); scan that column.
                verb_tenses = pd.DataFrame(list(tenses(v.text)))
                if 'progressive' in verb_tenses.iloc[:, -1].tolist():  # last column
                    progressive_sentences.append(s)
                    progressive_verb_clauses.append(v)

        # adverb detection
        # "...the road to hell is paved with adverbs..." -- Stephen King
        pos = np.array([w.pos_ for w in s])
        adverb_idxs = np.argwhere(pos == 'ADV').flatten()

        if len(adverb_idxs) != 0:
            adverb_sentences.append(s)
            adverbs.append([s[a] for a in adverb_idxs])

    # print out sentences with adverbs and the list of adverbs
    print('\n\n\nADVERBS')
    print(len(adverb_sentences), 'sentences detected with adverbs\n')
    if len(adverb_sentences) > 0:
        for i, adv in enumerate(adverb_sentences):
            print(adv)
            print('has adverbs:', adverbs[i], '\n')

    # print out sentences with possible progressive verb clauses
    print('\n\n\nPROGESSIVE TENSES')
    print(len(progressive_sentences), 'sentences detected with progressive verb clauses\n')
    if len(progressive_sentences) > 0:
        for i, adv in enumerate(progressive_sentences):
            print(adv)
            print('has progressive verb clause(s):', progressive_verb_clauses[i], '\n')
Esempio n. 30
0
def simp_parti_sent(tokens, node_list):
    """Split a sentence whose subject carries a participial clause ('acl'
    dependency) into two simpler sentences.

    The first sentence is "<subject> <be> <participial clause>"; the second
    restates the subject followed by the main clause from the root verb on.

    Args:
        tokens: token list of the original sentence, padded with '' at
            index 0 so token positions line up with dependency-node indices.
        node_list: dependency nodes as (index, word, tag, rel, deps) tuples;
            node_list[0][4]['root'][0] is the index of the root word.

    Returns:
        str: the two simplified sentences joined by ' . ', or "" when no
        subject-with-acl pattern is found.
    """
    # Resolve the root word of the dependency tree from its node index.
    root = ""
    root_ind = node_list[0][4]['root'][0]
    for nd in node_list:
        if root_ind == nd[0]:
            root = nd[1]

    strs = ""
    for nd in node_list[1:]:
        if (root in nd) and ('nsubj' in nd[4].keys()):
            # Find the subject node and any compound modifiers attached to it.
            nsubj_ind = nd[4]['nsubj'][0]
            nsubj_dict = {}
            nsubj_compound_list = []
            for _nd in node_list:
                if nsubj_ind == _nd[0]:
                    nsubj_dict = _nd[4]
                    if ('compound' in nsubj_dict.keys()):
                        nsubj_compound_list = nsubj_dict['compound']
                    break

            if ('acl' in nsubj_dict.keys()):
                # The subject carries a participial clause; its index marks
                # where the clause text starts.
                acl_ind = nsubj_dict['acl'][0]

                # Assemble the subject phrase: compounds first, then the
                # head noun, with the first character upper-cased.
                nsubj = ""
                for i in nsubj_compound_list:
                    nsubj = nsubj + " " + tokens[i]
                nsubj = nsubj + " " + tokens[nsubj_ind]
                nsubj = nsubj[0].upper() + nsubj[1:] + " "

                # Conjugate "be" to match the tense of the root verb.
                # NOTE(review): tenses(root) may be empty for unknown verbs,
                # which would raise IndexError here -- confirm inputs.
                verb = conjugate("was", tenses(root)[0][0], 3)
                root_ind = tokens.index(root)

                # If the clause has an adverbial modifier placed before it,
                # start the first sentence at the adverb instead.
                advmod_ind = 0
                for _nd in node_list[1:]:
                    if acl_ind == _nd[0]:
                        acl_dict = _nd[4]
                        break
                if ('advmod' in acl_dict.keys()):
                    advmod_ind = acl_dict['advmod'][0]

                if advmod_ind == 0 or advmod_ind > acl_ind:
                    _str1 = tokens[acl_ind:root_ind]
                else:
                    _str1 = tokens[advmod_ind:root_ind]

                # Drop a trailing punctuation token from the clause.
                if len(_str1) > 0 and _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''

                # First sentence: "<Subject> <be> <participial clause>".
                str1 = nsubj + " " + verb + " "
                str1 = str1 + ' '.join(_str1)

                # Second sentence: capitalised subject + main clause
                # starting at the root verb.
                _strs = tokens[root_ind:]
                _str2 = " ".join(_strs)

                nsubj = nsubj.strip()
                _nsubj = nsubj[0].upper() + nsubj[1:]
                # The original branched on pronoun subjects here, but both
                # branches were identical; collapsed to one assignment.
                str2 = _nsubj + " " + _str2

                strs = str1 + ' . ' + str2

    return strs
Esempio n. 31
0
def simp_appos_sent(tokens, node_list):
    """Split a sentence containing an appositive ('appos' dependency on the
    subject) into two simpler sentences.

    Args:
        tokens: token list of the original sentence, padded with '' at
            index 0 so token positions line up with dependency-node indices.
        node_list: dependency nodes as (index, word, tag, rel, deps) tuples;
            node_list[0][4]['root'][0] is the index of the root word.

    Returns:
        str: the two simplified sentences joined by ' . ', or "" when no
        subject-with-appositive pattern is found.
    """
    start_time = time.time()

    # Resolve the root word of the dependency tree from its node index.
    root = ""
    root_ind = node_list[0][4]['root'][0]
    for nd in node_list:
        if root_ind == nd[0]:
            root=nd[1]

    """
    taggers = []
    for nd in node_list[1:]:
        taggers.append((nd[1], nd[2]))
    """
    strs = ""
    #split_ind = 0
    for nd in node_list[1:]:
        #import pdb; pdb.set_trace()
        #print(nd)
        # NOTE(review): dead code -- identical condition to the branch
        # below, body is only `pass`.
        if (root in nd) and ('nsubj' in nd[4].keys() or ('nsubjpass' in nd[4].keys())):
            pass

        if (root in nd) and ('nsubj' in nd[4].keys() or ('nsubjpass' in nd[4].keys())):
            #print "conj: ", nd
            #print "conj node: ", nd[4]['conj']
            nsubj = ""
            nsubj_ind = 0
            nsubj_nmod_ind = 0
            nsubj_dict = {}
            #import pdb; pdb.set_trace()
            # Active-voice subject: collect its compound parts, any 'nmod'
            # modifier, and a copula attached to the root.
            if ('nsubj' in nd[4].keys()):
                nsubj_ind = nd[4]['nsubj'][0]
                nsubj_compound_list = []
                #nsubj_nmod_ind = 0
                for _nd in node_list[1:]: #BUG
                    if nsubj_ind == _nd[0]:
                        nsubj_dict = _nd[4]
                        if ('compound' in nsubj_dict.keys()):
                            nsubj_compound_list = nsubj_dict['compound']
                        #break
                        if ('nmod' in nsubj_dict.keys()):
                            nsubj_nmod_ind=nsubj_dict['nmod'][0]

                #import pdb; pdb.set_trace()
                for i in nsubj_compound_list:
                    nsubj = nsubj + " " + tokens[i]

                cop_ind = 0
                for _nd in node_list[1:]:
                    if (root in _nd) and ('cop' in _nd[4].keys()):
                        cop_ind = _nd[4]['cop'][0]

            # get the nsubj
            #import pdb; pdb.set_trace()

            # Passive-voice subject: remember the auxiliary ("was", "were").
            auxpass_ind = 0
            if ('nsubjpass' in nd[4].keys()):
                nsubj_ind = nd[4]['nsubjpass'][0]
                for _nd in node_list:
                    # NOTE(review): condition checks `_nd` but the value is
                    # read from `nd` -- probably meant _nd[4]['auxpass'][0];
                    # confirm against the parser output.
                    if root_ind == _nd[0] and  ('auxpass' in _nd[4].keys()):
                        auxpass_ind = nd[4]['auxpass'][0]

            if nsubj_nmod_ind != 0: #BUG here
                nsubj = " ".join(tokens[nsubj_ind:nsubj_nmod_ind+1])
            else:
                nsubj = nsubj + " " + tokens[nsubj_ind]

            nsubj = nsubj.strip()
            nsubj = nsubj[0].upper() + nsubj[1:] + " "

            """
            person_taggers = []
            org_taggers = []
            # replace the nsubj with "he/she"
            for token, title in taggers:
                if token in nsubj:
                    if title == 'PERSON':
                        person_taggers.append(token)
                    elif title == 'ORGANIZATION':
                        org_taggers.append(token)
                    else:
                        org_taggers.append(token)
            """
            #import pdb; pdb.set_trace()
            # Subject has an appositive: split the sentence around it.
            if len(nsubj_dict)>0 and ('appos' in nsubj_dict.keys()):
                #[NOTICE]: connect the nsubj + acl as 1st
                #import pdb; pdb.set_trace()
                appos_ind = nsubj_dict['appos'][0]

                #verb = "is"
                # Conjugate "be" to the tense of the root verb.
                # NOTE(review): tenses(root) may be empty for unknown verbs
                # -> IndexError; confirm inputs.
                verb = conjugate("was", tenses(root)[0][0], 3)
                #verb = base.update_vb_conjugation(verb, root)

                #nsubj = base.upper_first_char(tokens[nsubj_ind])

                #[NOTICE]: remove the ',' after the nsubj
                if tokens[nsubj_ind + 1] in PUNCTUATION:
                    tokens[nsubj_ind + 1] = ''

                #tokens.insert(nsubj_ind + 1, verb)

                root_ind = tokens.index(root)
                # SO bad solution, if the root isnot a 'verb'
                split_ind = 0
                if ',' in tokens:
                    split_ind = tokens.index(',')

                # NOTE(review): this loop shadows the outer `nd` and reads
                # `_nd`, which still holds the last node of an earlier
                # loop; the body likely meant to use `nd[4]`.
                for nd in node_list[1:]: #BUG
                    if nsubj_ind == nd[0]:
                     nsubj_dict = _nd[4]
                     if ('compound' in nsubj_dict.keys()):
                         nsubj_compound_list = nsubj_dict['compound']
                     #break
                     if ('nmod' in nsubj_dict.keys()):
                         nsubj_nmod_ind=nsubj_dict['nmod'][0]

                #import pdb; pdb.set_trace()
                # NOTE(review): compares a token *string* with an int
                # index; in Python 2 a str always compares greater than an
                # int, so this branch is effectively always taken --
                # probably meant `root_ind > split_ind`.
                if tokens[root_ind] > split_ind:
                    if nsubj_nmod_ind != 0 and cop_ind !=0:
                        _str1 = tokens[split_ind:cop_ind]
                    else:
                        _str1 = tokens[nsubj_ind+1:split_ind]
                    tokens[split_ind] = ''

                    # Strip punctuation from both ends of the appositive.
                    if len(_str1) > 0 and (_str1[-1] in PUNCTUATION):
                        _str1[-1] = ''
                    if len(_str1) >0 and (_str1[0] in PUNCTUATION):
                        _str1[0] = ''

                    str1 = nsubj + " " + verb + ' '.join(_str1)

                    if nsubj_nmod_ind != 0 and cop_ind !=0:
                        _strs = tokens[cop_ind:]
                    else:
                        _strs = tokens[split_ind:]

                    _str2 = " ".join(_strs)
                    """
                    if len(person_taggers) > 0:
                        str2 = "He" + " " + ' '.join(_str2)  # 'he' will be replaced with 'he/she'

                    elif len(org_taggers) > 0:
                        if base.isplural(org_taggers[-1]):
                            str2 = "They" + " " + ' '.join(_str2)
                        else:
                            str2 = "It" + " " + ' '.join(_str2)
                    else:
                        str2 = nsubj + ' '.join(_str2)
                    """
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]

                    if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                        str2 = _nsubj + _str2
                    else:
                        #sent2 = _nsubj + " " + _str2
                        #nsubj2 = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj2 + _str2
                        str2 = _nsubj + " " + _str2

                else:

                    #import pdb; pdb.set_trace()
                    _str1 = tokens[nsubj_ind+1:root_ind]

                    if len(_str1) > 0 and _str1[-1] in PUNCTUATION:
                        _str1[-1] = ''
                    str1 = nsubj  + ' '.join(_str1)
                    #print "1st sent: ", str1

                    # upper the 1st char in 2nd sent
                    _strs = tokens[root_ind:]
                    _str2 = " ".join(_strs)
                    """
                    if len(person_taggers) > 0:
                        str2 = "He" + " " + ' '.join(_str2)  # 'he' will be replaced with 'he/she'

                    elif len(org_taggers) > 0:
                        if base.isplural(org_taggers.split()[-1]):
                            str2 = "They" + " " + ' '.join(_str2)
                        else:
                            str2 = "It" + " " + ' '.join(_str2)
                    else:
                        str2 = nsubj + ' '.join(_str2)
                    """
                    #w = _w + ' '
                    #str2 = nsubj  + ' '.join(_str2)
                    #print "2nd sent: ", str2
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]

                    if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                        str2 = _nsubj + _str2
                    else:
                        #sent2 = _nsubj + " " + _str2
                        #nsubj2 = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj2 + _str2
                        str2 = _nsubj + " " + _str2

                strs = str1 + ' . ' + str2

                #import pdb; pdb.set_trace()
                end_time = time.time()
                during_time = end_time - start_time
                print "The time of appos function: ", during_time

                # NOTE(review): this return makes the whole
                # `if auxpass_ind > 0:` block below unreachable.
                return strs

                if auxpass_ind > 0:
                    split_ind = 0
                    if ',' in tokens:
                        split_ind = tokens.index(',')
                    if split_ind == 0:
                        return strs

                    #import pdb; pdb.set_trace()
                    #verb = conjugate("be", tenses(root)[0][0], 3)
                    verb = tokens[auxpass_ind]
                    if tokens[root_ind] > split_ind:
                        _str1 = tokens[nsubj_ind+1:auxpass_ind]

                        if len(_str1) > 0 and _str1[-1] in PUNCTUATION:
                            _str1[-1] = ''
                        if len(_str1) > 0 and _str1[0] in PUNCTUATION:
                            _str1[0] = ''
                        str1 = nsubj  + " " + verb + ' '.join(_str1)
                    #print "1st sent: ", str1

                    # upper the 1st char in 2nd sent
                        _strs = tokens[auxpass_ind:]
                        _str2 = " ".join(_strs)

                        nsubj = nsubj.strip()
                        _nsubj = nsubj[0].upper() + nsubj[1:]

                        if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                            str2 = _nsubj + _str2
                        else:
                        #sent2 = _nsubj + " " + _str2
                        #nsubj2 = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj2 + _str2
                            str2 = _nsubj + " " + _str2

                    strs = str1 + ' . ' + str2

                    return strs


    #import pdb; pdb.set_trace()
    end_time = time.time()
    during_time = end_time - start_time
    print "The time of appos function: ", during_time

    return strs
Esempio n. 32
0
 def makeSameTense(self, w1, w2):
     """Conjugate `w1` into the dominant tense observed for `w2`.

     Collects the tense names of every reading of `w2`, counts them,
     picks the top-ranked one, and conjugates `w1` to match.
     """
     observed = [reading[0] for reading in tenses(w2)]
     tallied = count(observed, stopwords=True)
     ranked = sorted(tallied, key=operator.itemgetter(2))
     return verbs.conjugate(w1, ranked[0])
Esempio n. 33
0
def pass_act_detect(doc):
    """Return True when the (first) sentence of `doc` is in passive voice,
    False otherwise.

    Parses `doc` with the module-level spaCy `nlp` pipeline and walks the
    dependency tree: a sentence is judged passive when a passive subject
    is found (`nsubjpass` on the ROOT, or a subject governed by `auxpass`).

    NOTE(review): the function returns inside the sentence loop, so only
    the first sentence is ever examined; `newdoc` and most of the captured
    parts (tense, adverbs, agent, xcomp, ...) are collected but never used
    for the returned decision -- they look like leftovers from a
    passive-to-active rewriter.
    """
    parse = nlp(doc)
    newdoc = ''
    for sent in parse.sents:  # no meaning, test only take one sentence at a time.

        # Init parts of sentence to capture:
        subjpass = ''
        subj = ''
        verb = ''
        verbaspect = ''
        verbtense = ''
        adverb = {'bef': '', 'aft': ''}
        part = ''
        prep = ''
        agent = ''
        aplural = False
        advcltree = None
        aux = list(list(nlp('. .').sents)[0])  # start with 2 'null' elements
        xcomp = ''
        punc = '.'
        # Analyse dependency tree:
        for word in sent:
            # Adverbial clause attached to the main verb (kept as subtree).
            if word.dep_ == 'advcl':
                if word.head.dep_ in ('ROOT', 'auxpass'):
                    advcltree = word.subtree
            # Passive subject directly under ROOT => sentence is passive.
            if word.dep_ == 'nsubjpass':
                if word.head.dep_ == 'ROOT':
                    subjpass = ''.join(
                        w.text_with_ws.lower() if w.tag_ not in (
                            'NNP', 'NNPS') else w.text_with_ws
                        for w in word.subtree).strip()
            # Active subject; lower-cased unless it is a proper noun.
            if word.dep_ == 'nsubj':
                subj = ''.join(w.text_with_ws.lower() if w.tag_ not in (
                    'NNP', 'NNPS') else w.text_with_ws
                               for w in word.subtree).strip()
                if word.head.dep_ == 'auxpass':
                    if word.head.head.dep_ == 'ROOT':
                        subjpass = subj
            # Adverbs before/after the main verb (position-sensitive).
            if word.dep_ in ('advmod', 'npadvmod'):
                if word.head.dep_ == 'ROOT':
                    if verb == '':
                        adverb['bef'] = ''.join(
                            w.text_with_ws.lower() if w.tag_ not in (
                                'NNP', 'NNPS') else w.text_with_ws
                            for w in word.subtree).strip()
                    else:
                        adverb['aft'] = ''.join(
                            w.text_with_ws.lower() if w.tag_ not in (
                                'NNP', 'NNPS') else w.text_with_ws
                            for w in word.subtree).strip()
            # Passive auxiliary on ROOT also marks the sentence passive.
            if word.dep_ == 'auxpass':
                if word.head.dep_ == 'ROOT':
                    if not subjpass:
                        subjpass = subj
            if word.dep_ in ('aux', 'auxpass', 'neg'):
                if word.head.dep_ == 'ROOT':
                    aux += [word]
            # Main verb: derive tense/aspect from its fine-grained tag.
            if word.dep_ == 'ROOT':
                verb = word.text
                if word.tag_ == 'VB':
                    verbtense = en.INFINITIVE
                elif word.tag_ == 'VBD':
                    verbtense = en.PAST
                elif word.tag_ == 'VBG':
                    verbtense = en.PRESENT
                    verbaspect = en.PROGRESSIVE
                elif word.tag_ == 'VBN':
                    verbtense = en.PAST
                else:
                    # NOTE(review): en.tenses() may return [] for unknown
                    # verb forms, which would raise IndexError here.
                    verbtense = en.tenses(word.text)[0][0]
            # Verb particle (e.g. phrasal-verb "up", "out").
            if word.dep_ == 'prt':
                if word.head.dep_ == 'ROOT':
                    part = ''.join(w.text_with_ws.lower() if w.tag_ not in (
                        'NNP', 'NNPS') else w.text_with_ws
                                   for w in word.subtree).strip()
            # Prepositional phrase attached to the main verb.
            if word.dep_ == 'prep':
                if word.head.dep_ == 'ROOT':
                    prep = ''.join(w.text_with_ws.lower() if w.tag_ not in (
                        'NNP', 'NNPS') else w.text_with_ws
                                   for w in word.subtree).strip()
            # "by X" agent of a passive construction.
            if word.dep_.endswith('obj'):
                if word.head.dep_ == 'agent':
                    if word.head.head.dep_ == 'ROOT':
                        agent = ''.join(
                            w.text + ', ' if w.dep_ == 'appos' else (
                                w.text_with_ws.lower() if w.tag_ not in (
                                    'NNP', 'NNPS') else w.text_with_ws)
                            for w in word.subtree).strip()
                        aplural = word.tag_ in ('NNS', 'NNPS')
            # Clausal complements / conjuncts of the main verb.
            if word.dep_ in ('xcomp', 'ccomp', 'conj'):
                if word.head.dep_ == 'ROOT':
                    xcomp = ''.join(w.text_with_ws.lower() if w.tag_ not in (
                        'NNP', 'NNPS') else w.text_with_ws
                                    for w in word.subtree).strip()
            if word.dep_ == 'punct':
                punc = word.text

        # exit if not passive:
        if subjpass == '':
            newdoc += str(sent) + ' '
            return False  #active

        return True  #passive
Esempio n. 34
0
# lemma() returns the base form (infinitive) of a verb;
# lexeme() lists its known inflected forms.
print("lexeme: %s" % lexeme("be"))
print("lemma: %s" % lemma("was"))
print("")

# The conjugate() function inflects a verb to another tense.
# You can supply:
# - tense : INFINITIVE, PRESENT, PAST,
# - person: 1, 2, 3 or None,
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g.,
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
# Both calls below request the same conjugation ("1sg" is the alias form).
print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False))
print(conjugate("being", tense="1sg", negated=False))
print("")

# Prefer the full constants for code that will be reused/shared.

# The tenses() function returns a list of all tenses for the given verb form.
# Each tense is a tuple of (tense, person, number, mood, aspect).
# For example: tenses("are") => [('present', 2, 'plural', 'indicative', 'imperfective'), ...]
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in the list.
from pattern.en import PRESENT, PLURAL
print(tenses("are"))
# Membership tests work with partial tuples and aliases alike.
print((PRESENT, 1, PLURAL) in tenses("are"))
print("pl" in tenses("are"))
Esempio n. 35
0
    Sentence, Word, Chunk, PNPChunk, modality, wordnet, ADJECTIVE

# Demo of assorted pattern.en helpers (Python 2 print statements).
# indefinite article: referenced() prepends "a" or "an" as appropriate.
print referenced('university')
print referenced('hour')
# pluralization and singularization
print pluralize('child')
print singularize('wolves')
# comparative and superlative
print comparative('bad')
print superlative('bad')
# verb conjugation
print lexeme('purr')
print lemma('purring')
print conjugate('purred', '3sg')  # he / she / it
print 'p' in tenses('purred')  # By alias.
print PAST in tenses('purred')
# Parsed as: print ((PAST, 1, PL) in tenses('purred')) -- a membership
# test with a partial (tense, person, number) tuple.
print(PAST, 1, PL) in tenses('purred')
# rule-based conjugation
print 'google' in verbs.infinitives
print 'googled' in verbs.inflections
# parse=True falls back to rule-based conjugation for unknown verbs.
print conjugate('googled', tense=PARTICIPLE, parse=False)
print conjugate('googled', tense=PARTICIPLE, parse=True)
# quantification
print number("seventy-five point two")  # "seventy-five point two" => 75.2
print numerals(2.245, round=2)  # 2.245 => "two point twenty-five"
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify({'carrot': 100, 'parrot': 20})
print quantify('carrot', amount=1000)
# spelling
print suggest("parot")
Esempio n. 36
0
def simp_adverb_sent(_tokens, node_list):
    tokens = list(_tokens)
    strs = ""

    #import pdb; pdb.set_trace()
    if COMMA not in tokens:
        return strs

    root = ""
    root_ind = node_list[0][4]['root'][0]
    for nd in node_list:
        if root_ind == nd[0]:
            root=nd[1]

    #split_ind = 0
    for nd in node_list[1:]:
        #import pdb; pdb.set_trace()
        #print(nd)
        if (root in nd) and ('advcl' in nd[4].keys() or 'xcomp' in nd[4].keys()):
            pass

        if (root in nd) and ('advcl' in nd[4].keys() or 'xcomp' in nd[4].keys() or 'advmod' in nd[4].keys()):
            #print "conj: ", nd
            #print "conj node: ", nd[4]['conj']

            #import pdb; pdb.set_trace()
            nsubj = ""
            nsubj_ind = 0
            det_ind = 0
            if ('nsubj' in nd[4].keys()):
                nsubj_ind = nd[4]['nsubj'][0]

                nsubj_dict = {}
                nsubj_compound_list = []
                amod_list = []
                det_ind = 0
                #import pdb; pdb.set_trace()
                for _nd in node_list:
                    #import pdb; pdb.set_trace()
                    if (nsubj_ind == _nd[0]):
                        #import pdb; pdb.set_trace()
                        nsubj_dict = _nd[4]
                        if ('amod' in nsubj_dict.keys()):
                            amod_list = nsubj_dict['amod']
                        if ('compound' in nsubj_dict.keys()):
                            nsubj_compound_list = nsubj_dict['compound']
                        if ('det' in nsubj_dict.keys()):
                            det_ind = nsubj_dict['det'][0]
                    #break

                #nsubj = tokens[det_ind] + " " + tokens[nsubj_ind]
                for j in amod_list:
                    nsubj = nsubj + " " + tokens[j]
                for i in nsubj_compound_list:
                    nsubj = nsubj + " " + tokens[i]
                if det_ind > 0:
                    nsubj = tokens[det_ind] + " " + nsubj + " " + tokens[nsubj_ind]
                else:
                    nsubj = nsubj + " " + tokens[nsubj_ind]

                #import pdb; pdb.set_trace()
                nsubj = nsubj.strip()
                nsubj = nsubj[0].upper() + nsubj[1:] + " "
                
                #cop_ind = 0
                if ('cop' in nd[4].keys()):
                    cop_ind = nd[4]['cop'][0]
                #import pdb; pdb.set_trace()
                
            if ('nsubjpass' in nd[4].keys()):
                nsubj_ind = nd[4]['nsubjpass'][0]
                for _nd in node_list:
                    #import pdb; pdb.set_trace()
                    if (nsubj_ind == _nd[0]):
                        #import pdb; pdb.set_trace()
                        if ('det' in _nd[4].keys()):
                            det_ind = _nd[4]['det'][0]
                            
                nsubj = tokens[det_ind] + " " + tokens[nsubj_ind]
                nsubj = nsubj.strip()

            """
            person_taggers = []
            org_taggers = []
            #import pdb; pdb.set_trace()
            # replace the nsubj with "he/she"
            for token, title in eng_tagger.tag(tokens):
                if token.lower() in nsubj.lower().split():
                    if token == 'the' or token == 'The': 
                        continue
                    if title == 'PERSON':
                        person_taggers.append(token)
                    elif title == 'ORGANIZATION':
                        org_taggers.append(token)
                    else:
                        org_taggers.append(token)
            """
            #import pdb; pdb.set_trace()
            advcl_dict = {}
            advcl_tag = ""
            if ('advcl' in nd[4].keys()):
                advcl_ind = nd[4]['advcl'][0]

                #import pdb; pdb.set_trace()
                if len(tenses(root))>0:
                    if tenses(root)[0][0] == 'infinitive':
                        tokens[advcl_ind] = conjugate(tokens[advcl_ind], tenses(root)[1][0], 3)
                    else:
                        tokens[advcl_ind] = conjugate(tokens[advcl_ind], tenses(root)[0][0], 3)

                #TODO: update the tense of the advcl_ind
                
                #import pdb; pdb.set_trace()
                #advcl_dict = {}
                for _nd in node_list[1:]: #BUG
                    if advcl_ind == _nd[0]:
                         advcl_dict = _nd[4]
                         advcl_tag = _nd[2]
                         break

                # check the nsubj of the advcl, if they are the same subj, it is adverb
                advcl_dobj_ind = 0
                for _nd in node_list[1:]:
                    if advcl_ind == _nd[0]:
                        if 'nsubj' in _nd[4].keys():
                            #import pdb; pdb.set_trace()
                            advcl_nsubj_ind = _nd[4]['nsubj'][0]
                            if tokens[advcl_nsubj_ind].lower() not in nsubj:
                                return strs
                        if 'dobj' in _nd[4].keys():
                            advcl_dobj_ind = _nd[4]['dobj'][0]

                #import pdb; pdb.set_trace()
                verb = 'was'
                #import pdb; pdb.set_trace()
                if len(tenses(root)) > 0:
                    if nsubj.strip().lower() == 'they':
                        if tenses(root)[0][0] == 'infinitive':
                            verb = conjugate(verb, tenses(root)[1][0], 2)
                        else:
                            verb = conjugate(verb, tenses(root)[0][0], 2)
                    else:
                        if tenses(root)[0][0] == 'infinitive':
                            verb = conjugate(verb, tenses(root)[1][0], 3)
                        else:
                            verb = conjugate(verb, tenses(root)[0][0], 3)

                # TODO, the tense
                if advcl_tag == 'VBN':
                    if len(nsubj)>0:
                        nsubj = nsubj[0].upper() + nsubj[1:] + " "
                if advcl_tag == 'VBG':
                    if len(nsubj)>0:
                        nsubj = nsubj[0].upper() + nsubj[1:] + " "

                #ASSUME ',' is the splitting tag
                # This assumation isnot right
                split_ind = tokens.index(COMMA)
                    #nsubj_ind = nd[4]['nsubj'][0]
                    #if (advcl_ind < split_ind):
                    #subj = tokens[nsubj_ind]
                   # tokens.insert(1, base.upper_first_char(subj))

                #if len(tenses(root))>0:
                #    tokens[advcl_ind]=conjugate(tokens[advcl_ind], tenses(root)[0][0])
                #

                #import pdb; pdb.set_trace()
                if advcl_dobj_ind > split_ind:
                    tokens[split_ind] = ""
                    _str1 = tokens[split_ind:advcl_dobj_ind+1]
                else:
                    #_str1 = ""
                    _str1 = tokens[:(split_ind)]
                    if _str1[-1] in PUNCTUATION:
                        _str1[-1] = ''
                """
                str1 = ""
                if advcl_tag == 'VBN':
                    str1 = nsubj + ' '.join(_str1)
                if advcl_tag == 'VBG':
                    str1 = ' '.join(_str1)
                """
                #import pdb; pdb.set_trace()
                _str1_ = ' '.join(_str1)
                nsubj = ' '.join(nsubj.split())
                str1 = ""
                if nsubj.lower() + ' ' in _str1_.lower().split():
                    str1 = _str1_
                else:
                    if advcl_tag == 'VBN':
                        str1 = nsubj + " " +  verb + " " + _str1_
                    else:
                        str1 = nsubj + " " + _str1_
                #print "1st sent: ", str1

                        # upper the 1st char in 2nd sent
                    #tokens[nsubj_ind] = base.upper_first_char(tokens[nsubj_ind])

                #import pdb; pdb.set_trace()
                _str2 = ""
                str2 = ""
                if split_ind < nsubj_ind:
                    #_str2 = tokens[split_ind+1:] 
                    _strs = tokens[root_ind:]
                    if ('which' == _strs[0].lower()) or ('who' == _strs[0].lower()):
                        _strs = tokens[split_ind+2:]

                    _str2 = " ".join(_strs)
                #_str2 = tokens[root_ind:]
                        #w = _w + ' '
                    """
                    if len(nsubj)>0:
                        if (('it' not in nsubj.lower()) or ('They' not in nsubj.lower())):
                            str2 = nsubj + " " + ' '.join(_str2)
                        else:
                        #str2 = nsubj[0].upper() + nsubj[1:] + " " + ' '.join(_str2)
                            if len(person_taggers) > 0:
                                str2 = "He" + " " + ' '.join(_str2)  # 'he' will be replaced with 'he/she'
                            elif len(org_taggers) > 0:
                                if base.isplural(org_taggers[-1]) or (org_taggers[-1].lower() == 'they'):
                                    str2 = "They" + " " + ' '.join(_str2)
                                else:
                                    str2 = "It" + " " + ' '.join(_str2)
                    else:
                        str2 = ' '.join(_str2)
                    """
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]

                    if _nsubj == 'I' or _nsubj == 'She' or _nsubj == 'He':
                        str2 = _nsubj + " " + _str2
                    else:
                        #sent2 = _nsubj + " " + _str2
                    
                        #nsubj = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj + _str2
                        str2 = _nsubj + " " + _str2
                else:
                    if advcl_dobj_ind > split_ind:
                        _strs = tokens[advcl_dobj_ind+1:]
                        if _strs[0] in PUNCTUATION:
                            _strs[0] = ''
                    else:
                        _strs = tokens[split_ind+1:]
                        
                    if len(_str2)>0 and (('which' == _str2[0].lower()) or ('who' == _str2[0].lower())):
                        _strs = tokens[split_ind+2:]

                    _str2 = " ".join(_strs)

                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]
                    
                    if _nsubj == 'I' or _nsubj == 'She' or _nsubj == 'He':
                        str2 = _nsubj + " " + _str2
                    else:
                        #sent2 = _nsubj + " " + _str2
                    
                        #nsubj = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj + " " + _str2
                        str2 = _nsubj + " " + _str2
                    
                #print "2nd sent: ", str2

                if str1:
                    strs = str1 + ' . ' + str2
                else:
                    strs = str2

                return strs    

            #import pdb; pdb.set_trace()
            xcomp_ind = 0 
            if ('xcomp' in nd[4].keys()):
                xcomp_ind = nd[4]['xcomp'][0]
                if len(tenses(root))>0:
                    tokens[xcomp_ind] = conjugate(tokens[xcomp_ind], tenses(root)[0][0], 3)

                #import pdb; pdb.set_trace()
                #advcl_dict = {}
                for _nd in node_list: #BUG
                    if xcomp_ind == _nd[0]:
                         xcomp_dict = _nd[4]
                         xcomp_tag = _nd[2]
                         break

                #if len(tenses(root)) > 0:
                #    tokens[xcomp_ind]=conjugate(tokens[xcomp_ind], tenses(root)[0][0])

                #import pdb; pdb.set_trace()
                verb = 'was'
                #import pdb; pdb.set_trace()
                if len(tenses(root)) > 0:
                    if nsubj.strip().lower() == 'they':
                        verb = conjugate(verb, tenses(root)[0][0], 2)
                    else:
                        verb = conjugate(verb, tenses(root)[0][0], 3)
                # TODO
                if xcomp_tag == 'VBN':
                    nsubj = nsubj[0].upper() + nsubj[1:] + " "
                if xcomp_tag == 'VBG':
                    nsubj = nsubj[0].upper() + nsubj[1:] + " "

                split_ind = tokens.index(COMMA)
                    #nsubj_ind = nd[4]['nsubj'][0]
                    #if (advcl_ind < split_ind):
                    #subj = tokens[nsubj_ind]
                   # tokens.insert(1, base.upper_first_char(subj)) 

                _str1 = tokens[:(split_ind)]
                if _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''

                str1 = ""

                #import pdb; pdb.set_trace()
                nsubj = ' '.join(nsubj.split())
                _str1_ = ' '.join(_str1)
                #if xcomp_tag == 'VBN':
                if nsubj.lower() + ' ' in _str1_.lower():
                    str1 = _str1_
                else:
                    if advcl_tag == 'VBN':
                        str1 = nsubj + " " +  verb + " " + _str1_
                    else:
                        str1 = nsubj + " " +  _str1_
                """
                #elif xcomp_tag == 'VBG':
                    if nsubj.lower() in _str1_.lower():
                        str1 = _str1_
                    else:
                        str1 = nsubj + _str1_
                """
                #print "1st sent: ", str1

                        # upper the 1st char in 2nd sent
                    #tokens[nsubj_ind] = base.upper_first_char(tokens[nsubj_ind])

                #import pdb; pdb.set_trace()
                _str2 = ""
                str2 = ""
                if nsubj_ind < split_ind:
                    _strs = tokens[split_ind+1:]
                    if ('which' == _strs[0].lower()) or ('who' == _strs[0].lower()):
                        _strs = tokens[split_ind+2:]
                    _str2 = " ".join(_strs)
                    #TODO: update the tense
                    #_str2 = tokens[root_ind:]
                #_str2 = tokens[split_ind+1:]
                        #w = _w + ' '
                    """
                    if len(nsubj)>0:
                        if (('it' not in nsubj.lower()) or ('they' not in nsubj.lower())):
                            str2 = nsubj + " " + ' '.join(_str2)
                        else:
                        #str2 = nsubj[0].upper() + nsubj[1:] + " " + ' '.join(_str2)
                            if len(person_taggers) > 0:
                                str2 = "He" + " " + ' '.join(_str2)  # 'he' will be replaced with 'he/she'
                            elif len(org_taggers) > 0:
                                if base.isplural(org_taggers[-1]) or (org_taggers[-1].lower() == 'they'):
                                    str2 = "They" + " " + ' '.join(_str2)
                                else:
                                    str2 = "It" + " " + ' '.join(_str2)

                    else:
                        str2 = ' '.join(_str2)
                    """
                     #str2 = nsubj[0].upper() + nsubj[1:] + " " + ' '.join(_str2)
                    nsubj = nsubj.strip()
                    _nsubj = nsubj[0].upper() + nsubj[1:]

                    if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                        str2 = _nsubj + " " + _str2
                    else:
                        #sent2 = _nsubj + " " + _str2
                        #nsubj2 = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj2 + _str2
                        str2 = _nsubj + " " + _str2
                   
                else:
                    str2 = base.upper_first_char(nsubj) + " " + ' '.join(tokens[split_ind+2:])
                #str2 = "That" + " " + ' '.join(_str2)
                #print "2nd sent: ", str2

                if str1:
                    if str2:
                        strs = str1 + ' . ' + str2 
                    else:
                        strs = str1 + ' . '
                else:
                    strs = str2 + ' . '

                return strs

            #import pdb; pdb.set_trace()
            advmod_ind = 0 
            if ('advmod' in nd[4].keys()):
                advmod_ind = nd[4]['advmod'][0]
                #if len(tenses(root))>0:
                #    tokens[advmod_ind] = conjugate(tokens[advmod_ind], tenses(root)[0][0], 3)

                #import pdb; pdb.set_trace()
                #advcl_dict = {}
                for _nd in node_list: #BUG
                    if advmod_ind == _nd[0]:
                         advmod_dict = _nd[4]
                         advmod_tag = _nd[2]
                         break

                #if len(tenses(root)) > 0:
                #    tokens[xcomp_ind]=conjugate(tokens[xcomp_ind], tenses(root)[0][0])

                #import pdb; pdb.set_trace()
                verb = 'was'
                #import pdb; pdb.set_trace()
                if len(tenses(root)) > 0:
                    if nsubj.strip().lower() == 'they':
                        verb = conjugate(verb, tenses(root)[0][0], 2)
                    else:
                        verb = conjugate(verb, tenses(root)[0][0], 3)
                # TODO
                if nsubj:
                    nsubj = nsubj.strip()
                    nsubj = nsubj[0].upper() + nsubj[1:]

                split_ind = tokens.index(COMMA)
                    #nsubj_ind = nd[4]['nsubj'][0]
                    #if (advcl_ind < split_ind):
                    #subj = tokens[nsubj_ind]
                   # tokens.insert(1, base.upper_first_char(subj)) 

                _str1 = tokens[:(split_ind)]
                if _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''

                str1 = ""

                #import pdb; pdb.set_trace()
                nsubj = ' '.join(nsubj.split())
                _str1_ = ' '.join(_str1)
                #if xcomp_tag == 'VBN':
                if nsubj.lower() + ' ' in _str1_.lower():
                    str1 = _str1_
                else:
                    str1 = nsubj + " " + verb + " " + _str1_.lower()
                """
                #elif xcomp_tag == 'VBG':
                    if nsubj.lower() in _str1_.lower():
                        str1 = _str1_
                    else:
                        str1 = nsubj + _str1_
                """
                #print "1st sent: ", str1

                        # upper the 1st char in 2nd sent
                    #tokens[nsubj_ind] = base.upper_first_char(tokens[nsubj_ind])

                #import pdb; pdb.set_trace()
                _str2 = ""
                str2 = ""
                if nsubj_ind < split_ind:
                    _strs = tokens[split_ind+1:]
                    if ('which' == _strs[0].lower()) or ('who' == _strs[0].lower()):
                        _strs = tokens[split_ind+2:]
                    _str2 = " ".join(_strs)
                    #TODO: update the tense
                    #_str2 = tokens[root_ind:]
                #_str2 = tokens[split_ind+1:]
                        #w = _w + ' '
                     #str2 = nsubj[0].upper() + nsubj[1:] + " " + ' '.join(_str2)
                    
                    nsubj = nsubj.strip()
                    if nsubj:
                        _nsubj = nsubj[0].upper() + nsubj[1:]

                        if _nsubj == 'I' or _nsubj == 'He' or _nsubj == 'She':
                            str2 = _nsubj + " " + _str2
                        else:
                        #sent2 = _nsubj + " " + _str2
                        #nsubj2 = base.replace_nsubj(sent2, nsubj)
                        #str2 = nsubj2 + _str2
                            str2 = _nsubj + " " + _str2
                   
                else:
                    str2 = base.upper_first_char(nsubj) + " " + ' '.join(tokens[split_ind+2:])
                #str2 = "That" + " " + ' '.join(_str2)
                #print "2nd sent: ", str2

                #import pdb; pdb.set_trace()

                if str1:
                    if str2:
                        strs = str1 + ' . ' + str2 
                    else:
                        strs = str1 + ' . '
                else:
                    strs = str2 + ' . '

                return strs
            
 
    return strs
Esempio n. 37
0
def pass2act(doc, rec=False):
    """Rewrite passive-voice sentences in *doc* as active voice.

    Parses *doc* with spaCy and, for each sentence that has both a
    passive subject (nsubjpass / auxpass) and an explicit "by"-agent,
    rebuilds it as: agent + auxiliaries + verb + former subject.
    Sentences that are not passive, or that lack an agent, are copied
    through unchanged.

    doc -- input text (str).
    rec -- True on recursive calls for clausal complements (xcomp/ccomp/
           conj); suppresses re-capitalization and punctuation capture.
    Returns the transformed document as a string (trailing space kept,
    matching the original behavior).
    """
    parse = nlp(doc)
    newdoc = ''
    for sent in parse.sents:

        # Init parts of sentence to capture:
        subjpass = ''
        subj = ''
        verb = ''
        verbaspect = ''
        verbtense = ''
        adverb = {'bef':'', 'aft':''}
        part = ''
        prep = ''
        agent = ''
        aplural = False
        advcltree = None
        aux = list(list(nlp('. .').sents)[0]) # start with 2 'null' elements
        xcomp = ''
        punc = '.'
        # Analyse dependency tree: collect the pieces of the passive
        # construction (subject, agent, auxiliaries, modifiers, clause).
        for word in sent:
            if word.dep_ == 'advcl':
                if word.head.dep_ in ('ROOT', 'auxpass'):
                    advcltree = word.subtree
            if word.dep_ == 'nsubjpass':
                if word.head.dep_ == 'ROOT':
                    subjpass = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_ == 'nsubj':
                subj = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
                if word.head.dep_ == 'auxpass':
                    if word.head.head.dep_ == 'ROOT':
                        subjpass = subj
            if word.dep_ in ('advmod','npadvmod','oprd'):
                if word.head.dep_ == 'ROOT':
                    # Adverbs before the main verb go in front of it in the
                    # rebuilt sentence; later ones go after it.
                    if verb == '':
                        adverb['bef'] = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
                    else:
                        adverb['aft'] = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_ == 'auxpass':
                if word.head.dep_ == 'ROOT':
                    if not subjpass:
                        subjpass = subj
            if word.dep_ in ('aux','auxpass','neg'):
                if word.head.dep_ == 'ROOT':
                    aux += [word]
            if word.dep_ == 'ROOT':
                verb = word.text
                if word.tag_ == 'VB':
                    verbtense = en.INFINITIVE
                elif word.tag_ == 'VBD':
                    verbtense = en.PAST
                elif word.tag_ == 'VBG':
                    verbtense = en.PRESENT
                    verbaspect = en.PROGRESSIVE
                elif word.tag_ == 'VBN':
                    verbtense = en.PAST
                else:
                    verbtense = en.tenses(word.text)[0][0]
            if word.dep_ == 'prt':
                if word.head.dep_ == 'ROOT':
                    part = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_ == 'prep':
                if word.head.dep_ == 'ROOT':
                    prep = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
            if word.dep_.endswith('obj'):
                if word.head.dep_ == 'agent':
                    if word.head.head.dep_ == 'ROOT':
                        agent = ''.join(w.text + ', ' if w.dep_=='appos' else (w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws) for w in word.subtree).strip()
                        aplural = word.tag_ in ('NNS','NNPS')
            if word.dep_ in ('xcomp','ccomp','conj'):
                if word.head.dep_ == 'ROOT':
                    # Recursively de-passivize the clausal complement,
                    # preserving a leading "that" if it had one.
                    xcomp = ''.join(w.text_with_ws.lower() if w.tag_ not in ('NNP','NNPS') else w.text_with_ws for w in word.subtree).strip()
                    that = xcomp.startswith('that')
                    xcomp = pass2act(xcomp, True).strip(' .')
                    if not xcomp.startswith('that') and that:
                        xcomp = 'that '+xcomp
            if word.dep_ == 'punct' and not rec:
                if word.text != '"':
                    punc = word.text

        # exit if not passive:
        if subjpass == '':
            newdoc += str(sent) + ' '
            continue

        # if no agent is found:
        if agent == '':
            # what am I gonna do? BITconEEEEEEECT!!!!
            newdoc += str(sent) + ' '
            continue

        # invert nouns:
        agent = nouninv(agent)
        subjpass = nouninv(subjpass)

        # Conjugate the verb and its auxiliary chain for the new subject.
        auxstr = ''
        num = en.SINGULAR if not aplural or agent in ('he','she') else en.PLURAL
        aux.append(aux[0])
        # NOTE(review): this unconditionally discards the PROGRESSIVE aspect
        # set for a VBG ROOT above — confirm that is intended.
        verbaspect = None
        for (pp, p, a, n) in zip(aux,aux[1:],aux[2:],aux[3:]):
            if a.lemma_ == '.':
                continue

            if a.lemma_ == 'not':
                if p.lemma_ == 'be':
                    if n.lemma_ == 'be':
                        verbtense = en.tenses(a.text)[0][0]
                        auxstr += en.conjugate('be',tense=en.tenses(p.text)[0][0],number=num) + ' '
                        verbaspect = en.PROGRESSIVE
                    else:
                        auxstr += en.conjugate('do',tense=en.tenses(p.text)[0][0],number=num) + ' '
                        verbtense = en.INFINITIVE
                auxstr += 'not '
            elif a.lemma_ == 'be':
                if p.lemma_ == 'be':
                    verbtense = en.tenses(a.text)[0][0]
                    auxstr += en.conjugate('be',tense=en.tenses(a.text)[0][0],number=num) + ' '
                    verbaspect = en.PROGRESSIVE
                elif p.tag_ == 'MD':
                    verbtense = en.INFINITIVE
            elif a.lemma_ == 'have':
                # BUGFIX: was 'num == en.PLURAL ...', a no-op comparison whose
                # result was discarded. After a modal ("would have"), 'have'
                # must stay in its base form, which conjugate() yields for
                # the plural number.
                num = en.PLURAL if p.tag_ == 'MD' else num
                auxstr += en.conjugate('have',tense=en.tenses(a.text)[0][0],number=num) + ' '
                if n.lemma_ == 'be':
                    verbaspect = en.PROGRESSIVE
                    verbtense = en.tenses(n.text)[0][0]
            else:
                auxstr += a.text_with_ws
        auxstr = auxstr.lower().strip()

        if verbaspect:
            verb = en.conjugate(verb,tense=verbtense,aspect=verbaspect)
        else:
            verb = en.conjugate(verb,tense=verbtense)

        advcl = ''
        if advcltree:
            for w in advcltree:
                if w.pos_ == 'VERB' and en.tenses(w.text)[0][4] == en.PROGRESSIVE:
                    advcl += 'which ' + en.conjugate(w.text,tense=en.tenses(verb)[0][0]) + ' '
                else:
                    advcl += w.text_with_ws

        newsent = ' '.join(list(filter(None, [agent,auxstr,adverb['bef'],verb,part,subjpass,adverb['aft'],advcl,prep,xcomp])))+punc
        if not rec:
            newsent = newsent[0].upper() + newsent[1:]
        newdoc += newsent + ' '
    return newdoc
Esempio n. 38
0
 def test_tenses(self):
     # Assert tense of "am".
     self.assertTrue(en.PRESENT_1ST_PERSON_SINGULAR in en.tenses("am"))
     self.assertTrue("1sg" in en.tenses("am"))
     print "pattern.en.tenses()"
Esempio n. 39
0
# Demo script for pattern.en verb inflection (Python 2 print syntax).
# The lexeme() function returns a list of all possible verb inflections.
# The lemma() function returns the base form (infinitive) of a verb.
print "lexeme:", lexeme("be")
print "lemma:", lemma("was")
print

# The conjugate() function inflects a verb to another tense.
# You can supply: 
# - tense : INFINITIVE, PRESENT, PAST, 
# - person: 1, 2, 3 or None, 
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g., 
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
# Both calls below produce the same result ("am"); the second uses the
# "1sg" alias instead of the explicit tense/person/number constants.
print conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False)
print conjugate("being", tense="1sg", negated=False)
print

# Prefer the full constants for code that will be reused/shared.

# The tenses() function returns a list of all tenses for the given verb form.
# Each tense is a tuple of (tense, person, number, mood, aspect).
# For example: tenses("are") => [('present', 2, 'plural', 'indicative', 'imperfective'), ...]
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in the list.
from pattern.en import PRESENT, PLURAL
print tenses("are")
print (PRESENT, 1, PLURAL) in tenses("are")
print "pl" in tenses("are")
Esempio n. 40
0
# Demo script for pattern.en verb inflection (Python 2 print syntax);
# near-duplicate of the previous example with multi-line call formatting.
# The conjugate() function inflects a verb to another tense.
# You can supply:
# - tense : INFINITIVE, PRESENT, PAST,
# - person: 1, 2, 3 or None,
# - number: SINGULAR, PLURAL,
# - mood  : INDICATIVE, IMPERATIVE,
# - aspect: IMPERFECTIVE, PROGRESSIVE.
# The tense can also be given as an abbreviated alias, e.g.,
# inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart.
from pattern.en import PRESENT, SINGULAR
print conjugate("being",
                tense=PRESENT,
                person=1,
                number=SINGULAR,
                negated=False)
print conjugate("being", tense="1sg", negated=False)
print

# Prefer the full constants for code that will be reused/shared.

# The tenses() function returns a list of all tenses for the given verb form.
# Each tense is a tuple of (tense, person, number, mood, aspect).
# For example: tenses("are") => [('present', 2, 'plural', 'indicative', 'imperfective'), ...]
# You can then check if a tense constant is in the list.
# This will also work with aliases, even though they are not explicitly in the list.
from pattern.en import PRESENT, PLURAL
print tenses("are")
# NOTE(review): in Python 2 the next line parses as
# print ((PRESENT, 1, PLURAL) in tenses("are")) and prints the boolean;
# the missing space after 'print' does not change that.
print(PRESENT, 1, PLURAL) in tenses("are")
print "pl" in tenses("are")
Esempio n. 41
0
def simp_passive_sent(tokens, node_list):
    """Rewrite a passive sentence as an active one from its dependency parse.

    tokens    -- token list for the sentence, 1-indexed (tokens[0] is a
                 dummy '' so token indices match dependency node indices).
    node_list -- dependency nodes as (index, word, tag, deprel, deps-dict)
                 tuples; node_list[0] holds the 'root' pointer.
    Returns the active-voice sentence string, or "" when the sentence is
    not passive or has no explicit "by"-agent.
    """
    # Pronoun case inversion table (object <-> subject forms).
    dict1 = {
        'me': 'I',
        'him': 'He',
        'her': 'She',
        'them': 'They',
        'i': 'me',
        'he': 'him',
        'she': 'her',
        'they': 'them'
    }

    strs = ""
    """
    # the original tokens in the sent
    #import pdb; pdb.set_trace()
    print(sent)
    #import pdb; pdb.set_trace()
    tokens = StanfordTokenizer().tokenize(str(sent))
    tokens.insert(0, '')

    result = list(eng_parser.raw_parse(sent))[0]
    root = result.root['word']

    #w = result.tree()
    #print "parse_tree:", w

    #TODO: use the tree structure, check again
    node_list = [] # dict (4 -> 4, u'said', u'VBD', u'root', [[18], [22], [16], [3]])
    for node in result.nodes.items():
        node_list.append(base.get_triples(node))
        #node_list[base.get_triples[0]] = base.get_triples(node)
    """
    # Locate the root verb of the sentence.
    root = ""
    root_ind = node_list[0][4]['root'][0]
    for nd in node_list:
        if root_ind == nd[0]:
            root=nd[1]

    #split_ind = 0
    for nd in node_list[1:]:
        # A passive nominal subject attached to the root verb marks a
        # passive sentence.
        # (BUGFIX: an identical duplicated `if ...: pass` branch that
        # preceded this condition was removed — it was dead code.)
        if (root in nd) and ('nsubjpass' in nd[4].keys()):
            #print "conj: ", nd
            #print "conj node: ", nd[4]['conj']

            #import pdb; pdb.set_trace()
            nsubjpass_ind = nd[4]['nsubjpass'][0]
            det_ind = 0
            #amod_ind_list = [] # the list of adjectival modifier  
            for _nd in node_list:
                if nsubjpass_ind == _nd[0]:
                    if ('det' in _nd[4].keys()):
                        det_ind = _nd[4]['det'][0]
                        #amod_ind_list = _nd[4]['amod']

            #import pdb; pdb.set_trace()
            nsubjpass = tokens[nsubjpass_ind]
            # amod
            """
            amod_list = ""
            if len(amod_ind_list) > 0:
                for i in amod_ind_list:
                    amod_list = amod_list + " " + tokens[i]
                nsubjpass = amod_list + " " + nsubjpass
            """
            # Prepend the subject's modifiers (adjectives etc.).
            amod_list = base.get_dependency_list(tokens, node_list, nsubjpass_ind)

            nsubjpass = amod_list + " " + nsubjpass
            
            # det: keep the determiner, or invert a bare pronoun's case.

            #import pdb; pdb.set_trace()
            if det_ind:
                nsubjpass = tokens[det_ind] + " " + nsubjpass
            elif str(nsubjpass.lower().strip()) in dict1:
                nsubjpass = dict1[str(nsubjpass.lower().strip())]
            else:
                pass

            auxpass_ind = 0
            if ('auxpass' in nd[4].keys()):
                auxpass_ind = nd[4]['auxpass'][0]

            #det_ind = 0
            subj = ""
            if ('nmod' in nd[4].keys()):
                # Find the nmod introduced by "by" — the agent phrase.
                nmod_ind_list = nd[4]['nmod']

                case_ind = 0
                case_ind_2 = 0
                for nmod_ind in nmod_ind_list:
                    _case_ind = 0
                    # (BUGFIX: inner loop variable renamed from 'nd' to
                    # 'node' — it shadowed the enclosing loop's 'nd'.)
                    for node in node_list[1:]:
                        if nmod_ind == node[0]:
                            if ('case' in node[4].keys()):
                                _case_ind = node[4]['case'][0]
                                break
                    # check whether the agent is explicitly stated using "by"
                    if _case_ind > 0:
                        if tokens[_case_ind] == 'by':
                            case_ind = _case_ind
                            break
                        else:
                            case_ind_2 = _case_ind
                            
                #import pdb; pdb.set_trace()
                if case_ind == 0:
                    return strs
                #if tokens[case_ind] != 'by':
                #    return strs

                nmod_dict = {}
                for _nd in node_list[1:]: #BUG
                    if nmod_ind == _nd[0]:
                         nmod_dict = _nd[4]
                         break

                #import pdb; pdb.set_trace()
            #if ('case' in nmod_dict.keys()): # 'by'
                #[NOTICE]: connect the nsubj + acl as 1st
                #import pdb; pdb.set_trace()
                # Build the new subject from the agent noun phrase.
                det_ind = 0
                nsubj_compound_list = []
                if ('det' in nmod_dict):
                    det_ind = nmod_dict['det'][0]
                if ('compound' in nmod_dict):
                    nsubj_compound_list = nmod_dict['compound']

                for i in nsubj_compound_list:
                    subj = subj + " " + tokens[i]

                if det_ind:
                    subj = base.upper_first_char(tokens[det_ind]) + " " + subj + tokens[nmod_ind]
                elif tokens[nmod_ind] in dict1:
                    subj = dict1[tokens[nmod_ind]]
                else:
                    subj = subj + " " + tokens[nmod_ind]

                #import pdb; pdb.set_trace()
                # Conjugate the root verb to the tense of the passive
                # auxiliary (or of the root itself), agreeing with the
                # new subject's person/number.
                verb = root
                if len(tenses(root)) > 0:
                    if auxpass_ind != 0:
                        if subj.strip().lower() == 'they':
                            verb = conjugate(root, tenses(tokens[auxpass_ind])[0][0], 2)
                        else:
                            verb = conjugate(root, tenses(tokens[auxpass_ind])[0][0], 3)
                    else:
                        if subj.strip().lower() == 'they':
                            verb = conjugate(root, tenses(root)[0][0], 2)
                        else:
                            verb = conjugate(root, tenses(root)[0][0], 3)

                #import pdb; pdb.set_trace()
                if case_ind_2 > 0:
                    _case_str = " ".join(tokens[case_ind_2:case_ind])
                    strs = subj + " " + verb + " " + nsubjpass.lower() + " " + _case_str + " ."
                else:
                    strs = subj + " " + verb + " " + nsubjpass.lower() + " ."

                return strs
            """
                #[NOTICE]: remove the ',' after the nsubj
                if tokens[nsubj_ind + 1] in PUNCTUATION:
                    tokens[nsubj_ind + 1] = ''

                tokens.insert(nsubj_ind + 1, verb)

                #root_ind = tokens.index(root)
                #_str1 = tokens[nsubj_ind:root_ind]

                if _str1[-1] in PUNCTUATION:
                    _str1[-1] = ''
                str1 =  ' '.join(_str1)
                #print "1st sent: ", str1

                # upper the 1st char in 2nd sent
                _str2 = tokens[root_ind:]
                #w = _w + ' '
                str2 = upper_first_char(subj) + " " + ' '.join(_str2)
                #print "2nd sent: ", str2
            """
                #strs = str1 + ' . ' + str2
            #return strs


    return strs