def tense_of_verb(verb_str):
    """
    Identifies the verb tense of a word, and returns it in a tuple along with
    its base word.

    @param verb_str: a str containing a verb
    @return: a tuple t, where t[0] is 'AUX' if the verb is a special auxiliary
        verb, is '?' if the verb tense cannot be recognized, and otherwise is
        'VBD', 'VBP', or 'VBZ', which correspond to the Penn Treebank P.O.S.
        tags for past tense, non-3rd person present tense, and 3rd person
        present tense.  t[1] is the verb itself for 'AUX', else its lemma.
    """
    # Set literal gives O(1) membership instead of the original list scan.
    aux_verbs = {
        'am', 'is', 'are', 'was', 'were', 'have', 'has', 'had', 'do',
        'does', 'did', 'will', 'would', 'shall', 'should', 'may', 'might',
        'must', 'can', 'could', 'ought',
    }
    if verb_str.lower() in aux_verbs:
        return ('AUX', verb_str)
    # getLemma returns a tuple of candidate lemmas; the first is the most
    # probable one (OOV words are lemmatized by rule, so this is non-empty).
    lemm_str = getLemma(verb_str, upos='VERB')[0]
    # Re-inflect the lemma under each candidate tag and keep the first tag
    # that reproduces the surface form (same order as the original if/elif).
    for tag in ('VBD', 'VBP', 'VBZ'):
        if verb_str in getInflection(lemm_str, tag=tag):
            return (tag, lemm_str)
    return ('?', lemm_str)
def testProperNouns(self):
    """Proper nouns inflect under NNP/NNPS, both in- and out-of-vocabulary."""
    # In-vocabulary: no common-noun (NN) inflections for 'Alaskan'...
    inflections = lemminflect.getInflection('Alaskan', 'NN', inflect_oov=False)
    self.assertEqual(len(inflections), 0)
    # ...but exactly one form each for NNP and NNPS.
    for penn_tag, expected in (('NNP', 'Alaskan'), ('NNPS', 'Alaskans')):
        inflections = lemminflect.getInflection('Alaskan', penn_tag, inflect_oov=False)
        self.assertEqual(len(inflections), 1)
        self.assertEqual(inflections[0], expected)
    # Out-of-vocabulary: the rule-based inflector must produce one form each.
    for penn_tag, expected in (('NNP', 'Axxlaskan'), ('NNPS', 'Axxlaskans')):
        inflections = lemminflect.getInflection('Axxlaskan', penn_tag, inflect_oov=True)
        self.assertEqual(len(inflections), 1)
        self.assertEqual(inflections[0], expected)
    # Same checks through the spaCy token extension, lemmatizing with
    # lemminflect's internal lemmatizer.
    lemminflect.Inflections().setUseInternalLemmatizer(True)
    token = self.nlp('The Alaskan went South.')[1]
    self.assertEqual(token._.inflect('NNPS', inflect_oov=False), 'Alaskans')
    token = self.nlp('The Axxlaskan went South.')[1]
    self.assertEqual(token._.inflect('NNPS', inflect_oov=True), 'Axxlaskans')
def testOverrides(self):
    """Hack the overrides dictionary and check it is honored, then restore it."""
    # Run the inflection system once to assure the overrides is loaded
    # (ie.. lazy loading).  BUG FIX: the original built a no-op tuple
    # `getInflection(...), ('watched',)` here; assert the value instead.
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
    # Keep a reference so the hacked dictionary can be restored afterwards.
    orig_dict = lemminflect.Inflections().overrides_dict
    # Bad UPOS values must log and return empty containers.
    with self.assertLogs():
        lemmas = lemminflect.getLemma('WORD', 'X')
        self.assertEqual(lemmas, ())
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmas('WORD', 'X')
        self.assertEqual(lemmas, {})
    with self.assertLogs():
        lemmas = lemminflect.getAllLemmasOOV('WORD', 'X')
        self.assertEqual(lemmas, {})
    token = self.nlp('I')[0]
    self.assertEqual(token._.lemma(), 'I')
    # Hack the code to replace the overrides dictionary
    lemminflect.Inflections().overrides_dict = {'watch': {'VBD': ('xxx', )}}
    inflections = lemminflect.getInflection('watch', 'VBD', inflect_oov=False)
    self.assertEqual(inflections, ('xxx', ))
    # put the original dictionary back
    lemminflect.Inflections().overrides_dict = orig_dict
def testGetInflection04(self):
    """'watch' has no adjective forms unless OOV rules are allowed."""
    self.assertEqual(lemminflect.getAllInflections('watch', 'ADJ'), {})
    # JJ yields nothing from the lookup data; with OOV rules the word
    # itself comes back.
    for allow_oov, expected in ((False, ()), (True, ('watch', ))):
        self.assertEqual(
            lemminflect.getInflection('watch', 'JJ', inflect_oov=allow_oov),
            expected)
    # Verb lookup is unaffected.
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched', ))
def testGetInflectionOOV(self):
    """Out-of-vocabulary inflection rules produce the expected forms."""
    # Without OOV rules, unknown lemmas yield nothing.
    self.assertEqual(lemminflect.getInflection('xxbike', 'NN', inflect_oov=False), ())
    self.assertEqual(lemminflect.getInflection('xxbike', 'NNS', inflect_oov=False), ())
    # With inflect_oov=True the morphology rules apply; each case is
    # labelled with the rule family it exercises.
    cases = [
        ('xxbike', 'NN', 'xxbike'),            # reg
        ('xxbike', 'NNS', 'xxbikes'),          # reg
        ('xxbaggy', 'JJR', 'xxbaggier'),       # reg
        ('xxclean', 'RBS', 'xxcleanest'),      # reg
        ('xxformat', 'VBG', 'xxformatting'),   # regd
        ('xxbacklog', 'VBD', 'xxbacklogged'),  # regd
        ('xxgenesis', 'NNS', 'xxgeneses'),     # glreg
        ('xxalumus', 'NNS', 'xxalumi'),        # glreg
    ]
    for lemma, penn_tag, expected in cases:
        self.assertEqual(
            lemminflect.getInflection(lemma, penn_tag, inflect_oov=True),
            (expected, ))
def match_pronoun_present(verb_str: str, pronoun_str: str) -> str:
    """
    Returns a verb form that matches the passed pronoun.
    This function should only be used for present tense.

    Falls back to returning *verb_str* unchanged whenever the lemma or the
    inflection is ambiguous (a warning is logged in those cases).
    """
    pronoun_str = pronoun_str.lower()
    # Membership test directly on the dict; the original `.keys()` call
    # was redundant.
    if pronoun_str not in __pronoun_to_verb_upenn_dict:
        raise ValueError(
            'Unexpected value for pronoun "{}"'.format(pronoun_str))
    # Split off a trailing negation (handled by the project helper) so only
    # the affirmative part is lemmatized and inflected.
    aff_verb_str, negation_str = split_verb_negation(verb_str)
    if is_modal_verb(aff_verb_str):
        # Modal verbs do not agree with the subject; keep as-is.
        return verb_str
    lemma_lst = __collapse_lemma_list(getLemma(aff_verb_str, "VERB"))
    if len(lemma_lst) != 1:
        logging.warning(
            'WARNING: Ambigous or no lemma for "{}". Output was {}. Keeping original verb.'
            .format(verb_str, lemma_lst))
        return verb_str
    lemma_str = lemma_lst[0]
    inflect_lst = getInflection(lemma_str,
                                __pronoun_to_verb_upenn_dict[pronoun_str])
    # Truthiness check replaces the original `not len(inflect_lst)`.
    if len(inflect_lst) > 2 or not inflect_lst:
        logging.warning(
            'WARNING: Ambigous or no inflection list for lemma "{}" from verb "{}". Output was {}. Keeping original verb.'
            .format(lemma_str, verb_str, inflect_lst))
        return verb_str
    if len(inflect_lst) == 2:
        # Two forms: index 0 is used for 'i', index 1 for every other
        # pronoun — presumably the "am"/"are" alternation; confirm against
        # lemminflect's VBP output.
        new_verb_str = inflect_lst[0] if pronoun_str == 'i' else inflect_lst[1]
    else:
        new_verb_str = inflect_lst[0]
    return merge_verb_negation(new_verb_str, negation_str)
def filter_out_tense(self, sent, so, eo, candidates):
    """
    Filter *candidates* for the span sent[so:eo]: drop words sharing a stem
    with the original (or with an earlier candidate) and re-inflect the rest
    to match the original word's POS tag.
    """
    # Stem of the original span; candidates with the same stem are dropped.
    stems = [self.ps.stem(sent[so:eo])]
    out = []
    # POS tag of the original span, computed once outside the loop.
    word_tag = nltk.pos_tag([sent[so:eo]])[0][1]
    for word in candidates:
        cand_stem = self.ps.stem(word)
        if cand_stem in stems:
            continue  # duplicate stem — skip
        stems.append(cand_stem)
        try:
            cand_tag = self.tag_for_lemmatizer(word)
            # BUG FIX: the original condition was inverted — it inflected
            # when cand_tag was None (feeding pos=None to the lemmatizer)
            # and kept the raw word when a usable tag WAS available.
            if cand_tag is not None:
                out.append(
                    getInflection(self.lem.lemmatize(word, pos=cand_tag),
                                  tag=word_tag)[0])
            else:
                out.append(word)
        except IndexError:
            # Lemminflect does not support all POS tags - lemminflect.readthedocs.io/en/latest/tags/
            out.append(word)
            logger.debug(
                "ERROR: Lemminflect cannot convert {} with type {}, skipping"
                .format(word, word_tag))
    return out
def inflect(string, mode):
    """
    Inflect a word in *string* according to *mode*.

    In a multi-word string only the word wrapped in [brackets] is inflected;
    a single word needs no brackets.  If a local override exists for the
    word/mode pair, the override is returned directly.
    """
    # Mode -> Penn Treebank tag; 'inf' (and unknown modes) leave the word
    # unchanged, matching the original if/elif chain.
    tag_for_mode = {
        'ppart': 'VBN',
        'part': 'VBG',
        '3sg': 'VBZ',
        'sg': 'NN',
        'pl': 'NNS',
        'mass': 'NN',
        'singleton': 'NN',
    }
    words = string.split(" ")
    for i, word in enumerate(words):
        # ROBUSTNESS FIX: startswith/endswith tolerate the empty tokens
        # produced by doubled spaces, where word[0]/word[-1] raised
        # IndexError.
        if word.startswith("[") and word.endswith("]"):
            words[i] = word[1:-1]
        elif len(words) > 1:
            continue  # only the bracketed word is inflected in a phrase
        # Local checking for forms 3rd party library does wrong
        override = override_inflection(words[i], mode)
        if override is not None:  # idiom fix: was `!= None`
            return override
        tag = tag_for_mode.get(mode)
        if tag is not None:
            words[i] = lemminflect.getInflection(words[i], tag=tag)[0]
    return " ".join(words)
def testGetInflection03(self):
    """getAllInflections with and without a UPOS filter, for 'watch'."""
    expected_all = {
        'NNS': ('watches', 'watch'),
        'NN': ('watch',),
        'VBD': ('watched',),
        'VBG': ('watching',),
        'VBZ': ('watches',),
        'VB': ('watch',),
        'VBP': ('watch',),
    }
    self.assertEqual(lemminflect.getAllInflections('watch'), expected_all)
    # Restricting to VERB keeps exactly the VB* entries.
    expected_verbs = {tag: forms for tag, forms in expected_all.items()
                      if tag.startswith('VB')}
    self.assertEqual(lemminflect.getAllInflections('watch', 'VERB'),
                     expected_verbs)
    self.assertEqual(lemminflect.getInflection('watch', 'VBD'), ('watched',))
    # No adjective entries exist for 'watch'.
    self.assertEqual(lemminflect.getAllInflections('watch', 'ADJ'), {})
def get_legal_words(tag: str,
                    second_tag: Optional[str] = None,  # also counterbalance list of other word forms (e.g. plural)
                    seed: int = configs.Data.seed,
                    exclude: Optional[Tuple[str, ...]] = None,
                    verbose: bool = False,
                    ) -> Union[List[str], List[Tuple[str, str]]]:
    """Return a corpus-frequency-counterbalanced subset of "legal" words for *tag*.

    Reads the per-tag CSV of legal words, optionally pairs each word with a
    second form (plural for 'NNP', or a verb inflection for 'VB*' tags),
    filters the pairs, and delegates the final counterbalanced selection to
    find_counterbalanced_subset.

    Raises AttributeError for an unsupported *second_tag* value.
    """
    print(f'Obtaining counterbalanced subset of legal words with tag={tag} and second_tag={second_tag}')
    # get words with requested tag and order
    df_legal = pd.read_csv(configs.Dirs.legal_words / f'{tag}.csv')
    # Boolean mask from the 'is_legal' column selects the usable rows.
    bool_ids = df_legal['is_legal'].astype(bool).tolist()
    first_forms_ = df_legal['word'][bool_ids].tolist()
    # exclude any words ?
    if exclude:
        first_forms_ = [w for w in first_forms_ if w not in exclude]
    # also counterbalance 2nd forms of words ?
    if second_tag is None:
        second_forms_ = None
    elif second_tag == 'NNP':
        # Plural forms via the `inflect` package's engine.
        plural = inflect.engine()
        second_forms_ = [plural.plural(w) for w in first_forms_]
    elif second_tag.startswith('VB'):
        # lemminflect needs a lemma, so lemmatize first, then inflect.
        lemmas = [getLemma(w, upos='VERB')[0] for w in first_forms_]
        second_forms_ = [getInflection(lemma, tag=second_tag)[0] for lemma in lemmas]  # requires lemma as input
    else:
        raise AttributeError('Invalid arg to second_tag')
    # remove words if their 2nd form is not in vocab or if it is identical to 1st form
    # NOTE(review): `vocab` is a module-level collection — presumably the
    # corpus vocabulary; confirm where it is defined.
    if second_tag is not None:
        first_forms = []
        second_forms = []
        for w1, w2 in zip(first_forms_, second_forms_):
            if w2 in vocab and w2 != w1:
                first_forms.append(w1)
                second_forms.append(w2)
                if verbose:
                    print(f'Included {w1:<12} and {w2:<12}')
        # Filtering must leave at least one usable pair.
        assert first_forms
        assert second_forms
    else:
        first_forms = first_forms_
        second_forms = second_forms_
    # find subset of words such that their total corpus frequencies are approx equal across corpora
    num_words_in_sample = configs.Data.tag2num_words[tag]
    res = find_counterbalanced_subset(first_forms,
                                      min_size=num_words_in_sample,
                                      max_size=num_words_in_sample + 100,
                                      second_forms=second_forms,
                                      seed=seed,
                                      verbose=verbose,
                                      )
    return res
def sample_verb(tag_list, source_tag, source):
    """
    Randomly pick an inflection of *source* under a tag different from
    *source_tag*; returns None when no candidate form exists.
    """
    candidate_tags = [tag for tag in tag_list if tag != source_tag]
    tag = rd.choice(candidate_tags)
    cand_list = getInflection(source, tag)
    # BUG FIX: the original tested `cand_list == []`, which is never true
    # because getInflection returns a tuple — the OOV fallback never ran.
    if not cand_list:
        # BUG FIX: the original passed dict_values of tuples straight to
        # rd.choice, which raises TypeError (not a sequence) and would have
        # produced a tuple rather than a word.  Flatten into a word list.
        cand_list = [form
                     for forms in getAllInflectionsOOV(source, upos='VERB').values()
                     for form in forms]
    return rd.choice(cand_list) if cand_list else None
def inflection(pred_lemma, pred_pos, pred_word):
    """
    Return the present-participle (VBG) form of *pred_lemma* for verbs;
    for any other POS, return *pred_word* unchanged.
    """
    if pred_pos == "VERB":
        # to cater to the errors in the lemma: a "lemma" that already ends
        # in -ing was not really lemmatized, so keep the original word.
        # (Checked BEFORE inflecting — the original inflected first, so it
        # could raise IndexError even for these words.)
        if pred_lemma.lower().endswith("ing"):
            return pred_word
        # Guard the empty tuple lemminflect returns for uninflectable input;
        # the original indexed [0] unconditionally.  Also removed the dead
        # bare `return` that followed the if/else.
        forms = getInflection(pred_lemma, tag='VBG')
        return forms[0] if forms else pred_word
    return pred_word
def testUPOSLog(self):
    """An invalid UPOS must log a warning and return an empty container."""
    with self.assertLogs():
        self.assertEqual(lemminflect.getInflection('WORD', 'X'), ())
    with self.assertLogs():
        self.assertEqual(lemminflect.getAllInflections('WORD', 'X'), {})
    with self.assertLogs():
        self.assertEqual(lemminflect.getAllInflectionsOOV('WORD', 'X'), {})
    # The spaCy extension falls back to the token's own text.
    token = self.nlp('testing')[0]
    self.assertEqual(token._.inflect('X'), 'testing')
def inflect_lemma(self, lemma, tag=None, pos=None):
    """
    Return all inflections of *lemma* as a list: by exact Penn tag when
    *tag* is given, otherwise by universal POS (*pos* may be None, meaning
    every part of speech).
    """
    if tag:
        # Specific tag requested: a single lemminflect lookup suffices.
        return list(lemminflect.getInflection(lemma, tag=tag))
    # POS-based lookup returns {tag: (forms...)}; flatten into one list.
    return [form
            for forms in lemminflect.getAllInflections(lemma, upos=pos).values()
            for form in forms]
def convert_tokens_to_string(self, tokens):
    """
    Join wordpiece tokens back into a space-separated string, applying
    inline inflection markers (tokens in self.inflection_tokens, shaped
    like '[TAG]') to the preceding word.
    """
    result = []
    for i, token in enumerate(tokens):
        # combine wordpiece tokens
        if len(token) > 2 and token[:2] == '##':
            if result:
                result[-1] += token[2:]
            else:
                result.append(token[2:])
            continue
        if token in self.inflection_tokens:
            # ROBUSTNESS FIX: also require result to be non-empty — with
            # `i != 0` alone, result[-1] raised IndexError when every
            # earlier token had been a dropped inflection marker.
            if i != 0 and result:
                inflected = getInflection(result[-1], tag=token[1:-1])
                if inflected:
                    result[-1] = inflected[0]
        else:
            result.append(token)
    return ' '.join(result)
def get_lemminflect(token):
    """
    Collect alternative surface forms of a spaCy *token*: inflections of its
    lemma under every other tag of the same POS family, excluding forms that
    match the token's own text case-insensitively.
    """
    if token.pos_ not in REPLACE_POS:
        return []
    alternatives = set()
    for candidate_tag in POS_TO_TAGS[token.pos_]:
        if candidate_tag == token.tag_:
            continue  # skip the tag the token already carries
        for form in getInflection(token.lemma_, tag=candidate_tag):
            if form.lower() != token.text.lower():
                alternatives.add(form)
    return list(alternatives)
def detokenize(self, tokens: List[str], as_list: bool = False) -> Union[str, List[str]]:
    """
    Rebuild text from *tokens*: map single-char placeholders back through
    self.reverse_single_char_map, apply inflection markers ('[TAG]') to the
    preceding word, then either return the token list, detokenize with
    Moses, or join with spaces depending on configuration.
    """
    result = []
    for i, token in enumerate(tokens):
        # Map placeholder characters back to their original tokens.
        if token in self.reverse_single_char_map:
            token = self.reverse_single_char_map[token]
        if token in self.inflection_tokens:
            # ROBUSTNESS FIX: also require result to be non-empty — with
            # `i != 0` alone, result[-1] raised IndexError when all earlier
            # tokens were dropped inflection markers.
            if i != 0 and result:
                inflected = getInflection(result[-1], tag=token[1:-1])
                if inflected:
                    result[-1] = inflected[0]
        else:
            result.append(token)
    if as_list:
        # Allow users to detokenize using their own detokenizers
        return result
    if self.pretok_type == 'moses':
        return self.detokenizer.detokenize(result)
    return ' '.join(result)
def __call__(self, sent, index):
    """
    Error-generation step: replace the token at *index* with a randomly
    chosen different inflection of its verb, optionally inserting a
    "have (been)"-style auxiliary before VBG/VBN forms.
    """
    # Prefer an already-overridden surface form, else fall back to the lemma.
    if sent[index].org is not None:
        word = sent[index].org
    else:
        word = sent[index].lemma
    # Pick a replacement inflection with a tag different from the current one.
    cand = None
    source = word.lower()
    if source != '':
        source_tag = sent[index].tag
        tag_list = [tag for tag in self.tag_list if tag != source_tag]
        tag = rd.choice(tag_list)
        cand_list = getInflection(source, tag)
        # BUG FIX: was `cand_list == []`, never true for a tuple, so the OOV
        # fallback never ran; and rd.choice on dict_values raises TypeError
        # (and would yield a tuple, not a word) — flatten the values.
        if not cand_list:
            cand_list = [form
                         for forms in getAllInflectionsOOV(source, upos='VERB').values()
                         for form in forms]
        if len(cand_list) > 0:
            cand = rd.choice(cand_list)
    # Apply the replacement, preserving title-casing of the original word.
    if cand is not None:
        if word.istitle():
            cand = cand.title()
        sent[index].org = cand
        if ((index >= 1 and sent[index - 1].pos != 'AUX')
                and (index >= 2 and sent[index - 2].pos != 'AUX')
                and self.sampler() < self.aux_ratio):
            # If there is no AUX just before and the new form is VBG/VBN,
            # insert an inflection of "have (been)" immediately before it.
            # (translated from the original Japanese comment)
            if tag == 'VBG':
                sent[index].addition.append(
                    EnToken(index=sent[index].index - 0.25,
                            org=self.vbg_sampler()))
            elif tag == 'VBN':
                sent[index].addition.append(
                    EnToken(index=sent[index].index - 0.25,
                            org=self.vbn_sampler()))
    # NOTE(review): history is recorded unconditionally here; confirm whether
    # it should apply only when a replacement actually happened.
    sent[index] = self.add_history(sent[index])
    return sent
def inflect(self, tag):
    """Re-inflect this token under *tag*, updating text and trailing whitespace."""
    self.tag = tag
    # First candidate form from lemminflect (raises IndexError if none —
    # same behavior as the original).
    forms = lemminflect.getInflection(self.lemma, tag)
    self.text = forms[0]
    self.text_with_ws = self.text + self.whitespace
def sample_cand(tag_list, source_tag, source):
    """
    Randomly pick an inflection of *source* under a tag different from
    *source_tag*.  Returns None when lemminflect has no form — mirroring
    sample_verb — instead of letting rd.choice raise IndexError on the
    empty tuple, as the original did.
    """
    remaining_tags = [tag for tag in tag_list if tag != source_tag]
    tag = rd.choice(remaining_tags)
    cand_list = getInflection(source, tag)
    if not cand_list:
        return None
    return rd.choice(cand_list)
def api_getInflection():
    """
    JSON endpoint: expects {"lemma": str, "tag": str, "inflect_oov": bool?}
    and returns lemminflect's inflection tuple as JSON.

    Robustness fix: 'inflect_oov' is now optional and defaults to True
    (lemminflect's own default) instead of raising KeyError when omitted.
    """
    content = request.json
    result = getInflection(content['lemma'],
                           content['tag'],
                           content.get('inflect_oov', True))
    return jsonify(result)
import json
import codecs

# Load the lemmatized (subject, verb, object) triples.  The built-in open()
# handles the BOM-stripping 'utf-8-sig' codec directly; the legacy
# codecs.open() call is unnecessary.
with open('svo_triples_lemmatised.txt', 'r', encoding='utf-8-sig') as json_file:
    svo_triples = json.load(json_file)

# Drop the first three triples — presumably header/noise entries; confirm
# against the data file.
svo_triples = svo_triples[3:]

pos_sentences = []
neg_sentences = []
passive_sentences = []
swapped_sentences = []

for triple in svo_triples:
    # Inflect each element; lemminflect returns () for out-of-vocabulary
    # words because inflect_oov=False.
    triple_subject = lem.getInflection(triple[0], tag='NNS', inflect_oov=False)
    triple_verb = lem.getInflection(triple[1], tag='VBD', inflect_oov=False)
    triple_negative_verb = lem.getInflection(triple[1], tag='VB', inflect_oov=False)
    triple_passive_verb = lem.getInflection(triple[1], tag='VBN', inflect_oov=False)
    triple_object = lem.getInflection(triple[2], tag='NNS', inflect_oov=False)
    # Only triples whose subject, verb and object are all in-vocabulary
    # produce a sentence.
    if triple_subject != () and triple_verb != () and triple_object != ():
        #print((triple_subject,triple_verb,triple_object))
        #print(triple)
        pos_sentences += [
            triple_subject[0].capitalize() + " " + triple_verb[0] + " "
            + triple_object[0] + "."
        ]
async def postInflection(word: Word, pos: PartOfSpeech = Body(...)):
    """Inflect *word* under the Penn tag carried by *pos*; return it as JSON."""
    result = getInflection(word.text, pos.tag)
    return {"inflection": result}
def inflection(pred_lemma, pred_pos, pred_word):
    """
    Return the present-participle (VBG) form of *pred_lemma* for verbs,
    otherwise *pred_word* unchanged.
    """
    if pred_pos == "VERB":
        forms = getInflection(pred_lemma, tag='VBG')
        # Robustness fix: fall back to the original word when lemminflect
        # has no VBG form — the original indexed [0] unconditionally and
        # raised IndexError on the empty tuple.
        return forms[0] if forms else pred_word
    return pred_word
def testGetInflection02(self):
    """Singular lookup yields one form; plural lookup also includes the
    uninflected form recorded in the data."""
    expected = {
        'NN': ('squirrel', ),
        'NNS': ('squirrels', 'squirrel'),
    }
    for penn_tag, forms in expected.items():
        self.assertEqual(lemminflect.getInflection('squirrel', penn_tag),
                         forms)