def __init__(self):
    self.default_conjugator = mlconjug.Conjugator(language='en')
    self.lemmatizer = WordNetLemmatizer()
    # self.lemma_exceptions = {'am': 'be', 'are': 'be', 'is': 'be'}
    self.lemma_exceptions = {}
    self.prons_to_flip = {
        'your': 'my',
        'my': 'your',
        'yours': 'mine',
        'mine': 'yours',
        'there': 'here',
        'here': 'there'
    }
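# A hypothetical sketch (the flip_pronouns helper is illustrative, not part
# of the original class) of how a prons_to_flip table like the one above can
# mirror perspective words in a tokenized utterance.
prons_to_flip = {
    'your': 'my', 'my': 'your',
    'yours': 'mine', 'mine': 'yours',
    'there': 'here', 'here': 'there'
}

def flip_pronouns(tokens):
    # swap each token that has a counterpart; leave the rest unchanged
    return [prons_to_flip.get(t, t) for t in tokens]

print(flip_pronouns(['your', 'book', 'is', 'here']))
# ['my', 'book', 'is', 'there']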
def test_get_verb_conjug(mocker):
    verb = "be"
    conjug = mlconjug.Conjugator(language="en")
    conjug_verb = conjug.conjugate(verb)
    mocker.patch.object(mlconjug.Conjugator, "conjugate", return_value=conjug_verb)
    mocker.patch.object(mlconjug.PyVerbiste.VerbEn, 'iterate', return_value=[(verb, verb)])
    list_conjugs = action_extractor.get_verb_conjug([verb])
    assert mlconjug.Conjugator.conjugate.call_count == 1
    assert mlconjug.PyVerbiste.VerbEn.iterate.call_count == 1
    assert list_conjugs == [verb]
def get_verb_conjug(verb_list):
    """
    Conjugates the list of verbs that it receives as input.

    :param verb_list: list of verbs that we want to conjugate
    :return: list that contains all conjugations of the input verbs
    """
    verbs = []
    # Build the conjugator once; constructing it inside the loop would
    # reload the pre-trained model for every verb.
    default_conjugator = mlconjug.Conjugator(language='en')
    for verb in verb_list:
        test_verb = default_conjugator.conjugate(verb)
        all_conjugated_forms = test_verb.iterate()
        verbs.append(list(set(verb_tuple[-1] for verb_tuple in all_conjugated_forms)))
    verbs = list(itertools.chain.from_iterable(verbs))
    return verbs
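# A minimal usage sketch for get_verb_conjug above, assuming mlconjug's
# English model is installed. The exact contents depend on the model, and
# ordering is not guaranteed because the forms pass through a set().
conjugations = get_verb_conjug(['sit'])
print(len(conjugations), sorted(conjugations)[:5])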
def __init__(self):
    global conjugator
    global tenseKeyDict
    global subjectKeysDict
    conjugator = mlconjug.Conjugator(language='es')
    tenseKeyDict = {
        'preterite': ['Indicativo', 'Indicativo pretérito perfecto simple'],
        'present': ['Indicativo', 'Indicativo presente'],
        'imperfect': ['Indicativo', 'Indicativo pretérito imperfecto'],
        'subjunctive': ['Subjuntivo', 'Subjuntivo presente']
    }
    subjectKeysDict = {
        'yo': '1s',
        'tu': '2s',
        'el/ella': '3s',
        'nosotros': '1p',
        'ellos/ellas': '3p'
    }
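# A minimal sketch of how tenseKeyDict and subjectKeysDict above combine to
# look up a single Spanish form, assuming __init__ has already run.
mood, tense = tenseKeyDict['preterite']
person = subjectKeysDict['yo']
print(conjugator.conjugate('hablar').conjug_info[mood][tense][person])
# expected: 'hablé'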
def try_fix_form(word_pos, syn_pos):
    word = word_pos[0]  # the word itself is unused; only its POS tag matters
    syn = syn_pos[0]
    pos_tag_word = word_pos[1]
    pos_tag_syn = syn_pos[1]
    if pos_tag_syn != pos_tag_word:
        # Check whether the word is just the plural (noun) or
        # superlative (adjective) version of the synonym's tag
        if pos_tag_word == pos_tag_syn + 'S':
            if pos_tag_syn.startswith('J'):
                return en.superlative(syn)
            elif pos_tag_syn.startswith('N'):
                return en.pluralize(syn)
        return None if pos_tag_syn[:2] != pos_tag_word[:2] else syn
    else:
        if not pos_tag_syn.startswith('V'):
            return syn
        # The tags match and the word is a verb, so conjugate the synonym
        # into the same form as the original word
        default_conjugator = mlconjug.Conjugator(language='en')
        conjug_info = default_conjugator.conjugate(syn).conjug_info
        if pos_tag_word in ('VB', 'VBP'):
            return conjug_info['indicative']['indicative present']['1s']
        elif pos_tag_word == 'VBG':
            return conjug_info['indicative']['indicative present continuous']['1s 1s']
        elif pos_tag_word == 'VBN':
            return conjug_info['indicative']['indicative present perfect']['1p']
        elif pos_tag_word == 'VBZ':
            return conjug_info['indicative']['indicative present']['3s']
        elif pos_tag_word == 'VBD':
            return conjug_info['indicative']['indicative past tense']['1s']
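# A short usage sketch for try_fix_form, assuming pattern.en is imported as
# `en` as in the function above; the verb output is what the model is
# expected to predict for a regular verb.
print(try_fix_form(('cats', 'NNS'), ('dog', 'NN')))       # expected: 'dogs'
print(try_fix_form(('biggest', 'JJS'), ('large', 'JJ')))  # expected: 'largest'
print(try_fix_form(('runs', 'VBZ'), ('hurry', 'VBZ')))    # expected: 'hurries'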
import mlconjug

# Set up the conjugator, specifying the language:
conjugator = mlconjug.Conjugator(language="pt")

# Get all the possible conjugations organized in a dictionary
# structured as modo verbal > tempo verbal > pessoa verbal
# (verbal mood > verbal tense > verbal person)
verbo_ver_information = conjugator.conjugate("ver").conjug_info

# Print every single conjugated form
for modo_verbal, tempos_verbais in verbo_ver_information.items():
    print(str(modo_verbal).upper(), ':', sep='')
    for tempo_verbal, pessoas_verbais in tempos_verbais.items():
        print(tempo_verbal, ':', sep='')
        for pessoa_verbal in pessoas_verbais.items():
            print(pessoa_verbal)
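# The same nested information can also be walked with Verb.iterate(), which
# flattens conjug_info into tuples whose last element is the conjugated form
# (tuple length can vary by mood and tense).
for entry in conjugator.conjugate("ver").iterate():
    print(entry)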
new_alias = "%s>%s" % (p.plural(w1), p.plural(w2)) myvocabulary["NNS"].append(new_alias) else: myvocabulary["NNS"].append(p.plural(word)) ### conjugation ## silencing some scipy warnings import warnings warnings.filterwarnings("ignore") ### (VBG) import mlconjug conj = mlconjug.Conjugator(language='en') def get_vbg(baseform): return conj.conjugate(baseform).conjug_info['indicative'][ 'indicative present continuous']['1s 1s'] for word in myvocabulary["VB"]: if ">" in word: w1, w2 = word.split(">") new_alias = "%s>%s" % (get_vbg(w1), get_vbg(w2)) myvocabulary["VBG"].append(new_alias) else: myvocabulary["VBG"].append(get_vbg(word))
def run(inputFilename, input_main_dir_path, outputPath, search_by_dictionary_var,
        search_by_keyword_var, keyword, lemmatization):
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running the file search script at', True)
    if input_main_dir_path == '' and inputFilename != '':
        inputDir = os.path.dirname(inputFilename)
        files = [inputFilename]
    elif input_main_dir_path != '':
        inputDir = input_main_dir_path
        files = IO_files_util.getFileList(inputFilename, inputDir, 'txt')
        if len(files) == 0:
            return
    for file in files:
        if search_by_dictionary_var:
            break
        if search_by_keyword_var:
            output_dir_path = inputDir + os.sep + "search_result_csv"
            if not os.path.exists(output_dir_path):
                os.mkdir(output_dir_path)
            if file[-4:] != '.txt':
                continue
            kwtokens = word_tokenize(keyword)
            kwlist = []  # list of lists holding the conjugated forms of each token in the keyword phrase
            default_conjugator = mlconjug.Conjugator(language='en')
            for token in kwtokens:
                conjus = default_conjugator.conjugate(token)
                formlist = conjus.iterate()
                forms = []
                for form in formlist:
                    forms.append(form[-1])
                kwlist.append(list(dict.fromkeys(forms)))  # remove repetitions
            csvtitle = outputPath + '/' + os.path.split(
                os.path.split(outputPath)[0])[1] + "_" + keyword + '.csv'
            if lemmatization:
                csvtitle = outputPath + '/' + os.path.split(
                    os.path.split(outputPath)[0])[1] + "_" + keyword + '_lemma.csv'
            csvExist = os.path.exists(csvtitle)
            with open(csvtitle, "a", newline="", encoding='utf-8', errors='ignore') as csvFile:
                writer = csv.writer(csvFile)
                if not csvExist:
                    writer.writerow(["Document ID", "Document", "Sentence ID", "SENTENCE",
                                     "SEARCH_WORD", "LEMMATIZED",
                                     "Sentence ID of FIRST_OCCURRENCE",
                                     "RELATIVE_POSITION", "FREQUENCY of OCCURRENCE"])
                    docIndex = 1
                else:
                    df = pd.read_csv(csvtitle, encoding="ISO-8859-1")
                    if len(df) == 0:
                        docIndex = 1
                    else:
                        docIndex = df.iloc[-1][0] + 1
                first_occurrence_index = 0
                frequency = 0
                contents = []
                head, docname = os.path.split(inputFilename)
                title = docname.partition('.')[0]
                f = open(file, "r", encoding='utf-8', errors='ignore')
                docText = f.read()
                f.close()
                sentences_ = sent_tokenize(docText)  # the list of sentences in the corpus
                sentence_index = 1
                for sent in sentences_:
                    tokens_ = word_tokenize(sent)
                    kwindex = 0
                    kw = False
                    form = ''
                    for token in tokens_:
                        t = token.lower()
                        if kwindex == len(kwlist):
                            break
                        # Two ways to recognize the keyword:
                        # (1) the corpus form matches an item in the conjugation list (verbs)
                        # (2) the lemmatized corpus form matches the keyword token (nouns, adjectives)
                        if t == kwtokens[kwindex] or (lemmatization and
                                (t in kwlist[kwindex] or kwtokens[kwindex] == wordnet.morphy(t))):
                            kw = True
                            kwindex += 1
                            form += t + " "
                        else:
                            kw = False
                            kwindex = 0
                            form = ''
                    if len(form) > 0:
                        form = form[:-1]  # drop the trailing space
                    if kw:  # the keyword was detected in this sentence
                        frequency += 1
                        if frequency == 1:
                            first_occurrence_index = sentence_index
                        writer.writerow([docIndex, file, sentence_index, sent, keyword,
                                         form if lemmatization else '',
                                         first_occurrence_index,
                                         sentence_index / len(sentences_), frequency])
                    else:
                        writer.writerow([docIndex, file, sentence_index, sent,
                                         '', '', '', '', ''])
                    sentence_index += 1
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis end',
                                       'Finished running the file search script at', True)
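# A self-contained sketch of the phrase-matching core used in run() above:
# kwindex advances while consecutive sentence tokens match the keyword
# tokens (or any of their conjugated forms) and resets on a mismatch.
# Simplified here to report only complete phrase matches.
def phrase_matches(tokens, kwtokens, kwlist):
    kwindex = 0
    for t in (token.lower() for token in tokens):
        if kwindex == len(kwtokens):
            break
        if t == kwtokens[kwindex] or t in kwlist[kwindex]:
            kwindex += 1
        else:
            kwindex = 0
    return kwindex == len(kwtokens)

print(phrase_matches(['She', 'sat', 'down'], ['sit'],
                     [['sit', 'sits', 'sat', 'sitting']]))  # True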
import mlconjug

# To use mlconjug with the default parameters and a pre-trained conjugation model:
# default_conjugator = mlconjug.Conjugator(language='fr')

# Verify that the model works:
# test1 = default_conjugator.conjugate("manger").conjug_info['Indicatif']['Passé Simple']['1p']
# test2 = default_conjugator.conjugate("partir").conjug_info['Indicatif']['Passé Simple']['1p']
# test3 = default_conjugator.conjugate("facebooker").conjug_info['Indicatif']['Passé Simple']['1p']
# test4 = default_conjugator.conjugate("astigratir").conjug_info['Indicatif']['Passé Simple']['1p']
# test5 = default_conjugator.conjugate("mythoner").conjug_info['Indicatif']['Passé Simple']['1p']
# print(test1)
# print(test2)
# print(test3)
# print(test4)
# print(test5)

# You can iterate over all conjugated forms of a verb by using the
# newly added Verb.iterate() method.
default_conjugator = mlconjug.Conjugator(language='en')
test_verb = default_conjugator.conjugate("sit")
all_conjugated_forms = test_verb.iterate()
print(all_conjugated_forms)
class Conjug:
    # static members shared by all instances
    __conjugator = mlconjug.Conjugator(language='es')
    __tense = "Indicativo pretérito perfecto simple"
    __record = {}  # cache: verb -> (person, conjugated form)

    # enum wasn't working the way I expected
    first_singular = "1s"
    second_singular = "2s"
    third_singular = "3s"
    first_plural = "1p"
    second_plural = "2p"
    third_plural = "3p"

    # reflexive pronoun prefix for each person
    abbrev = {
        first_singular: "me ",
        second_singular: "te ",
        third_singular: "se ",
        first_plural: "nos ",
        second_plural: "os ",
        third_plural: "se "
    }

    # hard-coded preterite forms for "pasar" (and reflexive "pasarse")
    __pasar_forms = {
        first_singular: "pasé",
        second_singular: "pasaste",
        third_singular: "pasó",
        first_plural: "pasamos",
        second_plural: "pasasteis",
        third_plural: "pasaron"
    }

    def __init__(self):
        pass

    def get_all(self, infinitive: str):
        return {
            person: self.get(person, infinitive)
            for person in (Conjug.first_singular, Conjug.second_singular,
                           Conjug.third_singular, Conjug.first_plural,
                           Conjug.second_plural, Conjug.third_plural)
        }

    def get(self, part_of_speech: str, infinitive: str) -> str:
        # serve from the cache when this verb/person was already conjugated
        if infinitive.lower() in Conjug.__record:
            if Conjug.__record[infinitive.lower()][0] == part_of_speech.lower():
                return Conjug.__record[infinitive.lower()][1]
        og_verb = infinitive
        # strip a reflexive "-se" ending and remember it for the pronoun prefix
        reflexive = infinitive.endswith("se")
        if reflexive:
            infinitive = infinitive[:-2]
        prefix = Conjug.abbrev.get(part_of_speech, "") if reflexive else ""
        if infinitive == "pasar" and part_of_speech in Conjug.__pasar_forms:
            correct = prefix + Conjug.__pasar_forms[part_of_speech]
            Conjug.__record.update({og_verb.lower(): (part_of_speech.lower(), correct)})
            return correct
        try:
            iterable = Conjug.__conjugator.conjugate(infinitive).iterate()
        except Exception:
            return ''
        for conjugation in iterable:
            if conjugation[1] == Conjug.__tense and conjugation[2] == part_of_speech:
                correct = prefix + conjugation[3]
                Conjug.__record.update({og_verb.lower(): (part_of_speech.lower(), correct.lower())})
                return correct
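# A short usage sketch for the Conjug class above, assuming mlconjug's
# Spanish model is available; the outputs are the expected preterite forms.
c = Conjug()
print(c.get(Conjug.first_singular, "hablar"))   # expected: 'hablé'
print(c.get(Conjug.third_singular, "pasarse"))  # 'se pasó' (hard-coded path)
print(c.get_all("comer"))                       # all six preterite forms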
def run(inputFilename, outputPath, keyword, first_occurrence, lemmatization=True):
    # strip characters that are illegal in file names from the keyword
    title_keyword = keyword
    for letter in '<>:"/\\|?*':
        title_keyword = title_keyword.replace(letter, "")
    kwtokens = word_tokenize(keyword.lower())
    kwlist = []  # list of lists holding the conjugated forms of each token in the keyword phrase
    default_conjugator = mlconjug.Conjugator(language='en')
    if first_occurrence:
        outputPathone = outputPath + "/subfile_1"
        outputPathtwo = outputPath + "/subfile_2"
        if not os.path.exists(outputPathone) and not os.path.exists(outputPathtwo):
            os.mkdir(outputPathone)
            os.mkdir(outputPathtwo)
    for token in kwtokens:
        if token.isalpha():
            conjus = default_conjugator.conjugate(token.lower())
            formlist = conjus.iterate()
            forms = []
            for form in formlist:
                forms.append(form[-1])
            kwlist.append(list(dict.fromkeys(forms)))  # remove repetitions
        else:
            # mlconjug can't conjugate punctuation (if that's part of the keyword)
            kwlist.append([token])
    csvtitle = outputPath + '/' + os.path.split(inputFilename)[1].split(".")[0] \
               + "_" + title_keyword + '.csv'
    csvExist = os.path.exists(csvtitle)
    with open(csvtitle, "a", newline="", encoding='utf-8', errors='ignore') as csvFile:
        writer = csv.writer(csvFile)
        if not csvExist:
            writer.writerow(["Document ID", "Document", "SPLIT_Document", "SEARCH_WORD",
                             "SENTENCE", "Sentence ID of FIRST_OCCURRENCE",
                             "RELATIVE_POSITION", "FREQUENCY of OCCURRENCE"])
            docIndex = 1
        else:
            df = pd.read_csv(csvtitle, encoding="ISO-8859-1")
            if len(df) == 0:
                docIndex = 1
            else:
                docIndex = df.iloc[-1][0] + 1
        first_occurrence_index = 0
        frequency = 0
        contents = []
        head, docname = os.path.split(inputFilename)
        title = docname.partition('.')[0]
        f = open(inputFilename, "r", encoding='utf-8', errors='ignore')
        docText = f.read()
        f.close()
        sentences_ = sent_tokenize(docText)  # the list of sentences in the corpus
        subfileindex = 1
        subfilePath = outputPath + os.sep + title + "_" + str(subfileindex) + '.txt'
        if first_occurrence:
            subfilePath = outputPathone + os.sep + title + "_" + str(subfileindex) + '.txt'
        subfile = open(subfilePath, 'w', encoding='utf-8', errors='ignore')
        sentence_index = 1
        for sent in sentences_:
            tokens_ = word_tokenize(sent)
            kwindex = 0
            kw = False
            for token in tokens_:
                t = token.lower()
                if kwindex == len(kwlist):
                    break
                # Two ways to recognize the keyword:
                # (1) the corpus form matches an item in the conjugation list (verbs)
                # (2) the lemmatized corpus form matches the keyword token (nouns, adjectives)
                if t == kwtokens[kwindex] or (lemmatization and
                        (t in kwlist[kwindex] or kwtokens[kwindex] == wordnet.morphy(t))):
                    kw = True
                    kwindex += 1
                else:
                    kw = False
                    kwindex = 0
            if kw:  # the keyword was detected, so start the next subfile
                frequency += 1
                presubfile = subfilePath
                if frequency == 1:
                    first_occurrence_index = sentence_index
                if not first_occurrence or frequency <= 1:
                    subfileindex += 1
                    subfilePath = outputPath + os.sep + title + "_" + str(subfileindex) + '.txt'
                    if first_occurrence and subfileindex == 1:
                        subfilePath = outputPathone + os.sep + title + "_" + str(subfileindex) + '.txt'
                    if first_occurrence and subfileindex == 2:
                        subfilePath = outputPathtwo + os.sep + title + "_" + str(subfileindex) + '.txt'
                    subfile = open(subfilePath, 'w', encoding='utf-8', errors='ignore')
                contents.append([docIndex, inputFilename, presubfile, keyword, sent,
                                 first_occurrence_index,
                                 sentence_index / len(sentences_), frequency])
            subfile.write(sent + " ")
            sentence_index += 1
        l = len(contents)
        # adjust the frequency recorded on the rows belonging to the last subfile
        if l != 0 and first_occurrence:
            f = contents[-1][-1]
            if f > 1:
                f -= 1
            subpath = contents[-1][2]
            for i in reversed(range(l)):
                if contents[i][2] == subpath:
                    contents[i][-1] = f
        writer.writerows(contents)