# NB: the methods below assume module-level imports of numpy as `np`, a
# Levenshtein `distance` function, and the `clean`/`basify` normalisation helpers.
def tryer(self, i, kw, tag):
    """Swap in the pseudo-ending for `tag`, then take the closest dictionary entries."""
    regex = self.regex
    w = self.dictionnary
    test_wd = clean(basify(kw)).lower()
    reg = [r for r in regex if r[0] == tag][0]
    to_remove_from_d = reg[1]
    pseudo_end = reg[2]
    if to_remove_from_d is not None and to_remove_from_d > 0:
        # Replace the inflected ending with the pseudo-ending of the lemma.
        test_wd = test_wd[:-to_remove_from_d + i] + pseudo_end
    # Keep the candidates at the smallest Levenshtein distance (0, then 1, then 2).
    dists = np.asarray([distance(test_wd, s) for s in w])
    for d in range(3):
        keep = np.where(dists == d)
        if len(keep[0]) > 0:
            return np.asarray(w)[keep]
    # Nothing within distance 2: fall back to the index found by last_chance.
    return np.asarray(w)[self.last_chance(kw)]
def dict_lemmatizer(self, st):
    """Look the word up in the lemma table; return 'unk' when it is missing."""
    lemmatizer = self.lemma  # two-column table: column 0 = surface form, column 1 = lemma
    st = clean(basify(st)).lower()
    try:
        out = lemmatizer[lemmatizer[0] == st].values[0][1]
    except IndexError:
        out = 'unk'
    return out
def levdist_lemmatizer(self, kw, i=0):
    """Return the first dictionary entry at Levenshtein distance `i` from `kw`, or 'unk'."""
    w = self.dictionnary
    test_wd = clean(basify(kw)).lower()
    keep = np.where(np.asarray([distance(test_wd, s) for s in w]) == i)
    if len(keep[0]) > 0:
        out = w[keep[0][0]]
    else:
        out = 'unk'
    return out
def dummy_lemma(self, kw, tag):
    """Build a candidate lemma by swapping the tag's ending for its pseudo-ending."""
    regex = self.regex
    test_wd = clean(basify(kw)).lower()
    reg = [r for r in regex if r[0] == tag][0]
    to_remove_from_d = reg[1]
    pseudo_end = reg[2]
    if to_remove_from_d is not None and to_remove_from_d > 0:
        test_wd = test_wd[:-to_remove_from_d] + pseudo_end
    return test_wd
def get_tags(path):
    """Turn an XML treebank into a TnT training string of 'form/postag/lemma' tokens."""
    r = Replacer()
    with open(path, 'r') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    body = root.findall('body')[0]
    sentences = body.findall('sentence')
    sentences_list = []
    for sentence in sentences:
        words_list = sentence.findall('word')
        sentence_list = []
        for x in words_list:
            word = x.attrib
            form = word['form'].upper()
            form = r.beta_code(form)
            try:
                # Mark a trailing 's' so it is stripped with the other stray characters below.
                if form[-1] == 's':
                    form = form[:-1] + '?'
            except IndexError:
                pass
            form = clean(basify(form.lower()))
            form = ''.join(char for char in form
                           if char not in (' ', "'", '?', '’', '[', ']'))
            if len(form) == 0:
                continue
            try:
                postag1 = word['postag']
                postag2 = clean(basify(word['lemma']))
            except KeyError:
                # Missing annotation: fall back to an empty postag and an unknown lemma.
                postag1 = 'x--------'
                postag2 = 'unk'
            sentence_list.append('/'.join([form, postag1, postag2]))
        sentences_list.append(' '.join(sentence_list))
    return '\n\n'.join(sentences_list)
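# A minimal sketch of how the training string from get_tags might be consumed:
# train an NLTK TnT tagger on the (form, postag) pairs it encodes. The helper
# name and the token parsing below are illustrative, not part of the original module.
from nltk.tag import tnt

def train_tnt_from_treebank(path):
    training_text = get_tags(path)
    sentences = []
    for block in training_text.split("\n\n"):
        # Each token is 'form/postag/lemma'; TnT only needs the (form, postag) pairs.
        tagged = [tuple(tok.split("/", 2)[:2]) for tok in block.split()]
        if tagged:
            sentences.append(tagged)
    tagger = tnt.TnT()
    tagger.train(sentences)
    return tagger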
def tag(self, st):
    """POS-tag a sentence, retraining the TnT model on the fly for unknown words."""
    to_pos = clean(basify(st)).lower().split()
    wd_list = self.tnt_tot._wd.keys()
    to_pos_unk = [item for item in to_pos if item not in list(wd_list)]
    print("nb of unk wd " + str(len(to_pos_unk)))
    print(to_pos_unk)
    if len(to_pos_unk) > 0:
        # Train a throwaway TnT model on guessed tags for the unknown words,
        # then merge its word counts into the main model.
        tnt_new = tnt.TnT()
        tnt_new.train([self.top_k_wd(wd, 2) for wd in to_pos_unk])
        self.tnt_tot._wd = tnt_new._wd.__add__(self.tnt_tot._wd)
    return self.tnt_tot.tag(to_pos)
def last_chance(self, kw):
    """Return the index of the nearest dictionary entry, giving up after distance 3."""
    w = self.dictionnary
    test_wd = clean(basify(kw)).lower()
    final = 'unk'  # signals that nothing was found within distance 3
    for i in range(4):
        keep = np.where(np.asarray([distance(test_wd, s) for s in w]) == i)
        if len(keep[0]) > 0:
            final = keep[0][0]
            break
    return final
def pos_lemmatizer(self, kw, tag, rk=0):
    """Lemmatize by ending substitution; break distance ties with the longest common subsequence."""
    regex = self.regex
    w = self.dictionnary
    test_wd = clean(basify(kw)).lower()
    try:
        reg = [r for r in regex if r[0] == tag][0]
    except IndexError:
        reg = [0, 0, None]
    to_remove_from_d = reg[1]
    pseudo_end = reg[2]
    if to_remove_from_d is not None and to_remove_from_d > 0:
        test_wd = test_wd[:-to_remove_from_d] + pseudo_end
    keep = np.where(np.asarray([distance(test_wd, s) for s in w]) == rk)
    if len(keep[0]) > 0:
        # Among the candidates at distance rk, keep the one sharing the longest
        # common subsequence with the original word.
        candidates = list(np.asarray(w)[keep])
        final = candidates[np.argmax([lcs(kw, c) for c in candidates])]
    else:
        final = 'unk'
    return final
def st_to_lemma(self, st):
    """Lemmatize every token of a sentence, falling back on tag-based guessing."""
    ta = self.CltkTnt
    lemma = []
    tagged = ta.tag(st)
    to_pos = clean(basify(st)).lower().split()
    for i in range(len(to_pos)):
        try:
            le = self.lemmatize(to_pos[i])
        except IndexError:
            try:
                # tagged[i] is a (word, tag) pair; tryer may return an array of candidates.
                le = self.tryer(0, *tagged[i])
                if not isinstance(le, np.str_):
                    le = le[0] if len(le) > 0 else 'unk'
            except IndexError:
                le = 'unk'
        lemma.append(le)
        print(le)
    return lemma
def greek_to_token(self, wrd):
    """Turn a word into a padded one-hot character sequence for the neural lemmatizer."""
    chars = " ".join(list(clean(basify(wrd))))
    sequences = self.tok.texts_to_sequences([chars])
    sequences_matrix = sequence.pad_sequences(sequences, maxlen=16)
    return to_categorical(sequences_matrix[0],
                          num_classes=self.alpha).reshape(1, 16, self.alpha)
def get_root(self):
    """Return the normalised lemma field of a 'form/postag/lemma' token string."""
    wd = self.word_string.split("/")[2]
    wd = wd.lower()
    wd = clean(basify(wd))
    return wd
def sentence_to_lemma(self, st):
    """Lemmatize each word of a sentence with the character-level attention model."""
    to = clean(basify(st))
    all_w = [list(word) for word in to.split()]
    to_tok = [" ".join(a) for a in all_w]
    return [self.attention_lemmatizer(target) for target in to_tok]
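# Hypothetical usage sketch (not part of the original module): the class name and
# zero-argument constructor are assumptions, since the surrounding class definition
# is not shown in this excerpt.
if __name__ == "__main__":
    lemmatizer = GreekLemmatizer()  # assumed to set up self.dictionnary, self.regex, self.lemma, ...
    text = "a sentence of normalised Greek text"
    print(lemmatizer.dict_lemmatizer(text.split()[0]))  # table lookup; 'unk' on a miss
    print(lemmatizer.st_to_lemma(text))                 # tag the sentence, then lemmatize token by token
    print(lemmatizer.sentence_to_lemma(text))           # character-level attention lemmatizer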