コード例 #1
0
ファイル: lemmatizer.py プロジェクト: groschene/cltk2
 def tryer(self, i, kw, tag):
     """Return the dictionary entries closest to the tag-adjusted form of ``kw``.

     ``kw`` is normalised with ``clean(basify(kw)).lower()``.  The first rule
     in ``self.regex`` whose element 0 equals ``tag`` supplies the number of
     trailing characters to strip (element 1) and the ending appended in
     their place (element 2).  ``i`` reduces the number of characters
     stripped.

     Entries of ``self.dictionnary`` are then grouped by their ``distance``
     to the adjusted word, and the first non-empty group among distances
     0, 1 and 2 is returned.

     Args:
         i: How many fewer characters to strip than the rule asks for.
         kw: The word to look up.
         tag: Tag used to select the rule in ``self.regex``.

     Returns:
         A numpy array of the matching dictionary entries, or, when nothing
         is within distance 2, the single entry selected by
         ``self.last_chance(kw)``.

     Raises:
         IndexError: If no rule matches ``tag``.  numpy also raises it when
             ``last_chance`` returns a non-integer marker such as ``'unk'``.
     """
     candidate = clean(basify(kw)).lower()
     rule = [r for r in self.regex if r[0] == tag][0]
     strip_count = rule[1]
     replacement_end = rule[2]
     # Guard against a missing count, as dummy_lemma and pos_lemmatizer do.
     if strip_count is not None and strip_count > 0:
         cut = strip_count - i
         # A non-positive cut means "strip nothing"; slicing with [:0] or a
         # positive bound would wrongly empty or truncate the word instead.
         stem = candidate[:-cut] if cut > 0 else candidate
         candidate = stem + replacement_end
     entries = np.asarray(self.dictionnary)
     # Compute the distances once and reuse them for every radius.
     dists = np.asarray([distance(candidate, s) for s in self.dictionnary])
     for radius in (0, 1, 2):
         hits = np.where(dists == radius)
         if len(hits[0]) > 0:
             return entries[hits]
     return entries[self.last_chance(kw)]
コード例 #2
0
 def dict_lemmatizer(self, st):
     """Look up the lemma of ``st`` in the ``self.lemma`` table.

     The word is normalised with ``clean(basify(st)).lower()`` and compared
     against column ``0`` of ``self.lemma``; the value in position 1 of the
     first matching row is returned.
     NOTE(review): ``self.lemma`` looks like a pandas DataFrame with integer
     column labels -- confirm against the constructor.

     Returns:
         The stored lemma, or ``'unk'`` when no row matches.
     """
     table = self.lemma
     key = clean(basify(st)).lower()
     try:
         matching_rows = table[table[0] == key]
         return matching_rows.values[0][1]
     except IndexError:
         # No matching row: ``values`` is empty.
         return 'unk'
コード例 #3
0
 def levdist_lemmatizer(self, kw, i=0):
     """Return the first dictionary entry at exactly distance ``i`` from ``kw``.

     ``kw`` is normalised with ``clean(basify(kw)).lower()`` before being
     compared, via ``distance``, with every entry of ``self.dictionnary``.

     Args:
         kw: The word to look up.
         i: The exact distance an entry must have to qualify (default 0).

     Returns:
         The first qualifying entry, or ``'unk'`` when there is none.
     """
     vocabulary = self.dictionnary
     normalized = clean(basify(kw)).lower()
     dists = np.asarray([distance(normalized, entry) for entry in vocabulary])
     positions = np.where(dists == i)[0]
     if len(positions) == 0:
         return 'unk'
     return vocabulary[positions[0]]
コード例 #4
0
ファイル: lemmatizer.py プロジェクト: groschene/cltk2
 def dummy_lemma(self, kw, tag):
     """Rewrite the ending of ``kw`` according to the rule for ``tag``.

     ``kw`` is normalised with ``clean(basify(kw)).lower()``.  The first rule
     in ``self.regex`` whose element 0 equals ``tag`` gives the number of
     trailing characters to strip (element 1) and the ending to append
     (element 2).  No dictionary lookup is performed.

     Args:
         kw: The word to transform.
         tag: Tag used to select the rule in ``self.regex``.

     Returns:
         The normalised word, with its ending replaced when the rule's strip
         count is a positive number.

     Raises:
         IndexError: If no rule matches ``tag``.
     """
     candidate = clean(basify(kw)).lower()
     rule = [r for r in self.regex if r[0] == tag][0]
     strip_count, replacement_end = rule[1], rule[2]
     if strip_count is not None and strip_count > 0:
         candidate = candidate[:-strip_count] + replacement_end
     return candidate
コード例 #5
0
ファイル: pos.py プロジェクト: groschene/cltk2
def get_tags(path):
    """Build a ``form/postag/lemma`` training string from a treebank XML file.

    The file is parsed, and every ``word`` element of every ``sentence``
    under the first ``body`` element is turned into one token:

    * ``form`` is upper-cased, passed through ``Replacer.beta_code``, has a
      trailing ``'s'`` removed, and is then lower-cased and normalised with
      ``clean(basify(...))``; spaces, quotes, ``?`` and square brackets are
      dropped.  Words whose form ends up empty are skipped.
    * ``postag`` is copied as is; ``lemma`` is normalised with
      ``clean(basify(...))``.

    Args:
        path: Path of the treebank XML file.

    Returns:
        A string with one sentence per paragraph (sentences separated by a
        blank line), each sentence being space-separated
        ``form/postag/lemma`` tokens.

    Raises:
        IndexError: If the document has no ``body`` element.
        KeyError: If a ``word`` element has no ``form`` attribute.
    """
    replacer = Replacer()
    with open(path, 'r') as f:
        xml_string = f.read()
    root = etree.fromstring(xml_string)
    body = root.findall('body')[0]
    dropped_chars = (' ', "'", '?', '’', '[', ']')
    sentences_list = []
    for sentence in body.findall('sentence'):
        tokens = []
        for node in sentence.findall('word'):
            word = node.attrib
            form = replacer.beta_code(word['form'].upper())
            # A trailing 's' is turned into '?', which the filter below then
            # removes together with the other dropped characters.
            if form.endswith('s'):
                form = form[:-1] + '?'
            form = clean(basify(form.lower()))
            form = ''.join(ch for ch in form if ch not in dropped_chars)
            if not form:
                continue
            # Resolve each attribute on its own so a missing one can neither
            # raise NameError nor reuse the previous word's value.
            postag = word.get('postag', 'x--------')
            lemma = word.get('lemma')
            # 'unk' is a placeholder for a missing lemma so that every token
            # keeps its three '/'-separated fields.
            lemma = clean(basify(lemma)) if lemma is not None else 'unk'
            tokens.append('/'.join([form, postag, lemma]))
        sentences_list.append(' '.join(tokens))
    return '\n\n'.join(sentences_list)
コード例 #6
0
 def tag(self, st):
     """Tag the words of ``st`` with ``self.tnt_tot``, learning unknown words first.

     ``st`` is normalised with ``clean(basify(st)).lower()`` and split on
     whitespace.  Words that are not keys of ``self.tnt_tot._wd`` are
     reported on stdout; if there are any, a fresh ``tnt.TnT()`` is trained
     on ``self.top_k_wd(wd, 2)`` for each of them and its ``_wd`` table is
     added to ``self.tnt_tot._wd`` (which is replaced on the instance).

     Args:
         st: The text to tag.

     Returns:
         Whatever ``self.tnt_tot.tag`` returns for the list of normalised
         words.
     """
     to_pos = clean(basify(st)).lower().split()
     # Build the lookup once; the original rebuilt a list of every known
     # word for each token it tested.
     known_words = set(self.tnt_tot._wd.keys())
     to_pos_unk = [item for item in to_pos if item not in known_words]
     print("nb of unk wd "+str(len(to_pos_unk)))
     print(to_pos_unk)
     if to_pos_unk:
         tnt_new = tnt.TnT()
         tnt_new.train([self.top_k_wd(wd, 2) for wd in to_pos_unk])
         self.tnt_tot._wd = tnt_new._wd + self.tnt_tot._wd
     return self.tnt_tot.tag(to_pos)
コード例 #7
0
ファイル: lemmatizer.py プロジェクト: groschene/cltk2
 def last_chance(self, kw):
     """Return the position of the nearest dictionary entry, or ``'unk'``.

     ``kw`` is normalised with ``clean(basify(kw)).lower()`` and compared,
     via ``distance``, with every entry of ``self.dictionnary``.  Distances
     0, 1 and 2 are tried in that order.

     Args:
         kw: The word to look up.

     Returns:
         The index in ``self.dictionnary`` of the first entry at the
         smallest distance in 0..2, or the string ``'unk'`` when no entry is
         that close (including when the dictionary is empty).
     """
     target = clean(basify(kw)).lower()
     # The distances do not depend on the radius, so compute them once.
     dists = np.asarray([distance(target, s) for s in self.dictionnary])
     # NOTE(review): the previous loop discarded a match at distance 3 (its
     # cutoff check ran after the counter was incremented), so only 0..2 ever
     # produced an index.  That result is kept; the loop is now bounded, so
     # it can no longer spin forever when nothing matches.
     for radius in range(3):
         hits = np.where(dists == radius)[0]
         if len(hits) > 0:
             return hits[0]
     return 'unk'
コード例 #8
0
 def pos_lemmatizer(self, kw, tag, rk = 0):
     """Pick the dictionary entry that best matches ``kw`` given its tag.

     ``kw`` is normalised with ``clean(basify(kw)).lower()``.  If a rule in
     ``self.regex`` has ``tag`` as element 0, its element 1 (characters to
     strip) and element 2 (ending to append) are applied to the word; when
     no rule matches, the word is left unchanged.  Entries of
     ``self.dictionnary`` at exactly ``distance`` ``rk`` from the result are
     candidates, and the one with the highest ``lcs`` score against the
     original ``kw`` wins.

     Args:
         kw: The word to lemmatize.
         tag: Tag used to select the rule in ``self.regex``.
         rk: The exact distance a candidate must have (default 0).

     Returns:
         The best candidate, or ``'unk'`` when there is no candidate.
     """
     rules = self.regex
     vocabulary = self.dictionnary
     candidate = clean(basify(kw)).lower()
     try:
         rule = [r for r in rules if r[0] == tag][0]
     except IndexError:
         # Unknown tag: fall back to a rule that changes nothing.
         rule = [0, 0, None]
     strip_count, new_ending = rule[1], rule[2]
     if strip_count is not None and strip_count > 0:
         candidate = candidate[:-strip_count] + new_ending
     dists = np.asarray([distance(candidate, entry) for entry in vocabulary])
     hits = np.where(dists == rk)
     if len(hits[0]) == 0:
         return 'unk'
     matches = list(np.asarray(vocabulary)[hits])
     # Ties are broken by argmax, i.e. the earliest best-scoring match.
     best = np.argmax([lcs(kw, match) for match in matches])
     return matches[best]
コード例 #9
0
ファイル: lemmatizer.py プロジェクト: groschene/cltk2
 def st_to_lemma(self, st):
     """Lemmatize every word of ``st`` and return the lemmas as a list.

     The text is tagged with ``self.CltkTnt.tag`` and, separately,
     normalised with ``clean(basify(st)).lower()`` and split on whitespace.
     Each word is first passed to ``self.lemmatize``; if that raises
     ``IndexError``, ``self.tryer(0, *tagged[position])`` is used instead
     and its first result kept.  A second ``IndexError``, or an empty
     result, yields ``'unk'``.  Every lemma is also printed.

     Args:
         st: The text to lemmatize.

     Returns:
         A list with one lemma per normalised word.
     """
     tagger = self.CltkTnt
     tagged = tagger.tag(st)
     words = clean(basify(st)).lower().split()
     lemmas = []
     for position, word in enumerate(words):
         try:
             le = self.lemmatize(word)
         except IndexError:
             try:
                 le = self.tryer(0, *tagged[position])
                 # A numpy string scalar is already the answer; anything
                 # else is treated as a sequence of candidates.
                 if type(le) is not np.str_:
                     le = le[0] if len(le) > 0 else 'unk'
             except IndexError:
                 le = 'unk'
         lemmas.append(le)
         print(le)
     return lemmas
コード例 #10
0
 def greek_to_token(self, wrd):
     """Encode ``wrd`` as a ``(1, 16, self.alpha)`` one-hot array.

     The word is normalised with ``clean(basify(wrd))`` and its characters
     are joined with spaces, then converted with
     ``self.tok.texts_to_sequences``, padded to length 16 with
     ``sequence.pad_sequences`` and expanded with ``to_categorical`` using
     ``self.alpha`` classes.
     NOTE(review): ``tok``, ``sequence`` and ``to_categorical`` look like
     Keras preprocessing utilities -- confirm against the file's imports.
     """
     max_len = 16
     spaced_chars = " ".join(clean(basify(wrd)))
     encoded = self.tok.texts_to_sequences([spaced_chars])
     padded = sequence.pad_sequences(encoded, maxlen=max_len)
     one_hot = to_categorical(padded[0], num_classes=self.alpha)
     return one_hot.reshape(1, max_len, self.alpha)
コード例 #11
0
ファイル: pos.py プロジェクト: groschene/cltk2
 def get_root(self):
     """Return the normalised third ``/``-separated field of ``self.word_string``.

     The field is lower-cased and passed through ``clean(basify(...))``.

     Raises:
         IndexError: If ``self.word_string`` has fewer than three fields.
     """
     third_field = self.word_string.split("/")[2]
     return clean(basify(third_field.lower()))
コード例 #12
0
 def sentence_to_lemma(self, st):
     """Run ``self.attention_lemmatizer`` on every word of ``st``.

     ``st`` is normalised with ``clean(basify(st))`` and split on
     whitespace; each word is passed on as its characters joined by single
     spaces.

     Returns:
         A list with one ``attention_lemmatizer`` result per word.
     """
     normalized = clean(basify(st))
     results = []
     for word in normalized.split():
         spaced_chars = " ".join(list(word))
         results.append(self.attention_lemmatizer(spaced_chars))
     return results