Ejemplo n.º 1
0
    def term_to_id(self, term0):
        """Map a raw term to its integer vocabulary id, registering it if new.

        The term is first passed through
        ``Preprocessing.convert_word_to_normal_form`` and then
        ``Preprocessing.lemmatize``; the resulting string is the key used
        in the vocabulary.

        Args:
            term0: The raw term to look up.

        Returns:
            The id stored in ``self.vocas_id`` for the normalized term, or
            ``None`` when the normalized term does not consist solely of
            characters in ``a-z`` / ``а-я``, or when stop-word exclusion is
            enabled (``self.excluds_stopwords``) and the term is a stop word.

        Side effects:
            For a term not yet in ``self.vocas_id``, a new id equal to
            ``len(self.vocas)`` is assigned, the term is appended to
            ``self.vocas``, and a ``0`` entry is appended to ``self.docfreq``.
        """
        term = Preprocessing.convert_word_to_normal_form(term0)
        term = Preprocessing.lemmatize(term)

        # re.match anchors at the start and ``$`` at the end, so the whole
        # term must be lowercase Latin or Cyrillic letters.
        # NOTE(review): 'ё' lies outside the а-я range, so terms containing
        # it are rejected here -- confirm this is intended.
        if not re.match(r'[a-zа-я]+$', term):
            return None
        if self.excluds_stopwords and StopWords.is_stop_word(term):
            return None

        try:
            term_id = self.vocas_id[term]
        except KeyError:
            # Only a missing key means "unseen term"; any other exception
            # (including KeyboardInterrupt) must propagate instead of
            # silently adding a vocabulary entry.
            term_id = len(self.vocas)
            self.vocas_id[term] = term_id
            self.vocas.append(term)
            self.docfreq.append(0)
        return term_id
Ejemplo n.º 2
0
 def test_lemmatize(self):
     """Lemmatizing the capitalized word "Тебя" must yield "тебя"."""
     expected = "тебя"
     actual = Preprocessing.lemmatize("Тебя")
     self.assertEqual(expected, actual)