コード例 #1
0
ファイル: Dictionary.py プロジェクト: balaremember/data-repo
    def term_to_id(self, term0):
        """Return the integer id of ``term0``, registering the term if it is new.

        The raw term is first passed through
        ``Preprocessing.convert_word_to_normal_form`` and then
        ``Preprocessing.lemmatize``; the result is what gets looked up and
        stored.

        Returns ``None`` when the processed term does not match the pattern
        ``[a-zа-я]+$``, or when ``self.excluds_stopwords`` is truthy and
        ``StopWords.is_stop_word`` reports the term as a stop word.

        For a term that is not yet in ``self.vocas_id`` the next free id
        (``len(self.vocas)``) is assigned, the term is appended to
        ``self.vocas`` and a ``0`` entry is appended to ``self.docfreq``.
        """
        term = Preprocessing.convert_word_to_normal_form(term0)
        term = Preprocessing.lemmatize(term)
        if not re.match(r'[a-zа-я]+$', term):
            return None
        if self.excluds_stopwords and StopWords.is_stop_word(term):
            return None
        try:
            return self.vocas_id[term]
        except KeyError:
            # Only a missing key means "new term"; any other error must
            # propagate instead of being mistaken for an unknown word.
            pass
        term_id = len(self.vocas)
        self.vocas_id[term] = term_id
        self.vocas.append(term)
        self.docfreq.append(0)
        return term_id
コード例 #2
0
 def get_text_as_list_of_words(self):
     """Return ``self._text`` as processed by ``Preprocessing.convert_text_to_list_of_words``."""
     words = Preprocessing.convert_text_to_list_of_words(self._text)
     return words
コード例 #3
0
 def test_remove_stop_words_from_list_of_words(self):
     # Presumably the first argument is the stop-word list and the second the
     # words to filter (verify against Preprocessing); "Я" must be dropped.
     stop_words = ["Я"]
     words = ["Я", "люблю", "тебя"]
     expected = ["люблю", "тебя"]
     filtered = Preprocessing.remove_stop_words_from_list_of_words(stop_words, words)
     self.assertEqual(filtered, expected)
コード例 #4
0
 def test_convert_word_to_normal_form(self):
     # The inflected form "гонит" is expected to come back as "гнать".
     expected = "гнать"
     normalized = Preprocessing.convert_word_to_normal_form("гонит")
     self.assertEqual(normalized, expected)
コード例 #5
0
 def test_convert_list_of_words_to_normal_forms(self):
     # Each word of the input list is expected to be replaced by its normal
     # form, in the same order.
     source_words = ["люблю", "тебя"]
     expected = ["любить", "ты"]
     normalized = Preprocessing.convert_list_of_words_to_normal_forms(source_words)
     self.assertEqual(normalized, expected)
コード例 #6
0
 def test_convert_document_to_list_of_words(self):
     # A Document built from a three-word phrase is expected to yield those
     # three words unchanged (original casing kept).
     expected = ["Привет", "как", "дела"]
     words = Preprocessing.convert_document_to_list_of_words(
         Document("Привет как дела")
     )
     self.assertEqual(words, expected)
コード例 #7
0
 def test_convert_text_to_list_of_words(self):
     # A plain three-word string is expected to be split into its words,
     # original casing kept.
     phrase = "Привет как дела"
     expected = ["Привет", "как", "дела"]
     words = Preprocessing.convert_text_to_list_of_words(phrase)
     self.assertEqual(words, expected)
コード例 #8
0
 def test_lemmatize(self):
     # For the capitalised input "Тебя" the expected result is the
     # lower-case "тебя".
     expected = "тебя"
     lemmatized = Preprocessing.lemmatize("Тебя")
     self.assertEqual(expected, lemmatized)
コード例 #9
0
 def test_convert_word_list_to_text(self):
     # The words are expected to be combined into one string separated by
     # single spaces.
     expected = "re wa et ya"
     joined = Preprocessing.convert_word_list_to_text(["re", "wa", "et", "ya"])
     self.assertEqual(expected, joined)