Example #1
 def convert_from_conllu(input_filename,
                         output_filename,
                         with_forth_column=False,
                         with_punct=True,
                         add_number=False):
     with open(input_filename, "r", encoding='utf-8') as r, \
             open(output_filename, "w", encoding='utf-8') as w:
         i = 0
         for line in r:
             # Skip comment and metadata lines.
             if line[0] == "#" or line[0] == "=":
                 continue
             # An empty line separates sentences; reset the token counter.
             if line == "\n":
                 w.write("\n")
                 i = 0
                 continue
             records = line.split("\t")
             pos = records[3]
             # In a standard 10-column CoNLL-U file, FEATS is column 6
             # (index 5); index 4 holds the language-specific XPOS tag.
             if with_forth_column:
                 gram = records[5]
             else:
                 gram = records[4]
             gram = process_gram_tag(gram)
             if pos == "PUNCT" and not with_punct:
                 continue
             if add_number:
                 i += 1
                 w.write("\t".join(
                     [str(i), records[1], records[2].lower(), pos, gram]) +
                         "\n")
             else:
                 w.write("\t".join(
                     [records[1], records[2].lower(), pos, gram]) + "\n")
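A quick usage sketch for context (the file names here are hypothetical, and process_gram_tag must be defined in the surrounding module). For a standard 10-column CoNLL-U file, FEATS is column 6, so with_forth_column=True selects it:

 convert_from_conllu("train.conllu", "train.txt",
                     with_forth_column=True,
                     with_punct=False,
                     add_number=True)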
 def add_grammemes(self, pos_tag: str, gram: str) -> int:
     """
     Добавить новое грамматическое значение в список известных
     """
     gram = process_gram_tag(gram)
     vector_name = pos_tag + '#' + gram
     if vector_name not in self.name_to_index:
         self.name_to_index[vector_name] = len(self.name_to_index)
         self.all_grammemes["POS"].add(pos_tag)
         gram = gram.split("|") if gram != "_" else []
         for grammeme in gram:
             category = grammeme.split("=")[0]
             value = grammeme.split("=")[1]
             self.all_grammemes[category].add(value)
     return self.name_to_index[vector_name]
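A minimal usage sketch (the no-argument constructor is an assumption; the method itself only needs the name_to_index and all_grammemes attributes). Repeated calls with the same POS tag and grammeme string return the same stable index:

 vectorizer = GrammemeVectorizer()  # constructor signature assumed
 first = vectorizer.add_grammemes("NOUN", "Case=Nom|Number=Sing")
 again = vectorizer.add_grammemes("NOUN", "Case=Nom|Number=Sing")
 assert first == again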
Example #3
 def __process_line(self, line: str) -> None:
     """
     Обработка строчки в корпусе с морфоразметкой.
     :param line: 
     :return: 
     """
     text, lemma, pos_tag, grammemes = line.strip().split("\t")[0:4]
     # Populate the set of possible output tags.
     self.grammeme_vectorizer_output.add_grammemes(pos_tag, grammemes)
     # Populate the set of possible input tags.
     for parse in self.morph.parse(text):
         pos, gram = convert_from_opencorpora_tag(self.converter, parse.tag,
                                                  text)
         gram = process_gram_tag(gram)
         self.grammeme_vectorizer_input.add_grammemes(pos, gram)
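For context, a hypothetical annotation line this method could consume; only the first four tab-separated columns are read:

 line = "стали\tстать\tVERB\tMood=Ind|Number=Plur|Tense=Past"
 # text="стали", lemma="стать", pos_tag="VERB", grammemes="Mood=Ind|..."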
Example #4
 def __get_lemma(self, word: str, pos_tag: str, gram: str, word_forms=None,
                 enable_normalization: bool = True):
     """
     Get the lemma.

     :param word: the word.
     :param pos_tag: part of speech.
     :param gram: grammatical value.
     :param word_forms: pre-computed morphological parses, if available.
     :param enable_normalization: whether to normalize as in the GIKRYA corpus.
     :return: the lemma.
     """
     if '_' in word:
         return word
     if self.language == "ru":
         if word_forms is None:
             word_forms = self.morph.parse(word)
         guess = ""
         max_common_tags = 0
         for word_form in word_forms:
             word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(self.converter, word_form.tag, word)
             word_form_gram = process_gram_tag(word_form_gram)
             common_tags_len = len(set(word_form_gram.split("|")).intersection(set(gram.split("|"))))
             if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                 max_common_tags = common_tags_len
                 guess = word_form
         if guess == "":
             guess = word_forms[0]
         if enable_normalization:
             lemma = self.__normalize_for_gikrya(guess)
         else:
             lemma = guess.normal_form
         return lemma
     elif self.language == "en":
         lemmatizer = nltk.stem.WordNetLemmatizer()
         pos_map = defaultdict(lambda: 'n')
         pos_map.update({
             'ADJ': 'a',
             'ADV': 'r',
             'NOUN': 'n',
             'VERB': 'v'
         })
         return lemmatizer.lemmatize(word, pos=pos_map[pos_tag])
     else:
         assert False, "Unsupported language: " + self.language
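The English branch can be exercised in isolation; a standalone sketch (requires the NLTK WordNet data, e.g. via nltk.download('wordnet')):

 import nltk
 from collections import defaultdict

 lemmatizer = nltk.stem.WordNetLemmatizer()
 pos_map = defaultdict(lambda: 'n',
                       {'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'})
 print(lemmatizer.lemmatize("running", pos=pos_map["VERB"]))  # run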
Example #5
    def get_sample(sentence: List[str], morph: pymorphy2.MorphAnalyzer,
                   grammeme_vectorizer: GrammemeVectorizer, max_word_len: int):
        """
        Получние признаков для отдельного предложения.
        
        :param sentence: предложение.
        :param morph: морфология.
        :param grammeme_vectorizer: грамматический словарь. 
        :param max_word_len: количество обрабатываемых букв в слове.
        :return: индексы слов, грамматические векторы, индексы символов.
        """
        to_ud = converters.converter('opencorpora-int', 'ud14')
        word_char_vectors = []
        word_gram_vectors = []
        for word in sentence:
            char_indices = np.zeros(max_word_len)
            gram_value_indices = np.zeros(grammeme_vectorizer.grammemes_count())

            # Character indices of the word.
            word_char_indices = [CHAR_SET.index(ch) if ch in CHAR_SET else len(CHAR_SET) for ch in word][:max_word_len]
            char_indices[-min(len(word), max_word_len):] = word_char_indices
            word_char_vectors.append(char_indices)

            # Grammeme vector of the word:
            # sum the vectors of all possible parses element-wise.
            for parse in morph.parse(word):
                pos, gram = convert_from_opencorpora_tag(to_ud, parse.tag, word)
                gram = process_gram_tag(gram)
                gram_value_indices += np.array(grammeme_vectorizer.get_vector(pos + "#" + gram))
            # Normalize within each grammatical category separately.
            sorted_grammemes = sorted(grammeme_vectorizer.all_grammemes.items(), key=lambda x: x[0])
            index = 0
            for category, values in sorted_grammemes:
                mask = gram_value_indices[index:index+len(values)]
                s = sum(mask)
                # Guard against empty categories to avoid division by zero.
                if s != 0:
                    gram_value_indices[index:index+len(values)] = mask/s
                index += len(values)
            word_gram_vectors.append(gram_value_indices)

        return word_gram_vectors, word_char_vectors
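The per-category normalization step can be illustrated on its own; a standalone toy sketch with hypothetical category sizes (Case with 3 values, Number with 2):

 import numpy as np

 gram_value_indices = np.array([2.0, 1.0, 0.0, 3.0, 1.0])
 categories = {"Case": 3, "Number": 2}  # hypothetical sizes
 index = 0
 for category, size in sorted(categories.items()):
     mask = gram_value_indices[index:index + size]
     gram_value_indices[index:index + size] = mask / mask.sum()
     index += size
 print(gram_value_indices)  # each category's slice now sums to 1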
Example #6
 def convert_from_conllu(input_filename,
                         output_filename,
                         with_forth_column=False,
                         with_punct=True):
     with open(input_filename, "r") as r, open(output_filename, "w") as w:
         for line in r:
             # Skip comment and metadata lines.
             if line[0] == "#" or line[0] == "=":
                 continue
             # An empty line separates sentences.
             if line == "\n":
                 w.write("\n")
                 continue
             records = line.split("\t")
             pos = records[3]
             if with_forth_column:
                 gram = records[5]
             else:
                 gram = records[4]
             gram = process_gram_tag(gram)
             if pos == "PUNCT" and not with_punct:
                 continue
             w.write("\t".join(
                 [records[1], records[2].lower(), pos, gram]) + "\n")
Example #7
 def __process_line(self, line: str):
     """
     Обработка строчки в корпусе с морфоразметкой.
     :param line: 
     :return: 
     """
     text, lemma, pos_tag, grammemes = line.strip().split("\t")[0:4]
     # Populate the word vocabulary.
     self.word_vocabulary.add_word(text.lower())
     # Populate the character set.
     self.char_set |= set(text)
     # Populate the set of possible output tags.
     self.grammeme_vectorizer_output.add_grammemes(pos_tag, grammemes)
     # Populate the set of possible input tags.
     if self.language == "ru":
         for parse in self.morph.parse(text):
             pos, gram = convert_from_opencorpora_tag(
                 self.converter, parse.tag, text)
             gram = process_gram_tag(gram)
             self.grammeme_vectorizer_input.add_grammemes(pos, gram)
     elif self.language == "en":
         _, tags = zip(*nltk.pos_tag([text], tagset='universal'))
         pos = tags[0]
         self.grammeme_vectorizer_input.add_grammemes(pos, "_")
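The English input-tag branch can be tried standalone (requires the NLTK averaged_perceptron_tagger and universal_tagset data):

 import nltk

 _, tags = zip(*nltk.pos_tag(["dog"], tagset='universal'))
 print(tags[0])  # NOUN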
Example #8
 def __get_lemma(self,
                 word: str,
                 pos_tag: str,
                 gram: str,
                 enable_gikrya_normalization: bool = True):
     """
     Получить лемму.
     
     :param word: слово.
     :param pos_tag: часть речи.
     :param gram: граммаическое значение.
     :param enable_gikrya_normalization: использовать ли нормализацию как в корпусе ГИКРЯ.
     :return: лемма.
     """
     if '_' in word:
         return word
     to_ud = converters.converter('opencorpora-int', 'ud14')
     guess = ""
     max_common_tags = 0
     for word_form in self.morph.parse(word):
         word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
             to_ud, word_form.tag, word)
         word_form_gram = process_gram_tag(word_form_gram)
         common_tags_len = len(
             set(word_form_gram.split("|")).intersection(
                 set(gram.split("|"))))
         if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
             max_common_tags = common_tags_len
             guess = word_form
     if guess == "":
         guess = self.morph.parse(word)[0]
     if enable_gikrya_normalization:
         lemma = self.__normalize_for_gikrya(guess)
     else:
         lemma = guess.normal_form
     return lemma
 def get_index_by_name(self, name):
     # Look up the index of a "POS#grammemes" name, normalizing the
     # grammeme part the same way add_grammemes does.
     pos = name.split("#")[0]
     gram = process_gram_tag(name.split("#")[1])
     return self.name_to_index[pos + "#" + gram]
Example #10
    def get_sample(sentence: List[str], language: str, converter,
                   morph: MorphAnalyzer,
                   grammeme_vectorizer: GrammemeVectorizer, max_word_len: int,
                   word_vocabulary: WordVocabulary, word_count: int,
                   char_set: str):
        """
        Получние признаков для отдельного предложения.

        :param language: язык.
        :param sentence: предложение.
        :param morph: морфология.
        :param converter: конвертер тегов в UD.
        :param grammeme_vectorizer: грамматический словарь. 
        :param max_word_len: количество обрабатываемых букв в слове.
        :param word_vocabulary: список слов.
        :param word_count: максимальный индекс слова.
        :param char_set: список возможных символов, для которых есть эмбеддинги.
        :return: индексы слов, грамматические векторы, индексы символов.
        """
        word_char_vectors = []
        word_gram_vectors = []
        word_indices = []
        for word in sentence:
            char_indices = np.zeros(max_word_len)
            gram_value_indices = np.zeros(
                grammeme_vectorizer.grammemes_count())

            # Character indices of the word.
            word_char_indices = [
                char_set.index(ch) if ch in char_set else len(char_set)
                for ch in word
            ][-max_word_len:]
            char_indices[-min(len(word), max_word_len):] = word_char_indices
            word_char_vectors.append(char_indices)

            # Index of the word in the vocabulary.
            word_index = word_vocabulary.word_to_index[
                word.lower()] if word_vocabulary.has_word(word) else word_count
            word_index = min(word_index, word_count)
            word_indices.append(word_index)

            # Grammeme vector of the word.
            if language == "ru":
                # Sum the vectors of all possible parses element-wise.
                for parse in morph.parse(word):
                    pos, gram = convert_from_opencorpora_tag(
                        converter, parse.tag, word)
                    gram = process_gram_tag(gram)
                    gram_value_indices += np.array(
                        grammeme_vectorizer.get_vector(pos + "#" + gram))
            elif language == "en":
                _, tags = zip(*nltk.pos_tag([word], tagset='universal'))
                pos = tags[0]
                gram_value_indices += np.array(
                    grammeme_vectorizer.get_vector(pos + "#_"))

            # Normalize within each grammatical category separately.
            sorted_grammemes = sorted(
                grammeme_vectorizer.all_grammemes.items(), key=lambda x: x[0])
            index = 0
            for category, values in sorted_grammemes:
                mask = gram_value_indices[index:index + len(values)]
                s = sum(mask)
                # Guard against empty categories to avoid division by zero.
                if s != 0:
                    gram_value_indices[index:index + len(values)] = mask / s
                index += len(values)
            word_gram_vectors.append(gram_value_indices)

        return word_indices, word_gram_vectors, word_char_vectors
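The right-aligned character padding used above can be seen in isolation; a standalone toy sketch with a hypothetical three-letter alphabet:

 import numpy as np

 char_set = "abc"
 max_word_len = 5
 word = "ba"
 char_indices = np.zeros(max_word_len)
 word_char_indices = [char_set.index(ch) if ch in char_set else len(char_set)
                      for ch in word][-max_word_len:]
 char_indices[-min(len(word), max_word_len):] = word_char_indices
 print(char_indices)  # [0. 0. 0. 1. 0.]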