Example #1
0
    def __iter__(self):
        """
        Yield successive batches of sentences.

        :return: batches as produced by self.__to_tensor — word-form indices,
            grammatical vectors, answer indices.
        """
        sentences = [[]]
        i = 0  # running sentence index across all files (matched against self.indices)
        for filename in self.filenames:
            with tqdm_open(filename, encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if len(line) == 0:
                        # Blank line = sentence boundary: validate the finished sentence.
                        last_sentence = sentences[-1]
                        is_wrong_sentence = (i not in self.indices) or \
                                            (len(last_sentence) < self.sentence_len_low) or \
                                            (len(last_sentence) > self.sentence_len_high)
                        if is_wrong_sentence:
                            sentences.pop()
                        if len(sentences) >= self.batch_size:
                            yield self.__to_tensor(sentences)
                            sentences = []
                        sentences.append([])  # start accumulating the next sentence
                        i += 1
                    else:
                        # Token line: word \t lemma \t POS \t tags (extra columns ignored).
                        word, lemma, pos, tags = line.split('\t')[0:4]
                        word, lemma = word.lower(), lemma.lower() + '_' + pos
                        gram_vector_index = self.grammeme_vectorizer_output.get_index_by_name(pos + "#" + tags)
                        sentences[-1].append(WordForm(lemma, gram_vector_index, word))
        # Drop a trailing empty sentence (input ended on a blank line).
        if len(sentences[-1]) == 0:
            sentences.pop()
        # Bug fix: only flush the remainder if there is one — previously an empty
        # batch was yielded when the corpus size was an exact multiple of
        # batch_size or all trailing sentences were filtered out.
        if sentences:
            yield self.__to_tensor(sentences)
Example #2
0
    def __iter__(self):
        """
        Yield successive batches of sentences, bucketed by sentence length.

        :return: batches as produced by self.__to_tensor — word-form indices,
            grammatical vectors, answer indices.
        """
        last_sentence = []
        i = 0  # running sentence index across all files (matched against self.indices)
        for filename in self.filenames:
            with tqdm_open(filename, encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if len(line) == 0:
                        # Blank line = sentence boundary.
                        if i in self.indices:
                            for index, bucket in enumerate(self.buckets):
                                # Place the sentence into its length bucket
                                # (half-open borders: [low, high)).  A sentence
                                # matching no bucket is dropped — presumably by
                                # design; confirm against bucket_borders setup.
                                if self.bucket_borders[index][0] <= len(last_sentence) < self.bucket_borders[index][1]:
                                    bucket.append(last_sentence)
                                if len(bucket) >= self.batch_size:
                                    yield self.__to_tensor(bucket)
                                    self.buckets[index] = []
                        last_sentence = []
                        i += 1
                    else:
                        # Token line: word \t lemma \t POS \t tags (extra columns ignored).
                        word, lemma, pos, tags = line.split('\t')[0:4]
                        word, lemma = word.lower(), lemma.lower() + '_' + pos
                        gram_vector_index = self.grammeme_vectorizer_output.get_index_by_name(pos + "#" + tags)
                        last_sentence.append(WordForm(lemma, gram_vector_index, word))
        # Flush remainders.  Bug fix: skip empty buckets — previously every
        # bucket was yielded unconditionally, emitting empty batches for
        # buckets that were never filled (or were just flushed above).
        for bucket in self.buckets:
            if bucket:
                yield self.__to_tensor(bucket)
 def collect_grammemes(self, filename: str) -> None:
     """
     Collect the possible grammatical values from a morphology-annotated file.

     :param filename: file with morphological markup.
     """
     with tqdm_open(filename, encoding="utf-8") as f:
         for raw_line in f:
             stripped = raw_line.strip()
             if not stripped:
                 continue  # skip sentence-boundary blank lines
             # Columns 3 and 4 hold the POS tag and the grammeme string.
             pos_tag, grammemes = stripped.split("\t")[2:4]
             self.add_grammemes(pos_tag, grammemes)
Example #4
0
    def parse_corpora(self, file_names: List[str]):
        """
        Build the WordVocabulary and GrammemeVectorizer from the corpus.

        :param file_names: paths to the corpus files.
        """
        for path in file_names:
            with tqdm_open(path, encoding="utf-8") as f:
                for line in f:
                    # Blank lines separate sentences; only token lines are processed.
                    if line != "\n":
                        self.__process_line(line)

        self.grammeme_vectorizer_input.init_possible_vectors()
        self.grammeme_vectorizer_output.init_possible_vectors()
        self.word_vocabulary.sort()
        # Force the space character to index 0 and drop any other occurrence of it.
        self.char_set = " " + "".join(self.char_set).replace(" ", "")
Example #5
0
    def parse_corpora(
            self, filenames: List[str]
    ) -> Tuple[GrammemeVectorizer, GrammemeVectorizer]:
        """
        Build the WordVocabulary and GrammemeVectorizer from the corpus.

        :param filenames: paths to the corpus files.
        :return: the input and output grammeme vectorizers.
        """
        for path in filenames:
            with tqdm_open(path, encoding="utf-8") as f:
                for line in f:
                    # Blank lines separate sentences; only token lines are processed.
                    if line != "\n":
                        self.__process_line(line)

        self.grammeme_vectorizer_input.init_possible_vectors()
        self.grammeme_vectorizer_output.init_possible_vectors()
        return self.grammeme_vectorizer_input, self.grammeme_vectorizer_output