def __iter__(self): """ Получение очередного батча. :return: индексы словоформ, грамматические векторы, ответы-индексы. """ sentences = [[]] i = 0 for filename in self.filenames: with tqdm_open(filename, encoding='utf-8') as f: for line in f: line = line.strip() if len(line) == 0: last_sentence = sentences[-1] is_wrong_sentence = (i not in self.indices) or \ (len(last_sentence) < self.sentence_len_low) or \ (len(last_sentence) > self.sentence_len_high) if is_wrong_sentence: sentences.pop() if len(sentences) >= self.batch_size: yield self.__to_tensor(sentences) sentences = [] sentences.append([]) i += 1 else: word, lemma, pos, tags = line.split('\t')[0:4] word, lemma = word.lower(), lemma.lower() + '_' + pos gram_vector_index = self.grammeme_vectorizer_output.get_index_by_name(pos + "#" + tags) sentences[-1].append(WordForm(lemma, gram_vector_index, word)) if len(sentences[-1]) == 0: sentences.pop() yield self.__to_tensor(sentences)
def __iter__(self): """ Получение очередного батча. :return: индексы словоформ, грамматические векторы, ответы-индексы. """ last_sentence = [] i = 0 for filename in self.filenames: with tqdm_open(filename, encoding='utf-8') as f: for line in f: line = line.strip() if len(line) == 0: if i in self.indices: for index, bucket in enumerate(self.buckets): if self.bucket_borders[index][0] <= len(last_sentence) < self.bucket_borders[index][1]: bucket.append(last_sentence) if len(bucket) >= self.batch_size: yield self.__to_tensor(bucket) self.buckets[index] = [] last_sentence = [] i += 1 else: word, lemma, pos, tags = line.split('\t')[0:4] word, lemma = word.lower(), lemma.lower() + '_' + pos gram_vector_index = self.grammeme_vectorizer_output.get_index_by_name(pos + "#" + tags) last_sentence.append(WordForm(lemma, gram_vector_index, word)) for index, bucket in enumerate(self.buckets): yield self.__to_tensor(bucket)
def collect_grammemes(self, filename: str) -> None:
    """
    Collect the possible grammatical values from a file with morphological annotation.

    :param filename: file with morphological annotation.
    """
    with tqdm_open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            # The POS tag and grammemes sit in the third and fourth columns.
            pos_tag, grammemes = line.split("\t")[2:4]
            self.add_grammemes(pos_tag, grammemes)
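# A hypothetical usage sketch for collect_grammemes. Whether this method
# lives on GrammemeVectorizer (the class named in the docstrings below)
# is an assumption, as is the file name; init_possible_vectors is the
# freezing step called on the vectorizers in parse_corpora.
vectorizer = GrammemeVectorizer()
vectorizer.collect_grammemes("train_annotated.tsv")
vectorizer.init_possible_vectors()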
def parse_corpora(self, file_names: List[str]):
    """
    Build the WordVocabulary and GrammemeVectorizer from a corpus.

    :param file_names: paths to the corpus files.
    """
    for file_name in file_names:
        with tqdm_open(file_name, encoding="utf-8") as f:
            for line in f:
                if line == "\n":
                    continue
                self.__process_line(line)
    self.grammeme_vectorizer_input.init_possible_vectors()
    self.grammeme_vectorizer_output.init_possible_vectors()
    self.word_vocabulary.sort()
    # Normalize the character set: a single leading space, with all other
    # spaces removed from the collected characters.
    self.char_set = " " + "".join(self.char_set).replace(" ", "")
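# A small demonstration of the char_set normalization at the end of
# parse_corpora above: the result is a single leading space followed by
# the collected characters with all spaces stripped. The character set
# below is made up; note that iteration order over a Python set is
# arbitrary.
chars = {"a", " ", "б", "1"}
normalized = " " + "".join(chars).replace(" ", "")
assert normalized[0] == " " and " " not in normalized[1:]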
def parse_corpora(
    self,
    filenames: List[str]
) -> Tuple[GrammemeVectorizer, GrammemeVectorizer]:
    """
    Build the WordVocabulary and GrammemeVectorizer from a corpus.

    :param filenames: paths to the corpus files.
    :return: input and output grammeme vectorizers.
    """
    for filename in filenames:
        with tqdm_open(filename, encoding="utf-8") as f:
            for line in f:
                if line == "\n":
                    continue
                self.__process_line(line)
    self.grammeme_vectorizer_input.init_possible_vectors()
    self.grammeme_vectorizer_output.init_possible_vectors()
    return self.grammeme_vectorizer_input, self.grammeme_vectorizer_output
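# A hypothetical usage sketch for the returning parse_corpora variant;
# the Loader class name and file names are assumptions, while the call
# signature and return value come from the code above.
loader = Loader()
input_vectorizer, output_vectorizer = loader.parse_corpora(["part1.tsv", "part2.tsv"])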