import sys
from collections import Counter, defaultdict
from itertools import islice
from typing import Dict, List, Set, Tuple

import numpy as np
import pymorphy2
from russian_tagsets import converters
from keras.models import Model, load_model, model_from_json
from keras.layers import (Activation, BatchNormalization, Concatenate, Dense,
                          Embedding, Input, LSTM, Masking, SpatialDropout1D)

# Project-local names (GrammemeVectorizer, WordFormVocabulary, WordVocabulary,
# WordForm, LemmaCase, SEQ_END, SEQ_END_WF, BatchGenerator, tqdm_open and the
# GENERATOR_* paths) are assumed to be importable from elsewhere in this
# package; their exact module paths are not shown here.


class Loader(object):
    """
    Builds a GrammemeVectorizer and a WordVocabulary from a corpus.
    """
    def __init__(self, gram_dump_path, word_dump_path):
        self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
        self.word_vocabulary = WordVocabulary(word_dump_path)
        self.morph = pymorphy2.MorphAnalyzer()

    def parse_corpora(self, filenames: List[str]) -> Tuple[GrammemeVectorizer, WordVocabulary]:
        """
        Build the WordVocabulary and GrammemeVectorizer from the corpus.

        :param filenames: paths to the corpus files.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)
        self.grammeme_vectorizer.init_possible_vectors()
        return self.grammeme_vectorizer, self.word_vocabulary

    def __process_line(self, line: str) -> None:
        text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
        self.word_vocabulary.add_word(text)
        self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
        to_ud = converters.converter('opencorpora-int', 'ud14')
        for parse in self.morph.parse(text):
            ud_tag = to_ud(str(parse.tag), text)
            pos = ud_tag.split()[0]
            gram = ud_tag.split()[1].split("|")
            # Drop grammeme categories that are not needed for generation.
            dropped = ["Animacy", "Aspect", "NumType"]
            gram = [grammem for grammem in gram
                    if not any(drop in grammem for drop in dropped)]
            self.grammeme_vectorizer.add_grammemes(pos, "|".join(gram))
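# A minimal usage sketch for Loader, assuming a corpus of tab-separated
# "form<TAB>lemma<TAB>POS<TAB>grammemes" lines with blank lines between
# sentences; the dump and corpus paths below are hypothetical placeholders.
def _loader_example() -> None:
    loader = Loader("dumps/gram_vectors.json", "dumps/word_vocab.pickle")
    vectorizer, vocabulary = loader.parse_corpora(["corpus/markup.txt"])
    # grammemes_count() gives the dimensionality of the grammeme vectors.
    print("Grammeme vector size:", vectorizer.grammemes_count())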
class CorporaInformationLoader(object):
    """
    Builds a GrammemeVectorizer and a WordFormVocabulary from a corpus.
    """
    def __init__(self):
        self.grammeme_vectorizer = GrammemeVectorizer()
        self.word_form_vocabulary = WordFormVocabulary()
        self.lemma_to_word_forms = defaultdict(set)  # type: Dict[str, Set[WordForm]]
        self.lemma_case = {}
        self.lemma_counter = Counter()  # type: Counter

    def parse_corpora(self, filenames: List[str]) -> Tuple[WordFormVocabulary, GrammemeVectorizer]:
        """
        Build the WordFormVocabulary and GrammemeVectorizer from the corpus.

        :param filenames: paths to the corpus files.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)
        self.__add_seq_end()
        self.grammeme_vectorizer.init_possible_vectors()
        self.word_form_vocabulary.init_by_vocabulary(self.lemma_counter,
                                                     self.lemma_to_word_forms,
                                                     self.lemma_case)
        self.word_form_vocabulary.lemma_indices[SEQ_END_WF] = 1
        return self.word_form_vocabulary, self.grammeme_vectorizer

    def __add_seq_end(self):
        # The end-of-sequence token gets the maximal count so it always
        # stays in the vocabulary.
        self.lemma_to_word_forms[SEQ_END].add(SEQ_END_WF)
        self.lemma_case[SEQ_END] = SEQ_END_WF.case
        self.lemma_counter[SEQ_END] = sys.maxsize

    def __process_line(self, line: str) -> None:
        try:
            text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
            lemma = lemma.lower() + '_' + pos_tag
            gram_vector_index = self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
            self.lemma_to_word_forms[lemma].add(WordForm(lemma, gram_vector_index, text.lower()))
            self.lemma_counter[lemma] += 1
            self.__update_lemma_case(lemma, text)
        except ValueError:
            # Malformed lines (fewer than four tab-separated fields) are skipped.
            pass

    def __update_lemma_case(self, lemma: str, text: str) -> None:
        if lemma not in self.lemma_case:
            self.lemma_case[lemma] = LemmaCase.UPPER_CASE if text.isupper() else \
                LemmaCase.PROPER_CASE if text[0].isupper() else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.UPPER_CASE:
            if not text.isupper():
                self.lemma_case[lemma] = LemmaCase.PROPER_CASE if text[0].isupper() \
                    else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.PROPER_CASE:
            if not text[0].isupper():
                self.lemma_case[lemma] = LemmaCase.NORMAL_CASE
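# A sketch of how CorporaInformationLoader is meant to be driven (the corpus
# path is a hypothetical placeholder); LSTMGenerator.prepare below does the
# same thing and additionally caches the results on disk.
def _corpora_loader_example() -> None:
    loader = CorporaInformationLoader()
    vocabulary, vectorizer = loader.parse_corpora(["corpus/markup.txt"])
    vocabulary.save()
    vectorizer.save()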
class LSTMGenerator:
    """
    A language model based on a two-layer LSTM RNN.
    """
    def __init__(self, embedding_size: int = 30000, softmax_size: int = 60000,
                 external_batch_size: int = 10000, nn_batch_size: int = 768,
                 sentence_maxlen: int = 10, lstm_units=368,
                 embeddings_dimension: int = 150,
                 grammeme_dense_units: List[int] = [35, 15],
                 dense_units: int = 256):
        """
        :param embedding_size: size of the input layer (= vocabulary size).
        :param softmax_size: size of the softmax output layer (= size of the resulting probability set).
        :param external_batch_size: number of samples per BatchGenerator batch.
        :param nn_batch_size: number of samples per training batch.
        :param sentence_maxlen: maximum length of a sentence chunk.
        """
        self.embedding_size = embedding_size  # type: int
        self.softmax_size = softmax_size  # type: int
        self.external_batch_size = external_batch_size  # type: int
        self.nn_batch_size = nn_batch_size  # type: int
        self.sentence_maxlen = sentence_maxlen  # type: int
        self.word_form_vocabulary = None  # type: WordFormVocabulary
        self.grammeme_vectorizer = None  # type: GrammemeVectorizer
        self.lstm_units = lstm_units  # type: int
        self.embeddings_dimension = embeddings_dimension  # type: int
        self.grammeme_dense_units = grammeme_dense_units  # type: List[int]
        self.dense_units = dense_units  # type: int
        self.model = None  # type: Model

    def prepare(self, filenames: List[str] = list(),
                word_form_vocab_dump_path: str = GENERATOR_WORD_FORM_VOCAB_PATH,
                gram_dump_path: str = GENERATOR_GRAM_VECTORS) -> None:
        """
        Prepare the grammeme vectorizer and the word form vocabulary from the corpus.

        :param filenames: names of the files with morphological markup.
        :param word_form_vocab_dump_path: path to the word form vocabulary dump.
        :param gram_dump_path: path to the grammeme vectors.
        """
        self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
        self.word_form_vocabulary = WordFormVocabulary(word_form_vocab_dump_path)
        if self.grammeme_vectorizer.is_empty() or self.word_form_vocabulary.is_empty():
            loader = CorporaInformationLoader()
            self.word_form_vocabulary, self.grammeme_vectorizer = loader.parse_corpora(filenames)
            self.grammeme_vectorizer.save()
            self.word_form_vocabulary.save()

    def load(self, model_filename: str) -> None:
        """
        Load a model.

        :param model_filename: file with the model.
        """
        self.model = load_model(model_filename)

    def load_with_weights(self, json_filename: str, weights_filename: str) -> None:
        """
        Load a model from a JSON description and a weights file.

        :param json_filename: JSON description.
        :param weights_filename: weights file.
        """
        with open(json_filename, 'r', encoding='utf8') as f:
            json_string = f.readline()
        self.model = model_from_json(json_string)
        self.model.load_weights(weights_filename)

    def build(self):
        """
        Model definition.
""" # Вход лемм lemmas = Input(shape=(None, ), name='lemmas') lemmas_embedding = Embedding(self.embedding_size + 1, self.embeddings_dimension, name='embeddings')(lemmas) lemmas_embedding = SpatialDropout1D(.3)(lemmas_embedding) # Вход граммем grammemes_input = Input( shape=(None, self.grammeme_vectorizer.grammemes_count()), name='grammemes') grammemes_layer = Masking(mask_value=0.)(grammemes_input) for grammeme_dense_layer_units in self.grammeme_dense_units: grammemes_layer = Dense(grammeme_dense_layer_units, activation='relu')(grammemes_layer) layer = Merge(mode='concat', name='LSTM_input')([lemmas_embedding, grammemes_layer]) layer = LSTM(self.lstm_units, dropout=.2, recurrent_dropout=.2, return_sequences=True, name='LSTM_1')(layer) layer = LSTM(self.lstm_units, dropout=.2, recurrent_dropout=.2, return_sequences=False, name='LSTM_2')(layer) layer = Dense(self.dense_units)(layer) layer = BatchNormalization()(layer) layer = Activation('relu')(layer) output = Dense(self.softmax_size + 1, activation='softmax')(layer) self.model = Model(inputs=[lemmas, grammemes_input], outputs=[output]) self.model.compile(loss='sparse_categorical_crossentropy', optimizer='adam') print(self.model.summary()) @staticmethod def __get_validation_data(batch_generator, size): """ Берет первые size батчей и batch_generator для валидационной выборки """ lemmas_list, grammemes_list, y_list = [], [], [] for lemmas, grammemes, y in islice(batch_generator, size): lemmas_list.append(lemmas) grammemes_list.append(grammemes) y_list.append(y) return np.vstack(lemmas_list), np.vstack(grammemes_list), np.hstack( y_list) def train(self, filenames: List[str], validation_size: int = 5, validation_verbosity: int = 5, dump_model_freq: int = 10) -> None: """ Обучение модели. :param filenames: имена файлов с морфоразметкой. """ batch_generator = BatchGenerator( filenames, batch_size=self.external_batch_size, embedding_size=self.embedding_size, softmax_size=self.softmax_size, sentence_maxlen=self.sentence_maxlen, word_form_vocabulary=self.word_form_vocabulary, grammeme_vectorizer=self.grammeme_vectorizer) lemmas_val, grammemes_val, y_val = LSTMGenerator.__get_validation_data( batch_generator, validation_size) for big_epoch in range(0, 1000): print('------------Big Epoch {}------------'.format(big_epoch)) for epoch, (lemmas, grammemes, y) in enumerate(batch_generator): if epoch < validation_size: continue self.model.fit([lemmas, grammemes], y, batch_size=self.nn_batch_size, epochs=1, verbose=2) if epoch != 0 and epoch % validation_verbosity == 0: print( 'val loss:', self.model.evaluate([lemmas_val, grammemes_val], y_val, batch_size=self.nn_batch_size * 2, verbose=0)) indices = [ self.word_form_vocabulary.get_sequence_end_index( SEQ_END_WF) ] for _ in range(10): indices.append(self._sample(self.predict(indices))) sentence = [ self.word_form_vocabulary.get_word_form_by_index(index) for index in indices ] print('Sentence', str(big_epoch), str(epoch), end=': ') for word in sentence[::-1]: print(word.text, end=' ') print() if epoch != 0 and epoch % dump_model_freq == 0: self.model.save(GENERATOR_LSTM_MODEL_PATH) def predict(self, word_indices: List[int]) -> np.array: """ Предсказание вероятностей следующего слова. :param word_indices: индексы предыдущих слов. :return: проекция языковой модели (вероятности следующего слова). 
""" if len(word_indices) == 0: return np.full(self.softmax_size, 1.0 / self.softmax_size, dtype=np.float) cur_sent = [ self.word_form_vocabulary.get_word_form_by_index(ind) for ind in word_indices ] x_lemmas = np.zeros((1, len(cur_sent))) x_grammemes = np.zeros( (1, len(cur_sent), self.grammeme_vectorizer.grammemes_count())) for index, word in enumerate(cur_sent): x_lemmas[ 0, index] = self.word_form_vocabulary.get_word_form_index_min( word, self.softmax_size) x_grammemes[0, index] = self.grammeme_vectorizer.vectors[ word.gram_vector_index] prob = self.model.predict([x_lemmas, x_grammemes], verbose=0)[0] return prob @staticmethod def _sample(prob: np.array, temperature: float = 1.0) -> int: """ Выбор слова по набору вероятностей с заданной температурой (распределение Больцмана). :param prob: вероятности. :param temperature: температура. :return: индекс итогового слова. """ prob = prob[:-1] # Для исключения неизвестных слов. prob = np.log(prob) / temperature prob = np.exp(prob) / np.sum(np.exp(prob)) return np.random.choice(len(prob), p=prob)