Example #1
 def prepare(self,
             filenames: List[str] = list(),
             word_form_vocab_dump_path: str = GENERATOR_WORD_FORM_VOCAB_PATH,
             gram_dump_path: str = GENERATOR_GRAM_VECTORS) -> None:
     """
     Prepare the grammeme vectorizer and the word-form vocabulary from a corpus.

     :param filenames: names of the files with morphological markup.
     :param word_form_vocab_dump_path: path to the word-form vocabulary dump.
     :param gram_dump_path: path to the grammeme-vector dump.
     """
     self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
     self.word_form_vocabulary = WordFormVocabulary(word_form_vocab_dump_path)
     if self.grammeme_vectorizer.is_empty() or self.word_form_vocabulary.is_empty():
         loader = CorporaInformationLoader()
         self.word_form_vocabulary, self.grammeme_vectorizer = loader.parse_corpora(filenames)
         self.grammeme_vectorizer.save()
         self.word_form_vocabulary.save()
     if self.recalculate_softmax:
         self.softmax_size = self.word_form_vocabulary.get_softmax_size_by_lemma_size(self.embedding_size)
         print("Recalculated softmax: ", self.softmax_size)
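A minimal usage sketch for this method, assuming the `LSTMGenerator` class shown in full in Example #7 below; the corpus filename is hypothetical:

generator = LSTMGenerator(recalculate_softmax=True)
# First run: the dumps are empty, so the corpus is parsed and both dumps are saved.
# Later runs load the saved dumps instead of re-parsing.
generator.prepare(filenames=["corpus_with_morph_markup.txt"])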
Example #2
 def __init__(self):
     self.grammeme_vectorizer = GrammemeVectorizer()
     self.word_form_vocabulary = WordFormVocabulary()
     self.lemma_to_word_forms = defaultdict(set)  # type: Dict[str, Set[WordForm]]
     self.lemma_case = {}
     self.lemma_counter = Counter()  # type: Counter
Example #3
 def get_lstm_generator(self, model_path: str,
                        word_form_vocab_dump_path: str,
                        stress_vocab_dump_path: str,
                        gram_dump_path: str) -> Generator:
     if self.lstm_generator is None:
         lstm = LSTMModelContainer(model_path, word_form_vocab_dump_path, gram_dump_path)
         word_form_vocabulary = WordFormVocabulary(word_form_vocab_dump_path)
         vocabulary = StressVocabulary(stress_vocab_dump_path)
         self.lstm_generator = Generator(lstm, vocabulary, word_form_vocabulary)
     return self.lstm_generator
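The getter builds the `Generator` lazily and caches it on the instance, so repeated calls reuse one object. A hedged call sketch (the `engine` owner object and all file paths are hypothetical):

gen = engine.get_lstm_generator("lstm_model.h5",
                                "word_form_vocab.pickle",
                                "stress_vocab.pickle",
                                "gram_vectors.json")
# A second call returns the same cached instance.
assert gen is engine.get_lstm_generator("lstm_model.h5", "word_form_vocab.pickle",
                                        "stress_vocab.pickle", "gram_vectors.json")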
Example #4
 def get_sample(sentence, embedding_size: int, max_word_len: int,
                word_form_vocabulary: WordFormVocabulary,
                grammeme_vectorizer: GrammemeVectorizer):
     lemmas_vector = [
         min(word_form_vocabulary.get_lemma_index(x), embedding_size)
         for x in sentence
     ]
     grammemes_vector = [
         grammeme_vectorizer.get_vector_by_index(x.gram_vector_index)
         for x in sentence
     ]
     word_char_vectors = []
     for word in sentence:
         char_indices = np.zeros(max_word_len)
         word_char_indices = [CHAR_SET.index(ch) if ch in CHAR_SET else len(CHAR_SET)
                              for ch in word.text][:max_word_len]
         char_indices[-min(len(word.text), max_word_len):] = word_char_indices
         word_char_vectors.append(char_indices)
     return lemmas_vector, grammemes_vector, word_char_vectors
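Note how the character features are right-aligned: indices are written into the tail of a zero vector of length max_word_len, and longer words are truncated. A self-contained sketch of just that padding step, with a toy CHAR_SET (not the library's):

import numpy as np

CHAR_SET = "абвгдежз"  # toy character set, for illustration only
max_word_len = 6
text = "вагон"         # 'о' and 'н' are outside the toy set, so they map to len(CHAR_SET)

char_indices = np.zeros(max_word_len)
word_char_indices = [CHAR_SET.index(ch) if ch in CHAR_SET else len(CHAR_SET)
                     for ch in text][:max_word_len]
char_indices[-min(len(text), max_word_len):] = word_char_indices
print(char_indices)  # [0. 2. 0. 3. 8. 8.] -- zero padding on the left, indices on the right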
Example #5
    def is_rhyme(word1: StressedWord, word2: StressedWord, score_border: int=4, syllable_number_border: int=4,
                 word_form_vocabulary: WordFormVocabulary=None) -> bool:
        """
        Check whether two words rhyme.

        :param word1: first word to check, already stress-marked (Word).
        :param word2: second word to check, already stress-marked (Word).
        :param score_border: rhyme threshold; the higher, the stricter the match.
        :param syllable_number_border: limit on the stressed syllable's position, counted from the end of the word.
        :param word_form_vocabulary: word-form vocabulary.
        :return result: whether the words rhyme.
        """
        if word_form_vocabulary is not None:
            lemma1 = word_form_vocabulary.get_word_form_by_text(word1.text.lower()).lemma.lower()
            lemma2 = word_form_vocabulary.get_word_form_by_text(word2.text.lower()).lemma.lower()
            if lemma1 == lemma2:
                return False
        profile1 = Rhymes.__get_rhyme_profile(word1)
        profile2 = Rhymes.__get_rhyme_profile(word2)
        score = 0
        for i, ch1 in enumerate(profile1.stressed_syllable_text):
            for ch2 in profile2.stressed_syllable_text[i:]:
                if ch1 != ch2:
                    continue
                if ch1 in VOWELS:
                    score += 3
                else:
                    score += 1
        if profile1.next_syllable_text == profile2.next_syllable_text and profile1.next_syllable_text != '':
            score += 3
        elif profile1.next_char == profile2.next_char and profile1.next_char != '':
            score += 1
        return (profile1.stressed_syllable_number == profile2.stressed_syllable_number and
                profile1.syllable_count == profile2.syllable_count and
                profile1.stressed_syllable_number <= syllable_number_border and
                score >= score_border)
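The scoring loop rewards matching vowels (+3) more than matching consonants (+1) within the stressed syllables. A self-contained sketch of the same loop on raw syllable strings (VOWELS here is an assumed Russian vowel set, standing in for the library's constant):

VOWELS = "аеёиоуыэюя"  # assumption: the library's vowel constant

def syllable_score(s1: str, s2: str) -> int:
    score = 0
    for i, ch1 in enumerate(s1):
        for ch2 in s2[i:]:
            if ch1 == ch2:
                score += 3 if ch1 in VOWELS else 1
    return score

print(syllable_score("кот", "пот"))  # 'о' matches (+3), 'т' matches (+1) -> 4, enough for score_border=4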
Example #6
class CorporaInformationLoader(object):
    """
    Builds a GrammemeVectorizer and a WordFormVocabulary from a corpus.
    """
    def __init__(self):
        self.grammeme_vectorizer = GrammemeVectorizer()
        self.word_form_vocabulary = WordFormVocabulary()
        self.lemma_to_word_forms = defaultdict(set)  # type: Dict[str, Set[WordForm]]
        self.lemma_case = {}
        self.lemma_counter = Counter()  # type: Counter

    def parse_corpora(self, filenames: List[str]) -> Tuple[WordFormVocabulary, GrammemeVectorizer]:
        """
        Build a WordFormVocabulary and a GrammemeVectorizer from the corpus.

        :param filenames: paths to the corpus files.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)

        self.__add_seq_end()
        self.grammeme_vectorizer.init_possible_vectors()
        self.word_form_vocabulary.init_by_vocabulary(self.lemma_counter,
                                                     self.lemma_to_word_forms,
                                                     self.lemma_case)
        self.word_form_vocabulary.lemma_indices[SEQ_END_WF] = 1
        return self.word_form_vocabulary, self.grammeme_vectorizer

    def __add_seq_end(self):
        self.lemma_to_word_forms[SEQ_END].add(SEQ_END_WF)
        self.lemma_case[SEQ_END] = SEQ_END_WF.case
        self.lemma_counter[SEQ_END] = sys.maxsize

    def __process_line(self, line: str) -> None:
        try:
            text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
            lemma = lemma.lower() + '_' + pos_tag
            gram_vector_index = self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
            self.lemma_to_word_forms[lemma].add(WordForm(lemma, gram_vector_index, text.lower()))
            self.lemma_counter[lemma] += 1
            self.__update_lemma_case(lemma, text)
        except ValueError:
            pass

    def __update_lemma_case(self, lemma: str, text: str) -> None:
        if lemma not in self.lemma_case:
            self.lemma_case[lemma] = LemmaCase.UPPER_CASE if text.isupper() else \
                LemmaCase.PROPER_CASE if text[0].isupper() else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.UPPER_CASE:
            if not text.isupper():
                self.lemma_case[lemma] = LemmaCase.PROPER_CASE if text[0].isupper() else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.PROPER_CASE:
            if not text[0].isupper():
                self.lemma_case[lemma] = LemmaCase.NORMAL_CASE
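__process_line expects a tab-separated line with at least four fields: word form, lemma, POS tag, and grammemes; shorter lines raise ValueError and are silently skipped. A hedged round-trip sketch with one such line (the tag values are illustrative, not taken from a real corpus):

import tempfile

# form <TAB> lemma <TAB> POS <TAB> grammemes -- tag values are illustrative
line = "кошки\tкошка\tNOUN\tCase=Nom|Gender=Fem|Number=Plur\n"
with tempfile.NamedTemporaryFile("w", suffix=".txt", encoding="utf-8", delete=False) as f:
    f.write(line)

loader = CorporaInformationLoader()
vocabulary, vectorizer = loader.parse_corpora([f.name])
# The lemma key is lowercased and suffixed with the POS tag: 'кошка_NOUN'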
Example #7
class LSTMGenerator:
    """
    Language model based on a two-layer LSTM RNN.
    """
    def __init__(self, embedding_size: int=30000, external_batch_size: int=10000, nn_batch_size: int=768,
                 sentence_maxlen: int=10, lstm_units=368, embeddings_dimension: int=150, 
                 grammeme_dense_units: Tuple[int, ...]=(35, 15), dense_units: int=256, softmax_size: int=60000,
                 dropout: float=0.2, recalculate_softmax=False, max_word_len: int=30, char_embeddings_dimension: int=20,
                 char_lstm_output_dim: int=64):
        """
        :param embedding_size: input layer size (= vocabulary size).
        :param softmax_size: softmax output size (= size of the resulting probability set).
        :param external_batch_size: number of samples per BatchGenerator batch.
        :param nn_batch_size: number of samples per training batch.
        :param sentence_maxlen: maximum length of a sentence chunk.
        """
        self.embedding_size = embedding_size  # type: int
        self.softmax_size = softmax_size  # type: int
        self.external_batch_size = external_batch_size  # type: int
        self.nn_batch_size = nn_batch_size  # type: int
        self.sentence_maxlen = sentence_maxlen  # type: int
        self.word_form_vocabulary = None  # type: WordFormVocabulary
        self.grammeme_vectorizer = None  # type: GrammemeVectorizer
        self.lstm_units = lstm_units  # type: int
        self.embeddings_dimension = embeddings_dimension  # type: int
        self.grammeme_dense_units = grammeme_dense_units  # type: List[int]
        self.dense_units = dense_units  # type: int
        self.dropout = dropout  # type: float
        self.max_word_len = max_word_len  # type: int
        self.char_embeddings_dimension = char_embeddings_dimension  # type: int
        self.char_lstm_output_dim = char_lstm_output_dim  # type: int
        self.model = None  # type: Model
        self.recalculate_softmax = recalculate_softmax  # type: bool

    def prepare(self, filenames: List[str]=list(),
                word_form_vocab_dump_path: str=GENERATOR_WORD_FORM_VOCAB_PATH,
                gram_dump_path: str=GENERATOR_GRAM_VECTORS) -> None:
        """
        Prepare the grammeme vectorizer and the word-form vocabulary from a corpus.

        :param filenames: names of the files with morphological markup.
        :param word_form_vocab_dump_path: path to the word-form vocabulary dump.
        :param gram_dump_path: path to the grammeme-vector dump.
        """
        self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
        self.word_form_vocabulary = WordFormVocabulary(word_form_vocab_dump_path)
        if self.grammeme_vectorizer.is_empty() or self.word_form_vocabulary.is_empty():
            loader = CorporaInformationLoader()
            self.word_form_vocabulary, self.grammeme_vectorizer = loader.parse_corpora(filenames)
            self.grammeme_vectorizer.save()
            self.word_form_vocabulary.save()
        if self.recalculate_softmax:
            self.softmax_size = self.word_form_vocabulary.get_softmax_size_by_lemma_size(self.embedding_size)
            print("Recalculated softmax: ", self.softmax_size)

    def save(self, model_filename: str):
        self.model.save(model_filename)

    def load(self, model_filename: str) -> None:
        self.model = load_model(model_filename)

    def build(self):
        """
        Model definition.
        """
        # Lemma input
        lemmas = Input(shape=(None,), name='lemmas')
        lemmas_embedding = Embedding(self.embedding_size + 1, self.embeddings_dimension, name='embeddings')(lemmas)
        lemmas_embedding = SpatialDropout1D(.3)(lemmas_embedding)

        # Character input
        chars = Input(shape=(None, self.max_word_len), name='chars')
        chars_embedding = Embedding(len(CHAR_SET) + 1, self.char_embeddings_dimension, name='char_embeddings')(chars)
        chars_lstm = TimeDistributed(Bidirectional(
            LSTM(self.char_lstm_output_dim // 2, dropout=self.dropout, recurrent_dropout=self.dropout,
                 return_sequences=False, name='CharLSTM')))(chars_embedding)

        # Grammeme input
        grammemes_input = Input(shape=(None, self.grammeme_vectorizer.grammemes_count()), name='grammemes')
        grammemes_layer = grammemes_input
        for grammeme_dense_layer_units in self.grammeme_dense_units:
            grammemes_layer = Dense(grammeme_dense_layer_units, activation='relu')(grammemes_layer)

        layer = concatenate([lemmas_embedding, grammemes_layer, chars_lstm], name="LSTM_input")
        layer = LSTM(self.lstm_units, dropout=self.dropout, recurrent_dropout=self.dropout,
                     return_sequences=True, name='LSTM_1')(layer)
        layer = LSTM(self.lstm_units, dropout=self.dropout, recurrent_dropout=self.dropout,
                     return_sequences=False, name='LSTM_2')(layer)

        layer = Dense(self.dense_units)(layer)
        layer = BatchNormalization()(layer)
        layer = Activation('relu')(layer)

        output = Dense(self.softmax_size + 1, activation='softmax')(layer)

        self.model = Model(inputs=[lemmas, grammemes_input, chars], outputs=[output])
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
        print(self.model.summary())

    @staticmethod
    def __get_validation_data(batch_generator, size):
        """
        Takes the first `size` batches from batch_generator as the validation set.
        """
        lemmas_list, grammemes_list, chars_list, y_list = [], [], [], []
        for lemmas, grammemes, chars, y in islice(batch_generator, size):
            lemmas_list.append(lemmas)
            grammemes_list.append(grammemes)
            chars_list.append(chars)
            y_list.append(y)
        return np.vstack(lemmas_list), np.vstack(grammemes_list), np.vstack(chars_list), np.hstack(y_list)

    def train(self, filenames: List[str], validation_size: int=5,
              validation_verbosity: int=5, dump_model_freq: int=10,
              save_path: str=GENERATOR_LSTM_MODEL_PATH, start_epoch: int=0) -> None:
        """
        Train the model.

        :param filenames: names of the files with morphological markup.
        :param validation_size: size of the validation set, in batches.
        :param validation_verbosity: validate every validation_verbosity-th step.
        :param dump_model_freq: save the model every dump_model_freq-th step.
        :param save_path: where to save the model.
        :param start_epoch: epoch to start from.
        """
        batch_generator = BatchGenerator(filenames,
                                         batch_size=self.external_batch_size,
                                         embedding_size=self.embedding_size,
                                         softmax_size=self.softmax_size,
                                         sentence_maxlen=self.sentence_maxlen,
                                         word_form_vocabulary=self.word_form_vocabulary,
                                         grammeme_vectorizer=self.grammeme_vectorizer,
                                         max_word_len=self.max_word_len)

        lemmas_val, grammemes_val, chars_val, y_val = \
            LSTMGenerator.__get_validation_data(batch_generator, validation_size)
        for big_epoch in range(0, 1000):
            print('------------Big Epoch {}------------'.format(big_epoch))
            for epoch, (lemmas, grammemes, chars, y) in enumerate(batch_generator):
                if epoch < start_epoch:
                    continue
                if epoch < validation_size:
                    continue
                self.model.fit([lemmas, grammemes, chars], y, batch_size=self.nn_batch_size, epochs=1, verbose=2)

                if epoch != 0 and epoch % validation_verbosity == 0:
                    print('val loss:', self.model.evaluate([lemmas_val, grammemes_val, chars_val],
                                                           y_val, batch_size=self.nn_batch_size * 2, verbose=0))

                indices = [self.word_form_vocabulary.get_sequence_end_index()]
                for _ in range(10):
                    indices.append(self._sample(self.predict(indices)))
                sentence = [self.word_form_vocabulary.get_word_form_by_index(index) for index in indices]
                print('Sentence', str(big_epoch), str(epoch), end=': ')
                for word in sentence[::-1]:
                    print(word.text, end=' ')
                print()

                if epoch != 0 and epoch % dump_model_freq == 0:
                    self.save(save_path)

    def predict(self, word_indices: List[int]) -> np.ndarray:
        """
        Predict the probabilities of the next word.

        :param word_indices: indices of the preceding words.
        :return: language-model output (next-word probabilities).
        """
        if len(word_indices) == 0:
            return np.full(self.softmax_size, 1.0 / self.softmax_size, dtype=float)

        cur_sent = [self.word_form_vocabulary.get_word_form_by_index(ind) for ind in word_indices]

        x_lemmas = np.zeros((1, len(cur_sent)))
        x_grammemes = np.zeros((1, len(cur_sent), self.grammeme_vectorizer.grammemes_count()))
        x_chars = np.zeros((1, len(cur_sent), self.max_word_len))

        lemmas_vector, grammemes_vector, chars_vector =\
            BatchGenerator.get_sample(cur_sent, self.embedding_size, self.max_word_len,
                                      word_form_vocabulary=self.word_form_vocabulary,
                                      grammeme_vectorizer=self.grammeme_vectorizer)

        x_lemmas[0, -len(cur_sent):] = lemmas_vector
        x_grammemes[0, -len(cur_sent):] = grammemes_vector
        x_chars[0, -len(cur_sent):] = chars_vector
        prob = self.model.predict([x_lemmas, x_grammemes, x_chars], verbose=0)[0]
        return prob

    @staticmethod
    def _sample(prob: np.ndarray, temperature: float=1.0) -> int:
        """
        Sample a word index from a probability distribution at a given temperature (Boltzmann distribution).

        :param prob: probabilities.
        :param temperature: temperature.
        :return: index of the chosen word.
        """
        prob = prob[:-1]  # Exclude the unknown-word bucket.
        prob = np.log(prob) / temperature
        prob = np.exp(prob) / np.sum(np.exp(prob))
        return np.random.choice(len(prob), p=prob)
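A self-contained sketch of the same temperature reweighting on a toy distribution: temperatures below 1 sharpen it toward the most likely word, temperatures above 1 flatten it toward uniform.

import numpy as np

prob = np.array([0.7, 0.2, 0.1])
for t in (0.5, 1.0, 2.0):
    p = np.exp(np.log(prob) / t)  # equivalent to prob ** (1 / t)
    p /= p.sum()
    print(t, p.round(3))
# 0.5 [0.907 0.074 0.019]  -- sharper
# 1.0 [0.7 0.2 0.1]        -- unchanged
# 2.0 [0.523 0.279 0.198]  -- flatter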