Example 1
def __init__(self):
    self.grammeme_vectorizer_input = GrammemeVectorizer()
    self.grammeme_vectorizer_output = GrammemeVectorizer()
    self.word_dictionary = WordDictionary()
    self.char_set = set()
    self.morph = MorphAnalyzer()  # pymorphy2
    self.converter = converters.converter('opencorpora-int', 'ud14')
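The MorphAnalyzer and the converter do the raw morphological work here: pymorphy2 produces OpenCorpora tags, and the russian-tagsets converter maps them to Universal Dependencies 1.4. A minimal sketch of that interaction, assuming the standard pymorphy2 and russian-tagsets APIs (the sample word is arbitrary):

from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

morph = MorphAnalyzer()
to_ud = converters.converter('opencorpora-int', 'ud14')

word = "стали"
for parse in morph.parse(word):
    # Each parse hypothesis carries an OpenCorpora tag; convert it to UD 1.4.
    ud_tag = to_ud(str(parse.tag), word)
    print(parse.normal_form, ud_tag)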
Example 2
def __init__(self):
    self.morph = MorphAnalyzer()  # uses pymorphy2
    self.converter = converters.converter('opencorpora-int', 'ud14')
    self.grammeme_vectorizer_input = GrammemeVectorizer()
    self.grammeme_vectorizer_output = GrammemeVectorizer()
    self.word_dictionary = WordDictionary()
    self.char_set = ""
    self.train_model = None
    self.main_model = None
Example 3
class Loader(object):
    """
    Builds a GrammemeVectorizer and a WordDictionary from a corpus.
    """
    def __init__(self):
        self.grammeme_vectorizer_input = GrammemeVectorizer()
        self.grammeme_vectorizer_output = GrammemeVectorizer()
        self.word_dictionary = WordDictionary()
        self.char_set = set()
        self.morph = MorphAnalyzer()  # pymorphy2
        self.converter = converters.converter('opencorpora-int', 'ud14')

    def parse_corpora(self, file_names: List[str]):
        """
        Build the WordDictionary and GrammemeVectorizer from the corpus.

        file_names: paths to the corpus files.
        """
        for file_name in file_names:
            with open(file_name, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)

        self.grammeme_vectorizer_input.init_possible_vectors()
        self.grammeme_vectorizer_output.init_possible_vectors()
        self.word_dictionary.sort()
        self.char_set = " " + "".join(self.char_set).replace(" ", "")

    def __process_line(self, line: str):
        """
        Process a single line of the morphologically annotated corpus.
        """
        i, text, lemma, pos_tag, grammemes = line.strip().split("\t")[0:5]
        # Fill the word dictionary.
        self.word_dictionary.add_word(text.lower())
        # Collect the set of unique characters.
        self.char_set |= set(text)
        # Collect the set of possible output tags.
        self.grammeme_vectorizer_output.add_grammemes(pos_tag, grammemes)
        # Collect the set of possible input tags.
        for parse in self.morph.parse(text):  # parse hypotheses from pymorphy2
            pos, gram = convert_from_opencorpora_tag(self.converter, parse.tag, text)
            gram = filter_gram_tag(gram)
            self.grammeme_vectorizer_input.add_grammemes(pos, gram)
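A hedged usage sketch for Loader, assuming the tab-separated corpus format that __process_line expects (index, form, lemma, POS tag, grammemes per line; sentences separated by blank lines). The file path is hypothetical:

loader = Loader()
loader.parse_corpora(["corpus/train.txt"])  # hypothetical path

print(loader.word_dictionary.size())                        # vocabulary size
print(loader.grammeme_vectorizer_output.grammemes_count())  # output tag space
print(repr(loader.char_set))                                # " " plus all other characters seen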
Example 4
    @staticmethod
    def getFeaturesForSentence(sentence: List[str], converter,
                               morph: MorphAnalyzer,
                               grammeme_vectorizer: GrammemeVectorizer,
                               max_word_len: int,
                               word_dictionary: WordDictionary,
                               word_count: int, char_set: str):
        """
        Extract features for a single sentence.

        :return: word indices, grammeme vectors, character indices.
        """
        word_char_vectors = []
        word_gram_vectors = []
        word_indices = []
        for word in sentence:
            char_indices = np.zeros(max_word_len)
            gram_value_indices = np.zeros(
                grammeme_vectorizer.grammemes_count())

            # Character indices of the word (keep at most the last max_word_len).
            word_char_indices = [
                char_set.index(ch) if ch in char_set else len(char_set)
                for ch in word
            ][-max_word_len:]
            char_indices[-min(len(word), max_word_len):] = word_char_indices
            word_char_vectors.append(char_indices)

            # Word index; out-of-vocabulary words map to word_count.
            if word_dictionary.has_word(word):
                word_index = word_dictionary.word_to_index[word.lower()]
            else:
                word_index = word_count
            word_index = min(word_index, word_count)
            word_indices.append(word_index)

            # Grammeme vector of the word: sum the vectors of all
            # possible parses element-wise.
            for parse in morph.parse(word):
                pos, gram = convert_from_opencorpora_tag(
                    converter, parse.tag, word)
                gram = filter_gram_tag(gram)
                gram_value_indices += np.array(
                    grammeme_vectorizer.get_vector(pos + "#" + gram))

            # Normalize within each grammatical category separately.
            sorted_grammemes = sorted(
                grammeme_vectorizer.all_grammemes.items(), key=lambda x: x[0])
            index = 0
            for category, values in sorted_grammemes:
                mask = gram_value_indices[index:index + len(values)]
                s = sum(mask)
                gram_value_indices[index:index + len(values)] = (
                    mask / s if s != 0 else 0.0)
                index += len(values)
            word_gram_vectors.append(gram_value_indices)

        return word_indices, word_gram_vectors, word_char_vectors
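The per-category normalization above turns the element-wise sum over all pymorphy2 parses into a probability distribution inside each grammatical category. A small numeric sketch with made-up categories and counts:

import numpy as np

# Summed parse counts (hypothetical): Case spans 3 slots, Number spans 2.
gram_value_indices = np.array([2.0, 1.0, 1.0,   # Case: Nom, Gen, Dat
                               3.0, 1.0])       # Number: Sing, Plur
categories = [("Case", 3), ("Number", 2)]

index = 0
for _, length in categories:
    block = gram_value_indices[index:index + length]
    s = block.sum()
    gram_value_indices[index:index + length] = block / s if s != 0 else 0.0
    index += length

print(gram_value_indices)  # [0.5  0.25 0.25 0.75 0.25]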
Example 5
def get_char_model(char_layer,
                   max_word_length: int,
                   dictionary: WordDictionary,
                   char_set: str,
                   embeddings: np.ndarray,
                   model_weights_path: str,
                   model_config_path: str,
                   batch_size: int = 128,
                   test_part: float = 0.2,
                   seed: int = 42):
    """
    Train or load the char-level function.

    :param char_layer: the char-level function to be trained.
    :param max_word_length: maximum word length; longer words are truncated.
    :param dictionary: the word dictionary.
    :param char_set: the set of characters for which embeddings are built.
    :param embeddings: the word embedding matrix.
    :param model_weights_path: path where the model weights are saved.
    :param model_config_path: path where the model config is saved.
    :param batch_size: batch size.
    :param test_part: fraction of the data used as the test set.
    :param seed: PRNG seed.
    """
    model = CharEmbeddingsModel()
    if model_config_path is not None and os.path.exists(model_config_path):
        assert model_weights_path is not None and os.path.exists(
            model_weights_path)
        model.load(model_config_path, model_weights_path)
    else:
        dictionary = copy.deepcopy(dictionary)
        dictionary.shrink(embeddings.shape[0])
        model.build(dictionary_size=dictionary.size(),
                    word_embeddings_dimension=embeddings.shape[1],
                    max_word_length=max_word_length,
                    word_embeddings=embeddings.T,
                    char_layer=char_layer)
        model.train(dictionary, char_set, test_part, seed, batch_size,
                    max_word_length)
        if model_config_path is not None and model_weights_path is not None:
            model.save(model_config_path, model_weights_path)
    return model.char_layer
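A hedged call sketch: when both paths point to a saved model, get_char_model loads it; otherwise it trains char_layer against the word embeddings and caches the result. The hyperparameters and paths below are made up; build_dense_chars_layer is the constructor also used in Example 7:

char_layer = build_dense_chars_layer(
    max_word_length=30,
    char_vocab_size=len(char_set) + 1,
    char_emb_dim=24,
    hidden_dim=128,
    output_dim=64,
    dropout=0.2)
char_layer = get_char_model(
    char_layer=char_layer,
    max_word_length=30,
    dictionary=word_dictionary,
    char_set=char_set,
    embeddings=embeddings,
    model_weights_path="models/char_model.h5",    # hypothetical path
    model_config_path="models/char_model.yaml")   # hypothetical path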
Example 6
def load_embeddings(embeddings_file_name: str, dictionary: WordDictionary,
                    word_count: int):
    """
    Load word embeddings from a file.
    """
    with open(embeddings_file_name, "r", encoding='utf-8') as f:
        line = next(f)
        dimension = int(line.strip().split()[1])
        matrix = np.random.rand(min(dictionary.size(), word_count + 1),
                                dimension) * 0.05
        words = {
            word: i
            for i, word in enumerate(dictionary.words[:word_count])
        }
        for line in f:
            try:
                fields = line.strip().split()
                word = fields[0]
                embedding = [float(x) for x in fields[1:]]
                index = words.get(word)
                if index is not None:
                    matrix[index] = embedding
            except (ValueError, UnicodeDecodeError):
                continue
        return matrix
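load_embeddings expects the plain-text word2vec layout: a header line whose second field is the dimension, then one word followed by its vector per line. A hedged round-trip sketch with a toy file (dictionary is assumed to come from a prepared Loader):

# Toy embeddings file in the plain-text word2vec format.
with open("toy_embeddings.txt", "w", encoding="utf-8") as f:
    f.write("2 3\n")            # header: <vocab_size> <dimension>
    f.write("и 0.1 0.2 0.3\n")
    f.write("в 0.4 0.5 0.6\n")

matrix = load_embeddings("toy_embeddings.txt", dictionary, word_count=10000)
print(matrix.shape)  # (min(dictionary.size(), 10001), 3)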
Example 7
class LSTMModel:
    def __init__(self):
        self.morph = MorphAnalyzer()  # uses pymorphy2
        self.converter = converters.converter('opencorpora-int', 'ud14')
        self.grammeme_vectorizer_input = GrammemeVectorizer()
        self.grammeme_vectorizer_output = GrammemeVectorizer()
        self.word_dictionary = WordDictionary()
        self.char_set = ""
        self.train_model = None
        self.main_model = None

    def prepare(self,
                gram_dump_path_input: str,
                gram_dump_path_output: str,
                word_dictionary_dump_path: str,
                char_set_dump_path: str,
                file_names: List[str] = None) -> None:
        """
        Prepare the grammeme value vectorizers and the word dictionary
        from the corpus.
        """
        if os.path.exists(gram_dump_path_input):
            self.grammeme_vectorizer_input.load(gram_dump_path_input)
        if os.path.exists(gram_dump_path_output):
            self.grammeme_vectorizer_output.load(gram_dump_path_output)
        if os.path.exists(word_dictionary_dump_path):
            self.word_dictionary.load(word_dictionary_dump_path)
        if os.path.exists(char_set_dump_path):
            with open(char_set_dump_path, 'r', encoding='utf-8') as f:
                self.char_set = f.read().rstrip()
        if self.grammeme_vectorizer_input.is_empty() or \
                self.grammeme_vectorizer_output.is_empty() or \
                self.word_dictionary.is_empty() or \
                not self.char_set:
            loader = Loader()
            loader.parse_corpora(file_names)

            self.grammeme_vectorizer_input = loader.grammeme_vectorizer_input
            self.grammeme_vectorizer_input.save(gram_dump_path_input)
            self.grammeme_vectorizer_output = loader.grammeme_vectorizer_output
            self.grammeme_vectorizer_output.save(gram_dump_path_output)
            self.word_dictionary = loader.word_dictionary
            self.word_dictionary.save(word_dictionary_dump_path)
            self.char_set = loader.char_set
            with open(char_set_dump_path, 'w', encoding='utf-8') as f:
                f.write(self.char_set)
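    # A hedged usage sketch for prepare (all paths hypothetical): on the first
    # run the corpus is parsed and the dumps are written; subsequent runs load
    # the dumps, so file_names may be omitted.
    #
    #   model = LSTMModel()
    #   model.prepare("dumps/gram_input.json", "dumps/gram_output.json",
    #                 "dumps/dict.txt", "dumps/char_set.txt",
    #                 file_names=["corpus/train.txt"])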

    def save_model(self, model_config_path: str, model_weights_path: str,
                   main_model_config_path: str, main_model_weights_path: str):
        if self.main_model is not None:
            with open(main_model_config_path, "w", encoding='utf-8') as f:
                f.write(self.main_model.to_yaml())
            self.main_model.save_weights(main_model_weights_path)
        if self.train_model is not None:
            with open(model_config_path, "w", encoding='utf-8') as f:
                f.write(self.train_model.to_yaml())
            self.train_model.save_weights(model_weights_path)

    def load_train_model(self,
                         config: BuildModelConfig,
                         model_config_path: str = None,
                         model_weights_path: str = None):
        with open(model_config_path, "r", encoding='utf-8') as f:
            custom_objects = {'ReversedLSTM': ReversedLSTM}
            self.train_model = model_from_yaml(f.read(),
                                               custom_objects=custom_objects)
        self.train_model.load_weights(model_weights_path)

        loss = {}
        metrics = {}
        out_layer_name = 'main_pred'
        loss[out_layer_name] = 'sparse_categorical_crossentropy'
        metrics[out_layer_name] = 'accuracy'

        if config.use_pos_lm:
            prev_layer_name = 'shifted_pred_prev'
            next_layer_name = 'shifted_pred_next'
            loss[prev_layer_name] = loss[
                next_layer_name] = 'sparse_categorical_crossentropy'
            metrics[prev_layer_name] = metrics[next_layer_name] = 'accuracy'
        self.train_model.compile(Adam(clipnorm=5.), loss=loss, metrics=metrics)

        self.main_model = Model(inputs=self.train_model.inputs,
                                outputs=self.train_model.outputs[0])

    def load_main_model(self, config: BuildModelConfig,
                        main_model_config_path: str,
                        main_model_weights_path: str) -> None:
        with open(main_model_config_path, "r", encoding='utf-8') as f:
            custom_objects = {'ReversedLSTM': ReversedLSTM}
            self.main_model = model_from_yaml(f.read(),
                                              custom_objects=custom_objects)
        self.main_model.load_weights(main_model_weights_path)
        self.main_model._make_predict_function()

    def build(self, config: BuildModelConfig, word_embeddings=None):
        """
        Assemble the LSTM model.
        """
        inputs = []
        embeddings = []

        if config.use_word_embeddings and word_embeddings is not None:
            words = Input(shape=(None, ), name='words')
            word_dictionary_size = word_embeddings.shape[0]
            word_embeddings_dim = word_embeddings.shape[1]
            words_embedding = Embedding(word_dictionary_size,
                                        word_embeddings_dim,
                                        name='word_embeddings')(words)
            inputs.append(words)
            embeddings.append(words_embedding)

        if config.use_gram:
            grammemes_input = Input(
                shape=(None, self.grammeme_vectorizer_input.grammemes_count()),
                name='grammemes')
            grammemes_embedding = Dropout(config.gram_dropout)(grammemes_input)
            grammemes_embedding = Dense(config.gram_hidden_size,
                                        activation='relu')(grammemes_embedding)
            inputs.append(grammemes_input)
            embeddings.append(grammemes_embedding)

        if config.use_chars:
            chars_input = Input(shape=(None, config.char_max_word_length),
                                name='chars')

            char_layer = build_dense_chars_layer(
                max_word_length=config.char_max_word_length,
                char_vocab_size=len(self.char_set) + 1,
                char_emb_dim=config.char_embedding_dim,
                hidden_dim=config.char_function_hidden_size,
                output_dim=config.char_function_output_size,
                dropout=config.char_dropout)
            if config.use_trained_char_embeddings:
                char_layer = get_char_model(
                    char_layer=char_layer,
                    max_word_length=config.char_max_word_length,
                    embeddings=word_embeddings,
                    model_config_path=config.char_model_config_path,
                    model_weights_path=config.char_model_weights_path,
                    dictionary=self.word_dictionary,
                    char_set=self.char_set)
            chars_embedding = char_layer(chars_input)
            inputs.append(chars_input)
            embeddings.append(chars_embedding)

        if len(embeddings) > 1:
            layer = concatenate(embeddings, name="LSTM_input")
        else:
            layer = embeddings[0]

        lstm_input = Dense(config.rnn_input_size, activation='relu')(layer)
        lstm_forward_1 = LSTM(config.rnn_hidden_size,
                              dropout=config.rnn_dropout,
                              recurrent_dropout=config.rnn_dropout,
                              return_sequences=True,
                              name='LSTM_1_forward')(lstm_input)

        lstm_backward_1 = ReversedLSTM(config.rnn_hidden_size,
                                       dropout=config.rnn_dropout,
                                       recurrent_dropout=config.rnn_dropout,
                                       return_sequences=True,
                                       name='LSTM_1_backward')(lstm_input)
        layer = concatenate([lstm_forward_1, lstm_backward_1],
                            name="BiLSTM_input")

        for i in range(config.rnn_n_layers - 1):
            layer = Bidirectional(
                LSTM(config.rnn_hidden_size,
                     dropout=config.rnn_dropout,
                     recurrent_dropout=config.rnn_dropout,
                     return_sequences=True,
                     name='LSTM_' + str(i)))(layer)

        layer = TimeDistributed(Dense(config.dense_size))(layer)
        layer = TimeDistributed(Dropout(config.dense_dropout))(layer)
        layer = TimeDistributed(BatchNormalization())(layer)
        layer = TimeDistributed(Activation('relu'))(layer)

        outputs = []
        loss = {}
        metrics = {}
        num_of_classes = self.grammeme_vectorizer_output.size() + 1

        out_layer_name = 'main_pred'
        outputs.append(
            Dense(num_of_classes, activation='softmax',
                  name=out_layer_name)(layer))
        loss[out_layer_name] = 'sparse_categorical_crossentropy'
        metrics[out_layer_name] = 'accuracy'

        if config.use_pos_lm:
            prev_layer_name = 'shifted_pred_prev'
            next_layer_name = 'shifted_pred_next'
            prev_layer = Dense(num_of_classes,
                               activation='softmax',
                               name=prev_layer_name)
            next_layer = Dense(num_of_classes,
                               activation='softmax',
                               name=next_layer_name)
            outputs.append(
                prev_layer(
                    Dense(config.dense_size,
                          activation='relu')(lstm_backward_1)))
            outputs.append(
                next_layer(
                    Dense(config.dense_size,
                          activation='relu')(lstm_forward_1)))
            loss[prev_layer_name] = loss[
                next_layer_name] = 'sparse_categorical_crossentropy'
            metrics[prev_layer_name] = metrics[next_layer_name] = 'accuracy'

        if config.use_word_lm:
            out_layer_name = 'out_embedding'
            out_embedding = Dense(word_embeddings.shape[0],
                                  weights=[
                                      word_embeddings.T,
                                      np.zeros(word_embeddings.shape[0])
                                  ],
                                  activation='softmax',
                                  name=out_layer_name,
                                  trainable=False)
            outputs.append(
                out_embedding(
                    Dense(word_embeddings.shape[1],
                          activation='relu')(lstm_backward_1)))
            outputs.append(
                out_embedding(
                    Dense(word_embeddings.shape[1],
                          activation='relu')(lstm_forward_1)))
            loss[out_layer_name] = 'sparse_categorical_crossentropy'
            metrics[out_layer_name] = 'accuracy'

        self.train_model = Model(inputs=inputs, outputs=outputs)
        self.train_model.compile(Adam(clipnorm=5.), loss=loss, metrics=metrics)
        self.main_model = Model(inputs=inputs, outputs=outputs[0])
        self.train_model.summary()
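        # Resulting topology (a sketch; sizes are set by BuildModelConfig):
        # the enabled input branches (word embeddings, grammeme vectors, char
        # features) are embedded and concatenated, projected to rnn_input_size,
        # fed through a forward LSTM plus a ReversedLSTM (the first BiLSTM
        # layer), then through rnn_n_layers - 1 Bidirectional LSTM layers, and
        # finally through a TimeDistributed Dense/Dropout/BatchNorm/ReLU stack
        # before the softmax prediction heads.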

    def train(self, file_names: List[str], train_config: TrainConfig,
              build_config: BuildModelConfig) -> None:
        np.random.seed(train_config.random_seed)
        sample_counter = self.count_sentences(file_names)
        train_idx, test_idx = self.split_data_set(sample_counter,
                                                  train_config.test_part)
        for big_epoch in range(train_config.epochs_num):
            print('------------Big Epoch {}------------'.format(big_epoch))
            training_set_generator = TrainingSetGenerator(
                file_names=file_names,
                config=train_config,
                grammeme_vectorizer_input=self.grammeme_vectorizer_input,
                grammeme_vectorizer_output=self.grammeme_vectorizer_output,
                build_config=build_config,
                indices=train_idx,
                word_dictionary=self.word_dictionary,
                char_set=self.char_set)
            for epoch, (inputs, target) in enumerate(training_set_generator):
                self.train_model.fit(inputs,
                                     target,
                                     batch_size=train_config.batch_size,
                                     epochs=1,
                                     verbose=2)
                if epoch != 0 and epoch % train_config.dump_model_freq == 0:
                    self.save_model(train_config.train_model_config_path,
                                    train_config.train_model_weights_path,
                                    train_config.main_model_config_path,
                                    train_config.main_model_weights_path)
            self.evaluate(file_names=file_names,
                          test_idx=test_idx,
                          train_config=train_config,
                          build_config=build_config)

    @staticmethod
    def count_sentences(file_names: List[str]):
        """
        Count the number of sentences (blank-line separated) in the data.
        """
        sample_counter = 0
        for filename in file_names:
            with open(filename, "r", encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if len(line) == 0:
                        sample_counter += 1
        return sample_counter

    @staticmethod
    def split_data_set(sentences_counter: int,
                       test_part: float) -> Tuple[np.array, np.array]:
        """
        Split the data set into train and test parts.

        :param sentences_counter: number of sentences.
        :param test_part: fraction of the data used as the test set.
        """
        perm = np.random.permutation(sentences_counter)
        border = int(sentences_counter * (1 - test_part))
        train_idx = perm[:border]
        test_idx = perm[border:]
        return train_idx, test_idx

    def evaluate(self, file_names, test_idx, train_config: TrainConfig,
                 build_config: BuildModelConfig) -> None:
        """
        Evaluate tagging accuracy on the test set.
        """
        word_count = 0
        word_errors = 0
        sentence_count = 0
        sentence_errors = 0
        training_set_generator = TrainingSetGenerator(
            file_names=file_names,
            config=train_config,
            grammeme_vectorizer_input=self.grammeme_vectorizer_input,
            grammeme_vectorizer_output=self.grammeme_vectorizer_output,
            build_config=build_config,
            indices=test_idx,
            word_dictionary=self.word_dictionary,
            char_set=self.char_set)
        for epoch, (inputs, target) in enumerate(training_set_generator):
            predicted_y = self.main_model.predict(
                inputs, batch_size=train_config.batch_size, verbose=0)
            for i, sentence in enumerate(target[0]):
                sentence_has_errors = False
                # Padding positions (zero labels) precede the real tags.
                count_zero = sum(1 for num in sentence if num == [0])
                real_sentence_tags = sentence[count_zero:]
                answer = []
                for grammeme_probs in predicted_y[i][count_zero:]:
                    num = np.argmax(grammeme_probs)
                    answer.append(num)
                for tag, predicted_tag in zip(real_sentence_tags, answer):
                    tag = tag[0]
                    pos = self.grammeme_vectorizer_output.get_name_by_index(
                        tag).split("#")[0]
                    predicted_pos = self.grammeme_vectorizer_output.get_name_by_index(
                        predicted_tag).split("#")[0]
                    word_count += 1
                    if pos != predicted_pos:
                        word_errors += 1
                        sentence_has_errors = True
                sentence_count += 1
                if sentence_has_errors:
                    sentence_errors += 1

        print("Word accuracy: ", 1.0 - float(word_errors) / word_count)
        print("Sentence accuracy: ",
              1.0 - float(sentence_errors) / sentence_count)

    def predict_gram_analysis(
            self, sentences: List[List[str]], batch_size: int,
            build_config: BuildModelConfig) -> List[List[List[float]]]:
        """
        Predict the full grammatical analysis (grammemes and parts of
        speech) for the given sentences, with probabilities.

        :param sentences: list of sentences, each a list of words.
        :param batch_size: number of sentences per batch.
        :param build_config: model architecture config.
        :return: probabilities over grammeme sets.
        """
        max_sentence_len = max([len(sentence) for sentence in sentences])
        if max_sentence_len == 0:
            return [[] for _ in sentences]
        n_samples = len(sentences)

        words = np.zeros((n_samples, max_sentence_len), dtype=int)
        grammemes = np.zeros(
            (n_samples, max_sentence_len,
             self.grammeme_vectorizer_input.grammemes_count()),
            dtype=float)
        chars = np.zeros(
            (n_samples, max_sentence_len, build_config.char_max_word_length),
            dtype=int)

        for i, sentence in enumerate(sentences):
            if not sentence:
                continue
            word_indices, gram_vectors, char_vectors = TrainingSetGenerator.getFeaturesForSentence(
                sentence,
                converter=self.converter,
                morph=self.morph,
                grammeme_vectorizer=self.grammeme_vectorizer_input,
                max_word_len=build_config.char_max_word_length,
                word_dictionary=self.word_dictionary,
                word_count=build_config.word_max_count,
                char_set=self.char_set)
            words[i, -len(sentence):] = word_indices
            grammemes[i, -len(sentence):] = gram_vectors
            chars[i, -len(sentence):] = char_vectors

        inputs = []
        if build_config.use_word_embeddings:
            inputs.append(words)
        if build_config.use_gram:
            inputs.append(grammemes)
        if build_config.use_chars:
            inputs.append(chars)
        return self.main_model.predict(inputs, batch_size=batch_size)
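A hedged end-to-end inference sketch, assuming model is an LSTMModel that has been prepared and had its main model loaded, and build_config is the matching BuildModelConfig; decoding the argmax reuses grammeme_vectorizer_output just as evaluate does above. Note that padding sits at the front, so the last len(sentence) timesteps belong to the words:

sentences = [["Мама", "мыла", "раму"]]
probs = model.predict_gram_analysis(sentences, batch_size=1,
                                    build_config=build_config)
for word, word_probs in zip(sentences[0], probs[0][-len(sentences[0]):]):
    tag_index = int(np.argmax(word_probs))
    print(word, model.grammeme_vectorizer_output.get_name_by_index(tag_index))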