Code example #1
class DataPrepare:
    """
    Class for creating x train and test data.
    """
    def __init__(self):

        self.config = Config(model_type='bilstm')
        self.sent_max_len = self.config.get('Sent_max_length')
        self.corpora_limit = self.config.get('Corpora_sent_limit')

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')

        char_emb_feature = load_bin_data(
            self.data_path + '/' + self.config.get('Corpora') +
            '/bilstm/char_emb_rnn_feature_data.pkl')
        self.word2ind = char_emb_feature['word2index']

        self.preparator(data_name='train')
        self.preparator(data_name='test')

    def preparator(self, data_name='test'):
        """
        Prepare data: encoding, padding.
        """

        sent = load_data(self.data_path + '/' +
                         self.config.get('Corpora') + '/%s' %
                         (data_name, ))
        # 'Corpora_sent_limit' comes from the config as a string: 'False'
        # means no cap, otherwise it is an integer sentence limit.
        if self.corpora_limit != 'False':
            sent = sent[:self.corpora_limit]
        x_data = seq_form(sent, data_type='x')
        X_data = self.data_prepare(x_data, data_name)
        del x_data, sent
        print('X_%s' % (data_name, ), X_data.dtype)
        save_binary(
            X_data, self.data_path + '/%s' % (self.config.get('Corpora'), ) +
            '/bilstm/x_%s.pkl' % (data_name, ))

    def data_prepare(self, x_set, name):
        """
        Подготовка данных.
        :param x:
        :param y:
        :return:
        """

        x_enc = [[self.word2ind[c] for c in x] for x in x_set]
        x_train = pad_sequences(x_enc, maxlen=self.sent_max_len)
        print('\nTraining tensor shapes:')
        print('x_%s_forward: %s;' % (
            name,
            x_train.shape,
        ))
        return x_train
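
A minimal driver sketch (an assumption, not part of the source): all of the work happens in __init__, so a single construction call encodes, pads, and pickles both splits, provided the project helpers (Config, load_data, load_bin_data, seq_form, save_binary, pad_sequences) and the data/<Corpora>/bilstm/ layout are in place.

# Hypothetical usage; everything runs on construction.
if __name__ == '__main__':
    DataPrepare()  # writes data/<Corpora>/bilstm/x_train.pkl and x_test.pkl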
Code example #2
class Stat:
    def __init__(self):
        self.config = Config(model_type='bilstm')
        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')

        data = [
            self.data_path + '/' + self.config.get('Corpora') + '/test',
            self.data_path + '/' + self.config.get('Corpora') + '/train'
        ]

        for data_file in data:
            self.stat_pipeline(data_file)

    def stat_pipeline(self, file):
        sent = load_data(file)
        sent_len = length_count(sent)
        sent_count = len(sent_len)
        sent_freq_length_stat(sorted(sent_len))

        x_set = seq_form(sent)
        unique_tokens = unique_elements(x_set)
        unique_symbols = unique_chars(x_set)
        max_token_length = max([len(token) for token in unique_tokens])

        y_pos_set = seq_form(sent, data_type='y', task_type='POS')
        unique_pos_labels = unique_elements(y_pos_set)

        y_all_set = seq_form(sent, data_type='y', task_type='All')
        unique_all_labels = unique_elements(y_all_set)

        print('unique_tokens:', len(unique_tokens))
        print('unique_symbols:', len(unique_symbols))
        print('max_token_length:', max_token_length)
        print('tokens count:',
              sum(len(sent) for sent in x_set))
        print('unique_pos_labels:', len(unique_pos_labels))
        print('unique_all_labels:', len(unique_all_labels))
        print('sent_count:', sent_count)
        print(get_unique_grammatical_category(sent))
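
A hedged usage sketch: constructing Stat runs stat_pipeline over the test and train splits and prints the corpus statistics, assuming the helper functions (load_data, length_count, seq_form, unique_elements, unique_chars, sent_freq_length_stat, get_unique_grammatical_category) resolve from the project.

# Hypothetical usage; the constructor triggers the whole pipeline.
if __name__ == '__main__':
    Stat()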
Code example #3
class CNNProbEmbeddings:
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 dev=True,
                 verbose=1,
                 prob_cnn_emb_layer_name="dense_3"):

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.prob_cnn_emb_layer_name = self.config.get(
                'Network_options').get('prob_cnn_emb_layer_name')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.prob_cnn_emb_layer_name = prob_cnn_emb_layer_name

        print('CNNProbEmbeddings for 2 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', self.class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_1/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        char_emb_feature = self.load_binary_data(
            self.data_path + '/char_emb_cnn1_feature_data_%s.pkl' %
            (self.task_type, ))

        self.ind2symbol = char_emb_feature['ind2symbol']
        self.max_token_length = char_emb_feature['max_token_length']
        self.x_train = self.load_binary_data(self.data_path +
                                             '/x_train_cnn1level.pkl')
        self.x_test = self.load_binary_data(self.data_path +
                                            '/x_test_cnn1level.pkl')
        if dev:
            self.x_dev = self.load_binary_data(self.data_path +
                                               '/x_dev_cnn1level.pkl')

        if verbose == 1:
            print("Loading char_emb_cnn1_feature_data_%s ..." %
                  (self.task_type, ))
            print('x_train shape:', self.x_train.shape)
            print('x_test shape:', self.x_test.shape)
            if dev:
                print('x_dev shape:', self.x_dev.shape)

        ################################################################################################################

        str2vector = {}
        for el_ in self.x_train:
            str2vector[''.join([self.ind2symbol[s] for s in el_
                                if s != 0])] = el_
        for _el in self.x_test:
            str2vector[''.join([self.ind2symbol[s] for s in _el
                                if s != 0])] = _el
        if dev:
            for el in self.x_dev:
                str2vector[''.join([self.ind2symbol[s] for s in el
                                    if s != 0])] = el
        str2vector = OrderedDict(str2vector)
        if verbose == 1:
            print("Unique_tokens:", len(str2vector))

        ################################################################################################################

        null_word = [0] * self.max_token_length
        null_vector = np.array(null_word)

        str2vector.update({'_null_': null_vector})
        str2vector.move_to_end('_null_', last=False)
        str2vector = list(str2vector.items())
        if verbose == 1:
            print("Checking null word:", str2vector[0])

        ################################################################################################################

        non_tuned_embeddings = np.array([el[1] for el in str2vector])
        if verbose == 1:
            print("Non tune embeddings:", non_tuned_embeddings.shape)

        ################################################################################################################

        if verbose == 1:
            print('Loading cnn_1level_model_%s_%s.pkl' % (
                self.corpora,
                self.task_type,
            ))
        self.model = load_model(self.model_path +
                                '/cnn_1level_model_%s_%s.pkl' % (
                                    self.corpora,
                                    self.task_type,
                                ))

        activation_values = self.get_prob_from_layer(
            layer_name=self.prob_cnn_emb_layer_name, data=non_tuned_embeddings)
        if verbose == 1:
            print('activity_values_train shape:', activation_values.shape)

        ################################################################################################################

        if self.task_type == "All":
            model_pos = load_model(self.model_path +
                                   '/cnn_1level_model_%s_%s.pkl' % (
                                       self.corpora,
                                       "POS",
                                   ))
            pr = model_pos.predict(non_tuned_embeddings, verbose=1)
            activation_values = np.concatenate((activation_values, pr), axis=1)
            if verbose == 1:
                print("Predictons shape:", pr.shape)
                print("Predictons shape + activations:",
                      activation_values.shape)
                # TODO проверить предсказание по _null_

        ################################################################################################################

        result_train = OrderedDict(
            list(zip([el[0] for el in str2vector], activation_values)))
        if verbose == 1:
            print("Checking null word:", len(result_train['_null_']))
        self.save_binary(
            result_train,
            '_%s_%s' % (self.prob_cnn_emb_layer_name, self.task_type))

        del (result_train, activation_values, self.model, str2vector,
             null_vector, null_word, char_emb_feature, self.ind2symbol,
             self.max_token_length, self.x_train, self.x_test,
             non_tuned_embeddings)
        if dev:
            del self.x_dev

    def load_binary_data(self, path_to_data):
        """
        Load data
        :return:
        """

        with open(path_to_data, 'rb') as f:
            data = pickle.load(f)
        return data

    def get_prob_from_layer(self, layer_name=None, data=None):
        """
        The output of an intermediate layer.
        :param layer_name: 
        :return: 
        """

        intermediate_layer_model = Model(
            inputs=self.model.input,
            outputs=self.model.get_layer(layer_name).output)
        intermediate_output = intermediate_layer_model.predict(data)
        return intermediate_output

    def load_data(self, path_to_data):
        """
        Data loader.
        :param path_to_data:
        :return:
        """

        # print('Loading:', path_to_data, '\n')
        with open(path_to_data, 'r') as f:
            raw = f.readlines()
        all_x = []
        point = []
        for line in raw:
            stripped_line = line.strip().split('\t')
            point.append(stripped_line)
            if line == '\n':
                all_x.append(point[:-1])
                point = []
        all_x = all_x[:-1]
        return all_x

    def save_binary(self, data, file_name):
        """
        Save data in binary (pickle) format.
        :param data:
        :param file_name:
        :return:
        """

        with open(self.data_path + '/cnn_prob_emb%s.pkl' % (file_name, ),
                  'wb') as file:
            pickle.dump(data, file)
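
The heart of CNNProbEmbeddings is get_prob_from_layer: the standard Keras pattern of wrapping a trained model in a second Model whose output is an intermediate layer. A self-contained sketch of just that pattern follows; the toy network and layer names are illustrative, not the project's level-1 CNN.

import numpy as np
from keras.layers import Dense
from keras.models import Model, Sequential

# Toy stand-in for a trained level-1 network.
toy = Sequential()
toy.add(Dense(8, activation='relu', input_dim=4, name='dense_3'))
toy.add(Dense(3, activation='softmax', name='dense_5'))
toy.compile(optimizer='adamax', loss='mean_squared_error')

# Same construction as get_prob_from_layer: reuse the trained weights,
# but cut the graph off at the named intermediate layer.
probe = Model(inputs=toy.input, outputs=toy.get_layer('dense_3').output)
activations = probe.predict(np.random.rand(5, 4))
print(activations.shape)  # (5, 8): one 8-dim "probability embedding" per input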
Code example #4
class CharEmb:
    def __init__(self, model_type='bilstm'):
        """
        Stata per corpora:
            gicrya: self.sent_max_len = 55  # optimal:55; max: 110;
            gicrya: self.max_token_length = 35;

        network_type: cnn or lstm;
            
        """

        self.config = Config(model_type=model_type)
        self.sent_max_len = self.config.get('Sent_max_length')

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')
        sent = load_data(self.data_path + '/' + self.config.get('Corpora') + '/test') + \
               load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')

        x_set = seq_form(sent, data_type='x')
        unique_tokens = unique_elements(x_set)
        self.unique_symbols = unique_chars(x_set)
        self.max_token_length = max([len(token) for token in unique_tokens])
        self.word2ind, self.ind2word = self.token_encode(unique_tokens)

        char_embeddings = self.char_matrix(unique_tokens)

        print('vocabulary:', len(unique_tokens))
        print('unique_symbols:', len(self.unique_symbols))
        print('Maximum sequence length:', self.sent_max_len)
        print('Maximum token length:', self.max_token_length)
        print('char embeddings:', char_embeddings.shape)

        self.save_emb(('unique_symbols', self.unique_symbols),
                      ('unique_tokens', unique_tokens),
                      ('word2index', self.word2ind),
                      ('max_sent_length', self.sent_max_len),
                      ('char_matrix', char_embeddings))

    def save_emb(self,
                 unique_symbols=None,
                 unique_tokens=None,
                 word2index=None,
                 max_sent_length=None,
                 char_matrix=None):
        """
        Save data.
        """

        emb_feature_data = dict()
        emb_feature_data[unique_symbols[0]] = unique_symbols[1]
        emb_feature_data[unique_tokens[0]] = unique_tokens[1]
        emb_feature_data[word2index[0]] = word2index[1]
        emb_feature_data[max_sent_length[0]] = max_sent_length[1]
        emb_feature_data[char_matrix[0]] = char_matrix[1]
        save_binary(
            emb_feature_data,
            self.data_path + '/%s' % (self.config.get('Corpora'), ) +
            '/bilstm/char_emb_rnn_feature_data.pkl')

    def token_encode(self, uniq_tokens):
        """
        Create vocabulary.
        :param x_data:
        :return: {'heeft': 0, 'leveranciers': 4112, 'SGR': 1, 'revolutie': 4113, ...}
        """

        return {word: index+1 for index, word in enumerate(uniq_tokens)}, \
               {index+1: word for index, word in enumerate(uniq_tokens)}

    def char_matrix(self, unique_tokens):
        """
        Creating matrix with char embedding.
        :param data:
        :return:
        """

        embed_vocab = list()
        base_vector = numpy.zeros(
            len(self.unique_symbols) * self.max_token_length)
        embed_vocab.append(base_vector)
        for tokens in unique_tokens:
            features_per_token = numpy.array([], dtype='int8')
            for index_chars in range(0, self.max_token_length):
                array_char = numpy.zeros((len(self.unique_symbols), ))
                try:
                    array_char[self.unique_symbols.index(
                        tokens[index_chars])] = 1
                except IndexError:
                    # token is shorter than max_token_length:
                    # leave the zero vector for this character slot
                    pass
                features_per_token = numpy.append(features_per_token,
                                                  array_char)
            embed_vocab.append(features_per_token)
        return numpy.array(embed_vocab).astype('int8')
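
A toy illustration (hypothetical symbols and lengths) of the layout char_matrix produces: each token becomes the concatenation of max_token_length one-hot character vectors, and positions past the end of the token stay zero, which is exactly what the IndexError guard above achieves.

import numpy

unique_symbols = ['a', 'b', 'c']
max_token_length = 4

def encode(token):
    vec = numpy.array([], dtype='int8')
    for i in range(max_token_length):
        one_hot = numpy.zeros((len(unique_symbols), ))
        if i < len(token):  # same effect as catching IndexError
            one_hot[unique_symbols.index(token[i])] = 1
        vec = numpy.append(vec, one_hot)
    return vec

print(encode('ab'))        # [1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
print(encode('ab').shape)  # (12,) == len(unique_symbols) * max_token_length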
Code example #5
    return x


label_index = {'UPOS': 3, 'XPOS': 4}

# ----------------------------------------------------------------------------------------------------------------------

config_models = Config(model_type='models')
config_language_id = Config(model_type='tracks')

model_dir = '../tagger_models/'
test_files_udipipe_dir = '../data/conll2017_x/ud-test-v2.0-conll2017/input/conll17-ud-test-2017-05-09/'
data_path = os.path.abspath(
    os.path.split(os.path.abspath(__file__))[0] + '/../data/')

for corpora_name in config_models.get("models"):

    if corpora_name == 'UD_Russian':

        for tag_types in config_models.get("models")[corpora_name]:

            model_cfg = config_models.get("models")[corpora_name][tag_types]
            best_restart = model_cfg.get('Best restart #')
            model_level = model_cfg.get('Best model level #')

            if model_level not in ('None', 'INPROGRESS'):

                model = None
                x_test_udipipe_tokenize_data = None
                tokens_for_prediction = None
Code example #6
class DataCNN2Level:
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 dev=False,
                 verbose=1,
                 prob_cnn_emb_layer_name="dense_3"):
        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.prob_cnn_emb_layer_name = self.config.get(
                'Network_options').get('prob_cnn_emb_layer_name')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.prob_cnn_emb_layer_name = prob_cnn_emb_layer_name

        print('\nData preparation for 2 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', self.class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')
        self.tuned_vectors_path = os.path.abspath(
            file_path + '/../data/%s/cnn/model_level_1/' % (self.corpora, ))

        tokens_tune_vectors = load_bin_data(
            self.tuned_vectors_path + '/cnn_prob_emb%s.pkl' %
            ('_%s_%s' % (self.prob_cnn_emb_layer_name, self.task_type)))

        sent_test = load_data(self.data_path + '/' + self.corpora + '/test')
        sent_train = load_data(self.data_path + '/' + self.corpora + '/train')
        if dev:
            sent_valid = load_data(self.data_path + '/' + self.corpora +
                                   '/dev')
        else:
            sent_valid = []

        test_tokens_data_seq = seq_form(sent_test)
        train_tokens_data_seq = seq_form(sent_train)
        if dev:
            dev_tokens_data_seq = seq_form(sent_valid)
        else:
            dev_tokens_data_seq = []

        test_labels_data_seq = seq_form(sent_test,
                                        data_type='y',
                                        task_type=self.task_type,
                                        task_index=class_index)
        train_labels_data_seq = seq_form(sent_train,
                                         data_type='y',
                                         task_type=self.task_type,
                                         task_index=class_index)
        if dev:
            dev_labels_data_seq = seq_form(sent_valid,
                                           data_type='y',
                                           task_type=self.task_type,
                                           task_index=class_index)
        else:
            dev_labels_data_seq = []

        self.MAX_SENT_LENGTH = max([len(s) for s in test_tokens_data_seq] +
                                   [len(s) for s in train_tokens_data_seq] +
                                   [len(s) for s in dev_tokens_data_seq])
        if verbose == 1:
            print('Max sent length:', self.MAX_SENT_LENGTH)

        test_labels_data = [
            labels for sent in test_labels_data_seq for labels in sent
        ]
        train_labels_data = [
            labels for sent in train_labels_data_seq for labels in sent
        ]
        if dev:
            dev_labels_data = [
                labels for sent in dev_labels_data_seq for labels in sent
            ]
        else:
            dev_labels_data = []

        # Now we can encode the y test and train data.
        self.ADDING_INDEX = 1
        self.PADDING_VALUE = 0

        UNIQUE_LABELS = sorted(
            set(test_labels_data + train_labels_data + dev_labels_data))
        self.label2ind_with_adding, self.ind2label_with_adding = self.labels_encode_cnn2(
            UNIQUE_LABELS)
        self.max_label_number = max(self.label2ind_with_adding.values())
        if verbose == 1:
            print('max_label_number:', self.max_label_number)

        y_train = self.label_data_prepare(train_labels_data_seq,
                                          verbose=verbose)
        y_test = self.label_data_prepare(test_labels_data_seq, verbose=verbose)
        if dev:
            y_dev = self.label_data_prepare(dev_labels_data_seq,
                                            verbose=verbose)
        else:
            y_dev = []

        save_binary(
            y_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_test_cnn2level_%s' % (self.task_type, ))
        save_binary(
            y_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_train_cnn2level_%s' % (self.task_type, ))
        if dev:
            save_binary(
                y_dev, self.data_path + '/%s/' % (self.corpora, ) +
                'cnn/model_level_2/y_dev_cnn2level_%s' % (self.task_type, ))

        save_binary(
            self.label2ind_with_adding,
            self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/y_label2ind_cnn2level_%s' % (self.task_type, ))
        del (y_train, y_test, y_dev, self.label2ind_with_adding,
             self.ind2label_with_adding, UNIQUE_LABELS)

        # Now we can encode the x test and train data.
        unique_tokens = sorted(tokens_tune_vectors)
        self.word2ind_with_adding = {
            token: (index + 2)
            for index, token in enumerate(unique_tokens)
        }
        if verbose == 1:
            print("\nUnique tokens:", len(unique_tokens))

        x_test = self.data_prepare(test_tokens_data_seq,
                                   name="test",
                                   verbose=verbose)
        x_train = self.data_prepare(train_tokens_data_seq,
                                    name="train",
                                    verbose=verbose)
        if dev:
            x_dev = self.data_prepare(dev_tokens_data_seq,
                                      name="dev",
                                      verbose=verbose)
        else:
            x_dev = []

        tune_char_emb_matrix = self.matrix_creating(unique_tokens,
                                                    tokens_tune_vectors)
        if verbose == 1:
            print("Tune embedding matrix:", tune_char_emb_matrix.shape)

        save_binary(
            x_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/x_test_cnn2level_%s.pkl' % (self.task_type, ))
        save_binary(
            x_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/x_train_cnn2level_%s.pkl' % (self.task_type, ))
        if dev:
            save_binary(
                x_dev, self.data_path + '/%s/' % (self.corpora, ) +
                'cnn/model_level_2/x_dev_cnn2level_%s.pkl' %
                (self.task_type, ))

        self.save_emb(('max_label_numbers', self.max_label_number),
                      ('max_sent_length', self.MAX_SENT_LENGTH),
                      ('tune_char_emb_matrix', tune_char_emb_matrix),
                      ('word2ind', self.word2ind_with_adding))

        del (self.max_label_number, self.MAX_SENT_LENGTH, tune_char_emb_matrix,
             self.word2ind_with_adding, x_test, x_train, x_dev, sent_test,
             sent_valid, sent_train, test_tokens_data_seq,
             train_tokens_data_seq, dev_tokens_data_seq, test_labels_data_seq,
             train_labels_data_seq, dev_labels_data_seq, test_labels_data,
             train_labels_data, dev_labels_data, unique_tokens)

    def save_emb(self,
                 max_label_numbers=None,
                 max_sent_length=None,
                 tune_char_emb_matrix=None,
                 word2ind=None):
        """
        Save the feature data.
        """

        emb_feature_data = dict()
        emb_feature_data[max_label_numbers[0]] = max_label_numbers[1]
        emb_feature_data[max_sent_length[0]] = max_sent_length[1]
        emb_feature_data[tune_char_emb_matrix[0]] = tune_char_emb_matrix[1]
        emb_feature_data[word2ind[0]] = word2ind[1]
        save_binary(
            emb_feature_data, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_2/char_emb_cnn2_feature_data_%s.pkl' %
            (self.task_type, ))

    def matrix_creating(self, unique_tokens, tokens_tune_vectors):
        emb_vocab = list()
        zero_vector = np.zeros(len(tokens_tune_vectors['_null_']))
        emb_vocab.append(zero_vector)
        emb_vocab.append(tokens_tune_vectors['_null_'])
        for tokens in unique_tokens:
            emb_vocab.append(tokens_tune_vectors[tokens])
        return np.array(emb_vocab)

    def data_prepare(self, x_set, name, verbose=1):
        """
        Подготовка данных.
        :param x:
        :param y:
        :return:
        """

        x_enc = [[self.word2ind_with_adding[c] for c in x] for x in x_set]
        x = pad_sequences(x_enc, maxlen=self.MAX_SENT_LENGTH, value=1)
        if verbose == 1:
            print('x_shape: %s;' % (name, ), x.shape)
            print('sequence example:', x[0])
        return x

    def labels_encode_cnn2(self, unique_elements):
        """
        Encoding labels by numbers. 1 for null;
        Short sentences are extended from the beginning with “null words” consisting of “null” label characters.
        Such “null words” belong to special null class.
        :param unique_labels:
        :return:
        """

        return {label: (index + 2) for index, label in enumerate(unique_elements)}, \
               {(index + 2): label for index, label in enumerate(unique_elements)}

    def label_data_prepare(self, y_set, verbose=1):
        """
        Creating one-hot vector for encoding labels.
        :param y:
        :return:
        """

        y_set = [[self.label2ind_with_adding[t] for t in s] for s in y_set]
        y_set = pad_sequences(y_set, maxlen=self.MAX_SENT_LENGTH, value=1)
        if verbose == 1:
            print('y_shape:', y_set.shape, y_set[0])
        return y_set
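
The index offsets above are easy to get wrong, so here is a toy run of the same encoding scheme, assuming Keras' pad_sequences: index 0 stays reserved (the zero row of the embedding matrix), 1 marks the null padding word/label, and real labels start at 2.

from keras.preprocessing.sequence import pad_sequences

unique_labels = ['ADJ', 'NOUN', 'VERB']
label2ind = {label: (index + 2) for index, label in enumerate(unique_labels)}
print(label2ind)  # {'ADJ': 2, 'NOUN': 3, 'VERB': 4}

sentences = [['NOUN', 'VERB'], ['ADJ', 'NOUN', 'VERB']]
encoded = [[label2ind[t] for t in s] for s in sentences]
# Short sentences are left-padded with the null class (value=1).
print(pad_sequences(encoded, maxlen=4, value=1))
# [[1 1 3 4]
#  [1 2 3 4]]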
Code example #7
class CNN2levelTagger:
    """
    Mean of option "Task_type". Grammem: "Grammem_tag_Animacy", POS, All (all morphology properties.).
    """
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 verbose=1,
                 batch_size=512,
                 epoch=300,
                 dev=False):

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.batch_size_level_2 = self.config.get('Network_options').get(
                'batch_size_level_2')
            self.epoch = self.config.get('Network_options').get(
                'training_epoch')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.batch_size_level_2 = batch_size
            self.epoch = epoch

        print('\nModel train for 2 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', self.class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_2/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.tune_char_emb_matrix = self.load_binary_data(
            self.data_path + '/char_emb_cnn2_feature_data_%s.pkl' %
            (self.task_type, ))
        self.sent_max_len = self.tune_char_emb_matrix['max_sent_length']

        self.x_train = self.load_binary_data(self.data_path +
                                             '/x_train_cnn2level_%s.pkl' %
                                             (self.task_type, ))
        self.y_train, self.out_size = self.load_grammatical_cat(
            verbose=verbose)
        if verbose == 1:
            print('x_train shape:', self.x_train.shape)
            print('y_train shape:', self.y_train.shape)

        if dev:
            self.x_dev = self.load_binary_data(self.data_path +
                                               '/x_dev_cnn2level_%s.pkl' %
                                               (self.task_type, ))
            self.y_dev, _ = self.load_grammatical_cat(y_data_name='dev',
                                                      verbose=verbose)
            if verbose == 1:
                print('x_dev shape:', self.x_dev.shape)
                print('y_dev shape:', self.y_dev.shape)

        self.max_features = len(self.tune_char_emb_matrix['word2ind']) + 2
        self.data_for_emb_layers = {
            'tune_char_emb_matrix':
            self.tune_char_emb_matrix['tune_char_emb_matrix']
        }

        self.num_batches_per_epoch_train = math.ceil(self.x_train.shape[0] /
                                                     self.batch_size_level_2)
        if dev:
            self.num_batches_per_epoch_valid = math.ceil(
                self.x_dev.shape[0] / self.batch_size_level_2)

        if verbose == 1:
            print("num_batches_per_epoch_train:",
                  self.num_batches_per_epoch_train)
            if dev:
                print("num_batches_per_epoch_valid:",
                      self.num_batches_per_epoch_valid)

        self.model = None

    def load_grammatical_cat(self, y_data_name='train', verbose=1):
        """
        Loading y data for each grammatical category.
        """

        labels = None
        labels_2indexes = None

        for files in os.listdir(self.data_path):
            if self.task_type == 'All':
                if 'y_%s_cnn2level_All' % (y_data_name, ) in files:
                    labels = self.load_binary_data(self.data_path + '/' +
                                                   files)
                if 'y_label2ind_cnn2level_All' in files:
                    labels_2indexes = len(
                        self.load_binary_data(self.data_path + '/' + files))
            else:
                if 'y_%s_cnn2level_%s' % (y_data_name,
                                          self.task_type) in files:
                    labels = self.load_binary_data(self.data_path + '/' +
                                                   files)
                if 'y_label2ind_cnn2level_%s' % (self.task_type, ) in files:
                    labels_2indexes = len(
                        self.load_binary_data(self.data_path + '/' + files))
        if verbose == 1:
            print('y_data:', labels.shape)
        return labels, labels_2indexes

    def __to_categorical(self, data):
        return np.array([[to_categorical(t, self.out_size + 2)[0] for t in s]
                         for s in data])

    def data_generator(self, batch_size, x, y, num_batches_per_epoch):
        """
        A generator or an instance of Sequence (keras.utils.Sequence) object in order to avoid duplicate data when
        using multiprocessing. The output of the generator must be either a tuple (inputs, targets)
        :return:
        """

        while True:
            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = (batch_num + 1) * batch_size
                x_batch = x[start_index:end_index]
                y_batch = self.__to_categorical(y[start_index:end_index])
                if x_batch.shape[0] != 0:
                    yield x_batch, y_batch

    def load_binary_data(self, path_to_data):
        """
        Load data
        :return:
        """

        with open(path_to_data, 'rb') as f:
            data = pickle.load(f)
        return data

    def network_initialization(self, verbose=1):
        """
        Network compilation.            
        :return:
        """

        ###################################################Network_level_2##############################################

        seq_input = Input((self.sent_max_len, ))
        seq_emb = Embedding(
            input_dim=self.max_features,
            output_dim=self.data_for_emb_layers.get(
                'tune_char_emb_matrix').shape[1],
            input_length=self.sent_max_len,
            weights=[self.data_for_emb_layers.get('tune_char_emb_matrix')],
            # https://github.com/fchollet/keras/issues/3335
            # https://groups.google.com/forum/#!topic/keras-users/KfoTsCHldM4
            mask_zero=False,
            trainable=True)(seq_input)
        seq_dropout = Dropout(0.5)(seq_emb)

        seq_conv1d_0 = Conv1D(
            filters=256,
            kernel_size=3,
            padding='same',
            activation='relu',
            name='conv1d_7',
        )(seq_dropout)

        seq_conv1d_1 = Conv1D(
            filters=256,
            kernel_size=3,
            padding='same',
            activation='relu',
            name='conv1d_8',
        )(seq_conv1d_0)

        seq_conv1d_2 = Conv1D(
            # number of PoS classes + 2 (zero padding and the null class)
            filters=self.out_size + 2,
            kernel_size=3,
            padding='same',
            activation='softmax',
            name='conv1d_9',
        )(seq_conv1d_1)

        model = Model(inputs=seq_input, outputs=seq_conv1d_2)
        model.compile(optimizer='adamax',
                      loss="mean_squared_error",
                      metrics=["accuracy"])
        self.model = model

        # plot_model(model, to_file=self.model_path + '/cnn_2level_model_schema_%s_%s.png' %
        #                                             (self.config.get('Corpora'),
        #                                              self.task_type,), show_shapes=True)
        if verbose == 1:
            print(model.summary())
            print('Model compilation done.')

    def training(self, verbose=1, dev=True):
        """
        Training.

        model_checkpoint
            For `val_acc`, this should be `max`, for `val_loss` this should  be `min`, etc. In `auto` mode, the
            direction is automatically inferred from the name of the monitored quantity.

        early_stopping
                In `min` mode, training will stop when the quantity monitored has stopped decreasing; in `max` mode it
                will stop when the quantity  monitored has stopped increasing; in `auto` mode, the direction is
                automatically inferred from the name of the monitored quantity.

        :return:
        """

        if self.task_type == "All":
            __save_best_only = False
        else:
            __save_best_only = True

        model_checkpoint = ModelCheckpoint(
            filepath=self.model_path + '/cnn_2level_model_%s_%s.pkl' % (
                self.corpora,
                self.task_type,
            ),
            monitor='val_loss',
            verbose=0,
            save_weights_only=False,
            save_best_only=__save_best_only,
            mode='auto')

        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=10,
                                       verbose=0,
                                       mode='auto')

        if dev:
            self.model.fit_generator(
                generator=self.data_generator(
                    batch_size=self.batch_size_level_2,
                    x=self.x_train,
                    y=self.y_train,
                    num_batches_per_epoch=self.num_batches_per_epoch_train),
                steps_per_epoch=self.num_batches_per_epoch_train,
                epochs=self.epoch,
                validation_data=self.data_generator(
                    batch_size=self.batch_size_level_2,
                    x=self.x_dev,
                    y=self.y_dev,
                    num_batches_per_epoch=self.num_batches_per_epoch_valid),
                validation_steps=self.num_batches_per_epoch_valid,
                verbose=2,
                shuffle=True,
                callbacks=[model_checkpoint, early_stopping],
                workers=1,
                use_multiprocessing=False)

            del (self.x_train, self.y_train, self.out_size, self.x_dev,
                 self.y_dev, self.tune_char_emb_matrix, self.sent_max_len,
                 self.max_features, self.data_for_emb_layers, self.model)

        else:
            self.model.fit(x=self.x_train,
                           y=self.__to_categorical(self.y_train),
                           batch_size=self.batch_size_level_2,
                           epochs=self.epoch,
                           validation_split=0.1,
                           verbose=2,
                           shuffle=True,
                           callbacks=[model_checkpoint, early_stopping])

            del (self.x_train, self.y_train, self.out_size,
                 self.tune_char_emb_matrix, self.sent_max_len,
                 self.max_features, self.data_for_emb_layers, self.model)

    def plot_report(self, model_history):
        """
        Plot of loss function.
        :param model_history:
        :return:
        """

        plt.plot(model_history.history['loss'])
        plt.plot(model_history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig(self.data_path + '/nn_report/' +
                    'cnn_2level_model_loss.jpeg')
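
A hedged driver sketch for the class above (hypothetical, not from the source): the level-2 tensors must already have been produced by DataCNN2Level, and dev splits must exist when dev=True.

if __name__ == '__main__':
    tagger = CNN2levelTagger(dev=True)
    tagger.network_initialization(verbose=1)
    tagger.training(verbose=1, dev=True)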
Code example #8
class ModelCNN2Test:
    """
    Mean of option "Task_type". Grammem: "Grammem_tag_Animacy", POS, All (all morphology properties.).
    """
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 verbose=1,
                 class_index=3):

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.corpora = self.config.get('Corpora')
        else:
            self.task_type = task_type
            self.corpora = corpora

        logging.info('\nModel test for 2 level model.')
        logging.info("Task: {}".format(self.task_type))
        logging.info("Corpora: {}".format(self.corpora))
        logging.info("Label index: {}".format(str(class_index)))

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_2/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.x_test = load_bin_data(self.data_path +
                                    '/x_test_cnn2level_%s.pkl' %
                                    (self.task_type, ))
        if verbose == 1:
            print('x_test shape:', self.x_test.shape)

        self.y_test, self.out_size = load_grammatical_cat_model2(
            self.data_path, self.task_type, verbose=1)
        self.y_test = transform2categorical(self.y_test, self.out_size)

        self.estimation = 0

    def testing(self, verbose=1):
        """
        Model testing.
        The Kappa or Cohen’s kappa is the classification accuracy normalized by the imbalance of the classes in the data.
        :return:
        """

        model = load_model(self.model_path + '/cnn_2level_model_%s_%s.pkl' % (
            self.corpora,
            self.task_type,
        ))

        pr = model.predict(self.x_test, verbose=verbose)
        if verbose == 1:
            print('Model loaded.')
            print(
                '\nTesting acc keras:',
                model.evaluate(self.x_test,
                               self.y_test,
                               batch_size=32,
                               verbose=1,
                               sample_weight=None)[1])
            print('\n', '*' * 100)
        fyh, fpr = preparation_data_to_score_model2(self.y_test, pr)
        logging.info("Testing sklearn acc: {}".format(
            str(accuracy_score(fyh, fpr))))
        logging.info("Testing sklearn f1_score: {}".format(
            str(f1_score(fyh, fpr, average='macro'))))
        logging.info("Testing sklearn cohen_kappa_score: {}\n".format(
            str(cohen_kappa_score(fyh, fpr))))
        self.save_classes(fpr)
        self.estimation = accuracy_score(fyh, fpr)
        del (fyh, fpr, model, self.x_test, self.y_test, pr, self.out_size)

    def save_classes(self, data):
        save_binary(
            data, self.data_path + '/results/cnn2_level_marks_%s.pkl' %
            (self.task_type, ))

    def max_el(self, sent):
        return [np.argmax(el) for el in sent]
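
For reference, a self-contained toy run of the three sklearn metrics logged in testing(); the label values are illustrative only.

from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score

y_true = [2, 2, 3, 4, 4, 4]
y_pred = [2, 2, 3, 4, 4, 3]
print(accuracy_score(y_true, y_pred))             # fraction of exact matches
print(f1_score(y_true, y_pred, average='macro'))  # unweighted mean of per-class F1
print(cohen_kappa_score(y_true, y_pred))          # accuracy corrected for chance agreement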
Code example #9
class CNN1levelTagger:
    """
    Mean of option "Task_type". Grammem: "Grammem_tag_Animacy", POS, All (all morphology properties.).
    """
    def __init__(
        self,
        use_config=True,
        corpora='UD_Russian-SynTagRus',
        task_type='POS',
        class_index=3,
        verbose=1,
        batch_size=512,
        epoch=300,
        dev=False,
    ):

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.class_index = self.config.get(
                'Classification_tasks')['UD2']['POS'][0]
            self.corpora = self.config.get('Corpora')
            self.batch_size_level_1 = self.config.get('Network_options').get(
                'batch_size_level_1')
            self.epoch = self.config.get('Network_options').get(
                'training_epoch')
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index
            self.batch_size_level_1 = batch_size
            self.epoch = epoch

        print('\nModel train for 1 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', self.class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_1/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.char_emb_feature = self.load_binary_data(
            self.data_path + '/char_emb_cnn1_feature_data_%s.pkl' %
            (self.task_type, ))

        self.symbol2ind = self.char_emb_feature['symbol2ind']
        self.max_token_length = self.char_emb_feature['max_token_length']

        self.x_train = self.load_binary_data(self.data_path +
                                             '/x_train_cnn1level.pkl')
        self.y_train, self.out_size = self.load_grammatical_cat(
            verbose=verbose)
        if verbose == 1:
            print('x_train shape:', self.x_train.shape)
            print('y_train shape:', self.y_train.shape)

        if dev:
            self.x_dev = self.load_binary_data(self.data_path +
                                               '/x_dev_cnn1level.pkl')
            self.y_dev, _ = self.load_grammatical_cat(y_data_name='dev',
                                                      verbose=verbose)
            if verbose == 1:
                print('x_dev shape:', self.x_dev.shape)
                print('y_dev shape:', self.y_dev.shape)

        self.max_features = max(self.symbol2ind.values()) + 1
        self.data_for_emb_layers = {
            'char': self.char_emb_feature['char_matrix']
        }

        if verbose == 1:
            print('data embedding char shape:',
                  self.data_for_emb_layers['char'].shape,
                  self.data_for_emb_layers['char'].dtype)

        self.model = None

    def load_grammatical_cat(self, y_data_name='train', verbose=1):
        """
        Loading y data for each grammatical category.
        """

        labels = None
        labels_2indexes = None

        for files in os.listdir(self.data_path):
            if self.task_type == 'All':
                if 'y_%s_cnn1level_All' % (y_data_name, ) in files:
                    labels = self.load_binary_data(self.data_path + '/' +
                                                   files)
                if 'y_label2ind_cnn1level_All' in files:
                    labels_2indexes = len(
                        self.load_binary_data(self.data_path + '/' + files))
            else:
                if 'y_%s_cnn1level_%s' % (y_data_name,
                                          self.task_type) in files:
                    labels = self.load_binary_data(self.data_path + '/' +
                                                   files)
                if 'y_label2ind_cnn1level_%s' % (self.task_type, ) in files:
                    labels_2indexes = len(
                        self.load_binary_data(self.data_path + '/' + files))
        if verbose == 1:
            print('y_data:', labels.shape)
        return labels, labels_2indexes

    def load_binary_data(self, path_to_data):
        """
        Load data
        :return:
        """

        with open(path_to_data, 'rb') as f:
            data = pickle.load(f)
        return data

    def network_initialization(self, verbose=1):
        """
        Network compilation.            
        :return:
        """

        ###################################################Network_level_1##############################################

        model = Sequential()

        model.add(
            Embedding(input_dim=self.max_features,
                      output_dim=self.data_for_emb_layers.get('char').shape[1],
                      input_length=self.max_token_length,
                      weights=[self.data_for_emb_layers.get('char')],
                      mask_zero=False,
                      trainable=False))

        model.add(
            Conv1D(
                filters=1024,
                kernel_size=5,
                padding='valid',
                activation='relu',
                strides=1,
                name='conv1d',
            ))
        model.add(GlobalMaxPooling1D(name='global_max_pooling1d'))

        model.add(BatchNormalization())

        model.add(Dense(256, activation='relu', name='dense_3'))

        model.add(Dense(256, activation='relu', name='dense_4'))

        model.add(BatchNormalization())

        model.add(Dense(self.out_size, activation='softmax', name='dense_5'))

        model.compile(optimizer='adamax',
                      loss="mean_squared_error",
                      metrics=["accuracy"])

        self.model = model

        if verbose == 1:
            plot_model(model,
                       to_file=self.model_path +
                       '/cnn_1level_model_schema_%s_%s.png' % (
                           self.corpora,
                           self.task_type,
                       ),
                       show_shapes=True)
            print(model.summary())

    def training(self, verbose=1, dev=False):
        """
        Training.

        model_checkpoint
            For `val_acc`, this should be `max`, for `val_loss` this should  be `min`, etc. In `auto` mode, the
            direction is automatically inferred from the name of the monitored quantity.

        early_stopping
                In `min` mode, training will stop when the quantity monitored has stopped decreasing; in `max` mode it
                will stop when the quantity  monitored has stopped increasing; in `auto` mode, the direction is
                automatically inferred from the name of the monitored quantity.

        :return:
        """

        model_checkpoint = ModelCheckpoint(
            filepath=self.model_path + '/cnn_1level_model_%s_%s.pkl' % (
                self.corpora,
                self.task_type,
            ),
            monitor='val_loss',
            verbose=0,
            save_weights_only=False,
            save_best_only=True,
            mode='min')

        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=15,
                                       verbose=0,
                                       mode='auto')

        if dev:
            self.model.fit(x=self.x_train,
                           y=self.y_train,
                           batch_size=self.batch_size_level_1,
                           epochs=self.epoch,
                           validation_data=(self.x_dev, self.y_dev),
                           verbose=2,
                           shuffle=True,
                           callbacks=[model_checkpoint, early_stopping])

            del (self.char_emb_feature, self.x_train, self.y_train,
                 self.out_size, self.data_for_emb_layers, self.symbol2ind,
                 self.max_token_length, self.model)
        else:
            self.model.fit(x=self.x_train,
                           y=self.y_train,
                           batch_size=self.batch_size_level_1,
                           epochs=self.epoch,
                           validation_split=0.1,
                           verbose=2,
                           shuffle=True,
                           callbacks=[model_checkpoint, early_stopping])

            del (self.char_emb_feature, self.x_train, self.y_train,
                 self.out_size, self.data_for_emb_layers, self.symbol2ind,
                 self.max_token_length, self.model)

    def plot_report(self, model_history):
        """
        Plot of loss function.
        :param model_history:
        :return:
        """

        plt.plot(model_history.history['loss'])
        plt.plot(model_history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig(self.data_path + '/nn_report/' +
                    'cnn_1level_model_loss.jpeg')
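
A hedged driver sketch mirroring the level-2 tagger (hypothetical usage): build, compile, train; the dev flag must match the data prepared for level 1.

if __name__ == '__main__':
    tagger = CNN1levelTagger(dev=False)
    tagger.network_initialization(verbose=1)
    tagger.training(verbose=1, dev=False)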
Code example #10
class LSTMTagger:
    """
    Mean of option "Task_type". Grammem: "Grammem_tag_Animacy", POS, All (all morphology properties.).
    """
    def __init__(self):
        self.config = Config(model_type='bilstm')
        self.task_type = self.config.get('Task_type')

        print('#' * 100)
        print('Task:', self.task_type)
        print('Corpora:', self.config.get('Corpora'))
        print('#' * 100)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/%s/' % (self.config.get('Corpora'),))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        char_emb_feature = self.load_binary_data(self.data_path + '/char_emb_feature_data.pkl')
        w2v_emb_feature = self.load_binary_data(self.data_path + '/w2v_emb_feature_data.pkl')

        # word2vec vectors are real-valued, so keep them as floats for the embedding layer
        w2v_emb_feature['w2v_matrix'] = w2v_emb_feature['w2v_matrix'].astype('float32')
        self.word2ind = char_emb_feature['word2index']
        self.sent_max_len = char_emb_feature['max_sent_length']

        self.x_train = self.load_binary_data(self.data_path + '/x_train.pkl')

        self.max_features = len(self.word2ind) + 1
        self.random_embedding_size = self.config.get('Network_options').get('random_embedding_size')
        self.lstm_hidden_size = self.config.get('Network_options').get('lstm_hidden_size')
        self.dense_hidden_size = self.config.get('Network_options').get('dense_hidden_size')
        self.batch_size = self.config.get('Network_options').get('batch_size')
        self.epoch = self.config.get('Network_options').get('training_epoch')

        self.data_for_emb_layers = {
            'char': char_emb_feature['char_matrix'],
            'w2v':  w2v_emb_feature['w2v_matrix']
        }

        self.y_train, self.out_size = self.load_grammatical_cat()

        print('data embedding char shape:', self.data_for_emb_layers['char'].shape, self.data_for_emb_layers['char'].dtype)
        print('data embedding w2v shape:', self.data_for_emb_layers['w2v'].shape, self.data_for_emb_layers['w2v'].dtype)
        
        self.model = None

    def load_grammatical_cat(self):
        """
        Loading y data for each grammatical category.
        """

        labels = []
        labels_2indexes = []
        for files in os.listdir(self.data_path):

            if 'y_train_Grammem_tag' in files:
                if self.task_type != 'All': 
                    if self.task_type in files:
                        labels.append(
                            [files.split('.')[0].split('y_train_Grammem_tag_')[1], 
                            self.load_binary_data(self.data_path + '/' + files)])
                else:
                    labels.append(
                        [files.split('.')[0].split('y_train_Grammem_tag_')[1], 
                        self.load_binary_data(self.data_path + '/' + files)])

            if 'y_label2ind_Grammem_tag' in files:
                if self.task_type != 'All': 
                    if self.task_type in files:
                        labels_2indexes.append(
                            [files.split('.')[0].split('y_label2ind_Grammem_tag_')[1], 
                            len(self.load_binary_data(self.data_path + '/' + files)) + 1])
                else:
                    labels_2indexes.append(
                        [files.split('.')[0].split('y_label2ind_Grammem_tag_')[1], 
                        len(self.load_binary_data(self.data_path + '/' + files)) + 1])
        return sorted(labels), sorted(labels_2indexes)

    def load_binary_data(self, path_to_data):
        """
        Загрузка признаков
        :return:
        """

        with open(path_to_data, 'rb') as f:
            data = pickle.load(f)
        return data

    def network_initialization(self):
        """
        Network initialization.

        MaskLambda
            The next crucial building block is a way to reverse sequences, and also their masks. One way to reverse
            sequences in Keras is with a Lambda layer that wraps x[:,::-1,:] on the input tensor. Unfortunately I
            couldn't find a way in straight  Keras that will also reverse the mask, but @braingineer created the perfect
            custom lambda layer that allows us to manipulate the mask with an arbitrary function.
            
        
        LSTM vs GRU 
        The GRU unit controls the flow of information like the LSTM unit, but without having to use a memory unit. 
        It just exposes the full hidden content without any control.
        
        GRU is relatively new, and from my perspective, the performance is on par with LSTM, but computationally 
        more efficient (less complex structure as pointed out). So we are seeing it being used more and more.
        
        https://arxiv.org/pdf/1412.3555v1.pdf
            
        :return:
        """

        #####################################################################################################################
        input_char_emb = Input((self.sent_max_len,), name='input_char_emb')
        char_emb = Embedding(
            input_dim=self.max_features,
            output_dim=self.data_for_emb_layers.get('char').shape[1],
            input_length=self.sent_max_len,
            weights=[self.data_for_emb_layers.get('char')],
            mask_zero=True,
            trainable=False
            )(input_char_emb)
        bilstm_layer_char_emb_0 = Bidirectional(LSTM(
            self.lstm_hidden_size,
            return_sequences=True,
            activation='tanh',
            recurrent_activation="hard_sigmoid",))(char_emb)
            # dropout=0.2, recurrent_dropout=0.2)))
        bilstm_layer_char_emb_1 = Bidirectional(LSTM(
            self.lstm_hidden_size,
            return_sequences=True,
            activation='tanh',
            recurrent_activation="hard_sigmoid",))(bilstm_layer_char_emb_0)
            # dropout=0.2, recurrent_dropout=0.2)))
        char_emb_out = Dropout(0.5)(bilstm_layer_char_emb_1)
        #####################################################################################################################
        input_w2v_emb = Input((self.sent_max_len,), name='input_w2v_emb')
        mw2v_emb = Embedding(
            input_dim=self.max_features,
            output_dim=self.data_for_emb_layers.get('w2v').shape[1],
            input_length=self.sent_max_len,
            weights=[self.data_for_emb_layers.get('w2v')],
            mask_zero=True,
            trainable=False
        )(input_w2v_emb)
        bilstm_layer_w2v_emb_0 = Bidirectional(LSTM(
            self.lstm_hidden_size,
            return_sequences=True,
            activation='tanh',
            recurrent_activation="hard_sigmoid", ))(mw2v_emb)
            # dropout=0.2, recurrent_dropout=0.2)))
        bilstm_layer_w2v_emb_1 = Bidirectional(LSTM(
            self.lstm_hidden_size,
            return_sequences=True,
            activation='tanh',
            recurrent_activation="hard_sigmoid", ))(bilstm_layer_w2v_emb_0)
            # dropout=0.2, recurrent_dropout=0.2)))
        w2v_emb_out = Dropout(0.5)(bilstm_layer_w2v_emb_1)
        #####################################################################################################################
        dense_network = concatenate([char_emb_out, w2v_emb_out])
        dense_network = TimeDistributed(Dense(self.dense_hidden_size, activation='relu'))(dense_network)
        dense_network = Dropout(0.5)(dense_network)
        dense_network = TimeDistributed(Dense(self.dense_hidden_size, activation='relu'))(dense_network)
        dense_network = Dropout(0.5)(dense_network)

        # One softmax head per grammatical category (multi-task outputs).
        output_layers = []
        for i in range(len(self.y_train)):
            output = Dense(self.out_size[i][1], activation='softmax', name=self.out_size[i][0])(dense_network)
            output_layers.append(output)

        #####################################################################################################################

        model = Model(inputs=[input_char_emb, input_w2v_emb], outputs=output_layers)
        model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["accuracy"])
        self.model = model

        plot_model(model, to_file=self.model_path + '/model_schema_%s_%s.png' %
                                                                 (self.config.get('Corpora'),
                                                                  self.task_type,), show_shapes=True)
        print(model.summary())
        print('Model compilation done.')

    def training(self):
        """
        Обучение.

        model_checkpoint
            For `val_acc`, this should be `max`, for `val_loss` this should  be `min`, etc. In `auto` mode, the
            direction is automatically inferred from the name of the monitored quantity.

        early_stopping
                In `min` mode, training will stop when the quantity monitored has stopped decreasing; in `max` mode it
                will stop when the quantity  monitored has stopped increasing; in `auto` mode, the direction is
                automatically inferred from the name of the monitored quantity.

        :return:
        """

        model_checkpoint = ModelCheckpoint(
            filepath=os.path.join(self.model_path,
                                  'model_%s_%s.pkl' % (self.config.get('Corpora'), self.task_type)),
            monitor='val_loss',
            verbose=0,
            save_weights_only=False,
            save_best_only=True,
            mode='min')

        # TODO Early stopping in multi-task learning
        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=5,
                                       verbose=1,
                                       mode='auto')

        # The same token-index sequence feeds both embedding inputs (char and w2v).
        train_data = [self.x_train, self.x_train]

        history = self.model.fit(train_data,
                                 [el[1] for el in self.y_train],
                                 batch_size=self.batch_size,
                                 epochs=self.epoch,
                                 validation_split=0.1,
                                 verbose=2,
                                 shuffle=True,
                                 callbacks=[model_checkpoint, early_stopping])
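
        # Assumed intent: pass the otherwise-unused `history` to plot_report
        # to draw the train/validation loss curves.
        self.plot_report(history)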

    def plot_report(self, model_history):
        """
        Plot of the training/validation loss.
        :param model_history:
        :return:
        """

        plt.plot(model_history.history['loss'])
        plt.plot(model_history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig(self.data_path + '/nn_report/' + 'model_loss.jpeg')
コード例 #11
0
class LabelPreparation:
    def __init__(self):
        self.config = Config(model_type='bilstm')

        print('#' * 100)
        print('Task:', self.config.get('Task_type'))
        print('Corpora:', self.config.get('Corpora'))
        print('Label encoding')
        print('#' * 100)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')
        self.sent_max_len = self.config.get('Sent_max_length')
        self.corpora_limit = self.config.get('Corpora_sent_limit')

        sent_test = load_data(self.data_path + '/' + self.config.get('Corpora') + '/test')
        if self.corpora_limit != 'False':
            sent_train = load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')[:self.corpora_limit]
        else:
            sent_train = load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')

        for classification_task in self.config.get('Classification_tasks')[self.config.get('Corpora')]:
            print('\nClassification tasks:', classification_task)
            # We must find all unique labels in test and train in order to replace them with indices.
            y_set = self.y_set_form(
                sent_test + sent_train,
                (classification_task,
                self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
                )
            unique_labels = unique_elements(y_set)
            self.label2ind, self.ind2label = elements_encode(unique_labels)
            self.max_label_numbers = max(self.label2ind.values()) + 1
            print('labels: %s; with label for 0: %s' % (len(unique_labels), self.max_label_numbers))
            del (y_set, unique_labels)

            # After we can encode test and train data.
            y_train = self.data_prepare(
                self.y_set_form(
                sent_train,
                (classification_task,
                self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
                )
            )
            save_binary(y_train,
                        self.data_path + '/%s/' % (self.config.get('Corpora'),) + '/bilstm/' + 'y_train_%s.pkl' % (
                        classification_task,))
            del y_train

            y_test = self.data_prepare(
                self.y_set_form(
                sent_test,
                (classification_task,
                self.config.get('Classification_tasks')[self.config.get('Corpora')][classification_task])
                )
            )
            save_binary(y_test,
                        self.data_path + '/%s/' % (self.config.get('Corpora'),) + '/bilstm/' + 'y_test_%s.pkl' % (
                        classification_task,))

            del y_test

            # Save label2ind
            save_binary(self.label2ind,
                        self.data_path + '/%s/' % (
                            self.config.get('Corpora'),) + '/bilstm/' + 'y_label2ind_%s.pkl' % (classification_task,))

    def y_set_form(self, data, task_type):
        """
        Forming y in accordance with classification task.
        :param data:
        :param task_type:
        :return:
        """

        y = None
        if 'Grammem_tag' in task_type[0]:
            y = [[''.join([grammems for grammems in t[4].split('|')
                           if task_type[1][0] in grammems])
                  for t in sent] for sent in data]
            y = [[t if t != '' else 'Null' for t in s] for s in y]
            print(y[:10])
        else:
            if task_type[0] in ('POS', 'Morpho_tag'):
                y = [[t[task_type[1][0]] for t in sent] for sent in data]
                print(y[:10])
            if task_type[0] == 'All':
                y = [['Pos=' + t[task_type[1][0]] + '|' + t[task_type[1][1]] for t in sent] for sent in data]
                print(y[:10])
        return y
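
    # Worked example for y_set_form (values illustrative): if
    # task_type = ('Grammem_tag_Case', ('Case',)) and a token's feature
    # column t[4] is 'Case=Nom|Number=Sing', the inner join keeps only
    # 'Case=Nom'; tokens lacking the category are mapped to 'Null'.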

    def data_prepare(self, y_set):
        """
        Creating one-hot vector for encoding labels.
        :param y:
        :return:
        """

        # Left-pad with the reserved index 0, then one-hot encode each label;
        # pad_sequences afterwards only enforces the fixed sentence length.
        y_enc = [[0] * (self.sent_max_len - len(ey)) + [self.label2ind[c] for c in ey] for ey in y_set]
        y_enc = [[one_hot_encode(c, self.max_label_numbers) for c in ey] for ey in y_enc]
        y_train = pad_sequences(y_enc, maxlen=self.sent_max_len)
        print('Label tensor shapes:')
        print('y_shape:', y_train.shape)
        return y_train
コード例 #12
0
class ModelCNN1Test:
    """
    Mean of option "Task_type". Grammem: "Grammem_tag_Animacy", POS, All (all morphology properties.).
    """
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 verbose=1,
                 class_index=3):

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.corpora = self.config.get('Corpora')
        else:
            self.task_type = task_type
            self.corpora = corpora

        logging.info('\nModel test for 1 level model.')
        logging.info("Task: {}".format(self.task_type))
        logging.info("Corpora: {}".format(self.corpora))
        logging.info("Label index: {}".format(str(class_index)))

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path +
                                         '/../data/%s/cnn/model_level_1/' %
                                         (self.corpora, ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.x_test = load_bin_data(self.data_path + '/x_test_cnn1level.pkl')
        if verbose == 1:
            print('x_test shape:', self.x_test.shape)
        self.y_test = load_grammatical_cat_model1(self.data_path,
                                                  self.task_type,
                                                  verbose=verbose)

        self.estimation = 0

    def testing(self, verbose=1):
        """
        Model testing.
        :return:
        """

        model = load_model(self.model_path + '/cnn_1level_model_%s_%s.pkl' % (
            self.corpora,
            self.task_type,
        ))
        pr = model.predict(self.x_test, verbose=verbose)
        fyh, fpr = preparation_data_to_score_model1(self.y_test, pr)
        logging.info("Testing sklearn acc: {}".format(
            str(accuracy_score(fyh, fpr))))
        logging.info("Testing sklearn f1_score: {}".format(
            str(f1_score(fyh, fpr, average='weighted'))))
        logging.info("Testing sklearn cohen_kappa_score: {}\n".format(
            str(cohen_kappa_score(fyh, fpr))))
        self.save_classes(fpr)
        self.estimation = accuracy_score(fyh, fpr)
        del (fyh, fpr, model, self.x_test, self.y_test, pr)

    def save_classes(self, data):
        save_binary(
            data, self.data_path + '/results/cnn1_level_marks_%s.pkl' %
            (self.task_type, ))
コード例 #13
0
class ModelTest:
    """
    Mean of option "Task_type". Grammem: "Grammem_tag_Animacy", POS, All (all morphology properties.).
    """
    def __init__(self):
        self.config = Config(model_type='bilstm')
        self.task_type = self.config.get('Task_type')

        print('#' * 100)
        print('Task:', self.task_type)
        print('Corpora:', self.config.get('Corpora'))
        print('#' * 100)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/%s/' %
                                         (self.config.get('Corpora'), ))
        self.model_path = os.path.abspath(file_path + '/../tagger_models/')

        self.x_test = self.load_binary_data(self.data_path + '/x_test.pkl')
        print('X test shape:', self.x_test.shape)

        self.y_test = self.load_grammatical_cat()
        self.model = None

    def load_grammatical_cat(self):
        """
        Loading y data for each grammatical category.
        """

        labels = []
        for file_name in os.listdir(self.data_path):
            if 'y_test_Grammem_tag' in file_name:
                if self.task_type == 'All' or self.task_type in file_name:
                    labels.append([
                        file_name.split('.')[0].split('y_test_Grammem_tag_')[1],
                        self.load_binary_data(self.data_path + '/' + file_name)
                    ])
        return sorted(labels)

    def load_binary_data(self, path_to_data):
        """
        Data load.
        :return:
        """

        with open(path_to_data, 'rb') as f:
            data = pickle.load(f)
        return data

    def testing(self):
        """
        Model testing.
        :return:
        """

        test_data = [self.x_test, self.x_test]
        model = load_model(self.model_path + '/model_%s_%s.pkl' % (
            self.config.get('Corpora'),
            self.task_type,
        ))

        print('Model loaded.')
        pr = model.predict(test_data, verbose=1)
        for i in range(len(self.y_test)):
            print('*' * 100)
            print('Testing category:', self.y_test[i][0])
            fyh, fpr = self.preparation_data_to_score(self.y_test[i][1], pr[i])
            print('Testing sklearn: acc:', accuracy_score(fyh, fpr))
            print('Testing sklearn: f1_score:',
                  f1_score(fyh, fpr, average='weighted'))
            # The Kappa or Cohen’s kappa is the classification accuracy normalized by the imbalance of the classes in the data.
            print('Testing sklearn: cohen_kappa_score:',
                  cohen_kappa_score(fyh, fpr))
            del (fyh, fpr)

    def preparation_data_to_score(self, yh, pr):
        """    
        yh = [array([ 57, 156, 300, 120, 306,  31, 148,  38,  70,  36, 196, 306, 200,
                31, 116, 275]), array([ 36,  35,  35, 294, 109, 275])]
        ypr = [array([  0,   0,   0,   0,   0, 120, 120, 120, 120, 120, 120, 120, 120,
               120, 120, 120]), array([0, 0, 0, 0, 0, 0])]
    
        fyh = [57, 156, 300, 120, 306, 31, 148, 38, 70, 36, 196, 306, 200, 31, 116, 275, 36, 35, 35, 294, 109, 275]
        fpr = [0, 0, 0, 0, 0, 0, 275, 275, 275, 275, 275, 275, 275, 275, 275, 275, 0, 0, 0, 0, 0, 275]
    
        :param yh:
        :param pr:
        :return:
        """

        yh = yh.argmax(2)
        pr = [list(np.argmax(el, axis=1)) for el in pr]
        # Index of the first non-zero element in each row, e.g. [37, 47].
        coords = [np.where(yhh > 0)[0][0] for yhh in yh]
        # Keep only the non-zero (non-padding) part of the gold labels.
        yh = [yhh[co:] for yhh, co in zip(yh, coords)]
        # Trim the predictions to the same boundaries as the gold labels.
        ypr = [prr[co:] for prr, co in zip(pr, coords)]
        # Flatten both arrays for comparison.
        fyh = [c for row in yh for c in row]
        fpr = [c for row in ypr for c in row]
        return fyh, fpr
コード例 #14
0
class W2vEmb:
    def __init__(self):

        self.config = Config(model_type='bilstm')

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')

        sent = load_data(self.data_path + '/' + self.config.get('Corpora') + '/test') + \
               load_data(self.data_path + '/' + self.config.get('Corpora') + '/train')

        # Note: newer gensim (>= 1.0) moved this loader to
        # gensim.models.KeyedVectors.load_word2vec_format.
        self.model = gensim.models.Word2Vec.load_word2vec_format(
            self.data_path + '/' +
            "w2v_models/mix_corpora_5_10_300_skip_neg.bin",
            binary=True)
        unique_tokens = self.unique_tokens(sent)
        w2v_embeddings = self.form_emb_vocab(unique_tokens)

        print('vocabulary:', len(unique_tokens))
        print('w2v embeddings:', w2v_embeddings.shape)

        self.save_emb(('w2v_matrix', w2v_embeddings))

    def form_emb_vocab(self, unique_tokens):
        tokens_with_zero_vector = 0
        embed_vocab = list()
        # Word2vec vectors are real-valued; an integer dtype would truncate them.
        base_vector = numpy.zeros(300, dtype='float32')
        embed_vocab.append(base_vector)
        for tokens in unique_tokens:
            feature_vector = base_vector
            try:
                feature_vector = self.model[tokens.lower()]
            except KeyError:
                tokens_with_zero_vector += 1
            embed_vocab.append(feature_vector)
        print('tokens_with_zero_vector:', tokens_with_zero_vector)
        return numpy.array(embed_vocab, dtype='float32')

    def save_emb(self, w2v_matrix=None):
        """
        Сохранение данных по признаку.
        :param w2v_matrix:
        :return:
        """

        emb_feature_data = dict()
        emb_feature_data[w2v_matrix[0]] = w2v_matrix[1]
        self.save_binary(emb_feature_data, 'w2v_emb_feature_data')

    def save_binary(self, data, file_name):
        """
        Сохранение данных в бинарном формате.
        :param data:
        :param file_name:
        :return:
        """

        with open(
                self.data_path + '/%s/' % (self.config.get('Corpora'), ) +
                '/bilstm/%s.pkl' % (file_name, ), 'wb') as file:
            pickle.dump(data, file)

    def unique_tokens(self, data):
        return sorted(set([tokens[1] for sent in data for tokens in sent]))
コード例 #15
0
class DataCNN1Level:
    def __init__(self,
                 use_config=True,
                 corpora='UD_Russian-SynTagRus',
                 task_type='POS',
                 class_index=3,
                 dev=True,
                 verbose=1):

        if use_config:
            self.config = Config(model_type='cnn')
            self.task_type = self.config.get('Task_type')
            self.corpora = self.config.get('Corpora')
            # Assumed: look up the label column for the configured corpus and
            # task, mirroring how Classification_tasks is indexed elsewhere.
            self.class_index = self.config.get(
                'Classification_tasks')[self.corpora][self.task_type][0]
        else:
            self.task_type = task_type
            self.corpora = corpora
            self.class_index = class_index

        print('Data preparation for 1 level model.')
        print('Task:', self.task_type)
        print('Corpora:', self.corpora)
        print('Label index:', self.class_index)

        file_path = os.path.split(os.path.abspath(__file__))[0]
        self.data_path = os.path.abspath(file_path + '/../data/')

        sent_test = load_data(self.data_path + '/' + self.corpora + '/test')
        sent_train = load_data(self.data_path + '/' + self.corpora + '/train')
        if dev:
            sent_valid = load_data(self.data_path + '/' + self.corpora +
                                   '/dev')
        else:
            sent_valid = []

        test_tokens_data_seq = seq_form(sent_test, task_type=self.task_type)
        train_tokens_data_seq = seq_form(sent_train, task_type=self.task_type)
        if dev:
            dev_tokens_data_seq = seq_form(sent_valid,
                                           task_type=self.task_type)
        else:
            dev_tokens_data_seq = []

        test_labels_data_seq = seq_form(sent_test,
                                        data_type='y',
                                        task_type=self.task_type,
                                        task_index=self.class_index)
        train_labels_data_seq = seq_form(sent_train,
                                         data_type='y',
                                         task_type=self.task_type,
                                         task_index=self.class_index)
        if dev:
            dev_labels_data_seq = seq_form(sent_valid,
                                           data_type='y',
                                           task_type=self.task_type,
                                           task_index=self.class_index)
        else:
            dev_labels_data_seq = []

        test_tokens_data = [
            tokens for sent in test_tokens_data_seq for tokens in sent
        ]
        train_tokens_data = [
            tokens for sent in train_tokens_data_seq for tokens in sent
        ]
        if dev:
            dev_tokens_data = [
                tokens for sent in dev_tokens_data_seq for tokens in sent
            ]
        else:
            dev_tokens_data = []

        test_labels_data = [
            labels for sent in test_labels_data_seq for labels in sent
        ]
        train_labels_data = [
            labels for sent in train_labels_data_seq for labels in sent
        ]
        if dev:
            dev_labels_data = [
                labels for sent in dev_labels_data_seq for labels in sent
            ]
        else:
            dev_labels_data = []

        # After we can encode y test and train data.
        self.ADDING_INDEX = 1
        self.PADDING_VALUE = 0

        UNIQUE_LABELS = sorted(
            set(test_labels_data + train_labels_data + dev_labels_data))
        self.label2ind_with_adding, self.ind2label_with_adding = labels_encode(
            UNIQUE_LABELS, 0)
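        # labels_encode is an external helper; it is assumed to behave like
        # this minimal sketch, offsetting indices so 0 stays reserved:
        #
        #     def labels_encode(labels, adding):
        #         label2ind = {l: i + adding + 1 for i, l in enumerate(labels)}
        #         ind2label = {i: l for l, i in label2ind.items()}
        #         return label2ind, ind2label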
        self.max_label_numbers = max(self.label2ind_with_adding.values())
        if verbose == 1:
            print('Unique labels:', self.max_label_numbers)
            print("\nLabels:", self.label2ind_with_adding.keys())

        y_train = self.label_data_prepare(train_labels_data, verbose=verbose)
        y_test = self.label_data_prepare(test_labels_data, verbose=verbose)
        if dev:
            y_dev = self.label_data_prepare(dev_labels_data, verbose=verbose)
        else:
            y_dev = []

        save_binary(
            y_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_test_cnn1level_%s.pkl' % (self.task_type, ))
        save_binary(
            y_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_train_cnn1level_%s.pkl' % (self.task_type, ))
        save_binary(
            y_dev, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_dev_cnn1level_%s.pkl' % (self.task_type, ))
        save_binary(
            self.label2ind_with_adding,
            self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/y_label2ind_cnn1level_%s.pkl' % (self.task_type, ))

        del (y_train, y_test, y_dev, self.label2ind_with_adding,
             self.ind2label_with_adding, self.max_label_numbers, UNIQUE_LABELS)

        # After we can encode x test, dev, train data.
        unique_tokens = sorted(
            set(test_tokens_data + train_tokens_data + dev_tokens_data))
        if verbose == 1:
            print("\nUnique tokens:", len(unique_tokens))

        self.unique_symbols = unique_chars(unique_tokens)
        self.max_token_length = max([len(token) for token in unique_tokens])
        self.symbol2ind_with_adding, self.ind2symbol_with_adding = symbols_encode(
            self.unique_symbols, self.ADDING_INDEX)
        if verbose == 1:
            print("\nUnique symbols:", self.symbol2ind_with_adding.keys())

        x_test = self.data_prepare(test_tokens_data, verbose=verbose)
        x_train = self.data_prepare(train_tokens_data, verbose=verbose)
        x_dev = self.data_prepare(dev_tokens_data, verbose=verbose)

        save_binary(
            x_test, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/x_test_cnn1level.pkl')
        save_binary(
            x_train, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/x_train_cnn1level.pkl')
        save_binary(
            x_dev, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/x_dev_cnn1level.pkl')

        char_embeddings = self.char_matrix_cnn()
        if verbose == 1:
            print('\nChar_embeddings shape:', char_embeddings.shape)

        self.save_emb(('symbol2ind', self.symbol2ind_with_adding),
                      ('ind2symbol', self.ind2symbol_with_adding),
                      ('max_token_length', self.max_token_length),
                      ('char_matrix', char_embeddings))

        del (self.symbol2ind_with_adding, self.ind2symbol_with_adding,
             self.max_token_length, char_embeddings, self.unique_symbols,
             x_test, x_train, x_dev, sent_test, sent_valid, sent_train,
             test_tokens_data_seq, train_tokens_data_seq, dev_tokens_data_seq,
             test_labels_data_seq, train_labels_data_seq, dev_labels_data_seq,
             test_tokens_data, train_tokens_data, dev_tokens_data,
             test_labels_data, train_labels_data, dev_labels_data,
             unique_tokens)

    def data_prepare(self, x_set, verbose=1):
        """
        Encoding symbols using dict symbols per digit and padding symbols sequence.
        :param x:
        :param y:
        :return:
        """

        x_enc = [[self.symbol2ind_with_adding[char] for char in token]
                 for token in x_set]
        x = pad_sequences(x_enc,
                          maxlen=self.max_token_length,
                          value=self.PADDING_VALUE)
        if verbose == 1:
            print('x tensor shapes: %s' % (x.shape, ), x[0])
        return x
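
    # Illustrative example (hypothetical values): with
    # symbol2ind_with_adding = {'a': 1, 'b': 2} and max_token_length = 5,
    # 'ab' encodes to [1, 2] and pads on the left (Keras' default) to
    # [0, 0, 0, 1, 2].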

    def save_emb(self,
                 symbol2ind=None,
                 ind2symbol=None,
                 max_token_length=None,
                 char_matrix=None):
        """
        Сохранение данных по признаку.
        """

        emb_feature_data = dict()
        emb_feature_data[symbol2ind[0]] = symbol2ind[1]
        emb_feature_data[ind2symbol[0]] = ind2symbol[1]
        emb_feature_data[max_token_length[0]] = max_token_length[1]
        emb_feature_data[char_matrix[0]] = char_matrix[1]
        save_binary(
            emb_feature_data, self.data_path + '/%s/' % (self.corpora, ) +
            'cnn/model_level_1/char_emb_cnn1_feature_data_%s.pkl' %
            (self.task_type, ))

    def char_matrix_cnn(self):
        """
        Creating matrix with char embedding for cnn network.
        Example:
            0 [1 0 0 ... 0 ]
            ! [0 1 0 0 ...]
        """

        char_emb_vocab = list()
        # Reserved row 0: one-hot vector for the padding symbol.
        null_vector = np.zeros(len(self.symbol2ind_with_adding) + 1, dtype='int8')
        null_vector[0] = 1
        char_emb_vocab.append(null_vector)
        for symbol in self.unique_symbols:
            features_per_symbol = np.zeros(len(self.symbol2ind_with_adding) + 1, dtype='int8')
            features_per_symbol[self.symbol2ind_with_adding[symbol]] = 1
            char_emb_vocab.append(features_per_symbol)
        return np.array(char_emb_vocab).astype('int8')

    def label_data_prepare(self, y_set, verbose=1):
        """
        Creating one-hot vector for encoding labels.
        :param y:
        :return:
        """

        y_set = to_categorical([self.label2ind_with_adding[l] for l in y_set],
                               self.max_label_numbers + 1)
        if verbose == 1:
            print('y tensor shapes: %s' % (y_set.shape, ), y_set[0])
        return y_set