def load(self, rdr):
    """Read one group of premises and questions from an open text reader.

    Lines are consumed from ``rdr`` until the reader is exhausted or a blank
    line is met after at least one premise has been collected. Blank lines
    before the first premise are skipped.

    Recognised line kinds:

    * ``T:`` - a premise. The marker is removed, ``ё`` is replaced by ``е``,
      the text is lower-cased, tokenized with ``self.tokenizer`` and padded
      with ``lpad_wordseq`` to ``self.max_wordseq_len``. The padded words
      are stored in ``self.premises`` and their space-joined form in
      ``self.premises_str``. A premise that appears after a question is
      reported through ``self.invalid_format()``.
    * ``Q:`` - a question. Processed like a premise but without
      lower-casing (NOTE(review): confirm whether that asymmetry is
      intended), and stored in ``self.questions``.
    * anything else is reported through ``self.invalid_format()``.

    :param rdr: iterable of text lines (for example an open file object).
    :return: ``True`` when ``self.premises`` is empty after reading, which
        signals that no further group could be read; ``False`` otherwise.
    """
    premise_marker = u'T:'
    question_marker = u'Q:'

    for raw_line in rdr:
        if not raw_line:
            # A completely empty string (not even a newline) ends the input.
            break

        line = raw_line.strip()
        if not line:
            if len(self.premises) > 0:
                # A blank line after some premises closes the current group.
                break
            # Blank lines before the first premise are ignored.
            continue

        if line.startswith(premise_marker):
            if len(self.questions) > 0:
                # Premises are not allowed once questions have started.
                self.invalid_format()

            # Cut off only the leading marker: str.replace() would also
            # delete every later occurrence of 'T:' inside the text.
            premise = line[len(premise_marker):]
            premise = premise.replace(u'ё', u'е').lower().strip()
            premise_words = lpad_wordseq(self.tokenizer.tokenize(premise),
                                         self.max_wordseq_len)
            self.premises_str.append(u' '.join(premise_words))
            self.premises.append(premise_words)
        elif line.startswith(question_marker):
            # Same prefix-only removal as for premises.
            question = line[len(question_marker):]
            question = question.replace(u'ё', u'е').strip()
            question_words = lpad_wordseq(self.tokenizer.tokenize(question),
                                          self.max_wordseq_len)
            self.questions.append(question_words)
        else:
            self.invalid_format()

    # Nothing collected means there was no further group to read.
    return len(self.premises) == 0
Exemple #2
0
    def load(self, data_folder):
        """Load evaluation groups and the pool of candidate premises.

        Reads ``evaluate_relevancy.txt`` from ``data_folder`` group by group
        into ``self.eval_data`` and reads ``premises.txt`` from the same
        folder into ``self.all_premises``.

        :param data_folder: directory that contains both input files.
        """
        eval_path = os.path.join(data_folder, 'evaluate_relevancy.txt')
        self.eval_data = []  # list of EvaluationGroup
        with codecs.open(eval_path, 'r', 'utf-8') as rdr:
            while True:
                group = EvaluationGroup(self.max_wordseq_len, self.tokenizer)
                # EvaluationGroup.load() returns a truthy value once nothing
                # more can be read from the file.
                if group.load(rdr):
                    break
                if not group.is_empty():
                    self.eval_data.append(group)

        # We need a set of irrelevant premises. They are taken from the
        # chatbot's fact base rather than from the training set
        # (premise_question_answer.csv): with the training set a question may
        # have several relevant premises, so a relevant one could be picked
        # as an irrelevant candidate by accident.
        self.all_premises = []
        premises_path = os.path.join(data_folder, 'premises.txt')
        with codecs.open(premises_path, 'r', 'utf-8') as rdr:
            for raw_line in rdr:
                text = raw_line.strip()
                if not text:
                    # Blank lines carry no premise.
                    continue
                padded = lpad_wordseq(self.tokenizer.tokenize(text),
                                      self.max_wordseq_len)
                self.all_premises.append(padded)
Exemple #3
0
def load_dataset(params):
    """Load, tokenize and pad the request-interpretation dataset.

    The dataset must be prepared in advance by the script
    ./preparation/prepare_req_interpretation_classif.py. It is read from
    ``req_interpretation_dataset.csv`` inside ``data_folder``; note that
    ``data_folder`` is not a parameter and is resolved from the enclosing
    scope.

    :param params: mapping; only ``params['padding']`` is read here. The
        value ``'left'`` selects ``lpad_wordseq``, any other value selects
        ``rpad_wordseq``.
    :return: tuple ``(samples, computed_params)`` where ``computed_params``
        holds ``max_wordseq_len``, ``nb_0`` and ``nb_1``.
    """
    tokenizer = Tokenizer()
    tokenizer.load()

    dataset_path = os.path.join(data_folder, 'req_interpretation_dataset.csv')
    df = pd.read_csv(dataset_path, sep='\t', encoding='utf-8')

    samples = []
    for _, row in df.iterrows():
        samples.append(Sample(row['text'], int(row['label'])))

    # Tokenize the samples.
    for item in samples:
        item.words = tokenizer.tokenize(item.phrase)

    # Class balance: number of samples labelled 0 and labelled 1.
    nb_0 = sum(item.y == 0 for item in samples)
    nb_1 = sum(item.y == 1 for item in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    # All samples are padded to the length of the longest token sequence.
    max_wordseq_len = max(len(item.words) for item in samples)
    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    pad = lpad_wordseq if params['padding'] == 'left' else rpad_wordseq
    for item in samples:
        item.words = pad(item.words, max_wordseq_len)

    computed_params = dict()
    computed_params['max_wordseq_len'] = max_wordseq_len
    computed_params['nb_0'] = nb_0
    computed_params['nb_1'] = nb_1

    return samples, computed_params
Exemple #4
0
    def extract_entity(self, entity_name, phrase, text_utils, embeddings):
        """Return the words of ``phrase`` that the model labels as entity.

        The phrase is tokenized, padded to ``self.max_inputseq_len``,
        vectorized into ``self.X_probe`` and passed to ``self.model``. Words
        whose predicted label equals 1 are joined with single spaces.

        :param entity_name: currently unused (see the TODO below).
        :param phrase: text to extract the entity from.
        :param text_utils: object providing ``tokenize(phrase)``.
        :param embeddings: object providing ``vectorize_words(...)``.
        :return: the selected words joined by spaces, stripped.
        """
        # TODO: use the model for the given entity_name once there are many
        # different entities

        # Reset the reusable input buffer before filling it again.
        self.X_probe.fill(0)

        tokens = text_utils.tokenize(phrase)
        pad = rpad_wordseq if self.padding == 'right' else lpad_wordseq
        tokens = pad(tokens, self.max_inputseq_len)

        embeddings.vectorize_words(self.w2v_filename, tokens, self.X_probe, 0)

        model_inputs = {'input': self.X_probe}
        scores = self.model.predict(x=model_inputs)[0]
        labels = np.argmax(scores, axis=-1)

        picked = []
        for token, label in zip(tokens, labels):
            if label == 1:
                picked.append(token)

        return u' '.join(picked).strip()
Exemple #5
0
 def pad_wordseq(self, words, n):
     """Pad ``words`` to ``n`` items on the side chosen by ``self.padding``.

     ``self.padding == 'left'`` delegates to ``lpad_wordseq``; any other
     value delegates to ``rpad_wordseq``.
     """
     pad = lpad_wordseq if self.padding == 'left' else rpad_wordseq
     return pad(words, n)
Exemple #6
0
    # Expose the embeddings object and its vector size through
    # computed_params, which is later passed to generate_rows().
    computed_params['embeddings'] = embeddings
    computed_params['word_dims'] = embeddings.vector_size

    # Rebuild the model from its JSON architecture description, then load
    # the weights into it.
    print('Restoring model architecture from {}'.format(arch_filepath))
    with open(arch_filepath, 'r') as f:
        model = model_from_json(f.read())

    print('Loading model weights from {}'.format(weights_path))
    model.load_weights(weights_path)

    tokenizer = Tokenizer()
    tokenizer.load()

    # Interactive loop: read a phrase from the keyboard, run the model on it
    # and print both output scores. No exit condition is visible in this
    # fragment.
    while True:
        phrase = utils.console_helpers.input_kbd(':> ').strip()
        # The label 0 is only a placeholder here; the output is printed, not
        # compared with it.
        sample1 = Sample(phrase, 0)
        sample1.words = tokenizer.tokenize(phrase)

        # Pad the tokens on the side selected by `padding`.
        if padding == 'left':
            sample1.words = lpad_wordseq(sample1.words, max_wordseq_len)
        else:
            sample1.words = rpad_wordseq(sample1.words, max_wordseq_len)

        # NOTE(review): the meaning of the positional arguments 2 and 1 of
        # generate_rows() is not visible here - confirm against its
        # definition.
        for istep, xy in enumerate(
                generate_rows([sample1], 2, computed_params, 1)):
            # Only the first item of the yielded value (the inputs) is used.
            x = xy[0]
            y_pred = model.predict(x=x, verbose=0)[0]
            print('y==0 --> {}'.format(y_pred[0]))
            print('y==1 --> {}'.format(y_pred[1]))
            # Only the first yielded batch is needed for a single sample.
            break
Exemple #7
0
    # Class balance: number of samples labelled 0 and labelled 1.
    nb_0 = sum(sample.y == 0 for sample in samples)
    nb_1 = sum(sample.y == 1 for sample in samples)
    logging.info('nb_0={} nb_1={}'.format(nb_0, nb_1))

    # Longest token sequence over both phrases of every sample; it is used
    # as the common padded length below.
    max_wordseq_len = 0
    for sample in samples:
        for phrase in [sample.phrase1, sample.phrase2]:
            words = tokenizer.tokenize(phrase)
            max_wordseq_len = max(max_wordseq_len, len(words))

    logging.info('max_wordseq_len={}'.format(max_wordseq_len))

    # Tokenize both phrases again and pad them on the side selected by
    # `padding`: 'left' uses lpad_wordseq, anything else uses rpad_wordseq.
    if padding == 'left':
        for sample in samples:
            sample.words1 = lpad_wordseq(tokenizer.tokenize(sample.phrase1),
                                         max_wordseq_len)
            sample.words2 = lpad_wordseq(tokenizer.tokenize(sample.phrase2),
                                         max_wordseq_len)
    else:
        for sample in samples:
            sample.words1 = rpad_wordseq(tokenizer.tokenize(sample.phrase1),
                                         max_wordseq_len)
            sample.words2 = rpad_wordseq(tokenizer.tokenize(sample.phrase2),
                                         max_wordseq_len)

    # Total number of additional features fed to the network input
    # besides the two separate sentences.
    nb_addfeatures = 0

    if net_arch == 'cnn2':
        # попарные похожести слов в двух предложениях.