Example #1
def nmt(model,
        corpus_src,
        path_vocab_src=path_vocab_src,
        path_vocab_tgt=path_vocab_tgt,
        src_seq_len_max=src_seq_len_max):
    # Load the source and target vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    id2word_vocab_tgt = {
        value: key
        for key, value in word2id_vocab_tgt.items()
    }
    ids = []
    id_extendeds = []
    vocab_extends = []
    # Process the input corpus
    for sentence in corpus_src:
        sent = sentence.strip().split()
        id, id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=sent,
            word2id_vocab=word2id_vocab_src,
            build_extend_vocab=True)
        ids.append(id)
        id_extendeds.append(id_extended)
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        vocab_extends.append(copy.copy(vocab_extend))
    # Pad sequences with zeros and compute the size of the extended vocabulary
    ids = nn_lib.pad_sequences(sequences=ids, max_seq_len=src_seq_len_max)
    id_extendeds = nn_lib.pad_sequences(sequences=id_extendeds,
                                        max_seq_len=src_seq_len_max)
    vocab_size_extended = max([len(i) for i in vocab_extends])
    # Arrange the data into arrays
    ids = np.array(ids)
    id_extendeds = np.array(id_extendeds)
    vocab_extends = np.array(vocab_extends).reshape([-1, 1])
    data = [ids, id_extendeds, vocab_extends]
    # Run inference to get the probability distribution at each decoding step
    tgt_prob_seqs = model.infer(data=data)
    # Convert the predictions back into natural-language sentences
    tgt_seqs = []
    for seq in tgt_prob_seqs:
        seq = np.argmax(seq, axis=1)
        seq = [id2word_vocab_tgt[id] for id in seq]
        seq = np.array(seq).reshape([-1, 1])
        tgt_seqs.append(seq)
    corpus_tgt = np.concatenate(tgt_seqs, axis=1)
    corpus_tgt = [
        ''.join([tmp for tmp in corpus_tgt[i, :] if tmp != '<PAD>'])
        for i in range(corpus_tgt.shape[0])
    ]
    return corpus_tgt
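The tail of nmt() decodes greedily: at every time step it takes the argmax over the target vocabulary and then strips '<PAD>' tokens. A self-contained sketch of that decoding step with a toy vocabulary (the ids and words here are illustrative, not taken from the project):

import numpy as np

# Toy target vocabulary: id -> word (illustrative only).
id2word = {0: '<PAD>', 1: 'hello', 2: 'world'}
# One decoded sequence: per-step probability distributions over the 3 ids.
prob_seq = np.array([[0.10, 0.80, 0.10],
                     [0.20, 0.10, 0.70],
                     [0.90, 0.05, 0.05]])
ids = np.argmax(prob_seq, axis=1)            # greedy choice at each step
words = [id2word[i] for i in ids]            # map ids back to words
sentence = ''.join(w for w in words if w != '<PAD>')
print(sentence)                              # -> 'helloworld'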
Example #2
def preprocess_data():
    # Load the corpus
    data = read_ner_corpus(path_ner + r'train_data')
    # Load the vocabulary
    word2id, vocab_size = nn_lib.read_word2id_dict(path_ner + r'word2id.pkl')
    # Convert the corpus to ids
    seqs, labels = [], []
    for (sent_, label_) in data:
        sent_id_ = nn_lib.sentence2id(sent_, word2id)
        label_id_ = [label2id[label] for label in label_]
        seqs.append(sent_id_)
        labels.append(label_id_)
    max_seq_len = max([len(x) for x in labels])
    # Arrange the corpus data
    seqs = nn_lib.pad_sequences(seqs, max_seq_len)
    seqs = np.array(seqs)
    labels = nn_lib.pad_sequences(labels, max_seq_len)
    labels = np.array(labels)
    # Build the train, test and validation sets
    x, x_vali, y, y_vali = train_test_split(seqs, labels, test_size=1024)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1024)
    # Save the train, test and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            pickle.dump(eval(name), file)
    return
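These splits are written out one pickle file per name in processed_corpus_names; later examples read them back through load_processed_corpus(), whose implementation is not shown. A minimal reader consistent with the save loop above, reusing the same module-level path and name globals, might look like this:

import pickle

def load_processed_corpus_sketch():
    # Read each pickled split back into a dict keyed by its name.
    corpus = {}
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'rb') as file:
            corpus[name] = pickle.load(file)
    return corpus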
Example #3
def demo():
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    id2word_vocab_src = {
        value: key
        for key, value in word2id_vocab_src.items()
    }
    id2word_vocab_tgt = {
        value: key
        for key, value in word2id_vocab_tgt.items()
    }
    # ' '.join([id2word_vocab_src[i] for i in corpus['x_test'][100]])
    # ''.join([id2word_vocab_tgt[i] for i in corpus['y_test'][100]])
    return
Example #4
def demo_rebuild_w2v_matrix(path_word2id, path_w2v):
    # NOTE: the arguments are overridden with hard-coded demo paths below.
    path_word2id = u'E:\\MachineLearning\\data\\seq2seq_nmt\\vocab_zh.pkl'
    path_w2v = path_wiki + u'45000-samll.txt'
    word2id_vocab, vocab_size = nn_lib.read_word2id_dict(path_word2id)
    w2v_vector = KeyedVectors.load_word2vec_format(path_w2v)
    w2v_matrix = rebuild_w2v_matrix(word2id_vocab, w2v_vector)
    return w2v_matrix
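rebuild_w2v_matrix itself is not shown here; the intent is to align the pre-trained vectors with the ids in word2id_vocab so that row i of the resulting matrix is the embedding of the word with id i. A simplified sketch under that assumption (the random initialisation of out-of-vocabulary rows is a guess, not necessarily what the project does):

import numpy as np

def rebuild_w2v_matrix_sketch(word2id_vocab, w2v_vector):
    # One row per vocabulary id, initialised randomly for OOV words.
    dim = w2v_vector.vector_size
    matrix = np.random.uniform(-0.1, 0.1, size=(len(word2id_vocab), dim))
    for word, idx in word2id_vocab.items():
        if word in w2v_vector:          # keep the pre-trained vector if present
            matrix[idx] = w2v_vector[word]
    return matrix.astype(np.float32)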
Example #5
    # Rebuild (word, predicted label) pairs from the id sequences.
    for i in range(len(word_list)):
        corpus_label = []
        for j in range(len(word_list[i])):
            corpus_label.append(
                (word_list[i][j], id2label[label_id_list[i][j]]))
        corpus_labels.append(corpus_label)

    return corpus_labels


if __name__ == '__main__':
    if flag_build_vocab is True:
        build_vocab()
    if flag_process_data is True:
        preprocess_data()
    word2id, vocab_size = nn_lib.read_word2id_dict(path_ner + r'word2id.pkl')
    corpus = load_processed_corpus()
    data = [corpus['x_train'], corpus['y_train']]
    data_test = [corpus['x_test'], corpus['y_test']]
    model = NeuralNetwork(data=data,
                          model_type='bilstm_crf',
                          loss_fun_type='bilstm_crf',
                          model_parameter={
                              'word_embd_pretrain': None,
                              'keep_prob': keep_prob,
                              'vocab_num': vocab_size,
                              'word_embd_dim': word_embd_dim,
                              'label_num': len(label2id),
                              'dim_rnn': dim_rnn,
                              'batch_size': batch_size
                          },
Example #6
def preprocess_data():
    # Load the source and target vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    # Load the corpora
    corpus_src = read_file(path_corpus_src)
    corpus_tgt = read_file(path_corpus_tgt)
    # Convert the corpus text to id values
    src_ids = []
    src_id_extendeds = []
    word2id_vocab_extends = []
    tgt_ids = []
    tgt_id_extendeds = []
    for src, tgt in zip(corpus_src, corpus_tgt):
        # Convert src to ids
        src_id, src_id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=src, word2id_vocab=word2id_vocab_src, build_extend_vocab=True)
        src_ids.append(src_id)
        src_id_extendeds.append(src_id_extended)
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert tgt to ids
        tgt_id = nn_lib.sentence2id(sent=tgt,
                                    word2id_vocab=word2id_vocab_tgt,
                                    build_extend_vocab=False)
        vocab_extend.update(word2id_vocab_tgt)
        tgt_id_extended = nn_lib.sentence2id(sent=tgt,
                                             word2id_vocab=vocab_extend,
                                             build_extend_vocab=False)
        tgt_ids.append(tgt_id)
        tgt_id_extendeds.append(tgt_id_extended)
    del src, src_id, src_id_extended, tgt, tgt_id, tgt_id_extended
    # Pad sequences with zeros and compute the size of the extended vocabulary
    src_ids = nn_lib.pad_sequences(sequences=src_ids,
                                   max_seq_len=src_seq_len_max)
    src_id_extendeds = nn_lib.pad_sequences(sequences=src_id_extendeds,
                                            max_seq_len=src_seq_len_max)
    tgt_ids = nn_lib.pad_sequences(sequences=tgt_ids,
                                   max_seq_len=tgt_seq_len_max,
                                   add_sos=True)
    tgt_id_extendeds = nn_lib.pad_sequences(sequences=tgt_id_extendeds,
                                            max_seq_len=tgt_seq_len_max,
                                            add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Arrange the data into arrays
    src_ids = np.array(src_ids)
    src_id_extendeds = np.array(src_id_extendeds)
    tgt_ids = np.array(tgt_ids)
    tgt_id_extendeds = np.array(tgt_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the train, test and validation sets
    nn_lib.generate_train_test_vali(src_ids,
                                    src_id_extendeds,
                                    tgt_ids,
                                    tgt_id_extendeds,
                                    word2id_vocab_extends,
                                    file_path=path_corpus_processed,
                                    corpus_names=corpus_names,
                                    data_test_size=128 * 2,
                                    data_vali_size=128 * 3)
    # x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
    #     = train_test_split(src_ids, src_id_extendeds, tgt_ids, tgt_id_extendeds, word2id_vocab_extends, test_size=128*3)
    # x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
    #     = train_test_split(x, x_extended, y, y_extended, vocab_extend, test_size=128*2)
    # del x, x_extended, y, y_extended, vocab_extend
    # # Save the train, test and validation sets
    # for name in processed_corpus_names:
    #     with open(path_corpus_processed + name, 'wb') as file:
    #         pickle.dump(eval(name), file)
    return
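The *_extended id sequences and per-sentence extended vocabularies built above are what a copy/pointer-style decoder needs: in-vocabulary words keep their normal id, while out-of-vocabulary words are given temporary ids appended after the base vocabulary. nn_lib.sentence2id is not shown; a simplified sketch of that behaviour, assuming the base vocabulary contains an '<UNK>' entry (the token name is an assumption):

def sentence2id_sketch(sent, word2id_vocab, build_extend_vocab=False):
    # Plain ids: unknown words fall back to the '<UNK>' id.
    ids = [word2id_vocab.get(word, word2id_vocab['<UNK>']) for word in sent]
    if not build_extend_vocab:
        return ids
    # Extended ids: unknown words get temporary ids after the base vocabulary.
    vocab_extend = {}
    id_extended = []
    for word in sent:
        if word in word2id_vocab:
            id_extended.append(word2id_vocab[word])
        elif word in vocab_extend:
            id_extended.append(vocab_extend[word])
        else:
            vocab_extend[word] = len(word2id_vocab) + len(vocab_extend)
            id_extended.append(vocab_extend[word])
    return ids, id_extended, vocab_extend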
Example #7
if __name__ == "__main__":
    # Build the vocabularies
    if flag_build_vocab is True:
        word2id_vocab_en, word2id_vocab_zh = build_vocab()
    # Preprocess the data
    if flag_process_data is True:
        preprocess_data()
    # Pre-train word vectors
    if flag_pretrain_w2v is True:
        nn_lib.train_word2vec(path_corpus=path_corpus_src,
                              word2vec_dim=word_embd_dim,
                              path_w2v_model=path_seq2seq + 'en_w2v_model',
                              path_w2v_vector=path_seq2seq + 'en_w2v_vector')
        w2v_vector = nn_lib.load_w2v_vector(path_seq2seq + 'en_w2v_vector')
        word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
            path_vocab_src)
        encoder_word_embd_pretrain = nn_lib.rebuild_w2v_matrix(
            word2id_vocab_src, w2v_vector)
        with open(path_seq2seq + 'encoder_word_embd_pretrain', 'wb') as file:
            pickle.dump(encoder_word_embd_pretrain, file)
        # word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(path_vocab_tgt)

    corpus = load_processed_corpus()
    data = [
        corpus['x_train'], corpus['x_extended_train'],
        corpus['vocab_extend_train'], corpus['y_train'],
        corpus['y_extended_train']
    ]
    data_test = [
        corpus['x_test'], corpus['x_extended_test'],
        corpus['vocab_extend_test'], corpus['y_test'],
Example #8
def preprocess_data(vocab_size):
    # Load the corpus data
    data_q = []
    data_a = []
    for file_name in corpus_list:
        print('loading file {}'.format(path_corpus + file_name))
        data_q_, data_a_ = load_file(path_corpus + file_name)
        data_q = data_q + data_q_
        data_a = data_a + data_a_
    del data_q_, data_a_
    # Build or load the vocabulary
    if flag_build_vocab:
        word2id_vocab, vocab_size = nn_lib.build_word2id_vocab(
            data=data_q,
            saved_path=path_vocab,
            vocab_size=vocab_size,
            use_seg=True)
    else:
        word2id_vocab, vocab_size = nn_lib.read_word2id_dict(path_vocab)
    # Convert the corpus text to id values
    q_ids = []
    q_id_extendeds = []
    word2id_vocab_extends = []
    a_ids = []
    a_id_extendeds = []
    for q, a in zip(data_q, data_a):
        # Convert the question Q to ids
        q_seg = nn_lib.str_segment(q)
        q_id, q_id_extended, vocab_extend = nn_lib.sentence2id(
            sent=q_seg, word2id_vocab=word2id_vocab, build_extend_vocab=True)
        q_ids.append(q_id)
        q_id_extendeds.append(q_id_extended)
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert the answer A to ids
        a_seg = nn_lib.str_segment(a)
        a_id = nn_lib.sentence2id(sent=a_seg,
                                  word2id_vocab=word2id_vocab,
                                  build_extend_vocab=False)
        vocab_extend.update(word2id_vocab)
        a_id_extended = nn_lib.sentence2id(sent=a_seg,
                                           word2id_vocab=vocab_extend,
                                           build_extend_vocab=False)
        a_ids.append(a_id)
        a_id_extendeds.append(a_id_extended)
    del q, q_seg, q_id, q_id_extended, a, a_seg, a_id, a_id_extended
    # Pad sequences with zeros and compute the size of the extended vocabulary
    q_ids = nn_lib.pad_sequences(sequences=q_ids, max_seq_len=max_seq_len)
    q_id_extendeds = nn_lib.pad_sequences(sequences=q_id_extendeds,
                                          max_seq_len=max_seq_len)
    a_ids = nn_lib.pad_sequences(sequences=a_ids,
                                 max_seq_len=max_seq_len,
                                 add_sos=True)
    a_id_extendeds = nn_lib.pad_sequences(sequences=a_id_extendeds,
                                          max_seq_len=max_seq_len,
                                          add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Arrange the data into arrays
    q_ids = np.array(q_ids)
    q_id_extendeds = np.array(q_id_extendeds)
    a_ids = np.array(a_ids)
    a_id_extendeds = np.array(a_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the train, test and validation sets
    x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
        = train_test_split(q_ids, q_id_extendeds, a_ids, a_id_extendeds, word2id_vocab_extends, test_size=1024*8)
    x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
        = train_test_split(x, x_extended, y, y_extended, vocab_extend, test_size=1024)
    del x, x_extended, y, y_extended, vocab_extend
    # Save the train, test and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            pickle.dump(eval(name), file)
    return
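train_test_split accepts any number of equally sized arrays and splits them consistently, which is how the five corpus arrays above stay aligned across the train, test and validation sets. A small self-contained example:

import numpy as np
from sklearn.model_selection import train_test_split

a = np.arange(10)
b = np.arange(10) * 10
# Both arrays are split with the same row selection, so pairs stay aligned.
a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=2)
assert len(a_test) == len(b_test) == 2
assert all(b_test == a_test * 10)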