Example #1
def preprocess_data():
    # Read the corpus
    data = read_ner_corpus(path_ner + r'train_data')
    # Load the word -> id vocabulary
    word2id, vocab_size = nn_lib.read_word2id_dict(path_ner + r'word2id.pkl')
    # Convert the corpus to id sequences
    seqs, labels = [], []
    for (sent_, label_) in data:
        sent_id_ = nn_lib.sentence2id(sent_, word2id)
        label_id_ = [label2id[label] for label in label_]
        seqs.append(sent_id_)
        labels.append(label_id_)
    max_seq_len = max([len(x) for x in labels])
    # Pad and shape the corpus data
    seqs = nn_lib.pad_sequences(seqs, max_seq_len)
    seqs = np.array(seqs)
    labels = nn_lib.pad_sequences(labels, max_seq_len)
    labels = np.array(labels)
    # Build the training, test, and validation sets
    x, x_vali, y, y_vali = train_test_split(seqs, labels, test_size=1024)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1024)
    # Save the training, test, and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            pickle.dump(eval(name), file)
    return
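The example assumes nn_lib.pad_sequences right-pads every id sequence with 0 up to max_seq_len. A minimal, self-contained sketch of that assumed behaviour (the real nn_lib implementation may differ, e.g. in its PAD id or truncation rule):

def pad_sequences_sketch(sequences, max_seq_len, pad_id=0):
    # Right-pad (or truncate) every id sequence to exactly max_seq_len entries.
    padded = []
    for seq in sequences:
        seq = list(seq)[:max_seq_len]  # truncate overly long sequences
        padded.append(seq + [pad_id] * (max_seq_len - len(seq)))
    return padded

print(pad_sequences_sketch([[5, 2, 7], [3]], max_seq_len=4))  # [[5, 2, 7, 0], [3, 0, 0, 0]]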
Example #2
def nmt(model,
        corpus_src,
        path_vocab_src=path_vocab_src,
        path_vocab_tgt=path_vocab_tgt,
        src_seq_len_max=src_seq_len_max):
    # Load the source and target vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    id2word_vocab_tgt = {
        value: key
        for key, value in word2id_vocab_tgt.items()
    }
    ids = []
    id_extendeds = []
    vocab_extends = []
    # Process the input corpus
    for sentence in corpus_src:
        sent = sentence.strip().split()
        id, id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=sent,
            word2id_vocab=word2id_vocab_src,
            build_extend_vocab=True)
        ids.append(id)
        id_extendeds.append(id_extended)
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        vocab_extends.append(copy.copy(vocab_extend))
    # Pad sequences with zeros and compute the extended-vocabulary size
    ids = nn_lib.pad_sequences(sequences=ids, max_seq_len=src_seq_len_max)
    id_extendeds = nn_lib.pad_sequences(sequences=id_extendeds,
                                        max_seq_len=src_seq_len_max)
    vocab_size_extended = max([len(i) for i in vocab_extends])
    # Shape the data into arrays
    ids = np.array(ids)
    id_extendeds = np.array(id_extendeds)
    vocab_extends = np.array(vocab_extends).reshape([-1, 1])
    data = [ids, id_extendeds, vocab_extends]
    # Run inference to get the per-step probability distributions
    tgt_prob_seqs = model.infer(data=data)
    # Convert the predictions back to natural-language sentences.
    # Note: the indexing below assumes model.infer returns time-major output,
    # i.e. one [batch_size, vocab_size] distribution per decoding step.
    tgt_seqs = []
    for seq in tgt_prob_seqs:
        seq = np.argmax(seq, axis=1)
        seq = [id2word_vocab_tgt[id] for id in seq]
        seq = np.array(seq).reshape([-1, 1])
        tgt_seqs.append(seq)
    corpus_tgt = np.concatenate(tgt_seqs, axis=1)
    corpus_tgt = [
        ''.join([tmp for tmp in corpus_tgt[i, :] if tmp != '<PAD>'])
        for i in range(corpus_tgt.shape[0])
    ]
    return corpus_tgt
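The vocab_extend re-basing above follows a pointer/copy-style convention: it is assumed (not verified against nn_lib) that sentence2id numbers source OOV words starting at len(word2id_vocab_src), so the shift moves those copy ids to sit right after the target vocabulary. A toy, self-contained illustration:

word2id_vocab_src = {'<PAD>': 0, '<UNK>': 1, 'hello': 2}            # toy source vocab, size 3
word2id_vocab_tgt = {'<PAD>': 0, '<UNK>': 1, '你好': 2, '再见': 3}     # toy target vocab, size 4
vocab_extend_raw = {'blockchain': 3}   # OOV word, numbered from len(source vocab)
vocab_extend = {
    key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
    for key, value in vocab_extend_raw.items()
}
print(vocab_extend)  # {'blockchain': 4} -> first id slot after the target vocabulary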
Example #3
def ner_predict(model, x, word2id, label2id, max_len=None, do_word2id=True):
    # Build the reverse mappings (id -> word, id -> label)
    id2word = {id: word for word, id in word2id.items()}
    id2label = {id: label for label, id in label2id.items()}
    # Determine the maximum sequence length
    if max_len is None:
        max_len = max(len(seq) for seq in x)
    # Normalize the input text
    if do_word2id:
        seqs = []
        word_list = []
        for seq in x:
            seq = list(seq)
            word_list.append(seq)
            seq = nn_lib.sentence2id(seq, word2id)
            seqs.append(seq)
        seqs = nn_lib.pad_sequences(seqs, max_len)
    else:
        seqs = x
        word_list = []
        for row in x:
            word_list.append(pd.Series(row).map(id2word).tolist())  # assumes pandas is imported as pd
    seqs = np.array(seqs)
    # Predict the labels
    label_id_list = model.infer([seqs])
    # Pair each token with its predicted label
    corpus_labels = []
    for i in range(len(word_list)):
        corpus_label = []
        for j in range(len(word_list[i])):
            corpus_label.append(
                (word_list[i][j], id2label[label_id_list[i][j]]))
        corpus_labels.append(corpus_label)

    return corpus_labels
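ner_predict returns one list of (token, label) pairs per input sentence. As a follow-up, a small self-contained sketch of how those pairs could be grouped into entity spans, assuming BIO-style tags such as B-PER/I-PER/O (the actual tag set lives in label2id and may differ):

def collect_entities(corpus_label):
    # Group consecutive B-/I- tags of the same type into (entity_text, entity_type) spans.
    entities, current_tokens, current_type = [], [], None
    for token, label in corpus_label:
        if label.startswith('B-'):
            if current_tokens:
                entities.append((''.join(current_tokens), current_type))
            current_tokens, current_type = [token], label[2:]
        elif label.startswith('I-') and current_type == label[2:]:
            current_tokens.append(token)
        else:
            if current_tokens:
                entities.append((''.join(current_tokens), current_type))
            current_tokens, current_type = [], None
    if current_tokens:
        entities.append((''.join(current_tokens), current_type))
    return entities

print(collect_entities([('张', 'B-PER'), ('三', 'I-PER'), ('在', 'O'), ('北', 'B-LOC'), ('京', 'I-LOC')]))
# [('张三', 'PER'), ('北京', 'LOC')]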
Example #4
def preprocess_data():
    # Load the source and target vocabularies
    word2id_vocab_src, vocab_size_src = nn_lib.read_word2id_dict(
        path_vocab_src)
    word2id_vocab_tgt, vocab_size_tgt = nn_lib.read_word2id_dict(
        path_vocab_tgt)
    # Read the source and target corpora
    corpus_src = read_file(path_corpus_src)
    corpus_tgt = read_file(path_corpus_tgt)
    # Convert corpus text to id values
    src_ids = []
    src_id_extendeds = []
    word2id_vocab_extends = []
    tgt_ids = []
    tgt_id_extendeds = []
    for src, tgt in zip(corpus_src, corpus_tgt):
        # Convert src to ids
        src_id, src_id_extended, vocab_extend_raw = nn_lib.sentence2id(
            sent=src, word2id_vocab=word2id_vocab_src, build_extend_vocab=True)
        src_ids.append(src_id)
        src_id_extendeds.append(src_id_extended)
        vocab_extend = {
            key: value - len(word2id_vocab_src) + len(word2id_vocab_tgt)
            for key, value in vocab_extend_raw.items()
        }
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert tgt to ids
        tgt_id = nn_lib.sentence2id(sent=tgt,
                                    word2id_vocab=word2id_vocab_tgt,
                                    build_extend_vocab=False)
        vocab_extend.update(word2id_vocab_tgt)
        tgt_id_extended = nn_lib.sentence2id(sent=tgt,
                                             word2id_vocab=vocab_extend,
                                             build_extend_vocab=False)
        tgt_ids.append(tgt_id)
        tgt_id_extendeds.append(tgt_id_extended)
    del src, src_id, src_id_extended, tgt, tgt_id, tgt_id_extended
    # Pad sequences with zeros and compute the extended-vocabulary size
    src_ids = nn_lib.pad_sequences(sequences=src_ids,
                                   max_seq_len=src_seq_len_max)
    src_id_extendeds = nn_lib.pad_sequences(sequences=src_id_extendeds,
                                            max_seq_len=src_seq_len_max)
    tgt_ids = nn_lib.pad_sequences(sequences=tgt_ids,
                                   max_seq_len=tgt_seq_len_max,
                                   add_sos=True)
    tgt_id_extendeds = nn_lib.pad_sequences(sequences=tgt_id_extendeds,
                                            max_seq_len=tgt_seq_len_max,
                                            add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Shape the data into arrays
    src_ids = np.array(src_ids)
    src_id_extendeds = np.array(src_id_extendeds)
    tgt_ids = np.array(tgt_ids)
    tgt_id_extendeds = np.array(tgt_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the training, test, and validation sets
    nn_lib.generate_train_test_vali(src_ids,
                                    src_id_extendeds,
                                    tgt_ids,
                                    tgt_id_extendeds,
                                    word2id_vocab_extends,
                                    file_path=path_corpus_processed,
                                    corpus_names=corpus_names,
                                    data_test_size=128 * 2,
                                    data_vali_size=128 * 3)
    # x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
    #     = train_test_split(src_ids, src_id_extendeds, tgt_ids, tgt_id_extendeds, word2id_vocab_extends, test_size=128*3)
    # x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
    #     = train_test_split(x, x_extended, y, y_extended, vocab_extend, test_size=128*2)
    # del x, x_extended, y, y_extended, vocab_extend
    # # Save the training, test, and validation sets
    # for name in processed_corpus_names:
    #     with open(path_corpus_processed + name, 'wb') as file:
    #         pickle.dump(eval(name), file)
    return
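The distinctive step here is the pair of target-side id sequences: tgt_id falls back to <UNK> for words outside the target vocabulary, while tgt_id_extended keeps the copy ids of source OOV words. A toy illustration of that assumed behaviour, using plain dict lookups instead of nn_lib.sentence2id:

word2id_vocab_tgt = {'<PAD>': 0, '<UNK>': 1, 'hello': 2, 'world': 3}   # toy target vocab
vocab_extend = {'blockchain': 4}   # source OOV word, already re-based past the target vocab
tgt = ['hello', 'blockchain']
# Plain target ids: OOV words collapse to <UNK>.
tgt_id = [word2id_vocab_tgt.get(w, word2id_vocab_tgt['<UNK>']) for w in tgt]
# Extended target ids: OOV words copied from the source keep their extended id.
vocab_extend.update(word2id_vocab_tgt)
tgt_id_extended = [vocab_extend.get(w, vocab_extend['<UNK>']) for w in tgt]
print(tgt_id)           # [2, 1]
print(tgt_id_extended)  # [2, 4]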
Example #5
def preprocess_data(vocab_size):
    # Read the corpus data
    data_q = []
    data_a = []
    for file_name in corpus_list:
        print('loading file {}'.format(path_corpus + file_name))
        data_q_, data_a_ = load_file(path_corpus + file_name)
        data_q = data_q + data_q_
        data_a = data_a + data_a_
    del data_q_, data_a_
    # Build or load the vocabulary
    if flag_build_vocab:
        word2id_vocab, vocab_size = nn_lib.build_word2id_vocab(
            data=data_q,
            saved_path=path_vocab,
            vocab_size=vocab_size,
            use_seg=True)
    else:
        word2id_vocab, vocab_size = nn_lib.read_word2id_dict(path_vocab)
    # Convert corpus text to id values
    q_ids = []
    q_id_extendeds = []
    word2id_vocab_extends = []
    a_ids = []
    a_id_extendeds = []
    for q, a in zip(data_q, data_a):
        # Convert question Q to ids
        q_seg = nn_lib.str_segment(q)
        q_id, q_id_extended, vocab_extend = nn_lib.sentence2id(
            sent=q_seg, word2id_vocab=word2id_vocab, build_extend_vocab=True)
        q_ids.append(q_id)
        q_id_extendeds.append(q_id_extended)
        word2id_vocab_extends.append(copy.copy(vocab_extend))
        # Convert answer A to ids
        a_seg = nn_lib.str_segment(a)
        a_id = nn_lib.sentence2id(sent=a_seg,
                                  word2id_vocab=word2id_vocab,
                                  build_extend_vocab=False)
        vocab_extend.update(word2id_vocab)
        a_id_extended = nn_lib.sentence2id(sent=a_seg,
                                           word2id_vocab=vocab_extend,
                                           build_extend_vocab=False)
        a_ids.append(a_id)
        a_id_extendeds.append(a_id_extended)
    del q, q_seg, q_id, q_id_extended, a, a_seg, a_id, a_id_extended
    # Pad sequences with zeros and compute the extended-vocabulary size
    q_ids = nn_lib.pad_sequences(sequences=q_ids, max_seq_len=max_seq_len)
    q_id_extendeds = nn_lib.pad_sequences(sequences=q_id_extendeds,
                                          max_seq_len=max_seq_len)
    a_ids = nn_lib.pad_sequences(sequences=a_ids,
                                 max_seq_len=max_seq_len,
                                 add_sos=True)
    a_id_extendeds = nn_lib.pad_sequences(sequences=a_id_extendeds,
                                          max_seq_len=max_seq_len,
                                          add_sos=True)
    vocab_size_extended = max([len(i) for i in word2id_vocab_extends])
    # Shape the data into arrays
    q_ids = np.array(q_ids)
    q_id_extendeds = np.array(q_id_extendeds)
    a_ids = np.array(a_ids)
    a_id_extendeds = np.array(a_id_extendeds)
    word2id_vocab_extends = np.array(word2id_vocab_extends).reshape([-1, 1])
    # Build the training, test, and validation sets
    x, x_vali, x_extended, x_extended_vali, y, y_vali, y_extended, y_extended_vali, vocab_extend, vocab_extend_vali \
        = train_test_split(q_ids, q_id_extendeds, a_ids, a_id_extendeds, word2id_vocab_extends, test_size=1024*8)
    x_train, x_test, x_extended_train, x_extended_test, y_train, y_test, y_extended_train, y_extended_test, vocab_extend_train, vocab_extend_test \
        = train_test_split(x, x_extended, y, y_extended, vocab_extend, test_size=1024)
    del x, x_extended, y, y_extended, vocab_extend
    # Save the training, test, and validation sets
    for name in processed_corpus_names:
        with open(path_corpus_processed + name, 'wb') as file:
            pickle.dump(eval(name), file)
    return
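The split-and-pickle pattern at the end can be exercised in isolation; a self-contained sketch with toy arrays (the file names are illustrative, the real ones come from processed_corpus_names):

import pickle
import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(20).reshape(10, 2)
y = np.arange(10)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=2)
for name in ['x_train', 'x_test', 'y_train', 'y_test']:
    with open(name + '.pkl', 'wb') as file:
        pickle.dump(eval(name), file)   # mirrors the eval(name) persistence above

A dict keyed by name (e.g. {'x_train': x_train, ...}) would avoid eval entirely and is generally preferable; the sketch keeps the original pattern for comparison.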