Example #1
def data_generate(train_data_path,test_data_path):
    '''
    First pass of data processing: data loading and preprocessing.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: processed training, test and merged DataFrames
    '''
    # 1. Load the data
    train_df,test_df = load_dataset(train_data_path, test_data_path)
    print('train data size {},test data size {}'.format(len(train_df), len(test_df)))
    # 2. Drop rows with missing values
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'], how="any", inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)

    # 3. Parallelise the processing across multiple workers
    train_df = parallelize(train_df, data_frame_proc)
    test_df = parallelize(test_df, data_frame_proc)
    # 4. Save the processed data
    train_df.to_csv(train_seg_path, index=None, header=True)
    test_df.to_csv(test_seg_path, index=None, header=True)

    # 5. Merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(len(train_df), len(test_df),
                                                                          len(merged_df)))
    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=True)

    return train_df, test_df, merged_df
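
Every example in this collection calls a `parallelize(df, func)` helper that is not shown here. A minimal sketch of such a helper, assuming it splits the DataFrame row-wise and applies `func` to each chunk in a process pool (chunk count and pool size are illustrative):

import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count

def parallelize(df, func, n_cores=None):
    """Split a DataFrame into row-wise chunks, apply func to each chunk in a
    separate process and concatenate the results."""
    n_cores = n_cores or cpu_count()
    chunks = np.array_split(df, n_cores)        # row-wise split
    with Pool(n_cores) as pool:
        df = pd.concat(pool.map(func, chunks))  # process the chunks in parallel
    return df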
Example #2
def preprocess(train_data_path, test_data_path):
    if os.path.exists(config.train_seg_path) and \
        os.path.exists(config.test_seg_path) and \
        os.path.exists(config.merger_seg_path):
        train_df = pd.read_csv(config.train_seg_path)
        test_df = pd.read_csv(config.test_seg_path)

        train_df.dropna(subset=['Report'], inplace=True)
        test_df.dropna(subset=['Report'], inplace=True)

        train_df.fillna('', inplace=True)
        test_df.fillna('', inplace=True)

    else:
        # 1. Load the data
        train_df = pd.read_csv(train_data_path)
        test_df = pd.read_csv(test_data_path)
        print('train data size {},test data size {}'.format(
            len(train_df), len(test_df)))
        # 2. Drop rows with missing values
        train_df.dropna(subset=['Report'], inplace=True)
        test_df.dropna(subset=['Report'], inplace=True)

        train_df.fillna('', inplace=True)
        test_df.fillna('', inplace=True)
        # 3. Process the data in parallel batches
        train_df = parallelize(train_df, sentences_proc)
        test_df = parallelize(test_df, sentences_proc)
        # 4. Merge the training and test sets
        train_df['merged'] = train_df[['Question', 'Dialogue',
                                       'Report']].apply(lambda x: ' '.join(x),
                                                        axis=1)
        test_df['merged'] = test_df[['Question', 'Dialogue',
                                     'Report']].apply(lambda x: ' '.join(x),
                                                      axis=1)
        merged_df = pd.concat([train_df[['merged']], test_df[['merged']]],
                              axis=0)
        print('train data size {},test data size {},merged_df data size {}'.
              format(len(train_df), len(test_df), len(merged_df)))
        # 5. Save the processed training and test sets
        train_df = train_df.drop(['merged'], axis=1)
        test_df = test_df.drop(['merged'], axis=1)
        train_df.to_csv(config.train_seg_path, index=False)
        test_df.to_csv(config.test_seg_path, index=False)
        # 6. Save the merged data
        merged_df.to_csv(config.merger_seg_path, index=False)
    return train_df, test_df
def data_loader(params, is_rebuild_dataset=False):
    if os.path.exists(config.train_x_path) and not is_rebuild_dataset:
        x_train = np.load(config.train_x_path)
        x_test = np.load(config.test_x_path)
        y_train = np.load(config.train_y_path)
        y_test = np.load(config.test_y_path)

        with open(config.vocab_save_path, 'r', encoding='utf-8') as f:
            vocab = {}
            for content in f.readlines():
                k, v = content.strip().split('\t')
                vocab[k] = int(v)
        label_df = pd.read_csv(config.data_label_path)
        # Multi-label encoding
        mlb = MultiLabelBinarizer()
        mlb.fit([label_df['label']])

        return x_train, x_test, y_train, y_test, vocab, mlb

    df = pd.read_csv(config.data_path, header=None).rename(columns={
        0: 'label',
        1: 'content'
    })
    df = parallelize(df, proc)

    text_preprocesser = tf.keras.preprocessing.text.Tokenizer(
        num_words=params['vocab_size'], oov_token="<UNK>")
    text_preprocesser.fit_on_texts(df['content'])

    vocab = text_preprocesser.word_index
    with open(config.vocab_save_path, 'w', encoding='utf-8') as f:
        for k, v in vocab.items():
            f.write(f'{k}\t{str(v)}\n')

    x = text_preprocesser.texts_to_sequences(df['content'])
    x = tf.keras.preprocessing.sequence.pad_sequences(
        x, maxlen=params['padding_size'], padding='post', truncating='post')

    # label_df = pd.read_csv(config.data_label_path)

    mlb = MultiLabelBinarizer()
    df['label'] = df['label'].apply(lambda x: x.split())
    mlb.fit(df['label'])

    y = mlb.transform(df['label'])

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    np.save(config.train_x_path, x_train)
    np.save(config.test_x_path, x_test)
    np.save(config.train_y_path, y_train)
    np.save(config.test_y_path, y_test)

    return x_train, x_test, y_train, y_test, vocab, mlb
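
For reference, a small self-contained illustration of what the MultiLabelBinarizer step above produces when labels are whitespace-separated strings (the sample labels are made up):

from sklearn.preprocessing import MultiLabelBinarizer

labels = ['sports finance', 'finance', 'tech sports']   # made-up label strings
split_labels = [s.split() for s in labels]              # [['sports', 'finance'], ...]

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(split_labels)

print(mlb.classes_)   # ['finance' 'sports' 'tech']
print(y)              # [[1 1 0]
                      #  [1 0 0]
                      #  [0 1 1]]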
def data_loader(params, is_rebuild_dataset=False):
    if os.path.exists(os.path.join(root, 'data',
                                   'X_train.npy')) and not is_rebuild_dataset:
        X_train = np.load(os.path.join(root, 'data', 'X_train.npy'))
        X_test = np.load(os.path.join(root, 'data', 'X_test.npy'))
        y_train = np.load(os.path.join(root, 'data', 'y_train.npy'))
        y_test = np.load(os.path.join(root, 'data', 'y_test.npy'))
        return X_train, X_test, y_train, y_test

    # Load the data
    df = pd.read_csv(params.data_path, header=None).rename(columns={
        0: 'label',
        1: 'content'
    })
    # Clean the data in parallel
    df = parallelize(df, proc)
    # word2index
    text_preprocesser = Tokenizer(num_words=params.vocab_size,
                                  oov_token="<UNK>")
    text_preprocesser.fit_on_texts(df['content'])
    # save vocab
    word_dict = text_preprocesser.word_index
    with open(os.path.join(params.vocab_save_dir, 'vocab.txt'), 'w', encoding='utf-8') as f:
        for k, v in word_dict.items():
            f.write(f'{k}\t{str(v)}\n')

    x = text_preprocesser.texts_to_sequences(df['content'])
    # padding
    x = pad_sequences(x,
                      maxlen=params.padding_size,
                      padding='post',
                      truncating='post')
    # Split the label strings
    df['label'] = df['label'].apply(lambda x: x.split())
    # Multi-label encoding
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['label'])
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    # Save the arrays
    np.save(os.path.join(root, 'data', 'X_train.npy'), X_train)
    np.save(os.path.join(root, 'data', 'X_test.npy'), X_test)
    np.save(os.path.join(root, 'data', 'y_train.npy'), y_train)
    np.save(os.path.join(root, 'data', 'y_test.npy'), y_test)

    return X_train, X_test, y_train, y_test
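
For reference, a small illustration of the Tokenizer / pad_sequences pipeline used above (the sample texts are made up; note that fit_on_texts builds the full word_index, while num_words only limits texts_to_sequences):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = ['the car engine stalls', 'the engine light is on']   # made-up texts

tokenizer = Tokenizer(num_words=50, oov_token='<UNK>')
tokenizer.fit_on_texts(texts)

seqs = tokenizer.texts_to_sequences(texts)
x = pad_sequences(seqs, maxlen=6, padding='post', truncating='post')

print(tokenizer.word_index)   # {'<UNK>': 1, 'the': 2, 'engine': 3, ...}
print(x.shape)                # (2, 6)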
def build_data(params):

    if os.path.exists(os.path.join(root, 'data', 'X_train.npy')):
        X_train = np.load(os.path.join(root, 'data', 'X_train.npy'))
        X_test = np.load(os.path.join(root, 'data', 'X_test.npy'))
        y_train = np.load(os.path.join(root, 'data', 'y_train.npy'))
        y_test = np.load(os.path.join(root, 'data', 'y_test.npy'))
        return X_train, X_test, y_train, y_test


    data = pd.read_csv(params['data_path'], header=None).rename(columns={0: 'label', 1: 'content'})
    processed_data = parallelize(data, proc)
    #word2index
    text_preprocesser = Tokenizer(num_words=params['vocab_size'], oov_token="<UNK>")
    text_preprocesser.fit_on_texts(processed_data['content'])
    #save vocab
    word_dict = text_preprocesser.word_index
    with open(params['vocab_path'], 'w', encoding='utf-8') as f:
        for k, v in word_dict.items():
            f.write(f'{k}\t{str(v)}\n')

    x = text_preprocesser.texts_to_sequences(processed_data['content'])
    # padding
    x = pad_sequences(x, maxlen=params['padding_size'], padding='post', truncating='post')
    # Split and encode the labels

    if params['train_mode'] == "multi_label":
        processed_data['label'] = processed_data['label'].apply(lambda x: x.split())
        # Multi-label encoding
        mlb = MultiLabelBinarizer()
        y = mlb.fit_transform(processed_data['label'])

    elif params['train_mode'] == "multi_class":
        processed_data['subject'] = processed_data['label'].apply(lambda x: x.split()[1])
        print("class category: ", set(processed_data['subject']))
        # Single-label encoding; assign to the same name so the return below works in both modes
        mlb = LabelBinarizer()
        y = mlb.fit_transform(processed_data['subject'])

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    # Save the arrays
    np.save(os.path.join(root,'data','X_train.npy'),X_train)
    np.save(os.path.join(root, 'data', 'X_test.npy'), X_test)
    np.save(os.path.join(root, 'data', 'y_train.npy'), y_train)
    np.save(os.path.join(root, 'data', 'y_test.npy'), y_test)

    return X_train, X_test, y_train, y_test, mlb, word_dict # vocab
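
build_data expects a plain dict of parameters; a hypothetical call, with keys taken from the code above and placeholder values:

params = {
    'data_path': 'data/train.csv',    # placeholder path
    'vocab_size': 50000,
    'vocab_path': 'data/vocab.txt',   # placeholder path
    'padding_size': 200,
    'train_mode': 'multi_label',      # or 'multi_class'
}
# X_train, X_test, y_train, y_test, mlb, word_dict = build_data(params)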
def build_dataset(train_data_path, test_data_path):
    '''
    Data loading and preprocessing.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training inputs, training labels and test inputs as index arrays
    '''
    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. Drop rows with missing values
    train_df.dropna(subset=['Report'], inplace=True)

    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)

    # 3. Process the data in parallel batches
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. Merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue',
                                   'Report']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    test_df['merged'] = test_df[['Question',
                                 'Dialogue']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. Save the processed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    train_df.to_csv(train_seg_path, index=None, header=False)
    test_df.to_csv(test_seg_path, index=None, header=False)

    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=False)

    # 7. Train the word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=8,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)

    # 8. Separate features and labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'],
        train_df['Report'],
        test_size=0.002,  # 0.2% of ~80,000 samples ≈ 160 for validation
    )

    X_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    X_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)

    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)

    # 9. Add start/end tokens, replace unknown words with the OOV token, pad to length
    # using the vocabulary produced by the gensim training run
    vocab = wv_model.wv.vocab

    # Process the training inputs X
    # Determine a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # Process the test inputs X with the same maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Process the training labels Y
    # Determine a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. Save the padded, OOV-handled data and labels
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)
    #
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. Retrain the word vectors (disabled here)
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # Save the word-vector model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. Rebuild the vocabulary from the trained model
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {
        index: word
        for index, word in enumerate(wv_model.wv.index2word)
    }

    # Save the dictionaries
    save_dict(save_vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)

    # 14. Convert the datasets from words to indices, e.g. [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231]
    # vocab = Vocab()

    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))

    # 15. Convert to numpy arrays
    # turn the index lists into a matrix, e.g. [32800, 403, 986, 246, 231] --> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())

    # Save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
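
The pad_proc helper used in step 9 is not shown in these examples. A minimal sketch, assuming it adds <START>/<STOP> markers, replaces words missing from the Word2Vec vocabulary with <UNK>, and pads the whitespace-tokenised sentence to max_len with <PAD> (the token names are assumptions based on the comments above):

def pad_proc(sentence, max_len, vocab):
    """Add <START>/<STOP>, replace out-of-vocabulary words with <UNK>,
    and pad the space-separated sentence to max_len with <PAD>."""
    words = sentence.strip().split(' ')[:max_len]
    words = [word if word in vocab else '<UNK>' for word in words]
    words = ['<START>'] + words + ['<STOP>']
    words = words + ['<PAD>'] * (max_len + 2 - len(words))
    return ' '.join(words)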
Example #7
def get_max_len(data):
    """
    Determine a suitable maximum sequence length.
    :param data: series of space-separated texts to measure, e.g. train_df['Question']
    :return: maximum length value
    """
    max_lens = data.apply(lambda x: x.count(' ') + 1)
    return int(np.mean(max_lens) + 2 * np.std(max_lens))


if __name__ == '__main__':
    train_df = pd.read_csv(config.train_data_path)
    test_df = pd.read_csv(config.test_data_path)
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'],
                    how='any',
                    inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)

    train_df = parallelize(train_df, process_seq2seq)
    test_df = parallelize(test_df, process_seq2seq)

    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # Determine a suitable maximum length for the input data
    train_x_max_len = get_max_len(train_df['X'])
    test_x_max_len = get_max_len(test_df['X'])

    x_max_len = max(train_x_max_len, test_x_max_len)

    # Determine a suitable maximum length for the label data
    train_y_max_len = get_max_len(train_df['Report'])
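
The length heuristic in get_max_len is simply the mean token count plus two standard deviations, which covers the bulk of the sequences while ignoring extreme outliers. A tiny worked illustration with made-up sentences:

import numpy as np
import pandas as pd

def get_max_len(data):
    """Suitable maximum length: mean token count plus two standard deviations."""
    lens = data.apply(lambda x: x.count(' ') + 1)
    return int(np.mean(lens) + 2 * np.std(lens))

s = pd.Series(['a b', 'a b c', 'a b c d e'])   # 2, 3 and 5 tokens
print(get_max_len(s))                          # mean 3.33 + 2 * 1.25 -> int(5.83) = 5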
Example #8
def build_dataset(train_data_path,
                  test_data_path,
                  save_wv_model_path,
                  testOnly=True,
                  toCSV=True):
    '''
    Data loading and preprocessing.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :param save_wv_model_path: path for loading/saving the Word2Vec model
    :param testOnly: if True, skip retraining the word vectors and return early
    :param toCSV: if True, save the intermediate CSV files
    :return: padded training inputs, training labels, test inputs and the Word2Vec model
    '''

    # 1. Load the data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. Drop rows with missing values
    train_df.dropna(subset=['Question', 'Dialogue', 'Report'],
                    how='any',
                    inplace=True)
    test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)

    # 3. Process the data in parallel batches
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. Merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue',
                                   'Report']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    test_df['merged'] = test_df[['Question',
                                 'Dialogue']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. Save the processed training and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    if toCSV:
        train_df.to_csv(train_seg_path, index=None, header=True)
        test_df.to_csv(test_seg_path, index=None, header=True)
        # 6. Save the merged data
        merged_df.to_csv(merger_seg_path, index=None, header=False)

    if osp.exists(save_wv_model_path):
        wv_model = Word2Vec.load(save_wv_model_path)
    else:
        # 7. Train the word vectors
        print('start build w2v model')
        wv_model = Word2Vec(LineSentence(merger_seg_path),
                            size=embedding_dim,
                            negative=5,
                            workers=8,
                            iter=wv_train_epochs,
                            window=3,
                            min_count=5)

    # 8. Separate features and labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # 9. Add start/end tokens, replace unknown words with the OOV token, pad to length
    # using the vocabulary produced by the gensim training run
    vocab = wv_model.wv.vocab

    # Process the training inputs X
    # Determine a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # Process the test inputs X with the same maximum length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Process the training labels Y
    # Determine a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. Save the padded, OOV-handled data and labels
    if toCSV:
        train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
        train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
        test_df['X'].to_csv(test_x_pad_path, index=None, header=False)

    if testOnly:
        print("No retraining! Test only...")
        return train_df['X'], train_df['Y'], test_df['X'], wv_model
    else:
        # 11. Retrain the word vectors
        print('start retrain w2v model')
        wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
        wv_model.train(LineSentence(train_x_pad_path),
                       epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('1/3')
        wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
        wv_model.train(LineSentence(train_y_pad_path),
                       epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)
        print('2/3')
        wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
        wv_model.train(LineSentence(test_x_pad_path),
                       epochs=wv_train_epochs,
                       total_examples=wv_model.corpus_count)

        # Save the word-vector model
        wv_model.save(save_wv_model_path)
    # or load wv_model
    # wv_model = Word2Vec.load(save_wv_model_path)

    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    return train_df['X'], train_df['Y'], test_df['X'], wv_model
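
Note that the Word2Vec(size=..., iter=...) keyword arguments and the wv_model.wv.vocab / wv.index2word attributes used throughout these examples follow the gensim 3.x API. Under gensim 4.x the equivalent call would look roughly like this (hyperparameter values are illustrative):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# gensim >= 4.0 renamed size -> vector_size and iter -> epochs
wv_model = Word2Vec(LineSentence('merged_seg.txt'),   # illustrative path
                    vector_size=300,
                    negative=5,
                    workers=8,
                    epochs=5,
                    window=3,
                    min_count=5)

vocab = wv_model.wv.key_to_index           # replaces wv.vocab
index_to_word = wv_model.wv.index_to_key   # replaces wv.index2word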
Example #9
def build_dataset(train_data_path, test_data_path, word2vec_type=True):
    '''
    Build the dataset.

    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :param word2vec_type: True to train a Word2Vec model, False to train FastText
    :return: training inputs, training labels and test inputs as index arrays
    '''
    # 1. Load the data
    train_df, test_df = load_dataset(train_data_path, test_data_path)
    print('train data size {},test data size {}'.format(len(train_df), len(test_df)))
    # 2. Drop rows with missing values
    train_df.dropna(subset=['Report'], inplace=True)
    #test_df.dropna(subset=['Question', 'Dialogue'], how='any', inplace=True)
    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)
    # 3. Parallelise the processing across multiple workers

    train_df = parallelize(train_df, data_frame_proc)
    test_df = parallelize(test_df, data_frame_proc)
    # 4. Save the processed data
    train_df.to_csv(train_seg_path, index=None, header=True)
    test_df.to_csv(test_seg_path, index=None, header=True)

    # 5. Merge the training and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue', 'Report']].apply(lambda x: ' '.join(x), axis=1)
    test_df['merged'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    # 6. Save the cleaned and merged data
    train_df.to_csv(train_seg_path, index=None, header=True)
    test_df.to_csv(test_seg_path, index=None, header=True)
    merged_df.to_csv(merger_seg_path, index=None, header=False)
    print('train data size {},test data size {},merged_df data size {}'.format(len(train_df), len(test_df),
                                                                               len(merged_df)))
    # 7. Train the word vectors
    print('start build w2v model')

    if word2vec_type:
        wv_model = Word2Vec(LineSentence(merger_seg_path), size=embedding_dim,
                            negative=5,
                            workers=8,
                            iter=wv_train_epochs,
                            window=3,
                            min_count=5)
    else:
        wv_model = FastText(LineSentence(merger_seg_path), workers=8, min_count=5, size=300, window = 3,iter=wv_train_epochs)

    # 8. Separate features and labels
    train_df['X'] = train_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question', 'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    # Split into training and validation sets
    x_train, x_val, y_train, y_val = train_test_split(train_df['X'], train_df['Report'], test_size=0.002)

    x_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    x_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)

    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)
    # 9. Add start/end tokens, replace unknown words with the OOV token, pad to length
    # using the vocabulary produced by the gensim training run
    vocab = wv_model.wv.vocab
    # Determine a suitable maximum length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)

    print("training sequence length is: ", X_max_len)
    train_df['X'] = train_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Process the training labels Y
    # Determine a suitable maximum length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(lambda x: pad_proc(x, train_y_max_len, vocab))
    print("report sequence length is: ", train_y_max_len)

    # 10. Save the padded, OOV-handled data and labels
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)

    # 11. Retrain the word vectors (could be wrapped into a helper; disabled here)
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=wv_train_epochs, total_examples=wv_model.corpus_count)

    # Save the word-vector model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. Rebuild the vocabulary from the trained model
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {index: word for index, word in enumerate(wv_model.wv.index2word)}
    # Save the dictionaries
    save_dict(vocab_path,vocab)
    save_dict(reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)


    # 14. Convert the datasets from words to indices

    vocab = Vocab()

    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x,vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))

    # 15. Convert to numpy arrays
    # turn the index lists into a matrix, e.g. [32800, 403, 986, 246, 231] --> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())

    # Save the arrays
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)

    return train_X, train_Y, test_X
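
transform_data (step 14) converts a padded, space-separated sentence into a list of vocabulary indices, as the "[<START> 方向机 重 ...] -> [32800, 403, 986, ...]" comment indicates. A minimal sketch, assuming vocab is a plain word-to-index dict that contains an <UNK> entry:

def transform_data(sentence, vocab):
    """Map each space-separated token to its vocabulary index,
    falling back to the <UNK> index for unknown words."""
    unk_id = vocab['<UNK>']
    return [vocab.get(word, unk_id) for word in sentence.split(' ')]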
Example #10
def build_dataset(search_dev_data_path, zhidao_dev_data_path):
    '''
    Data loading and preprocessing.
    :param search_dev_data_path: path to the search dev set
    :param zhidao_dev_data_path: path to the zhidao dev set
    :return: None; the merged data, vocabulary and word-vector model are saved to disk
    '''
    # 1. Load the data
    search_dev_df = pd.read_json(search_dev_data_path, lines=True)
    zhidao_dev_df = pd.read_json(zhidao_dev_data_path,encoding='utf-8', lines=True)
    print('search dev data size {},zhidao dev data size {}'.format(len(search_dev_df), len(zhidao_dev_df)))
    # print(search_dev_data.columns)

    search_dev_df['answers'] = search_dev_df[['answers']].apply(sentence_proc, axis=1)
    search_dev_df['entity_answers'] = search_dev_df[['entity_answers']].apply(sentences_proc, axis=1)
    search_dev_df['documents'] = search_dev_df[['documents']].apply(documents_proc, axis=1)
    zhidao_dev_df['answers'] = zhidao_dev_df[['answers']].apply(sentence_proc, axis=1)
    zhidao_dev_df['entity_answers'] = zhidao_dev_df[['entity_answers']].apply(sentences_proc, axis=1)
    zhidao_dev_df['documents'] = zhidao_dev_df[['documents']].apply(documents_proc, axis=1)

    # print(search_dev_df["documents"])
    # print(search_dev_df['entity_answers'])
    # print(search_dev_df['question'])
    # print(search_dev_df['answers'])
    # print(zhidao_dev_df["documents"])
    # print(zhidao_dev_df['entity_answers'])
    # print(zhidao_dev_df['question'])
    # print(zhidao_dev_df['answers'])


    # 3. Process the data in parallel batches
    search_dev_df = parallelize(search_dev_df, split_sentences_proc)
    zhidao_dev_df = parallelize(zhidao_dev_df, split_sentences_proc)

    # 4. Merge the two dev sets
    search_dev_df['merged'] = search_dev_df[['documents', 'entity_answers', 'question', 'answers']].apply(lambda x: ' '.join(x), axis=1)
    zhidao_dev_df['merged'] = zhidao_dev_df[['documents', 'entity_answers', 'question', 'answers']].apply(lambda x: ' '.join(x), axis=1)
    merged_df = pd.concat([search_dev_df[['merged']], zhidao_dev_df[['merged']]], axis=0)
    print('search dev data size {},zhidao dev data size {},merged_df data size {}'.format(
        len(search_dev_df), len(zhidao_dev_df), len(merged_df)))
    # 6. Save the merged data
    merged_df.to_csv(merger_dev_seg_path, index=None, header=False)

    # 7. Train the word vectors
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_dev_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=cores,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)
    # 8. Vocabulary produced by the gensim training run
    # (used downstream for start/end tokens, OOV replacement and padding)
    vocab = wv_model.wv.vocab

    # 9. Save the vocabulary
    save_dict(vocab_path, vocab)

    # 10. Save the word-vector model
    wv_model.save(save_wv_model_path)

    return
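
save_dict is used in several of the examples above but is not shown. A minimal sketch, assuming it writes one tab-separated key/value pair per line, which matches the vocabulary format read back in Example #2:

def save_dict(save_path, dict_data):
    """Write a dictionary as one 'key<TAB>value' pair per line."""
    with open(save_path, 'w', encoding='utf-8') as f:
        for k, v in dict_data.items():
            f.write(f'{k}\t{v}\n')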