Example #1
def train(params):
    # Configure GPU resources
    config_gpu(use_cpu=False, gpu_memory=params['gpu_memory'])
    # Load the vocab
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["max_vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    # model = Seq2Seq(params)
    model = PGN(params)

    print("Creating the batcher ...")
    dataset = batcher(vocab, params)
    # print('dataset is ', dataset)

    # Get the checkpoint manager
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    params['checkpoint_dir'],
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    # Train the model
    print("Starting the training ...")
    train_model(model, dataset, params, checkpoint_manager)
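The config_gpu helper called above is project-specific and not part of these listings. A minimal sketch of what it might do, assuming the use_cpu and gpu_memory (MB) parameters seen at the call sites and the TF2 experimental device API used elsewhere in these examples:

import tensorflow as tf


def config_gpu(use_cpu=False, gpu_memory=None):
    # Hypothetical sketch, not the project's actual implementation.
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if use_cpu or not gpus:
        # Hide all GPUs so TensorFlow falls back to the CPU.
        tf.config.experimental.set_visible_devices([], 'GPU')
        return
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
    if gpu_memory:
        # Cap the first GPU's memory (gpu_memory assumed to be in MB).
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=gpu_memory)])
    else:
        tf.config.experimental.set_memory_growth(gpus[0], True)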
Example #2
def train(params):
    # Configure GPU resources
    config_gpu()
    # Load the vocab
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    # Build the model
    print("Building the model ...")
    # model = Seq2Seq(params)
    model = PGN(params)

    print("Creating the batcher ...")
    # dataset = batcher(params["train_seg_x_dir"], params["train_seg_y_dir"], vocab, params)
    # print('dataset is ', dataset)

    # Get the checkpoint manager
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    checkpoint_dir,
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    # Train the model
    print("Starting the training ...")
    train_model(model, vocab, params, checkpoint_manager)
Example #3
def test(params):
    assert params["mode"].lower() in [
        "test", "eval"
    ], "change training mode to 'test' or 'eval'"
    if params['decode_mode'] == 'beam':
        assert params["beam_size"] == params[
            "batch_size"], "Beam size must be equal to batch_size, change the params"
    # Configure GPU resources
    config_gpu(use_cpu=True)

    print("Building the model ...")
    model = PGN(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    checkpoint_dir,
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    results = predict_result(model, params, vocab, params['result_save_path'])
    print('save result to :{}'.format(params['result_save_path']))
    print('save result :{}'.format(results[:5]))
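Every example constructs Vocab(vocab_path, vocab_size) and reads vocab.count, but the class itself is never shown. A minimal compatible sketch, assuming a plain-text vocab file with one word per line (file format and attribute names are assumptions):

class Vocab:
    # Hypothetical sketch of the vocabulary wrapper used in these examples.
    def __init__(self, vocab_file, max_size=None):
        self.word2id = {}
        self.id2word = {}
        with open(vocab_file, encoding='utf-8') as f:
            for line in f:
                pieces = line.split()
                if not pieces or pieces[0] in self.word2id:
                    continue
                if max_size and len(self.word2id) >= max_size:
                    break
                idx = len(self.word2id)
                self.word2id[pieces[0]] = idx
                self.id2word[idx] = pieces[0]
        self.count = len(self.word2id)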
Example #4
def test(params):
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"
    # Configure GPU resources
    config_gpu(use_cpu=True)

    print("Building the model ...")
    model = Seq2Seq(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    if params['greedy_decode']:
        predict_result(model, params, vocab, params['result_save_path'])
    else:
        b = bream_test_batch_generator(params["beam_size"])
        results = []
        for batch in b:
            best_hyp = beam_decode(model, batch, vocab, params)
            results.append(best_hyp.abstract)
        save_predict_result(results, params['result_save_path'])
        print('save result to :{}'.format(params['result_save_path']))
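predict_result and save_predict_result are not shown in these listings. A minimal sketch of the saving step, assuming one decoded summary per row and a pandas CSV output (the column name is an assumption):

import pandas as pd


def save_predict_result(results, result_save_path):
    # Hypothetical sketch: write one decoded summary per row.
    pd.DataFrame({'Prediction': results}).to_csv(result_save_path, index=False)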
Example #5
File: test.py Project: zyffrank/nlp
def test(params):
    assert params["mode"].lower(
    ) == "test", "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params[
        "batch_size"], "Beam size must be equal to batch_size, change the params"

    print("Building the model ...")
    model = PGN(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    print("Creating the batcher ...")
    b = batcher(vocab, params)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    checkpoint_dir,
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    for batch in b:
        print(beam_decode(model, batch, vocab, params))
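beam_decode is used throughout, and Example #4 reads best_hyp.abstract from its result, but the hypothesis object is never shown. A minimal sketch of the beam-search hypothesis such code typically carries (field and method names are assumptions):

class Hypothesis:
    # Hypothetical sketch of a beam-search hypothesis as returned by beam_decode.
    def __init__(self, tokens, log_probs, hidden, attn_dists):
        self.tokens = tokens          # decoded token ids so far
        self.log_probs = log_probs    # per-step log probabilities
        self.hidden = hidden          # decoder state to resume from
        self.attn_dists = attn_dists  # attention distributions (copy/coverage)
        self.abstract = ''            # decoded text, filled in after the search

    def extend(self, token, log_prob, hidden, attn_dist):
        # Grow the hypothesis by one decoding step.
        return Hypothesis(self.tokens + [token],
                          self.log_probs + [log_prob],
                          hidden,
                          self.attn_dists + [attn_dist])

    @property
    def avg_log_prob(self):
        # Length-normalized score used to rank beams.
        return sum(self.log_probs) / len(self.tokens)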
Example #6
def test(params):
    assert params["mode"].lower() in ["test", "eval"], "change training mode to 'test' or 'eval'"
    assert params["beam_size"] == params["batch_size"], "Beam size must be equal to batch_size, change the params"
    # Configure GPU resources
    config_gpu(use_cpu=True)

    print("Building the model ...")
    model = PGN(params)

    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    print("Creating the batcher ...")
    b = batcher(vocab, params)

    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    print("Model restored")

    if params['mode'] == 'eval' or params['mode'] == 'test':
        for batch in b:
            #print("batch is:\n", batch)

            yield beam_decode(model, batch, vocab, params)

    else:
        for batch in b:
            print(beam_decode(model, batch, vocab, params))
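Because this variant of test yields one decoded result per batch, calling it returns a generator. A small usage sketch (save_predict_result is the hypothetical helper sketched after Example #4):

results = []
for hyp in test(params):
    # beam_decode may return a hypothesis object or plain text; handle both.
    results.append(hyp.abstract if hasattr(hyp, 'abstract') else hyp)
save_predict_result(results, params['result_save_path'])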
Example #7
def save_example_dataset(params):
    # Vocab object
    vocab = Vocab(vocab_path)
    # Get the batched data
    dataset = batcher(vocab, params)
    # Save one epoch of batched data
    step = 0
    for batch_data in dataset:
        # Save path
        pickle_path = os.path.join(params['train_pickle_dir'],
                                   str(step) + '.pickle')
        # Save the processed pickle file
        save_pickle(batch_data, pickle_path)
        step += 1
        if step > params['max_train_steps']:
            break
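save_pickle above and the load_batch_generator used by batcher() in Example #11 are not shown. A minimal matching pair, assuming one pickle file per batch named <step>.pickle as produced by save_example_dataset:

import os
import pickle


def save_pickle(batch_data, pickle_path):
    # Hypothetical sketch: dump one prepared batch to disk.
    with open(pickle_path, 'wb') as f:
        pickle.dump(batch_data, f)


def load_batch_generator(params):
    # Hypothetical counterpart: replay the pickled batches in numeric order.
    pickle_dir = params['train_pickle_dir']
    names = sorted(os.listdir(pickle_dir), key=lambda n: int(n.split('.')[0]))
    for name in names:
        with open(os.path.join(pickle_dir, name), 'rb') as f:
            yield pickle.load(f)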
Example #8
def train(params):
    # Configure GPU resources
    config_gpu()

    # Load the vocab
    vocab = Vocab(params["vocab_path"], params["vocab_size"])

    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    model = Seq2Seq(params)

    # Get the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, params['checkpoint_dir'], max_to_keep=5)

    # Train the model
    train_model(model, vocab, params, checkpoint_manager)
Example #9
def train(params):
    # Configure GPU resources
    config_gpu()
    # Load the vocab
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["max_vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    # model = Seq2Seq(params)
    model = PGN(params)

    print("Creating the batcher ...")
    dataset, params['steps_per_epoch'] = batcher(vocab, params)

    # print('dataset is ', dataset)

    # Get the checkpoint manager
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    params['checkpoint_dir'],
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
        params["trained_epoch"] = int(checkpoint_manager.latest_checkpoint[-1])
    else:
        print("Initializing from scratch.")
        params["trained_epoch"] = 1

    # Learning-rate decay
    params["learning_rate"] *= np.power(0.95, params["trained_epoch"])
    print('learning_rate:{}'.format(params["learning_rate"]))
    # Train the model
    print("Starting the training ...")

    train_model(model, dataset, params, checkpoint_manager)
Example #10
def train(params):
    # Configure GPU resources
    # config_gpu(use_cpu=False, gpu_memory=params['gpu_memory'])
    gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
    if gpus:
        tf.config.experimental.set_visible_devices(devices=gpus[0],
                                                   device_type='GPU')
        tf.config.experimental.set_memory_growth(gpus[0], enable=True)
    # Load the vocab
    print("Creating the vocab ...")
    vocab = Vocab(params["vocab_path"], params["max_vocab_size"])
    params['vocab_size'] = vocab.count

    # Build the model
    print("Building the model ...")
    # model = Seq2Seq(params)
    model = PGN(params)

    print("Creating the batcher ...")
    dataset = batcher(vocab, params)
    # print('dataset is ', dataset)

    # Get the checkpoint manager
    print("Creating the checkpoint manager")
    checkpoint = tf.train.Checkpoint(PGN=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint,
                                                    params['checkpoint_dir'],
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")
    # Train the model
    print("Starting the training ...")
    train_model(model, dataset, params, checkpoint_manager)
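For newer TF2 releases the same GPU setup can be written with the stable tf.config names; shown only as an alternative, not project code:

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    tf.config.set_visible_devices(gpus[0], 'GPU')
    tf.config.experimental.set_memory_growth(gpus[0], True)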
Example #11
            "dec_target": entry["target"],
            "dec_len": entry["dec_len"],
            "abstract": entry["abstract"],
            "sample_decoder_pad_mask": entry["sample_decoder_pad_mask"]
        })

    dataset = dataset.map(update)
    return dataset


def batcher(vocab, params):
    if params['mode'] == 'train' and params['load_batch_train_data']:
        dataset = load_batch_generator(params)
    else:
        dataset = batch_generator(example_generator, params, vocab,
                                  params["max_enc_len"], params["max_dec_len"],
                                  params["batch_size"], params["mode"])
    return dataset


if __name__ == '__main__':
    # Get the params
    params = get_params()
    params['mode'] = 'test'
    # Vocab object
    vocab = Vocab(vocab_path)
    # Get the batched data
    dataset = batcher(vocab, params)

    next(iter(dataset))
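    # Hypothetical follow-up (not in the original example): peek at the first batch.
    # The exact structure depends on the project's batch_generator implementation.
    sample = next(iter(dataset))
    if isinstance(sample, dict):
        for key, value in sample.items():
            print(key, getattr(value, 'shape', value))
    else:
        print(type(sample))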
Example #12
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        prediction = self.fc(output)

        return prediction, state


if __name__ == '__main__':
    # Configure GPU resources
    config_gpu()
    # Get the params
    params = get_params()
    # Load the vocab
    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    # Compute the vocab size
    vocab_size = vocab.count
    # Embedding matrix trained with Gensim
    embedding_matrix = load_word2vec_file(save_wv_model_path)

    input_sequence_len = 250
    BATCH_SIZE = 64
    embedding_dim = 500
    units = 1024

    # Encoder structure
    encoder = Encoder(vocab_size, embedding_dim, embedding_matrix, units,
                      BATCH_SIZE)
    # example_input
    example_input_batch = tf.ones(shape=(BATCH_SIZE, input_sequence_len),
                                  dtype=tf.int32)  # dtype assumed; the source listing is cut off here
Example #13
def build_dataset(train_data_path, test_data_path):
    '''
    Data loading and preprocessing.
    :param train_data_path: path to the training set
    :param test_data_path: path to the test set
    :return: training data, test data, and the merged data
    '''
    # 1. Load data
    train_df = pd.read_csv(train_data_path)
    test_df = pd.read_csv(test_data_path)
    print('train data size {},test data size {}'.format(
        len(train_df), len(test_df)))

    # 2. Drop rows with missing values
    train_df.dropna(subset=['Report'], inplace=True)

    train_df.fillna('', inplace=True)
    test_df.fillna('', inplace=True)

    # 3. Parallelized batch data processing
    train_df = parallelize(train_df, sentences_proc)
    test_df = parallelize(test_df, sentences_proc)

    # 4. Merge the train and test sets
    train_df['merged'] = train_df[['Question', 'Dialogue',
                                   'Report']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    test_df['merged'] = test_df[['Question',
                                 'Dialogue']].apply(lambda x: ' '.join(x),
                                                    axis=1)
    merged_df = pd.concat([train_df[['merged']], test_df[['merged']]], axis=0)
    print('train data size {},test data size {},merged_df data size {}'.format(
        len(train_df), len(test_df), len(merged_df)))

    # 5. Save the processed train and test sets
    train_df = train_df.drop(['merged'], axis=1)
    test_df = test_df.drop(['merged'], axis=1)

    train_df.to_csv(train_seg_path, index=None, header=False)
    test_df.to_csv(test_seg_path, index=None, header=False)

    # 6. Save the merged data
    merged_df.to_csv(merger_seg_path, index=None, header=False)

    # 7. Train word vectors (size/iter are the pre-Gensim-4.0 parameter names)
    print('start build w2v model')
    wv_model = Word2Vec(LineSentence(merger_seg_path),
                        size=embedding_dim,
                        sg=1,
                        workers=cores,
                        iter=wv_train_epochs,
                        window=5,
                        min_count=5)

    # 8. Split data and labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)

    # Train/validation split
    X_train, X_val, y_train, y_val = train_test_split(
        train_df['X'],
        train_df['Report'],
        test_size=0.002,  # 80,000 * 0.002 = 160 validation samples
    )

    X_train.to_csv(train_x_seg_path, index=None, header=False)
    y_train.to_csv(train_y_seg_path, index=None, header=False)
    X_val.to_csv(val_x_seg_path, index=None, header=False)
    y_val.to_csv(val_y_seg_path, index=None, header=False)

    test_df['X'].to_csv(test_x_seg_path, index=None, header=False)

    # 9. Add start/end tokens, replace unknown words with OOV markers, pad to length
    # Vocab produced by the Gensim training
    vocab = wv_model.wv.vocab

    # Process the training-set X
    # Pick an appropriate max length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))

    # Process the test-set X
    # Pick an appropriate max length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))

    # Process the training-set Y
    # Pick an appropriate max length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))

    # 10. Save the padded/OOV-processed data and labels
    train_df['X'].to_csv(train_x_pad_path, index=None, header=False)
    train_df['Y'].to_csv(train_y_pad_path, index=None, header=False)
    test_df['X'].to_csv(test_x_pad_path, index=None, header=False)
    #
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))

    # 11. Retrain the word vectors
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)

    # Save the word2vec model
    wv_model.save(save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.vocab))

    # 12. Update the vocab
    vocab = {word: index for index, word in enumerate(wv_model.wv.index2word)}
    reverse_vocab = {
        index: word
        for index, word in enumerate(wv_model.wv.index2word)
    }

    # Save the dictionaries
    save_dict(vocab_path, vocab)
    save_dict(reverse_vocab_path, reverse_vocab)

    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(embedding_matrix_path, embedding_matrix)

    # 14. Convert the datasets from words to indices: [<START> 方向机 重 ...] -> [32800, 403, 986, 246, 231, ...]
    vocab = Vocab()

    train_ids_x = train_df['X'].apply(
        lambda x: transform_data(x, vocab.word2id))
    train_ids_y = train_df['Y'].apply(
        lambda x: transform_data(x, vocab.word2id))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab.word2id))

    # 15. Convert the data to numpy arrays
    # Turn the index lists into a matrix: [32800, 403, 986, 246, 231] -> array([[32800, 403, 986, ...]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())

    # Save the data
    np.save(train_x_path, train_X)
    np.save(train_y_path, train_Y)
    np.save(test_x_path, test_X)
    return train_X, train_Y, test_X
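build_dataset relies on get_max_len, pad_proc and transform_data, none of which are shown in these listings. A minimal sketch of compatible helpers; the mean-plus-two-standard-deviations length heuristic and the special-token names are assumptions:

import numpy as np


def get_max_len(data):
    # Hypothetical sketch: pick a padding length from the token-count distribution.
    lens = data.apply(lambda x: len(str(x).split()))
    return int(np.mean(lens) + 2 * np.std(lens))


def pad_proc(sentence, max_len, vocab):
    # Hypothetical sketch: truncate, mark OOV words, add <START>/<STOP>, pad to length.
    words = str(sentence).split()[:max_len]
    words = [w if w in vocab else '<UNK>' for w in words]
    tokens = ['<START>'] + words + ['<STOP>']
    tokens += ['<PAD>'] * (max_len + 2 - len(tokens))
    return ' '.join(tokens)


def transform_data(sentence, word2id):
    # Hypothetical sketch: map tokens to ids, falling back to the <UNK> id.
    return [word2id.get(w, word2id['<UNK>']) for w in str(sentence).split()]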
Example #14
    # the final distribution for that decoder timestep
    # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore.
    final_dists = [
        vocab_dist + copy_dist
        for (vocab_dist,
             copy_dist) in zip(vocab_dists_extended, attn_dists_projected)
    ]

    return final_dists


if __name__ == '__main__':
    # Configure GPU resources
    config_gpu()
    # Load the vocab
    vocab = Vocab(save_vocab_path)
    # Compute the vocab size
    vocab_size = vocab.count
    # Embedding matrix trained with Gensim
    embedding_matrix = load_embedding_matrix()

    params = defaultdict()
    params["vocab_size"] = vocab_size
    params["embed_size"] = 300
    params["enc_units"] = 512
    params["attn_units"] = 512
    params["dec_units"] = 512
    params["batch_size"] = 64
    params["max_enc_len"] = 200
    params["max_dec_len"] = 41
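The snippet above only shows the final mixing step; vocab_dists_extended and attn_dists_projected are assumed to come from the usual pointer-generator construction (zero-padding the vocab distribution for in-article OOV ids and scattering attention mass onto the extended vocabulary). A sketch of that construction, with argument names and shapes assumed:

import tensorflow as tf


def calc_final_dist(vocab_dists, attn_dists, p_gens, enc_batch_extend_vocab,
                    batch_size, vocab_size, batch_oov_len):
    # Hypothetical sketch of the standard pointer-generator mixing step.
    # vocab_dists: list of (batch_size, vocab_size); attn_dists: list of (batch_size, enc_len).
    vocab_dists = [p_gen * dist for p_gen, dist in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for p_gen, dist in zip(p_gens, attn_dists)]

    # Zero-pad the vocab distributions to cover the in-article OOV ids.
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    vocab_dists_extended = [tf.concat([dist, extra_zeros], axis=1)
                            for dist in vocab_dists]

    # Scatter the copy (attention) mass onto the extended-vocabulary positions.
    enc_len = tf.shape(enc_batch_extend_vocab)[1]
    batch_nums = tf.tile(tf.range(batch_size)[:, tf.newaxis], [1, enc_len])
    indices = tf.stack([batch_nums, enc_batch_extend_vocab], axis=2)
    shape = [batch_size, vocab_size + batch_oov_len]
    attn_dists_projected = [tf.scatter_nd(indices, dist, shape)
                            for dist in attn_dists]

    # Element-wise sum gives the final distribution for each decoder timestep.
    return [v + c for v, c in zip(vocab_dists_extended, attn_dists_projected)]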
Example #15
    # the final distribution for that decoder timestep
    # Note that for decoder timesteps and examples corresponding to a [PAD] token, this is junk - ignore.
    final_dists = [
        vocab_dist + copy_dist
        for (vocab_dist,
             copy_dist) in zip(vocab_dists_extended, attn_dists_projected)
    ]

    return final_dists


if __name__ == '__main__':
    # Configure GPU resources
    config_gpu()
    # Load the vocab
    vocab, reverse_vocab = Vocab.load_vocab(save_wv_model_path)
    # Compute the vocab size
    vocab_size = len(vocab)
    batch_size = 128
    input_sequence_len = 200

    params = {}
    params["vocab_size"] = vocab_size
    params["embed_size"] = 500
    params["enc_units"] = 512
    params["attn_units"] = 512
    params["dec_units"] = 512
    params["batch_size"] = batch_size

    model = Seq2Seq(params)
Example #16
def generate_dataset_cache(train_df, test_df, wv_model):
    # 8. Split data and labels
    train_df['X'] = train_df[['Question',
                              'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    train_df['X'].to_csv(config.train_x_seg_path, index=None, header=False)
    train_df['Report'].to_csv(config.train_y_seg_path,
                              index=None,
                              header=False)
    test_df['X'] = test_df[['Question',
                            'Dialogue']].apply(lambda x: ' '.join(x), axis=1)
    test_df['X'].to_csv(config.val_x_seg_path, index=None, header=False)
    test_df['Report'].to_csv(config.val_y_seg_path, index=None, header=False)
    # 9. Add start/end tokens, replace unknown words with OOV markers, pad to length
    # Vocab produced by the Gensim training
    vocab = wv_model.wv.key_to_index  # the 'vocab' attribute was removed from KeyedVectors in Gensim 4.0.0
    # Process the training-set X
    # Pick an appropriate max length
    train_x_max_len = get_max_len(train_df['X'])
    test_X_max_len = get_max_len(test_df['X'])
    X_max_len = max(train_x_max_len, test_X_max_len)
    train_df['X'] = train_df['X'].apply(
        lambda x: pad_proc(x, X_max_len, vocab))
    # Process the test-set X
    # Pick an appropriate max length
    test_df['X'] = test_df['X'].apply(lambda x: pad_proc(x, X_max_len, vocab))
    # Process the training-set Y
    # Pick an appropriate max length
    train_y_max_len = get_max_len(train_df['Report'])
    train_df['Y'] = train_df['Report'].apply(
        lambda x: pad_proc(x, train_y_max_len, vocab))
    test_y_max_len = get_max_len(test_df['Report'])
    test_df['Y'] = test_df['Report'].apply(
        lambda x: pad_proc(x, test_y_max_len, vocab))
    # 10. Save the padded/OOV-processed data and labels
    train_df['X'].to_csv(config.train_x_pad_path, index=False, header=False)
    train_df['Y'].to_csv(config.train_y_pad_path, index=False, header=False)
    test_df['X'].to_csv(config.test_x_pad_path, index=False, header=False)
    test_df['Y'].to_csv(config.test_y_pad_path, index=False, header=False)
    # print('train_x_max_len:{} ,train_y_max_len:{}'.format(X_max_len, train_y_max_len))
    # 11. Retrain the word vectors
    # print('start retrain w2v model')
    # wv_model.build_vocab(LineSentence(train_x_pad_path), update=True)
    # wv_model.train(LineSentence(train_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('1/3')
    # wv_model.build_vocab(LineSentence(train_y_pad_path), update=True)
    # wv_model.train(LineSentence(train_y_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    #
    # print('2/3')
    # wv_model.build_vocab(LineSentence(test_x_pad_path), update=True)
    # wv_model.train(LineSentence(test_x_pad_path), epochs=1, total_examples=wv_model.corpus_count)
    # Save the word2vec model
    if not os.path.exists(os.path.dirname(config.save_wv_model_path)):
        os.makedirs(os.path.dirname(config.save_wv_model_path))
    wv_model.save(config.save_wv_model_path)
    print('finish retrain w2v model')
    print('final w2v_model has vocabulary of ', len(wv_model.wv.key_to_index))
    # 12. Update the vocab
    # The 'index2word' attribute has been replaced by 'index_to_key' since Gensim 4.0.0.
    vocab = {
        word: index
        for index, word in enumerate(wv_model.wv.index_to_key)
    }
    reverse_vocab = {
        index: word
        for index, word in enumerate(wv_model.wv.index_to_key)
    }
    # Save the dictionaries
    save_dict(config.vocab_path, vocab)
    save_dict(config.reverse_vocab_path, reverse_vocab)
    # 13. Save the embedding matrix
    embedding_matrix = wv_model.wv.vectors
    np.save(config.embedding_matrix_path, embedding_matrix)
    # 14. Convert the datasets from words to indices: [<START> 方向机 重 ...] -> [2, 403, 986, 246, 231, ...]
    vocab = Vocab()
    train_ids_x = train_df['X'].apply(lambda x: transform_data(x, vocab))
    train_ids_y = train_df['Y'].apply(lambda x: transform_data(x, vocab))
    test_ids_x = test_df['X'].apply(lambda x: transform_data(x, vocab))
    test_ids_y = test_df['Y'].apply(lambda x: transform_data(x, vocab))
    # 15. Convert the data to numpy arrays
    # Turn the index lists into a matrix: [2, 403, 986, 246, 231] -> array([[2, 403, 986, 246, 231]])
    train_X = np.array(train_ids_x.tolist())
    train_Y = np.array(train_ids_y.tolist())
    test_X = np.array(test_ids_x.tolist())
    test_Y = np.array(test_ids_y.tolist())
    # Save the data
    np.save(config.train_x_path, train_X)
    np.save(config.train_y_path, train_Y)
    np.save(config.test_x_path, test_X)
    np.save(config.test_y_path, test_Y)
    return test_X, test_Y, train_X, train_Y
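save_dict is used by both preprocessing examples but never shown. A minimal sketch, assuming a plain word<TAB>index text format:

def save_dict(save_path, dict_data):
    # Hypothetical sketch: one "key<TAB>value" pair per line.
    with open(save_path, 'w', encoding='utf-8') as f:
        for k, v in dict_data.items():
            f.write('{}\t{}\n'.format(k, v))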