# Example #1
def save_model_m0(model):
    """Persist the untrained (m0) model as an HDF5 file when saving is enabled."""
    if not save_model:
        return
    show_title("保存网络模型")
    target = ''.join([model_file_path, model_file_prefix, 'm0.h5'])
    print("保存原始模型:{} →".format(target), end='')
    model.save(target)
    print("模型保存成功。")
def export_day_statistical_sequence(lst_data):
    """Build 91-day statistical sequences from the raw lists, save and return them."""
    from src.base.config import base_data_type, x_data_file_name
    from src.data.generate_data import generate_day_statistical_sequence

    show_title("加工数据为 91 天的序列数据,每天为6个特征(最大值、最小值、平均值)96维数据")
    sequences = generate_day_statistical_sequence(lst_data)
    save_model_data(sequences, data_file_path + x_data_file_name, base_data_type)
    return sequences
# Example #3
def save_model_m2(history, model):
    """Save the second-round weights (.bin) and the training history (.pkl)."""
    if not save_model:
        return
    show_title("保存网络模型")
    prefix = model_file_path + model_file_prefix
    weights_path = prefix + 'm2.bin'
    print("保存第二次训练模型:{} → ".format(weights_path), end='')
    model.save_weights(weights_path)
    with open(prefix + 'm2.pkl', 'wb') as history_file:
        pickle.dump(history.history, history_file)
    print("模型保存成功。")
def export_train_balance(x_train, y_train):
    """Balance the training data by label and persist the balanced arrays."""
    from src.base.config import (train_balance_data_type,
                                 x_train_balance_file_name,
                                 y_train_balance_file_name)

    show_title(f"对类别 :{label_name}实施平衡{config.train_data_type}")
    balanced_x, balanced_y = generate_balance_data(x_train, y_train)

    # Both arrays are written with the same data-type tag.
    for payload, file_name in ((balanced_x, x_train_balance_file_name),
                               (balanced_y, y_train_balance_file_name)):
        save_model_data(payload, data_file_path + file_name,
                        train_balance_data_type)
def main():
    """Train, validate and evaluate the network model.

    Pipeline: build the model and save its initial state, train with a
    held-out validation set, evaluate on the test set, then retrain on the
    full training set (half the epochs, no validation) and evaluate again.
    """
    from keras_preprocessing.sequence import pad_sequences
    from src.data.show_data import show_result, show_parameters
    # BUG FIX: `load_test_data` is used below but was not imported here,
    # which raised a NameError once the test-set stage was reached.
    from src.data.load_data import (load_train_data, load_train_val_data,
                                    load_val_data, load_test_data)

    show_title("构建网络模型")
    show_parameters()
    model = construct_model()
    model.summary()
    save_model_m0(model)

    show_title("加载与填充{}".format(train_data_type))

    x_train_val, y_train_val = load_train_val_data()
    # Pad every sequence to max_len (zeros appended at the end).
    x_train_val_seq = pad_sequences(x_train_val,
                                    maxlen=max_len,
                                    padding='post')

    x_val, y_val = load_val_data()
    x_val_seq = pad_sequences(x_val, maxlen=max_len, padding='post')

    show_title("存在验证集训练网络模型")
    history = model.fit(x={'creative_id': x_train_val_seq},
                        y=y_train_val,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(x_val_seq, y_val),
                        verbose=2)
    save_model_m1(history, model)

    show_title("加载与填充测试数据集")
    x_test, y_test = load_test_data()
    x_test_seq = pad_sequences(x_test, maxlen=max_len, padding='post')
    results = model.evaluate({'creative_id': x_test_seq}, y_test, verbose=0)
    predictions = model.predict({'creative_id': x_test_seq}).squeeze()
    show_result(results, predictions, y_test)

    # Second pass: retrain on the full training set without validation,
    # using half the epochs.
    show_title("没有验证集训练网络模型,训练次数减半")
    x_train, y_train = load_train_data()
    x_train_seq = pad_sequences(x_train, maxlen=max_len, padding='post')
    history = model.fit({'creative_id': x_train_seq},
                        y_train,
                        epochs=epochs // 2,
                        batch_size=batch_size,
                        verbose=2)
    save_model_m2(history, model)

    results = model.evaluate({'creative_id': x_test_seq}, y_test, verbose=0)
    predictions = model.predict({'creative_id': x_test_seq}).squeeze()
    show_result(results, predictions, y_test)
def export_day_list_data():
    """Load the raw CSV data, export per-user per-day untruncated visit lists."""
    from src.data.load_data import load_original_data
    from src.data.generate_data import generate_day_list_data

    show_title("加载原始数据")
    x_csv, y_csv = load_original_data()

    show_title("导出每个用户每天访问数据的不截断列表")
    lst_data, y_data = generate_day_list_data(x_csv, y_csv)
    for payload, file_name in ((lst_data, config.lst_data_file_name),
                               (y_data, config.y_data_file_name)):
        save_model_data(payload, data_file_path + file_name,
                        config.base_data_type)
    return lst_data, y_data
def save_word2vec_data(x_creative_id, creative_id_window, file_path):
    """Pickle the corpus used to train word2vec.

    :param x_creative_id: word lists forming the training corpus
    :param creative_id_window: window size encoded into the output file name
    :param file_path: directory (or path prefix) the file is written under
    :return: None
    """
    target = file_path + 'creative_id_{0}'.format(creative_id_window)
    show_title("保存数据集:{0}".format(target))
    # Highest pickle protocol (-1) for a compact, fast dump.
    with open(target, 'wb') as out_file:
        pickle.dump(x_creative_id, out_file, -1)
    print("Word2Vec 数据保存成功")
# Example #8
def train_word2vec_model_with_gensim(words_lists):
    """Train a CBOW word2vec model over the given word lists with gensim."""
    from gensim.models import Word2Vec
    from src.base.config import embedding_size, embedding_window

    show_title(f"训练 word2vec({embedding_size}_{embedding_window}) 模型")
    # NOTE(review): `size`/`iter` are the gensim < 4.0 keyword names
    # (renamed to `vector_size`/`epochs` in 4.x) — confirm pinned version.
    w2v_params = dict(
        size=embedding_size,
        window=embedding_window,
        min_count=1,
        seed=config.seed,
        workers=8,
        sg=0,  # 0: CBOW; 1: Skip-Gram
        iter=20,
        sorted_vocab=False,
        batch_words=4096,
    )
    return Word2Vec(words_lists, **w2v_params)
def export_train_test_data(x_data, y_data):
    """Stratified train/test split; persist all four arrays, return the train pair."""
    from src.base.config import (test_data_type, train_data_type,
                                 x_test_file_name, x_train_file_name,
                                 y_test_file_name, y_train_file_name)

    show_title("拆分训练数据集和测试数据集")
    # Cap the number of users before splitting.
    trimmed_x = x_data[0:config.user_id_max]
    x_train, x_test, y_train, y_test = train_test_split(
        trimmed_x, y_data, random_state=config.seed, stratify=y_data)

    save_model_data(x_train, data_file_path + x_train_file_name,
                    train_data_type)
    save_model_data(y_train, data_file_path + y_train_file_name,
                    train_data_type)
    save_model_data(x_test, data_file_path + x_test_file_name, test_data_type)
    save_model_data(y_test, data_file_path + y_test_file_name, test_data_type)
    return x_train, y_train
def export_val_data(x_train, y_train):
    """Stratified train/validation split; persist all four arrays, return the train pair."""
    from src.base.config import (train_val_data_type, val_data_type,
                                 x_train_val_file_name, x_val_file_name,
                                 y_train_val_file_name, y_val_file_name)

    show_title("拆分训练数据集和验证数据集")
    x_train_val, x_val, y_train_val, y_val = train_test_split(
        x_train, y_train, random_state=config.seed, stratify=y_train)

    save_model_data(x_train_val, data_file_path + x_train_val_file_name,
                    train_val_data_type)
    save_model_data(y_train_val, data_file_path + y_train_val_file_name,
                    train_val_data_type)
    save_model_data(x_val, data_file_path + x_val_file_name, val_data_type)
    save_model_data(y_val, data_file_path + y_val_file_name, val_data_type)
    return x_train_val, y_train_val
def main():
    """Run the data-cleaning pipeline: base data → train/test split → train/val split."""
    from src.data.export_data import (export_base_data, export_train_test_data,
                                      export_val_data)

    show_title("数据清洗开始...")
    x_data, y_data = export_base_data()
    # Alternative: reload previously exported base data instead of rebuilding:
    #   from src.data.load_data import load_base_data
    #   x_data, y_data = load_base_data()

    x_train, y_train = export_train_test_data(x_data, y_data)
    # Alternative: load_train_data(), then export_train_balance(x_train, y_train)

    x_train_val, y_train_val = export_val_data(x_train, y_train)
    # Alternative: load_val_data(), then export_val_balance(x_train_val, y_train_val)

    show_title("数据清洗完成!")
# Example #12
def save_word2vec_weights(model_w2v):
    """Save the trained word2vec model and its embedding weight matrix.

    The gensim model is saved under ``file_name`` and the weight matrix
    under ``file_name + '.npy'`` (``np.save`` appends the suffix, so the
    two files do not clobber each other).

    :param model_w2v: trained gensim word2vec model (pre-4.0 API: ``.vocab``)
    :return: None
    """
    from src.base.tools import show_title, get_w2v_file_name
    file_name = get_w2v_file_name()
    show_title(f"保存 word2vec 模型 {file_name}")
    # Row 0 stays all-zero as the padding placeholder and carries no data.
    # (Setting the padding row to 0.5 was tried and did not help.)
    embedding_weights = np.zeros(
        (config.creative_id_window, config.embedding_size))
    # Each trained word maps to the matrix row given by ord(word).
    for word, index in model_w2v.vocab.items():
        try:
            embedding_weights[ord(word), :] = model_w2v[word]
        # BUG FIX: an out-of-range ord(word) raises IndexError, which the
        # original `except KeyError` did not catch — one bad word aborted
        # the whole save instead of being skipped and reported.
        except (KeyError, IndexError):
            print(f"错误的键值{word}")

    model_w2v.save(file_name)
    np.save(file_name, embedding_weights)
    print("Word2Vec 模型保存完成。")
def train_multi_output():
    """Train and evaluate the single-input model in two passes.

    Pass 1 trains with a validation set and evaluates on the test set;
    pass 2 retrains on the full training data for half the epochs (no
    validation) and evaluates on the same test set again.
    """
    from src.data.show_data import show_result, show_parameters

    show_title("构建网络模型")
    show_parameters()
    model = construct_model_single_input()
    model.summary()

    def _evaluate_and_report(x, y):
        # Shared test-set reporting (was duplicated inline after each pass).
        results = model.evaluate(x, y, verbose=0)
        predictions = model.predict(x).squeeze()
        show_result(results, predictions, y)

    from src.model.save_model import save_model_m0
    show_title("保存网络模型")
    save_model_m0(model)

    from src.data.load_data import load_train_val_data
    x_train_val, y_train_val = load_train_val_data()
    from src.base.config import day_feature_idx
    x_train_val = single_data_reshape(day_feature_idx, x_train_val, y_train_val.shape[0])
    from src.data.load_data import load_val_data
    x_val, y_val = load_val_data()
    x_val = single_data_reshape(day_feature_idx, x_val, y_val.shape[0])
    show_title("存在验证集训练网络模型")
    history = model.fit(x_train_val, y_train_val, epochs=epochs, batch_size=batch_size,
                        validation_data=(x_val, y_val), verbose=2)
    # Release the large train/val arrays before loading the test set.
    del x_train_val, x_val, y_train_val, y_val
    gc.collect()

    from src.model.save_model import save_model_m1
    save_model_m1(history, model)

    from src.data.load_data import load_test_data
    show_title("加载与填充测试数据集")
    x_test, y_test = load_test_data()
    x_test = single_data_reshape(day_feature_idx, x_test, y_test.shape[0])
    _evaluate_and_report(x_test, y_test)

    show_title("没有验证集训练网络模型,训练次数减半")
    from src.data.load_data import load_train_data
    show_title("加载与填充{}".format(train_data_type))
    x_train, y_train = load_train_data()
    x_train = single_data_reshape(day_feature_idx, x_train, y_train.shape[0])

    history = model.fit(x_train, y_train, epochs=epochs // 2, batch_size=batch_size, verbose=2)
    from src.model.save_model import save_model_m2
    save_model_m2(history, model)

    _evaluate_and_report(x_test, y_test)
def export_day_fix_sequence(lst_data):
    """Placeholder: reshape data into 91-day sequences with a fixed length per day (not implemented)."""
    show_title("加工数据为 91 天的序列数据,每天为定长的数据序列")
def export_w2v_data(lst_data):
    """Generate and persist the corpus used for Word2Vec training."""
    from src.base.config import data_w2v_path, w2v_data_type, w2v_file_name
    from src.data.generate_data import generate_w2v_data

    show_title("导出用于Word2Vec训练的数据")
    corpus = generate_w2v_data(lst_data)
    save_model_data(corpus, data_w2v_path + w2v_file_name, w2v_data_type)
def load_word2vec_weights():
    """Load the embedding weight matrix saved alongside the word2vec model (.npy)."""
    file_name = get_w2v_file_name()
    show_title("加载 word2vec 模型 {0}".format(file_name))
    weights = np.load(file_name + '.npy', allow_pickle=True)
    print("Word2Vec 模型加载完成。")
    return weights
def export_user_fix_sequence(lst_data):
    """Placeholder: build an un-gapped, repetition-allowed data list per user (not implemented)."""
    show_title("加工数据为无间隔有重复的数据列表")