Example #1
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["module_name"] == "DeepFM":
        if dfm_params["use_fm"] and dfm_params["use_deep"]:
            clf_str = "DeepFM"
        elif dfm_params["use_fm"]:
            clf_str = "FM"
        elif dfm_params["use_deep"]:
            clf_str = "DNN"
    elif dfm_params["module_name"] == "LR":
        clf_str = "LR"
    elif dfm_params["module_name"] == "WideDeep":
        clf_str = "WideDeep"

    print("%s: %.5f (%.5f)" %
          (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
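
For reference, a minimal driver sketch for the function above; it assumes the `config` constants (NUM_SPLITS, RANDOM_SEED), the `_load_data` helper and the `gini_norm` metric used in the other examples, and the hyperparameter values simply mirror those shown in later examples:

# Hypothetical usage sketch; _load_data, gini_norm and config.* are assumptions.
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold

dfTrain, dfTest, X_train, y_train = _load_data()
folds = list(StratifiedKFold(n_splits=config.NUM_SPLITS, shuffle=True,
                             random_state=config.RANDOM_SEED).split(X_train, y_train))

dfm_params = {
    "use_fm": True, "use_deep": True, "module_name": "DeepFM",
    "embedding_size": 8, "dropout_fm": [1.0, 1.0],
    "deep_layers": [32, 32], "dropout_deep": [0.5, 0.5, 0.5],
    "deep_layers_activation": tf.nn.relu,
    "epoch": 30, "batch_size": 1024,
    "learning_rate": 0.001, "optimizer_type": "adam",
    "batch_norm": 1, "batch_norm_decay": 0.995,
    "l2_reg": 0.01, "verbose": True,
    "eval_metric": gini_norm, "random_seed": config.RANDOM_SEED,
}
y_train_dfm, y_test_dfm = _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params)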
Example #2
def run_base_model_nfm(dfTrain, dfTest, folds, pnn_params):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    # Xi_train: the feature indices for each column
    # Xv_train: the corresponding feature values
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    #print(dfTrain.dtypes)
    pnn_params['feature_size'] = fd.feat_dim
    pnn_params['field_size'] = len(Xi_train[0])

    _get = lambda x, l: [x[i] for i in l]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        nfm = NFM(**pnn_params)
        nfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)
Example #3
def prepare(path="/data/houcunyue/zhoutong/data/CriteoData/train.txt"):
    col_names = ['target'] + ["feature_%s" % i for i in range(39)]
    # the first 13 feature columns are known to be numeric
    dtype_dict = {x: float for x in col_names[:14]}  # 'target' plus the 13 numeric features
    for x in col_names[13:]:
        dtype_dict[x] = object
    chunk_size = 200 * 10000
    _reader = pd.read_csv(path,
                          header=None,
                          names=col_names,
                          delimiter="\t",
                          chunksize=chunk_size,
                          dtype=dtype_dict)
    train_data_chunks = []
    test_data_chunks = []
    print_t("   loading data from: %s" % path)
    for chunk in _reader:
        df_chunk = chunk
        cut_idx = int(0.8 * df_chunk.shape[0])
        train_data_chunks.append(df_chunk[:cut_idx])
        test_data_chunks.append(df_chunk[cut_idx:])
        print_t("   已拼接 %s 个 %s 行的chunk" %
                (len(train_data_chunks), chunk_size))
    print_t("   concatting data...")
    dfTrain = pd.concat(train_data_chunks, ignore_index=True)
    dfTest = pd.concat(test_data_chunks, ignore_index=True)
    print_t("   feature_dict generating ...")
    fd = FeatureDictionary(
        dfTrain=dfTrain,
        dfTest=dfTest,
        numeric_cols=list(
            dfTrain.select_dtypes(include=['float64', 'int64'],
                                  exclude=None).columns))
    return dfTrain, dfTest, fd
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest)
    # parse() converts each row into Xi: [[ind1_1, ind1_2, ...], [ind2_1, ind2_2, ...], ..., [indi_1, indi_2, ..., indi_j, ...], ...]
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 4),
                            dtype=float)  # create an array of shape (n_train_rows, 4)
    y_test_meta = np.zeros((dfTest.shape[0], 4), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0:4] += dfm.predict(Xi_test, Xv_test)
        b = np.zeros_like(y_train_meta)
        b[np.arange(len(y_train_meta)), y_train_meta.argmax(1)] = 1
        #y_train_meta = np.array(y_train_meta, dtype=np.float32)
        gini_results_cv[i] = label_ranking_average_precision_score(
            y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))
    #b = np.zeros_like(y_test_meta)
    #b[np.arange(len(y_test_meta)), y_test_meta.argmax(1)] = 1
    #y_test_meta = b

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" %
          (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    #filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _final_result(ids_test, y_test_meta, filename="result.csv")

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
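
The commented-out block above hints at converting the averaged 4-column test predictions into hard one-hot labels; a small standalone sketch of that step:

import numpy as np

# Toy sketch: turn averaged per-class probabilities of shape (n_samples, 4)
# into one-hot predictions via row-wise argmax, as in the commented-out code above.
probs = np.array([[0.1, 0.6, 0.2, 0.1],
                  [0.7, 0.1, 0.1, 0.1]])
one_hot = np.zeros_like(probs)
one_hot[np.arange(len(probs)), probs.argmax(axis=1)] = 1
print(one_hot)  # [[0. 1. 0. 0.] [1. 0. 0. 0.]]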
Example #5
def run_base_model_dfm(dfTrain, dfTest, folds, prefix, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    _get = lambda x, l: [x[i] for i in l]
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        Xi_train_ = np.array(Xi_train_, dtype='int32')
        Xv_train_ = np.array(Xv_train_, dtype='float32')
        y_train_ = np.array(y_train_, dtype=np.int8)
        Xi_valid_ = np.array(Xi_valid_, dtype='int32')
        Xv_valid_ = np.array(Xv_valid_, dtype='float32')
        y_valid_ = np.array(y_valid_, dtype=np.int8)

        dfm = DeepFM(**dfm_params).build_model()
        dfm.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=[keras.metrics.AUC(name='auc')])
        print(dfm.summary())

        checkpoint_dir = "../checkpoints/{}_cpt_" + str(i)
        log_dir = "../logs/{}_train_logs_" + str(i)

        checkpoint_dir = checkpoint_dir.format(prefix)
        log_dir = log_dir.format(prefix)

        shutil.rmtree(checkpoint_dir, ignore_errors=True)
        os.makedirs(checkpoint_dir, exist_ok=True)
        shutil.rmtree(log_dir, ignore_errors=True)
        os.makedirs(log_dir, exist_ok=True)
        checkpoint_path = os.path.join(checkpoint_dir, "weights.hdf5")
        callbacks = [
            ModelCheckpoint(checkpoint_path,
                            monitor="val_loss",
                            save_best_only=True),
            EarlyStopping(patience=5, monitor="val_loss"),
            TensorBoard(log_dir=log_dir)
        ]

        dfm.fit(
            (Xi_train_, Xv_train_),
            y_train_,
            epochs=50,
            # epochs=1,
            batch_size=64,
            validation_data=((Xi_valid_, Xv_valid_), y_valid_),
            verbose=2,
            callbacks=callbacks)
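
Since ModelCheckpoint keeps only the best weights, a natural follow-up (not part of the example) is to reload them before scoring the fold; a sketch, assuming the compiled Keras model dfm, checkpoint_path and the validation arrays from the loop above are still in scope:

# Sketch: restore the best checkpoint written by ModelCheckpoint and re-evaluate this fold.
dfm.load_weights(checkpoint_path)
val_loss, val_auc = dfm.evaluate((Xi_valid_, Xv_valid_), y_valid_, verbose=0)
print("fold %d: best val_loss=%.5f val_auc=%.5f" % (i, val_loss, val_auc))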
Example #6
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params, save_path: str, past_epoch: int = 0):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    # gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    # gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    dfm = DeepFM(**dfm_params)
    if past_epoch != 0:
        dfm.saver.restore(dfm.sess, save_path + '-' + str(past_epoch))
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        # print(y_train)


        # print(y_train_)
        # print(dfm.predict(Xi_train_, Xv_train_))
        # continue
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx,0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        # gini_results_epoch_train[i] = dfm.train_result
        # gini_results_epoch_valid[i] = dfm.valid_result
        # print('saving')
        dfm.saver.save(dfm.sess, save_path, global_step=past_epoch+dfm_params["epoch"]*(i+1))

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)"%(clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)
    gini_results_epoch_train = np.zeros((1, dfm_params["epoch"]*len(folds)), dtype=float)
    gini_results_epoch_valid = np.zeros((1, dfm_params["epoch"]*len(folds)), dtype=float)
    gini_results_epoch_train[0]=dfm.train_result
    gini_results_epoch_valid[0]=dfm.valid_result
    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta,dfm
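
The checkpoints written by dfm.saver.save above can be restored later for inference, in the same way the function itself resumes training; a sketch, assuming the same dfm_params, save_path, folds and test arrays:

# Sketch: rebuild the graph and restore the checkpoint written after the final fold.
dfm = DeepFM(**dfm_params)
global_step = past_epoch + dfm_params["epoch"] * len(folds)  # must match the step used in saver.save
dfm.saver.restore(dfm.sess, save_path + '-' + str(global_step))
y_pred = dfm.predict(Xi_test, Xv_test)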
Example #7
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        # todo dfm_params={'use_fm': True, 'use_deep': True, 'embedding_size': 8, 'dropout_fm': [1.0, 1.0], 'deep_layers': [32, 32],
        #  'dropout_deep': [0.5, 0.5, 0.5], 'deep_layers_activation': <function relu at 0x7fe4917da950>, 'epoch': 30, 'batch_size': 1024,
        #  'learning_rate': 0.001, 'optimizer_type': 'adam', 'batch_norm': 1, 'batch_norm_decay': 0.995, 'l2_reg': 0.01, 'verbose': True,
        #  'eval_metric': <function gini_norm at 0x7fe495b06048>, 'random_seed': 2017, 'feature_size': 259, 'field_size': 39}
        # print(f"dfm_params={dfm_params}")
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx,0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        # note: train_result is the Gini score on the training set
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    # note: y_test_meta is the sum of the predictions from the k folds above, so dividing here averages them
    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)"%(clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
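
Several of these examples pass gini_norm as eval_metric and score each fold with it; a common normalized-Gini implementation looks like the sketch below (an assumption, not necessarily the exact helper used here):

import numpy as np

def gini(actual, pred):
    # Sort by predicted score (descending, stable) and accumulate the share of positives.
    assert len(actual) == len(pred)
    data = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=float)
    data = data[np.lexsort((data[:, 2], -1 * data[:, 1]))]
    total_losses = data[:, 0].sum()
    gini_sum = data[:, 0].cumsum().sum() / total_losses
    gini_sum -= (len(actual) + 1) / 2.0
    return gini_sum / len(actual)

def gini_norm(actual, pred):
    # Normalize by the Gini of a perfect ranking so 1.0 means perfect ordering.
    return gini(actual, pred) / gini(actual, actual)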
Example #8
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           multi_value_cols=config.MULTI_VALUE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, Xmv_train, y_train = data_parser.parse(df=dfTrain,
                                                               has_label=True)
    Xi_test, Xv_test, Xmv_test, ids_test = data_parser.parse(df=dfTest)

    # pickle.dump((Xi_train, Xv_train,  Xmv_train, y_train), open('./data/train_set1.pkl','wb'))
    # pickle.dump((Xi_test, Xv_test, Xmv_test, ids_test), open('./data/test_set1.pkl','wb'))
    # Xi_train, Xv_train,  Xmv_train, y_train = pickle.load(open('./data/train_set1.pkl','rb'))
    # y_train = np.array(y_train)
    # y_train = np.where(y_train<0,0,y_train)
    # y_train = list(y_train)
    # Xi_test, Xv_test, Xmv_test, ids_test = pickle.load(open('./data/test_set1.pkl','rb'))

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    dfm_params["vocab_size"] = fd.vocab_size
    dfm_params["num_multiVal_feat"] = len(fd.multi_value_cols)
    dfm_params["sequence_length"] = config.MAXLEN
    print(dfm_params)
    del fd
    del data_parser
    gc.collect()

    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, Xmv_train_, y_train_ = \
        _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(Xmv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, Xmv_valid_, y_valid_ = \
        _get(Xi_train, valid_idx), _get(Xv_train, valid_idx),_get(Xmv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepCFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, Xmv_train_, y_train_, Xi_valid_,
                Xv_valid_, Xmv_valid_, y_valid_)

        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test, Xmv_test)

        break

    # y_test_meta /= float(len(folds))

    # save result
    _make_submission(ids_test, y_test_meta, "submission1.csv")

    # _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_test_meta
def init():
    start_time = datetime.now()
    dfTrain, dfTest, X_train, y_train = _load_data()
    # ------------------ DeepFM Model ------------------
    # params
    dfm_params = {
        "use_fm": True,
        "use_deep": True,
        "embedding_size": 16,
        "dropout_fm": [1.0, 1.0],
        "deep_layers": [2000, 2000, 1500],
        "dropout_deep": [0.5, 0.5, 0.8, 0.6],
        "deep_layers_activation": tf.nn.relu,
        "epoch": 2000,  # 500#1500
        "batch_size": 1024,
        "learning_rate": 0.001,
        "optimizer_type": "adagrad",
        "batch_norm": 1,
        "batch_norm_decay": 0.995,
        "l2_reg": 0.01,
        "verbose": True,
        "eval_metric": gini_norm,
        "random_seed": config.RANDOM_SEED,
        "loss_type": "mse"
    }
    dnn_params = dfm_params.copy()
    dnn_params["use_fm"] = False
    global dfm_dnn
    dfm_dnn = get_deep_fm_model(dnn_params)
    global past_epoch
    dfm_dnn.saver.restore(dfm_dnn.sess,
                          "save/FixedHashing/temp" + '-' + str(past_epoch))
    ###Prepare Data Parser
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    global data_parser
    data_parser = DataParser(feat_dict=fd)
    print('Consumed Time for Prepare Model: {} second(s)'.format(
        (datetime.now() - start_time).seconds))

    try:
        is_table_existing = hardware_table.table_status in ("CREATING",
                                                            "UPDATING",
                                                            "DELETING",
                                                            "ACTIVE")
    except ClientError:
        # do something here as you require
        pass
    else:
        pass
Example #10
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params,
                        label2current_service):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    xx_score = []
    cv_pred = []
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        xx_pred = dfm.predict(Xi_valid_, Xv_valid_)
        xx_score.append(f1_score(y_valid_, xx_pred, average='macro'))

        y_test = dfm.predict(Xi_test, Xv_test)
        if i == 0:
            cv_pred = np.asarray(y_test).reshape(-1, 1)
        else:
            cv_pred = np.hstack((cv_pred, np.asarray(y_test).reshape(-1, 1)))

    submit = []
    for line in cv_pred:
        submit.append(np.argmax(np.bincount(line)))

    # save the results
    df_test = pd.DataFrame()
    df_test['id'] = list(ids_test)
    df_test['predict'] = submit
    df_test['predict'] = df_test['predict'].map(label2current_service)

    df_test.to_csv('result.csv', index=False)

    print(xx_score, np.mean(xx_score))

    return y_train_meta, y_test_meta
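
The submission above takes a per-row majority vote over the k fold predictions with np.bincount; a tiny standalone sketch of that step:

import numpy as np

# Toy sketch: cv_pred holds one column of integer class predictions per fold.
cv_pred = np.array([[3, 3, 1],
                    [0, 2, 2]])
submit = [np.argmax(np.bincount(row)) for row in cv_pred]
print(submit)  # [3, 2] - the most frequent class in each row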
Example #11
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        # k-fold CV: each fold's fit() runs `epoch` rounds of training, and each epoch feeds the data in batches
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)  # fit() also evaluates on the train and valid splits

        yy = dfm.predict(Xi_valid_, Xv_valid_)
        # print("type(yy):",type(yy))
        # print("type(y_valid_):", type(y_valid_))

        # print("yy.shape:",yy.shape)               #yy : array
        # print("y_valid_.shape:", y_valid_.shape)  #y_valid_ : list

        #print("yy:", yy)  # 原始的predict出来的是概率值
        for index in range(len(yy)):
            if (yy[index] <= 0.5):
                yy[index] = 0
            else:
                yy[index] = 1

        #print("y_valid_:", y_valid_)

        print("accuracy_score(y_valid_, yy):", accuracy_score(y_valid_, yy))

        y_train_meta[valid_idx, 0] = yy

        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

    y_test_meta /= float(len(folds))

    return y_train_meta, y_test_meta
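
The element-wise 0.5 threshold loop above can also be written in vectorized form; an equivalent sketch:

import numpy as np

# Sketch: vectorized equivalent of the thresholding loop above (probability > 0.5 -> 1, else 0).
yy = np.array([0.10, 0.70, 0.49, 0.51])
yy = (yy > 0.5).astype(float)
print(yy)  # [0. 1. 0. 1.]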
Example #12
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    # build the feature dictionary
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,  # training and test sets
                           numeric_cols=config.NUMERIC_COLS,  # numeric columns
                           ignore_cols=config.IGNORE_COLS)  # ignored features; dfTrain/dfTest themselves are not filtered
    data_parser = DataParser(feat_dict=fd)  # DataParser object
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)  # parse() returns the processed data
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    dfm_params["feature_size"] = fd.feat_dim  # number of features after processing, i.e. after one-hot encoding
    dfm_params["field_size"] = len(Xi_train[0])  # number of fields

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):  # iterate over the k folds
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)  # out-of-fold prediction for this fold
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)  # predict on the test set each fold and accumulate the results

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))  # average the accumulated test-set predictions

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:#deepFM
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:#FM
        clf_str = "FM"
    elif dfm_params["use_deep"]:#DNN
        clf_str = "DNN"
    print("%s: %.5f (%.5f)"%(clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta  # return the out-of-fold predictions and the test-set predictions
def k_fold_cross_valid(dfTrain, X_submission, folds, pnn_params, train_params):
    numeric_cols = []
    ignore_cols = []
    for col in dfTrain.columns:
        type_col = str(dfTrain[col].dtype)
        if (type_col == 'float32'
                or (type_col == 'int64' and col[:10] != 'pref_month')):
            numeric_cols.append(col)

    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=X_submission,
                           numeric_cols=numeric_cols,
                           ignore_cols=ignore_cols)
    data_parser = DataParser(feat_dict=fd)

    # Xi_train: the feature indices for each column
    # Xv_train: the corresponding feature values
    # Oversampling with imblearn is awkward here, because the data is not stored as a
    # one-hot matrix: indices and values are kept separately. In other words, once we
    # rely on TensorFlow's built-in embedding lookup, imblearn oversampling is hard to apply.
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_submission, Xv_submission, ids_submission = data_parser.parse(
        df=X_submission)

    # print(y_train)
    pnn_params['feature_size'] = fd.feat_dim  # total dimensionality including every one-hot dimension, n_all_feature
    pnn_params['field_size'] = len(Xi_train[0])  # number of fields, treating each one-hot group as a whole, n_field

    _get = lambda x, l: [x[i] for i in l]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        pnn = PNN(**pnn_params)

        train_iter = tf.data.Dataset.from_tensor_slices(
            (Xi_train_, Xv_train_, y_train_)).batch(train_params['batch_size'])
        test_iter = tf.data.Dataset.from_tensor_slices(
            (Xi_valid_, Xv_valid_, y_valid_)).batch(train_params['batch_size'])

        train(pnn, train_iter, test_iter, **train_params)
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params,
                        numerical_cols=None):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest,has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    dfm = DeepFM(**dfm_params)
    
    
    
    dfm.fit(Xi_train, Xv_train, y_train,early_stopping=True)
    pred = dfm.predict(Xi_test,Xv_test)
    print(pred)
    dfm.evaluate(Xi_test, Xv_test, y_test)

    '''
Example #15
    def __init__(self,TRAINVALTEST_DENSE_X,TRAINVALTEST_DENSE_X_NAMES,\
                    TRAINVAL_SPARSE_X,TRAINVAL_SPARSE_X_NAMES,\
                    TEST_SPARSE_X,TEST_SPARSE_X_NAMES,\
                    UF_VW,ADF,TRAINVAL,UF_CSV,TRAINVAL_MERGE,\
                    TEST_MERGE,TEST,name='deepffm',USE_TINY=False,RANDOMSTATE=2018):
        super(DFFM, self).__init__(
                    TRAINVALTEST_DENSE_X,TRAINVALTEST_DENSE_X_NAMES,\
                    TRAINVAL_SPARSE_X,TRAINVAL_SPARSE_X_NAMES,\
                    TEST_SPARSE_X,TEST_SPARSE_X_NAMES,\
                    UF_VW,ADF,TRAINVAL,UF_CSV,TRAINVAL_MERGE,\
                    TEST_MERGE,TEST,name,USE_TINY,RANDOMSTATE=2018)
        '''In Ridge, only 'sag' solver can currently fit the intercept when X is sparse.'''
        dfm_params = {
            "use_fm": True,
            "use_deep": True,
            "embedding_size": 8,
            "dropout_fm": [1.0, 1.0],
            "deep_layers": [32, 32],
            "dropout_deep": [0.5, 0.5, 0.5],
            "deep_layers_activation": tf.nn.relu,
            "epoch": 72,
            "batch_size": 1024,
            "learning_rate": 0.001,
            "optimizer_type": "adam",
            "batch_norm": 1,
            "batch_norm_decay": 0.995,
            "l2_reg": 0.01,
            "verbose": True,
            "eval_metric": roc_auc_score,
            "random_seed": 2018
        }
        dfTrainVal, dfTest = self.ds.load_TrainVal_Test()
        fd = FeatureDictionary(dfTrain=dfTrainVal,
                               dfTest=dfTest,
                               numeric_cols=[],
                               ignore_cols=[])
        data_parser = DataParser(feat_dict=fd)
        #dfTrain_x, dfVal_x, dfTrain_y, dfVal_y =train_test_split(dfTrainVal.drop(['label'],axis=1)\
        #                                        ,dfTrainVal['label'],test_size=0.1, random_state=self.randomstate)
        #dfTrain=pd.DataFrame([dfTrain_x,dfTrain_y])
        #dfVal=pd.DataFrame([dfVal_x,dfVal_y])
        #print dfTrain.shape
        devideline = int(0.9 * len(dfTrainVal))
        dfTrain = dfTrainVal[:devideline]
        dfVal = dfTrainVal[devideline:]
        Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain,
                                                        has_label=True)
        Xi_valid, Xv_valid, y_valid = data_parser.parse(df=dfVal,
                                                        has_label=True)
        Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

        dfm_params["feature_size"] = fd.feat_dim
        dfm_params["field_size"] = len(Xi_train[0])
        self.clf = DeepFM(**dfm_params)

        # fit a DeepFM model
        self.clf.fit(Xi_train,
                     Xv_train,
                     y_train,
                     Xi_valid,
                     Xv_valid,
                     y_valid,
                     early_stopping=True,
                     refit=True)

        y_pred = self.clf.predict(Xi_test, Xv_test)
        ids_test["label"] = y_pred
        ids_test.to_csv('submission_dffm.csv',
                        index=False,
                        float_format="%.5f")
        joblib.dump(self.clf, 'saved_model.model')
Example #16
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(
        df=dfTrain, has_label=True)  # returns feature indices, feature values, labels
    Xi_test, Xv_test, ids_test = data_parser.parse(
        df=dfTest)  # returns feature indices, feature values, sample ids

    dfm_params["feature_size"] = fd.feat_dim  # total number of features
    dfm_params["field_size"] = len(
        Xi_train[0])  # Xi_train[0] is the first training sample; its length is the number of fields

    y_train_meta = np.zeros((dfTrain.shape[0], 1),
                            dtype=float)  # intermediate array, one row per training sample
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]  # lambda syntax, similar to a C macro: parameters before the colon, the expression after it
    gini_results_cv = np.zeros(len(folds),
                               dtype=float)  # len(folds) is the number of train/valid splits (k-fold)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]),
                                        dtype=float)
    for i, (train_idx, valid_idx) in enumerate(
            folds):  # train the model k = len(folds) times; worthwhile when training data is scarce, unnecessary with enough data
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)  # build the network
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_,
                y_valid_)  # fit the model

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_,
                                                 Xv_valid_)  # predict on the validation split
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)  # predict on the test set

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" %
          (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
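
Most of these examples finish by calling _plot_fig(train_results, valid_results, clf_str); a minimal sketch of what such a helper might do (the figure layout and output filename are assumptions):

import numpy as np
import matplotlib.pyplot as plt

def _plot_fig(train_results, valid_results, model_name):
    # train_results / valid_results: arrays of shape (n_folds, n_epochs) of per-epoch scores.
    colors = ["red", "blue", "green", "orange", "purple"]
    xs = np.arange(1, train_results.shape[1] + 1)
    plt.figure()
    legends = []
    for i in range(train_results.shape[0]):
        color = colors[i % len(colors)]
        plt.plot(xs, train_results[i], color=color, linestyle="solid", marker=".")
        plt.plot(xs, valid_results[i], color=color, linestyle="dashed", marker=".")
        legends += ["train-%d" % (i + 1), "valid-%d" % (i + 1)]
    plt.xlabel("Epoch")
    plt.ylabel("Normalized Gini")
    plt.title(model_name)
    plt.legend(legends)
    plt.savefig("fig_%s.png" % model_name)  # hypothetical output filename
    plt.close()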
Example #17
def _run_base_model_dfm(dfTrain,
                        dfTest,
                        folds,
                        dfm_params,
                        NUMERIC_COLS,
                        IGNORE_COLS,
                        application='classification'):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=NUMERIC_COLS,
                           ignore_cols=IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest, has_label=True)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    results_cv = np.zeros(len(folds), dtype=float)
    results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]),
                                   dtype=float)
    results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]),
                                   dtype=float)
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        if application == 'classification':
            results_cv[i] = roc_auc_score(y_valid_, y_train_meta[valid_idx])
        elif application == 'regression':
            results_cv[i] = np.sqrt(
                mean_squared_error(y_valid_, y_train_meta[valid_idx]))
        else:
            results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        results_epoch_train[i] = dfm.train_result
        results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: rmse/accuracy/gini is %.4f (std is %.4f)" %
          (clf_str, results_cv.mean(), results_cv.std()))
    filename = "%s_Mean%.5f.csv" % (clf_str, results_cv.mean())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(results_epoch_train, results_epoch_valid, clf_str, application)

    return y_train_meta, y_test_meta
def run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    '''
    Driver for the model. As the code below shows, it can be configured to run three
    different models: FM, Deep (DNN), or DeepFM.
    '''

    # Don't overlook FeatureDictionary: it wraps and converts a lot of information. The
    # parsing and dictionary packaging are quite involved (in particular how the values v
    # are obtained and used).
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    # Parsing handles the data row by row: dfi records each feature's index within the full
    # input feature space, and dfv records the concrete value (the raw value for numerical
    # features, 1 for categorical ones). This amounts to one-hot encoding, with dfi holding
    # the index of the active feature. The length of the feature space fed to the network is
    # (number of numerical features + length of the one-hot encoding of the categorical features).
    # In the end, Xi and Xv are 2-D lists where each inner list is one row: Xi holds the
    # feature indices and Xv holds the concrete feature values. (A toy illustration of this
    # encoding follows this example.)
    data_parser = DataParser(feat_dict=fd)

    # Xi_train: the feature indices for each column
    # Xv_train: the corresponding feature values

    # Parse the data: Xi_train holds the feature indices, Xv_train the concrete feature values
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)

    # These are 2-D: the outer list has one entry per sample, each inner list the values under the
    # corresponding feature indices. (The inner-list lengths are probably not uniform, since for
    # one-hot features only the entries equal to 1 are kept.)
    print('Xi_train:', Xi_train)  # the feature indices
    print('Xv_train:', Xv_train)  # the actual values
    print('y_train:', y_train)
    print('Xi_test:', Xi_test)
    print('Xv_test:', Xv_test)

    print('Xi_train shape:', len(Xi_train))  # the feature indices
    print('Xv_train shape:', len(Xv_train))  # the actual values
    print('y_train shape:', len(y_train))
    print('Xi_test shape:', len(Xi_test))
    print('Xv_test shape:', len(Xv_test))
    #print('ids_test:', ids_test)
    print(dfTrain.dtypes)

    # field_size is the number of raw features; feature_size is the feature count after one-hot encoding the categorical data
    dfm_params['feature_size'] = fd.feat_dim
    dfm_params['field_size'] = len(Xi_train[0])

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    _get = lambda x, l: [x[i] for i in l]

    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params['epoch']),
                                        dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params['epoch']),
                                        dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):

        # Xi_train_, Xv_train_, y_train_ are the feature indices within the full input feature space,
        # the concrete feature values, and the corresponding labels
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        # train the model and run predictions
        dfm = DeepFM(**dfm_params)

        print('before fit   Xi_train_:', Xi_train_[0:3])
        print('before fit   Xv_train_:', Xv_train_[0:3])
        print('before fit   y_train_:', y_train_[0:3])
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" %
          (clf_str, gini_results_cv.mean(), gini_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, gini_results_cv.mean(),
                                            gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
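
As promised in the comments above, a toy illustration (not taken from any of these examples) of the Xi / Xv encoding: two numeric fields plus one categorical field with three categories.

# Feature index assignment (feature_size = 5):
#   num_a -> 0, num_b -> 1, cat_c in {x, y, z} -> 2, 3, 4
# field_size = 3: one entry per field, however many one-hot columns cat_c expands to.
row1 = {"num_a": 0.5, "num_b": 2.0, "cat_c": "y"}
row2 = {"num_a": 1.5, "num_b": 0.0, "cat_c": "z"}

Xi = [[0, 1, 3],        # indices of the active features for each row
      [0, 1, 4]]
Xv = [[0.5, 2.0, 1.0],  # raw values for numeric fields, 1.0 for the active category
      [1.5, 0.0, 1.0]]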
Example #19
    "batch_norm_decay": 0.995,
    "l2_reg": 0.01,
    "verbose": True,
    "eval_metric": gini_norm,
    "random_seed": config.RANDOM_SEED,
    "loss_type": "mse"
}
dnn_params = dfm_params.copy()
dnn_params["use_fm"] = False
dfm_dnn = get_deep_fm_model(dnn_params)
past_epoch = 60400
dfm_dnn.saver.restore(dfm_dnn.sess,
                      "save/FixedHashing/temp" + '-' + str(past_epoch))
###Prepare Data Parser
fd = FeatureDictionary(dfTrain=dfTrain,
                       dfTest=dfTest,
                       numeric_cols=config.NUMERIC_COLS,
                       ignore_cols=config.IGNORE_COLS)
data_parser = DataParser(feat_dict=fd)

########
###Prepare Data Parser


###Parse Predict Data
# dfTest2=interpret_predict_FPS_Data('Core i7-8700K','GeForce GTX 1080 Ti MSI Gaming X 11GB','Kingdom Come: Deliverance',3840,2160,'High',16)
# dfTest2=interpret_predict_FPS_Data('Core i3-3220','GeForce GTX 960 Palit 2GB Edition','Kingdom Come: Deliverance',1920,1200,'Ultra',8)
# dfTest2=interpret_predict_FPS_Data('Ryzen 5 1500X','GeForce GTX 1060 EVGA SC GAMING 6GB','Star Wars: Battlefront 2',1920,1080,'High',16)
# dfTest2 = interpret_predict_FPS_Data('Ryzen 5 1500X', 'GeForce GTX 1060 3GB', 'Star Wars: Battlefront 2',1920, 1080, 'High', 8)
# dfTest2=interpret_predict_FPS_Data('Pentium G4560','GeForce GT 1030','Star Wars: Battlefront 2',1366,768,'High',8)
# dfTest2=interpret_predict_FPS_Data('Core i5-750','GeForce GTX 1050 Ti Gigabyte G1 Gaming 4GB','Star Wars: Battlefront 2',1920,1080,'High',8)
# dfTest2=interpret_predict_FPS_Data('Core i5-6600K','GeForce GTX 1080 Asus ROG Strix Gaming OC 8GB Edition','Dark Souls 3 - The Fire Fades Edition',1920,1200,'Ultra',16)###24414-0#60
Example #20
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)

    """
    Xi_x是一个n_samples x n_features的索引list,每个数值型特征编码为一个固定索引,每个类别型特征根据类别
    数编码为不同的索引
    Xv_x是一个n_samples x n_features的值list
    """
    _print("parse data begin")
    Xi_test, Xv_test, ids_test = data_parser.parse(df=dfTest)
  
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    _print("parse data finish")

    dfm_params["feature_size"] = fd.feat_dim #最大索引
    dfm_params["field_size"] = len(Xi_train[0]) #特征数,这个还是原始的特征数

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    _get = lambda x, l: [x[i] for i in l]
    gini_results_cv = np.zeros(len(folds), dtype=float)
    gini_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    gini_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)

    # train_idx and valid_idx are the indices of the training and validation splits; because of
    # k-fold CV, the subsets below are extracted from the full sample by index
    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)

        _print("fit, fold=%d" % i)
        dfm = DeepFM(**dfm_params, n_samples=len(Xi_train_))
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx,0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:,0] += dfm.predict(Xi_test, Xv_test)

        gini_results_cv[i] = gini_norm(y_valid_, y_train_meta[valid_idx])
        gini_results_epoch_train[i] = dfm.train_result
        gini_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"

    line = "%s: %.5f (%.5f)"%(clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _print(line)

    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, gini_results_cv.mean(), gini_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    # skip plotting for now
    # _plot_fig(gini_results_epoch_train, gini_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
Example #21
def _run_base_model_dfm(dfTrain, dfTest, folds, dfm_params):
    if os.path.exists(config.DF_FILE):
        print("FD EXISTED")
        with open(config.DF_FILE, 'rb') as fd_f:
            fd = pickle.load(fd_f)
    else:
        print("FD NO EXISTED")
        fd = FeatureDictionary(dfTrain=dfTrain, dfTest=dfTest,
                               numeric_cols=config.NUMERIC_COLS,
                               ignore_cols=config.IGNORE_COLS)
        with open(config.DF_FILE, 'wb') as fd_f:
            pickle.dump(fd, fd_f)

    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain, has_label=True)
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest, has_label=True)  # the test set also has labels
    # print(y_test)
    # print(Xi_train)
    # print(Xv_train)
    # print(y_train)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])
    print(dfm_params)
    # print(dfm_params)

    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]
    auc_results_cv = np.zeros(len(folds), dtype=float)
    test_auc_results_cv = np.zeros(len(folds), dtype=float)
    auc_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    auc_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]), dtype=float)
    # best_test_res = 0.0
    for i, (train_idx, valid_idx) in enumerate(folds):
        print(f"Fold {i}:")
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx)
        # print(Xi_train_)
        # print(Xv_train_)
        # print(y_train_)
        # print(Xi_valid_)
        # print(Xv_valid_)
        # print(y_valid_)
        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_, i)

        y_train_meta[valid_idx,0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:,0] = dfm.predict(Xi_test, Xv_test)
        auc_results_cv[i] = auc(y_valid_, y_train_meta[valid_idx])
        test_auc_results = auc(y_test, y_test_meta)
        # if test_auc_results > best_test_res:
        #     MODEL_PATH = config.MODEL_PATH % (i, )
        #     dfm.save_model(config.MODEL_PATH)  # a save path can be passed here

        test_auc_results_cv[i] = test_auc_results
        auc_results_epoch_train[i] = dfm.train_result
        auc_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)"%(clf_str, auc_results_cv.mean(), auc_results_cv.std()))
    print("test auc: ", test_auc_results_cv)
    filename = "%s_Mean%.5f_Std%.5f.csv"%(clf_str, auc_results_cv.mean(), auc_results_cv.std())
    # _make_submission(ids_test, y_test_meta, filename)

    # _plot_fig(auc_results_epoch_train, auc_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
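
This example scores folds with an auc(y_true, y_score) helper; a plausible sketch, assuming it is simply a thin wrapper around scikit-learn:

from sklearn.metrics import roc_auc_score

def auc(y_true, y_score):
    # Hypothetical helper: ROC AUC on the raw predicted probabilities.
    return roc_auc_score(y_true, y_score)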
Example #22
def _run_base_model_dfm(dfTrain=None,
                        dfTest=None,
                        trainfile=None,
                        testfile=None,
                        dfm_params=None):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           trainfile=trainfile,
                           testfile=testfile,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS)
    data_parser = DataParser(feat_dict=fd)
    Xi_train, Xv_train, y_train = data_parser.parse(df=fd.dfTrain,
                                                    has_label=True,
                                                    target=config.LABEL)
    Xi_test, Xv_test, ids_test = data_parser.parse(df=fd.dfTest,
                                                   uid=config.UID)

    dfm_params["feature_size"] = fd.feat_dim
    dfm_params["field_size"] = len(Xi_train[0])

    y_train_meta = np.zeros((fd.dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((fd.dfTest.shape[0], 1), dtype=float)
    _get = lambda x, l: [x[i] for i in l]

    # folds
    folds = list(
        StratifiedKFold(n_splits=config.NUM_SPLITS,
                        shuffle=True,
                        random_state=config.RANDOM_SEED).split(
                            np.array(Xv_train), y_train))

    err_results_cv = np.zeros(len(folds), dtype=float)
    err_results_epoch_train = np.zeros((len(folds), dfm_params["epoch"]),
                                       dtype=float)
    err_results_epoch_valid = np.zeros((len(folds), dfm_params["epoch"]),
                                       dtype=float)

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_ = _get(Xi_train, train_idx), _get(
            Xv_train, train_idx), _get(y_train, train_idx)
        Xi_valid_, Xv_valid_, y_valid_ = _get(Xi_train, valid_idx), _get(
            Xv_train, valid_idx), _get(y_train, valid_idx)

        dfm = DeepFM(**dfm_params)
        dfm.fit(Xi_train_, Xv_train_, y_train_, Xi_valid_, Xv_valid_, y_valid_)

        y_train_meta[valid_idx, 0] = dfm.predict(Xi_valid_, Xv_valid_)
        y_test_meta[:, 0] += dfm.predict(Xi_test, Xv_test)

        # need change to mae
        err_results_cv[i] = err_norm(y_valid_, y_train_meta[valid_idx])
        err_results_epoch_train[i] = dfm.train_result
        err_results_epoch_valid[i] = dfm.valid_result

    y_test_meta /= float(len(folds))

    # save result
    if dfm_params["use_fm"] and dfm_params["use_deep"]:
        clf_str = "DeepFM"
    elif dfm_params["use_fm"]:
        clf_str = "FM"
    elif dfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" %
          (clf_str, err_results_cv.mean(), err_results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, err_results_cv.mean(),
                                            err_results_cv.std())
    _make_submission(ids_test, y_test_meta, filename)

    #_plot_fig(err_results_epoch_train, err_results_epoch_valid, clf_str)

    return y_train_meta, y_test_meta
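
Most examples persist predictions through _make_submission(ids, y_pred, filename); a minimal sketch of such a helper (the column names and output directory are assumptions):

import os
import pandas as pd

def _make_submission(ids, y_pred, filename="submission.csv", out_dir="./output"):
    # Sketch: write an id/target CSV in the shape the examples above expect.
    os.makedirs(out_dir, exist_ok=True)
    pd.DataFrame({"id": ids, "target": y_pred.flatten()}).to_csv(
        os.path.join(out_dir, filename), index=False, float_format="%.5f")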
Example #23

if __name__ == '__main__':
    '''launching TensorBoard: tensorboard --logdir=path/to/log-directory'''
    start = time.time()
    print(time.asctime(time.localtime(start)))
    print('111')
    # setting fields
    train = pd.read_csv('./data/train_feature.csv')
    test = pd.read_csv('./data/test_feature.csv')

    train.fillna('-1', inplace=True)
    test.fillna('-1', inplace=True)

    df = FeatureDictionary(dfTrain=train,
                           dfTest=test,
                           cv_cols=config.CV_COLS,
                           oh_cols=config.OH_COLS)

    feature_length = df.all_feat_dim
    feat_dict = df.feat_dict
    max_cols = df.max_cols

    # num of fields
    all_field_cnt = len(config.CV_COLS) + len(config.OH_COLS)

    model = DeepFM(config)
    # build graph for model
    model.build_graph()

    saver = tf.train.Saver(max_to_keep=5)
Example #24
def run_base_model_nfm(dfTrain, dfTest, folds, kdfm_params):
    fd = FeatureDictionary(dfTrain=dfTrain,
                           dfTest=dfTest,
                           numeric_cols=config.NUMERIC_COLS,
                           ignore_cols=config.IGNORE_COLS,
                           xm_cols=config.XM_COLS)
    data_parser = DataParser(feat_dict=fd)

    # newly added
    word2idx, idx2word = build_vocab(config.word_file)

    # Xi_train: the feature indices for each column
    # Xv_train: the corresponding feature values
    Xi_train, Xv_train, y_train = data_parser.parse(df=dfTrain)
    Xt_train, Xm_train = read_text_data(
        config.TRAIN_FILE, word2idx,
        config.num_unroll_steps)  # read data TODO: reconcile config with pnn_params
    Xi_test, Xv_test, y_test = data_parser.parse(df=dfTest)
    Xt_test, Xm_test = read_text_data(config.TEST_FILE, word2idx,
                                      config.num_unroll_steps)

    kdfm_params['feature_size_one_hot'] = fd.feat_dim
    kdfm_params['word_embeddings'] = load_embedding(
        config.embedding_size, filename=config.embedding_file)  # read data

    #TODO:change
    y_train_meta = np.zeros((dfTrain.shape[0], 1), dtype=float)
    y_test_meta = np.zeros((dfTest.shape[0], 1), dtype=float)

    results_cv = np.zeros(len(folds), dtype=float)
    results_epoch_train = np.zeros((len(folds), kdfm_params['epoch']),
                                   dtype=float)
    results_epoch_valid = np.zeros((len(folds), kdfm_params['epoch']),
                                   dtype=float)
    results_epoch_train_mae = np.zeros((len(folds), kdfm_params['epoch']),
                                       dtype=float)
    results_epoch_valid_mae = np.zeros((len(folds), kdfm_params['epoch']),
                                       dtype=float)

    def _get(x, l):
        return [x[i] for i in l]

    for i, (train_idx, valid_idx) in enumerate(folds):
        Xi_train_, Xv_train_, y_train_, Xt_train_, Xm_train_ = \
            _get(Xi_train, train_idx), _get(Xv_train, train_idx), _get(y_train, train_idx), \
            _get(Xt_train, train_idx), _get(Xm_train, train_idx)

        Xi_valid_, Xv_valid_, y_valid_, Xt_valid_, Xm_valid_ = \
            _get(Xi_train, valid_idx), _get(Xv_train, valid_idx), _get(y_train, valid_idx), \
            _get(Xt_train, valid_idx), _get(Xm_train, valid_idx)

        kdfm = DeepAFM(**kdfm_params)
        Xim_train_ = []
        Xvm_train_ = []
        Xim_valid_ = []
        Xvm_vaild_ = []
        Xim_test = []
        Xvm_test = []

        kdfm.fit(Xi_train_, Xv_train_, Xim_train_, Xvm_train_, Xt_train_,
                 y_train_, Xi_valid_, Xv_valid_, Xim_valid_, Xvm_vaild_,
                 Xt_valid_, y_valid_)

        y_train_meta[valid_idx,
                     0] = kdfm.predict(Xi_valid_, Xv_valid_, Xim_valid_,
                                       Xvm_vaild_, Xt_valid_)
        y_test_meta[:, 0] += kdfm.predict(Xi_test, Xv_test, Xim_test, Xvm_test,
                                          Xt_test)

        results_cv[i] = mse_norm(y_valid_, y_train_meta[valid_idx])
        results_epoch_train[i] = kdfm.train_result
        results_epoch_valid[i] = kdfm.valid_result

        results_epoch_train_mae[i] = kdfm.mae_train_result
        results_epoch_valid_mae[i] = kdfm.mae_valid_result

    y_test_meta /= float(len(folds))
    mse_test = mse(y_test, y_test_meta)

    # save result
    if kdfm_params["use_afm"] and kdfm_params["use_deep"]:
        clf_str = "KDFM"
    elif kdfm_params["use_afm"]:
        clf_str = "AFM"
    elif kdfm_params["use_deep"]:
        clf_str = "DNN"
    print("%s: %.5f (%.5f)" % (clf_str, results_cv.mean(), results_cv.std()))
    filename = "%s_Mean%.5f_Std%.5f.csv" % (clf_str, results_cv.mean(),
                                            results_cv.std())
    _make_submission(y_test, y_test_meta, mse_test, filename)
    _plot_fig(results_epoch_train, results_epoch_valid, clf_str + 'mse', "mse")
    _plot_fig(results_epoch_train_mae, results_epoch_valid_mae,
              clf_str + 'mae', "mae")