Ejemplo n.º 1
0
def load_test_data():
    """Read the test CSV selected by the command-line flags into a DataFrame.

    The file name is built from the module-level ``path`` plus a prefix that
    depends on ``FLAGS.test_for_train``.

    * ``FLAGS.debug``: the ``*_Top.ss.csv`` file is read with pandas' default
      header/separator handling, keeping only ``click_id`` and
      ``keras_train.USED_FEATURE_LIST``.
    * otherwise: the full ``*.csv`` file is read as a headerless,
      tab-separated file whose columns are named
      ``['id', 'click_id'] + keras_train.DATA_HEADER``.

    When ``FLAGS.test_for_train`` is set, only the first 100000 rows are
    returned.

    Returns:
        The loaded (and possibly truncated) test DataFrame.
    """
    with timer("loading test data"):
        print('loading test data...')
        if FLAGS.test_for_train:
            path_prefix = "test_Cnt_ForTrain"
        else:
            path_prefix = "test_Cnt"
        if FLAGS.debug:
            test_data_path = path + path_prefix + "_Top.ss.csv"
            test_df = pd.read_csv(test_data_path,
                                  dtype=dtypes,
                                  usecols=['click_id'] +
                                  keras_train.USED_FEATURE_LIST)
        else:
            test_data_path = path + path_prefix + ".csv"
            test_df = pd.read_csv(
                test_data_path,
                dtype=dtypes,
                header=None,
                sep='\t',
                names=['id', 'click_id'] + keras_train.DATA_HEADER,
            )
        if FLAGS.test_for_train:
            # NOTE: this branch used to run
            # `train_df = train_df.append(test_df[...])`. `train_df` is a
            # local that was never assigned, so the statement always raised
            # UnboundLocalError; even if it had run, its result was bound to
            # a local that is neither used nor returned. It is removed so
            # this mode works; merging test rows into the training set has
            # to happen in the caller that owns the training frame.
            test_df = test_df[:100000]
        print(test_df.info())
        gc.collect()
    return test_df
Ejemplo n.º 2
0
def find_best_iteration_search(bst):
    """Print validation AUC and log-loss of ``bst`` for a range of iterations.

    The validation frame comes from ``load_valide_data()``; the feature
    columns in ``keras_train.USED_FEATURE_LIST`` are cast to
    ``DENSE_FEATURE_TYPE`` and the ``is_attributed`` column is used as the
    label. When ``FLAGS.stacking`` is set the features are first passed
    through ``gen_stacking_data``.

    ``FLAGS.search_iterations`` is a comma-separated string whose first three
    integers are used as ``range(start, stop, step)``. For every iteration
    count in that range the model predicts with ``num_iteration`` set to it
    and the scores are printed. Nothing is returned; picking the best
    iteration is left to whoever reads the output.

    Args:
        bst: a trained model exposing ``predict(data, num_iteration=...)``.
    """
    frame = load_valide_data()
    features = frame[keras_train.USED_FEATURE_LIST].values.astype(
        DENSE_FEATURE_TYPE)
    targets = frame['is_attributed'].values.astype(np.uint8)
    # Drop the DataFrame before predicting so only the arrays stay alive.
    del frame
    gc.collect()

    if FLAGS.stacking:
        features = gen_stacking_data(features)

    n_pos = targets.sum()
    n_neg = len(targets) - n_pos
    summary = "valide type: {0} valide size: {1} valide data pos: {2} neg: {3}"
    print(summary.format(features.dtype, len(features), n_pos, n_neg))

    with timer("finding best iteration..."):
        bounds = [
            int(token.strip())
            for token in FLAGS.search_iterations.split(',')
        ]
        start, stop, step = bounds[0], bounds[1], bounds[2]
        report = "Iteration: {0} AUC: {1} Logloss: {2}"
        for n_iter in range(start, stop, step):
            predicted = bst.predict(features, num_iteration=n_iter)
            auc = metrics.roc_auc_score(targets, predicted)
            logloss = metrics.log_loss(targets, predicted)
            print(report.format(n_iter, auc, logloss))
Ejemplo n.º 3
0
def neg_sample(input_data, labels, C=1):
    """Down-sample the negative class and return per-row weights.

    All rows with label 1 are kept. ``C * n_pos`` rows with label 0 are drawn
    *with replacement* using the module-level ``nrs`` random generator
    (presumably a ``numpy.random.RandomState`` -- confirm at its definition),
    then positives and drawn negatives are shuffled together.

    Args:
        input_data: feature matrix, shape (n_sample, feature_dim).
        labels: label vector, shape (n_sample,), with values 0 / 1.
        C: number of negatives to draw per positive
            (neg_number = C * pos_number).

    Returns:
        Tuple ``(sample_data, sample_labels, weight)`` where ``weight`` is 1
        for positives and ``1 / accept_rate`` for negatives, with
        ``accept_rate = C * n_pos / n_neg``.
    """
    with timer("Negative sampling"):
        print('Negative sampling...')
        positives = np.where(labels == 1)[0]
        negatives = np.where(labels == 0)[0]
        n_neg_wanted = len(positives) * C

        # Fraction of the original negatives that the draw corresponds to.
        accept_rate = float(C * len(positives)) / float(len(negatives))

        drawn_negatives = nrs.choice(negatives, n_neg_wanted, replace=True)
        keep = np.concatenate((positives, drawn_negatives))
        nrs.shuffle(keep)

        sample_data = input_data[keep, :]
        sample_labels = labels[keep]

        # Positives keep weight 1; each negative is up-weighted by the
        # inverse of the sampling rate.
        weight = np.ones(len(sample_labels))
        weight[np.where(sample_labels == 0)[0]] = 1.0 / accept_rate

        before_msg = '-----Neg Sampling Before All: {} Pos: {} Neg: {}'
        print(before_msg.format(
            len(labels), np.sum(labels == 1), np.sum(labels == 0)))
        after_msg = '-----Neg Sampling After All: {} Pos: {} Neg: {}'
        print(after_msg.format(
            len(sample_labels), np.sum(sample_labels == 1),
            np.sum(sample_labels == 0)))
        kept_fraction = float(len(sample_labels)) / float(len(labels))
        print('-----Neg Sampling Rate: {}'.format(kept_fraction))
    return sample_data, sample_labels, weight
Ejemplo n.º 4
0
def load_train_data():
    """Read the training CSV selected by the command-line flags.

    The file name is the module-level ``path`` plus ``train_Cnt_Id`` when
    ``FLAGS.split_train_val`` is set, otherwise ``train_part_Cnt_Neg20``.

    * ``FLAGS.debug``: read ``*_Top.ss.csv`` with pandas' default
      header/separator handling, keeping ``is_attributed`` and
      ``keras_train.USED_FEATURE_LIST``.
    * otherwise the full ``*.csv`` is read as a headerless, tab-separated
      file with columns ``['is_attributed'] + keras_train.DATA_HEADER`` and:
        - ``FLAGS.split_train_val``: the first
          ``184903890 - FLAGS.train_eval_len`` rows are skipped and only the
          label plus used features are kept (184903890 is presumably the
          total row count of the file -- confirm);
        - ``FLAGS.stacking``: only the label plus used features are kept;
        - neither: every column is loaded.

    Returns:
        The loaded training DataFrame.
    """
    with timer("loading train data"):
        print('loading train data...')
        path_prefix = ("train_Cnt_Id"
                       if FLAGS.split_train_val else "train_part_Cnt_Neg20")

        if FLAGS.debug:
            train_data_path = path + path_prefix + "_Top.ss.csv"
            read_options = dict(
                dtype=dtypes,
                usecols=['is_attributed'] + keras_train.USED_FEATURE_LIST,
            )
        else:
            train_data_path = path + path_prefix + ".csv"
            # Options shared by every full-file read.
            read_options = dict(
                dtype=dtypes,
                header=None,
                sep='\t',
                names=['is_attributed'] + keras_train.DATA_HEADER,
            )
            if FLAGS.split_train_val:
                read_options['skiprows'] = range(
                    0, 184903890 - FLAGS.train_eval_len)
                read_options['usecols'] = (
                    ['is_attributed'] + keras_train.USED_FEATURE_LIST)
            elif FLAGS.stacking:
                read_options['usecols'] = (
                    ['is_attributed'] + keras_train.USED_FEATURE_LIST)

        train_df = pd.read_csv(train_data_path, **read_options)
        print(train_df.info())
    return train_df
Ejemplo n.º 5
0
def predict_test(bst):
    """Score the test set with ``bst`` and write a submission CSV.

    The test frame comes from ``load_test_data()``. Features are the
    ``keras_train.USED_FEATURE_LIST`` columns cast to ``DENSE_FEATURE_TYPE``
    (passed through ``gen_stacking_data`` when ``FLAGS.stacking`` is set).
    Predictions use ``num_iteration=FLAGS.best_iteration``.

    The output has columns ``click_id`` and ``is_attributed`` and is written
    without the index to
    ``FLAGS.output_model_path + "sub_<UTC timestamp>.csv"``.

    Args:
        bst: a trained model exposing ``predict(data, num_iteration=...)``.
    """
    frame = load_test_data()
    feature_matrix = frame[keras_train.USED_FEATURE_LIST].values.astype(
        DENSE_FEATURE_TYPE)
    click_ids = frame['click_id'].values
    print("test type {0}".format(feature_matrix.dtype))
    # Only the arrays are needed from here on; drop the DataFrame.
    del frame
    gc.collect()

    if FLAGS.stacking:
        feature_matrix = gen_stacking_data(feature_matrix)

    with timer("predicting test data"):
        print('predicting test data...')
        submission = pd.DataFrame(click_ids, columns=['click_id'])
        scores = bst.predict(
            feature_matrix, num_iteration=FLAGS.best_iteration)
        submission['is_attributed'] = scores
        # The timestamp (UTC) keeps successive submissions apart.
        stamp = time.strftime('_%Y_%m_%d_%H_%M_%S', time.gmtime())
        file_name = "sub" + stamp + ".csv"
        submission.to_csv(FLAGS.output_model_path + file_name, index=False)
Ejemplo n.º 6
0
def load_valide_data():
    """Read the standalone validation CSV into a DataFrame.

    * ``FLAGS.debug``: read ``valide_Cnt_Top.ss.csv`` with pandas' default
      header/separator handling, keeping ``is_attributed`` and
      ``keras_train.USED_FEATURE_LIST``.
    * otherwise: read ``valide_Cnt.csv`` as a headerless, tab-separated file
      with columns ``['id', 'is_attributed'] + keras_train.DATA_HEADER``.

    Returns:
        The loaded validation DataFrame.

    Raises:
        RuntimeError: if ``FLAGS.split_train_val`` is set. In that mode this
            function has no file to read; it previously fell through to
            ``return valide_df`` and failed with an UnboundLocalError.
    """
    if FLAGS.split_train_val:
        raise RuntimeError(
            "load_valide_data() has no validation file to read when "
            "FLAGS.split_train_val is set")

    with timer("loading valide data"):
        print('loading valide data...')
        path_prefix = "valide_Cnt"
        if FLAGS.debug:
            valide_data_path = path + path_prefix + "_Top.ss.csv"
            valide_df = pd.read_csv(valide_data_path,
                                    dtype=dtypes,
                                    usecols=['is_attributed'] +
                                    keras_train.USED_FEATURE_LIST)
        else:
            valide_data_path = path + path_prefix + ".csv"
            valide_df = pd.read_csv(
                valide_data_path,
                dtype=dtypes,
                header=None,
                sep='\t',
                names=['id', 'is_attributed'] + keras_train.DATA_HEADER,
            )
        print(valide_df.info())
    return valide_df