Example #1
# Imports assumed by this excerpt (the scraped example omits them).
# GBT_params, make_classification and Calc_CV_ERROR are project-local
# helpers; minimal sketches of them appear further below.
import time
from time import strftime

import numpy as np
from sklearn import ensemble
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

def fabert_predict(train_data, labels, valid_data, test_data, output_dir, time_budget, target_num, is_sparse):
    print(strftime("%Y-%m-%d %H:%M:%S"))
    print("make multiclass prediction\n")
    # Seed NumPy's RNG from the wall clock and log the seed for reproducibility.
    np_seed = int(time.time())
    np.random.seed(np_seed)
    print("np seed =", np_seed)

    print("train_data.shape == (%d,%d)\n" % train_data.shape)
    n_features = train_data.shape[1]
    n_samples = train_data.shape[0]

    # A feature-selection pass (ExtraTreesClassifier importances filtered by an
    # importance threshold) was tried here and commented out; the active version
    # of that block lives in multiclass_predict below.

    ######################### Make validation/test predictions
    n_features = train_data.shape[1]
    # Always use sqrt(n_features) here; the "use all features when
    # n_features < 100" branch from multiclass_predict below is bypassed.
    gbt_features = int(n_features ** 0.5)
    # Fixed iteration count; the budget-scaled formula used in
    # multiclass_predict is disabled for this dataset:
    #   int((time_budget / 3000.) * 3000000 / (gbt_features * target_num) * (7000. / n_samples))
    gbt_iterations = 15000
    gbt_params = GBT_params(
        n_iterations=gbt_iterations,
        depth=4,
        learning_rate=0.01,
        subsample_part=0.6,
        n_max_features=gbt_features,
        min_samples_split=5,
        min_samples_leaf=3,
    )
    gbt_params.print_params()
    (y_valid, y_test) = make_classification(gbt_params, train_data, labels, valid_data, test_data)
    print ("y_valid.shape = ", y_valid.shape)
    print ("y_test.shape = ", y_test.shape)
    return (y_valid, y_test)
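

# ---------------------------------------------------------------------------
# GBT_params and make_classification are project helpers that this excerpt
# does not include. Below is a minimal sketch of what they could look like,
# assuming GBT_params is a plain hyperparameter container and
# make_classification fits one gradient-boosted model and returns the
# (valid, test) class-probability matrices; this is a reconstruction, not the
# original implementation. Note that the helper shadows the name of sklearn's
# unrelated make_classification dataset generator.
class GBT_params:
    """Hyperparameter container for GradientBoostingClassifier (sketch)."""

    def __init__(self, n_iterations=100, depth=3, learning_rate=0.1,
                 subsample_part=1.0, n_max_features=None,
                 min_samples_split=2, min_samples_leaf=1):
        self.n_iterations = n_iterations
        self.depth = depth
        self.learning_rate = learning_rate
        self.subsample_part = subsample_part
        self.n_max_features = n_max_features
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def print_params(self):
        print(vars(self))


def make_classification(params, train_data, labels, valid_data, test_data):
    """Fit one GBT model; return (valid, test) probability matrices (sketch)."""
    clf = ensemble.GradientBoostingClassifier(
        n_estimators=params.n_iterations,
        max_depth=params.depth,
        learning_rate=params.learning_rate,
        subsample=params.subsample_part,
        max_features=params.n_max_features,
        min_samples_split=params.min_samples_split,
        min_samples_leaf=params.min_samples_leaf,
    )
    clf.fit(train_data, labels)
    return clf.predict_proba(valid_data), clf.predict_proba(test_data)
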
def multiclass_predict(train_data, labels, valid_data, test_data, output_dir, time_budget, target_num, is_sparse):
    print(strftime("%Y-%m-%d %H:%M:%S"))
    print("make multiclass prediction\n")
    # Seed NumPy's RNG from the wall clock and log the seed for reproducibility.
    np_seed = int(time.time())
    np.random.seed(np_seed)
    print("np seed =", np_seed)

    print("train_data.shape == (%d,%d)\n" % train_data.shape)
    n_features = train_data.shape[1]
    n_samples = train_data.shape[0]
    start_time = time.time()
    if is_sparse:
        # Densify sparse inputs. The "no FS" message is legacy: the
        # feature-selection block below still runs on the densified data.
        print("no FS, it is sparse data\n")
        train_data = train_data.toarray()
        valid_data = valid_data.toarray()
        test_data = test_data.toarray()
        print("sparse converting time = ", time.time() - start_time)
        start_time = time.time()

    # Size the feature-selection forest inversely to problem size (heuristic).
    FS_iterations = max(1, int(5000 / target_num * (5000. / n_samples) * 2000. / n_features))
    print("FS_iterations = %d\n" % FS_iterations)
    select_clf = ExtraTreesClassifier(n_estimators=FS_iterations, max_depth=3)
    select_clf.fit(train_data, labels)
    print("FS time = ", time.time() - start_time)

    # Keep features whose importance exceeds 1/(10 * n_features). The original
    # called select_clf.transform(X, threshold=...), an API since removed from
    # scikit-learn; SelectFromModel is the current equivalent.
    my_mean = 1. / (10 * n_features)
    selector = SelectFromModel(select_clf, threshold=my_mean, prefit=True)
    train_data = selector.transform(train_data)
    valid_data = selector.transform(valid_data)
    test_data = selector.transform(test_data)
    print(my_mean)
    print(train_data.shape)

    ######################### Make validation/test predictions
    n_features = train_data.shape[1]
    if n_features < 100:
        gbt_features = n_features
    else:
        gbt_features = int(n_features ** 0.5)
    # Scale the boosting budget with the time budget and problem size; clamp to
    # >= 1 so np.log2 in the depth formula never sees a non-positive value, and
    # clamp the depth to >= 1 so GradientBoostingClassifier accepts it.
    gbt_iterations = max(1, int((time_budget / 3000.) * 3000000 / (gbt_features * target_num) * (7000. / n_samples)))
    gbt_params = GBT_params(
        n_iterations=gbt_iterations,
        depth=max(1, int(10 * np.log2(gbt_iterations) / 14.3)),
        learning_rate=0.01,
        subsample_part=0.6,
        n_max_features=gbt_features,
        min_samples_split=5,
        min_samples_leaf=3,
    )
    gbt_params.print_params()
    (y_valid, y_test) = make_classification(gbt_params, train_data, labels, valid_data, test_data)
    print("y_valid.shape = ",y_valid.shape )
    print("y_test.shape = ",y_test.shape )
    return (y_valid, y_test)
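

# ---------------------------------------------------------------------------
# Usage sketch (assumed, not from the original code): drive multiclass_predict
# with synthetic data. sklearn's generator is imported under an alias because
# this file defines its own make_classification with a different signature.
# The iteration formulas above were sized for AutoML-challenge-scale problems,
# so the shapes and the small time_budget here are picked only so a smoke test
# finishes in reasonable time.
def _demo_multiclass_predict():
    from sklearn.datasets import make_classification as make_synthetic_data

    X, y = make_synthetic_data(n_samples=12000, n_features=1000,
                               n_informative=20, n_classes=10, random_state=0)
    X_train, y_train = X[:10000], y[:10000]
    X_valid, X_test = X[10000:11000], X[11000:]
    y_valid_pred, y_test_pred = multiclass_predict(
        X_train, y_train, X_valid, X_test,
        output_dir=".", time_budget=30, target_num=10, is_sparse=False)
    print(y_valid_pred.shape, y_test_pred.shape)
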
def make_cross_validation(
    data, solution, cv_folds, params_begin, params_mult_factor, params_add_factor, params_num_iter
):
    params = GBT_params()  # working copy; each field is advanced by the loop nest below
    cv_iterations = (
        params_num_iter.n_iterations
        * params_num_iter.depth
        * params_num_iter.learning_rate
        * params_num_iter.subsample_part
        * params_num_iter.n_max_features
    )
    cv_res = np.zeros(cv_iterations)
    cv_times = np.zeros(cv_iterations)

    cur_iter = 0
    params.n_iterations = params_begin.n_iterations
    for n_iterations in range(params_num_iter.n_iterations):
        params.learning_rate = params_begin.learning_rate
        for n_learning_rate in range(params_num_iter.learning_rate):
            params.depth = params_begin.depth
            for n_max_depth in range(params_num_iter.depth):
                params.subsample_part = params_begin.subsample_part
                for subsample_part in range(params_num_iter.subsample_part):
                    params.n_max_features = params_begin.n_max_features
                    for n_max_features in range(params_num_iter.n_max_features):
                        start_time = time.time()
                        params.print_params()
                        clf = ensemble.GradientBoostingClassifier(
                            n_estimators=params.n_iterations,
                            learning_rate=params.learning_rate,
                            max_depth=params.depth,
                            subsample=params.subsample_part,
                            max_features=int(params.n_max_features),
                        )
                        cv_res[cur_iter] = Calc_CV_ERROR(clf, data, solution, cv_folds)
                        print ("CV score = %1.5", cv_res[cur_iter])
                        cv_times[cur_iter] = time.time() - start_time
                        print ("CV time = %d", cv_times[cur_iter])
                        params.n_max_features *= params_mult_factor.n_max_features
                        params.n_max_features += params_add_factor.n_max_features
                        cur_iter += 1
                    params.subsample_part *= params_mult_factor.subsample_part
                    params.subsample_part += params_add_factor.subsample_part
                params.depth *= params_mult_factor.depth
                params.depth += params_add_factor.depth
            params.learning_rate *= params_mult_factor.learning_rate
            params.learning_rate += params_add_factor.learning_rate
        params.n_iterations *= params_mult_factor.n_iterations
        params.n_iterations += params_add_factor.n_iterations
    return (cv_res, cv_times)
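

# ---------------------------------------------------------------------------
# Calc_CV_ERROR is another project helper missing from this excerpt. A minimal
# sketch, assuming it returns the mean k-fold cross-validation score of clf on
# (data, solution); the metric the original code used is unknown, so sklearn's
# default scorer stands in here.
def Calc_CV_ERROR(clf, data, solution, cv_folds):
    from sklearn.model_selection import cross_val_score
    return cross_val_score(clf, data, solution, cv=cv_folds).mean()


# Grid-walk usage sketch (all values assumed): each hyperparameter starts at
# its params_begin field and, after every step along its axis, is multiplied
# by the matching params_mult_factor field and incremented by the
# params_add_factor field; params_num_iter sets the number of steps per axis,
# so make_cross_validation visits the full Cartesian grid (2*1*2*1*2 = 8 grid
# points below, each cross-validated with 3 folds).
def _demo_cross_validation():
    from sklearn.datasets import make_classification as make_synthetic_data

    X, y = make_synthetic_data(n_samples=500, n_features=20, n_informative=8,
                               n_classes=3, random_state=0)
    params_begin = GBT_params(n_iterations=50, depth=2, learning_rate=0.05,
                              subsample_part=0.8, n_max_features=4)
    params_mult = GBT_params(n_iterations=2, depth=1, learning_rate=1.0,
                             subsample_part=1.0, n_max_features=2)
    params_add = GBT_params(n_iterations=0, depth=1, learning_rate=0.0,
                            subsample_part=0.0, n_max_features=0)
    params_steps = GBT_params(n_iterations=2, depth=2, learning_rate=1,
                              subsample_part=1, n_max_features=2)
    cv_res, cv_times = make_cross_validation(X, y, 3, params_begin,
                                             params_mult, params_add,
                                             params_steps)
    print(cv_res, cv_times)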