# Example #1
# 0
def find_threshold_split_pos_neg():
    """
    """
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    predict = logistic.lr_solver(
        data[:train_number, :],
        label,
        data[train_number:-test_number, :],
        data[-test_number:, :],
        decomposition.gbdt_dimreduce_threshold,
        split.split_continuum_value_tvt,
    )
    print roc_auc_score(label, predict)
    neg = []
    pos = []
    for i in xrange(len(label)):
        if label[i] == 0:
            neg.append(predict[i])
        else:
            pos.append(predict[i])
    predict.sort()
    pos.sort()
    neg.sort()
    with open(ROOT + "/result/pos", "w") as out:
        for each in pos:
            out.write(str(each) + "\n")
    with open(ROOT + "/result/neg", "w") as out:
        for each in neg:
            out.write(str(each) + "\n")

    print predict[len(neg)]
# Example #2
# 0
def cotraining(model_one, model_two, n_iter=100):
    """Co-train two classifiers on labeled plus unlabeled data.

    Each outer iteration repeatedly calls ``training`` so that each model
    labels unlabeled samples for the other (the returned ``train_two`` /
    ``label_two`` etc. suggest the newly labeled samples are appended to
    the partner's training set — confirm against ``training``), evaluates
    both models and their average on the validation set, and checkpoints
    models and test predictions to disk after every step.

    Parameters:
        model_one, model_two: sklearn-style estimators (``fit`` /
            ``predict_proba``).
        n_iter: number of outer co-training iterations (default 100).
    """
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()

    # Slice the stacked data matrix into its four row segments.
    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]

    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold(train, label, validation, test, unlabel)
    # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel)

    # Each model keeps its own independent, growing copy of the labeled set.
    train_one = copy.deepcopy(train)
    label_one = copy.deepcopy(label)
    train_two = copy.deepcopy(train)
    label_two = copy.deepcopy(label)

    model_one.fit(train_one, label_one)
    model_two.fit(train_two, label_two)

    # Renamed loop variable from `iter` — it shadowed the builtin.
    for iteration in xrange(1, n_iter + 1, 1):
        logging.info('#%d iter for co-training :' % iteration)

        unlabel_label = [-1] * unlabel_number  # -1 marks "not yet labeled"
        unlabel_index = range(0, unlabel_number)
        step = 0
        while len(unlabel_index) > 0:
            step += 1
            # Fixed log-message typo: "reamining" -> "remaining".
            logging.info('co-training step #%d , remaining unlabel: %d' % (step, len(unlabel_index)))
            # Model one labels for model two, then the roles are swapped.
            model_one, model_two, unlabel_label, unlabel_index, train_two, label_two = training(model_one, model_two, unlabel, unlabel_label, unlabel_index, train_two, label_two)
            model_two, model_one, unlabel_label, unlabel_index, train_one, label_one = training(model_two, model_one, unlabel, unlabel_label, unlabel_index, train_one, label_one)

            # Validation AUC for each model and for their average.
            evaluate.get_auc(model_one.predict_proba(validation)[:, 1])
            evaluate.get_auc(model_two.predict_proba(validation)[:, 1])
            evaluate.get_auc((model_one.predict_proba(validation)[:, 1] + model_two.predict_proba(validation)[:, 1]) / 2.0)

            # Checkpoint both models after every step.
            joblib.dump(model_one, ROOT + '/result/model/model_one_%d_%d.pkl' % (iteration, step))
            joblib.dump(model_two, ROOT + '/result/model/model_two_%d_%d.pkl' % (iteration, step))

            # Dump test-set predictions: averaged ensemble plus each model alone.
            evaluate.output(uid, (model_one.predict_proba(test)[:, 1] + model_two.predict_proba(test)[:, 1]) / 2.0, ROOT + '/result/predict/cotraining_%d_%d.csv' % (iteration, step))
            evaluate.output(uid, model_one.predict_proba(test)[:, 1], ROOT + '/result/predict/model_one_%d_%d.csv' % (iteration, step))
            evaluate.output(uid, model_two.predict_proba(test)[:, 1], ROOT + '/result/predict/model_two_%d_%d.csv' % (iteration, step))
# Example #3
# 0
    # train_data = train_data[:100,:]
    # validation = validation[:100,:]
    # test = test[:100,:]
    # train_label = train_label[:100]
    train_data, validation, test, unlabel = feature_extract(train_data, train_label, validation, test, unlabel)
    # print new_train_data.shape
    train_data, validation, test, unlabel = feature_handler(train_data, validation, test, unlabel)

    rf = RandomForestClassifier(warm_start=True, n_jobs=2, n_estimators=2000, max_depth=3, min_samples_split=50)
    rf.fit(train_data, train_label)
    # joblib.dump (rf, ROOT + '/result/rf.pkl')
    evaluate.get_auc(rf.predict_proba(validation)[:, 1])
    return rf.predict_proba(train_data)[:, 1]


if __name__ == "__main__":
    # Load the full stacked data matrix plus the sizes of its row segments
    # (train / validation / test / unlabeled) and per-row metadata.
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    # Sanity check: the four segments must exactly tile the data rows.
    # NOTE(review): assert is stripped under `python -O`; an explicit raise
    # would be safer if this check matters in production.
    assert data.shape[0] == train_number + test_number + val_number + unlabel_number
    # Run the random-forest solver over the sliced segments, using GBDT-based
    # dimensionality reduction and no value splitting (split.undo).
    predict = rf_solver(
        data[:train_number, :],
        label,
        data[train_number : train_number + val_number, :],
        data[train_number + val_number : -unlabel_number, :],
        data[-unlabel_number:, :],
        decomposition.gbdt_dimreduce_threshold,
        split.undo,
    )

    # Write (uid, prediction) pairs as the submission/result CSV.
    evaluate.output(uid, predict, ROOT + "/result/rf.csv")