def find_threshold_split_pos_neg(): """ """ data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data() predict = logistic.lr_solver( data[:train_number, :], label, data[train_number:-test_number, :], data[-test_number:, :], decomposition.gbdt_dimreduce_threshold, split.split_continuum_value_tvt, ) print roc_auc_score(label, predict) neg = [] pos = [] for i in xrange(len(label)): if label[i] == 0: neg.append(predict[i]) else: pos.append(predict[i]) predict.sort() pos.sort() neg.sort() with open(ROOT + "/result/pos", "w") as out: for each in pos: out.write(str(each) + "\n") with open(ROOT + "/result/neg", "w") as out: for each in neg: out.write(str(each) + "\n") print predict[len(neg)]
def cotraining(model_one, model_two, n_iter=100):
    """Co-train two classifiers on the unlabeled pool.

    Each model alternately hands confidently-labeled unlabeled samples to
    the OTHER model's training set (via the `training` helper) until the
    pool is exhausted; validation AUCs are reported, models checkpointed,
    and test predictions written out along the way.

    Args:
        model_one, model_two: sklearn-style estimators (fit / predict_proba).
        n_iter: number of full passes over the unlabeled pool.
    """
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]
    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold(
        train, label, validation, test, unlabel)

    # Each model keeps its own independently-growing labeled training set.
    train_one = copy.deepcopy(train)
    label_one = copy.deepcopy(label)
    train_two = copy.deepcopy(train)
    label_two = copy.deepcopy(label)
    model_one.fit(train_one, label_one)
    model_two.fit(train_two, label_two)

    # `iteration` (not `iter`) — avoid shadowing the builtin.
    for iteration in xrange(1, n_iter + 1):
        logging.info('#%d iter for co-training :' % iteration)
        unlabel_label = [-1] * unlabel_number  # -1 == not yet labeled
        unlabel_index = range(0, unlabel_number)
        step = 0
        while len(unlabel_index) > 0:
            step += 1
            logging.info('co-training step #%d , remaining unlabel: %d'
                         % (step, len(unlabel_index)))
            # Model one labels samples that are appended to model two's
            # training set, then vice versa.
            model_one, model_two, unlabel_label, unlabel_index, train_two, label_two = training(
                model_one, model_two, unlabel, unlabel_label, unlabel_index, train_two, label_two)
            model_two, model_one, unlabel_label, unlabel_index, train_one, label_one = training(
                model_two, model_one, unlabel, unlabel_label, unlabel_index, train_one, label_one)
            # NOTE(review): loop nesting reconstructed from a mangled
            # one-line source; per-step evaluation/checkpointing is inferred
            # from the *_%d_%d filenames — confirm against VCS history.
            evaluate.get_auc(model_one.predict_proba(validation)[:, 1])
            evaluate.get_auc(model_two.predict_proba(validation)[:, 1])
            evaluate.get_auc((model_one.predict_proba(validation)[:, 1]
                              + model_two.predict_proba(validation)[:, 1]) / 2.0)
            joblib.dump(model_one, ROOT + '/result/model/model_one_%d_%d.pkl' % (iteration, step))
            joblib.dump(model_two, ROOT + '/result/model/model_two_%d_%d.pkl' % (iteration, step))
            evaluate.output(uid, (model_one.predict_proba(test)[:, 1]
                                  + model_two.predict_proba(test)[:, 1]) / 2.0,
                            ROOT + '/result/predict/cotraining_%d_%d.csv' % (iteration, step))
            evaluate.output(uid, model_one.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_one_%d_%d.csv' % (iteration, step))
            evaluate.output(uid, model_two.predict_proba(test)[:, 1],
                            ROOT + '/result/predict/model_two_%d_%d.csv' % (iteration, step))
# NOTE(review): this chunk was mangled onto one physical line and the
# enclosing `def` header is not visible; the signature below is
# reconstructed from the 7-argument call in the __main__ guard — confirm
# against VCS history before merging.
def rf_solver(train_data, train_label, validation, test, unlabel,
              feature_extract, feature_handler):
    """Random-forest solver: reduce features, transform, fit, report AUC.

    Args:
        train_data, validation, test, unlabel: row-partitioned feature matrices.
        train_label: labels aligned with train_data.
        feature_extract: dimensionality-reduction callable
            (e.g. decomposition.gbdt_dimreduce_threshold).
        feature_handler: feature post-processing callable (e.g. split.undo).

    Returns:
        Positive-class probabilities for the (transformed) train_data rows.
    """
    train_data, validation, test, unlabel = feature_extract(
        train_data, train_label, validation, test, unlabel)
    train_data, validation, test, unlabel = feature_handler(
        train_data, validation, test, unlabel)
    # warm_start=True is a no-op here: fit() is called exactly once and
    # n_estimators is never increased afterwards.
    rf = RandomForestClassifier(warm_start=True, n_jobs=2, n_estimators=2000,
                                max_depth=3, min_samples_split=50)
    rf.fit(train_data, train_label)
    evaluate.get_auc(rf.predict_proba(validation)[:, 1])
    # NOTE(review): this scores the TRAINING rows, yet __main__ writes the
    # result keyed by uid — verify the intended split (test vs. train)
    # before trusting result/rf.csv.
    return rf.predict_proba(train_data)[:, 1]


if __name__ == "__main__":
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    # Sanity check: the four partitions must tile the full dataset.
    assert data.shape[0] == train_number + test_number + val_number + unlabel_number
    predict = rf_solver(
        data[:train_number, :],
        label,
        data[train_number : train_number + val_number, :],
        data[train_number + val_number : -unlabel_number, :],
        data[-unlabel_number:, :],
        decomposition.gbdt_dimreduce_threshold,
        split.undo,
    )
    evaluate.output(uid, predict, ROOT + "/result/rf.csv")