Example #1
0
def cotraining (model_one, model_two, n_iter = 100) :
    """
    Co-train two models: fit both on the labeled pool, then repeatedly let
    each model label unlabeled samples for the other's training set.

    Parameters
    ----------
    model_one, model_two : estimators exposing fit / predict_proba
        The two learners that teach each other.
    n_iter : int
        Number of outer co-training rounds (default 100).

    Side effects: after every inner step, dumps both fitted models and the
    averaged / per-model test predictions under ROOT + '/result/...'.
    """
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data ()

    # Slice the stacked matrix into its four contiguous segments.
    train = data[:train_number,:]
    # Fixed a stray second ':' in this slice (harmless empty step, but
    # inconsistent with the identical slice in the __main__ script).
    validation = data[train_number:train_number+val_number,:]
    test = data[train_number+val_number:-unlabel_number,:]
    unlabel = data[-unlabel_number:,:]

    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold (train, label, validation, test, unlabel)
    # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel)

    # Each model keeps an independent copy of the labeled pool so samples
    # adopted by one side do not silently mutate the other's training data.
    train_one = copy.deepcopy (train)
    label_one = copy.deepcopy (label)
    train_two = copy.deepcopy (train)
    label_two = copy.deepcopy (label)

    model_one.fit (train_one, label_one)
    model_two.fit (train_two, label_two)

    # 'iteration' instead of 'iter' — avoids shadowing the builtin.
    for iteration in xrange (1 , n_iter + 1 , 1) :
        logging.info ('#%d iter for co-training :' % iteration)

        # -1 marks a still-unlabeled sample; unlabel_index shrinks as
        # samples get adopted by either model inside training().
        unlabel_label = [-1] * unlabel_number
        unlabel_index = range (0, unlabel_number)
        step = 0
        while len (unlabel_index) > 0 :
            step += 1
            logging.info ('co-training step #%d , remaining unlabel: %d' % (step, len (unlabel_index)))
            # One pass in each direction: the first model labels samples
            # for the second's training set, then vice versa.
            model_one, model_two, unlabel_label, unlabel_index, train_two, label_two = training (model_one, model_two, unlabel, unlabel_label, unlabel_index, train_two, label_two)
            model_two, model_one, unlabel_label, unlabel_index, train_one, label_one = training (model_two, model_one, unlabel, unlabel_label, unlabel_index, train_one, label_one)

            # Validation AUC for each model alone and for their average.
            evaluate.get_auc (model_one.predict_proba (validation)[:,1])
            evaluate.get_auc (model_two.predict_proba (validation)[:,1])
            evaluate.get_auc ((model_one.predict_proba (validation)[:,1] + model_two.predict_proba (validation)[:,1]) / 2.0)

            # Checkpoint both models every step so a long run can resume.
            joblib.dump (model_one, ROOT + '/result/model/model_one_%d_%d.pkl' % (iteration, step))
            joblib.dump (model_two, ROOT + '/result/model/model_two_%d_%d.pkl' % (iteration, step))

            # Emit test-set predictions: ensembled average plus each model.
            evaluate.output (uid, (model_one.predict_proba (test)[:,1] + model_two.predict_proba (test)[:,1]) / 2.0, ROOT + '/result/predict/cotraining_%d_%d.csv' % (iteration, step))
            evaluate.output (uid, model_one.predict_proba (test)[:,1], ROOT + '/result/predict/model_one_%d_%d.csv' % (iteration, step))
            evaluate.output (uid, model_two.predict_proba (test)[:,1], ROOT + '/result/predict/model_two_%d_%d.csv' % (iteration, step))
Example #2
0
sys.path.insert(0, '../..')
import feature.splitvalue as split
import model.evaluate as evaluate


if __name__ == '__main__' :
    # Load the stacked data matrix plus the segment sizes, labels and ids,
    # then cut it into the four contiguous splits.
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data ()
    train = data[:train_number,:]
    validation = data[train_number:train_number+val_number,:]
    test = data[train_number+val_number:-unlabel_number,:]
    unlabel = data[-unlabel_number:,:]

    # Ground-truth labels for the validation split, stored separately on disk.
    val_label = pd.read_csv ('../../data/val_cv_y.csv').y.values

    # Persist the raw split, run GBDT-based dimensionality reduction,
    # then persist the reduced split under a second name.
    io.store ([train, label, validation, val_label, test, unlabel], '../../data/data_standard')
    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold (train, label, validation, test, unlabel)
    io.store ([train, label, validation, val_label, test, unlabel], '../../data/data_standard_decompose')
    # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel)
    
    # Reload a split and sanity-check shapes against the labels.
    # NOTE(review): this grabs 'data_standard' (the NON-decomposed file)
    # right after storing the decomposed one — confirm which version the
    # downstream code is meant to consume.
    train_data, train_label, validation_data, validation_label, test, unlabel = io.grab ('../../data/data_standard')
    print 'training set:' , train_data.shape
    print 'validation set: ' , validation_data.shape
    print 'testing set', test.shape
    print 'unlabel set', unlabel.shape

    assert train_data.shape[0] == len (train_label)
    assert validation_data.shape[0] == len (validation_label)



    """