def make_submission():
    """Train a random forest on the training data and write a submission.

    Steps, in order:

    1. Extract features and labels from ``data/train.csv`` (the extractor is
       called with ``limit=5e6``).
    2. Fit a 50-tree ``RandomForestClassifier`` against
       ``status['OpenStatus']``.
    3. Print the feature importances, largest first, and the out-of-bag score.
    4. Extract features from ``'data/' + test_file`` and predict class
       probabilities for them.
    5. Adjust the probabilities with ``cu.cap_and_update_priors``, using the
       priors read from ``train_file`` (old) and ``full_train_file`` (new).
    6. Write the adjusted probabilities to ``submission_file``.

    Takes no arguments and returns ``None``; it relies on the module-level
    names ``features``, ``cu``, ``RandomForestClassifier``, ``test_file``,
    ``train_file``, ``full_train_file`` and ``submission_file``.
    """
    # NOTE(review): another ``make_submission`` definition appears later in
    # this source; if both live in the same module, the later one replaces
    # this one at import time -- confirm which variant is meant to run.
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv', limit=5e6)

    print("Training the model")
    # NOTE(review): ``compute_importances`` is only accepted by old
    # scikit-learn releases; it is kept because the rest of the project
    # appears to target that era -- confirm against the pinned version.
    rf = RandomForestClassifier(
        n_estimators=50,
        verbose=2,
        compute_importances=True,
        oob_score=True,
        n_jobs=1,
    )
    rf.fit(fea, status['OpenStatus'])

    # Single-argument print(...) calls emit the same text under Python 2
    # (parenthesised expression) and Python 3 (function call).
    print("Features Importance:")
    # sorted() instead of zip(...).sort(): zip returns an iterator on Python 3.
    imps = sorted(zip(rf.feature_importances_, fea.keys()), reverse=True)
    print('\n'.join(str(imp) for imp in imps))
    # The value printed is the forest's out-of-bag *score* (higher is better),
    # despite the "Error" wording of the label.
    print("Generalization Error: %s" % (rf.oob_score_,))

    print("Reading test file and making predictions")
    test_features = features.online_extract_features(
        'data/' + test_file, train=False, limit=1e12)[0]
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    """Train a two-level classifier and write a submission.

    Level 1 ("open/rest") is trained on labels produced by applying
    ``or_binarize`` to every ``status['OpenStatus']`` value.  Level 2 is
    trained only on the training rows whose binarized label is not
    ``'open'``, against their original ``OpenStatus`` values.

    At prediction time each test row is scored by level 1 first.  If the
    first level-1 probability exceeds the second, the row gets the fixed
    vector ``[1, 0, 0, 0, 0]``.  Otherwise the row is scored by level 2 and
    a ``0.0`` is inserted in front of the level-2 probabilities.

    The prior-update step is present but disabled in this variant.  The
    result is written to ``submission_file``.

    Takes no arguments and returns ``None``; it relies on the module-level
    names ``features``, ``cu``, ``np``, ``learn``, ``or_binarize``, ``_dim``,
    ``test_file``, ``train_file``, ``full_train_file`` and
    ``submission_file``.
    """
    print("Reading data")
    fea, status = features.online_extract_features(
        'data/train.csv', train=True, limit=1e9)
    _dim(fea, 'fea')

    print("Training Level 1 : Open/Rest model")
    labels = status['OpenStatus']
    open_status = [or_binarize(label) for label in labels]
    is_not_open_status = [s != 'open' for s in open_status]
    or_model = learn(fea, open_status)

    print("Training Level 2 : Not Open Split model")
    # Labels are still fetched by position, exactly as before, so any
    # index-based lookup semantics of ``labels`` are preserved.
    not_open_status = [
        labels[i] for i, keep in enumerate(is_not_open_status) if keep
    ]
    # Boolean-mask selection of the matching feature rows.
    no_fea = fea[is_not_open_status]
    _dim(no_fea, 'no_fea')
    no_model = learn(no_fea, not_open_status)

    print("Reading test file and making predictions")
    test_features = features.online_extract_features(
        'data/' + test_file, train=False, limit=1e9)[0]
    _dim(test_features, 'test_features')
    or_probs = or_model.predict_proba(test_features)

    # Loop-invariant: the feature columns are the same for every row.
    columns = list(test_features.keys())
    probs = []
    for i, or_prob in enumerate(or_probs):
        # NOTE(review): assumes column 0 of the level-1 output is the "open"
        # class and that "open" is also column 0 of the submission -- confirm
        # against the class order of ``or_model``.
        if or_prob[0] > or_prob[1]:
            probs.append(np.array([1.0, 0.0, 0.0, 0.0, 0.0]))
        else:
            row = [test_features[col][i] for col in columns]
            # Pass the sample as a one-row 2-D batch: a flat 1-D sample is
            # rejected by newer scikit-learn estimators.  The result is
            # unchanged because np.insert without ``axis`` flattens it.
            level2_probs = no_model.predict_proba([row])
            probs.append(np.insert(level2_probs, 0, [0.0]))
    probs = np.array(probs)

    # Prior correction is switched off in this variant; set to True to
    # re-enable it.
    update_priors = False
    if update_priors:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)