def make_submission():
    """Train a random forest on online-extracted features and write a submission.

    NOTE(review): relies on module-level names (features, cu,
    RandomForestClassifier, test_file, full_train_file, train_file,
    submission_file) defined elsewhere in the file.
    """
    print("Reading data")
    # limit caps the number of rows read from the raw training CSV
    fea, status = features.online_extract_features('data/train.csv', limit=5e6)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=1)
    rf.fit(fea, status['OpenStatus'])
    print "Features Importance:"
    # Pair each importance with its feature name, sorted descending
    imps = zip(rf.feature_importances_, fea.keys())
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    # oob_score_ is the out-of-bag accuracy estimate from training
    print "Generalization Error:", rf.oob_score_
    print("Reading test file and making predictions")
    # Effectively-unbounded limit: read the whole test file
    test_features = features.online_extract_features('data/'+test_file, train=False, limit=1e12)[0]
    probs = rf.predict_proba(test_features)
    if True:  # always rescale posteriors to the full-train priors
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Train an ExtraTrees classifier, predict on the test file, optionally
    rescale posteriors to updated priors, write the submission, and report
    the elapsed wall-clock time."""
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])
    print "Listing feature importances:"
    cu.list_feature_importance(clf,feature_names)
    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)
    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)
def main():
    """Train a random forest on extracted features, predict on the test set,
    rescale the posteriors to the updated priors, write the submission file,
    and finally report the score."""
    print("Reading the data")
    frame = cu.get_dataframe(train_file)
    print("Extracting features")
    train_matrix = features.extract_features(feature_names, frame)
    print("Training the model")
    forest = RandomForestClassifier(n_estimators=50,
                                    verbose=2,
                                    compute_importances=True,
                                    n_jobs=-1)
    forest.fit(train_matrix, frame["OpenStatus"])
    print("Reading test file and making predictions")
    frame = cu.get_dataframe(test_file)
    test_matrix = features.extract_features(feature_names, frame)
    probs = forest.predict_proba(test_matrix)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    score()
def main():
    """Train a random forest on a precomputed feature file and write a
    submission for the test set."""
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    features.compute_features(train_file,feature_file1)
    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # NOTE(review): hard-coded slice assumes the feature file holds exactly
    # 140323 rows aligned with the head of the training frame -- verify.
    rf.fit(fea, data["OpenStatus"][:140323])
    print("Reading test file and making predictions")
    features.compute_features(test_file,feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Train an SVM classifier on a precomputed feature file and write a
    submission.

    Bug fix: svm.SVC must be constructed with probability=True, otherwise
    predict_proba() is unavailable and the prediction step raises.
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)
    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    # probability=True enables Platt-scaled predict_proba on SVC
    clf = svm.SVC(decision_function_shape='ovo', probability=True)
    # NOTE(review): hard-coded slice assumes the feature file holds exactly
    # 178351 rows aligned with the head of the training frame -- verify.
    clf.fit(fea, data["OpenStatus"][:178351])
    print("Reading test file and making predictions")
    #features.compute_features("test_.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = clf.predict_proba(test_fea)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Fit a MultiTaskElasticNet regressor on extracted features and write a
    submission built from its reshaped raw predictions."""
    start = time.time()
    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)
    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []
    print "Collecting statuses"
    # Encode each status string as its index in ques_status
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    print "Fitting"
    mten.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html '''
    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)
    print "Making predictions"
    global probs
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    # NOTE(review): np.resize keeps the total element count, so this yields
    # len(probs)/5 rows of 5 -- confirm this matches the intended number of
    # submission rows.
    probs = np.resize(probs, (len(probs) / 5, 5))
    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") X = features.extract_features(feature_names, data) y = [ class_labels[i] for i in data["OpenStatus"]] skf = StratifiedKFold(y, 10) result_f1 = 0 result_logloss = 0 fold = 1 for train, test in skf: print "Fold %d" % fold fold+=1 X_train = [X.ix[i] for i in train] y_train = [y[i] for i in train] X_test = [X.ix[i] for i in test] y_test = [y[i] for i in test] if (options.__dict__['classifier'] == 'erf'): classifier = ExtraTreesClassifier(n_estimators=100, verbose=0, compute_importances=True, n_jobs=-1) elif(options.__dict__['classifier'] == 'mnb'): classifier = MultinomialNB() elif (options.__dict__['classifier'] == 'knn'): classifier = KNeighborsClassifier(n_neighbors=11) elif (options.__dict__['classifier'] == 'gbc'): classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1) classifier.fit(X_train, y_train) probs = classifier.predict_proba(X_test) if (options.__dict__['priors'] != 0): print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) y_pred = probs y_test = np.array(y_test) logloss = multiclass_log_loss(y_test, y_pred, eps=1e-15) y_pred = classifier.predict(X_test) f1 = f1_score(y_test, y_pred, pos_label=None) print "Log Loss: %f f1: %f" % (logloss, f1) result_f1 += f1 result_logloss += logloss print '\navg LogLoss: %f avg f1: %f' % (result_logloss/10.0, result_f1/10.0)
def main():
    """Fit a MultiTaskElasticNet regressor on extracted features and write a
    submission built from its reshaped raw predictions."""
    start = time.time()
    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names,data)
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)
    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []
    print "Collecting statuses"
    # Encode each status string as its index in ques_status
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)
    print "Fitting"
    mten.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html '''
    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names,test_data)
    print "Making predictions"
    global probs
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    # NOTE(review): np.resize keeps the total element count, so this yields
    # len(probs)/5 rows of 5 -- confirm this matches the intended number of
    # submission rows.
    probs = np.resize(probs, (len(probs) / 5, 5))
    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)
def main():
    """Train a random forest on features augmented with online-LDA topic
    allocations, and write a submission.

    Fix: the vocabulary file was opened via the file() builtin and the handle
    was never closed; it is now read inside a with-block.
    """
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 20
    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))
    print("Reading the vocabulary")
    with open('./vocab4.txt') as vocab_fh:
        vocab = [w.strip() for w in vocab_fh]
    # How many words are in the vocabulary
    W = len(vocab)
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)
    print("Extracting features")
    fea = extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    # Topics for the test set come from the same fitted LDA model
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Train a random forest on features augmented with online-LDA topic
    allocations, and write a submission.

    Fix: the vocabulary file was opened via the file() builtin and the handle
    was never closed; it is now read inside a with-block.
    """
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 20
    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))
    print("Reading the vocabulary")
    with open('./vocab4.txt') as vocab_fh:
        vocab = [w.strip() for w in vocab_fh]
    # How many words are in the vocabulary
    W = len(vocab)
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)
    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)
    print("Extracting features")
    fea = extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    # Topics for the test set come from the same fitted LDA model
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission(): data = None if os.path.exists('data.pik'): print("Unpickeling the data") data = pickle.load(open('data.pik')) else: print("Reading the data") data = cu.get_dataframe(full_train_file) pickle.dump(data,open('data.pik','w')) fea = None if os.path.exists('fea.pik'): print("Unpickeling the fea") fea = pickle.load(open('fea.pik')) else: print("Extracting features") fea = features.extract_features(feature_names, data) pickle.dump(fea,open('fea.pik','w')) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, oob_score=True, #criterion='entropy', n_jobs=2) rf.fit(fea, data["OpenStatus"]) print "Features Importance:" imps = zip(rf.feature_importances_, feature_names,) imps.sort(reverse=True) print '\n'.join([ str(_) for _ in imps ]) print "Generalization Error:", rf.oob_score_ print("Reading test file and making predictions") data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = rf.predict_proba(test_features) if True: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main():
    """Re-read previously saved class probabilities from probs_file, rescale
    them to the priors of the full training set, and write a new submission.

    Fix: the probabilities file is now read inside a with-block so the handle
    is closed even if a line fails to parse (previously f.close() could be
    skipped on error).
    """
    with open(probs_file, 'r') as f:
        # One comma-separated row of per-class probabilities per line
        probs = [np.array([float(x) for x in line.split(',')]) for line in f]
    print("Calculating priors and updating posteriors")
    probs = np.array(probs)
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    """Two-level model: level 1 separates 'open' vs. the rest; level 2
    distributes the remaining probability mass over the closed reasons."""
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv', train=True, limit=1e9)
    _dim(fea,'fea')
    print("Training Level 1 : Open/Rest model")
    # Collapse the five statuses into a binary open / not-open label
    open_status = [ or_binarize(e) for e in status['OpenStatus'] ]
    is_not_open_status = [ s != 'open' for s in open_status ]
    or_model = learn(fea,open_status)
    print("Training Level 2 : Not Open Split model")
    # Restrict the level-2 training data to the rows that are not open
    not_open_status = [ status['OpenStatus'][i] for i in range(len(is_not_open_status)) if is_not_open_status[i] ]
    no_fea = fea[is_not_open_status]
    _dim(no_fea,'no_fea')
    no_model = learn(no_fea,not_open_status)
    print("Reading test file and making predictions")
    test_features = features.online_extract_features('data/'+test_file, train=False, limit=1e9)[0]
    _dim(test_features,'test_features')
    or_probs = or_model.predict_proba(test_features)
    probs = []
    for i in range(0,len(or_probs)):
        or_prob = or_probs[i]
        # NOTE(review): assumes column 0 of the level-1 output is 'open';
        # verify against the classifier's class ordering.
        if or_prob[0] > or_prob[1]:
            # Confidently open: put all mass on the first class
            probs.append(np.array([1.0,0.0,0.0,0.0,0.0]))
        else:
            # Build the single-row feature vector for sample i
            f = [ test_features[ff][i] for ff in test_features.keys() ]
            a = no_model.predict_proba(f)
            # Prepend a zero probability for the 'open' class
            aa = np.insert(a,0,[0.0])
            probs.append(aa)
    probs = np.array(probs)
    if False:  # prior rescaling deliberately disabled for this model
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Train an ExtraTrees classifier, predict on the test file, optionally
    rescale posteriors to updated priors, write the submission, and report
    the elapsed wall-clock time."""
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])
    print "Listing feature importances:"
    cu.list_feature_importance(clf, feature_names)
    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)
    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def main():
    """Evaluate a standardized k-nearest-neighbours model (alternative models
    left commented out) and print the log loss against the test labels."""
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)
    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)
    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)
    # print("Training random forest")
    # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # rf.fit(train_features, train_data["OpenStatus"])
    # probs = rf.predict_proba(test_features)
    # print("Training decision tree")
    # dt = DecisionTreeClassifier()
    # dt.fit(train_features, train_data["OpenStatus"])
    # probs = dt.predict_proba(test_features)
    # print("Training adaboost")
    # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"])
    # probs = ada.predict_proba(test_features)
    print("Training nearest neighbors")
    # KNN is distance-based, so standardize both feature sets with the
    # scaler fitted on the training data
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled, train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    # Score against the known test statuses instead of writing a submission
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
def main():
    """Train four classifiers (random forest, gradient boosting, decision
    tree, extra trees) on the same features and average their predicted
    class probabilities into one submission.

    Improvement: the element-wise averaging double loop is replaced with a
    single vectorized NumPy expression over the (n_samples, n_classes)
    probability arrays; the result is numerically identical.
    """
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])
    gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0)
    gb.fit(fea, data["OpenStatus"])
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    dt.fit(fea, data["OpenStatus"])
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    et.fit(fea, data["OpenStatus"])
    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)
    probs2 = gb.predict_proba(test_features)
    probs3 = dt.predict_proba(test_features)
    probs4 = et.predict_proba(test_features)
    # Uniform ensemble average of the four probability matrices
    probs = (probs + probs2 + probs3 + probs4) / 4.0
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
for i, (train, test) in enumerate(skf): print "Fold", i X_train = X[train] y_train = y[train] X_test = X[test] y_test = y[test] clf.fit(X, y) y_submission = clf.predict_proba(X_test)[:, 1] dataset_blend_train[test, j] = y_submission dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1] dataset_blend_test[:, j] = dataset_blend_test_j.mean(1) print print "Blending." clf = LogisticRegression() clf.fit(dataset_blend_train, y) y_submission = clf.predict_proba(dataset_blend_test)[:, 1] print "Linear stretch of predictions to [0,1]" y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min()) probs = y_submission print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "Saving Results." cu.write_submission(submission_file, probs)
def predict_class(test_file="test/public_leaderboard.csv", recompute_feats=False):
    """
    Module that predicts class probabilities for test data from a .csv file
    or precomputed feature vectors.

    Bug fix: the old_priors list comprehension divided two Python ints,
    which under Python 2 floor-divides every prior to 0; the denominator is
    now a float so the priors are real proportions. The csv output file is
    also written inside a with-block so the handle is flushed and closed.
    """
    # custom variables
    DATA_DIR = "../data/"
    SUBMISSION_DIR = "../data/submission/"
    train_file_all = "train/train.csv"
    # NOTE(review): the test_file parameter is immediately overwritten here,
    # so the caller's argument is ignored -- confirm this is intentional.
    test_file = "test/private_leaderboard.csv"
    feature_file = "test/private_leaderboard-feats.csv"
    output_file = "predictions.csv"
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    log = logging.getLogger(__name__)
    if recompute_feats:
        # features.compute_features( 'test/test.csv', 'test/test-feats.csv' )
        features.compute_features(test_file, feature_file)
    log.info("π: load features from file")
    X_test = pd.io.parsers.read_csv(os.path.join(DATA_DIR, feature_file), header=None)
    X_test = X_test.as_matrix()
    log.info("π: load classifier")
    npz_file = np.load(SUBMISSION_DIR + "cfy.npz")
    clf_lda = joblib.load(SUBMISSION_DIR + "clf_lda.pkl")
    clf_rfc = joblib.load(SUBMISSION_DIR + "clf_rfc.pkl")
    clf_gbc = joblib.load(SUBMISSION_DIR + "clf_gbc.pkl")
    log.info("π: load standardizer, normalizer")
    standardizer = npz_file["standardizer"].item()
    normalizer = npz_file["normalizer"].item()
    # log.info( 'π: perform feature selection' )
    # fselect = npz_file[ 'fselect' ].item()
    # X_test = fselect.transform( X_test )
    log.info("π: Random Forest predictions")
    # The RF was trained on raw features, so predict before scaling
    y_rfc = clf_rfc.predict_proba(X_test)
    log.info("π: standardize and normalize test features")
    standardizer.transform(X_test)  # in-place
    normalizer.transform(X_test)  # in-place
    log.info("π: LDA and GBC class membership predictions")
    # X_test = clf_lda.transform( X_test )
    y_lda = clf_lda.predict_proba(X_test)
    y_gbc = clf_gbc.predict_proba(X_test)
    y_pred = (y_rfc + y_gbc) / 2.0
    log.info("π: calculate priors and update posteriors")
    new_priors = cu.get_priors(train_file_all)
    closed_reasons = pd.io.parsers.read_csv(os.path.join(DATA_DIR, train_labels), header=None)["X0"]
    closed_reason_counts = Counter(closed_reasons)
    reasons = sorted(closed_reason_counts.keys())
    # float() avoids Python 2 integer floor division (which zeroed priors)
    total = float(len(closed_reasons))
    old_priors = [closed_reason_counts[reason] / total for reason in reasons]
    y_pred = cu.cap_and_update_priors(old_priors, y_pred, new_priors, 0.001)
    # Blend the LDA predictions in with 1/3 weight
    y_pred = (2 * y_pred + y_lda) / 3.0
    log.info("π: write predictions to file")
    with open(os.path.join(SUBMISSION_DIR, output_file), "w") as out_fh:
        writer = csv.writer(out_fh, lineterminator="\n")
        writer.writerows(y_pred)
dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf))) for i, (train, test) in enumerate(skf): print "Fold", i X_train = X[train] y_train = y[train] X_test = X[test] y_test = y[test] clf.fit(X, y) y_submission = clf.predict_proba(X_test)[:,1] dataset_blend_train[test, j] = y_submission dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1] dataset_blend_test[:,j] = dataset_blend_test_j.mean(1) print print "Blending." clf = LogisticRegression() clf.fit(dataset_blend_train, y) y_submission = clf.predict_proba(dataset_blend_test)[:,1] print "Linear stretch of predictions to [0,1]" y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min()) probs = y_submission print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "Saving Results." cu.write_submission(submission_file, probs)
def main():
    """Train a random forest on precomputed features, write a submission, and
    print a binary open/closed confusion matrix against the test labels."""
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)
    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=-1)
    # NOTE(review): hard-coded slice assumes the feature file holds exactly
    # 178351 rows aligned with the head of the training frame -- verify.
    rf.fit(fea, data["OpenStatus"][:178351])
    # Report the features whose importance exceeds the mean importance
    important_features = []
    for x, i in enumerate(rf.feature_importances_):
        if i > np.average(rf.feature_importances_):
            important_features.append([str(x),i])
    print 'Most important features:',important_features
    print("Reading test file and making predictions")
    #features.compute_features("test1.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors,probs, new_priors, 0.01)
    # Collapse the 5-class probabilities to a binary open/closed prediction.
    # NOTE(review): assumes column 3 of probs corresponds to 'open' --
    # verify against the class ordering used elsewhere in the project.
    y_pred=[]
    for i in probs:
        i=[float(k) for k in i]
        j=i.index(max(i))
        if(j==3):
            y_pred.append("open")
        else:
            print "hi"
            y_pred.append("closed")
    y_true=[]
    a=0
    b=0
    test_reader = csv.reader(open(test_file))
    headers=test_reader.next()
    # Column 14 holds the OpenStatus label in the raw test CSV
    for line in test_reader:
        if line[14]=='open':
            y_true.append("open")
            a=a+1
        else:
            y_true.append("closed")
            b=b+1
    print a
    print b
    # NOTE(review): y_true[1:] drops the first data row even though the
    # header row was already consumed above -- possible off-by-one; confirm.
    print confusion_matrix(y_true[1:] , y_pred , labels=["open", "closed"])
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    """Gradient-boosting model with optional low-memory chunked feature
    extraction and optional 10-fold cross-validation; otherwise trains on
    the full set and writes a submission."""
    if (use_low_mem == 1):
        # Stream the training file in chunks to bound memory usage
        data_iter = cu.iter_data_frames(train_file, _chunksize)
        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append( features.extract_features(feature_names, train_data))
            # Encode each status string as its index in ques_status
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)
            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        # NOTE(review): despite the name, `rf` is a GradientBoostingClassifier
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1,
                                        n_estimators=100, subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1, max_depth=depth,
                                        init=None, random_state=None)
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            # Assemble per-row feature vectors; the length features are
            # derived from the raw text columns of the feature frame
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            print "Fitting for fold " + str(fold)
            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [ 0.03410911204982466, 0.01173872976800856, 0.018430671606251586, 0.926642216133641, 0.009079270442274271 ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)
        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1,
                                        n_estimators=100, subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None, random_state=None)
        rf.fit(fea, y)
        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)
        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
def main():
    """Write a benchmark submission that assigns the training-set prior
    distribution to every sample in the public leaderboard file."""
    class_priors = cu.get_priors("train.csv")
    n_rows = len(cu.get_dataframe("public_leaderboard.csv"))
    # Repeat the prior row once per leaderboard sample
    repeated = np.kron(np.ones((n_rows, 1)), class_priors)
    cu.write_submission("prior_benchmark.csv", repeated)
def main():
    """Write a benchmark submission that assigns the training-set prior
    distribution to every sample in the test file."""
    class_priors = cu.get_priors("train.csv")
    n_rows = len(cu.get_dataframe("test_.csv"))
    # Repeat the prior row once per test sample
    repeated = np.kron(np.ones((n_rows, 1)), class_priors)
    cu.write_submission("prior_benchmark.csv", repeated)
def train_classifier( train_file='train/train.csv', recompute_feats=False ):
    '''
    Module that reads stackoverflow data from a .csv file, generates features,
    and trains a classifier.

    Fits an LDA, a random forest and a gradient-boosting classifier, prints
    the RF feature ranking, then persists the fitted scalers and classifiers
    for later use by the prediction script.
    '''
    # custom variables
    DATA_DIR = "../data/"
    SUBMISSION_DIR = "../data/submission/"
    # train_file = 'train/train-sample.csv'
    label_file = 'train/train-labels.csv'
    feature_file = 'train/train-feats.csv'
    # display progress logs on stdout
    logging.basicConfig( level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s' )
    log = logging.getLogger(__name__)
    if recompute_feats:
        features.compute_features( train_file, feature_file, label_file )
    log.info( 'π: load features from file' )
    X = pd.io.parsers.read_csv( os.path.join( DATA_DIR, feature_file ), header=None )
    X = X.as_matrix()
    log.info( "π: encode labels" )
    labels = pd.io.parsers.read_csv( os.path.join( DATA_DIR, label_file ), header=None )['X0']
    # cf. required submission format
    lbl_map = { 'not a real question': 0, 'not constructive': 1, 'off topic': 2, 'open': 3, 'too localized': 4 }
    labels = labels.map( lbl_map )
    y = labels.values
    log.info( 'π: select features' )
    # NOTE(review): the selector is constructed (and later pickled into the
    # npz) but fit_transform is commented out, so X is NOT actually reduced.
    fselect = SelectPercentile( score_func=chi2, percentile=42 ) # !?
    # X = fselect.fit_transform( X, y )
    log.info( 'π: define classifiers' )
    priors = cu.get_priors( os.path.join( DATA_DIR, 'train/train.csv' ) )
    clf_lda = LDA( priors=priors )
    clf_rfc = RandomForestClassifier( n_estimators=50, verbose=2, n_jobs=-1, random_state=0, compute_importances=True, max_features=None ) #, criterion='entropy' )
    clf_gbc = GradientBoostingClassifier()
    log.info( 'π: fit Random Forest' )
    # The RF is fit on the raw (unscaled) features
    clf_rfc.fit( X, y )
    log.info( "π: compute feature ranking for RFC" )
    importances = clf_rfc.feature_importances_
    # std is computed here but not used below
    std = np.std([ tree.feature_importances_ for tree in clf_rfc.estimators_ ], axis=0 )
    indices = np.argsort( importances )[::-1]
    for f in xrange( 13 ): # the top thirteen features
        print "%d. feature %d (%f)" % (f + 1, indices[f], importances[ indices[f] ])
    log.info( "π: standardize and normalize features" )
    standardizer = StandardScaler( copy=False ).fit( X, y )
    standardizer.transform( X, y ) # in-place
    normalizer = Normalizer( copy=False, norm='l2' ).fit( X, y ) # 'l1'
    normalizer.transform( X, y ) # in-place
    log.info( 'π: fit Linear Discriminant Analysis' )
    clf_lda.fit( X, y )
    # X = cld_lda.transform( X, y )
    log.info( 'π: fit Gradient Boosting' )
    clf_gbc.fit( X, y )
    log.info( 'π: save classifiers' )
    np.savez( SUBMISSION_DIR+'cfy.npz', X=X, y=y, fselect=fselect, standardizer=standardizer, normalizer=normalizer )
    joblib.dump( clf_lda, SUBMISSION_DIR + 'clf_lda.pkl', compress=9 )
    joblib.dump( clf_rfc, SUBMISSION_DIR + 'clf_rfc.pkl', compress=9 )
    joblib.dump( clf_gbc, SUBMISSION_DIR + 'clf_gbc.pkl', compress=9 )
def main(): start = time.time() if (use_low_mem == 1): data_iter = cu.iter_data_frames(train_file, _chunksize) i = _chunksize fea = None y = [] for train_data in data_iter: print "About to have processed: " + str(i) print("Extracting features") if fea is None: fea = features.extract_features(feature_names, train_data) else: fea = fea.append(features.extract_features(feature_names, train_data)) for element in train_data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) i = i + _chunksize else: print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) fea = features.extract_features(feature_names,data) print "Collecting statuses" y = [] for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) if do_cross_validation == 1: logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None, fit_intercept=True, intercept_scaling=1, tol=0.0001) print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation kf = KFold(len(y),k = 10) fold = 0 result_sum = 0 for train_index,test_index in kf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] for feature_name in feature_names: if feature_name == 'BodyLength': temp.append(fea['BodyMarkdown'][i]) elif feature_name == 'TitleLength': temp.append(fea['Title'][i]) else: temp.append(fea[feature_name][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] for feature_name in feature_names: if feature_name == 'BodyLength': temp.append(fea['BodyMarkdown'][i]) elif feature_name == 'TitleLength': temp.append(fea['Title'][i]) else: temp.append(fea[feature_name][i]) X_test.append(temp) y_test.append(y[i]) print "fitting this fold's data" rf.fit(X_train, y_train) y_test = vectorize_actual(y_test) #_pred_probs = denormalize(rf.predict_proba(X_test)) _pred_probs = 
rf.predict_proba(X_test) print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952] old_priors = cu.get_priors(train_file) _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001) # evaluating the performance result = eval.mcllfun(y_test,_pred_probs) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold,result) print "Average MCLL score for this classifier = %0.11f" % (result_sum/10) else: logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None, fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True print "Fitting" logit.fit(fea, y) print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs probs = logit.predict_proba(test_fea) if is_full_train_set == 0: print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952] old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)