def main(): print("Reading the data") data = cu.get_dataframe(train_file) #print("Extracting features") #features.compute_features(train_file,feature_file1) print("Training the model") fea = cu.get_dataframe(feature_file1) clf = svm.SVC(decision_function_shape='ovo') clf.fit(fea, data["OpenStatus"][:178351]) print("Reading test file and making predictions") #features.compute_features("test_.csv",feature_file2) test_fea = cu.get_dataframe(feature_file2) probs = clf.predict_proba(test_fea) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1) rf.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = rf.predict_proba(test_features) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs) score()
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") features.compute_features(train_file,feature_file1) print("Training the model") fea = cu.get_dataframe(feature_file1) rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1) rf.fit(fea, data["OpenStatus"][:140323]) print("Reading test file and making predictions") features.compute_features(test_file,feature_file2) test_fea = cu.get_dataframe(feature_file2) probs = rf.predict_proba(test_fea) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): print("Reading the data") data = cu.get_dataframe(train_file) data['OpenStatusMod'] = data['OpenStatus'].map(convert_status) #print(data['OpenStatusMod']) print("Extracting features") fea = features.extract_features(feature_names, data) #print(fea.columns) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1, random_state = 0) print("Training the model, created RFC") #rf.fit(fea, data["OpenStatus"]) rf.fit(fea, data["OpenStatusMod"]) print("Reading test file and making predictions") #data = cu.get_dataframe(test_file) data = cu.get_dataframe(full_train_file) print("Reading data frame") data['OpenStatusMod'] = data['OpenStatus'].map(convert_status) print("adding column") test_features = features.extract_features(feature_names, data) print("extract features") probs = rf.predict_proba(test_features) # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # print "new priors %s" %(new_priors) # print "old priors %s" %(old_priors) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): start = time.time() print("Reading the data from " + train_file) data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2) clf.fit(fea, data["OpenStatus"]) print "Listing feature importances:" cu.list_feature_importance(clf,feature_names) print("Reading test file and making predictions: " + test_file) data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = clf.predict_proba(test_features) if (update_posteriors): print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main(): print("Reading the data") data = cu.get_dataframe(train_little) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") #classifier = MultinomialNB() #classifier = KNeighborsClassifier(n_neighbors=3, weights='distance') classifier = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1) #classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1) classifier.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_litte) test_features = features.extract_features(feature_names, data) probs = classifier.predict_proba(test_features) #print("Calculating priors and updating posteriors") #new_priors = cu.get_priors(full_train_file) #old_priors = cu.get_priors(train_file) #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_litte) cu.write_submission(submission_litte, probs)
def make_submission(): print("Reading data") fea, status = features.online_extract_features('data/train.csv', limit=5e6) print("Training the model") rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, oob_score=True, #criterion='entropy', n_jobs=1) rf.fit(fea, status['OpenStatus']) print "Features Importance:" imps = zip(rf.feature_importances_, fea.keys()) imps.sort(reverse=True) print '\n'.join([ str(_) for _ in imps ]) print "Generalization Error:", rf.oob_score_ print("Reading test file and making predictions") test_features = features.online_extract_features('data/'+test_file, train=False, limit=1e12)[0] probs = rf.predict_proba(test_features) if True: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names, data) mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False) X = [] for i in data["OwnerUndeletedAnswerCountAtPostTime"]: X.append([i]) # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError global y y = [] print "Collecting statuses" for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) print "Fitting" mten.fit(fea, y) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names, test_data) print "Making predictions" global probs probs = mten.predict(test_fea) # shape of probs is [n_samples] # convert probs to shape [n_samples,n_classes] probs = np.resize(probs, (len(probs) / 5, 5)) if is_full_train_set == 0: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish - start)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names,data) percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False, n_iter=5, shuffle=False, verbose=1, eta0=1.0, n_jobs=-1, seed=0, class_weight="auto", warm_start=False) X = [] for i in data["OwnerUndeletedAnswerCountAtPostTime"]: X.append([i]) # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError global y y = [] print "Collecting statuses" for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) print "Fitting" percep.fit(fea, y) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs #probs = percep.predict_proba(test_fea) # only available for binary classification probs = percep.predict(test_fea) # shape of probs is [n_samples] # convert probs to shape [n_samples,n_classes] probs = np.resize(probs, (len(probs) / 5, 5)) #if is_full_train_set == 0: # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names,data) mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False) X = [] for i in data["OwnerUndeletedAnswerCountAtPostTime"]: X.append([i]) # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError global y y = [] print "Collecting statuses" for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) print "Fitting" mten.fit(fea, y) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Making predictions" global probs probs = mten.predict(test_fea) # shape of probs is [n_samples] # convert probs to shape [n_samples,n_classes] probs = np.resize(probs, (len(probs) / 5, 5)) if is_full_train_set == 0: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main():
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]
    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of questions on Stack Overflow
    D = 3.3e6
    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]
    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)
    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
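# Both LDA variants above call an allocate_topics helper that is not shown.
# The sketch below is an assumption-laden reconstruction: it presumes the
# helper streams question bodies through Hoffman's onlineldavb model in
# mini-batches (update_lambda performs one online variational step and
# returns the variational posteriors gamma) and writes normalized topic
# weights back into the dataframe as the Topic0..Topic19 columns appended to
# feature_names. Treat the 'BodyMarkdown' column and the update_lambda call
# as unverified assumptions.
def allocate_topics_sketch(lda, data, K, batchsize, D):
    import numpy as np
    n = len(data)
    topics = np.zeros((n, K))
    for start in range(0, n, batchsize):
        docs = list(data['BodyMarkdown'][start:start + batchsize])
        gamma, _ = lda.update_lambda(docs)  # one online variational update
        topics[start:start + len(docs)] = gamma / gamma.sum(axis=1)[:, np.newaxis]
    for k in range(K):
        data['Topic%d' % k] = topics[:, k]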
def make_submission():
    data = None
    if os.path.exists('data.pik'):
        print("Unpickling the data")
        data = pickle.load(open('data.pik', 'rb'))  # binary mode for pickle
    else:
        print("Reading the data")
        data = cu.get_dataframe(full_train_file)
        pickle.dump(data, open('data.pik', 'wb'))

    fea = None
    if os.path.exists('fea.pik'):
        print("Unpickling the fea")
        fea = pickle.load(open('fea.pik', 'rb'))
    else:
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        pickle.dump(fea, open('fea.pik', 'wb'))

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, oob_score=True,
                                #criterion='entropy',
                                n_jobs=2)
    rf.fit(fea, data["OpenStatus"])

    print "Features Importance:"
    imps = zip(rf.feature_importances_, feature_names)
    imps.sort(reverse=True)
    print '\n'.join([str(_) for _ in imps])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    f = open(probs_file, 'r')
    lines = f.readlines()
    probs = []
    for line in lines:
        probs.append(np.array([float(x) for x in line.split(',')]))

    print("Calculating priors and updating posteriors")
    probs = np.array(probs)
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
    f.close()
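# For reference, cu.write_submission is used throughout this section but
# never defined in it. A plausible sketch, assuming it simply writes one CSV
# row of five class probabilities per test question; the function name and
# the row format are the competition's, this body is a guess:
import csv

def write_submission_sketch(filename, probs):
    f = open(filename, 'wb')  # binary mode for the Python 2 csv module
    writer = csv.writer(f, lineterminator='\n')
    writer.writerows(probs)
    f.close()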
def make_submission(): print("Reading data") fea, status = features.online_extract_features('data/train.csv', train=True, limit=1e9) _dim(fea,'fea') print("Training Level 1 : Open/Rest model") open_status = [ or_binarize(e) for e in status['OpenStatus'] ] is_not_open_status = [ s != 'open' for s in open_status ] or_model = learn(fea,open_status) print("Training Level 2 : Not Open Split model") not_open_status = [ status['OpenStatus'][i] for i in range(len(is_not_open_status)) if is_not_open_status[i] ] no_fea = fea[is_not_open_status] _dim(no_fea,'no_fea') no_model = learn(no_fea,not_open_status) print("Reading test file and making predictions") test_features = features.online_extract_features('data/'+test_file, train=False, limit=1e9)[0] _dim(test_features,'test_features') or_probs = or_model.predict_proba(test_features) probs = [] for i in range(0,len(or_probs)): or_prob = or_probs[i] if or_prob[0] > or_prob[1]: probs.append(np.array([1.0,0.0,0.0,0.0,0.0])) else: f = [ test_features[ff][i] for ff in test_features.keys() ] a = no_model.predict_proba(f) aa = np.insert(a,0,[0.0]) probs.append(aa) probs = np.array(probs) if False: print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): start = time.time() print("Reading the data from " + train_file) data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2) clf.fit(fea, data["OpenStatus"]) print "Listing feature importances:" cu.list_feature_importance(clf, feature_names) print("Reading test file and making predictions: " + test_file) data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = clf.predict_proba(test_features) if (update_posteriors): print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish - start)
def main(): print("Reading the data") data = cu.get_dataframe(train_file) print("Extracting features") fea = features.extract_features(feature_names, data) print("Training the model") rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1) rf.fit(fea, data["OpenStatus"]) gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0) gb.fit(fea, data["OpenStatus"]) dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0) dt.fit(fea, data["OpenStatus"]) et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0) et.fit(fea, data["OpenStatus"]) print("Reading test file and making predictions") data = cu.get_dataframe(test_file) test_features = features.extract_features(feature_names, data) probs = rf.predict_proba(test_features) probs2 = gb.predict_proba(test_features) probs3 = dt.predict_proba(test_features) probs4 = et.predict_proba(test_features) for i in range(0, len(probs)): for j in range(0,5): probs[i][j] = (probs[i][j] + probs2[i][j] + probs3[i][j] + probs4[i][j])/4 print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names, data) mnbayes = MultinomialNB(alpha=1.0, fit_prior=True) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError y = [] for element in data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) if do_cross_validation == 1: print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation #skf = StratifiedKFold(y,k = 10) skf = KFold(len(y), k=10) fold = 0 result_sum = 0 for train_index, test_index in skf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] for feature_name in feature_names: temp.append(fea[feature_name][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] for feature_name in feature_names: temp.append(fea[feature_name][i]) X_test.append(temp) y_test.append(y[i]) mnbayes.fit( X_train, y_train ) #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851]) y_test = vectorize_actual(y_test) # vectorize y_test _pred_probs = mnbayes.predict_proba(X_test) # evaluating the performance result = eval.mcllfun(y_test, _pred_probs) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold, result) print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10) print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names, test_data) print "Fitting" mnbayes.fit( fea, y ) #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851]) print "Making predictions" global probs probs = mnbayes.predict_proba(test_fea) #if is_full_train_set == 0: # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish - start)
def main(): print("Reading the data") data = cu.get_dataframe(train_file) #print("Extracting features") #features.compute_features(train_file,feature_file1) print("Training the model") fea = cu.get_dataframe(feature_file1) rf = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=-1) rf.fit(fea, data["OpenStatus"][:178351]) important_features = [] for x, i in enumerate(rf.feature_importances_): if i > np.average(rf.feature_importances_): important_features.append([str(x),i]) print 'Most important features:',important_features print("Reading test file and making predictions") #features.compute_features("test1.csv",feature_file2) test_fea = cu.get_dataframe(feature_file2) probs = rf.predict_proba(test_fea) print("Calculating priors and updating posteriors") new_priors = cu.get_priors(full_train_file) old_priors = cu.get_priors(train_file) probs = cu.cap_and_update_priors(old_priors,probs, new_priors, 0.01) y_pred=[] for i in probs: i=[float(k) for k in i] j=i.index(max(i)) if(j==3): y_pred.append("open") else: print "hi" y_pred.append("closed") y_true=[] a=0 b=0 test_reader = csv.reader(open(test_file)) headers=test_reader.next() for line in test_reader: if line[14]=='open': y_true.append("open") a=a+1 else: y_true.append("closed") b=b+1 print a print b print confusion_matrix(y_true[1:] , y_pred , labels=["open", "closed"]) print("Saving submission to %s" % submission_file) cu.write_submission(submission_file, probs)
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples, 5)), np.array(0.2))
    cu.write_submission("uniform_benchmark.csv", predictions)
dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
for i, (train, test) in enumerate(skf):
    print "Fold", i
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    # fit on the training fold only (the original fit on the full X, y,
    # which leaks the held-out fold into the blend features)
    clf.fit(X_train, y_train)
    y_submission = clf.predict_proba(X_test)[:, 1]
    dataset_blend_train[test, j] = y_submission
    dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

print
print "Blending."
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print "Linear stretch of predictions to [0,1]"
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
probs = y_submission

print("Calculating priors and updating posteriors")
new_priors = cu.get_priors(full_train_file)
old_priors = cu.get_priors(train_file)
probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

print "Saving Results."
cu.write_submission(submission_file, probs)
for i, (train, test) in enumerate(skf):
    print "Fold", i
    X_train = X[train]
    y_train = y[train]
    X_test = X[test]
    y_test = y[test]
    # fit on the training fold only, not the full X, y
    clf.fit(X_train, y_train)
    y_submission = clf.predict_proba(X_test)[:, 1]
    dataset_blend_train[test, j] = y_submission
    dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

print
print "Blending."
clf = LogisticRegression()
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print "Linear stretch of predictions to [0,1]"
y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
probs = y_submission

print("Calculating priors and updating posteriors")
new_priors = cu.get_priors(full_train_file)
old_priors = cu.get_priors(train_file)
probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

print "Saving Results."
cu.write_submission(submission_file, probs)
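# The two blending fragments above start mid-script: X, y, X_submission,
# skf, j, clf and the dataset_blend_* matrices must already exist. The setup
# sketch below shows one plausible way to create them (standard stacked
# generalization, one out-of-fold prediction column per base classifier);
# the classifier choices and the synthetic data are purely illustrative.
import numpy as np
from sklearn.cross_validation import StratifiedKFold  # sklearn 0.1x module path
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 6)                   # stand-in training features
y = (rng.rand(200) > 0.5).astype(int)  # stand-in binary labels
X_submission = rng.rand(50, 6)         # stand-in test features

clfs = [RandomForestClassifier(n_estimators=50, n_jobs=-1),
        ExtraTreesClassifier(n_estimators=50, n_jobs=-1)]
skf = list(StratifiedKFold(y, 10))

# one out-of-fold prediction column per base classifier
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    # the per-fold loop from the fragment above fills column j here
    pass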
def main():
    predictions = [actual_lookup[r[14]] for r in cu.get_reader(actual_file)]
    cu.write_submission("actual_benchmark.csv", predictions)
def main():
    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)
            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1,
                                        n_estimators=100, subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1, max_depth=depth,
                                        init=None, random_state=None)
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            print "Fitting for fold " + str(fold)
            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [0.03410911204982466, 0.01173872976800856,
                          0.018430671606251586, 0.926642216133641,
                          0.009079270442274271]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)
        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance', learn_rate=0.1,
                                        n_estimators=100, subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None, random_state=None)
        rf.fit(fea, y)
        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)
        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names, data) percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False, n_iter=5, shuffle=False, verbose=1, eta0=1.0, n_jobs=-1, seed=0, class_weight="auto", warm_start=False) X = [] for i in data["OwnerUndeletedAnswerCountAtPostTime"]: X.append([i]) # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError global y y = [] print "Collecting statuses" for element in data["OpenStatus"]: for index, status in enumerate(ques_status): if element == status: y.append(index) print "Fitting" percep.fit(fea, y) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names, test_data) print "Making predictions" global probs #probs = percep.predict_proba(test_fea) # only available for binary classification probs = percep.predict(test_fea) # shape of probs is [n_samples] # convert probs to shape [n_samples,n_classes] probs = np.resize(probs, (len(probs) / 5, 5)) #if is_full_train_set == 0: # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish - start)
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("test_.csv"))
    predictions = np.kron(np.ones((num_samples, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = [[0.0, 0.0, 0.0, 1.0, 0.0] for i in range(num_samples)]
    cu.write_submission("always_open_benchmark.csv", predictions)
def main(): start = time.time() print "Reading train data and its features from: " + train_file data = cu.get_dataframe(train_file) global fea fea = features.extract_features(feature_names,data) mnbayes = MultinomialNB(alpha=1.0, fit_prior=True) '''Make sure you have the up to date version of sklearn; v0.12 has the predict_proba method; http://scikit-learn.org/0.11/install.html ''' # Must be array type object. Strings must be converted to # to integer values, otherwise fit method raises ValueError y = [] for element in data['OpenStatus']: for index, status in enumerate(ques_status): if element == status: y.append(index) if do_cross_validation == 1: print 'starting 10 fold verification' # Dividing the dataset into k = 10 folds for cross validation #skf = StratifiedKFold(y,k = 10) skf = KFold(len(y),k = 10) fold = 0 result_sum = 0 for train_index,test_index in skf: fold += 1 X_train = [] X_test = [] y_train = [] y_test = [] for i in train_index: temp = [] for feature_name in feature_names: temp.append(fea[feature_name][i]) X_train.append(temp) y_train.append(y[i]) for i in test_index: temp = [] for feature_name in feature_names: temp.append(fea[feature_name][i]) X_test.append(temp) y_test.append(y[i]) mnbayes.fit(X_train, y_train) #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851]) y_test = vectorize_actual(y_test) # vectorize y_test _pred_probs = mnbayes.predict_proba(X_test) # evaluating the performance result = eval.mcllfun(y_test,_pred_probs) result_sum += result print "MCLL score for fold %d = %0.11f" % (fold,result) print "Average MCLL score for this classifier = %0.11f" % (result_sum/10) print "Reading test data and features" test_data = cu.get_dataframe(test_file) test_fea = features.extract_features(feature_names,test_data) print "Fitting" mnbayes.fit(fea,y)#, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851]) print "Making predictions" global probs probs = mnbayes.predict_proba(test_fea) #if is_full_train_set == 0: # print("Calculating priors and updating posteriors") # new_priors = cu.get_priors(full_train_file) # old_priors = cu.get_priors(train_file) # probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001) print "writing submission to " + submission_file cu.write_submission(submission_file, probs) finish = time.time() print "completed in %0.4f seconds" % (finish-start)
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = [[0.0, 0.0, 0.0, 1.0, 0.0] for i in range(num_samples)]
    cu.write_submission("always_open_benchmark.csv", predictions)
def main():
    start = time.time()

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
        i = _chunksize
        fea = None
        y = []
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)
            i = i + _chunksize
    else:
        print "Reading train data and its features from: " + train_file
        data = cu.get_dataframe(train_file)
        fea = features.extract_features(feature_names, data)
        print "Collecting statuses"
        y = []
        for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)

    if do_cross_validation == 1:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0,
                                   class_weight=None, fit_intercept=True,
                                   intercept_scaling=1, tol=0.0001)
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            print "fitting this fold's data"
            # was rf.fit/rf.predict_proba: rf is undefined in this branch
            logit.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            #_pred_probs = denormalize(logit.predict_proba(X_test))
            _pred_probs = logit.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308,
                          0.005200965546050945, 0.9791913907850639,
                          0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    else:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0,
                                   class_weight=None, fit_intercept=True,
                                   intercept_scaling=1, tol=0.0001)
        # not available: compute_importances=True
        print "Fitting"
        logit.fit(fea, y)

        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names, test_data)

        print "Making predictions"
        global probs
        probs = logit.predict_proba(test_fea)

        if is_full_train_set == 0:
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308,
                          0.005200965546050945, 0.9791913907850639,
                          0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)