Example no. 1
def make_submission():
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv',
                                                   limit=5e6)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=1)
    
    rf.fit(fea, status['OpenStatus'])
    
    print "Features Importance:"
    imps = zip(rf.feature_importances_,
               fea.keys())
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    test_features = features.online_extract_features('data/'+test_file,
                                                     train=False,
                                                     limit=1e12)[0]
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
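Every example in this collection finishes with the same cu.cap_and_update_priors step. The helper's source is not part of this listing; what follows is a minimal sketch of what such a function plausibly computes, assuming it reweights each class's posterior by the shift between old and new priors, renormalizes each row, and caps extreme values (the body below is a hypothetical reconstruction, not the competition utility itself):

import numpy as np

def cap_and_update_priors_sketch(old_priors, probs, new_priors, epsilon):
    # Reweight each class posterior by how much its prior shifted,
    # renormalize each row to sum to one, then cap extreme values.
    probs = np.asarray(probs, dtype=float)
    ratio = np.asarray(new_priors, dtype=float) / np.asarray(old_priors, dtype=float)
    updated = probs * ratio
    updated = updated / updated.sum(axis=1)[:, np.newaxis]
    return np.clip(updated, epsilon, 1.0 - epsilon)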
Example no. 2
def main():
	start = time.time()
	print("Reading the data from " + train_file)
	data = cu.get_dataframe(train_file)

	print("Extracting features")
	fea = features.extract_features(feature_names, data)

	print("Training the model")
	clf = ExtraTreesClassifier(n_estimators=trees_count,
	                           max_features=len(feature_names),
	                           max_depth=None,
	                           min_samples_split=1,
	                           compute_importances=True,
	                           bootstrap=False,
	                           random_state=0,
	                           n_jobs=-1,
	                           verbose=2)
	clf.fit(fea, data["OpenStatus"])

	print "Listing feature importances:"
	cu.list_feature_importance(clf, feature_names)
	
	print("Reading test file and making predictions: " + test_file)
	data = cu.get_dataframe(test_file)
	test_features = features.extract_features(feature_names, data)
	probs = clf.predict_proba(test_features)

	if (update_posteriors):
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
	
	print("Saving submission to %s" % submission_file)
	cu.write_submission(submission_file, probs)
	
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example no. 3
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    score()
Example no. 4
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:140323])

    print("Reading test file and making predictions")
    features.compute_features(test_file,feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
Example no. 5
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    clf = svm.SVC(decision_function_shape='ovo',
                  probability=True)  # probability=True is required for predict_proba below
    clf.fit(fea, data["OpenStatus"][:178351])

    print("Reading test file and making predictions")
    #features.compute_features("test_.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = clf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
Example no. 6
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mten = MultiTaskElasticNet(alpha=0.1,
                               rho=0.5,
                               fit_intercept=True,
                               normalize=False,
                               copy_X=True,
                               max_iter=1000,
                               tol=0.0001,
                               warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Labels must be an array-like of numbers. Strings must be converted
    # to integer values, otherwise the fit method raises a ValueError.
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    mten.fit(fea, y)
    # Make sure you have an up-to-date version of sklearn; v0.12 has the
    # predict_proba method; http://scikit-learn.org/0.11/install.html

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example no. 7
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    X = features.extract_features(feature_names, data)
    y = [class_labels[i] for i in data["OpenStatus"]]
    skf = StratifiedKFold(y, 10)
    result_f1 = 0
    result_logloss = 0

    fold = 1
    for train, test in skf:

        print "Fold %d" % fold 
        fold+=1

        X_train = [X.ix[i] for i in train]
        y_train = [y[i] for i in train]

        X_test = [X.ix[i] for i in test]
        y_test = [y[i] for i in test]

        if (options.__dict__['classifier'] == 'erf'):
            classifier = ExtraTreesClassifier(n_estimators=100, verbose=0, compute_importances=True, n_jobs=-1)
        elif (options.__dict__['classifier'] == 'mnb'):
            classifier = MultinomialNB()
        elif (options.__dict__['classifier'] == 'knn'):
            classifier = KNeighborsClassifier(n_neighbors=11)
        elif (options.__dict__['classifier'] == 'gbc'):
            classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1)
        else:
            raise ValueError("unknown classifier option: %r" % options.__dict__['classifier'])


        classifier.fit(X_train, y_train)
        
        probs = classifier.predict_proba(X_test)
        if (options.__dict__['priors'] != 0):
            print("Calculating priors and updating posteriors")
            new_priors = cu.get_priors(full_train_file)
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        y_pred = probs
        y_test = np.array(y_test)
        logloss = multiclass_log_loss(y_test, y_pred, eps=1e-15)

        y_pred = classifier.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=None)
        print "Log Loss: %f f1: %f" % (logloss, f1)
        result_f1 += f1
        result_logloss += logloss


    print '\navg LogLoss: %f avg f1: %f' % (result_logloss/10.0, result_f1/10.0)
Example no. 8
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Labels must be an array-like of numbers. Strings must be converted
	# to integer values, otherwise the fit method raises a ValueError.
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
            
	print "Fitting"
	mten.fit(fea, y)
	
	# Make sure you have an up-to-date version of sklearn; v0.12 has the
	# predict_proba method; http://scikit-learn.org/0.11/install.html
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	probs = mten.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
	
	if is_full_train_set == 0:
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example no. 9
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example no. 10
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example no. 11
def make_submission():
    data = None
    if os.path.exists('data.pik'):
        print("Unpickeling the data")
        data = pickle.load(open('data.pik'))
    else:
        print("Reading the data")
        data = cu.get_dataframe(full_train_file)
        pickle.dump(data, open('data.pik', 'w'))

    fea = None
    if os.path.exists('fea.pik'):
        print("Unpickeling the fea")
        fea = pickle.load(open('fea.pik'))
    else:
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        pickle.dump(fea, open('fea.pik', 'w'))
    
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=2)
    
    rf.fit(fea, data["OpenStatus"])
    print "Features Importance:"
    imps = zip(rf.feature_importances_,
               feature_names,)
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example no. 12
def main():
    f = open(probs_file, 'r')
    lines = f.readlines()
    probs = []

    for line in lines:
        probs.append(np.array([float(x) for x in line.split(',')]))    

    print("Calculating priors and updating posteriors")
    probs = np.array(probs)
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    f.close()
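The manual readlines/split parsing above amounts to a single NumPy call, which also handles the file object itself (a sketch, assuming probs_file holds one comma-separated row of probabilities per sample):

probs = np.loadtxt(probs_file, delimiter=',')  # (n_samples, n_classes) float array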
Example no. 13
def make_submission():
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv',
                                                   train=True,
                                                   limit=1e9)
    _dim(fea,'fea')
    print("Training Level 1 : Open/Rest model")
    open_status = [ or_binarize(e) for e in status['OpenStatus']  ]
    is_not_open_status = [ s != 'open' for s in open_status ]
    or_model = learn(fea,open_status)

    print("Training Level 2 : Not Open Split model")
    not_open_status = [ status['OpenStatus'][i] for i in range(len(is_not_open_status)) if is_not_open_status[i] ]
    no_fea = fea[is_not_open_status]
    _dim(no_fea,'no_fea')
    no_model = learn(no_fea,not_open_status)
    
    print("Reading test file and making predictions")
    test_features = features.online_extract_features('data/'+test_file,
                                                     train=False,
                                                     limit=1e9)[0]
    _dim(test_features,'test_features')
    or_probs = or_model.predict_proba(test_features)
    probs = []
    for i in range(0,len(or_probs)):
        or_prob = or_probs[i]
        if or_prob[0] > or_prob[1]:
            probs.append(np.array([1.0,0.0,0.0,0.0,0.0]))
        else:
            f = [ test_features[ff][i] for ff in test_features.keys() ]
            a = no_model.predict_proba(f)
            aa = np.insert(a,0,[0.0])
            probs.append(aa)
    probs = np.array(probs)

    if False:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example no. 14
def main():
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])

    print "Listing feature importances:"
    cu.list_feature_importance(clf, feature_names)

    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)

    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example no. 15
def main():
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)

    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)

    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)


    # print("Training random forest")
    # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # rf.fit(train_features, train_data["OpenStatus"])
    # probs = rf.predict_proba(test_features)

    # print("Training decision tree")
    # dt = DecisionTreeClassifier()
    # dt.fit(train_features, train_data["OpenStatus"])
    # probs = dt.predict_proba(test_features)

    # print("Training adaboost")
    # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"])
    # probs = ada.predict_proba(test_features)

    print("Training nearest neighbors")
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled, train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
Example no. 16
def main():
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)

    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)

    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)

    # print("Training random forest")
    # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # rf.fit(train_features, train_data["OpenStatus"])
    # probs = rf.predict_proba(test_features)

    # print("Training decision tree")
    # dt = DecisionTreeClassifier()
    # dt.fit(train_features, train_data["OpenStatus"])
    # probs = dt.predict_proba(test_features)

    # print("Training adaboost")
    # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"])
    # probs = ada.predict_proba(test_features)

    print("Training nearest neighbors")
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled,
                                                    train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
Example no. 17
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])
    gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0)
    gb.fit(fea, data["OpenStatus"])
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    dt.fit(fea, data["OpenStatus"])
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    et.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)

    probs = rf.predict_proba(test_features)
    probs2 = gb.predict_proba(test_features)
    probs3 = dt.predict_proba(test_features)
    probs4 = et.predict_proba(test_features)

    for i in range(0, len(probs)):
        for j in range(0,5):
            probs[i][j] = (probs[i][j] + probs2[i][j] + probs3[i][j] + probs4[i][j])/4

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
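The nested averaging loop above is a uniform ensemble average; since each predict_proba call returns an aligned (n_samples, 5) array, one vectorized expression does the same (a sketch, assuming numpy is imported as np, as in the other examples):

probs = (np.asarray(probs) + probs2 + probs3 + probs4) / 4.0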
Example no. 18
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)  # fit on this fold's training split, not the full data
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() -
                                                          y_submission.min())
    probs = y_submission

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "Saving Results."
    cu.write_submission(submission_file, probs)
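This example starts mid-function, inside the per-classifier loop of a stacking/blending script. A minimal self-contained sketch of the setup the fragment assumes follows; X, y, X_submission, and clfs are hypothetical stand-ins, and the sklearn.cross_validation import path matches the era of these scripts (modern scikit-learn moved it to sklearn.model_selection):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

# Hypothetical stand-ins for the competition's train and submission data:
X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_submission, _ = make_classification(n_samples=100, n_features=10, random_state=1)

clfs = [RandomForestClassifier(n_estimators=50, random_state=0),
        ExtraTreesClassifier(n_estimators=50, random_state=0)]
skf = list(StratifiedKFold(y, 10))
# One out-of-fold prediction column per base classifier:
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))
for j, clf in enumerate(clfs):
    dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
    # ... the per-fold loop shown in the fragment above fills these arrays ...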
Example no. 19
def predict_class(test_file="test/public_leaderboard.csv", recompute_feats=False):
    """
	Module that predicts class probabilities for test data 
	from a .csv file or precomputed feature vectors.
	"""

    # custom variables
    DATA_DIR = "../data/"
    SUBMISSION_DIR = "../data/submission/"
    train_file_all = "train/train.csv"
    test_file = "test/private_leaderboard.csv"
    feature_file = "test/private_leaderboard-feats.csv"
    output_file = "predictions.csv"

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    log = logging.getLogger(__name__)

    if recompute_feats:
        # features.compute_features( 'test/test.csv', 'test/test-feats.csv' )
        features.compute_features(test_file, feature_file)

    log.info("π: load features from file")
    X_test = pd.io.parsers.read_csv(os.path.join(DATA_DIR, feature_file), header=None)
    X_test = X_test.as_matrix()

    log.info("π: load classifier")
    npz_file = np.load(SUBMISSION_DIR + "cfy.npz")
    clf_lda = joblib.load(SUBMISSION_DIR + "clf_lda.pkl")
    clf_rfc = joblib.load(SUBMISSION_DIR + "clf_rfc.pkl")
    clf_gbc = joblib.load(SUBMISSION_DIR + "clf_gbc.pkl")

    log.info("π: load standardizer, normalizer")
    standardizer = npz_file["standardizer"].item()
    normalizer = npz_file["normalizer"].item()

    # log.info( 'π: perform feature selection' )
    # fselect = npz_file[ 'fselect' ].item()
    # X_test = fselect.transform( X_test )

    log.info("π: Random Forest predictions")
    y_rfc = clf_rfc.predict_proba(X_test)

    log.info("π: standardize and normalize test features")
    standardizer.transform(X_test)  # in-place
    normalizer.transform(X_test)  # in-place

    log.info("π: LDA and GBC class membership predictions")
    # X_test = clf_lda.transform( X_test )
    y_lda = clf_lda.predict_proba(X_test)
    y_gbc = clf_gbc.predict_proba(X_test)

    y_pred = (y_rfc + y_gbc) / 2.0

    log.info("π: calculate priors and update posteriors")
    new_priors = cu.get_priors(train_file_all)
    closed_reasons = pd.io.parsers.read_csv(os.path.join(DATA_DIR, train_labels), header=None)["X0"]
    closed_reason_counts = Counter(closed_reasons)
    reasons = sorted(closed_reason_counts.keys())
    total = len(closed_reasons)
    old_priors = [closed_reason_counts[reason] / float(total) for reason in reasons]  # float() avoids Python 2 integer division
    y_pred = cu.cap_and_update_priors(old_priors, y_pred, new_priors, 0.001)

    y_pred = (2 * y_pred + y_lda) / 3.0
    log.info("π: write predictions to file")
    writer = csv.writer(open(os.path.join(SUBMISSION_DIR, output_file), "w"), lineterminator="\n")
    writer.writerows(y_pred)
Example no. 20
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)  # fit on this fold's training split, not the full data
            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:,1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    probs = y_submission
    
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print "Saving Results."
    cu.write_submission(submission_file, probs)
        
Example no. 21
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:178351])
    important_features = []
    for x, i in enumerate(rf.feature_importances_):
        if i > np.average(rf.feature_importances_):
            important_features.append([str(x), i])
    print 'Most important features:', important_features

    print("Reading test file and making predictions")
    #features.compute_features("test1.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.01)

    y_pred = []
    for i in probs:
        i = [float(k) for k in i]
        j = i.index(max(i))
        if j == 3:
            y_pred.append("open")
        else:
            y_pred.append("closed")

    y_true = []
    a = 0
    b = 0
    test_reader = csv.reader(open(test_file))
    headers = test_reader.next()
    for line in test_reader:
        if line[14] == 'open':
            y_true.append("open")
            a = a + 1
        else:
            y_true.append("closed")
            b = b + 1
    print a
    print b

    print confusion_matrix(y_true[1:], y_pred, labels=["open", "closed"])

    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
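The index-of-maximum loop above reduces to np.argmax; assuming column 3 is the 'open' class as in the submission format, the open/closed decision becomes (a sketch):

y_pred = ["open" if np.argmax(p) == 3 else "closed" for p in probs]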
Example no. 22
def main():

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)

        i = _chunksize
        fea = None
        y = []

        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(
                    features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)

            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status: y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=depth,
                                        init=None,
                                        random_state=None)

        print 'starting 10-fold cross-validation'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            print "Fitting for fold " + str(fold)

            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)

            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [
                0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                0.926642216133641, 0.009079270442274271
            ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None,
                                        random_state=None)

        rf.fit(fea, y)

        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)

        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
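The per-fold matrix assembly in the cross-validation branch above builds X_train and X_test column by column in pure Python. If the BodyLength/TitleLength aliases are resolved once up front, pandas and NumPy indexing do the rest (a sketch, assuming fea is a pandas DataFrame carrying the raw column names):

cols = ['BodyMarkdown' if f == 'BodyLength'
        else 'Title' if f == 'TitleLength'
        else f
        for f in feature_names]
X_train = fea[cols].values[train_index]
y_train = [y[i] for i in train_index]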
Example no. 23
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples,1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
Example no. 24
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("test_.csv"))
    predictions = np.kron(np.ones((num_samples, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
Example no. 25
def train_classifier( train_file='train/train.csv', recompute_feats=False ):
	'''
	Reads Stack Overflow data from a .csv file,
	generates features, and trains a classifier.
	'''
	
	# custom variables
	DATA_DIR = "../data/"
	SUBMISSION_DIR = "../data/submission/"
	# train_file = 'train/train-sample.csv'
	label_file = 'train/train-labels.csv'
	feature_file = 'train/train-feats.csv'
			
	# display progress logs on stdout
	logging.basicConfig( level=logging.INFO,
						 format='%(asctime)s %(levelname)s %(message)s' )
	log = logging.getLogger(__name__)
	
	if recompute_feats:
		features.compute_features( train_file, feature_file, label_file )
	
	log.info( 'π: load features from file' )
	X = pd.io.parsers.read_csv( os.path.join( DATA_DIR, feature_file ), header=None )
	X = X.as_matrix()

	log.info( "π: encode labels" )
	labels = pd.io.parsers.read_csv( os.path.join( DATA_DIR, label_file ), header=None )['X0']
	lbl_map = { 'not a real question': 0, 'not constructive': 1, 'off topic': 2,
				'open': 3, 'too localized': 4 } # cf. required submission format
	labels = labels.map( lbl_map )
	y = labels.values
	
	log.info( 'π: select features' )
	fselect = SelectPercentile( score_func=chi2, percentile=42 ) # !?
	# X = fselect.fit_transform( X, y )
	
	log.info( 'π: define classifiers' )
	priors = cu.get_priors( os.path.join( DATA_DIR, 'train/train.csv' ) )
	clf_lda = LDA( priors=priors )
	clf_rfc = RandomForestClassifier( n_estimators=50, verbose=2, n_jobs=-1, random_state=0, 
				compute_importances=True, max_features=None ) #, criterion='entropy' )
	clf_gbc = GradientBoostingClassifier()

	log.info( 'π: fit Random Forest' )
	clf_rfc.fit( X, y )

	log.info( "π: compute feature ranking for RFC" )
	importances = clf_rfc.feature_importances_
	std = np.std([ tree.feature_importances_ for tree in clf_rfc.estimators_ ], axis=0 )
	indices = np.argsort( importances )[::-1]
	for f in xrange( 13 ): # the top thirteen features
		print "%d. feature %d (%f)" % (f + 1, indices[f], importances[ indices[f] ])

	log.info( "π: standardize and normalize features" )
	standardizer = StandardScaler( copy=False ).fit( X, y )
	standardizer.transform( X, y )	# in-place
	normalizer = Normalizer( copy=False, norm='l2' ).fit( X, y ) # 'l1'
	normalizer.transform( X, y )	# in-place
	
	log.info( 'π: fit Linear Discriminant Analysis' )
	clf_lda.fit( X, y )
	# X = cld_lda.transform( X, y )
	log.info( 'π: fit Gradient Boosting' )
	clf_gbc.fit( X, y )
	
	log.info( 'π: save classifiers' )
	np.savez( SUBMISSION_DIR+'cfy.npz', X=X, y=y, fselect=fselect, 
				standardizer=standardizer, normalizer=normalizer )
	joblib.dump( clf_lda, SUBMISSION_DIR + 'clf_lda.pkl', compress=9 )
	joblib.dump( clf_rfc, SUBMISSION_DIR + 'clf_rfc.pkl', compress=9 )
	joblib.dump( clf_gbc, SUBMISSION_DIR + 'clf_gbc.pkl', compress=9 )
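Since lbl_map fixes the class order required by the submission format, the inverse map recovers label strings from encoded predictions (a sketch):

inv_map = dict((v, k) for k, v in lbl_map.items())
# e.g. inv_map[3] == 'open', matching the submission column order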
Example no. 26
def main():
    start = time.time()

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
    
        i = _chunksize
        fea = None
        y = []        
        
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)
        
            i = i + _chunksize
    else:
        print "Reading train data and its features from: " + train_file
        data = cu.get_dataframe(train_file)
        fea = features.extract_features(feature_names,data)
        print "Collecting statuses"
        y = []
        for element in data["OpenStatus"]:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)

    if do_cross_validation == 1:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001)

        print 'starting 10-fold cross-validation'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y),k = 10)
        fold = 0
        result_sum = 0
        for train_index,test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
                
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            
            print "fitting this fold's data"
            
            logit.fit(X_train, y_train)  # the classifier defined above is named logit, not rf
            y_test = vectorize_actual(y_test)
            
            #_pred_probs = denormalize(logit.predict_proba(X_test))
            _pred_probs = logit.predict_proba(X_test)
            
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)            
            # evaluating the performance
            result = eval.mcllfun(y_test,_pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold,result)
            
        print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)     
    else:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True

        print "Fitting"
        logit.fit(fea, y)
        
        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names,test_data)

        print "Making predictions"
        global probs
        probs = logit.predict_proba(test_fea)
        
        if is_full_train_set == 0:
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)    

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)