Example #1
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    clf = svm.SVC(decision_function_shape='ovo', probability=True)  # probability=True is required for predict_proba below
    clf.fit(fea, data["OpenStatus"][:178351])

    print("Reading test file and making predictions")
    #features.compute_features("test_.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = clf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
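
cu.cap_and_update_priors is used throughout these examples but its implementation is not shown here. As a rough sketch of what such a prior-rebalancing step could look like (an assumption, not the actual library code), assuming probs is an (n_samples, n_classes) array and both prior vectors hold per-class frequencies:

import numpy as np

def cap_and_update_priors_sketch(old_priors, probs, new_priors, epsilon):
    # Clip probabilities away from 0/1, reweight each class column by the
    # ratio of new to old priors, then renormalize every row to sum to 1.
    probs = np.clip(np.asarray(probs, dtype=float), epsilon, 1 - epsilon)
    probs = probs * (np.asarray(new_priors) / np.asarray(old_priors))
    return probs / probs.sum(axis=1)[:, np.newaxis]
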
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    #print(data['OpenStatusMod'])

    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    #print(fea.columns)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1, random_state=0)
    print("Training the model, created RFC")
    #rf.fit(fea, data["OpenStatus"])
    rf.fit(fea, data["OpenStatusMod"])

    print("Reading test file and making predictions")
    #data = cu.get_dataframe(test_file)
    data = cu.get_dataframe(full_train_file)
    print("Reading data frame")
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("adding column")
    test_features = features.extract_features(feature_names, data)
    print("extract features")
    probs = rf.predict_proba(test_features)

#    print("Calculating priors and updating posteriors")
#    new_priors = cu.get_priors(full_train_file)
#    old_priors = cu.get_priors(train_file)
#    print "new priors %s" %(new_priors)
#    print "old priors %s" %(old_priors)
#    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    score()
def main():
	start = time.time()
	print("Reading the data from " + train_file)
	data = cu.get_dataframe(train_file)

	print("Extracting features")
	fea = features.extract_features(feature_names, data)

	print("Training the model")
	clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2)
	clf.fit(fea, data["OpenStatus"])

	print "Listing feature importances:"
	cu.list_feature_importance(clf,feature_names)
	
	print("Reading test file and making predictions: " + test_file)
	data = cu.get_dataframe(test_file)
	test_features = features.extract_features(feature_names, data)
	probs = clf.predict_proba(test_features)

	if (update_posteriors):
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
	
	print("Saving submission to %s" % submission_file)
	cu.write_submission(submission_file, probs)
	
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example #5
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:140323])

    print("Reading test file and making predictions")
    features.compute_features(test_file,feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_little)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    #classifier = MultinomialNB()
    #classifier = KNeighborsClassifier(n_neighbors=3, weights='distance')
    classifier = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1)
    #classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1)

    classifier.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_litte)
    test_features = features.extract_features(feature_names, data)
    probs = classifier.predict_proba(test_features)

    #print("Calculating priors and updating posteriors")
    #new_priors = cu.get_priors(full_train_file)
    #old_priors = cu.get_priors(train_file)
    #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_litte)
    cu.write_submission(submission_litte, probs)
Example #7
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mten = MultiTaskElasticNet(alpha=0.1,
                               rho=0.5,
                               fit_intercept=True,
                               normalize=False,
                               copy_X=True,
                               max_iter=1000,
                               tol=0.0001,
                               warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    mten.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False, n_iter=5, shuffle=False, verbose=1, eta0=1.0, n_jobs=-1, seed=0, class_weight="auto", warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
            
	print "Fitting"
	percep.fit(fea, y)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	#probs = percep.predict_proba(test_fea) # only available for binary classification
	probs = percep.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
	
	#if is_full_train_set == 0:
	#	print("Calculating priors and updating posteriors")
	#	new_priors = cu.get_priors(full_train_file)
	#	old_priors = cu.get_priors(train_file)
	#	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
            
	print "Fitting"
	mten.fit(fea, y)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	probs = mten.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
	
	if is_full_train_set == 0:
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example #10
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
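
allocate_topics is not shown in this example. Judging from the Topic%d feature names, it presumably runs the online LDA over the question bodies in batches and attaches one column per topic; a purely hypothetical sketch of that shape:

import numpy as np

def allocate_topics_sketch(lda, data, K, batchsize, D):
    # Hypothetical: feed the documents to the online LDA batch by batch
    # and keep each document's normalized topic weights.
    gammas = []
    for start in range(0, len(data), batchsize):
        docs = list(data["BodyMarkdown"][start:start + batchsize])
        gamma, bound = lda.update_lambda(docs)
        gammas.append(gamma / gamma.sum(axis=1)[:, np.newaxis])
    weights = np.vstack(gammas)
    # Expose the topics as Topic0..Topic{K-1} columns for extract_features.
    for k in range(K):
        data['Topic%d' % k] = weights[:, k]
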
Example #11
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    data = None
    if os.path.exists('data.pik'):
        print("Unpickeling the data")
        data = pickle.load(open('data.pik'))
    else:
        print("Reading the data")
        data = cu.get_dataframe(full_train_file)
        pickle.dump(data,open('data.pik','w'))

    fea = None
    if os.path.exists('fea.pik'):
        print("Unpickeling the fea")
        fea = pickle.load(open('fea.pik'))
    else:
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        pickle.dump(fea,open('fea.pik','w'))
    
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=2)
    
    rf.fit(fea, data["OpenStatus"])
    print "Features Importance:"
    imps = zip(rf.feature_importances_,
               feature_names,)
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def cross_validate():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Cross-Validating")
    rf = RandomForestClassifier(n_estimators=10,
                                verbose=1,
                                compute_importances=True,
                                n_jobs=2)
    cv = cross_validation.KFold(len(data),
                                k=10,
                                indices=False)
    results = []
    for traincv, testcv in cv:
        print "\t-- cv [%d]" % len(results)
        # traincv/testcv are boolean masks over the rows
        print "\t", "learning"
        rf.fit(fea[traincv], data["OpenStatus"][traincv])
        print "\t", "predicting"
        probs = rf.predict_proba(fea[testcv])
        print "\t", "evaluating"
        # llfun is assumed to take the true labels and the predicted
        # class probabilities for the fold
        results.append(llfun(data["OpenStatus"][testcv], probs))
    print "LogLoss: " + str(np.array(results).mean())
def main():
    
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    print("Writing short sample features file")
    ''' preview in console '''
    print(fea.values[:4])
    print fea.describe().to_string()
    ''' save the X features data (matrix)'''
    # cu.write_submission(train_features_short_file, fea.values)
    np.savetxt(train_features_short_file, fea.values, fmt='%d', delimiter=',', newline='\n')

    
    '''train_features_short = [fea, data["OpenStatus"]]'''
    closed_reasons = data["OpenStatus"]
    closed_reasons_count = Counter(closed_reasons)

    print(closed_reasons_count.keys()[0:5])
    closed_reasons_enum = map(closed_reasons_count.keys().index, closed_reasons)    
    print(closed_reasons_enum[:9])
    
    print("Saving submission to %s" % submission_file)
    ''' save the y supervised classification data (vector) '''
    np.savetxt(train_y_short_file, closed_reasons_enum, fmt='%d', delimiter=',', newline='\n')

    '''
Example #15
def main():
    data = cu.get_dataframe("train.csv")
    data = data.sort_index(by="PostCreationDate")

    header = cu.get_header("train.csv")
    cutoff = datetime.datetime(2012, 7, 18)

    data[data["PostCreationDate"] < cutoff].to_csv(os.path.join(cu.data_path, "train-A.csv"), index=False)
    data[data["PostCreationDate"] >= cutoff].to_csv(os.path.join(cu.data_path, "train-B.csv"), index=False)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    X = features.extract_features(feature_names, data)
    y = [class_labels[i] for i in data["OpenStatus"]]
    skf = StratifiedKFold(y, 10)
    result_f1 = 0
    result_logloss = 0

    fold = 1
    for train, test in skf:

        print "Fold %d" % fold 
        fold+=1

        X_train = [X.ix[i] for i in train]
        y_train = [y[i] for i in train]

        X_test = [X.ix[i] for i in test]
        y_test = [y[i] for i in test]

        if (options.__dict__['classifier'] == 'erf'):
            classifier = ExtraTreesClassifier(n_estimators=100, verbose=0, compute_importances=True, n_jobs=-1)
        elif(options.__dict__['classifier'] == 'mnb'):
            classifier =  MultinomialNB()
        elif (options.__dict__['classifier'] == 'knn'):
            classifier = KNeighborsClassifier(n_neighbors=11)
        elif (options.__dict__['classifier'] == 'gbc'):
            classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1)


        classifier.fit(X_train, y_train)
        
        probs = classifier.predict_proba(X_test)
        if (options.__dict__['priors'] != 0):
            print("Calculating priors and updating posteriors")
            new_priors = cu.get_priors(full_train_file)
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        y_pred = probs
        y_test = np.array(y_test)
        logloss = multiclass_log_loss(y_test, y_pred, eps=1e-15)

        y_pred = classifier.predict(X_test)
        f1 = f1_score(y_test, y_pred, pos_label=None)
        print "Log Loss: %f f1: %f" % (logloss, f1)
        result_f1 += f1
        result_logloss += logloss


    print '\navg LogLoss: %f avg f1: %f' % (result_logloss/10.0, result_f1/10.0)
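
multiclass_log_loss is not defined in the snippet; a common formulation of that metric, assuming y_true holds integer class indices and y_pred holds one row of class probabilities per sample:

import numpy as np

def multiclass_log_loss_sketch(y_true, y_pred, eps=1e-15):
    # Clip, renormalize rows, then average the negative log-probability
    # assigned to each sample's true class.
    predictions = np.clip(y_pred, eps, 1 - eps)
    predictions = predictions / predictions.sum(axis=1)[:, np.newaxis]
    rows = np.arange(predictions.shape[0])
    return -np.mean(np.log(predictions[rows, np.asarray(y_true)]))
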
def main():
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)

    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)

    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)


    # print("Training random forest")
    # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # rf.fit(train_features, train_data["OpenStatus"])
    # probs = rf.predict_proba(test_features)

    # print("Training decision tree")
    # dt = DecisionTreeClassifier()
    # dt.fit(train_features, train_data["OpenStatus"])
    # probs = dt.predict_proba(test_features)

    # print("Training adaboost")
    # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"])
    # probs = ada.predict_proba(test_features)

    print("Training nearest neighbors")
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled, train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
Example #19
def main():
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])

    print "Listing feature importances:"
    cu.list_feature_importance(clf, feature_names)

    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)

    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example #20
def main():
    print("Reading the data")
    train_data = cu.get_dataframe(train_file)

    print("Extracting features")
    train_features = features.extract_features(feature_names, train_data)

    print("Reading test file and making predictions")
    test_data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, test_data)

    # print("Training random forest")
    # rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    # rf.fit(train_features, train_data["OpenStatus"])
    # probs = rf.predict_proba(test_features)

    # print("Training decision tree")
    # dt = DecisionTreeClassifier()
    # dt.fit(train_features, train_data["OpenStatus"])
    # probs = dt.predict_proba(test_features)

    # print("Training adaboost")
    # ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=600, learning_rate=1.5, algorithm="SAMME").fit(train_features, train_data["OpenStatus"])
    # probs = ada.predict_proba(test_features)

    print("Training nearest neighbors")
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features_scaled = scaler.transform(train_features)
    test_features_scaled = scaler.transform(test_features)
    nbrs = KNeighborsClassifier(n_neighbors=10).fit(train_features_scaled,
                                                    train_data["OpenStatus"])
    probs = nbrs.predict_proba(test_features_scaled)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    actual = cu.get_actual(test_data["OpenStatus"])
    print(cu.get_log_loss(actual, probs, 10**(-15)))
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])
    gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0)
    gb.fit(fea, data["OpenStatus"])
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    dt.fit(fea, data["OpenStatus"])
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    et.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)

    probs = rf.predict_proba(test_features)
    probs2 = gb.predict_proba(test_features)
    probs3 = dt.predict_proba(test_features)
    probs4 = et.predict_proba(test_features)

    for i in range(0, len(probs)):
        for j in range(0,5):
            probs[i][j] = (probs[i][j] + probs2[i][j] + probs3[i][j] + probs4[i][j])/4

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def test1():
    data = cu.get_dataframe(full_train_file)
    print("Reading data frame")
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("fill na for bodymarkdown")
    data = data.fillna({"BodyMarkdown":""}, inplace=True)
    print("fill na for title")
    data = data.fillna({"Title":""}, inplace=True)
    print("fill na for post creation")
    data = data.fillna({"PostCreationDate":datetime.datetime(2008,07,31,21,42,52)}, inplace=True)
    print("fill na for owner creation")
    data = data.fillna({"OwnerCreationDate":datetime.datetime(2008,07,31,21,42,53)}, inplace=True)
    print("adding column")
    
    test_features = features.extract_features(feature_names, data)
Example #23
def main():
    print "get data"
    data = cu.get_dataframe("train.csv")
    print "sort by creation date"
    data = data.sort_index(by="PostCreationDate")
    print "cut off"
    header = cu.get_header("train.csv")
    splits = np.array_split(data, 3)
    frames = [splits[0], splits[1]]
    train_data = pd.concat(frames)
    test_data = splits[2]
    # cutoff = datetime.datetime(2012, 7, 18)
    print "write to csv"
    cu.write_sample("train_data.csv", header, train_data)
    train_data.to_csv(os.path.join(cu.data_path, "train_data.csv"), index=False, header=header)
    test_data.to_csv(os.path.join(cu.data_path, "test_data.csv"), index=False, header=header)
Example #24
def hack():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    counter = defaultdict(lambda: defaultdict(lambda: 0))
    grouped = data.groupby('OpenStatus')

    for name, group in grouped:
        print name
        for wrds in group["BodyMarkdown"].apply(words):
            for word in wrds:
                counter[name][word] += 1

    limit = 20
    for name, word_dict in counter.items():
        for word, count in sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)[:limit]:
            print name, word, count
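
The nested defaultdict plus manual sort in hack() is what collections.Counter provides directly; an equivalent version, assuming words() returns a token list per post:

from collections import Counter

counters = {}
for name, group in data.groupby('OpenStatus'):
    c = Counter()
    for wrds in group["BodyMarkdown"].apply(words):
        c.update(wrds)  # count every token in this post
    counters[name] = c

for name, c in counters.items():
    for word, count in c.most_common(20):
        print name, word, count
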
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples,1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
                    , "BodyHasCode"
                    , "HourOfPost"
                    , "WeekDayOfPost"
                    , "DayOfPost"
                    , "MonthOfPost"
                    , "YearOfPost"
                    , "MainTag"    
                    , "SecondTag"
                    , "ThirdTag"
                    , "FourthTag"
                    , "FifthTag" 
                    , "TitlePmi" 
                    , "BodyPmi"   
                    , "TitleLenghtWords"
                    , "TitleLength"
                    , "BodyLenghtWords"
                    , "BodyLength"
                    ,"BagOfWords"
                    ]
#    feature_names = ["BagOfWords"]

    data = cu.get_dataframe("../files/train-sample.csv")
    features = features.extract_features(feature_names, data)
    y =  [ class_labels[i] for i in data["OpenStatus"]]
    print(features)
#    i = 0
#
#    for d in features:
#        print(d)
#        print(y[i])
#        i = i + 1   
Example #27
def main():
    start = time.time()

    result_sum = 0

    data = cu.get_dataframe("data/train-sample.csv")
    #test_data = cu.get_dataframe("data/public_leaderboard.csv")   #use this for evaluating public_leaderboard

    print 'data loaded'

    fea = features.extract_features(feature_names, data)
    #test_fea = features.extract_features(feature_names,test_data)  #use this for evaluating public_leaderboard

    print 'features extracted'

    knn = KNeighborsClassifier(n_neighbors=10, weights='distance')

    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    y = []
    ques_status = [
        'open', 'too localized', 'not constructive', 'off topic',
        'not a real question'
    ]
    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status: y.append(index)

    print 'starting 10 fold verification'
    # Dividing the dataset into k = 10 folds for cross validation
    skf = StratifiedKFold(y, k=10)
    fold = 0
    for train_index, test_index in skf:
        fold += 1
        X_train = []
        X_test = []
        y_train = []
        y_test = []
        for i in train_index:
            temp = []
            temp.append(fea['ReputationAtPostCreation'][i])
            temp.append(fea['UserAge'][i])
            temp.append(fea['Title'][i])
            temp.append(fea['BodyMarkdown'][i])
            temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i])
            X_train.append(temp)
            y_train.append(y[i])

        for i in test_index:
            temp = []
            temp.append(fea['ReputationAtPostCreation'][i])
            temp.append(fea['UserAge'][i])
            temp.append(fea['Title'][i])
            temp.append(fea['BodyMarkdown'][i])
            temp.append(fea['OwnerUndeletedAnswerCountAtPostTime'][i])
            X_test.append(temp)
            y_test.append(y[i])

        y_test = vectorize_actual(y_test)  # vectorize y_test
        knn.fit(X_train, y_train)  # train the classifier
        predictions = knn.predict_proba(X_test)  # predict the test fold

        # evaluating the performance
        result = eval_tool.mcllfun(y_test, predictions)
        result_sum += result
        print "MCLL score for fold %d = %0.11f" % (fold, result)

    print "Average MCLL score for this classifier = %0.11f" % (result_sum / 10)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)

    ### Use this code for evaluting public_leaderboard
    '''knn.fit(fea,y)
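
The per-row temp lists inside the fold loop above can be built in one shot by indexing the feature frame; a sketch, assuming fea keeps the five columns used:

cols = ['ReputationAtPostCreation', 'UserAge', 'Title',
        'BodyMarkdown', 'OwnerUndeletedAnswerCountAtPostTime']
X_train = fea[cols].values[train_index]  # rows for this fold's training split
X_test = fea[cols].values[test_index]
y_train = [y[i] for i in train_index]
y_test = [y[i] for i in test_index]
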
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples, 5)), np.array(0.2))
    cu.write_submission("uniform_benchmark.csv", predictions)
Example #29
def user_age(data):
    return pd.DataFrame.from_dict({
        "UserAge":
        (data["PostCreationDate"] -
         data["OwnerCreationDate"]).apply(lambda x: x.total_seconds())
    })


###########################################################


def extract_features(feature_names, data):
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea


if __name__ == "__main__":
    feature_names = [
        "BodyLength", "NumTags", "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation", "TitleLength", "TitleWordCount", "UserAge"
    ]

    data = cu.get_dataframe("train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
Example #30
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples, 5)), np.array(0.2))
    cu.write_submission("uniform_benchmark.csv", predictions)
Example #31
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("test_.csv"))
    predictions = np.kron(np.ones((num_samples, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
Example #32
def extract_features(feature_names, data):
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            #fea = fea.join(data[name])
            fea = fea.join(data[name].apply(math.fabs))
        else:
            #try:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
        #except TypeError:
        #    pass
    return fea


if __name__ == "__main__":
    feature_names = [
        "BodyLength", "NumTags", "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation", "TitleLength", "UserAge"
    ]

    data = cu.get_dataframe(
        "private_leaderboard_massaged.csv"
    )  #cu.get_dataframe("train-sample_October_9_2012_v2_massaged.csv")
    features = extract_features(feature_names, data)
    print(features)
    #print features['UserAge']
    #print features['BodyLength']
    #print features['TitleLength']
    print features['NumTags']
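
Applying math.fabs to raw columns in this extract_features variant is presumably there because MultinomialNB, used elsewhere in these examples, rejects negative feature values (UserAge, for instance, can come out negative). A quick demonstration:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[1.0, -2.0], [3.0, 4.0]])
try:
    MultinomialNB().fit(X, [0, 1])
except ValueError:
    print "MultinomialNB rejects negative features"
MultinomialNB().fit(np.abs(X), [0, 1])  # fine once values are non-negative
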
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = [[0.0, 0.0, 0.0, 1.0, 0.0] for i in range(num_samples)]
    cu.write_submission("always_open_benchmark.csv", predictions)
Example #34
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:178351])
    important_features = []
    for x, i in enumerate(rf.feature_importances_):
        if i > np.average(rf.feature_importances_):
            important_features.append([str(x),i])
    print 'Most important features:',important_features

    print("Reading test file and making predictions")
    #features.compute_features("test1.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.01)

    y_pred = []
    for i in probs:
        i = [float(k) for k in i]
        j = i.index(max(i))
        if j == 3:
            y_pred.append("open")
        else:
            y_pred.append("closed")

    y_true = []
    a = 0
    b = 0
    test_reader = csv.reader(open(test_file))
    headers = test_reader.next()
    for line in test_reader:
        if line[14] == 'open':
            y_true.append("open")
            a = a + 1
        else:
            y_true.append("closed")
            b = b + 1
    print a
    print b

    print confusion_matrix(y_true[1:], y_pred, labels=["open", "closed"])

    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
Example #35
    features['tags'] = ' '.join(features)

    return features


def get_preprocess_data(data):
    data_p = []
    for i in range(0, len(data)):
        data_p.append(row_to_features(data.ix[i]))
    data_p = pd.DataFrame(data_p)
    return data_p


if __name__ == "__main__":
    print("get train data")
    train_data = cu.get_dataframe(train_file)
    print("get test data")
    test_data = cu.get_dataframe(test_file)
    train_data = train_data.replace(np.nan, '', regex=True)
    test_data = test_data.replace(np.nan, '', regex=True)
    train_data_p = get_preprocess_data(train_data)
    test_data_p = get_preprocess_data(test_data)
    text_vars = ['title', 'tags', 'body']
    print("print tf idf")
    for var in text_vars:
        tf_idf = TfidfVectorizer(min_df=2,
                                 use_idf=1,
                                 smooth_idf=1,
                                 sublinear_tf=1,
                                 ngram_range=(1, 2),
                                 norm='l2')
Example #36
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    #re_qm = re.compile('\?')
    #fea['HasBodyQM'] = data.BodyMarkdown.apply(lambda b: re_qm.search(b) != None)
    #fea['IsNoob'] = data.ReputationAtPostCreation <= 1
    #fea['IsLeech'] = data.OwnerUndeletedAnswerCountAtPostTime == 0
    #print 'Added HasBodyQM: ', Counter(fea['HasBodyQM'])
    #print 'Added IsNoob: ', Counter(fea['IsNoob'])
    #print 'Added IsLeech: ', Counter(fea['IsLeech'])
    #print "generating extra features"
    #fea = fea.join(get_extra_features(data))

    return fea


if __name__ == "__main__":

    data = cu.get_dataframe("C:\\Projects\\ML\\stack\\data\\train-sample.csv")
    features = extract_features(data)
    print(features)


def get_words(data, i):
    p = re.compile('^[a-z][a-z-]*[a-z]$')
    body = Body(data.BodyMarkdown[i])
    words = body.get_unique_words()
    punks = [w for w in words if not p.match(w)]
    stops = [w for w in words if w in stopwords]
    words = [w for w in words if not w in stopwords and p.match(w)]
    return words, punks, stops
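
The pattern '^[a-z][a-z-]*[a-z]$' in get_words keeps lowercase words of at least two characters that may contain internal hyphens. A quick check:

import re

p = re.compile('^[a-z][a-z-]*[a-z]$')
print [w for w in ['well-known', 'a', 'C++', 'hello'] if p.match(w)]
# -> ['well-known', 'hello']
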
Example #37
def main():

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)

        i = _chunksize
        fea = None
        y = []

        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(
                    features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)

            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status: y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=depth,
                                        init=None,
                                        random_state=None)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            print "Fitting for fold " + str(fold)

            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)

            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [
                0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                0.926642216133641, 0.009079270442274271
            ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None,
                                        random_state=None)

        rf.fit(fea, y)

        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)

        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
Example #38
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, 
                camel_to_underscores(name))(data))
    return fea

if __name__=="__main__":
#    feature_names = [ "BodyLength"
#                    , "NumTags"
#                    , "OwnerUndeletedAnswerCountAtPostTime"
#                    , "ReputationAtPostCreation"
#                    , "TitleLength"
#                    , "UserAge"
#                    ]
    
    feature_names = [ "BodyLength"                
                , "OwnerUndeletedAnswerCountAtPostTime"
                , "ReputationAtPostCreation"                
                , "UserAge"
                , "Title"
                , "Tag1"
                , "Tag2"
                , "Tag3"
                , "Tag4"
                , "Tag5"
                ]
              
    data = cu.get_dataframe("C:\\Users\\Ben\\Temp\\StackOverflow\\train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
    features['title'] = title
    features['label'] = row['OpenStatus']
    features['tags'] = ' '.join(features)

    return features

def get_preprocess_data(data):
    data_p = []
    for i in range(0, len(data)):
        data_p.append(row_to_features(data.ix[i]))
    data_p = pd.DataFrame(data_p)
    return data_p

if __name__=="__main__":
    print("get train data")
    train_data = cu.get_dataframe(train_file)
    print("get test data")
    test_data = cu.get_dataframe(test_file)
    train_data = train_data.replace(np.nan,'', regex=True)
    test_data = test_data.replace(np.nan,'', regex=True)
    train_data_p = get_preprocess_data(train_data)
    test_data_p = get_preprocess_data(test_data)
    text_vars = ['title', 'tags', 'body']
    print("print tf idf")
    for var in text_vars:
        tf_idf = TfidfVectorizer(min_df=2, use_idf=1, smooth_idf=1, sublinear_tf=1, ngram_range=(1,2), norm='l2')
        tf_idf.fit(train_data_p[var].append(test_data_p[var]))
        probs = get_predictions(tf_idf, train_data_p[var], test_data_p[var], train_data_p['label'])

        for i in range(1,6):
            test_data_p[var+'_pred%d'%i] = probs[:, i-1]
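
get_predictions is not shown; from the call it presumably fits a classifier on the TF-IDF of the training text and returns class probabilities for the test text. A hedged sketch (logistic regression is an assumption here; any probabilistic classifier fits the signature):

from sklearn.linear_model import LogisticRegression

def get_predictions(tf_idf, train_text, test_text, labels):
    # tf_idf has already been fit on train + test text above; transform
    # both sides and return per-class probabilities for the test rows.
    X_train = tf_idf.transform(train_text)
    X_test = tf_idf.transform(test_text)
    clf = LogisticRegression()
    clf.fit(X_train, labels)
    return clf.predict_proba(X_test)
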
Example #40
            fea = fea.join(getattr(features, 
                camel_to_underscores(name))(data))
    #re_qm = re.compile('\?')
    #fea['HasBodyQM'] = data.BodyMarkdown.apply(lambda b: re_qm.search(b) != None)
    #fea['IsNoob'] = data.ReputationAtPostCreation <= 1
    #fea['IsLeech'] = data.OwnerUndeletedAnswerCountAtPostTime == 0
    #print 'Added HasBodyQM: ', Counter(fea['HasBodyQM'])
    #print 'Added IsNoob: ', Counter(fea['IsNoob'])
    #print 'Added IsLeech: ', Counter(fea['IsLeech'])
    #print "generating extra features"
    #fea = fea.join(get_extra_features(data))
    
    return fea

if __name__=="__main__":
              
    data = cu.get_dataframe("C:\\Projects\\ML\\stack\\data\\train-sample.csv")
    features = extract_features(data)
    print(features)


def get_words(data, i):
    p = re.compile('^[a-z][a-z-]*[a-z]$')
    body = Body(data.BodyMarkdown[i])
    words = body.get_unique_words()
    punks = [w for w in words if not p.match(w)]
    stops = [w for w in words if w in stopwords]
    words = [w for w in words if not w in stopwords and p.match(w)]
    return words, punks, stops

Example #41
    return pd.DataFrame.from_dict({"TitleLength": data["Title"].apply(len)})


def user_age(data):
    return pd.DataFrame.from_dict({"UserAge": (data["PostCreationDate"]
            - data["OwnerCreationDate"]).apply(lambda x: x.total_seconds())})

###########################################################

def extract_features(feature_names, data):
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, 
                camel_to_underscores(name))(data))
    return fea

if __name__=="__main__":
    feature_names = [ "BodyLength"
                    , "NumTags"
                    , "OwnerUndeletedAnswerCountAtPostTime"
                    , "ReputationAtPostCreation"
                    , "TitleLength"
                    , "UserAge"
                    ]
              
    data = cu.get_dataframe("train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
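
camel_to_underscores maps a feature name such as "TitleLength" to the generator function title_length looked up via getattr. A minimal version, assuming that naming convention:

import re

def camel_to_underscores(name):
    # 'TitleLength' -> 'title_length', 'UserAge' -> 'user_age'
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
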
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	y = []
	
	for element in data['OpenStatus']:
		for index, status in enumerate(ques_status):
			if element == status: y.append(index)
	
	if do_cross_validation == 1:
		print 'starting 10 fold verification'
		# Dividing the dataset into k = 10 folds for cross validation
		#skf = StratifiedKFold(y,k = 10)
		skf = KFold(len(y),k = 10)
		fold = 0
		result_sum = 0
		for train_index,test_index in skf:
			fold += 1
			X_train = []
			X_test = []
			y_train = []
			y_test = []
			for i in train_index:
				temp = []
				for feature_name in feature_names:
					temp.append(fea[feature_name][i])
				X_train.append(temp)
				y_train.append(y[i])
				
			for i in test_index:
				temp = []
				for feature_name in feature_names:
					temp.append(fea[feature_name][i])
				X_test.append(temp)
				y_test.append(y[i])
			
			mnbayes.fit(X_train, y_train) #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
			y_test = vectorize_actual(y_test)               # vectorize y_test
			
			_pred_probs = mnbayes.predict_proba(X_test)
			# evaluating the performance
			result = eval.mcllfun(y_test,_pred_probs)
			result_sum += result
			print "MCLL score for fold %d = %0.11f" % (fold,result)
			
		print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)
	
		print "Reading test data and features"
		test_data = cu.get_dataframe(test_file)
		test_fea = features.extract_features(feature_names,test_data)
		
		print "Fitting"
		mnbayes.fit(fea,y)#, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
		
		print "Making predictions"
		global probs
		probs = mnbayes.predict_proba(test_fea)

		#if is_full_train_set == 0:
		#	print("Calculating priors and updating posteriors")
		#	new_priors = cu.get_priors(full_train_file)
		#	old_priors = cu.get_priors(train_file)
		#	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

		print "writing submission to " + submission_file
		cu.write_submission(submission_file, probs)
	
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example #43
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    y = []

    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status: y.append(index)

    if do_cross_validation == 1:
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        #skf = StratifiedKFold(y,k = 10)
        skf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in skf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            mnbayes.fit(
                X_train, y_train
            )  #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
            y_test = vectorize_actual(y_test)  # vectorize y_test

            _pred_probs = mnbayes.predict_proba(X_test)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)

        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names, test_data)

        print "Fitting"
        mnbayes.fit(
            fea, y
        )  #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])

        print "Making predictions"
        global probs
        probs = mnbayes.predict_proba(test_fea)

        #if is_full_train_set == 0:
        #	print("Calculating priors and updating posteriors")
        #	new_priors = cu.get_priors(full_train_file)
        #	old_priors = cu.get_priors(train_file)
        #	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example #44
0
def extract_features(feature_names, data):
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            #fea = fea.join(data[name])
            fea = fea.join(data[name].apply(math.fabs))
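            # (applying fabs presumably guards against negative values, e.g. a
            # UserAge computed from inconsistent timestamps)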
        else:
            #try:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
            #except TypeError:
            #    pass
    return fea

if __name__=="__main__":
    feature_names = [ "BodyLength"
                    , "NumTags"
                    , "OwnerUndeletedAnswerCountAtPostTime"
                    , "ReputationAtPostCreation"
                    , "TitleLength"
                    , "UserAge"
                    ]
              
    data = cu.get_dataframe("private_leaderboard_massaged.csv") #cu.get_dataframe("train-sample_October_9_2012_v2_massaged.csv")
    features = extract_features(feature_names, data)
    print(features)
    #print features['UserAge']
    #print features['BodyLength']
    #print features['TitleLength']
    print features['NumTags']

Example #45
0
def user_age(data):
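    # Account age of the question's owner at post time, in seconds (assumes
    # both columns were parsed as datetimes).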
    return pd.DataFrame.from_dict({
        "UserAge":
        (data["PostCreationDate"] -
         data["OwnerCreationDate"]).apply(lambda x: x.total_seconds())
    })


###########################################################


def extract_features(feature_names, data):
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea


if __name__ == "__main__":
    feature_names = [
        "BodyLength", "NumTags", "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation", "TitleLength", "UserAge"
    ]

    data = cu.get_dataframe(
        "C:\\Users\\Ben\\Temp\\StackOverflow\\train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
        {"UserAge": (data["PostCreationDate"] - data["OwnerCreationDate"]).apply(lambda x: x.total_seconds())}
    )


###########################################################


def extract_features(feature_names, data):
    fea = pd.DataFrame(index=data.index)
    for name in feature_names:
        if name in data:
            fea = fea.join(data[name])
        else:
            fea = fea.join(getattr(features, camel_to_underscores(name))(data))
    return fea


if __name__ == "__main__":
    feature_names = [
        "BodyLength",
        "NumTags",
        "OwnerUndeletedAnswerCountAtPostTime",
        "ReputationAtPostCreation",
        "TitleLength",
        "UserAge",
    ]

    data = cu.get_dataframe("/home/kesten/VCP/Git/ML/StackOverflowChallenge/data/train-sample.csv")
    features = extract_features(feature_names, data)
    print(features)
Example #47
0
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    percep = Perceptron(penalty=None,
                        alpha=0.0001,
                        fit_intercept=False,
                        n_iter=5,
                        shuffle=False,
                        verbose=1,
                        eta0=1.0,
                        n_jobs=-1,
                        seed=0,
                        class_weight="auto",
                        warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
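    # NOTE: X is built here but never used; the model below is fit on the full
    # feature frame instead.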
    # Must be an array-type object. Strings must be converted to
    # integer values, otherwise the fit method raises ValueError
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    percep.fit(fea, y)
    '''Make sure you have an up-to-date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    #probs = percep.predict_proba(test_fea) # only available for binary classification
    preds = percep.predict(test_fea)
    # predict returns class labels of shape [n_samples]; one-hot encode them to
    # [n_samples, n_classes] (np.resize(preds, (len(preds) / 5, 5)) would only
    # reshape the label vector, not produce per-sample class probabilities)
    probs = np.eye(len(ques_status))[np.asarray(preds, dtype=int)]

    #if is_full_train_set == 0:
    #	print("Calculating priors and updating posteriors")
    #	new_priors = cu.get_priors(full_train_file)
    #	old_priors = cu.get_priors(train_file)
    #	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example #48
0
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = [[0.0,0.0,0.0,1.0,0.0] for i in range(num_samples)]
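    # One row per sample with probability 1.0 at index 3, which presumably
    # corresponds to the "open" status (hence the file name).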
    cu.write_submission("always_open_benchmark.csv", predictions)
Example #49
0
def main():
    start = time.time()

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
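        # cu.iter_data_frames presumably yields the training file in chunks of
        # _chunksize rows so the whole set never has to sit in memory at once.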
    
        i = _chunksize
        fea = None
        y = []        
        
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)
        
            i = i + _chunksize
    else:
        print "Reading train data and its features from: " + train_file
        data = cu.get_dataframe(train_file)
        fea = features.extract_features(feature_names,data)
        print "Collecting statuses"
        y = []
        for element in data["OpenStatus"]:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)

    if do_cross_validation == 1:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index,test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
                
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            
            print "fitting this fold's data"
            
            logit.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            
            #_pred_probs = denormalize(logit.predict_proba(X_test))
            _pred_probs = logit.predict_proba(X_test)
            
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
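            # cap_and_update_priors presumably clips each predicted probability
            # away from 0/1 by the given epsilon and rescales the class columns
            # from the training priors to the (hard-coded) test priors.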
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)            
            # evaluating the performance
            result = eval.mcllfun(y_test,_pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold,result)
            
        print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)     
    else:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True

        print "Fitting"
        logit.fit(fea, y)
        
        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names,test_data)

        print "Making predictions"
        global probs
        probs = logit.predict_proba(test_fea)
        
        if is_full_train_set == 0:
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)    

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)