Example #1
0
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    clf = svm.SVC(decision_function_shape='ovo')
    clf.fit(fea, data["OpenStatus"][:178351])

    print("Reading test file and making predictions")
    #features.compute_features("test_.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = clf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    score()
Example #3
0
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:140323])

    print("Reading test file and making predictions")
    features.compute_features(test_file,feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    #print(data['OpenStatusMod'])

    print("Extracting features")
    fea = features.extract_features(feature_names, data)
    #print(fea.columns)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1, random_state = 0)
    print("Training the model, created RFC")
    #rf.fit(fea, data["OpenStatus"])
    rf.fit(fea, data["OpenStatusMod"])

    print("Reading test file and making predictions")
    #data = cu.get_dataframe(test_file)
    data = cu.get_dataframe(full_train_file)
    print("Reading data frame")
    data['OpenStatusMod'] = data['OpenStatus'].map(convert_status)
    print("adding column")
    test_features = features.extract_features(feature_names, data)
    print("extract features")
    probs = rf.predict_proba(test_features)

#    print("Calculating priors and updating posteriors")
#    new_priors = cu.get_priors(full_train_file)
#    old_priors = cu.get_priors(train_file)
#    print "new priors %s" %(new_priors)
#    print "old priors %s" %(old_priors)
#    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
	start = time.time()
	print("Reading the data from " + train_file)
	data = cu.get_dataframe(train_file)

	print("Extracting features")
	fea = features.extract_features(feature_names, data)

	print("Training the model")
	clf = ExtraTreesClassifier(n_estimators=trees_count, max_features=len(feature_names), max_depth=None, min_samples_split=1, compute_importances=True, bootstrap=False, random_state=0, n_jobs=-1, verbose=2)
	clf.fit(fea, data["OpenStatus"])

	print "Listing feature importances:"
	cu.list_feature_importance(clf,feature_names)
	
	print("Reading test file and making predictions: " + test_file)
	data = cu.get_dataframe(test_file)
	test_features = features.extract_features(feature_names, data)
	probs = clf.predict_proba(test_features)

	if (update_posteriors):
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
	
	print("Saving submission to %s" % submission_file)
	cu.write_submission(submission_file, probs)
	
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_little)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    #classifier = MultinomialNB()
    #classifier = KNeighborsClassifier(n_neighbors=3, weights='distance')
    classifier = RandomForestClassifier(n_estimators=50, verbose=2, compute_importances=True, n_jobs=-1)
    #classifier = GradientBoostingClassifier(n_estimators=200, learn_rate=0.1)

    classifier.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_litte)
    test_features = features.extract_features(feature_names, data)
    probs = classifier.predict_proba(test_features)

    #print("Calculating priors and updating posteriors")
    #new_priors = cu.get_priors(full_train_file)
    #old_priors = cu.get_priors(train_file)
    #probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_litte)
    cu.write_submission(submission_litte, probs)
def make_submission():
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv',
                                                   limit=5e6)
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=1)
    
    rf.fit(fea, status['OpenStatus'])
    
    print "Features Importance:"
    imps = zip(rf.feature_importances_,
               fea.keys())
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    test_features = features.online_extract_features('data/'+test_file,
                                                     train=False,
                                                     limit=1e12)[0]
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example #8
0
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mten = MultiTaskElasticNet(alpha=0.1,
                               rho=0.5,
                               fit_intercept=True,
                               normalize=False,
                               copy_X=True,
                               max_iter=1000,
                               tol=0.0001,
                               warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    mten.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	percep = Perceptron(penalty=None, alpha=0.0001, fit_intercept=False, n_iter=5, shuffle=False, verbose=1, eta0=1.0, n_jobs=-1, seed=0, class_weight="auto", warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
            
	print "Fitting"
	percep.fit(fea, y)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	#probs = percep.predict_proba(test_fea) # only available for binary classification
	probs = percep.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
	
	#if is_full_train_set == 0:
	#	print("Calculating priors and updating posteriors")
	#	new_priors = cu.get_priors(full_train_file)
	#	old_priors = cu.get_priors(train_file)
	#	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True, normalize=False, copy_X=True, max_iter=1000, tol=0.0001, warm_start=False)

	X = []
	for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
		X.append([i])
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	global y
	y = [] 

	print "Collecting statuses"
	
	for element in data["OpenStatus"]:
            for index, status in enumerate(ques_status):
                if element == status:
                    y.append(index)
            
	print "Fitting"
	mten.fit(fea, y)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	print "Reading test data and features"
	test_data = cu.get_dataframe(test_file)
	test_fea = features.extract_features(feature_names,test_data)

	print "Making predictions"
	global probs
	probs = mten.predict(test_fea)
	# shape of probs is [n_samples]
	# convert probs to shape [n_samples,n_classes]
	probs = np.resize(probs, (len(probs) / 5, 5))
	
	if is_full_train_set == 0:
		print("Calculating priors and updating posteriors")
		new_priors = cu.get_priors(full_train_file)
		old_priors = cu.get_priors(train_file)
		probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)	

	print "writing submission to " + submission_file
	cu.write_submission(submission_file, probs)
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
Example #11
0
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1. / K, 1. / K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example #12
0
def main():
    # The number of documents to analyze each iteration
    batchsize = 100

    # The total number of questions on Stack Overflow
    D = 3.3e6

    # The number of topics
    K = 20

    # Make sure the topics are included as features for analysis
    feature_names.extend('Topic%d' % k for k in range(K))

    print("Reading the vocabulary")
    vocab = [w.strip() for w in file('./vocab4.txt')]

    # How many words are in the vocabulary
    W = len(vocab)

    print("Reading the data")
    data = cu.get_dataframe(train_file)

    # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7
    lda = onlineldavb.OnlineLDA(vocab, K, D, 1./K, 1./K, 1024., 0.7)

    print("Allocating the topics")
    allocate_topics(lda, data, K, batchsize, D)

    print("Extracting features")
    fea = extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50, verbose=2,
                                compute_importances=True, n_jobs=4)
    rf.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    allocate_topics(lda, data, K, batchsize, D)
    test_features = extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def make_submission():
    data = None
    if os.path.exists('data.pik'):
        print("Unpickeling the data")
        data = pickle.load(open('data.pik'))
    else:
        print("Reading the data")
        data = cu.get_dataframe(full_train_file)
        pickle.dump(data,open('data.pik','w'))

    fea = None
    if os.path.exists('fea.pik'):
        print("Unpickeling the fea")
        fea = pickle.load(open('fea.pik'))
    else:
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        pickle.dump(fea,open('fea.pik','w'))
    
    print("Training the model")
    rf = RandomForestClassifier(n_estimators=50,
                                verbose=2,
                                compute_importances=True,
                                oob_score=True,
                                #criterion='entropy',
                                n_jobs=2)
    
    rf.fit(fea, data["OpenStatus"])
    print "Features Importance:"
    imps = zip(rf.feature_importances_,
               feature_names,)
    imps.sort(reverse=True)
    print '\n'.join([ str(_) for _ in imps ])
    print "Generalization Error:", rf.oob_score_

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = rf.predict_proba(test_features)

    if True:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
def main():
    f = open(probs_file, 'r')
    lines = f.readlines()
    probs = []

    for line in lines:
        probs.append(np.array([float(x) for x in line.split(',')]))    

    print("Calculating priors and updating posteriors")
    probs = np.array(probs)
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    f.close()
def make_submission():
    print("Reading data")
    fea, status = features.online_extract_features('data/train.csv',
                                                   train=True,
                                                   limit=1e9)
    _dim(fea,'fea')
    print("Training Level 1 : Open/Rest model")
    open_status = [ or_binarize(e) for e in status['OpenStatus']  ]
    is_not_open_status = [ s != 'open' for s in open_status ]
    or_model = learn(fea,open_status)

    print("Training Level 2 : Not Open Split model")
    not_open_status = [ status['OpenStatus'][i] for i in range(len(is_not_open_status)) if is_not_open_status[i] ]
    no_fea = fea[is_not_open_status]
    _dim(no_fea,'no_fea')
    no_model = learn(no_fea,not_open_status)
    
    print("Reading test file and making predictions")
    test_features = features.online_extract_features('data/'+test_file,
                                                     train=False,
                                                     limit=1e9)[0]
    _dim(test_features,'test_features')
    or_probs = or_model.predict_proba(test_features)
    probs = []
    for i in range(0,len(or_probs)):
        or_prob = or_probs[i]
        if or_prob[0] > or_prob[1]:
            probs.append(np.array([1.0,0.0,0.0,0.0,0.0]))
        else:
            f = [ test_features[ff][i] for ff in test_features.keys() ]
            a = no_model.predict_proba(f)
            aa = np.insert(a,0,[0.0])
            probs.append(aa)
    probs = np.array(probs)

    if False:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example #16
0
def main():
    start = time.time()
    print("Reading the data from " + train_file)
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    clf = ExtraTreesClassifier(n_estimators=trees_count,
                               max_features=len(feature_names),
                               max_depth=None,
                               min_samples_split=1,
                               compute_importances=True,
                               bootstrap=False,
                               random_state=0,
                               n_jobs=-1,
                               verbose=2)
    clf.fit(fea, data["OpenStatus"])

    print "Listing feature importances:"
    cu.list_feature_importance(clf, feature_names)

    print("Reading test file and making predictions: " + test_file)
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)
    probs = clf.predict_proba(test_features)

    if (update_posteriors):
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    print("Extracting features")
    fea = features.extract_features(feature_names, data)

    print("Training the model")
    rf = RandomForestClassifier(n_estimators=100, verbose=2, compute_importances=True, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"])
    gb = GradientBoostingClassifier(n_estimators=100, learn_rate=1.0)
    gb.fit(fea, data["OpenStatus"])
    dt = DecisionTreeClassifier(max_depth=None, min_samples_split=1, random_state=0)
    dt.fit(fea, data["OpenStatus"])
    et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=1, random_state=0)
    et.fit(fea, data["OpenStatus"])

    print("Reading test file and making predictions")
    data = cu.get_dataframe(test_file)
    test_features = features.extract_features(feature_names, data)

    probs = rf.predict_proba(test_features)
    probs2 = gb.predict_proba(test_features)
    probs3 = dt.predict_proba(test_features)
    probs4 = et.predict_proba(test_features)

    for i in range(0, len(probs)):
        for j in range(0,5):
            probs[i][j] = (probs[i][j] + probs2[i][j] + probs3[i][j] + probs4[i][j])/4

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("Saving submission to %s" % submission_file)
    cu.write_submission(submission_file, probs)
Example #18
0
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    y = []

    for element in data['OpenStatus']:
        for index, status in enumerate(ques_status):
            if element == status: y.append(index)

    if do_cross_validation == 1:
        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        #skf = StratifiedKFold(y,k = 10)
        skf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in skf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            mnbayes.fit(
                X_train, y_train
            )  #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
            y_test = vectorize_actual(y_test)  # vectorize y_test

            _pred_probs = mnbayes.predict_proba(X_test)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)

        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names, test_data)

        print "Fitting"
        mnbayes.fit(
            fea, y
        )  #, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])

        print "Making predictions"
        global probs
        probs = mnbayes.predict_proba(test_fea)

        #if is_full_train_set == 0:
        #	print("Calculating priors and updating posteriors")
        #	new_priors = cu.get_priors(full_train_file)
        #	old_priors = cu.get_priors(train_file)
        #	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example #19
0
def main():
    print("Reading the data")
    data = cu.get_dataframe(train_file)

    #print("Extracting features")
    #features.compute_features(train_file,feature_file1)

    print("Training the model")
    fea = cu.get_dataframe(feature_file1)
    rf = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=-1)
    rf.fit(fea, data["OpenStatus"][:178351])
    important_features = []
    for x, i in enumerate(rf.feature_importances_):
        if i > np.average(rf.feature_importances_):
            important_features.append([str(x),i])
    print 'Most important features:',important_features

    print("Reading test file and making predictions")
    #features.compute_features("test1.csv",feature_file2)
    test_fea = cu.get_dataframe(feature_file2)
    probs = rf.predict_proba(test_fea)

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors,probs, new_priors, 0.01)

    y_pred=[]
    for i in probs:
        i=[float(k) for k in i]
        j=i.index(max(i))
        if(j==3):
            y_pred.append("open")
        else:
            print "hi"
            y_pred.append("closed")

    y_true=[]
    a=0
    b=0
    test_reader = csv.reader(open(test_file))
    headers=test_reader.next()
    for line in test_reader:
        if line[14]=='open':
            y_true.append("open")
            a=a+1
        else:
            y_true.append("closed")
            b=b+1
    print a
    print b

    print confusion_matrix(y_true[1:] , y_pred , labels=["open", "closed"])





    print("Saving submission to %s" % submission_file)

    cu.write_submission(submission_file, probs)
Example #20
0
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples, 5)), np.array(0.2))
    cu.write_submission("uniform_benchmark.csv", predictions)
Example #21
0
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(skf)))
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X, y)
            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:,1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())
    probs = y_submission
    
    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)
    
    print "Saving Results."
    cu.write_submission(submission_file, probs)
        
Example #22
0
        for i, (train, test) in enumerate(skf):
            print "Fold", i
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X, y)
            y_submission = clf.predict_proba(X_test)[:, 1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:, 1]
        dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

    print
    print "Blending."
    clf = LogisticRegression()
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

    print "Linear stretch of predictions to [0,1]"
    y_submission = (y_submission - y_submission.min()) / (y_submission.max() -
                                                          y_submission.min())
    probs = y_submission

    print("Calculating priors and updating posteriors")
    new_priors = cu.get_priors(full_train_file)
    old_priors = cu.get_priors(train_file)
    probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "Saving Results."
    cu.write_submission(submission_file, probs)
Example #23
0
def main():
    predictions = [actual_lookup[r[14]] for r in cu.get_reader(actual_file)]
    cu.write_submission("actual_benchmark.csv", predictions)
Example #24
0
def main():

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)

        i = _chunksize
        fea = None
        y = []

        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(
                    features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)

            i = i + _chunksize
    else:
        print("Reading the data from:" + train_file)
        data = cu.get_dataframe(train_file)
        print("Extracting features")
        fea = features.extract_features(feature_names, data)
        y = []
        for element in data['OpenStatus']:
            for index, status in enumerate(ques_status):
                if element == status: y.append(index)

    if do_cross_validation == 1:
        depth = len(feature_names)
        print "depth=" + str(depth)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=depth,
                                        init=None,
                                        random_state=None)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y), k=10)
        fold = 0
        result_sum = 0
        for train_index, test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])

            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])

            print "Fitting for fold " + str(fold)

            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)

            _pred_probs = rf.predict_proba(X_test)
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            # priors distribution over classes based on the training set
            #new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            # priors distribution over classes based on the updated training set's last month
            new_priors = [
                0.03410911204982466, 0.01173872976800856, 0.018430671606251586,
                0.926642216133641, 0.009079270442274271
            ]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs,
                                                   new_priors, 0.001)
            # evaluating the performance
            result = eval.mcllfun(y_test, _pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold, result)

        print "depth=" + str(depth)
        print "Average MCLL score for this classifier = %0.11f" % (result_sum /
                                                                   10)
    else:
        #rf = RandomForestClassifier(n_estimators=50, verbose=0, compute_importances=True, n_jobs=-1)
        rf = GradientBoostingClassifier(loss='deviance',
                                        learn_rate=0.1,
                                        n_estimators=100,
                                        subsample=1.0,
                                        min_samples_split=1,
                                        min_samples_leaf=1,
                                        max_depth=len(feature_names),
                                        init=None,
                                        random_state=None)

        rf.fit(fea, y)

        print("Reading test file " + test_file + " and making predictions")
        data = cu.get_dataframe(test_file)
        test_features = features.extract_features(feature_names, data)
        probs = rf.predict_proba(test_features)

        # commented out, because we want to adjust probabilities to the last month data anyway
        #if do_full_train == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

        print("Saving submission to %s" % submission_file)
        cu.write_submission(submission_file, probs)
Example #25
0
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    percep = Perceptron(penalty=None,
                        alpha=0.0001,
                        fit_intercept=False,
                        n_iter=5,
                        shuffle=False,
                        verbose=1,
                        eta0=1.0,
                        n_jobs=-1,
                        seed=0,
                        class_weight="auto",
                        warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be array type object. Strings must be converted to
    # to integer values, otherwise fit method raises ValueError
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    percep.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    #probs = percep.predict_proba(test_fea) # only available for binary classification
    probs = percep.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    #if is_full_train_set == 0:
    #	print("Calculating priors and updating posteriors")
    #	new_priors = cu.get_priors(full_train_file)
    #	old_priors = cu.get_priors(train_file)
    #	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example #26
0
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("test_.csv"))
    predictions = np.kron(np.ones((num_samples, 1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = [[0.0, 0.0, 0.0, 1.0, 0.0] for i in range(num_samples)]
    cu.write_submission("always_open_benchmark.csv", predictions)
def main():
	start = time.time()

	print "Reading train data and its features from: " + train_file
	data = cu.get_dataframe(train_file)
	global fea
	fea = features.extract_features(feature_names,data)

	mnbayes = MultinomialNB(alpha=1.0, fit_prior=True)
	
	'''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''   
	
	# Must be array type object. Strings must be converted to
	# to integer values, otherwise fit method raises ValueError
	y = []
	
	for element in data['OpenStatus']:
		for index, status in enumerate(ques_status):
			if element == status: y.append(index)
	
	if do_cross_validation == 1:
		print 'starting 10 fold verification'
		# Dividing the dataset into k = 10 folds for cross validation
		#skf = StratifiedKFold(y,k = 10)
		skf = KFold(len(y),k = 10)
		fold = 0
		result_sum = 0
		for train_index,test_index in skf:
			fold += 1
			X_train = []
			X_test = []
			y_train = []
			y_test = []
			for i in train_index:
				temp = []
				for feature_name in feature_names:
					temp.append(fea[feature_name][i])
				X_train.append(temp)
				y_train.append(y[i])
				
			for i in test_index:
				temp = []
				for feature_name in feature_names:
					temp.append(fea[feature_name][i])
				X_test.append(temp)
				y_test.append(y[i])
			
			mnbayes.fit(X_train, y_train) #, sample_weight=None, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
			y_test = vectorize_actual(y_test)               # vectorize y_test
			
			_pred_probs = mnbayes.predict_proba(X_test)
			# evaluating the performance
			result = eval.mcllfun(y_test,_pred_probs)
			result_sum += result
			print "MCLL score for fold %d = %0.11f" % (fold,result)
			
		print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)
	
		print "Reading test data and features"
		test_data = cu.get_dataframe(test_file)
		test_fea = features.extract_features(feature_names,test_data)
		
		print "Fitting"
		mnbayes.fit(fea,y)#, class_prior=[0.0091347705760047, 0.0046458596397953, 0.0052009655460509, 0.9791913907850639, 0.0018270134530851])
		
		print "Making predictions"
		global probs
		probs = mnbayes.predict_proba(test_fea)

		#if is_full_train_set == 0:
		#	print("Calculating priors and updating posteriors")
		#	new_priors = cu.get_priors(full_train_file)
		#	old_priors = cu.get_priors(train_file)
		#	probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

		print "writing submission to " + submission_file
		cu.write_submission(submission_file, probs)
	
	finish = time.time()
	print "completed in %0.4f seconds" % (finish-start)
def main():
    priors = cu.get_priors("train.csv")
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples,1)), priors)
    cu.write_submission("prior_benchmark.csv", predictions)
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = np.kron(np.ones((num_samples, 5)), np.array(0.2))
    cu.write_submission("uniform_benchmark.csv", predictions)
Example #31
0
def main():
    predictions = [actual_lookup[r[14]] for r in cu.get_reader(actual_file)]
    cu.write_submission("actual_benchmark.csv", predictions)
def main():
    num_samples = len(cu.get_dataframe("public_leaderboard.csv"))
    predictions = [[0.0,0.0,0.0,1.0,0.0] for i in range(num_samples)]
    cu.write_submission("always_open_benchmark.csv", predictions)
Example #33
0
def main():
    start = time.time()

    if (use_low_mem == 1):
        data_iter = cu.iter_data_frames(train_file, _chunksize)
    
        i = _chunksize
        fea = None
        y = []        
        
        for train_data in data_iter:
            print "About to have processed: " + str(i)
            print("Extracting features")
            if fea is None:
                fea = features.extract_features(feature_names, train_data)
            else:
                fea = fea.append(features.extract_features(feature_names, train_data))
            for element in train_data['OpenStatus']:
                for index, status in enumerate(ques_status):
                    if element == status: y.append(index)
        
            i = i + _chunksize
    else:
        print "Reading train data and its features from: " + train_file
        data = cu.get_dataframe(train_file)
        fea = features.extract_features(feature_names,data)
        print "Collecting statuses"
        y = []
        for element in data["OpenStatus"]:
                for index, status in enumerate(ques_status):
                    if element == status:
                        y.append(index)

    if do_cross_validation == 1:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001)

        print 'starting 10 fold verification'
        # Dividing the dataset into k = 10 folds for cross validation
        kf = KFold(len(y),k = 10)
        fold = 0
        result_sum = 0
        for train_index,test_index in kf:
            fold += 1
            X_train = []
            X_test = []
            y_train = []
            y_test = []
            for i in train_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_train.append(temp)
                y_train.append(y[i])
                
            for i in test_index:
                temp = []
                for feature_name in feature_names:
                    if feature_name == 'BodyLength':
                        temp.append(fea['BodyMarkdown'][i])
                    elif feature_name == 'TitleLength':
                        temp.append(fea['Title'][i])
                    else:
                        temp.append(fea[feature_name][i])
                X_test.append(temp)
                y_test.append(y[i])
            
            print "fitting this fold's data"
            
            rf.fit(X_train, y_train)
            y_test = vectorize_actual(y_test)
            
            #_pred_probs = denormalize(rf.predict_proba(X_test))
            _pred_probs = rf.predict_proba(X_test)
            
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            _pred_probs = cu.cap_and_update_priors(old_priors, _pred_probs, new_priors, 0.001)            
            # evaluating the performance
            result = eval.mcllfun(y_test,_pred_probs)
            result_sum += result
            print "MCLL score for fold %d = %0.11f" % (fold,result)
            
        print "Average MCLL score for this classifier = %0.11f" % (result_sum/10)     
    else:
        logit = LogisticRegression(penalty='l2', dual=False, C=1.0, class_weight=None,
                                       fit_intercept=True, intercept_scaling=1, tol=0.0001) # not available: compute_importances=True

        print "Fitting"
        logit.fit(fea, y)
        
        print "Reading test data and features"
        test_data = cu.get_dataframe(test_file)
        test_fea = features.extract_features(feature_names,test_data)

        print "Making predictions"
        global probs
        probs = logit.predict_proba(test_fea)
        
        if is_full_train_set == 0:
            print("Calculating priors and updating posteriors")
            #new_priors = cu.get_priors(full_train_file)
            new_priors = [0.00913477057600471, 0.004645859639795308, 0.005200965546050945, 0.9791913907850639, 0.0018270134530850952]
            old_priors = cu.get_priors(train_file)
            probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)    

        print "writing submission to " + submission_file
        cu.write_submission(submission_file, probs)

    finish = time.time()
    print "completed in %0.4f seconds" % (finish-start)