Example #1
def main():
    """
    Fit models and make predictions.
    We'll use one-hot encoding to transform our categorical features
    into binary features.
    y and X will be numpy array objects.
    """
    model = linear_model.LogisticRegression(C=3)  # the classifier we'll use

    # === load data in memory === #
    print "loading data"

    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'

    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # Returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)

    # if you want to create new features, you'll need to compute them
    # before the encoding, and append them to your dataset after

    # === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = model_selection.train_test_split(
            X, y, test_size=.20, random_state=i * SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: %f" % (mean_auc / n)

    # === Predictions === #
    # When making predictions, retrain the model on the whole training set
    model.fit(X, y)
    preds = model.predict_proba(X_test)[:, 1]
    # filename = input("Enter name for submission file: ")
    filename = 'LogisticRegressionResults'
    save_results(preds, filename + ".csv")
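
These examples all lean on a few names defined elsewhere in the repository: `SEED`, `load_data`, and `save_results`. A minimal sketch of what the two helpers might look like, assuming a CSV layout with the label in the first column and integer category IDs in the remaining nine columns (the column count and the 'ACTION' header are assumptions, not the repository's actual code):

import numpy as np

def load_data(filename, use_labels=True):
    # Load integer-encoded categorical features from a CSV file.
    # Assumes column 0 holds the label; for the test file
    # (use_labels=False) zeros are returned in place of labels.
    data = np.loadtxt(filename, delimiter=',',
                      usecols=list(range(1, 10)), skiprows=1)
    if use_labels:
        labels = np.loadtxt(filename, delimiter=',',
                            usecols=[0], skiprows=1)
    else:
        labels = np.zeros(data.shape[0])
    return labels, data

def save_results(predictions, filename):
    # Write a Kaggle-style submission file: one "id,prediction" row
    # per test example, preceded by a header line.
    with open(filename, 'w') as f:
        f.write('id,ACTION\n')
        for i, pred in enumerate(predictions):
            f.write('%d,%f\n' % (i + 1, pred))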
Example #2
def main():

    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'

    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # Returns a sparse matrix (see scipy.sparse)
    X_test = encoder.transform(X_test)

    # model = findBestModel(X, y)  # best found: RBF kernel, gamma=1, C=1
    # (see the grid-search sketch after this example)

    X_train, X_cv, y_train, y_cv = model_selection.train_test_split(
        X, y, test_size=.20, random_state=SEED)

    model = svm.SVC(C=1, probability=True, kernel='rbf', gamma=1)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:, 1]

    # compute AUC metric on the held-out CV split
    fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
    roc_auc = metrics.auc(fpr, tpr)
    print("AUC: %f" % roc_auc)

    preds = model.predict_proba(X_test)[:, 1]

    save_results(preds, "SVM_classifier.csv")
Example #3
def main():
    # numItems (the number of test-set rows) is assumed to be defined
    # at module level
    filesToEnsemble = [
        'SVM_classifier.csv', 'output.csv', 'XGBoost_classifier.csv'
    ]  # other candidates: 'LogisticRegressionResults.csv',
       # 'logistic_regression_pred.csv', 'sampleSubmission.csv'
    fdata = []
    itemSum = [(i, 0) for i in range(numItems)]
    cwd = os.getcwd()

    for item in filesToEnsemble:
        path = cwd + '/../output/' + item
        fdict = {}
        with open(path, 'r') as f:
            next(f)  # skip the header line
            for line in f:
                row_id, prob = line.split(',', 1)
                fdict[int(row_id)] = float(prob.strip())

        # fList is a list of (row, probability) tuples, sorted by probability
        fList = sorted(fdict.items(), key=lambda t: t[1])
        for i in range(len(fList)):
            fList[i] = (fList[i][0], i)  # fList is now a list of (row, rank)

        fdata.append(fList)

    # sum each row's rank across all submission files
    for l in fdata:
        for row, rank in l:
            itemSum[row - 1] = (itemSum[row - 1][0], itemSum[row - 1][1] + rank)

    # re-rank the rows by summed rank and rescale to [0, 1)
    reSorted = sorted(itemSum, key=lambda t: t[1])
    for i in range(len(reSorted)):
        reSorted[i] = (reSorted[i][0] + 1, float(i) / numItems)

    # restore the original row order before writing
    backToNormal = sorted(reSorted, key=lambda t: t[0])

    save_results([x[1] for x in backToNormal], 'Ensembled_Results.csv')
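
This is rank averaging: each file's probabilities are replaced by their within-file ranks, the ranks are summed per row, and the sums are rescaled to [0, 1). A compact equivalent using scipy.stats.rankdata, assuming each submission has already been loaded into a 1-D probability array; it is not byte-identical to the loop above (ties and the final re-ranking differ slightly), but it preserves the same consensus ordering:

import numpy as np
from scipy.stats import rankdata

def rank_average(prediction_columns):
    # Average the per-file ranks of several probability vectors and
    # rescale the result to [0, 1].
    ranks = np.mean([rankdata(p) for p in prediction_columns], axis=0)
    return (ranks - ranks.min()) / (ranks.max() - ranks.min())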
Example #4

def main():

    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'

    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    clf = xgb.XGBClassifier(max_depth=15, n_estimators=200,
                            learning_rate=.4, colsample_bytree=.8, seed=SEED)

    # fitting; note that if load_data returns placeholder labels for the
    # test set (use_labels=False), early stopping against this eval_set
    # is not meaningful (see the validation-split sketch below)
    clf.fit(X, y, early_stopping_rounds=100, eval_metric="logloss",
            eval_set=[(X_test, y_test)])

    preds = clf.predict_proba(X_test)[:, 1]

    save_results(preds, "XGBoost_classifier.csv")
Example #5
# encoder, X, X_test, and y are assumed to be set up as in Example #1
encoder.fit(np.vstack((X, X_test)))
X = encoder.transform(X)  # Returns a sparse matrix (see scipy.sparse)
X_test = encoder.transform(X_test)

print("about to classify")
clf = AdaBoostClassifier(base_estimator=None,
                         n_estimators=900,
                         learning_rate=1.8)
clf.fit(X, y)  # fit() returns the classifier itself
# """
# X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.20, random_state=SEED)

# model = svm.SVC(C=1, probability=True, kernel='rbf')
# model.fit(X_train, y_train)
# preds = model.predict_proba(X_cv)[:, 1]

# # compute AUC metric for this CV fold
# fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
# roc_auc = metrics.auc(fpr, tpr)
# print "AUC : %f" % (roc_auc)
# """

prediction = clf.predict_proba(X_test)[:, 1]

save_results(prediction, 'AdaBoost_output.csv')
print("done")

# score = cross_val_score(clf, Matrix, salary)
# print score
# print score.mean()
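
The commented-out cross_val_score lines above refer to Matrix and salary, which are not defined in this snippet. The same idea applied to this example's data would look roughly like this (the AUC scoring and fold count are assumptions):

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(clf, X, y, scoring='roc_auc', cv=5)
print(cv_scores)
print(cv_scores.mean())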
Example #6
def setup_argparse():
    parser = argparse.ArgumentParser(description='GitHub repository file vulnerability finder')

    # Add the CL args
    parser.add_argument('-u', '--username', required=True, help='GitHub username')
    parser.add_argument('-r', '--repo', help='Repository name')
    parser.add_argument('-s', '--save', help='File name to which output will be saved (within the output/ dir). If not provided, results are only displayed on the console.')
    parser.add_argument('-t', '--token', help='GitHub API token')
    parser.add_argument('--entropy', help='Entropy threshold value, [0.0-1.0] default 0.45', type=float)

    # Check for these vulnerabilities
    vuln_group = parser.add_argument_group('Vulnerability types')
    vuln_group.add_argument('--api', help='Look for API keys', action='store_true')
    vuln_group.add_argument('-p', '--password', help='Look for passwords', action='store_true')
    vuln_group.add_argument('-e', '--email', help='Look for email addresses', action='store_true')
    vuln_group.add_argument('-b', '--bitcoin', help='Look for bitcoin', action='store_true')
    vuln_group.add_argument('-c', '--crypto', help='Look for cryptographic keys', action='store_true')

    args = parser.parse_args()

    # Validate the entropy argument (explicit None check so 0.0 is allowed)
    if args.entropy is not None:
        if args.entropy > 1 or args.entropy < 0:
            raise argparse.ArgumentTypeError('Entropy value is not between 0 and 1')

    if args.api or args.password or args.email or args.bitcoin or args.crypto:
        types = set()
        if args.api:
            types.add('API')
        if args.password:
            types.add('Password')
        if args.email:
            types.add('Email')
        if args.bitcoin:
            types.add('Bitcoin')
        if args.crypto:
            types.add('Crypto')
    else:
        types = {'API', 'Password', 'Email', 'Bitcoin', 'Crypto'}

    if args.token:
        headers = {'Authorization': f'token {args.token}'}
    else:
        headers = {}

    if args.repo:
        # Repo name provided
        print(f'Scraping {args.repo} repository')
        repo_files = RepoProcessing.get_repo_files(args.username, args.repo, headers)
        print("Recieved files from " +str(args.username) +"/"+ str(args.repo))
        if args.entropy:
            v = find_vulnerabilities(repo_files, args.entropy)
        else:
            v = find_vulnerabilities(repo_files)
        display_results(v, types)

        if args.save:
            save_results(v, args.save)
    else:
        print(f'Scraping repositories for user {args.username}')
        repo_names = RepoProcessing.get_user_repos(args.username, headers)
        print('Repository names:')
        for i in range(len(repo_names)):
            print(f'{i + 1}. {repo_names[i]}')
        print(f'{len(repo_names) + 1}. All repositories')

        repo_num = int(input(f'Enter a repository number to scrape ({len(repo_names) + 1} for all): ')) - 1

        if repo_num == len(repo_names):
            print('Scraping all repositories')
            all_files = RepoProcessing.get_all_files_for_user(args.username, headers)
            if args.entropy is not None:
                v = find_vulnerabilities(all_files, args.entropy)
            else:
                v = find_vulnerabilities(all_files)
            display_results(v, types)
        else:
            print(f'Scraping {repo_names[repo_num]} repository')
            repo_files = RepoProcessing.get_repo_files(args.username, repo_names[repo_num], headers)
            print("Received files from " +str(args.username) +"/"+ str(repo_names[repo_num]))
            if args.entropy:
                v = find_vulnerabilities(repo_files, args.entropy)
            else:
                v = find_vulnerabilities(repo_files)
            display_results(v, types)
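
The post-hoc entropy range check above could instead be enforced at parse time with a custom argparse type callable. A sketch (the entropy_threshold name is illustrative):

import argparse

def entropy_threshold(value):
    # argparse type callable: parse a float and require it to lie in [0, 1]
    value = float(value)
    if not 0.0 <= value <= 1.0:
        raise argparse.ArgumentTypeError('Entropy value is not between 0 and 1')
    return value

# usage:
# parser.add_argument('--entropy', type=entropy_threshold,
#                     help='Entropy threshold value, [0.0-1.0] default 0.45')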