Example #1
0
def main(args):
	"""Train five classifiers on feature data and print test-set metrics.

	Features are either generated from raw CSV files under ``data/`` or
	loaded pre-computed from ``args.feature_dir``. Column 0 of the feature
	matrix is the target label; the remaining columns are the inputs.
	"""
	# load data
	if args.raw_data_dir is not None:
		raw_data_files = os.listdir(args.raw_data_dir)
		print("Generating features from data files")
		features = map(lambda raw_file: ef.generate_features(pd.read_csv("data/{0}".format(raw_file))), raw_data_files)
		data = pd.concat(features).values
		print("Done generating features")
	elif args.feature_dir is not None:
		data = load_files(args.feature_dir)

	data_train, data_test, target_train, target_test = train_test_split(data[:, 1:], data[:, 0], test_size = 0.4)

	nn_3 = kNN(3, data_train, target_train)
	nn_5 = kNN(5, data_train, target_train)
	nn_7 = kNN(7, data_train, target_train)
	gnb = NaiveBayes(data_train, target_train)
	svm = SVM(data_train, target_train)
	names = ["3nn", "5nn", "7nn", "GNB", "SVM"]
	classifiers = [nn_3, nn_5, nn_7, gnb, svm]

	for name, clf in zip(names, classifiers):
		prediction = clf.predict(data_test)
		print("Profile for {0}".format(name))
		# BUG FIX: sklearn metrics take (y_true, y_pred). The original passed
		# the prediction first, which silently swaps precision and recall
		# (accuracy is unaffected because it is symmetric).
		print("Accuracy Score: {0}".format(accuracy_score(target_test, prediction)))
		print("Precision Score: {0}".format(precision_score(target_test, prediction)))
		print("Recall Score: {0}".format(recall_score(target_test, prediction)))
		print("F1 Score: {0}".format(f1_score(target_test, prediction)))
		print()
Example #2
0
def main(args):
    """Repeatedly train a panel of classifiers and print per-model metric tables.

    Features come either from raw CSVs under ``data/`` or from pre-computed
    feature files in ``args.feature_dir``. Column 0 of the feature matrix is
    the label; columns 1-7 are the inputs. The experiment is repeated
    ``args.iters`` times, each with a fresh 70/30 train/test split, and the
    per-iteration metrics are accumulated into one DataFrame per classifier.
    """
    # load data
    if args.raw_data_dir is not None:
        raw_data_files = os.listdir(args.raw_data_dir)
        print("Generating features from data files")
        features = map(
            lambda raw_file: ef.generate_features(
                pd.read_csv("data/{0}".format(raw_file))), raw_data_files)
        data = pd.concat(features).values
        print("Done generating features")
    elif args.feature_dir is not None:
        data = load_files(args.feature_dir)

    names = ["3nn", "5nn", "7nn", "SVM", "D Tree", "Random Forest"]
    # One running metrics table per classifier.
    # np.float_ was removed in NumPy 2.0; the builtin float is equivalent here.
    results = [
        pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1'],
                     dtype=float) for _ in range(len(names))
    ]
    for _ in range(args.iters):
        # NOTE(review): columns 1:8 are hard-coded — confirm the feature
        # matrix really has exactly 7 usable feature columns.
        data_train, data_test, target_train, target_test = train_test_split(
            data[:, 1:8], data[:, 0], test_size=0.3)

        nn_3 = kNN(3)
        nn_5 = kNN(5)
        nn_7 = kNN(7)
        svm = SVM(500)
        dtree = DecisionTreeClassifier(max_depth=3)
        forest = RandomForestClassifier(max_depth=3)
        classifiers = [nn_3, nn_5, nn_7, svm, dtree, forest]

        for i, (name, clf,
                result) in enumerate(zip(names, classifiers, results)):
            k_predictions = train_folds(5, clf, data_train, target_train,
                                        data_test)
            prediction = vote_on_predictions(k_predictions)
            row = {
                'accuracy': accuracy_score(target_test, prediction),
                'precision': precision_score(target_test, prediction),
                'recall': recall_score(target_test, prediction),
                'f1': f1_score(target_test, prediction)
            }
            # DataFrame.append was removed in pandas 2.0; build the new row
            # as a one-line frame and concatenate instead.
            results[i] = pd.concat([result, pd.DataFrame([row])],
                                   ignore_index=True)

    for name, res in zip(names, results):
        print("Results for %s" % name)
        print(res)
        print(res.mean(axis=0))
Example #3
0
def main(args):
    """Fit an SVM and a random forest on stored feature files, then score
    both models on features generated from a fresh raw CSV."""
    # Build the evaluation set from the raw input file; column 0 of the
    # generated feature matrix is the label, the rest are inputs.
    raw = pandas.read_csv(args.raw_data_file)
    eval_matrix = extract_features.generate_features(raw).values
    features_labels, feature_data = eval_matrix[:, 0], eval_matrix[:, 1:]

    # Load the training feature vectors (same layout: label in column 0).
    data = mkf.load_files(args.feature_dir)
    target, fvs = data[:, 0], data[:, 1:]

    svm = mkf.SVM(500)
    forest = RandomForestClassifier(max_depth=3)
    for model in (svm, forest):
        model.fit(fvs, target)

    svm_res = svm.predict(feature_data)
    forest_res = forest.predict(feature_data)
    print("Accuracy of svm: {}".format(accuracy_score(svm_res,
                                                      features_labels)))
    print("Accuracy of random forest: {}".format(
        accuracy_score(forest_res, features_labels)))
Example #4
0
def calculate_featuresX(filename, a, sw):
    """Slide a window of length ``sw`` over the 3-axis samples in ``filename``,
    compute per-axis features for each window, and save the feature matrix.

    Args:
        filename: space-delimited text file; columns 0-2 are the x/y/z samples.
        a: activity label appended as the last element of every feature row.
        sw: window length in samples; the window advances by sw // 2.
    """
    # All samples for activity
    X = genfromtxt(filename, delimiter=' ')
    i = 0
    # Get functions for features
    features = extract_features.generate_features()
    # Calculated features matrix
    outf = None
    while i + sw < X.shape[0]:
        fx = extract_features.get_features(X[i:i+sw, 0], features)
        fy = extract_features.get_features(X[i:i+sw, 1], features)
        fz = extract_features.get_features(X[i:i+sw, 2], features)
        # Concatenate the per-axis vectors plus the label.
        # BUG FIX: the original concatenated fx twice and dropped fz entirely.
        feat = np.concatenate((fx, fy, fz, [a]))
        if outf is None:
            # First window: the feature row becomes the initial matrix.
            outf = feat
        else:
            # Stack this window's features as a new row.
            outf = np.vstack((outf, feat))
        # Advance by half a window. Integer step required: sw / 2 makes i a
        # float under Python 3 and breaks the slicing above.
        i += sw // 2
    savetxt('../data/huawei-p7/' + filename.split('/')[-1].split('.')[0] + 'X.txt', outf, delimiter=',')
 def evaluate_track(self, filepath):
     # Return the cosine distance from one track's feature vector to every
     # row of self.train_data, as a flat 1-D array (one distance per row).
     features = extract_features.generate_features(filepath)
     # Drop the last 3 entries (presumably non-feature metadata — TODO
     # confirm) and shape into a single-row 2-D array for cdist.
     # NOTE(review): Series.reshape was removed in modern pandas; this likely
     # needs `.values.reshape(1, -1)` — verify the pandas version in use.
     sel_feats = features.iloc[:-3].reshape(1, -1)
     sel_train = self.train_data.iloc[:, :-3]
     distance_vec = cdist(sel_train, sel_feats, "cosine").reshape(-1)
     return distance_vec
Example #6
0
def main():
	"""Train the recipe ready-in-time model with 5-fold CV and report test accuracy."""
	print("Getting training data")

	# Get parsed json data as dictionary objects for inputs to the model
	imperatives = extra_functions.json_to_dict("processed/instructions.json")
	ingredients = extra_functions.json_to_dict("processed/ingredients.json")
	num_instructions = extra_functions.json_to_dict("processed/num_instructions.json")
	num_ingredients = extra_functions.json_to_dict("processed/num_ingredients.json")
	instruction_times = extra_functions.json_to_dict("processed/instruction_time.json")
	times = extra_functions.json_to_dict("processed/times.json")

	# Exclude recipes whose true Ready-In Times are greater than 24 hours.
	# BUG FIX: iterate over a list copy — deleting from a dict while
	# iterating its live keys() view raises RuntimeError under Python 3.
	for recipeID in list(times.keys()):
		if times[recipeID] > 24*60:
			del times[recipeID]

	# Get feature matrix x, and true label vector y
	x, y, ids = extract_features.generate_features(imperatives, ingredients, times, num_instructions, num_ingredients, instruction_times)

	# Shuffle the data
	s = np.arange(len(x))
	np.random.shuffle(s)
	x = x[s]
	y = y[s]
	ids = ids[s]

	# Split data into train, test data (70% train data).
	# BUG FIX: int(len(x))/10*7 is a float under Python 3 and breaks slicing.
	train_split = len(x) * 7 // 10
	train_x, train_y = x[:train_split], y[:train_split]
	test_x, test_y = x[train_split:], y[train_split:]
	# BUG FIX: train_ids previously used the test slice ids[train_split:].
	train_ids, test_ids = ids[:train_split], ids[train_split:]

	# Get the baseline ready-in time prediction (median across the training data)
	baseline(train_y)

	''' Hyper-parameters which we deemed best (results of test_hyperparameters.py script) '''

	# Regularization parameter for SVM
	c = 100
	# Number of max tree splits for decision tree classifier
	m = 200
	# Number of individual classifiers used for our overarching model,
	# which averages over the results of each classifier
	f = 3

	# Create 5 folds for cross validation. BUG FIX: folds start at index 0;
	# the original started at 1, silently dropping the first training sample
	# from every fold.
	k = int(math.ceil(train_split/5.0))
	train_x_folds = [train_x[i*k:(i+1)*k] for i in range(4)] + [train_x[4*k:]]
	train_y_folds = [train_y[i*k:(i+1)*k] for i in range(4)] + [train_y[4*k:]]

	# Run model on each fold
	best_fold_model = []
	best_fold_accuracy = 0
	for fold in range(5):

		# Use the other folds as training data, the current fold as
		# validation data. BUG FIX: the original seeded the training set
		# with sample 0, leaking it into every fold's training data.
		train_x_current = np.concatenate([train_x_folds[i] for i in range(5) if i != fold])
		train_y_current = np.concatenate([train_y_folds[i] for i in range(5) if i != fold])

		# Train model on training data
		model = train_model(train_x_current, train_y_current, c, f)

		# Get accuracy based on this fold
		accuracy = test_model(train_x_folds[fold], train_y_folds[fold], model)

		# Print accuracy for this fold
		print("FOLD # " + str(fold+1) + " " + str(accuracy))

		# Keep the model that scored best on its validation fold.
		if accuracy > best_fold_accuracy:
			best_fold_accuracy = accuracy
			best_fold_model = model

	# Print final accuracy of our model based on the best fold
	final_acc = test_model(test_x, test_y, best_fold_model)
	print("TEST: " + str(final_acc))
 def evaluate_track(self, filepath):
     # Compute cosine distances between the features generated for one track
     # and each row of self.train_data, returned as a flat 1-D array.
     features = extract_features.generate_features(filepath)
     # Trim the last 3 entries (presumably non-feature metadata — TODO
     # confirm) and reshape to a one-row 2-D array, as cdist requires.
     # NOTE(review): Series.reshape was removed in modern pandas; this likely
     # needs `.values.reshape(1, -1)` — verify the pandas version in use.
     sel_feats = features.iloc[:-3].reshape(1, -1)
     sel_train = self.train_data.iloc[:, :-3]
     distance_vec = cdist(sel_train, sel_feats, "cosine").reshape(-1)
     return distance_vec