def main(args):
    """Train several classifiers on feature data and print a metric profile
    for each.

    args.raw_data_dir -- directory of raw CSVs to run feature extraction on
    args.feature_dir  -- directory of precomputed feature files
    Exactly one of the two must be provided.
    """
    # Load data: either generate features from raw CSVs or load precomputed ones.
    if args.raw_data_dir is not None:
        raw_data_files = os.listdir(args.raw_data_dir)
        print("Generating features from data files")
        # Read each file from the directory that was actually listed (the
        # original hard-coded "data/", which breaks for any other directory).
        features = map(
            lambda raw_file: ef.generate_features(
                pd.read_csv(os.path.join(args.raw_data_dir, raw_file))),
            raw_data_files)
        data = pd.concat(features).values
        print("Done generating features")
    elif args.feature_dir is not None:
        data = load_files(args.feature_dir)
    else:
        # Fail fast with a clear message instead of a NameError on `data` below.
        raise ValueError("one of raw_data_dir or feature_dir must be provided")

    # Column 0 holds the label; remaining columns are the feature vector.
    data_train, data_test, target_train, target_test = train_test_split(
        data[:, 1:], data[:, 0], test_size=0.4)

    names = ["3nn", "5nn", "7nn", "GNB", "SVM"]
    classifiers = [
        kNN(3, data_train, target_train),
        kNN(5, data_train, target_train),
        kNN(7, data_train, target_train),
        NaiveBayes(data_train, target_train),
        SVM(data_train, target_train),
    ]

    for name, clf in zip(names, classifiers):
        prediction = clf.predict(data_test)
        # sklearn metrics take (y_true, y_pred); the original passed them
        # reversed, which silently swaps precision and recall.
        print("Profile for {0}".format(name))
        print("Accuracy Score: {0}".format(accuracy_score(target_test, prediction)))
        print("Precision Score: {0}".format(precision_score(target_test, prediction)))
        print("Recall Score: {0}".format(recall_score(target_test, prediction)))
        print("F1 Score: {0}".format(f1_score(target_test, prediction)))
        print()
def main(args):
    """Repeatedly train six classifiers with 5-fold voting and print the
    per-classifier metric tables accumulated over args.iters iterations.

    args.raw_data_dir -- directory of raw CSVs to run feature extraction on
    args.feature_dir  -- directory of precomputed feature files
    args.iters        -- number of train/test resampling iterations
    """
    # Load data: either generate features from raw CSVs or load precomputed ones.
    if args.raw_data_dir is not None:
        raw_data_files = os.listdir(args.raw_data_dir)
        print("Generating features from data files")
        # Read each file from the directory that was actually listed (the
        # original hard-coded "data/", which breaks for any other directory).
        features = map(
            lambda raw_file: ef.generate_features(
                pd.read_csv(os.path.join(args.raw_data_dir, raw_file))),
            raw_data_files)
        data = pd.concat(features).values
        print("Done generating features")
    elif args.feature_dir is not None:
        data = load_files(args.feature_dir)
    else:
        # Fail fast with a clear message instead of a NameError on `data` below.
        raise ValueError("one of raw_data_dir or feature_dir must be provided")

    names = ["3nn", "5nn", "7nn", "SVM", "D Tree", "Random Forest"]
    # One result frame per classifier, accumulated across iterations.
    # np.float64 replaces np.float_, which was removed in NumPy 2.0.
    results = [
        pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1'],
                     dtype=np.float64)
        for _ in range(len(names))
    ]

    for _ in range(args.iters):
        # Fresh split (columns 1-7 are features, column 0 the label) and fresh
        # classifiers each iteration.
        data_train, data_test, target_train, target_test = train_test_split(
            data[:, 1:8], data[:, 0], test_size=0.3)
        classifiers = [
            kNN(3),
            kNN(5),
            kNN(7),
            SVM(500),
            DecisionTreeClassifier(max_depth=3),
            RandomForestClassifier(max_depth=3),
        ]

        for i, (clf, result) in enumerate(zip(classifiers, results)):
            k_predictions = train_folds(5, clf, data_train, target_train,
                                        data_test)
            prediction = vote_on_predictions(k_predictions)
            row = pd.DataFrame([{
                'accuracy': accuracy_score(target_test, prediction),
                'precision': precision_score(target_test, prediction),
                'recall': recall_score(target_test, prediction),
                'f1': f1_score(target_test, prediction),
            }])
            # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
            results[i] = pd.concat([result, row], ignore_index=True)

    for name, res in zip(names, results):
        print("Results for %s" % name)
        print(res)
        print(res.mean(axis=0))
def main(args):
    """Extract features from a new raw data file, fit an SVM and a random
    forest on stored feature vectors, and print each model's accuracy on the
    new data.

    args.raw_data_file -- CSV of raw samples to extract features from
    args.feature_dir   -- directory of stored training feature files
    """
    # Build feature vectors for the new raw data; column 0 is the true label.
    raw = pandas.read_csv(args.raw_data_file)
    extracted = extract_features.generate_features(raw).values
    true_labels = extracted[:, 0]
    new_fvs = extracted[:, 1:]

    # Load stored training features, with the same label-first layout.
    stored = mkf.load_files(args.feature_dir)
    train_target, train_fvs = stored[:, 0], stored[:, 1:]

    # Fit both models on the stored data, then score them on the new data.
    svm = mkf.SVM(500)
    forest = RandomForestClassifier(max_depth=3)
    svm.fit(train_fvs, train_target)
    forest.fit(train_fvs, train_target)

    print("Accuracy of svm: {}".format(
        accuracy_score(svm.predict(new_fvs), true_labels)))
    print("Accuracy of random forest: {}".format(
        accuracy_score(forest.predict(new_fvs), true_labels)))
def calculate_featuresX(filename, a, sw):
    """Slide a half-overlapping window of length sw over the 3-axis samples in
    `filename`, extract per-axis features, append the activity label, and save
    the resulting matrix as <name>X.txt under ../data/huawei-p7/.

    filename -- space-delimited file of samples; columns 0-2 are the x/y/z axes
    a        -- activity label appended to every feature row
    sw       -- sliding-window length in samples
    """
    # All samples for activity X
    X = genfromtxt(filename, delimiter=' ')
    # Get the feature-extraction functions once, outside the loop.
    features = extract_features.generate_features()

    # Collect rows in a list and stack once at the end; the original grew the
    # matrix with np.vstack per window, which is O(n^2).
    rows = []
    i = 0
    while i + sw < X.shape[0]:
        fx = extract_features.get_features(X[i:i + sw, 0], features)
        fy = extract_features.get_features(X[i:i + sw, 1], features)
        fz = extract_features.get_features(X[i:i + sw, 2], features)
        # Concatenate the three per-axis vectors plus the label. The original
        # passed fx twice and dropped fz entirely.
        rows.append(np.concatenate((fx, fy, fz, [a])))
        # Move the window by half its length; integer division keeps `i` an
        # int (sw / 2 is a float in Python 3 and breaks slicing).
        i += sw // 2

    outf = np.vstack(rows)
    savetxt('../data/huawei-p7/' +
            filename.split('/')[-1].split('.')[0] + 'X.txt',
            outf, delimiter=',')
def evaluate_track(self, filepath):
    """Return the cosine distance from this track's feature vector to every
    row of self.train_data.

    filepath -- path passed through to extract_features.generate_features
    Returns a 1-D numpy array with one distance per training row.
    """
    features = extract_features.generate_features(filepath)
    # Drop the trailing 3 entries — presumably metadata fields, mirroring the
    # last 3 columns excluded from train_data below (TODO: confirm against
    # generate_features). Series.reshape was removed from pandas, so convert
    # to a numpy array before reshaping into a single-row matrix.
    sel_feats = features.iloc[:-3].to_numpy().reshape(1, -1)
    sel_train = self.train_data.iloc[:, :-3]
    distance_vec = cdist(sel_train, sel_feats, "cosine").reshape(-1)
    return distance_vec
def main():
    """Train recipe ready-in-time models with 5-fold cross-validation and
    report the best fold's model accuracy on held-out test data."""
    print("Getting training data")
    # Get parsed json data as dictionary objects for inputs to the model.
    imperatives = extra_functions.json_to_dict("processed/instructions.json")
    ingredients = extra_functions.json_to_dict("processed/ingredients.json")
    num_instructions = extra_functions.json_to_dict("processed/num_instructions.json")
    num_ingredients = extra_functions.json_to_dict("processed/num_ingredients.json")
    instruction_times = extra_functions.json_to_dict("processed/instruction_time.json")
    times = extra_functions.json_to_dict("processed/times.json")

    # Exclude recipes whose true Ready-In Times exceed 24 hours. Iterate over
    # a snapshot of the keys: deleting while iterating the live keys view
    # raises RuntimeError in Python 3.
    for recipeID in list(times.keys()):
        if times[recipeID] > 24 * 60:
            del times[recipeID]

    # Feature matrix x, true label vector y, and matching recipe ids.
    x, y, ids = extract_features.generate_features(
        imperatives, ingredients, times, num_instructions, num_ingredients,
        instruction_times)

    # Shuffle all three arrays with the same permutation.
    s = np.arange(len(x))
    np.random.shuffle(s)
    x, y, ids = x[s], y[s], ids[s]

    # 70/30 train/test split. int() must wrap the whole expression: the
    # original's int(len(x))/10*7 is a float in Python 3 and cannot index.
    train_split = int(len(x) / 10 * 7)
    train_x, train_y = x[:train_split], y[:train_split]
    test_x, test_y = x[train_split:], y[train_split:]
    # Train ids come from the front of the split; the original sliced the
    # test range for both.
    train_ids, test_ids = ids[:train_split], ids[train_split:]

    # Baseline ready-in-time prediction (median across the training data).
    baseline(train_y)

    # Hyper-parameters deemed best by the test_hyperparameters.py script:
    c = 100  # regularization parameter for SVM
    m = 200  # max tree splits for the decision tree classifier (unused here)
    f = 3    # number of individual classifiers averaged by the model

    # Create 5 folds for cross-validation; the last fold absorbs the remainder.
    k = int(math.ceil(train_split / 5.0))
    train_x_folds = [train_x[i * k:(i + 1) * k] for i in range(4)] + [train_x[4 * k:]]
    train_y_folds = [train_y[i * k:(i + 1) * k] for i in range(4)] + [train_y[4 * k:]]

    # Run the model on each fold.
    best_fold_model = None
    best_fold_accuracy = 0
    for fold in range(5):
        # Train on the other four folds, validate on this one. (The original
        # started fold 0 at index 1 and seeded every training set with
        # sample 0, duplicating it across folds.)
        train_x_current = np.concatenate(
            [train_x_folds[i] for i in range(5) if i != fold])
        train_y_current = np.concatenate(
            [train_y_folds[i] for i in range(5) if i != fold])

        model = train_model(train_x_current, train_y_current, c, f)
        accuracy = test_model(train_x_folds[fold], train_y_folds[fold], model)
        print("FOLD # " + str(fold + 1) + " " + str(accuracy))
        if accuracy > best_fold_accuracy:
            best_fold_accuracy = accuracy
            best_fold_model = model

    # Final accuracy of the best fold's model on the held-out test set.
    final_acc = test_model(test_x, test_y, best_fold_model)
    print("TEST: " + str(final_acc))