def question10():
    perceptron = Perceptron()
    svm = SVM()
    lda = LDA()
    repeat = 500
    mean_accuracies = np.empty((3, len(all_m)))
    accuracies = np.empty((3, repeat))
    for i, m in enumerate(all_m):
        for j in range(repeat):
            X, yx = draw_points_until_two_classes(m)  # training set
            Z, yz = draw_points_until_two_classes(k)  # test set
            perceptron.fit(X, yx)
            svm.fit(X, yx)
            lda.fit(X, yx)
            accuracies[0, j] = perceptron.score(Z, yz)["accuracy"]
            accuracies[1, j] = svm.score(Z, yz)["accuracy"]
            accuracies[2, j] = lda.score(Z, yz)["accuracy"]
        mean_accuracies[:, i] = accuracies.mean(axis=1)
    models = ["Perceptron", "SVM", "LDA"]
    colors = ['blue', 'red', 'green']
    fig = plt.figure()
    for i, model in enumerate(models):
        plt.plot(all_m, mean_accuracies[i, :], color=colors[i], label=model)
    plt.legend()
    plt.title("Q10: mean accuracy as function of m")
    fig.savefig("q10.png", bbox_inches='tight', pad_inches=0.2, dpi=fig.dpi)
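
# draw_points_until_two_classes is used above (in its one-argument form) but
# not defined in this file. A minimal sketch of what it plausibly does,
# assuming points ~ N(0, I_2) with true labels sign(<w, x> + b); the
# hyperplane values mirror the ones hard-coded in get_true_y further below
# and are an assumption:
def draw_points_until_two_classes(m):
    w_true, b_true = np.array([0.3, -0.5]), 0.1  # assumed true hyperplane
    while True:
        X = np.random.multivariate_normal(np.zeros(2), np.eye(2), m)
        y = np.sign(X @ w_true + b_true)
        if (y == 1).any() and (y == -1).any():
            return X, y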
def character_classification():
    print('Loading data...')
    x, y = load_data_chars()
    print('Processing data...')
    print('Data shape: ', x.shape)
    print('Labels shape: ', y.shape)
    plots.plot_filters(x[0])
    SVM.svm(x, y)
    Naive_Bayes.naive_bayes(x, y)
    KNN.knn(x, y)
    CNN.fit_cnn(x, y, trials=1, network_type='simple')
def train(args):
    """
    This function trains the models
    :param args: the command line arguments defining the desired actions
    """
    # load data
    train_data_all, dev_data_all, _ = load(
        args.data_dir,
        cachedir=args.cachedir,
        override_cache=args.override_cache,
        text_only=(args.model.lower() in ["bi-lstm", "bert"]),
        include_tfidf=args.include_tfidf,
        balanced=args.balanced)
    train_data, train_labels = train_data_all.X, train_data_all.y
    dev_data, dev_labels = dev_data_all.X, dev_data_all.y

    # Build model
    apx = get_appendix(args.include_tfidf, args.balanced)
    if args.model.lower() == "simple-ff":
        model = FeedForward(args.ff_hunits, train_data.shape[1])
        train_pytorch(args, model, train_data, train_labels, dev_data,
                      dev_labels, save_model_path=f"models/simple-ff{apx}.torch")
    elif args.model.lower() == "bi-lstm":
        model = BiLSTM(epochs=args.num_epochs, batch_size=args.batch_size,
                       max_seq_len=args.max_seq_len)
        model.train(train_data, train_labels, dev_data, dev_labels)
    elif args.model.lower() == "logreg":
        model = LogisticRegression()
        model.train(train_data, train_labels, dev_data, dev_labels,
                    save_model_path=f"models/logreg{apx}.pkl")
    elif args.model.lower() == "majority-vote":
        model = MajorityVote()
        model.train(train_labels, dev_labels)
    elif args.model.lower() == "bert":
        model = Bert(epochs=args.num_epochs, batch_size=args.batch_size,
                     max_seq_len=args.max_seq_len,
                     learning_rate=args.learning_rate)
        model.train(train_data, train_labels, dev_data, dev_labels,
                    save_model_path="models/bert.pkl")
    elif args.model.lower() == "svm":
        model = SVM()
        model.train(train_data, train_labels,
                    save_model_path=f"models/svm{apx}.sav")
    else:
        raise Exception("Unknown model type passed in!")
def build_model(model_name):
    if model_name == "dtree":
        return DTree()
    elif model_name == "svm":
        return SVM()
    else:
        print("No model")
        exit(-1)
def comparing_models(X_train, X_test, y_train, y_test):
    AdaBoost(X_train, X_test, y_train, y_test)
    Logistic_Regression(X_train, X_test, y_train, y_test)
    NaiveBayes(X_train, X_test, y_train, y_test)
    XGBoost(X_train, X_test, y_train, y_test)
    RandomForest(X_train, X_test, y_train, y_test)
    SVM(X_train, X_test, y_train, y_test)
    NeuralNetwork(X_train, X_test, y_train, y_test)
def one_iteraion(m):
    _models = [Perceptron(), SVM(), LDA()]
    X, y, _f = genrate_real_plane(m)
    ret = []
    for _model in _models:
        _model.fit(deepcopy(X), y)
        Z, _ = draw_points(k)
        ret.append(accur(_model, _f, Z))
    return np.array(ret)
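
# accur is not defined in this snippet; a plausible sketch, assuming `_f`
# returns the true labels for the drawn points and the model has `predict`:
def accur(model, _f, Z):
    y_true = _f(Z)             # labels from the true separating plane
    y_pred = model.predict(Z)  # model predictions on the fresh test points
    return np.mean(y_pred == y_true)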
def question14(x_train, y_train, x_test, y_test):
    all_m = [50, 100, 300, 500]
    repeat = 50
    n_neighbors = 4  # after checking some values, this turned out to be the best one
    max_depth = 8  # after checking some values, this turned out to be the best one
    models = [
        Logistic(),
        SVM(),
        DecisionTree(max_depth=max_depth),
        NearestNeighbors(n_neighbors=n_neighbors)
    ]
    models_names = [
        "Logistic",
        "SVM",
        "Decision Tree, depth=" + str(max_depth),
        "Nearest Neighbors, neighbors=" + str(n_neighbors)
    ]
    # models_names = ["Logistic", "SVM"]
    # models = [Logistic(), SVM()]
    # for i in range(3, 9):
    #     models.append(DecisionTree(max_depth=i))
    #     models_names.append("decision " + str(i))
    #     models.append(NearestNeighbors(n_neighbors=i))
    #     models_names.append("Neighbors " + str(i))
    mean_accuracies = np.empty((len(models), len(all_m)))
    accuracies = np.empty((len(models), repeat))
    running_time = np.zeros((len(models), len(all_m)))
    Z, yz = x_test, y_test  # test set
    for i, m in enumerate(all_m):
        for j in range(repeat):
            X, yx = draw_points_until_two_classes(x_train, y_train, m)  # training set
            for k, model in enumerate(models):
                start = time()
                model.fit(X, yx)
                accuracies[k, j] = model.model.score(Z, yz)
                end = time()
                running_time[k, i] += (end - start)
        mean_accuracies[:, i] = accuracies.mean(axis=1)
    colors = ['blue', 'red', 'green', 'orange']
    fig = plt.figure()
    for i, model in enumerate(models_names):
        # plt.plot(all_m, mean_accuracies[i, :], color=colors[i], label=model)
        plt.plot(all_m, mean_accuracies[i, :], label=model)
    plt.legend()
    fig.suptitle("Q14: mean accuracy as function of m")
    fig.savefig("q14.png", bbox_inches='tight', pad_inches=0.1, dpi=fig.dpi)
    running_time = running_time / repeat
    print(pd.DataFrame(running_time, models_names,
                       ["m=" + str(m) for m in all_m]))
def question9():
    perceptron = Perceptron()
    svm = SVM()
    fig = plt.figure()
    plt.suptitle("Q9: True vs. Perceptron vs. SVM hyperplanes")
    for i, m in enumerate(all_m):
        X, y = draw_points_until_two_classes(m)
        svm.fit(X, y)
        ax = fig.add_subplot(2, 3, i + 1)
        ax.scatter(X[y == -1, 0], X[y == -1, 1], color="blue",
                   label="y=-1")  # first class, labeled -1
        ax.scatter(X[y == 1, 0], X[y == 1, 1], color="red",
                   label="y=1")  # second class, labeled 1
        xmin, xmax = plt.xlim()
        xx = np.linspace(xmin, xmax)
        true_hyperplane = a * xx - (b / w[1])

        # perceptron
        perceptron.fit(X, y)
        w_perc = perceptron.model[:-1]
        a_perceptron = -w_perc[0] / w_perc[1]
        b_perceptron = perceptron.model[-1] / perceptron.model[1]
        perceptron_hyperplane = a_perceptron * xx - b_perceptron

        w_svm = svm.model.coef_[0]
        a_svm = -w_svm[0] / w_svm[1]
        b_svm = svm.model.intercept_[0] / w_svm[1]
        SVM_hyperplane = a_svm * xx - b_svm

        ax.plot(xx, true_hyperplane, color="black", label="true hyperplane")
        ax.plot(xx, perceptron_hyperplane, color="green",
                label="perceptron hyperplane")
        ax.plot(xx, SVM_hyperplane, color="orange", label="svm hyperplane")
        ax.title.set_text("m=" + str(m))
        if i == 0:
            plt.legend()
    fig.savefig("q9.png", bbox_inches='tight', pad_inches=0.3, dpi=fig.dpi)
def test(args):
    """
    This function tests our models
    :param args: the command line arguments with the desired actions
    """
    _, _, test_data_all = load(
        args.data_dir,
        cachedir=args.cachedir,
        override_cache=args.override_cache,
        text_only=(args.model.lower() in ["bi-lstm", "bert"]),
        include_tfidf=args.include_tfidf,
        balanced=args.balanced)
    test_data, test_labels = test_data_all.X, test_data_all.y

    apx = get_appendix(args.include_tfidf, args.balanced)
    if args.model.lower() == "simple-ff":
        preds = test_pytorch(test_data, test_labels,
                             load_model_path=f"models/simple-ff{apx}.torch",
                             predictions_file=f"preds/simple-ff-preds{apx}.txt")
    elif args.model.lower() == "bi-lstm":
        model = BiLSTM(load_model_path="models/bilstm.keras",
                       tokenizer_path='models/bilstm-tokenizer.json')
        preds = model.test(test_data, y_test=test_labels)
    elif args.model.lower() == "logreg":
        model = LogisticRegression(load_model_path=f"models/logreg{apx}.pkl")
        preds = model.test(test_data, test_labels,
                           save_predictions_path=f"preds/logreg-preds{apx}.txt")
    elif args.model.lower() == "majority-vote":
        model = MajorityVote(load_model_path="models/majority-class.txt")
        preds = model.test(test_labels)
    elif args.model.lower() == "bert":
        model = Bert(load_model_path="models/bert.pkl")
        preds = model.test(test_data, test_labels,
                           save_predictions_path="preds/bert-preds.txt")
    elif args.model.lower() == "svm":
        model = SVM(load_model_path=f"models/svm{apx}.sav")
        preds = model.test(test_data,
                           save_predictions_path=f"preds/svm-preds{apx}.txt")
    else:
        raise Exception("Unknown model type passed in!")

    metrics = classification_report(test_labels, preds, output_dict=True)
    pprint(metrics)
    with open(f"scores/{args.model.lower()}{apx}.json", "w") as fout:
        json.dump(metrics, fout, indent=4)
def one_iteraion(m):
    _models = [Logistic(), DecisionTree(), KNearestNeighbor(), SVM()]
    X, y, indexes = generate(m, x_train, y_train)
    # resample until both classes are present in the training labels
    while (0 not in y) or (1 not in y):
        X, y, indexes = generate(m, x_train, y_train)
    ret = []
    for _model in _models:
        start_time = time.time()
        _model.fit(deepcopy(X), y)
        elapsed_time = time.time() - start_time
        print("train : {} takes {}".format(_model, elapsed_time))
        Z, _, indexes = generate(k, x_test, y_test)
        ret.append(accur(_model, indexes, Z))
    return np.array(ret)
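
# `generate` is not shown here; a plausible sketch inferred from how it is
# called (sample m examples and also return the sampled indices):
def generate(m, x, y):
    indexes = np.random.choice(len(x), size=m, replace=False)
    return x[indexes], y[indexes], indexes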
def main(grid):
    # Get Clean Data
    X, Y = read_clean_data()

    # Linear Regression
    try:
        LinearRegression(X, Y, grid)
    except Exception as e:
        print(e)

    # Binarize Y
    Y_binary = BinaryY(Y)

    # Logistic Regression
    try:
        LogisticRegression(X, Y_binary, grid)
    except Exception as e:
        print(e)

    # Decision Tree
    try:
        DecisionTree(X, Y_binary, grid)
    except Exception as e:
        print(e)

    # Support Vector Machine
    try:
        SVM(X, Y_binary, grid)
    except Exception as e:
        print(e)

    # Random Forest
    try:
        RandomForest(X, Y_binary, grid)
    except Exception as e:
        print(e)

    # Bagging Classifier
    try:
        Bagging(X, Y_binary, grid)
    except Exception as e:
        print(e)

    # Neural Network
    try:
        NeuralNet(X, Y_binary, grid)
    except Exception as e:
        print(e)
def perform_svm_grid_search(self, c_svm, kernel_svm):
    acc_train_svm = {}
    acc_test_svm = {}
    progress_bar = tqdm(total=len(c_svm) * len(kernel_svm),
                        desc='Grid searching for best svm')
    for kernel in kernel_svm:
        acc_train_svm[kernel] = []
        acc_test_svm[kernel] = []
        for c in c_svm:
            acc1, acc2 = SVM.train_svm(self.train_data, self.validation_data,
                                       c, kernel, self.svm_type)
            log_message = "SVM kernel: {},\t SVM c parameter: {}\n".format(kernel, c)
            log_message += "Training accuracy: {},\t Validation accuracy: {}\n".format(acc1, acc2)
            util.logger(log_message, self.log_folder)
            acc_train_svm[kernel].append(acc1)
            acc_test_svm[kernel].append(acc2)
            progress_bar.update(1)

    for key in acc_train_svm.keys():
        plt.clf()
        plt.plot(c_svm, acc_train_svm[key], '.-', color='red')
        plt.plot(c_svm, acc_test_svm[key], '.-', color='orange')
        plt.xlabel('c')
        plt.ylabel('Accuracy')
        plt.title("Plot of accuracy vs c for training and validation data "
                  "for {} kernel".format(key))
        plt.grid()
        plot_save_path = os.path.join(self.plot_folder, "svm_{}.png".format(key))
        plt.savefig(plot_save_path)
        loss = loss_fn(y, y_hat)  # hinge loss
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step


def hinge_loss(y, y_hat):
    return torch.mean(torch.clamp(1 - y_hat * y, min=0))


model = SVM()  # Our model
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Our optimizer
model.train()  # Our model, SVM is a subclass of nn.Module, so it inherits the train method

losses = []
val_losses = []
train_step = make_train_step(model, hinge_loss, optimizer)
train_per_epoch = int(len(train_set) / batch_size)

for epoch in range(n_epochs):
    sum_loss = 0
    sum_val_loss = 0
    kbar = pkbar.Kbar(target=train_per_epoch, width=8)
    for i, batch in enumerate(train_loader):
        # TODO: when we have CUDA:
        # x_batch = batch['imagePower'].to(device)
        # y_batch = batch['label'].to(device)
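
# The fragment above starts midway through make_train_step, and the SVM
# module itself is not shown. A minimal self-contained sketch of both for
# context, assuming a linear SVM over flattened inputs with labels in
# {-1, +1} (the feature size is a placeholder):
import torch
import torch.nn as nn


class SVM(nn.Module):
    def __init__(self, in_features=2):
        super().__init__()
        self.linear = nn.Linear(in_features, 1)  # scores w.x + b

    def forward(self, x):
        return self.linear(x)


def make_train_step(model, loss_fn, optimizer):
    def train_step(x, y):
        y_hat = model(x).squeeze(-1)
        loss = loss_fn(y, y_hat)  # hinge loss, as defined above
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        return loss.item()
    return train_step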
def run_models(words, models, verbose, train=True, test=True, embeddings=False):
    '''
    Runs all the models that are specified with the specified word set.
    It runs all preprocessing steps necessary for the models specified.
    Note: If a model is specified twice, it will be run twice, but the
    preprocessing on the input data will not (useful to test for model
    parameter initialization).

    Returns a list containing the objects of the models used, the outputs
    they predicted and the sklearn classification reports (dictionary
    format), in the order in which they were provided.

    Keyword arguments:
    words: list of list of words and features. Format: n*m. n=nr of words,
           m=nr features + expected output (single)
    models: a list of model-name strings. Order is not important. Possible
            models are: NB, LR, SVM, HMM, CRF. Coming soon: CNN.
            If a model is specified twice, it will be run twice. The input
            is randomized only once, where applicable.
    verbose: bitmask. 0: print nothing, 1: print results,
             2: print status messages, 3: print both
    '''
    # Preparing data for one-hot encoding -- converts strings into integers
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Initial pre-processing...')
        if embeddings:
            stems = [word[0] for word in words]
            words = [word[1:] for word in words]
        X, Y, transl, labels_num, labels_name = create_dataset(words)

    # Algorithm uses sentences (list of list of tuples): HMM
    if 'HMM' in models:
        verbose & 2 and print('Preprocessing data for HMM...')
        sentences_hmm, symbols, tag_set = words2tuples(words)
        _, y_train, _, y_test = split_tr([], sentences_hmm, 0.8)
        x_test = [[tup[0] for tup in sentence] for sentence in y_test]
        y_test = [[tup[1] for tup in sentence] for sentence in y_test]
        # shuffle_parallel(x_test, y_test)
        data_hmm = data_wrap(None, y_train, x_test, y_test)

    # Algorithms using shuffled, one-hot data: NB, LR, SVM
    if any(i in models for i in ['NB', 'LR', 'SVM']):
        verbose & 2 and print('Preprocessing data for NB, LR and/or SVM...')
        indexes = shuffle_parallel(X, Y)
        X_onehot_sh = one_hot(X, transl)
        if embeddings:
            verbose & 2 and print('Loading and generating embeddings...')
            X_onehot_sh = embeddings.insert_embeddings(X_onehot_sh, stems, indexes)
        x_train_oh_sh, y_train_oh_sh, x_test_oh_sh, y_test_oh_sh = split_tr(
            X_onehot_sh, Y, 0.8)
        data_shuffled = data_wrap(x_train_oh_sh, y_train_oh_sh, x_test_oh_sh,
                                  y_test_oh_sh, transl, labels_num, labels_name)

    # Ordered, using sentences (list of list of dict): CRF
    if 'CRF' in models:
        verbose & 2 and print('Preprocessing data for CRF...')
        tokens_dict, labels_dict = words2dictionary(words)
        shuffle_parallel(tokens_dict, labels_dict)
        tokens_train, labels_train, tokens_test, labels_test = split_tr(
            tokens_dict, labels_dict, 0.8)
        data_dictionary = data_wrap(tokens_train, labels_train, tokens_test,
                                    labels_test)

    model_objects = []
    model_results = []
    model_predictions = []

    # Removes clutter when calling the functions separately.
    # A list of function handlers could also be used, but I find that to be
    # less intuitive.
    def _add_to_output(model_y_pred):
        model_objects.append(model_y_pred[0])
        model_results.append(model_y_pred[1])
        if len(model_y_pred) > 2:
            model_predictions.append(model_y_pred[2])

    # Run each of the models from the parameters, while KEEPING THE ORDER
    # they were called in, and append it to the return lists
    for model in models:
        if 'HMM' in model:
            verbose & 2 and print('Running HMM from nltk...')
            _add_to_output(HMM(data_hmm, symbols, tag_set, verbose & 1))
        if 'NB' in model:
            verbose & 2 and print('Running NB ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            if embeddings:
                _add_to_output(NB_cont(data_shuffled, verbose & 1))
            else:
                _add_to_output(NB_disc(data_shuffled, verbose & 1))
        if 'LR' in model:
            verbose & 2 and print('Running LR ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(LR(data_shuffled, verbose & 1,
                              C=(0.1 if embeddings else 5)))
        if 'SVM' in model:
            verbose & 2 and print('Running SVM ' +
                                  ('with ' if embeddings else 'without ') +
                                  'embeddings...')
            _add_to_output(SVM(data_shuffled, verbose & 1))
        if 'CRF' in model:
            verbose & 2 and print('Running CRF...')
            _add_to_output(CRF(data_dictionary, verbose & 1))

    return model_objects, model_results, model_predictions
with open(path_test_data, 'rb') as f:
    X_test = pickle.load(f)
    Y_test = pickle.load(f)
    test_fantom_labels = pickle.load(f)
    test_reconstr_labels = pickle.load(f)

# pad the inputs out to a fixed length of 32768 samples
X = np.ones((X_test.shape[0], 32768, 4), dtype=np.float16)
Y = np.ones((X_test.shape[0], 32768), dtype=np.float16)
X[:, 0:X_test.shape[1]] = X_test
X = np.reshape(X, (X.shape[0], X.shape[1], 4))

# load model
model = SVM((X.shape[1], 4))
# model = model_from_json(open('model.json').read())
model.load_weights(path_best_weights)

# predict
Y_pred = model.predict(X)
Y_pred = Y_pred[:, 0:25350]
Y_pred_hard = 2 * np.argmax(Y_pred, axis=-1) + 1
Y_pred_soft = 2 * Y_pred[:, :, 1] + 1

# place for predictions
predicted_phantoms_hard = sio.loadmat(path_predicted_phantoms_hard)
phantoms_data = predicted_phantoms_hard['PhantomDataBase']
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes,
                                                  get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes,
                                                 get_question)
    model = SVM()
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
elif _model == "NN":
    train_dists = preprocessing.samples_to_dists(train_samples, _classes,
                                                 get_question)
    test_dists = preprocessing.samples_to_dists(test_samples, _classes,
                                                get_question)
    model = Neural_Network(_n_factors=train_matrix.shape[1],
                           _learning_rate=_learning_rate,
                           _hidden_nodes=_hidden_nodes,
                           _last_layer=len(_classes))
    model.train(train_matrix, train_dists, test_matrix, test_dists)
    predict = model.predict(test_matrix)
    predict = preprocessing.dists_to_labels(predict, _classes)
# Do SVM with Gaussian Kernel predictions
results0 = np.zeros(3000)
len_files = len(FILES)
for i in range(len_files):
    γ = gamma_list[i]
    λ = lambda_list[i]
    X_train, Y_train, X_test = load_data(i, data_dir=DATA_DIR,
                                         files_dict=FILES)
    kernel = GaussianKernel(γ)
    clf = SVM(_lambda=λ, kernel=kernel)
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    results0[i * 1000:i * 1000 + 1000] = y_pred

# Save results
save_results("results_SVM_gaussian.csv", results0, RESULT_DIR)
print("1/3 Ending SVM with Gaussian kernel...")

#####################################
# 2) SVM with Convolutional kernel  #
#####################################
print("2/3 Starting SVM with Convolutional kernel...")

# Define parameter lists
sigma_list = [0.31, 0.31, 0.3]
k_list = [9, 10, 11]
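
# GaussianKernel is project code not shown here; a minimal sketch of the
# RBF Gram-matrix builder it plausibly resembles (the callable API is an
# assumption based on how it is passed to SVM above):
class GaussianKernel:
    def __init__(self, gamma):
        self.gamma = gamma

    def __call__(self, X1, X2):
        # ||x - z||^2 expanded as ||x||^2 + ||z||^2 - 2 x.z, vectorized
        sq_dists = (np.sum(X1 ** 2, axis=1)[:, None]
                    + np.sum(X2 ** 2, axis=1)[None, :]
                    - 2 * X1 @ X2.T)
        return np.exp(-self.gamma * sq_dists)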
def main(DATA_NUM, DATA_PATH, TRAIN_TIMES, N_LABELED, TEST_SIZE, N_CLASS):
    # read training data
    logger.info('Read kaggle training data')
    data = pd.read_csv(DATA_PATH, index_col=0)
    logger.info('The train shape : {}'.format(data.shape))

    # read testing data
    # logger.info('Read kaggle testing data')
    # testing_data_set = pd.read_csv(testing_data_path, index_col=0)
    # logger.info('The test shape : {}'.format(testing_data_set.shape))

    # Comparing UncertaintySampling strategy with RandomSampling.
    # model is the base learner, e.g. LogisticRegression, SVM ... etc.
    columns = list(data.columns)
    columns.remove('Class')
    columns.remove('Id')
    scaler = StandardScaler()
    results = []
    results_loss = []

    for T in range(TRAIN_TIMES):  # repeat the experiment T times
        logger.info("%dth experiment" % (T + 1))
        if DATA_NUM != 0:
            random_data = data.sample(n=DATA_NUM, weights='Class',
                                      random_state=T + 1)
            random_data.to_csv('random_data_sample.csv', encoding='utf-8')
            logger.info(random_data.head())
        else:
            random_data = copy.deepcopy(data)
            del data
            gc.collect()

        x = random_data.loc[:, columns].values
        x = scaler.fit_transform(x)
        x = np.concatenate((random_data['Id'].values.reshape(-1, 1), x), axis=1)
        y = random_data.loc[:, 'Class'].values
        trn_ds, tst_ds, y_train, fully_labeled_trn_ds = split_train_test(
            x, y, TEST_SIZE, N_LABELED, N_CLASS)
        lbr = IdealLabeler(fully_labeled_trn_ds)
        quota = len(y_train) - N_LABELED  # number of samples to query
        logger.info(quota)

        start_time = time.time()
        logger.info('Running ALBL')
        qs = ActiveLearningByLearning(
            trn_ds,
            query_strategies=[
                UncertaintySampling(trn_ds,
                                    model=SVM(kernel='linear',
                                              decision_function_shape='ovr')),
                QUIRE(trn_ds),
                RandomSampling(trn_ds)
                # HintSVM(trn_ds, cl=1.0, ch=1.0),  # only supports binary class
            ],
            T=quota,
            uniform_sampler=True,
            model=SVM(kernel='linear', decision_function_shape='ovr'))
        model = SVM(kernel='linear', decision_function_shape='ovr')
        _, E_out_5, loss = run(trn_ds, tst_ds, lbr, model, qs, quota, T)
        end_time = time.time()
        logger.info('ALBL finish. {} s'.format(end_time - start_time))
        results.append(E_out_5.tolist())
        results_loss.append(np.array(loss))

        # save albl error value and validation loss
        pickle.dump(results, open('albl_error.pkl', 'wb'))
        pickle.dump(results_loss, open('albl_Loss_record.pkl', 'wb'))

    # Plot the learning curve of UncertaintySampling to RandomSampling.
    # The x-axis is the number of queries, and the y-axis is the
    # corresponding error rate.
    fig = plt.figure()
    plt.plot(np.mean(results, axis=0), 'c', label='ALBL')
    plt.xlabel('Number of Queries')
    plt.ylabel('Error')
    fig.suptitle('ALBL Experiment Result')
    plt.legend(loc='upper right')
    # plt.xticks(np.arange(0, DATA_NUM//100, step=1))
    fig.savefig('./loss_figure/albl_loss_kaggle_malware.png')
    # plt.show()

    fig2 = plt.figure()
    plt.plot(np.mean(results_loss, axis=0), 'c', label='val_loss')
    plt.xlabel('Validation Times')
    plt.ylabel('Loss')
    fig2.suptitle('Validation Loss Result')
    plt.legend(loc='upper right')
    plt.xticks(np.arange(0, DATA_NUM // 100, step=1))
    fig2.savefig('./loss_figure/kaggle_loss.png')
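
# split_train_test is project code; a plausible sketch under the assumption
# that it wraps the data in libact Datasets, keeping only the first
# N_LABELED labels in the training pool (n_class is unused in this sketch):
from libact.base.dataset import Dataset
from sklearn.model_selection import train_test_split


def split_train_test(x, y, test_size, n_labeled, n_class):
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=test_size)
    # unlabeled entries are marked with None, as libact expects
    trn_ds = Dataset(x_train,
                     np.concatenate([y_train[:n_labeled],
                                     [None] * (len(y_train) - n_labeled)]))
    tst_ds = Dataset(x_test, y_test)
    fully_labeled_trn_ds = Dataset(x_train, y_train)
    return trn_ds, tst_ds, y_train, fully_labeled_trn_ds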
def test_everything(args):
    ## Get features, labels, training and testing set, adjacency
    args, file_names, stat_dirname, features, gt_labels, genres, adjacency, \
        indx_train, indx_test, pygsp_graph, release_dates = \
        load_parameters_and_data(args)

    if args.graph_statistics:
        if not os.path.exists(stat_dirname):
            os.makedirs(stat_dirname)
        if args.graph_statistics == 'all':
            ## Prints out all statistics about graph
            gstats.allstats(adjacency, stat_dirname, active_plots=False)
        elif args.graph_statistics == 'advanced':
            ## Prints out all advanced statistics
            gstats.advanced(adjacency, stat_dirname,
                            active_plots=args.plot_graph)
        else:  # basic setting
            ## Prints out basic statistics
            gstats.basic(adjacency)
        gstats.growth_analysis(adjacency, release_dates, gt_labels,
                               stat_dirname)

    if args.inductive_learning:
        print('#### Testing Inductive Learning ####')
        if args.additional_models:
            ## Initialize models with correct parameters
            svm_clf = SVM(features, gt_labels, kernel='linear', seed=SEED,
                          save_path=file_names)
            random_forest_clf = Random_Forest(features, gt_labels,
                                              n_estimators=100, max_depth=20,
                                              seed=SEED, save_path=file_names)
            knn_clf = KNN(features, gt_labels, save_path=file_names)

            error_svm = simple_test(svm_clf, indx_test, classes=genres,
                                    name=file_names + "svm_")
            print('* SVM simple test error: {:.2f}'.format(error_svm))
            error_rf = simple_test(random_forest_clf, indx_test,
                                   classes=genres, name=file_names + "rf_")
            print('* Random Forest simple test error: {:.2f}'.format(error_rf))
            error_knn = simple_test(knn_clf, indx_test, classes=genres,
                                    name=file_names + "knn_")
            print('* KNN simple test error: {:.2f}'.format(error_knn))

    if args.gcn:
        ## Initialize GCN with correct parameters
        gnn_clf = GCN(nhid=[1200, 100], dropout=0.1, adjacency=adjacency,
                      features=features, labels=gt_labels,
                      n_class=len(genres), cuda=args.use_cpu,
                      regularization=None, lr=0.01, weight_decay=5e-4,
                      epochs=300, batch_size=10000, save_path=file_names)
        error_gnn = simple_test(gnn_clf, indx_test, classes=genres,
                                name=file_names + "gnn_")
        print('* GCN simple test error: {:.2f}'.format(error_gnn))

    if args.gcn_khop:
        ## Initialize GCN K-Hop with correct parameters
        gnn_clf = GCN_KHop(nhid=[1200, 100], dropout=0.1,
                           adjacency=adjacency, features=features,
                           labels=gt_labels, n_class=len(genres), khop=2,
                           cuda=args.use_cpu, regularization=None, lr=0.01,
                           weight_decay=5e-4, epochs=300, batch_size=10000,
                           save_path=file_names)
        error_gnn = simple_test(gnn_clf, indx_test, classes=genres,
                                name=file_names + "gnn_khop_")
        print('* GCN KHop simple test error: {:.2f}'.format(error_gnn))

    if args.mlp_nn:
        ## Initialize MLP with correct parameters
        mlp_nn = MLP_NN(hidden_size=100, features=features, labels=gt_labels,
                        num_epoch=10, batch_size=100,
                        num_classes=len(genres), save_path=file_names,
                        cuda=args.use_cpu)
        error_mlpNN = simple_test(mlp_nn, indx_test, classes=genres,
                                  name=file_names + "mlpNN_")
        print('* MLP NN simple test error: {:.2f}'.format(error_mlpNN))
if __name__ == "__main__": train_data = [] train_label = [] load_data = [] for file in config.data_files: load_data.append(LoadData(file)) for cpt in range(len(load_data)): train_x, train_y = load_data[cpt].getTrainData() train_data += train_x train_label += train_y nb_model_nb = NaiveBayes(train_data, train_label) nb_model_svm = SVM(train_data, train_label) # Save Naive Bayes Model nb_pickle = open(config.naive_bayes_path, 'wb') pickle.dump(nb_model_nb, nb_pickle) nb_pickle.close() # Save SVM Model svm_pickle = open(config.SVM_path, 'wb') pickle.dump(nb_model_nb, svm_pickle) svm_pickle.close() valid_data = [] valid_label = [] for cpt in range(len(load_data)): valid_x, valid_y = load_data[cpt].getTestData()
# Paths to training and testing set
TRAINING_SET = '../resources/csv/training_set.csv'
TEST_SET = '../resources/csv/test_set.csv'

# Path to export predictions
DESTINATION = '../products/'

# Fingerprint transformation
FINGERPRINT = fingerprints.morgan()

# Model to train
MODEL = ConsensusClassifier([
    KNN(n_neighbors=17),
    MLP(random_state=0),
    SVM(gamma='auto', random_state=0, probability=True),
    RFC(500, random_state=0)
])

########
# Main #
########

if __name__ == '__main__':
    # Load training and test set
    LS = utils.load_from_csv(TRAINING_SET)
    TS = utils.load_from_csv(TEST_SET)

    # Create fingerprint features and output of learning set
    X_LS = fingerprints.transform(LS['SMILES'].values, FINGERPRINT)
    y_LS = LS['ACTIVE'].values
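
# ConsensusClassifier is project code; a minimal soft-voting sketch of what
# it plausibly does (the class internals and API are assumptions based on
# the usage above):
class ConsensusClassifier:
    def __init__(self, estimators):
        self.estimators = estimators

    def fit(self, X, y):
        for est in self.estimators:
            est.fit(X, y)
        return self

    def predict_proba(self, X):
        # average the per-model class probabilities
        return np.mean([est.predict_proba(X) for est in self.estimators],
                       axis=0)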
def __init__(self):
    self.resource_folder = get_resource_path()
    # for dataset_name in sorted(os.listdir(folder)):
    #     if dataset_name.endswith('.csv'):
    #         print(dataset_name[:-4])
    self.pipelines = {
        'credit-g': ('credit-g/dataset_31_credit-g.csv', 'class',
                     CreditGPipeline()),
        'wine-quality': ('wine-quality/wine-quality-red.csv', 'class',
                         WineQualityPipeline()),
        'wq-missing': ('wine-quality/wine-quality-red.csv', 'class',
                       WineQualityMissingPipeline()),
        'abalone': ('abalone/abalone.csv', 'Rings', AbalonePipeline()),
        'adult': ('adult/adult.csv', 'class', AdultPipeline()),
        'adult-missing': ('adult/adult.csv', 'class', AdultMissingPipeline()),
        'heart': ('heart/heart.csv', 'class', HeartPipeline())}
    self.classifiers = {
        'dtc': DecisionTree(),
        'rfc40': RandomForest(size=40),
        'ertc40': ExtremelyRandomizedTrees(size=40),
        'xgb': XGB(),
        'svm': SVM(),
        'lsvm': LinearSVM(),
        'knn': KNN(n_neighbors=7),
        'logreg': LogRegression(),
        'gaus': GausNB(),
        'brfc40': BaggingRandomForest(size=40),
        'mlpc': MLPC(input_size=[16, 32, 16, 8])}
    self.error_gens = {
        'numeric anomalies': (Anomalies(),
                              lambda x: x.dtype in [DataType.INTEGER,
                                                    DataType.FLOAT]),
        'typos': (Typos(), lambda x: x.dtype == DataType.STRING),
        'explicit misvals': (ExplicitMissingValues(), lambda x: True),
        'implicit misvals': (ImplicitMissingValues(), lambda x: True),
        'swap fields': (SwapFields(), lambda x: True)}
    self.params = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8]
    self.tests = {
        'num disc': lambda x: (x.scale == DataScale.NOMINAL
                               and x.dtype in [DataType.INTEGER,
                                               DataType.FLOAT]),
        'num cont': lambda x: (x.scale == DataScale.NOMINAL
                               and x.dtype in [DataType.INTEGER,
                                               DataType.FLOAT]),
        'string': lambda x: x.dtype == DataType.STRING}
    self.results = Table(rows=sorted(self.pipelines.keys()),
                         columns=sorted(self.classifiers.keys()),
                         subrows=self.tests.keys(),
                         subcolumns=self.error_gens.keys())
print("Train set distribution:", preprocessing.samples_statistics(train_samples, _sections, get_section)) print("Test set distribution:", preprocessing.samples_statistics(test_samples, _sections, get_section)) train_texts = [sample.text for sample in train_samples] test_texts = [sample.text for sample in test_samples] tfidf_vectorizer = get_tfidfVectorizer_of_essay_top_tf_words() print("Vectorizer built..") train_matrix, test_matrix, words = preprocessing.preprocess(train_texts, test_texts, savedir = _save_dir, words_src = tfidf_vectorizer, normalize_flag = False, reduction = _reduction, reduce_n_attr = _reduce_n_attr, stem_words = _stem_words) model = None print("Generating labels..") if _model == "SVM": train_labels = preprocessing.samples_to_label(train_samples, _sections, get_section) test_labels = preprocessing.samples_to_label(test_samples, _sections, get_section) model = SVM() print("Training.. ") model.train(train_matrix, train_labels) predict = model.predict(test_matrix) elif _model == "NN": train_dists = preprocessing.samples_to_dists(train_samples, _sections, get_section) test_dists = preprocessing.samples_to_dists(test_samples, _sections, get_section) model = Neural_Network(_n_factors = train_matrix.shape[1], _learning_rate = _learning_rate, _hidden_nodes = _hidden_nodes, _last_layer = len(_sections)) print("Training.. ") model.train(train_matrix, train_dists, test_matrix, test_dists, max_iter = _max_iter) predict = model.predict(test_matrix) predict = preprocessing.dists_to_labels(predict, _sections) test_labels = preprocessing.samples_to_label(test_samples, _sections, get_section) else:
def analyze_clssifiers():
    # def generate_line_prec(prec):
    def plot(prec, _svm, X, y):
        plt.scatter(X[y == -1][:, 0], X[y == -1][:, 1])
        plt.scatter(X[y == 1][:, 0], X[y == 1][:, 1])
        min_x, max_x = min(X[:, 0]), max(X[:, 0])
        min_y, max_y = min(X[:, 1]), max(X[:, 1])
        print(min_x, max_x)
        _min_range, _max_range = min(min_x, min_y), max(max_x, max_y)
        print(prec.W)
        xx = [_min_range, _max_range]

        def get_y(W, _x):
            return -(W[0] + _x * W[1]) / W[2] if W[2] != 0 else -W[0]

        def get_y_prep(_x):
            return get_y(prec.W, _x)

        def get_y_svm(_x):
            print(_svm.coef_()[0])
            return get_y(_svm.coef_()[0], _x)

        def get_true_y(_x):
            return 0.1 / 0.5 + 0.3 / 0.5 * _x

        plt.xlim([_min_range, _max_range])
        plt.ylim([_min_range, _max_range])
        middle = (_min_range + _max_range) / 2

        def print_line(msg, _f, _color):
            xx = [_min_range, _max_range]
            yy = [_f(_x) for _x in xx]
            _x = middle + 2 * (0.5 - random())
            _y = _f(_x)
            plt.plot(xx, yy, color=_color)
            plt.annotate(msg, color=_color, xy=(_x, _y), xycoords='data',
                         xytext=(_x + 0.3, _y), textcoords='data',
                         arrowprops=dict(arrowstyle="->"))

        print_line("prep", get_y_prep, "C5")
        print_line("svm", get_y_svm, "C4")
        print_line("true plane", get_true_y, "C2")
        plt.title("svm vs prep")
        plt.xlabel("x")
        plt.ylabel("y")
        plt.show()

    for m in [5, 10, 15, 25, 70]:
        X, y = draw_points(m)
        blues, reds = X[y == 1], X[y == -1]
        _models = [Perceptron(), SVM()]
        for _model in _models:
            _model.fit(deepcopy(X), y)
        plot(_models[0], _models[1], X, y)
    # test the model with linear features
    y_hat = mdl.predict(x_norm_valid)

    # get metrics
    recall, precision, f1 = metrics(y_valid, y_hat)
    save_result(t, 'lr', 'linear', recall, precision, f1, C=None,
                gamma=p['gamma'])

    # print result
    t_elapsed = time.time() - t_start
    print('Logistic Regression w/ gamma = {:.2e}'.format(p['gamma']) +
          ' | Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f}, '.format(
              precision, recall, f1) +
          ' | Time = {:.2f} seconds'.format(t_elapsed))

# loop over svm model parameter space
mdl = SVM()
params = mdl.hyper_parameters()
for p in params:
    # train the model with linear features
    t_start = time.time()
    success = mdl.train(x_norm_train, y_train, C=p['C'], mode='primal')

    # did we succeed?
    if success:
        # test the model
        y_hat = mdl.predict(x_norm_valid)

        # get metrics
        recall, precision, f1 = metrics(y_valid, y_hat)
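
# A sketch of the metrics helper used above, assuming binary labels and the
# standard sklearn definitions of the three scores:
from sklearn.metrics import f1_score, precision_score, recall_score


def metrics(y_true, y_pred):
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return recall, precision, f1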
def train_everything(args):
    ## Get features, labels, training and testing set, adjacency
    args, file_names, stat_dirname, features, gt_labels, genres, adjacency, \
        indx_train, indx_test, pygsp_graph, release_dates = \
        load_parameters_and_data(args)

    if args.inductive_learning:
        print('#### Applying Inductive Learning ####')
        if args.additional_models:
            ## Initialize models with correct parameters
            svm_clf = SVM(features, gt_labels, kernel='linear', seed=SEED,
                          save_path=file_names)
            random_forest_clf = Random_Forest(features, gt_labels,
                                              n_estimators=100, max_depth=20,
                                              seed=SEED, save_path=file_names)
            knn_clf = KNN(features, gt_labels, save_path=file_names)

            start = time.time()
            mean_error_svm, std_error_svm = cross_validation(
                svm_clf, indx_train, K=5, classes=genres,
                name=file_names + "svm_")
            print('* SVM cross validation error mean: {:.2f}, std: {:.2f}'.format(
                mean_error_svm, std_error_svm))
            print("SVM time", time.time() - start)

            start = time.time()
            mean_error_rf, std_error_rf = cross_validation(
                random_forest_clf, indx_train, K=5, classes=genres,
                name=file_names + "rf_")
            print('* Random Forest cross validation error mean: {:.2f}, std: {:.2f}'.format(
                mean_error_rf, std_error_rf))
            print("RF time", time.time() - start)

            start = time.time()
            mean_error_knn, std_error_knn = cross_validation(
                knn_clf, indx_train, K=5, classes=genres,
                name=file_names + "knn_")
            print('* KNN cross validation error mean: {:.2f}, std: {:.2f}'.format(
                mean_error_knn, std_error_knn))
            print("KNN time", time.time() - start)

    if args.gcn:
        print("Training GCN")
        start = time.time()
        ## Initialize GCN with correct parameters
        gnn_clf = GCN(nhid=[1200, 100], dropout=0.1, adjacency=adjacency,
                      features=features, labels=gt_labels,
                      n_class=len(genres), cuda=args.use_cpu,
                      regularization=None, lr=0.01, weight_decay=5e-4,
                      epochs=300, batch_size=10000, save_path=file_names)
        train_gcn(gnn_clf, indx_train, name=file_names + "gnn_")
        print("GCN time", time.time() - start)

    if args.gcn_khop:
        print("Training GCN K-Hop")
        start = time.time()
        ## Initialize GCN K-Hop with correct parameters
        gnn_clf = GCN_KHop(nhid=[1200, 100], dropout=0.1,
                           adjacency=adjacency, features=features,
                           labels=gt_labels, n_class=len(genres), khop=2,
                           cuda=args.use_cpu, regularization=None, lr=0.01,
                           weight_decay=5e-4, epochs=300, batch_size=10000,
                           save_path=file_names)
        train_gcn(gnn_clf, indx_train, name=file_names + "gnn_khop_")
        print("GCN K-Hop time", time.time() - start)

    if args.mlp_nn:
        start = time.time()
        ## Initialize MLP with correct parameters
        mlp_nn = MLP_NN(hidden_size=100, features=features, labels=gt_labels,
                        num_epoch=100, batch_size=100,
                        num_classes=len(genres), save_path=file_names,
                        cuda=args.use_cpu)
        mean_error_mlpNN, std_error_mlpNN = cross_validation(
            mlp_nn, indx_train, K=5, classes=genres,
            name=file_names + "mlpNN_")
        print('* MLP NN cross validation error mean: {:.2f}, std: {:.2f}'.format(
            mean_error_mlpNN, std_error_mlpNN))
        print("MLP time", time.time() - start)
train_samples = samples[0:int(n_samples * _train_ratio)]
test_samples = samples[int(n_samples * _train_ratio):n_samples]

print("Samples distribution:",
      preprocessing.samples_statistics(samples, _classes, get_question))
print("Train set distribution:",
      preprocessing.samples_statistics(train_samples, _classes, get_question))
print("Test set distribution:",
      preprocessing.samples_statistics(test_samples, _classes, get_question))

train_texts = [sample.text for sample in train_samples]
test_texts = [sample.text for sample in test_samples]

train_matrix, test_matrix, words = preprocessing.preprocess(
    train_texts, test_texts, words_src="samples", normalize_flag=False)

if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _classes,
                                                  get_question)
    test_labels = preprocessing.samples_to_label(test_samples, _classes,
                                                 get_question)
    model = SVM()
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
elif _model == "NN":
    train_dists = preprocessing.samples_to_dists(train_samples, _classes,
                                                 get_question)
    test_dists = preprocessing.samples_to_dists(test_samples, _classes,
                                                get_question)
    model = Neural_Network(_n_factors=train_matrix.shape[1],
                           _learning_rate=_learning_rate,
                           _hidden_nodes=_hidden_nodes,
                           _last_layer=len(_classes))
    model.train(train_matrix, train_dists, test_matrix, test_dists)
    predict = model.predict(test_matrix)
    predict = preprocessing.dists_to_labels(predict, _classes)
    test_labels = preprocessing.samples_to_label(test_samples, _classes,
                                                 get_question)
else:
    raise Exception("Unknown model flag '%s'" % str(_model))
           mosaic(20, images))
cv2.imwrite('out/test_set.jpg', mosaic(20, shoes_test))
cv2.imwrite('out/train_set.jpg', mosaic(20, shoes_train))

print('training KNearest...')
model = KNearest(k=4)
model.train(samples_train, labels_train)
vis, knearestError = evaluate_model(model, shoes_test, samples_test,
                                    labels_test)
cv2.imwrite('out/KNearest_test_' + str(SZ) + '.jpg', vis)
# print('saving KNearest as "shoes_KNearest_' + str(SZ) + '.dat"...')
# model.save('out/shoes_KNearest_' + str(SZ) + '.dat')

print('training SVM...')
model = SVM(C=2.67, gamma=5.383)
model.train(samples_train, labels_train)
vis, svmError = evaluate_model(model, shoes_test, samples_test, labels_test)
cv2.imwrite('out/SVM_test_' + str(SZ) + '.jpg', vis)
print('saving SVM as "shoes_svm_' + str(SZ) + '.dat"...')
model.save('out/shoes_svm_' + str(SZ) + '.dat')

print('training RTrees...')
model = RTrees()
model.train(samples_train, labels_train)
vis, rtreesError = evaluate_model(model, shoes_test, samples_test,
                                  labels_test)
cv2.imwrite('out/rtrees_test_' + str(SZ) + '.jpg', vis)
print('saving RTrees as "shoes_rtrees_' + str(SZ) + '.dat"...')
model.save('out/shoes_rtrees_' + str(SZ) + '.dat')
    test_texts,
    savedir=_save_dir,
    words_src=tfidf_vectorizer,
    normalize_flag=False,
    reduction=_reduction,
    reduce_n_attr=_reduce_n_attr,
    stem_words=_stem_words)

model = None
print("Generating labels..")
if _model == "SVM":
    train_labels = preprocessing.samples_to_label(train_samples, _sections,
                                                  get_section)
    test_labels = preprocessing.samples_to_label(test_samples, _sections,
                                                 get_section)
    model = SVM()
    print("Training..")
    model.train(train_matrix, train_labels)
    predict = model.predict(test_matrix)
elif _model == "NN":
    train_dists = preprocessing.samples_to_dists(train_samples, _sections,
                                                 get_section)
    test_dists = preprocessing.samples_to_dists(test_samples, _sections,
                                                get_section)
    model = Neural_Network(_n_factors=train_matrix.shape[1],
                           _learning_rate=_learning_rate,
                           _hidden_nodes=_hidden_nodes,
                           _last_layer=len(_sections))
    print("Training..")
    model.train(train_matrix,
best_window_size = {i: 0 for i in range(len_files)}

# Main loop
for _, params in enumerate(settings):
    gamma, _lambda = params
    if kernel_name == "Gaussian":
        kernel = GaussianKernel(gamma)
    elif kernel_name == "Linear":
        kernel = LinearKernel()
    if model_name == "SVM":
        clf = SVM(_lambda=_lambda, kernel=kernel)
    elif model_name == "SPR":
        clf = SPR(kernel=kernel)

    # Loop over pre-computed embeddings
    # for filename in os.listdir(EMBEDDING_DIR)[:1]:  # small test
    for filename in os.listdir(EMBEDDING_DIR):
        # Full path
        file_path = os.path.join(EMBEDDING_DIR, filename)
        # Parsing
        dataset_idx, sigma, window_size = filename_parser(filename)
        # Cross validation
        results = cross_validation(dataset_idx=dataset_idx,
                                   clf=clf,
                                   data_dir=DATA_DIR,
                                   files_dict=FILES,
X = np.ones((X_train.shape[0], 32768, 4), dtype=np.float16)
Y = np.ones((Y_train.shape[0], 32768), dtype=np.float16)

# normalize data
# X_train = (X_train - 1) / 2.0
# Y_train = (Y_train - 1) / 2.0

# pad the training pairs out to the fixed length of 32768 samples
X[:, 0:X_train.shape[1]] = X_train
Y[:, 0:Y_train.shape[1]] = Y_train
Y = to_categorical(Y)
X = np.reshape(X, (X.shape[0], X.shape[1], 4))

# train
model = SVM((X.shape[1], 4))
# workaround for a Keras/TF issue where the TensorBoard callback expects
# _get_distribution_strategy on the model
model._get_distribution_strategy = lambda: None
json_string = model.to_json()
open('model.json', 'w').write(json_string)
checkpointer = ModelCheckpoint(path_best_weights, verbose=1,
                               monitor='val_loss', mode='auto',
                               save_best_only=True)
tbCallback = TensorBoard(log_dir='./logs', histogram_freq=0,
                         write_graph=True, write_images=True,
                         profile_batch=100000000)
model.fit(X, Y,