def test_linear_regression(loga=False, norm=False):
    """Run linear regression on one train/test split and record the price differences."""
    train_split = TRAIN_SPLIT
    feature_set = FEATURE_SET
    logarithm = loga
    normalize = norm
    name = 'name'
    parameter_dict = {
        'name': name,
        'feature_set': feature_set,
        'logarithm': logarithm,
        'normalize': normalize,
        'train_split': train_split,
    }
    x_tr, y_tr, x_te, y_te, L2_matrix, saved_data, scaler = import_data(
        logarithm, normalize, FILE_PATH, TRAIN_SPLIT, FEATURE_SET,
    )
    prediction = linear_regression(x_tr, y_tr, x_te, L2_matrix, parameter_dict)
    write_price_differences(prediction, x_te, y_te, L2_matrix, saved_data,
                            parameter_dict)
def train_n_test_model(times, parameters):
    """Import the data once, then run `times` train/evaluate rounds,
    recording the price differences after each round."""
    x_tr, y_tr, x_te, y_te, L2_matrix, saved_data, scaler = import_data(
        parameters['logarithm'], parameters['normalize'], FILE_PATH,
        parameters['train_split'], parameters['feature_set'])
    for i in range(times):
        prediction = train_and_evaluate_models(x_tr, y_tr, x_te, L2_matrix,
                                               parameters)
        write_price_differences(prediction, x_te, y_te, L2_matrix, saved_data,
                                parameters)
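# Hedged usage sketch for train_n_test_model: it expects the same style of
# parameter dict the test_* helpers above build. The concrete values below
# are illustrative assumptions, not values taken from the project config.
def example_train_n_test_run():
    example_parameters = {
        'name': 'example_run',
        'feature_set': FEATURE_SET,
        'logarithm': True,
        'normalize': False,
        'train_split': TRAIN_SPLIT,
    }
    # Three train/evaluate rounds on the same imported split.
    train_n_test_model(3, example_parameters)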
def calculate_distance(lat, long):
    """Print the name of the point in the country list closest to the
    given coordinates, by haversine distance."""
    country_list = preprocessing.import_data()
    nearest_distance = sys.maxsize  # sys.maxint was removed in Python 3
    nearest_point = ""
    for point in country_list:
        point_distance = distance.haversine(float(long), float(lat),
                                            float(point[2]), float(point[1]))
        if nearest_distance > point_distance:
            nearest_distance = point_distance
            nearest_point = point[0]
    print(nearest_point)
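# For reference, a typical haversine implementation matching the
# (lon1, lat1, lon2, lat2) call order used above. This is a sketch of what
# the local `distance` module likely provides, not its actual source.
from math import radians, sin, cos, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in kilometres between two (lon, lat) points."""
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))  # Earth radius ~6371 km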
def test_deep_learning_GS(epochs, lr, constant, dropout):
    """Grid-search the deep-learning model over the given hyperparameters
    (the lists below hold a single value each, so each call tests one
    configuration)."""
    # region data import
    train_split = TRAIN_SPLIT
    feature_set = FEATURE_SET
    logarithm = True
    normalize = False
    name = 'with_logarithm'
    parameter_dict = {
        'name': name,
        'feature_set': feature_set,
        'logarithm': logarithm,
        'normalize': normalize,
        'train_split': train_split,
    }
    x_tr, y_tr, x_te, y_te, L2_matrix, saved_data, scaler = import_data(
        logarithm, normalize, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)
    # endregion

    # region parameters
    learning_rate_list = [lr]
    epochs_list = [epochs]
    batch_size_list = [16]
    hidden_layers_list = [4]
    number_neurons_list = [256]
    batchnorm_list = [False]
    dropout_list = [dropout]
    init_mode_list = ['he_normal']
    input_dim = [x_tr.shape[1]]
    optimizer_list = ['adam']
    dropout_rate = [0.1]
    constant = [constant]
    set_of_features = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    parameters = dict(batch_size=batch_size_list,
                      epochs=epochs_list,
                      hidden_layers=hidden_layers_list,
                      neurons=number_neurons_list,
                      learn_rate=learning_rate_list,
                      batchnormalize=batchnorm_list,
                      init_mode=init_mode_list,
                      input_dimension=input_dim,
                      optimizer=optimizer_list,
                      dropout=dropout_list,
                      dropout_rate=dropout_rate,
                      constant=constant)
    # endregion
    do_grid_search(x_tr, y_tr, parameters, create_model, parameter_dict)
def test_svr():
    """Run a support-vector regression on one split and record the price
    differences."""
    train_split = TRAIN_SPLIT
    feature_set = FEATURE_SET
    logarithm = True
    normalize = True
    name = 'name'
    parameter_dict = {
        'name': name,
        'feature_set': feature_set,
        'logarithm': logarithm,
        'normalize': normalize,
        'train_split': train_split,
    }
    x_tr, y_tr, x_te, y_te, L2_matrix, saved_data, scaler = import_data(
        logarithm, normalize, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)
    prediction = support_vector_machine(x_tr, y_tr, x_te, L2_matrix,
                                        1e-06, 300, 0.01, parameter_dict)
    write_price_differences(prediction, x_te, y_te, L2_matrix, saved_data,
                            parameter_dict)
def test_random_forest():
    """Run the random-forest model three times on the same split and record
    the price differences of each run."""
    train_split = TRAIN_SPLIT
    feature_set = FEATURE_SET
    logarithm = True
    normalize = False
    name = 'logarithm'
    parameter_dict = {
        'name': name,
        'feature_set': feature_set,
        'logarithm': logarithm,
        'normalize': normalize,
        'train_split': train_split,
    }
    x_tr, y_tr, x_te, y_te, L2_matrix, saved_data, scaler = import_data(
        logarithm, normalize, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)
    for i in range(3):
        prediction = random_forrest(x_tr, y_tr, x_te, L2_matrix,
                                    parameter_dict)
        write_price_differences(prediction, x_te, y_te, L2_matrix, saved_data,
                                parameter_dict)
def test_svr_grid():
    """Run a manual SVR grid search over the given epsilon/gamma/C values."""
    train_split = TRAIN_SPLIT
    feature_set = FEATURE_SET
    logarithm = False
    normalize = False
    name = 'name'
    parameter_dict = {
        'name': name,
        'feature_set': feature_set,
        'logarithm': logarithm,
        'normalize': normalize,
        'train_split': train_split,
    }
    x_tr, y_tr, x_te, y_te, L2_matrix, saved_data, scaler = import_data(
        logarithm, normalize, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)
    epsilon_list = [1e-5]
    gamma_list = [200]
    c_list = [0.01]
    manual_gridsearch_svr(x_tr, y_tr, x_te, y_te, L2_matrix,
                          epsilon_list, gamma_list, c_list, parameter_dict)
# -*- coding: utf-8 -*-
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import pydot
import numpy as np
import preprocessing
from math import sqrt

# 1 = bots, 0 = legit

# training data
df = preprocessing.import_data()
train_bots = df[1].values[:12000, 3:].astype(int)
train_legit = df[2].values[:12000, 3:].astype(int)
feature_list = df[3][3:]
X_train = np.vstack((train_bots, train_legit))

# testing data
test_bots = df[1].values[15000:16000, 3:].astype(int)
test_legit = df[2].values[15000:16000, 3:].astype(int)
X_test = np.vstack((test_bots, test_legit))

# training labels
train_bots_label = np.ones((train_bots.shape[0], 1))
train_legit_label = np.zeros((train_legit.shape[0], 1))
Y_train = np.vstack((train_bots_label, train_legit_label))
Y_train = Y_train.ravel(order='C')
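# A minimal sketch of how this data could feed the classifier. The test
# labels and the hyperparameters below are assumptions for illustration,
# not part of the original script.
test_bots_label = np.ones((test_bots.shape[0], 1))
test_legit_label = np.zeros((test_legit.shape[0], 1))
Y_test = np.vstack((test_bots_label, test_legit_label)).ravel(order='C')

clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, Y_train)
print('test accuracy:', clf.score(X_test, Y_test))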
import pandas as pd
import evaluation as evaal
import preprocessing as prp
import functions as func
from datetime import datetime
from tensorflow import keras
# print(keras.__version__)
from keras.preprocessing.sequence import pad_sequences

"""### Preprocessing"""

#@title Please choose the algorithm that you want to use: { form-width: "250px", display-mode: "both" }
algorithm = "SVM"  #@param ["SVM", "BiLSTM"]

# ========== Importing the data
X, y = prp.import_data(algorithm)

# =========================== Preprocessing ===============================

# ========== Ordinal encoding
amino_codes = [
    '0', 'A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L',
    'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y'
]
non_amino_letters = ['B', 'J', 'O', 'U', 'X', 'Z']
amino_mapping = prp.create_mapping(amino_codes)
X['mapped_seq'] = prp.integer_encoding(X['seq'], amino_mapping)
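# pad_sequences is imported above but not yet used at this point; a minimal
# sketch of the padding step that typically follows integer encoding for the
# BiLSTM branch. max_seq_length is a hypothetical cut-off, not a value taken
# from the project.
max_seq_length = 100  # assumption: the real cut-off is defined elsewhere
X_padded = pad_sequences(X['mapped_seq'], maxlen=max_seq_length,
                         padding='post', truncating='post')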