def __init__(self, algorithm):
    ''' init data preprocessor and classifier '''
    self.data_preprocessor = DataPreprocessor()
    algorithm = str(algorithm).lower()
    if algorithm == 'decisiontree':
        self.clf = DecisionTreeClassifier()
    elif algorithm == 'randomforest':
        self.clf = RandomForestClassifier()
def _preprocess_image(self, input_image):
    input_image = cv2.resize(input_image,
                             (self._input_shape[0], self._input_shape[1]))
    input_image = np.expand_dims(input_image, axis=0)
    preprocessor_prediction = DataPreprocessor(input_image)
    preprocessor_prediction.restore_preprocessing_parameters(
        file_name=self._parameter_file)
    input_image = preprocessor_prediction.get_reprocessed_data()
    return input_image
def generate_training_data():
    dp = DataPreprocessor()
    features_filenames = []
    for i in range(10):
        features_filenames.append(
            'data/raw/features/part-0000' + str(i) +
            '-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv')
    label_filename = 'data/raw/labels/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv'
    dp.prepare_training_data(features_filenames, label_filename)
def __init__(self, X, y):
    # remove zero covariance features and standardize
    self.data_preprocessor = DataPreprocessor(X)
    X = self.data_preprocessor.process_data(X)
    y = np.copy(y)
    self.number_features = X.shape[1]
    self.all_classes = np.unique(y)
    self.number_classes = self.all_classes.size
    self.prior, self.mean_array, self.std_array = \
        self.calculate_Gauss_parameters(X, y)
def __init__(self):
    self.embedding_size = 3
    self.epochs = 10
    self.hidden_state_size = 16
    self.data_sequence = DataPreprocessor(64, train=True)
    self.data_sequence.tokenizer.save_vocab()
    self.val_sequence = DataPreprocessor(64, train=False)
    self.history = None
    self.model_path: str = None
    self.model: KerasModel = None
def __init__(self, X, y):
    # remove zero covariance features and standardize
    self.data_preprocessor = DataPreprocessor(X)
    X = self.data_preprocessor.process_data(X)
    y = np.copy(y)
    self.number_features = X.shape[1]
    self.all_classes = np.unique(y)
    self.number_classes = self.all_classes.size
    self.W = self.calculate_weight_vector(X, y)
    self.prior, self.mean, self.covariance = \
        self.calculate_GaussGM_parameters(self.LDA_projection(X), y)
class NaiveBayes(Classifier):

    def __init__(self, X, y):
        # remove zero covariance features and standardize
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y)
        self.number_features = X.shape[1]
        self.all_classes = np.unique(y)
        self.number_classes = self.all_classes.size
        self.prior, self.mean_array, self.std_array = \
            self.calculate_Gauss_parameters(X, y)

    def calculate_Gauss_parameters(self, X, y):
        prior = [np.sum(y == y_val) / y.size for y_val in self.all_classes]
        mean_array = np.zeros((self.number_classes, self.number_features))
        std_array = np.zeros((self.number_classes, self.number_features))
        for k in range(self.number_classes):
            index = y == self.all_classes[k]
            X_sub = X[index, :]
            mean_array[k, :] = np.mean(X_sub, axis=0)
            std_array[k, :] = np.std(X_sub, axis=0, ddof=1)
        # floor the std to avoid near-zero scales in the Gaussian pdf
        std_array[std_array < 1e-03] = 1e-03
        return prior, mean_array, std_array

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        predicted_score = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_score)
        prediction_error = self.calculate_predict_error(
            predicted_class, y_test)
        return prediction_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array(
            [predicted_class[i] == y[i] for i in range(y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, predicted_score):
        max_indicator = np.argmax(predicted_score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        log_score = np.zeros((N, self.number_classes))
        for k in range(self.number_classes):
            for j in range(self.number_features):
                log_score[:, k] += norm.logpdf(X[:, j],
                                               loc=self.mean_array[k, j],
                                               scale=self.std_array[k, j])
        log_prior = [log(p) for p in self.prior]
        log_score += log_prior
        return log_score
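# A minimal usage sketch for NaiveBayes, illustrative only: it assumes the
# Classifier base class, DataPreprocessor, scipy.stats.norm and math.log are
# importable as in the class above, and the data below is synthetic.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(c, 1.0, size=(50, 4)) for c in (-2, 0, 2)])
y_demo = np.repeat([0, 1, 2], 50)
nb = NaiveBayes(X_demo, y_demo)
print("training error:", nb.validate(X_demo, y_demo))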
def _create_exploration_df(example_test, example_train, is_numeric=False):
    example_cols = ["column1"]
    test_df, train_df = _create_testing_dataframes(example_cols, example_test,
                                                   example_train)
    prep = DataPreprocessor(train_df=train_df, test_df=test_df)
    if is_numeric:
        factor_exploration = prep.explore_numeric_columns()
    else:
        factor_exploration = prep.explore_factors()
    assert len(factor_exploration) == 1
    return factor_exploration["column1"]
def __init__(self):
    # load config file
    with open("./config/predictionconfig.yml", "r") as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
    self.interval = cfg['interval']
    self.threshold = cfg['single_threshold']
    # init DataPreprocessor
    self.data_preprocessor = DataPreprocessor()
    # init PredictionMaker
    self.prediction_maker = PredictionMaker()
    self.registry = CollectorRegistry()
    self.pushgateway_url = os.getenv('PUSHGATEWAY_URL')
def __init__(self, X, y):
    # DataPreprocessor will copy X
    self.data_preprocessor = DataPreprocessor(X)
    X = self.data_preprocessor.process_data(X)
    y = np.copy(y)
    self.all_classes = np.unique(y)
    self.number_classes = self.all_classes.size
    self.number_observations, self.number_features = X.shape
    # row-wise concatenated weight vector
    W_init = np.random.normal(0, 0.001,
                              self.number_classes * self.number_features)
    self.W = self.IRLS(W_init, X, y)
def __init__(self):
    # definition of hyperparameters, data sources and other class variables
    self.embedding_dim = 3
    self.lstm_hidden_dim = self.embedding_dim
    self.max_decoder_length = 25
    self.epochs = 10
    self.data_sequence = DataPreprocessor(64, train=True, enc_dec=True,
                                          pad_to=self.max_decoder_length)
    self.data_sequence.tokenizer.save_vocab()
    self.val_sequence = DataPreprocessor(64, train=False, enc_dec=True,
                                         pad_to=self.max_decoder_length)
    self.history = None
    self.model_path: str = None
    self.model: KerasModel = None
class PredictionPipeline():

    def __init__(self):
        # load config file
        with open("./config/predictionconfig.yml", "r") as ymlfile:
            cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
        self.interval = cfg['interval']
        self.threshold = cfg['single_threshold']
        # init DataPreprocessor
        self.data_preprocessor = DataPreprocessor()
        # init PredictionMaker
        self.prediction_maker = PredictionMaker()
        self.registry = CollectorRegistry()
        self.pushgateway_url = os.getenv('PUSHGATEWAY_URL')
        # register the timing gauge once, instead of recreating it (and
        # swallowing the error) on every loop iteration
        self.speed_gauge = Gauge('prediction_making_speed',
                                 'Time in ms for making Prediction.',
                                 registry=self.registry)

    def run(self):
        while True:
            start_millis = int(round(time.time() * 1000))
            print("Starting pipeline...")
            # get data
            df = self.data_preprocessor.get_data()
            df = self.data_preprocessor.preprocess_data(df)
            if not df.empty:
                # predict
                result = self.prediction_maker.make_prediction(df)
                end_millis = int(round(time.time() * 1000))
                prediction_millis = end_millis - start_millis
                prediction = Prediction(result)
                # apply changes to K8s cluster
                prediction.apply(self.threshold)
                # push to Prometheus gateway
                prediction.push_to_prometheus(self.registry,
                                              self.pushgateway_url)
                self.speed_gauge.set(prediction_millis)
                push_to_gateway('{}:9091'.format(self.pushgateway_url),
                                job='prediction-maker',
                                registry=self.registry)
                print("Prediction took {} ms.".format(prediction_millis))
            # sleep until next interval
            print("Going back to sleep for {} sec...".format(self.interval))
            time.sleep(self.interval)
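# Hypothetical entry point for the pipeline above (assumes
# ./config/predictionconfig.yml exists and PUSHGATEWAY_URL is set; run()
# loops forever, so this is intended for a long-running container process).
if __name__ == '__main__':
    PredictionPipeline().run()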
def __init__(self, X, y, regulator):
    self.data_preprocessor = DataPreprocessor(X)
    X = self.data_preprocessor.process_data(X)
    y = np.copy(y).astype(int)
    self.all_classes = np.unique(y)
    assert self.all_classes.size == 2
    self.target_value = np.array([1, -1]).astype(int)
    y[y == self.all_classes[0]] = self.target_value[0]
    y[y == self.all_classes[1]] = self.target_value[1]
    self.number_features = X.shape[1]
    alpha = self.solve_dual_problem(X, y, regulator)
    zero_threshold = 1e-6
    self.number_support_vectors = np.sum(alpha > zero_threshold)
    self.margin = 1 / np.linalg.norm(alpha)
    self.svm_weight, self.svm_bias = SVMCVX.compute_svm_parameters(
        alpha, X, y, regulator)
def test_several_classification_models_fitting(preprocessor_train_data):
    df = preprocessor_train_data.train_df.sample(0.1)
    preprocessor = DataPreprocessor(train_df=df, test_df=df)
    preprocessor.prepare_to_model(target_col='income', to_strip=' .')
    models = ModelsContainer()
    models.fit(preprocessor.train_encoded_df, kind=ModelTypes.CLASSIFICATION)
    expected_results = [
        {
            "model": models.logistic_class.fitted_model,
            "metrics": {"areaUnderROC": 0.770414, "areaUnderPR": 0.646093},
        },
        {
            "model": models.random_forest_class.fitted_model,
            "metrics": {"areaUnderROC": 0.674751, "areaUnderPR": 0.664931},
        },
        {
            "model": models.gbt_class.fitted_model,
            "metrics": {"areaUnderROC": 0.811643, "areaUnderPR": 0.746147},
        },
        {
            "model": models.svm_class.fitted_model,
            "metrics": {"areaUnderROC": 0.750627, "areaUnderPR": 0.645328},
        },
        {
            "model": models.naive_bayes_class.fitted_model,
            "metrics": {"areaUnderROC": 0.615000, "areaUnderPR": 0.504709},
        },
    ]
    for result in expected_results:
        _check_evaluation(preprocessor=preprocessor,
                          model=result["model"],
                          metrics=result["metrics"])
def load_data_from_folder(self):
    filenames, class_ids, classes = self.__load_audio_filenames_with_class__()
    dataset_size = len(filenames)
    X_train, y_train = [], []
    X_test, y_test = [], []
    X_validation, y_validation = [], []
    pool = Pool(cpu_count() - 1)
    preprocessor = DataPreprocessor(dataset_path=self.dataset_path)
    for (results, filepath, class_id, random_roll) in tqdm(
            pool.imap_unordered(preprocessor.process_file,
                                zip_longest(filenames, class_ids)),
            total=dataset_size):
        filepath = normpath(filepath)
        # roll 1-10 -> test split, 11-20 -> validation split, rest -> training
        is_testing = 1 <= random_roll <= 10
        is_validation = 11 <= random_roll <= 20
        for item in results:
            if is_testing:
                X_test.append(item)
                y_test.append(class_id)
            elif is_validation:
                X_validation.append(item)
                y_validation.append(class_id)
            else:
                X_train.append(item)
                y_train.append(class_id)
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    X_validation = np.array(X_validation)
    y_validation = np.array(y_validation)
    return X_train, y_train, X_test, y_test, X_validation, y_validation, classes
def test_preprocess_empty_data(self):
    """ test preprocess_data with empty df """
    df = pd.DataFrame(
        columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
    processed_df = DataPreprocessor().preprocess_data(df)
    # the original asserted the result against itself, which is vacuous;
    # compare against the untouched input instead (assumes preprocess_data
    # passes an empty frame through unchanged)
    assert_frame_equal(processed_df, df)
def __init__(self, X, y, sgd_batch_size):
    self.data_preprocessor = DataPreprocessor(X)
    X = self.data_preprocessor.process_data(X)
    y = np.copy(y).astype(int)
    self.all_classes = np.unique(y)
    assert self.all_classes.size == 2
    self.target_value = np.array([1, -1]).astype(int)
    y[y == self.all_classes[0]] = self.target_value[0]
    y[y == self.all_classes[1]] = self.target_value[1]
    self.number_features = X.shape[1]
    # prepare for optimization
    self.loss_record = []
    penalty_lambda = 1
    # initialize w on the Pegasos feasible ball ||w|| = 1 / sqrt(lambda)
    w_init = np.zeros(self.number_features)
    w_init.fill(np.sqrt(1 / (self.number_features * penalty_lambda)))
    self.svm_weight = self.pegas(X, y, penalty_lambda, w_init, sgd_batch_size)
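# For reference, a self-contained sketch of one Pegasos mini-batch step as
# described by Shalev-Shwartz et al.; the repo's pegas() is not shown here,
# so this is an assumption about the update it performs, not its actual code.
import numpy as np

def pegasos_step(w, X_batch, y_batch, penalty_lambda, t):
    eta = 1.0 / (penalty_lambda * t)      # step-size schedule, t >= 1
    margins = y_batch * X_batch.dot(w)    # y_i <w, x_i> over the batch
    violators = margins < 1               # points inside the margin
    grad = penalty_lambda * w - X_batch[violators].T.dot(
        y_batch[violators]) / len(y_batch)
    w = w - eta * grad
    # project back onto the ball of radius 1 / sqrt(lambda)
    norm_w = np.linalg.norm(w)
    if norm_w > 0:
        w *= min(1.0, 1.0 / (np.sqrt(penalty_lambda) * norm_w))
    return w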
def test_preprocess_data(self):
    """ test preprocess_data with normal df """
    df = pd.DataFrame(
        [['1234', '1234', 'front-end,carts,front-end', '1234']],
        columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
    df = DataPreprocessor().preprocess_data(df)
    df_test = pd.DataFrame(
        [['1234', '1234', 'front-end,carts', '1234', 1]],
        columns=[
            'traceid', 'sessionid', 'servicessequence', 'starttime',
            'currentclusternumber'
        ])
    assert_frame_equal(df.sort_index(axis=1),
                       df_test.sort_index(axis=1),
                       check_dtype=False,
                       check_index_type=False)
def __init__(self, X, y, sgd_batch_size):
    self.softplus_a = 0.1
    self.data_preprocessor = DataPreprocessor(X)
    X = self.data_preprocessor.process_data(X)
    y = np.copy(y).astype(int)
    self.all_classes = np.unique(y)
    assert self.all_classes.size == 2
    self.target_value = np.array([1, -1]).astype(int)
    y[y == self.all_classes[0]] = self.target_value[0]
    y[y == self.all_classes[1]] = self.target_value[1]
    n, self.number_features = X.shape
    assert sgd_batch_size <= n
    # prepare for optimization
    self.loss_record = []
    penalty_lambda = 1
    w_init = np.random.normal(0, 0.001, self.number_features)
    self.svm_weight = self.optimization(X, y, penalty_lambda, w_init,
                                        sgd_batch_size)
def main(_):
    """ MAIN FUNCTION - define loops for experiments """
    # Preprocess data: convert to pkl data
    if FLAGS.preprocess:
        for raw_data_fname in RAW_DATA_FNAME_LIST:
            data_preprocessor = DataPreprocessor(to_dir=DATA_DIR)
            data_preprocessor.process_and_save(raw_data_fname)

    # Used for loading data and building graph
    n_hidden_node_list = [100]
    n_hidden_layer_list = [1]

    # PARAM GRIDS
    param_grid_targets = [n_hidden_node_list,   # path only
                          n_hidden_layer_list,  # for final dense layers
                          ]
    param_product = product(*param_grid_targets)
    print(param_grid_targets)
    param_product_size = np.prod([len(t) for t in param_grid_targets])

    for i, params in enumerate(param_product):
        n_hidden_node, n_hidden_layer = params
        FLAGS.num_units = n_hidden_node
        FLAGS.n_hidden_node = n_hidden_node
        FLAGS.n_hidden_layer = n_hidden_layer

        # Model id
        id_components = [
            '{model}_{edim}x{layer}_last'.format(
                model=('B' if FLAGS.bi_direction
                       else FLAGS.model_type[0].upper()),
                edim=n_hidden_node,
                layer=n_hidden_layer),
            # some details
        ]
        model_id = '__'.join(id_components)

        log.infov('=' * 30 + '{} / {} ({:.1f}%)'.format(
            i + 1, param_product_size,
            (i + 1) / param_product_size * 100) + '=' * 30)
        log.infov('model_id: ' + model_id)

        train_eval_save(car_id_list, FLAGS.dest_type, model_id,
                        n_save_viz=FLAGS.n_save_viz)
def live(params):
    sr = params['sampling_rate']
    audio_length = params['audio_length']
    blocksize = params['blocksize']
    sd.default.samplerate = sr
    sd.default.channels = params['channels']
    sd.default.blocksize = blocksize
    sd.default.latency = params['latency']

    weights_path = PROJECT_PATH / params['weights_file']
    classes = list(params['all_classes'])
    num_classes = len(classes)
    # model = get_tc_resnet_14((321, 40), num_classes, 1.5)
    model = get_tc_resnet_8((321, 40), num_classes, 1.5)
    model.load_weights(weights_path)
    model.summary()

    recent_signal = []
    recording_id = 0
    try:
        while True:
            input("Press Enter to start recording:")
            stream = sd.InputStream()
            stream.start()
            print("Say the word:")
            while True:
                data, overflowed = stream.read(blocksize)
                data = data.flatten()
                recent_signal.extend(data.tolist())
                if len(recent_signal) >= sr * audio_length:
                    recent_signal = recent_signal[:sr * audio_length]
                    break
            stream.close()

            rec_path = PROJECT_PATH / f'recording_{recording_id}.wav'
            sf.write(rec_path, np.array(recent_signal), sr)
            recording_id += 1

            print("Recording finished! Result is:")
            mfcc = DataPreprocessor.get_mfcc(np.asarray(recent_signal), sr)
            y_pred = model.predict(np.array([mfcc]))[0]
            result_id = int(np.argmax(y_pred))
            result_prob = y_pred[result_id]
            print("result id: " + str(result_id) + " " + classes[result_id] +
                  " " + str(result_prob))
            recent_signal = []
    except KeyboardInterrupt:
        print('Record finished!')
class TestDataPreprocessor(unittest.TestCase):
    """ unittests for DataPreprocessor """

    def setUp(self):
        """ init DataPreprocessor """
        self.data_preprocessor = DataPreprocessor()

    def test_preprocess_empty_data(self):
        """ test preprocess_data with empty df """
        df = pd.DataFrame(
            columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
        df = self.data_preprocessor.preprocess_data(df)
        full_df = pd.DataFrame(
            columns=['sessionid', 'nextcluster', 'starttime'])
        assert_frame_equal(df, full_df)

    def test_preprocess_data(self):
        """ test preprocess_data with normal df """
        df = pd.DataFrame(
            [['1234', '1234', 'service-1,service-2,service-1', '1234']],
            columns=['traceid', 'sessionid', 'servicessequence', 'starttime'])
        df = self.data_preprocessor.preprocess_data(df)
        full_df = pd.DataFrame(
            [[0.0, '1234', '1234', 'service-1,service-2', '1234', 0, 0, 0]],
            columns=[
                'index', 'traceid', 'sessionid', 'servicessequence',
                'starttime', 'currentclusternumber', 'clustersequence',
                'nextcluster'
            ])
        assert_frame_equal(df.sort_index(axis=1),
                           full_df.sort_index(axis=1),
                           check_dtype=False,
                           check_index_type=False)
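# Standard unittest entry point so the test class above can be run directly
# as a script.
if __name__ == '__main__':
    unittest.main()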
def run(self):
    """Performs various stages in predictive modeling"""
    # Path to data set.
    path = "../../neeraj/resource/pima-indians-diabetes.data"
    # Column names of data set.
    column_names = ['preg', 'plas', 'pres', 'skin', 'test',
                    'mass', 'pedi', 'age', 'class']

    # Loading data set using class DatasetLoader.
    load_data = DatasetLoader(path, column_names)
    data = load_data.load()
    load_data.print_shape(data)

    # Understanding data using class DataExplorer.
    explore_data = DataExplorer()
    explore_data.print_data_statistics(data)
    explore_data.visualize(data)

    # Performing data preprocessing.
    process_data = DataPreprocessor()
    input_set, output_set = process_data.split_dataset(data, 0, 8, 8)
    process_data.display_dataset()
    process_data.summarize(input_set, 0, 5, 3)

    # Model evaluation using class Evaluator.
    evaluator = Evaluator()
    evaluator.validate(LogisticRegression(), input_set, output_set, 10, 7)
    evaluator.evaluate(LogisticRegression(), input_set, output_set, 10, 7,
                       'log_loss')

    # Selecting best model using class ModelSelector.
    model = ModelSelector()
    # A set of models for selection.
    models = []
    models.append(('LR', LogisticRegression()))
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('RF', RandomForestClassifier(n_estimators=100,
                                                max_features=3)))
    selected_model = model.select_model(models, input_set, output_set, 10, 7)
    print("\nSelected Model:\n %s" % selected_model)

    # Improving accuracy using class AccuracyImprover.
    improve_accuracy = AccuracyImprover()
    improve_accuracy.tuning(Ridge(), input_set, output_set)
    improve_accuracy.ensemble_prediction(
        RandomForestClassifier(n_estimators=100, max_features=3),
        input_set, output_set, 10, 7)

    # Finalizing the model and performing prediction.
    finalize_model = ModelFinalizer()
    input_train, input_test, output_train, output_test = \
        finalize_model.split_train_test_sets(input_set, output_set, 0.33, 7)
    finalize_model.finalize_and_save(LogisticRegression(),
                                     "../../neeraj/resource/pima_model.sav",
                                     input_train, output_train)
    finalize_model.predict("../../neeraj/resource/pima_model.sav",
                           input_test, output_test)
def __init__(self, args):
    """ initial function of appPreprocessing
    :param args: parameters from CLI
    """
    self.args = args
    self.time_recoder = datetime.datetime.now()
    self.method_thread = None
    # self.interface = ClUI()
    self.interface = GraphUI("OmniPhotos Preprocessing")
    # register the callback function
    self.interface.before_exit = self.callback_exit
    AbsPreprocessor.abs_ui = self.interface  # register UI
    self.data_preproc = DataPreprocessor(args)
    self.traj_preproc = TrajPreprocessor(args)
    self.op_preproc = OpPreprocessor(args)
    self.of_preproc = OfPreprocessor(args)
class LDA2dGaussGM(Classifier):

    def __init__(self, X, y):
        # remove zero covariance features and standardize
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y)
        self.number_features = X.shape[1]
        self.all_classes = np.unique(y)
        self.number_classes = self.all_classes.size
        self.W = self.calculate_weight_vector(X, y)
        self.prior, self.mean, self.covariance = \
            self.calculate_GaussGM_parameters(self.LDA_projection(X), y)

    def calculate_weight_vector(self, X, y):
        # hard coded for 2d projection
        k = 2
        X_kclass = {}
        for one_class in self.all_classes:
            X_kclass[one_class] = X[y == one_class]
        mean_all = np.mean(X, axis=0)
        S_T = np.matmul(np.transpose(X - mean_all), X - mean_all)
        S_W = np.zeros((self.number_features, self.number_features))
        for one_class in self.all_classes:
            mean_each = np.mean(X_kclass[one_class], axis=0)
            S_W += np.matmul(np.transpose(X_kclass[one_class] - mean_each),
                             X_kclass[one_class] - mean_each)
        S_B = S_T - S_W
        temp_mat = mat(np.linalg.inv(S_W)) * mat(S_B)
        _, eig_vecs = eigs(temp_mat, k=k)
        return eig_vecs.real

    def LDA_projection(self, X):
        assert X.shape[1] == self.W.shape[0]
        return X.dot(self.W)

    def calculate_GaussGM_parameters(self, X, y):
        number_features = X.shape[1]
        priors = [np.sum(y == one_class) / y.size
                  for one_class in self.all_classes]
        means = np.zeros((self.number_classes, number_features))
        covariances = np.zeros((self.number_classes, number_features,
                                number_features))
        for k in range(self.number_classes):
            index = y == self.all_classes[k]
            X_classk = X[index, :]
            means[k, :] = np.mean(X_classk, axis=0)
            covariances[k, :, :] = np.cov(X_classk, rowvar=False, bias=True)
        return priors, means, covariances

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        X_test = self.LDA_projection(X_test)
        predicted_scores = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_scores)
        test_error = self.calculate_predict_error(predicted_class, y_test)
        return test_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array(
            [predicted_class[i] == y[i] for i in range(y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, score):
        max_indicator = np.argmax(score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        log_score = np.zeros((N, self.number_classes))
        for k in range(self.number_classes):
            mean_k = self.mean[k, :]
            cov_k = self.covariance[k, :, :]
            log_score[:, k] = multivariate_normal.logpdf(X, mean_k, cov_k)
        log_prior = [log(p) for p in self.prior]
        log_score += log_prior
        return log_score
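# Minimal usage sketch for LDA2dGaussGM, illustrative only: it assumes the
# imports used by the class above (numpy.mat, scipy.sparse.linalg.eigs,
# scipy.stats.multivariate_normal, math.log) and synthetic data. With a 2-d
# projection, at least three reasonably separated classes are needed for the
# between-class scatter to have rank >= 2.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(c, 1.0, size=(60, 6)) for c in (-3, 0, 3)])
y_demo = np.repeat([0, 1, 2], 60)
lda = LDA2dGaussGM(X_demo, y_demo)
print("training error:", lda.validate(X_demo, y_demo))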
modified_model = ModifiedReferenceCaffeNet(class_size)
# copy W/b from the original model to the new one
copy_model(original_model, modified_model)
if args.gpu >= 0:
    chainer.cuda.get_device(args.gpu).use()  # make the GPU current
    modified_model.to_gpu()

print("# _/_/_/ load dataset _/_/_/")

in_size = ModifiedReferenceCaffeNet.IN_SIZE
mean = np.load(mean_image_path)
train = DataPreprocessor(training_data_path, root_dir_path, mean, in_size,
                         random=True, is_scaled=True)
test = DataPreprocessor(testing_data_path, root_dir_path, mean, in_size,
                        random=False, is_scaled=True)
train_iter = chainer.iterators.MultiprocessIterator(
    train, args.batch_size, n_processes=args.loader_job)
test_iter = chainer.iterators.MultiprocessIterator(
    test, args.test_batch_size, repeat=False,
class LogisticRegression(Classifier):

    def __init__(self, X, y):
        # DataPreprocessor will copy X
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y)
        self.all_classes = np.unique(y)
        self.number_classes = self.all_classes.size
        self.number_observations, self.number_features = X.shape
        # row-wise concatenated weight vector
        W_init = np.random.normal(0, 0.001,
                                  self.number_classes * self.number_features)
        self.W = self.IRLS(W_init, X, y)

    def IRLS(self, W, X, y):
        # construct Y and T to compute gradients and hessian
        T = np.zeros((self.number_observations, self.number_classes))
        Y = np.zeros((self.number_observations, self.number_classes))
        # through iterations
        number_iterations = 30
        loss_record = np.zeros(number_iterations)
        for iteration in range(number_iterations):
            W_mat = self.W_vector2matrix(W)
            for i in range(self.number_observations):
                T[i, y[i]] = 1
                Y[i, :] = LogisticRegression.softmax(W_mat, X[i, :])
            loss_record[iteration] = \
                LogisticRegression.cross_entropy_loss(Y, T)
            grad_W = self.compute_gradient(X, Y, T)
            hess_W = self.compute_hessian(X, Y)
            W += -0.01 * np.matmul(np.linalg.inv(hess_W), grad_W)
            # W += -0.01 * grad_W
        return self.W_vector2matrix(W)

    def compute_gradient(self, X, Y, T):
        grad_mat = np.zeros((self.number_classes, self.number_features))
        for i in range(self.number_classes):
            grad_mat[i, :] = (Y[:, i] - T[:, i]).dot(X)
        return grad_mat.reshape(self.number_classes * self.number_features)

    @staticmethod
    def cross_entropy_loss(Y, T):
        loss = 0
        N, K = Y.shape
        for n in range(N):
            for k in range(K):
                loss += -T[n, k] * log(Y[n, k])
        return loss

    def compute_hessian(self, X, Y):
        hess_mat = np.zeros((self.number_classes * self.number_features,
                             self.number_classes * self.number_features))
        for j in range(self.number_classes):
            for k in range(self.number_classes):
                i_kj = 1 if (k == j) else 0
                dot_vec = Y[:, k] * (i_kj - Y[:, j])
                block_kj = np.matmul(np.matmul(X.T, np.diag(dot_vec)), X)
                hess_mat[j * self.number_features:
                         (j + 1) * self.number_features,
                         k * self.number_features:
                         (k + 1) * self.number_features] = block_kj
        # hessian may not be PSD due to numerical issue
        hess_mat = hess_mat + 0.1 * np.identity(
            self.number_classes * self.number_features)
        return hess_mat

    def W_vector2matrix(self, W_vec):
        assert W_vec.size == self.number_classes * self.number_features
        return W_vec.reshape((self.number_classes, self.number_features))

    @staticmethod
    def softmax(W, x):
        e = np.exp(W.dot(x))
        dist = e / np.sum(e)
        return dist

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        predicted_score = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_score)
        test_error = self.calculate_predict_error(predicted_class, y_test)
        return test_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array(
            [predicted_class[i] == y[i] for i in range(y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, score):
        max_indicator = np.argmax(score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        softmax_score = np.zeros((N, self.number_classes))
        for i in range(N):
            softmax_score[i, :] = LogisticRegression.softmax(self.W, X[i, :])
        return softmax_score
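# Minimal usage sketch for the LogisticRegression class above (illustrative
# only). Note that IRLS indexes T with the raw labels, so y must already be
# integers in the range 0..number_classes-1.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(c, 1.0, size=(40, 3)) for c in (-2, 0, 2)])
y_demo = np.repeat([0, 1, 2], 40)
lr = LogisticRegression(X_demo, y_demo)
print("training error:", lr.validate(X_demo, y_demo))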
# Configurations
timesteps = 10
hidden_neurons = 50
epochs = 300
batchsize = 10

# Load data
nikkei_data_org, nasdaq_data_org, currency_data_org = data_loader.load_dataset()

# Data preprocessing
dropping_features_for_nikkei = ['Open Price', 'High Price', 'Low Price']
dropping_features_for_nasdaq = ['High', 'Low', 'Total Market Value',
                                'Dividend Market Value']
dropping_features_for_currency = ['High (est)', 'Low (est)']
nikkei_data = DataPreprocessor(nikkei_data_org).preprocess_data(
    dropping_features_for_nikkei)
nasdaq_data = DataPreprocessor(nasdaq_data_org).preprocess_data(
    dropping_features_for_nasdaq)
currency_data = DataPreprocessor(currency_data_org).preprocess_data(
    dropping_features_for_currency)
merged_data = DataPreprocessor.merge(nikkei_data, nasdaq_data, currency_data)
data = merged_data.dropna()
data.to_csv("data/data.csv")

# Split the data
data_train, data_val, data_test = DataSplitter.split_to_train_val_test(data)
x_train, y_train = DataSplitter.split_to_x_and_y(data_train,
                                                 timesteps=timesteps)
x_val, y_val = DataSplitter.split_to_x_and_y(data_val, timesteps=timesteps)
x_test, y_test = DataSplitter.split_to_x_and_y(data_test, timesteps=timesteps)
print("Train dataset has {} samples.".format(*x_train.shape))
# print(x_train[:3])
import calendar
import os
import time

import pandas as pd

files = []
for r, d, f in os.walk('data/raw/features/'):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))

features = []
for f in files:
    df = pd.read_csv(f)
    features.append(df)
features = pd.concat(features)
features = features.sort_values(by=['bookingID', 'second'])

dp = DataPreprocessor()
features = dp.feature_engineering(features)
# features = pd.read_csv('data/processed/features_1560750238.csv')

files = []
for r, d, f in os.walk('data/raw/labels/'):
    for file in f:
        if '.csv' in file:
            files.append(os.path.join(r, file))

labels = []
for f in files:
    df = pd.read_csv(f)
    labels.append(df)

true_values_exist = True
if len(labels) == 0:
    true_values_exist = False
class SVMCVX(Classifier):

    def __init__(self, X, y, regulator):
        self.data_preprocessor = DataPreprocessor(X)
        X = self.data_preprocessor.process_data(X)
        y = np.copy(y).astype(int)
        self.all_classes = np.unique(y)
        assert self.all_classes.size == 2
        self.target_value = np.array([1, -1]).astype(int)
        y[y == self.all_classes[0]] = self.target_value[0]
        y[y == self.all_classes[1]] = self.target_value[1]
        self.number_features = X.shape[1]
        alpha = self.solve_dual_problem(X, y, regulator)
        zero_threshold = 1e-6
        self.number_support_vectors = np.sum(alpha > zero_threshold)
        self.margin = 1 / np.linalg.norm(alpha)
        self.svm_weight, self.svm_bias = SVMCVX.compute_svm_parameters(
            alpha, X, y, regulator)

    def solve_dual_problem(self, X, y, c):
        # QP problem
        #   min 0.5 * xTPx + qTx
        #   st  Gx <= h, Ax = b
        number_observations = X.shape[0]
        yX = np.reshape(y, (number_observations, 1)) * X
        P = matrix(yX.dot(yX.T))
        q = matrix(-np.ones(number_observations))
        A = matrix(np.reshape(y.astype(float), (1, number_observations)))
        b = matrix([0.0])
        I = np.identity(number_observations)
        G = matrix(np.concatenate((I, -I), axis=0))
        vector_c = c * np.ones(number_observations)
        vector_0 = np.zeros(number_observations)
        h = matrix(np.concatenate((vector_c, vector_0)))
        solution = qp(P, q, G, h, A, b)
        alpha = np.array(solution['x'])
        return alpha.reshape((-1,))

    @staticmethod
    def compute_svm_parameters(alpha, X, y, c):
        w = (alpha * y).dot(X)
        b = 0
        count = 0
        for i in range(alpha.shape[0]):
            if 0 < alpha[i] < c:
                count += 1
                b += y[i] - w.dot(X[i, :])
        assert count > 0
        b /= count
        return w, b

    def predict(self, X_new):
        X = self.data_preprocessor.process_data(X_new)
        X = np.reshape(X, (-1, self.number_features))
        predicted_score = self.predict_score(X)
        predicted_class = self.predict_class(predicted_score)
        return predicted_class

    def validate(self, X_test, y_test):
        X_test = self.data_preprocessor.process_data(X_test)
        assert X_test.shape[1] == self.number_features
        predicted_score = self.predict_score(X_test)
        predicted_class = self.predict_class(predicted_score)
        test_error = self.calculate_predict_error(predicted_class, y_test)
        return test_error

    def calculate_predict_error(self, predicted_class, y):
        predicted_indicator = np.array(
            [predicted_class[i] == y[i] for i in range(y.size)])
        return 1 - np.sum(predicted_indicator) / y.size

    def predict_class(self, score):
        max_indicator = np.argmax(score, axis=1)
        return np.array([self.all_classes[i] for i in max_indicator])

    def predict_score(self, X):
        N = X.shape[0]
        svm_score = np.zeros((N, 2))
        svm_score[:, 0] = X.dot(self.svm_weight) + self.svm_bias
        svm_score[:, 1] = -svm_score[:, 0]
        return svm_score
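# Minimal usage sketch for SVMCVX, illustrative only: it assumes cvxopt is
# installed with matrix/qp imported as used above, plus the repo's
# DataPreprocessor and Classifier helpers; the data below is synthetic.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(-1.5, 1.0, size=(40, 4)),
                    rng.normal(1.5, 1.0, size=(40, 4))])
y_demo = np.array([0] * 40 + [1] * 40)
svm = SVMCVX(X_demo, y_demo, regulator=1.0)
print("support vectors:", svm.number_support_vectors)
print("training error:", svm.validate(X_demo, y_demo))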