def prepare_train_test_data(self, data_feature, LabelColumnName):
    """Pool per-ticker train/test splits into four combined arrays.

    For every entry of ``data_feature`` the first element of its value
    (``data[0]``) is passed through ``preprocessing_data`` (one-hot labels
    enabled) and ``reshape_input``, then split with
    ``train_test_split(..., test_size=0.2)``. The per-ticker pieces are joined
    along axis 0.

    Args:
        data_feature: Mapping of ticker -> indexable container; only element
            ``0`` of each value is read here.
        LabelColumnName: Forwarded unchanged to ``preprocessing_data``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Note that this order
        differs from the order ``train_test_split`` returns its pieces in.

    Raises:
        ValueError: If ``data_feature`` is empty, so there is nothing to pool.
            (Previously this surfaced as an ``UnboundLocalError`` on return.)
    """
    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for _ticker, data in data_feature.items():
        X, y = preprocessing_data(self.paras, data[0], LabelColumnName,
                                  one_hot_label_proc=True)
        X, y = reshape_input(self.paras.n_features, X, y)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
            X, y, test_size=0.2)

        X_train_parts.append(X_train_temp)
        X_test_parts.append(X_test_temp)
        y_train_parts.append(y_train_temp)
        y_test_parts.append(y_test_temp)

    if not X_train_parts:
        raise ValueError(
            "prepare_train_test_data: data_feature is empty, no data to split")

    # One concatenate per output instead of np.append inside the loop:
    # np.append copies the whole accumulated array on every call.
    X_train = np.concatenate(X_train_parts, axis=0)
    X_test = np.concatenate(X_test_parts, axis=0)
    y_train = np.concatenate(y_train_parts, axis=0)
    y_test = np.concatenate(y_test_parts, axis=0)
    return X_train, y_train, X_test, y_test
def prepare_train_data(self, data_feature, LabelColumnName):
    """Pool train/test splits for the tickers listed by ``get_all_target_dict``.

    Only tickers that are keys of the dict returned by
    ``get_all_target_dict()`` are used. For each of them a copy of
    ``data[0]`` is passed to ``preprocessing_train_data`` (one-hot labels
    enabled); tickers for which that yields an empty ``X`` or ``y`` are
    skipped. The remaining data is reshaped with ``reshape_input``, split with
    ``train_test_split(..., test_size=0.2)`` and joined along axis 0.

    Args:
        data_feature: Mapping of ticker -> indexable container; only element
            ``0`` of each value is read here (and it is copied first).
        LabelColumnName: Forwarded unchanged to ``preprocessing_train_data``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Note that this order
        differs from the order ``train_test_split`` returns its pieces in.

    Raises:
        ValueError: If no ticker survives the filters, so there is nothing to
            pool. (Previously this surfaced as an ``UnboundLocalError`` on
            return.)
    """
    print("get_data_feature")

    train_tickers_dict = get_all_target_dict()
    train_symbols = train_tickers_dict.keys()

    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for ticker, data in data_feature.items():
        if ticker not in train_symbols:
            continue

        # data[0] is copied so preprocessing cannot modify the caller's frame.
        X, y = preprocessing_train_data(self.paras, data[0].copy(),
                                        LabelColumnName, ticker,
                                        train_tickers_dict,
                                        one_hot_label_proc=True)
        if len(X) == 0 or len(y) == 0:
            continue

        X, y = reshape_input(self.paras.n_features, X, y)
        X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
            X, y, test_size=0.2)

        X_train_parts.append(X_train_temp)
        X_test_parts.append(X_test_temp)
        y_train_parts.append(y_train_temp)
        y_test_parts.append(y_test_temp)

    if not X_train_parts:
        raise ValueError(
            "prepare_train_data: no ticker produced training data")

    # One concatenate per output instead of np.append inside the loop:
    # np.append copies the whole accumulated array on every call.
    X_train = np.concatenate(X_train_parts, axis=0)
    X_test = np.concatenate(X_test_parts, axis=0)
    y_train = np.concatenate(y_train_parts, axis=0)
    y_test = np.concatenate(y_test_parts, axis=0)
    return X_train, y_train, X_test, y_test
def predict_data(self, model, data_feature, window, LabelColumnName):
    """Predict every configured ticker and write a report to ``./predict_out.txt``.

    NOTE(review): another ``predict_data`` with the same signature appears
    later in this file; if both live in the same scope the later one
    replaces this one. Confirm and drop the duplicate.

    For each ticker in ``self.paras.predict_tickers`` that is present in
    ``data_feature``:

    * ``data[0]``, ``data[1]`` and ``data[2]`` are preprocessed and reshaped
      (one-hot labels for the first two only) and fed to ``self.predict``.
    * The arg-max class of the labels/predictions is written into the
      ``'label'`` / ``'pred'`` columns of ``data[3]`` at the matching index;
      ``'label'`` is set to NaN for the ``data[2]`` rows.
    * The raw prediction rows are merged into ``data[3]`` as columns named
      ``"<window>_<class index>"``.
    * The frame is passed to ``self.save_data_frame_mse``, stored in
      ``self.df`` and its last ``pred_len + valid_len`` rows are printed to
      the report file.

    Side effects: ``data[3]`` is replaced in the caller's container, a slice
    of it is appended to ``data``, ``self.df`` is overwritten per ticker, the
    pandas option ``display.max_rows`` is set to ``None``, and
    ``./predict_out.txt`` is truncated and rewritten on every call.

    Args:
        model: Model handed to ``self.predict``; if ``None`` it is obtained
            from ``self.load_training_model(window)``.
        data_feature: Mapping of ticker -> mutable list whose elements 0-3 are
            used as described above.
        window: Used to load the model, to name the probability columns and
            forwarded to ``self.save_data_frame_mse``.
        LabelColumnName: Forwarded unchanged to ``preprocessing_data``.

    Returns:
        None. When no model is available a message is printed and the method
        returns before the report file is opened.
    """
    if model is None:
        model = self.load_training_model(window)
    if model is None:
        print('predict failed, model not exist')
        return

    # ``with`` closes the report even if a ticker raises part-way through.
    with open("./predict_out.txt", 'w') as report:
        for ticker in self.paras.predict_tickers:
            try:
                data = data_feature[ticker]
            except KeyError:
                # Ticker has no prepared data; skip it.
                continue

            X_train, y_train = preprocessing_data(
                self.paras, data[0], LabelColumnName, one_hot_label_proc=True)
            X_valid, y_valid = preprocessing_data(
                self.paras, data[1], LabelColumnName, one_hot_label_proc=True)
            X_lately, y_lately = preprocessing_data(
                self.paras, data[2], LabelColumnName, one_hot_label_proc=False)

            X_train, y_train = reshape_input(self.paras.n_features, X_train, y_train)
            X_valid, y_valid = reshape_input(self.paras.n_features, X_valid, y_valid)
            X_lately, y_lately = reshape_input(self.paras.n_features, X_lately, y_lately)

            possibility_columns = [
                str(window) + '_' + str(idx)
                for idx in range(self.paras.n_out_class)
            ]
            probability_frames = []

            # Rows from data[0].
            mse_known_train, predictions_train = self.predict(model, X_train, y_train)
            data[3].loc[data[0].index, 'label'] = np.argmax(y_train, axis=1)
            data[3].loc[data[0].index, 'pred'] = np.argmax(predictions_train, axis=1)
            probability_frames.append(pd.DataFrame(
                predictions_train, index=data[0].index, columns=possibility_columns))

            # Rows from data[1].
            mse_known_valid, predictions_valid = self.predict(model, X_valid, y_valid)
            data[3].loc[data[1].index, 'label'] = np.argmax(y_valid, axis=1)
            data[3].loc[data[1].index, 'pred'] = np.argmax(predictions_valid, axis=1)
            probability_frames.append(pd.DataFrame(
                predictions_valid, index=data[1].index, columns=possibility_columns))

            # Rows from data[2]: label is NaN; the returned MSE is not used.
            _, predictions_lately = self.predict(model, X_lately, y_lately)
            data[3].loc[data[2].index, 'label'] = np.nan
            data[3].loc[data[2].index, 'pred'] = np.argmax(predictions_lately, axis=1)
            probability_frames.append(pd.DataFrame(
                predictions_lately, index=data[2].index, columns=possibility_columns))

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            probabilities = pd.concat(probability_frames)
            data[3] = pd.merge(data[3], probabilities, how='outer',
                               left_index=True, right_index=True)

            # .iloc[-1] is an explicit positional lookup of the last row.
            # NOTE(review): the meaning of class index 3 is not visible here;
            # presumably a "no signal" class - confirm.
            if data[3]['pred'].iloc[-1] == 3:
                continue

            # Kept because it mutates the caller's list; the per-class counts
            # once computed from it were never read and have been removed.
            data.append(data[3][-self.paras.valid_len:])

            # Shift class indices by half the number of classes.
            class_offset = int(self.paras.n_out_class / 2)
            data[3]['label'] = data[3]['label'] - class_offset
            data[3]['pred'] = data[3]['pred'] - class_offset

            # Rewrite the data frame and save / update it.
            data[3] = self.save_data_frame_mse(
                ticker, data[3], window, possibility_columns,
                mses=[mse_known_train, mse_known_valid])
            self.df = data[3]

            pd.set_option('display.max_rows', None)
            print('\n ---------- ', ticker, ' ---------- \n', file=report)
            print(data[3][-(self.paras.pred_len + self.paras.valid_len):],
                  file=report, flush=True)
def predict_data(self, model, data_feature, window, LabelColumnName):
    """Predict every configured ticker and write a report to ``./predict_out.txt``.

    NOTE(review): an earlier ``predict_data`` with the same signature
    appears above in this file; if both live in the same scope this later
    one is the definition that takes effect. Confirm and drop the duplicate.

    For each ticker in ``self.paras.predict_tickers`` that is present in
    ``data_feature``:

    * ``data[0]``, ``data[1]`` and ``data[2]`` are preprocessed and reshaped
      (one-hot labels for the first two only) and fed to ``self.predict``.
    * The arg-max class of the labels/predictions is written into the
      ``'label'`` / ``'pred'`` columns of ``data[3]`` at the matching index;
      ``'label'`` is set to NaN for the ``data[2]`` rows.
    * The raw prediction rows are merged into ``data[3]`` as columns named
      ``"<window>_<class index>"``.
    * The frame is passed to ``self.save_data_frame_mse``, stored in
      ``self.df`` and its last ``pred_len + valid_len`` rows are printed to
      the report file.

    Side effects: ``data[3]`` is replaced in the caller's container, a slice
    of it is appended to ``data``, ``self.df`` is overwritten per ticker, the
    pandas option ``display.max_rows`` is set to ``None``, and
    ``./predict_out.txt`` is truncated and rewritten on every call.

    Args:
        model: Model handed to ``self.predict``; if ``None`` it is obtained
            from ``self.load_training_model(window)``.
        data_feature: Mapping of ticker -> mutable list whose elements 0-3 are
            used as described above.
        window: Used to load the model, to name the probability columns and
            forwarded to ``self.save_data_frame_mse``.
        LabelColumnName: Forwarded unchanged to ``preprocessing_data``.

    Returns:
        None. When no model is available a message is printed and the method
        returns before the report file is opened.
    """
    if model is None:
        model = self.load_training_model(window)
    if model is None:
        print('predict failed, model not exist')
        return

    # ``with`` closes the report even if a ticker raises part-way through.
    with open("./predict_out.txt", 'w') as report:
        for ticker in self.paras.predict_tickers:
            try:
                data = data_feature[ticker]
            except KeyError:
                # Ticker has no prepared data; skip it.
                continue

            X_train, y_train = preprocessing_data(
                self.paras, data[0], LabelColumnName, one_hot_label_proc=True)
            X_valid, y_valid = preprocessing_data(
                self.paras, data[1], LabelColumnName, one_hot_label_proc=True)
            X_lately, y_lately = preprocessing_data(
                self.paras, data[2], LabelColumnName, one_hot_label_proc=False)

            X_train, y_train = reshape_input(self.paras.n_features, X_train, y_train)
            X_valid, y_valid = reshape_input(self.paras.n_features, X_valid, y_valid)
            X_lately, y_lately = reshape_input(self.paras.n_features, X_lately, y_lately)

            possibility_columns = [
                str(window) + '_' + str(idx)
                for idx in range(self.paras.n_out_class)
            ]
            probability_frames = []

            # Rows from data[0].
            mse_known_train, predictions_train = self.predict(model, X_train, y_train)
            data[3].loc[data[0].index, 'label'] = np.argmax(y_train, axis=1)
            data[3].loc[data[0].index, 'pred'] = np.argmax(predictions_train, axis=1)
            probability_frames.append(pd.DataFrame(
                predictions_train, index=data[0].index, columns=possibility_columns))

            # Rows from data[1].
            mse_known_valid, predictions_valid = self.predict(model, X_valid, y_valid)
            data[3].loc[data[1].index, 'label'] = np.argmax(y_valid, axis=1)
            data[3].loc[data[1].index, 'pred'] = np.argmax(predictions_valid, axis=1)
            probability_frames.append(pd.DataFrame(
                predictions_valid, index=data[1].index, columns=possibility_columns))

            # Rows from data[2]: label is NaN; the returned MSE is not used.
            _, predictions_lately = self.predict(model, X_lately, y_lately)
            data[3].loc[data[2].index, 'label'] = np.nan
            data[3].loc[data[2].index, 'pred'] = np.argmax(predictions_lately, axis=1)
            probability_frames.append(pd.DataFrame(
                predictions_lately, index=data[2].index, columns=possibility_columns))

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            probabilities = pd.concat(probability_frames)
            data[3] = pd.merge(data[3], probabilities, how='outer',
                               left_index=True, right_index=True)

            # .iloc[-1] is an explicit positional lookup of the last row.
            # NOTE(review): the meaning of class index 3 is not visible here;
            # presumably a "no signal" class - confirm.
            if data[3]['pred'].iloc[-1] == 3:
                continue

            # Kept because it mutates the caller's list; the per-class counts
            # once computed from it were never read and have been removed.
            data.append(data[3][-self.paras.valid_len:])

            # Shift class indices by half the number of classes.
            class_offset = int(self.paras.n_out_class / 2)
            data[3]['label'] = data[3]['label'] - class_offset
            data[3]['pred'] = data[3]['pred'] - class_offset

            # Rewrite the data frame and save / update it.
            data[3] = self.save_data_frame_mse(
                ticker, data[3], window, possibility_columns,
                mses=[mse_known_train, mse_known_valid])
            self.df = data[3]

            pd.set_option('display.max_rows', None)
            print('\n ---------- ', ticker, ' ---------- \n', file=report)
            print(data[3][-(self.paras.pred_len + self.paras.valid_len):],
                  file=report, flush=True)