def prepare_train_test_data(self, data_feature, LabelColumnName):
        """Build pooled train/test arrays from every ticker in ``data_feature``.

        For each ticker, ``data[0]`` is run through ``preprocessing_data``
        (with ``one_hot_label_proc=True``) and ``reshape_input`` (using
        ``self.paras.n_features``), then split with
        ``train_test_split(..., test_size=0.2)``.  The per-ticker splits are
        concatenated along axis 0 in iteration order.

        NOTE(review): no ``random_state`` is passed to ``train_test_split``;
        if this is scikit-learn's implementation the split differs between
        runs -- confirm that is intended.

        Args:
            data_feature: Mapping of ticker -> sequence whose element 0 is
                the data handed to ``preprocessing_data``.
            LabelColumnName: Label column name forwarded to
                ``preprocessing_data``.

        Returns:
            Tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays.

        Raises:
            ValueError: If ``data_feature`` is empty, so there is nothing to
                split (previously this surfaced as ``UnboundLocalError``).
        """
        X_train_parts, y_train_parts = [], []
        X_test_parts, y_test_parts = [], []

        # The ticker key is not needed here, only the per-ticker data.
        for data in data_feature.values():
            X, y = preprocessing_data(self.paras,
                                      data[0],
                                      LabelColumnName,
                                      one_hot_label_proc=True)
            X, y = reshape_input(self.paras.n_features, X, y)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
                X, y, test_size=0.2)

            X_train_parts.append(X_train_temp)
            y_train_parts.append(y_train_temp)
            X_test_parts.append(X_test_temp)
            y_test_parts.append(y_test_temp)

        if not X_train_parts:
            raise ValueError(
                "data_feature contains no tickers; cannot build train/test data")

        # Concatenate once per output: np.append inside the loop would
        # re-copy the accumulated arrays on every ticker.
        X_train = np.concatenate(X_train_parts, axis=0)
        y_train = np.concatenate(y_train_parts, axis=0)
        X_test = np.concatenate(X_test_parts, axis=0)
        y_test = np.concatenate(y_test_parts, axis=0)
        return X_train, y_train, X_test, y_test
Ejemplo n.º 2
0
    def prepare_train_data(self, data_feature, LabelColumnName):
        """Build pooled train/test arrays from the tickers that have targets.

        Only tickers that are keys of ``get_all_target_dict()`` are used.
        For each of them a copy of ``data[0]`` is run through
        ``preprocessing_train_data`` (with ``one_hot_label_proc=True``);
        tickers that yield an empty ``X`` or ``y`` are skipped.  The rest are
        reshaped with ``reshape_input`` (using ``self.paras.n_features``),
        split with ``train_test_split(..., test_size=0.2)`` and concatenated
        along axis 0 in iteration order.

        Args:
            data_feature: Mapping of ticker -> sequence whose element 0
                supports ``.copy()`` and is handed to
                ``preprocessing_train_data``.
            LabelColumnName: Label column name forwarded to
                ``preprocessing_train_data``.

        Returns:
            Tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays.

        Raises:
            ValueError: If no ticker produced any samples (previously this
                surfaced as ``UnboundLocalError``).
        """
        print("get_data_feature")

        train_tickers_dict = get_all_target_dict()
        train_symbols = train_tickers_dict.keys()

        X_train_parts, y_train_parts = [], []
        X_test_parts, y_test_parts = [], []

        for ticker, data in data_feature.items():
            if ticker not in train_symbols:
                continue

            # Work on a copy so preprocessing cannot modify the caller's data.
            X, y = preprocessing_train_data(self.paras,
                                            data[0].copy(),
                                            LabelColumnName,
                                            ticker,
                                            train_tickers_dict,
                                            one_hot_label_proc=True)

            # Explicit len() checks: X / y may be arrays, whose truthiness
            # is ambiguous.
            if len(X) == 0 or len(y) == 0:
                continue

            X, y = reshape_input(self.paras.n_features, X, y)
            X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
                X, y, test_size=0.2)

            X_train_parts.append(X_train_temp)
            y_train_parts.append(y_train_temp)
            X_test_parts.append(X_test_temp)
            y_test_parts.append(y_test_temp)

        if not X_train_parts:
            raise ValueError(
                "no ticker in data_feature produced training samples")

        # Concatenate once per output: np.append inside the loop would
        # re-copy the accumulated arrays on every ticker.
        X_train = np.concatenate(X_train_parts, axis=0)
        y_train = np.concatenate(y_train_parts, axis=0)
        X_test = np.concatenate(X_test_parts, axis=0)
        y_test = np.concatenate(y_test_parts, axis=0)
        return X_train, y_train, X_test, y_test
    def predict_data(self, model, data_feature, window, LabelColumnName):
        """Predict every ticker in ``self.paras.predict_tickers`` and write a report.

        For each ticker present in ``data_feature`` the three data sets
        ``data[0]``, ``data[1]`` and ``data[2]`` (named train / valid /
        lately by the local variables) are preprocessed, reshaped and fed to
        ``self.predict``.  The arg-max class of the labels and predictions is
        written into the ``'label'`` and ``'pred'`` columns of ``data[3]``,
        the raw per-class outputs are merged in as columns named
        ``"<window>_<class index>"``, and the result is passed to
        ``self.save_data_frame_mse``.  The tail of the saved frame is printed
        to ``./predict_out.txt``.

        Side effects:
            * ``data[3]`` is replaced in the caller's list and stored on
              ``self.df``.
            * The last ``self.paras.valid_len`` rows of ``data[3]`` are
              appended to the caller's ``data`` list on every call.
            * ``./predict_out.txt`` is truncated and rewritten.
            * pandas' ``display.max_rows`` option is set to ``None``.

        Args:
            model: Model to predict with; when ``None`` it is loaded via
                ``self.load_training_model(window)``.
            data_feature: Mapping of ticker -> list ``[d0, d1, d2, frame]``.
            window: Window value used to load the model, to name the
                per-class columns, and forwarded to
                ``self.save_data_frame_mse``.
            LabelColumnName: Label column name forwarded to
                ``preprocessing_data``.

        Returns:
            None.
        """
        if model is None:
            model = self.load_training_model(window)

        if model is None:
            print('predict failed, model not exist')
            return

        # Both values depend only on the parameters, so compute them once.
        possibility_columns = [
            str(window) + '_' + str(idx)
            for idx in range(self.paras.n_out_class)
        ]
        class_offset = int(self.paras.n_out_class / 2)

        # The context manager guarantees the report file is closed, even if
        # a ticker raises part-way through.
        with open("./predict_out.txt", 'w') as report:
            for ticker in self.paras.predict_tickers:
                try:
                    data = data_feature[ticker]
                except KeyError:
                    # No prepared data for this ticker.
                    continue

                X_train, y_train = preprocessing_data(self.paras,
                                                      data[0],
                                                      LabelColumnName,
                                                      one_hot_label_proc=True)
                X_valid, y_valid = preprocessing_data(self.paras,
                                                      data[1],
                                                      LabelColumnName,
                                                      one_hot_label_proc=True)
                X_lately, y_lately = preprocessing_data(self.paras,
                                                        data[2],
                                                        LabelColumnName,
                                                        one_hot_label_proc=False)

                X_train, y_train = reshape_input(self.paras.n_features,
                                                 X_train, y_train)
                X_valid, y_valid = reshape_input(self.paras.n_features,
                                                 X_valid, y_valid)
                X_lately, y_lately = reshape_input(self.paras.n_features,
                                                   X_lately, y_lately)

                # Class indices are stored raw here; class_offset is
                # subtracted from both columns further down.
                possibility_frames = []

                # --- train data ---
                mse_train, predictions_train = self.predict(
                    model, X_train, y_train)
                data[3].loc[data[0].index, 'label'] = np.argmax(y_train, axis=1)
                data[3].loc[data[0].index, 'pred'] = np.argmax(
                    predictions_train, axis=1)
                possibility_frames.append(
                    pd.DataFrame(predictions_train,
                                 index=data[0].index,
                                 columns=possibility_columns))

                # --- valid data ---
                mse_valid, predictions_valid = self.predict(
                    model, X_valid, y_valid)
                data[3].loc[data[1].index, 'label'] = np.argmax(y_valid, axis=1)
                data[3].loc[data[1].index, 'pred'] = np.argmax(
                    predictions_valid, axis=1)
                possibility_frames.append(
                    pd.DataFrame(predictions_valid,
                                 index=data[1].index,
                                 columns=possibility_columns))

                # --- lately data: no label is recorded for these rows ---
                _, predictions_lately = self.predict(model, X_lately, y_lately)
                data[3].loc[data[2].index, 'label'] = np.nan
                data[3].loc[data[2].index, 'pred'] = np.argmax(
                    predictions_lately, axis=1)
                possibility_frames.append(
                    pd.DataFrame(predictions_lately,
                                 index=data[2].index,
                                 columns=possibility_columns))

                # pd.concat replaces DataFrame.append, which was removed in
                # pandas 2.0.
                s = pd.concat(possibility_frames)

                data[3] = pd.merge(data[3],
                                   s,
                                   how='outer',
                                   left_index=True,
                                   right_index=True)

                # Skip tickers whose last row is predicted as class 3.
                # .iloc makes the positional lookup explicit instead of
                # relying on the deprecated positional fallback of Series[].
                if data[3]['pred'].iloc[-1] == 3:
                    continue

                # Kept for backward compatibility: callers may read the
                # validation tail from the list.  NOTE(review): this grows
                # ``data`` by one element on every call.
                data.append(data[3][-self.paras.valid_len:])

                data[3]['label'] = data[3]['label'] - class_offset
                data[3]['pred'] = data[3]['pred'] - class_offset

                # Rewrite the data frame and save / update it.
                data[3] = self.save_data_frame_mse(ticker,
                                                   data[3],
                                                   window,
                                                   possibility_columns,
                                                   mses=[mse_train, mse_valid])
                self.df = data[3]

                pd.set_option('display.max_rows', None)
                print('\n ---------- ', ticker, ' ---------- \n', file=report)
                print(data[3][-(self.paras.pred_len + self.paras.valid_len):],
                      file=report,
                      flush=True)
Ejemplo n.º 4
0
    def predict_data(self, model, data_feature, window, LabelColumnName):
        """Predict every ticker in ``self.paras.predict_tickers`` and write a report.

        For each ticker found in ``data_feature``, ``data[0]``, ``data[1]``
        and ``data[2]`` (train / valid / lately, going by the local names)
        are preprocessed, reshaped and passed to ``self.predict``.  Arg-max
        label and prediction classes go into the ``'label'`` / ``'pred'``
        columns of ``data[3]``; the raw per-class outputs are merged in under
        columns named ``"<window>_<class index>"``.  The frame is then handed
        to ``self.save_data_frame_mse`` and its tail is printed to
        ``./predict_out.txt``.

        Side effects: replaces ``data[3]`` in the caller's list, sets
        ``self.df``, appends the last ``self.paras.valid_len`` rows of
        ``data[3]`` to the caller's ``data`` list, rewrites
        ``./predict_out.txt`` and sets pandas' ``display.max_rows`` to
        ``None``.

        Args:
            model: Model to predict with; loaded through
                ``self.load_training_model(window)`` when ``None``.
            data_feature: Mapping of ticker -> list ``[d0, d1, d2, frame]``.
            window: Window value used for model loading, column naming and
                ``self.save_data_frame_mse``.
            LabelColumnName: Label column name forwarded to
                ``preprocessing_data``.

        Returns:
            None.
        """
        if model is None:
            model = self.load_training_model(window)

        if model is None:
            print('predict failed, model not exist')
            return

        # Loop-invariant values, computed once.
        possibility_columns = [str(window) + '_' + str(idx) for idx in range(self.paras.n_out_class)]
        class_offset = int(self.paras.n_out_class/2)

        # ``with`` closes the report file even when a ticker raises.
        with open("./predict_out.txt", 'w') as report:
            for ticker in self.paras.predict_tickers:
                try:
                    data = data_feature[ticker]
                except KeyError:
                    # No prepared data for this ticker.
                    continue

                X_train, y_train = preprocessing_data(self.paras, data[0], LabelColumnName, one_hot_label_proc=True)
                X_valid, y_valid = preprocessing_data(self.paras, data[1], LabelColumnName, one_hot_label_proc=True)
                X_lately, y_lately = preprocessing_data(self.paras, data[2], LabelColumnName, one_hot_label_proc=False)

                X_train, y_train = reshape_input(self.paras.n_features, X_train, y_train)
                X_valid, y_valid = reshape_input(self.paras.n_features, X_valid, y_valid)
                X_lately, y_lately = reshape_input(self.paras.n_features, X_lately, y_lately)

                # Raw class indices are stored first; class_offset is
                # subtracted from both columns after the merge.
                mse_train, predictions_train = self.predict(model, X_train, y_train)
                data[3].loc[data[0].index, 'label'] = np.argmax(y_train, axis=1)
                data[3].loc[data[0].index, 'pred'] = np.argmax(predictions_train, axis=1)
                train_frame = pd.DataFrame(predictions_train, index=data[0].index, columns=possibility_columns)

                mse_valid, predictions_valid = self.predict(model, X_valid, y_valid)
                data[3].loc[data[1].index, 'label'] = np.argmax(y_valid, axis=1)
                data[3].loc[data[1].index, 'pred'] = np.argmax(predictions_valid, axis=1)
                valid_frame = pd.DataFrame(predictions_valid, index=data[1].index, columns=possibility_columns)

                # No label is recorded for the "lately" rows.
                _, predictions_lately = self.predict(model, X_lately, y_lately)
                data[3].loc[data[2].index, 'label'] = np.nan
                data[3].loc[data[2].index, 'pred'] = np.argmax(predictions_lately, axis=1)
                lately_frame = pd.DataFrame(predictions_lately, index=data[2].index, columns=possibility_columns)

                # pd.concat replaces DataFrame.append (removed in pandas 2.0).
                s = pd.concat([train_frame, valid_frame, lately_frame])

                data[3] = pd.merge(data[3], s, how='outer', left_index=True, right_index=True)

                # Skip tickers whose last row is predicted as class 3; .iloc
                # makes the positional lookup explicit.
                if data[3]['pred'].iloc[-1] == 3:
                    continue

                # Kept for backward compatibility with callers that read the
                # validation tail from the list.  NOTE(review): grows ``data``
                # by one element on every call.
                data.append(data[3][-self.paras.valid_len:])

                data[3]['label'] = data[3]['label'] - class_offset
                data[3]['pred'] = data[3]['pred'] - class_offset

                data[3] = self.save_data_frame_mse(ticker, data[3], window, possibility_columns, mses=[mse_train, mse_valid])
                self.df = data[3]

                pd.set_option('display.max_rows', None)
                print('\n ---------- ', ticker, ' ---------- \n', file=report)
                print(data[3][-(self.paras.pred_len + self.paras.valid_len):], file=report, flush=True)