def test(self, inputs):
        spot_price = read_data_with_specified_columns(inputs['source'],'exp/3d/Co/logistic_regression/v3/LMCADY_v3.conf','2003-11-12')[0].loc[:,self.ground_truth].to_frame()
        for date in self.dates.split(','):

            #generate model specific arguments
            if self.model is None:
                model = Post_process()

            #case if we are running a substitution post process
            elif self.model == "Substitution":
                X = { 'Prediction' : read_classification(self.ground_truth,self.horizon,date,self.version[0],"ensemble") }
                #case if we are substituting with analyst report
                if inputs['substitution'] == "analyst":
                    validation_date = date.split("-")[0]+"-01-01" if date[5:7] <= "06" else date.split("-")[0]+"-07-01"
                    
                    #read substitution configuration which details the metal and horizon combinations that are eligible for substitution
                    with open("exp/substitution.conf","r") as f:
                        config = json.load(f)

                    #case if they are not to be substituted
                    if self.ground_truth not in config.keys() or self.horizon not in config[self.ground_truth]:
                        model = Post_process()
                        X["Uncertainty"] = read_uncertainty(self.ground_truth,self.horizon,date,"ensemble","classification") 
                        prediction = model.predict(X)
                        X["Uncertainty"].to_csv(os.path.join("result","uncertainty","classification",'_'.join([self.ground_truth,validation_date,str(self.horizon),"substitution.csv"])))
                    
                    #case if they are to be substituted
                    else:
                        model = Post_process_substitution()
                        X["Substitute"] = read_substitution_analyst(self.ground_truth, self.horizon, date)
                        X["Uncertainty"] = read_uncertainty(self.ground_truth,self.horizon,date,"ensemble","classification")
                        prediction, uncertainty = model.predict(X)
                        uncertainty.to_csv(os.path.join("result","uncertainty","classification",'_'.join([self.ground_truth,validation_date,str(self.horizon),"substitution.csv"])))

            #case if we are running a filter post process
            elif self.model == "Filter":
                model = Post_process_filter()
                X = { 'Prediction' : read_regression(spot_price, self.ground_truth, self.horizon, date, self.version[1]) }
                X["Filter"] = generate_final_signal(spot_price,self.ground_truth, self.horizon, date, self.version[0], self.version[1], inputs["class_threshold"], inputs["reg_threshold"], inputs["reg_window"])
                prediction = model.predict(X)
            prediction.to_csv(os.path.join('result','prediction','post_process',self.model,'_'.join([self.ground_truth,date,str(self.horizon),self.model+".csv"])))
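The dispatch above assumes two predict signatures: the plain Post_process returns a single prediction frame, while Post_process_substitution returns a (prediction, uncertainty) pair. A minimal sketch of that interface, with hypothetical bodies (not the project's actual implementation), for orientation:

class Post_process:
    #pass-through post-process: return the ensemble prediction unchanged
    def predict(self, X):
        return X['Prediction']

class Post_process_substitution(Post_process):
    #replace model output with the analyst signal wherever one exists
    def predict(self, X):
        pred = X['Prediction'].copy()
        sub = X['Substitute'].reindex(pred.index)
        mask = sub.notna().values
        pred = pred.where(~mask, sub.values)
        #in this sketch, substituted rows are treated as fully certain
        uncertainty = X['Uncertainty'].copy().where(~mask, 0.0)
        return pred, uncertainty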
Example #2
    hidden_state = args.hidden_state
    dropout = args.drop_out
    attention_size = args.attention_size
    embedding_size = args.embedding_size
    lambd = args.lambd
    save_loss = args.save_loss
    save_prediction = args.save_prediction

    # prepare for the data
    time_horizon = args.horizon
    if args.action == 'train':
        comparison = None
        n = 0

        #read the data from the 4E or NExT database
        time_series,LME_dates,config_length = gn.read_data_with_specified_columns(args.source,args.data_configure_file,"2003-11-12")

        #generate list of list of dates to be used to roll over 5 half years
        today = args.date
        length = 5
        if gn.even_version(args.version) and time_horizon > 5:
            length = 4
        start_time,end_time = gn.get_relevant_dates(today,length,"tune")
        split_dates = gn.rolling_half_year(start_time,end_time,length)
        
        importance_list = []
        #generate the version
        version_params=generate_version_params(args.version)

    def tune(self, inputs):
        #initialize parameters
        class_dict = {'threshold':[],'acc':[],'coverage':[], 'total_len' :[]}
        reg_dict = {'threshold':[],'mae':[], 'coverage':[], 'total_len' :[]}
        spot_price = read_data_with_specified_columns(inputs['source'],'exp/3d/Co/logistic_regression/v3/LMCADY_v3.conf','2003-11-12')[0].loc[:,self.ground_truth].to_frame()

        for date in self.dates.split(","):
            class_dict[date+"_acc"] = []
            class_dict[date+"_coverage"] = []
            class_dict[date+"_total_len"] = []

            # reg_dict[date+"_acc"] = []
            reg_dict[date+"_mae"] = []
            reg_dict[date+"_coverage"] = []
            reg_dict[date+"_total_len"] = []

        #begin tuning with looping of date
        for date in self.dates.split(','):
            validation_date = date.split("-")[0]+"-01-01" if date[5:7] <= "06" else date.split("-")[0]+"-07-01" 
            if self.model == "Filter":
                #classification tune
                class_thresh = [i + 0.01 for i in np.arange(0.51,step = float(0.05))]
                class_combination = product([self.ground_truth], [self.horizon], [date], [self.version[0]], class_thresh)
                
                #generate classification signals
                p = pl(multiprocessing.cpu_count())
                class_signal = p.starmap(generate_class_signal, class_combination)
                p.close()
                class_signal = pd.concat(class_signal, axis = 1)
                class_signal.columns = [class_thresh]

                #begin analysis of classification results
                for c, col in enumerate(class_signal.columns):
                    class_pred = (read_classification(self.ground_truth, self.horizon, date, self.version[0],"ensemble")*2 - 1).multiply(class_signal[col]*1, axis = 0)
                    class_pred = class_pred.loc[class_pred['result'] != 0]
                    class_label = pd.read_csv(os.path.join("data","Label",'_'.join([self.ground_truth,"h"+str(self.horizon),validation_date,"label.csv"])),index_col = 0)*2 - 1
                    class_label = class_label.loc[class_pred.index,:]
                    if col not in class_dict['threshold']:
                        class_dict['threshold'].append(col)
                        class_dict['acc'].append(0)
                        class_dict['coverage'].append(0)
                        class_dict['total_len'].append(0)
                    if len(class_pred.index) > 0:
                        class_dict[date + "_acc"].append(metrics.accuracy_score(class_pred,class_label))
                        class_dict['acc'][c] += metrics.accuracy_score(class_pred, class_label)*len(class_label.index)
                    else:
                        class_dict[date + "_acc"].append(0)
                        class_dict['acc'][c] += 0
                    class_dict[date +"_coverage"].append(len(class_label.index)/len(class_signal.index))
                    class_dict[date +"_total_len"].append(len(class_signal.index))
                    class_dict['coverage'][c] += len(class_label.index)
                    class_dict['total_len'][c] += len(class_signal.index)
                
                #regression tuning
                reg_thresh = np.arange(0.05,0.31, step = 0.025)
                reg_window = [60]
                reg_combination = product([spot_price], [self.ground_truth], [self.horizon], [date], [self.version[1]], reg_thresh, reg_window)
                p = pl(multiprocessing.cpu_count())
                
                #generate regression signals
                reg_signal = p.starmap(generate_reg_signal, reg_combination)
                p.close()
                reg_signal = pd.concat(reg_signal, axis = 1)
                reg_signal.columns = [reg_thresh]

                #begin analysis of regression results
                for c,col in enumerate(reg_signal.columns):
                    reg_pred = (read_regression(spot_price, self.ground_truth, self.horizon, date, self.version[1])).multiply(reg_signal[col]*1, axis = 0)
                    reg_pred = reg_pred.loc[reg_pred['Prediction'] != 0]
                    class_pred = np.sign(reg_pred)
                    reg_label = spot_price.shift(-self.horizon).loc[reg_pred.index,:]
                    class_label = pd.read_csv(os.path.join("data","Label",'_'.join([self.ground_truth,"h"+str(self.horizon),validation_date,"label.csv"])),index_col = 0)*2 - 1
                    class_label = class_label.loc[reg_pred.index,:]
                    spot = spot_price.loc[reg_pred.index,:]
                    if col not in reg_dict['threshold']:
                        reg_dict['threshold'].append(col)
                        # reg_dict['acc'].append(0)
                        reg_dict['mae'].append(0)
                        reg_dict['coverage'].append(0)
                        reg_dict['total_len'].append(0)
                    if len(reg_pred.index) > 0:
                        # reg_dict[date+"_acc"].append(metrics.accuracy_score(class_pred,class_label))
                        reg_dict[date + "_mae"].append(metrics.mean_absolute_error(reg_pred/np.array(spot),reg_label/np.array(spot)))
                        reg_dict['mae'][c] += metrics.mean_absolute_error(reg_pred/np.array(spot),reg_label/np.array(spot))*len(reg_label.index)
                        # reg_dict["acc"][c] += metrics.accuracy_score(class_pred,class_label)*len(reg_label.index)
                    else:
                        # reg_dict[date + "_acc"].append(0)
                        reg_dict[date + "_mae"].append(0)
                        reg_dict['mae'][c] += 0
                        # reg_dict['acc'][c] += 0
                    reg_dict[date +"_coverage"].append(len(reg_label.index)/len(reg_signal.index))
                    reg_dict[date+"_total_len"].append(len(reg_signal.index))
                    reg_dict['coverage'][c] += len(reg_label.index)
                    reg_dict['total_len'][c] += len(reg_signal.index)
        
        #compute average
        for i in range(len(class_dict['threshold'])):
            class_dict['acc'][i] = class_dict['acc'][i]/class_dict['coverage'][i] if class_dict['coverage'][i] > 0 else 0
            class_dict['coverage'][i] = class_dict['coverage'][i]/class_dict['total_len'][i]

        for i in range(len(reg_dict['threshold'])):
            # reg_dict['acc'][i] = reg_dict['acc'][i]/reg_dict['coverage'][i] if reg_dict['coverage'][i] > 0 else 0
            reg_dict['mae'][i] = reg_dict['mae'][i]/reg_dict['coverage'][i] if reg_dict['coverage'][i] > 0 else 0
            reg_dict['coverage'][i] = reg_dict['coverage'][i]/reg_dict['total_len'][i]
        class_df = pd.DataFrame(class_dict)
        reg_df = pd.DataFrame(reg_dict)
        reg_df = reg_df.loc[reg_df["coverage"] != 0].reset_index(drop = True)

        #generate ranking 
        class_df['acc_rank'] = class_df['acc'].rank(method = 'min', ascending = False)
        class_df['coverage_rank'] = class_df['coverage'].rank(method = 'min', ascending = False)
        class_df['rank'] = (class_df['acc_rank'] + class_df['coverage_rank'])/2
        # reg_df['acc_rank'] = reg_df['acc'].rank(method = 'min', ascending = False)
        reg_df['mae_rank'] = reg_df['mae'].rank(method = 'min', ascending = True)
        reg_df['coverage_rank'] = reg_df['coverage'].rank(method = 'min', ascending = False)
        reg_df['rank'] = (reg_df['mae_rank'] + reg_df['coverage_rank'])/2

        return class_df,reg_df
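Since a lower combined rank means a better threshold (higher accuracy and coverage, lower MAE), a caller would typically pick the top-ranked row from each frame. A hypothetical usage sketch (the `tuner` instance and its inputs dict are assumptions):

class_df, reg_df = tuner.tune({'source': 'NExT'})
best_class_threshold = class_df.loc[class_df['rank'].idxmin(), 'threshold']
best_reg_threshold = reg_df.loc[reg_df['rank'].idxmin(), 'threshold']
print(best_class_threshold, best_reg_threshold)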
Example #4
    parser.add_argument('-sou',
                        '--source',
                        help='source of data',
                        type=str,
                        default="NExT")
    args = parser.parse_args()
    if args.ground_truth == 'None':
        args.ground_truth = None
    os.chdir(os.path.abspath(sys.path[0]))

    #split the comma-separated lists, guarding against the None case above
    if args.ground_truth is not None:
        args.ground_truth = args.ground_truth.split(",")
    args.horizon = [int(i) for i in args.horizon.split(",")]

    #read data from specified source
    if args.source == "NExT":
        ts, dates, length = read_data_with_specified_columns(
            "NExT", "exp/LMCADY_v3.conf", "2003-11-12")
    else:
        start_date = "2003-11-12"
        import rpy2.robjects as robjects
        robjects.r('.sourceQlib()')
        ts = robjects.r(
            '''merge(getSecurity(c("LMCADY Comdty","LMAHDY Comdty","LMPBDY Comdty","LMZSDY Comdty","LMNIDY Comdty","LMSNDY Comdty"), start = "'''
            + start_date + '''"), 
                        getSecurityOHLCV(c("LMCADS03 Comdty","LMPBDS03 Comdty","LMNIDS03 Comdty","LMSNDS03 Comdty","LMZSDS03 Comdty","LMAHDS03 Comdty"), start = "'''
            + start_date + '''")
                        )
                    ''')
        ts.colnames = robjects.vectors.StrVector([
            "LME_Cu_Spot", "LME_Al_Spot", "LME_Pb_Spot", "LME_Zn_Spot",
            "LME_Ni_Spot", "LME_Xi_Spot", "LME_Cu_Open", "LME_Cu_High",
            "LME_Cu_Low", "LME_Cu_Close", "LME_Cu_Volume", "LME_Cu_OI",
                            if not os.path.exists(os.path.join(filepath, f)):
                                ans[validation_dates[i] + "_mae"].append(0)
                                ans[validation_dates[i] + "_mse"].append(0)
                                ans[validation_dates[i] + "_acc"].append(0)
                                ans[validation_dates[i] +
                                    "_coverage"].append(0)
                                ans[validation_dates[i] + "_length"].append(
                                    len(label.index))
                                continue

                        #generate labels
                        temp = pd.read_csv(os.path.join(filepath, f),
                                           index_col=0)
                        if label.index[-1] > date:
                            label = label.iloc[:-1, :]
                        data, LME_dates, length = read_data_with_specified_columns(
                            args.source, 'exp/LMCADY_v3.conf', '2003-11-12')
                        spot = data.loc[label.index[0]:label.index[-1],
                                        gt].to_frame()
                        if args.regression == "ret":
                            temp = (temp - np.array(spot.loc[temp.index, :])
                                    ) / np.array(spot.loc[temp.index, :])
                            label = (label - np.array(spot)) / np.array(spot)

                        #generate metrics
                        if len(temp.index) == 0:
                            mae = 0
                            mse = 0
                            acc = 0
                        else:
                            mae = mean_absolute_error(label.loc[temp.index, :],
                                                      temp)
Example #6
    def train(
        self,split = 0.9,
        num_epochs=50,
        drop_out=0.0,
        drop_out_mc = 0.0,
        repeat_mc = 10,
        embedding_size=5,
        batch_size=512,
        hidden_state=50,
        lrate=0.001,
        attention_size=2,
        interval=1,
        lambd=0,
        save_loss=0,
        save_prediction=0,
        method =""):
        """
        drop_out: the dropout rate of LSTM network
        hidden: number of hidden_state of encoder/decoder
        embdedding_size: the size of embedding layer
        batch: the mini-batch size
        hidden_satte: number of hidden_state of encoder/decoder
        lrate: learning rate
        attention_size: the head number in MultiheadAttention Mechanism
        interval: save models every interval epoch
        lambd: the weight of classfication loss
        save_loss: whether to save loss results
        save_prediction: whether to save prediction results
        """
        sys.path[0] = os.curdir
        print("begin to train")
        #assert that the configuration path is correct
        self.path = gn.generate_config_path(self.version)

        #retrieve column list based on configuration path
        time_series,LME_dates,config_length = gn.read_data_with_specified_columns(self.source,self.path,"2003-11-12")

        #begin to split the train data
        for date in self.date.split(","):
            torch.manual_seed(1)
            np.random.seed(1)
            random.seed(1)
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time,train_time,evalidate_date = gn.get_relevant_dates(today,length,"train")
            split_dates  =  [train_time,evalidate_date,str(today)]
            
            #generate the version
            version_params = generate_version_params(self.version)
            print("the train date is {}".format(split_dates[0]))
            print("the test date is {}".format(split_dates[1]))
            norm_volume = "v1"
            norm_3m_spread = "v1"
            norm_ex = "v1"
            len_ma = 5
            len_update = 30
            tol = 1e-7
            norm_params = {'vol_norm':norm_volume,'ex_spread_norm':norm_ex,'spot_spread_norm':norm_3m_spread,
                    'len_ma':len_ma,'len_update':len_update,'both':3,'strength':0.01,'xgboost':False}
            tech_params = {'strength':0.01,'both':3,'Win_VSD':[10,20,30,40,50,60],'Win_EMA':12,'Win_Bollinger':22,
                            'Fast':12,'Slow':26,'Win_NATR':10,'Win_VBM':22,'acc_initial':0.02,'acc_maximum':0.2,"live":None}

            #for versions that tune over 6 metals 
            final_X_tr = []
            final_y_tr = []
            final_X_val = []
            final_y_val = []
            final_X_te = []
            final_y_te = []
            final_y_te_class_list = []
            final_y_te_class_top_list = []
            final_y_te_top_ind_list = []
            final_y_te_class_bot_list = []
            final_y_te_bot_ind_list = []
            final_train_X_embedding = []
            final_test_X_embedding = []
            final_val_X_embedding = []

            i = 0
            #toggle metal id
            metal_id = False
            ground_truths_list = ["LME_Cu_Spot","LME_Al_Spot","LME_Ni_Spot","LME_Xi_Spot","LME_Zn_Spot","LME_Pb_Spot"]
            for ground_truth in ground_truths_list:
                print(ground_truth)
                new_time_series = copy(time_series)
                ts = new_time_series.loc[start_time:split_dates[2]]

                #load data for use
                X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(ts,LME_dates,self.horizon,[ground_truth],self.lag,copy(split_dates),version_params,metal_id_bool = metal_id,reshape = False)
                        
                # split validation
                X_ta = X_tr[:int(len(X_tr) * split), :, :]
                y_ta = y_tr[:int(len(y_tr) * split),0]

                X_val = X_tr[int(len(X_tr) * split):, :, :]
                y_val = y_tr[int(len(y_tr) * split):,0]

                X_te = X_va
                y_te = y_va[:,0]

                # generate metal id for embedding lookup
                train_X_id_embedding = [i]*len(X_ta)
                val_X_id_embedding = [i]*len(X_val)
                test_X_id_embedding = [i]*len(X_te)

                if len(final_X_tr) == 0:
                    final_X_tr = copy(X_ta)
                else:
                    final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
                if len(final_y_tr) == 0:
                    final_y_tr = copy(y_ta)
                else:
                    final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)

                if len(final_X_te) == 0:
                    final_X_te = copy(X_te)
                else:
                    final_X_te = np.concatenate((final_X_te, X_te), axis=0)
                if len(final_y_te) == 0:
                    final_y_te = copy(y_te)
                else:
                    final_y_te = np.concatenate((final_y_te, y_te), axis=0)

                y_te_rank = np.argsort(y_te)
                y_te_class = list(y_te)
                final_y_te_class_list.append(y_te_class)
                split_position = len(y_te) // 3
                final_y_te_bot_ind_list.append(y_te_rank[:split_position])
                final_y_te_top_ind_list.append(y_te_rank[-split_position:])
                y_te_class = np.array(y_te_class)
                final_y_te_class_bot_list.append(
                    y_te_class[y_te_rank[:split_position]])
                final_y_te_class_top_list.append(
                    y_te_class[y_te_rank[-split_position:]])

                if len(final_X_val) == 0:
                    final_X_val = copy(X_val)
                else:
                    final_X_val = np.concatenate((final_X_val, X_val), axis=0)
                if len(final_y_val) == 0:
                    final_y_val = copy(y_val)
                else:
                    final_y_val = np.concatenate((final_y_val, y_val), axis=0)

                final_train_X_embedding+=train_X_id_embedding
                final_test_X_embedding+=test_X_id_embedding
                final_val_X_embedding+=val_X_id_embedding

                # update metal index
                i+=1
            print('Dataset statistic: #examples')
            print('Train:', len(final_X_tr), len(final_y_tr), len(final_train_X_embedding))
            print(np.max(final_X_tr), np.min(final_X_tr), np.max(final_y_tr), np.min(final_y_tr))
            print('Validation:', len(final_X_val), len(final_y_val), len(final_val_X_embedding))
            print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))
            # begin to train the model
            input_dim = final_X_tr.shape[-1]
            window_size = self.lag
            case_number = len(ground_truths_list)
            start = time.time()
            trainer = Trainer(input_dim, hidden_state, window_size, lrate,
                    drop_out, case_number, attention_size,
                    embedding_size,
                    drop_out_mc,repeat_mc,
                    final_X_tr, final_y_tr,
                    final_X_te, final_y_te,
                    final_X_val, final_y_val,
                    final_train_X_embedding,
                    final_test_X_embedding,
                    final_val_X_embedding,
                    final_y_te_class_list,
                    final_y_te_class_top_list,
                    final_y_te_class_bot_list,
                    final_y_te_top_ind_list,
                    final_y_te_bot_ind_list,
                    self.mc
                    )
            end = time.time()
            print("pre-processing time: {}".format(end-start))
            print("the split date is {}".format(split_dates[1]))
            net=trainer.train_minibatch(num_epochs, batch_size, interval, self.lag, self.version, self.horizon, split_dates, method)
Example #7
    def test(self,split = 0.9,
        num_epochs=50,
        drop_out=0.0,
        drop_out_mc = 0.0,
        repeat_mc = 10,
        embedding_size=5,
        batch_size=512,
        hidden_state=50,
        lrate=0.001,
        attention_size=2,
        interval=1,
        lambd=0,
        save_loss=0,
        save_prediction=0,
        method = ""):
        sys.path[0] = os.curdir
        print(sys.path)
        print("begin to test")

        #assert that the configuration path is correct
        self.path = gn.generate_config_path(self.version)

        #retrieve column list based on configuration path
        time_series,LME_dates,config_length = gn.read_data_with_specified_columns(self.source,self.path,"2003-11-12")

        #begin to split the train data
        for date in self.date.split(","):
            torch.manual_seed(1)
            np.random.seed(1)
            random.seed(1)
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time,train_time,evalidate_date = gn.get_relevant_dates(today,length,"test")
            split_dates  =  [train_time,evalidate_date,str(today)]

            #generate the version			
            version_params=generate_version_params(self.version)
            print("the test date is {}".format(split_dates[1]))
            norm_volume = "v1"
            norm_3m_spread = "v1"
            norm_ex = "v1"
            len_ma = 5
            len_update = 30
            tol = 1e-7
            norm_params = {'vol_norm':norm_volume,'ex_spread_norm':norm_ex,'spot_spread_norm':norm_3m_spread,
                    'len_ma':len_ma,'len_update':len_update,'both':3,'strength':0.01,'xgboost':False}
            tech_params = {'strength':0.01,'both':3,'Win_VSD':[10,20,30,40,50,60],'Win_EMA':12,'Win_Bollinger':22,
                            'Fast':12,'Slow':26,'Win_NATR':10,'Win_VBM':22,'acc_initial':0.02,'acc_maximum':0.2,"live":None}
            
            #for versions that tune over 6 metals 
            final_X_tr = []
            final_y_tr = []
            final_X_val = []
            final_y_val = []
            final_X_te = []
            final_y_te = []
            final_y_te_class_list = []
            final_y_te_class_top_list = []
            final_y_te_top_ind_list = []
            final_y_te_class_bot_list = []
            final_y_te_bot_ind_list = []
            final_train_X_embedding = []
            final_test_X_embedding = []
            final_val_X_embedding = []
            spot_list = []
            i = 0
            #toggle metal id
            metal_id = False
            ground_truths_list = ["LME_Cu_Spot","LME_Al_Spot","LME_Ni_Spot","LME_Xi_Spot","LME_Zn_Spot","LME_Pb_Spot"]
            for ground_truth in ground_truths_list:
                print(ground_truth)
                new_time_series = copy(time_series)
                ts = new_time_series.loc[start_time:split_dates[2]]

                #load data for use
                X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(ts,LME_dates,self.horizon,[ground_truth],self.lag,copy(split_dates),version_params,metal_id_bool = metal_id,reshape = False,live = True)
                        
                # split validation
                X_ta = X_tr[:int(len(X_tr) * split), :, :]
                y_ta = y_tr[:int(len(y_tr) * split),0]

                X_val = X_tr[int(len(X_tr) * split):, :, :]
                y_val = y_tr[int(len(y_tr) * split):,0]

                X_te = X_va
                y_te = y_va[:,0]
                spot_list = np.concatenate([spot_list,y_va[:,1]],axis = 0) if len(spot_list) > 0 else y_va[:,1]
                
                # generate metal id for embedding lookup
                train_X_id_embedding = [i]*len(X_ta)
                val_X_id_embedding = [i]*len(X_val)
                test_X_id_embedding = [i]*len(X_te)

                if len(final_X_tr) == 0:
                    final_X_tr = copy(X_ta)
                else:
                    final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
                if len(final_y_tr) == 0:
                    final_y_tr = copy(y_ta)
                else:
                    final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)

                if len(final_X_te) == 0:
                    final_X_te = copy(X_te)
                else:
                    final_X_te = np.concatenate((final_X_te, X_te), axis=0)
                if len(final_y_te) == 0:
                    final_y_te = copy(y_te)
                else:
                    final_y_te = np.concatenate((final_y_te, y_te), axis=0)

                y_te_rank = np.argsort(y_te)
                y_te_class = list(y_te)
                final_y_te_class_list.append(y_te_class)
                split_position = len(y_te) // 3
                final_y_te_bot_ind_list.append(y_te_rank[:split_position])
                final_y_te_top_ind_list.append(y_te_rank[-split_position:])
                y_te_class = np.array(y_te_class)
                final_y_te_class_bot_list.append(
                    y_te_class[y_te_rank[:split_position]])
                final_y_te_class_top_list.append(
                    y_te_class[y_te_rank[-split_position:]])

                if len(final_X_val) == 0:
                    final_X_val = copy(X_val)
                else:
                    final_X_val = np.concatenate((final_X_val, X_val), axis=0)
                if len(final_y_val) == 0:
                    final_y_val = copy(y_val)
                else:
                    final_y_val = np.concatenate((final_y_val, y_val), axis=0)

                final_train_X_embedding+=train_X_id_embedding
                final_test_X_embedding+=test_X_id_embedding
                final_val_X_embedding+=val_X_id_embedding

                i+=1
            
            print('Dataset statistic: #examples')
            print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))
            # begin to train the model
            input_dim = final_X_tr.shape[-1]
            window_size = self.lag
            case_number = len(ground_truths_list)
            # begin to predict
            start = time.time()
            test_loss_list = []
            test_X = torch.from_numpy(final_X_te).float()
            test_Y = torch.from_numpy(final_y_te).float()
            var_x_test_id = torch.LongTensor(np.array(final_test_X_embedding))

            if self.mc:
                net = torch.load(os.path.join('result','model','alstm',self.version+"_"+method,split_dates[1]+"_"+str(self.horizon)+"_"+str(drop_out)+"_"+str(hidden_state)+"_"+str(embedding_size)+"_"+str(self.lag)+"_"+str(drop_out_mc)+"_"+str(repeat_mc)+"_"+self.version+"_"+'alstm.pkl'))
                final_test_output = [[],[],[],[],[],[]]
                for i in range(len(test_X)//6):
                    clone_test_X = test_X.clone()[i::(len(test_X)//6)]
                    clone_var_x_test_id = var_x_test_id.clone()[i::(len(test_X)//6)]

                    for rep in range(repeat_mc):
                        if rep == 0:
                            test_output = net(clone_test_X, clone_var_x_test_id).detach().numpy()
                        else:
                            test_output = np.append(test_output,net(clone_test_X, clone_var_x_test_id).detach().numpy(),axis = 1)
                    final_test_output[0].append(test_output[0].tolist())
                    final_test_output[1].append(test_output[1].tolist())
                    final_test_output[2].append(test_output[2].tolist())
                    final_test_output[3].append(test_output[3].tolist())
                    final_test_output[4].append(test_output[4].tolist())
                    final_test_output[5].append(test_output[5].tolist())
                final_test_output = np.array(final_test_output[0] + final_test_output[1] + final_test_output[2] + final_test_output[3] + final_test_output[4] + final_test_output[5])
                standard_dev = final_test_output.std(axis = 1)
                test_output = final_test_output.sum(axis = 1)/repeat_mc
                print(len(standard_dev),len(test_output))
            else:
                net = torch.load(os.path.join('result','model','alstm',self.version+"_"+method,split_dates[1]+"_"+str(self.horizon)+"_"+str(drop_out)+"_"+str(hidden_state)+"_"+str(embedding_size)+"_"+str(self.lag)+"_"+self.version+"_"+'alstm.pkl'))
                net.eval()
                test_output = net(test_X, var_x_test_id).detach().view(-1,)
            current_test_pred = list((1+test_output) * spot_list)
            pred_length = int(len(current_test_pred)/6)
            for num,gt in enumerate(ground_truths_list):
                final_list = pd.DataFrame(current_test_pred[num*pred_length:(num+1)*pred_length],index = val_dates, columns = ["Prediction"])
                if self.mc:
                    #standard_dev only exists when the MC dropout branch ran
                    sd_list = pd.DataFrame(standard_dev[num*pred_length:(num+1)*pred_length],index = val_dates, columns = ["uncertainty"])
                    pred_path = os.path.join(os.getcwd(),"result","prediction","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version,str(self.mc)])+".csv")
                    sd_path = os.path.join(os.getcwd(),"result","uncertainty","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version,str(self.mc)])+".csv")
                    sd_list.to_csv(sd_path)
                else:
                    pred_path = os.path.join(os.getcwd(),"result","prediction","alstm",self.version+"_"+method,"_".join([gt,date,str(self.horizon),self.version])+".csv")
                final_list.to_csv(pred_path)
            end = time.time()
            print("predict time: {}".format(end-start))
    def test(self):
        print("begin to test")

        pure_LogReg = LogReg(parameters={})

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the columns that are required
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            #generate list of dates for today's model training period
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "test")
            split_dates = [train_time, evalidate_date, str(today)]

            if gn.even_version(self.version):
                model = pure_LogReg.load(self.version, "LME_All_Spot",
                                         self.horizon, self.lag,
                                         evalidate_date)
            else:
                model = pure_LogReg.load(self.version, self.gt, self.horizon,
                                         self.lag, evalidate_date)

            #generate the version
            version_params = generate_version_params(self.version)

            metal_id = False
            if gn.even_version(self.version):
                metal_id = True

            #extract copy of data to process
            ts = copy(time_series.loc[start_time:split_dates[2]])

            #load data for use
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon, [self.gt],
                self.lag,
                copy(split_dates),
                version_params,
                metal_id_bool=metal_id,
                live=True)

            prob = model.predict(final_X_va)
            probability = model.predict_proba(final_X_va)
            np.savetxt(
                os.path.join(
                    "result", "probability", "logistic", "_".join([
                        self.gt + str(self.horizon), date, "lr", self.version,
                        "probability.txt"
                    ])), probability)
            final_dataframe = pd.DataFrame(prob,
                                           columns=['prediction'],
                                           index=val_dates)
            final_dataframe.to_csv(
                os.path.join(
                    "result", "prediction", "logistic",
                    "_".join([self.gt, date,
                              str(self.horizon), self.version]) + ".csv"))
    def tune(self, max_iter):
        print("begin to tune")

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the columns that are required
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        #generate list of list of dates for rolling window
        today = self.date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, end_time = gn.get_relevant_dates(today, length, "tune")
        split_dates = gn.rolling_half_year(start_time, end_time, length)

        #generate the version parameters (parameters that control the preprocess)
        version_params = generate_version_params(self.version)

        #prepare holder for results
        ans = {"C": []}

        #loop over each half year
        for s, split_date in enumerate(split_dates):

            print("the train date is {}".format(split_date[1]))
            print("the test date is {}".format(split_date[2]))

            #toggle metal id
            metal_id = False
            ground_truth_list = [self.gt]
            if gn.even_version(self.version):
                metal_id = True
                ground_truth_list = [
                    "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot", "LME_Xi_Spot",
                    "LME_Zn_Spot", "LME_Pb_Spot"
                ]

            #extract copy of data to process
            ts = copy(time_series.loc[split_date[0]:split_date[-1]])
            tvt_date = split_date[1:-1]

            #prepare data according to model type and version parameters
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon,
                ground_truth_list,
                self.lag,
                copy(tvt_date),
                version_params,
                metal_id_bool=metal_id)

            #generate hyperparameters instances
            if self.horizon == 1:
                C_list = [0.01, 0.1, 1.0, 10.0, 100.0]
            elif self.horizon == 3:
                C_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
            elif self.horizon == 5:
                C_list = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
            elif self.horizon == 10:
                C_list = [0.001, 0.01, 0.1, 1.0, 10.0]
            elif self.horizon == 20:
                C_list = [0.001, 0.01, 0.1, 1.0, 10.0]
            elif self.horizon == 60:
                C_list = [1.0, 10.0, 100.0, 1000.0]
            # if self.horizon <=5:
            #     if self.version == "v23":
            #         C_list = [0.01,0.1,1.0,10.0,100.0,1000.0]
            #     else:
            #         C_list = [0.001,0.01,0.1,1.0,10.0,100.0]
            # else:
            #     if self.version == "v24":
            #         C_list = [0.1,1.0,10.0,100.0,1000.0,10000.0]
            #     else:
            #         C_list = [1e-5,0.0001,0.001,0.01,0.1,1.0,10.0]

            #generate model results for each hyperparameter instance for each half year
            for C in C_list:
                if C not in ans['C']:
                    ans["C"].append(C)
                if split_date[2] + "_acc" not in ans.keys():
                    ans[split_date[2] + "_acc"] = []
                    ans[split_date[2] + "_pos_f1_score"] = []
                    ans[split_date[2] + "_neg_f1_score"] = []
                    ans[split_date[2] + "_f1_score"] = []
                    ans[split_date[2] + "_length"] = []

                pure_LogReg = LogReg(parameters={})
                parameters = {
                    "penalty": "l2",
                    "C": C,
                    "solver": "lbfgs",
                    "tol": 1e-7,
                    "max_iter": 6 * 4 * config_length * max_iter,
                    "verbose": 0,
                    "warm_start": False,
                    "n_jobs": -1
                }
                pure_LogReg.train(final_X_tr, final_y_tr.flatten(), parameters)
                pred = pure_LogReg.predict(final_X_va)
                y_label = pd.DataFrame(final_y_va.flatten(),
                                       columns=["prediction"],
                                       index=val_dates)
                y_pred = pd.DataFrame(pred,
                                      columns=["prediction"],
                                      index=val_dates)
                acc = accuracy_score(y_label, y_pred)
                pos_f1 = f1_score(y_label, y_pred)
                y_label = 1 * (y_label == 0.0)
                y_pred = 1 * (y_pred == 0.0)
                neg_f1 = f1_score(y_label, y_pred)
                f1 = (pos_f1 + neg_f1) / 2
                ans[split_date[2] + "_acc"].append(acc)
                ans[split_date[2] + "_pos_f1_score"].append(pos_f1)
                ans[split_date[2] + "_neg_f1_score"].append(neg_f1)
                ans[split_date[2] + "_f1_score"].append(f1)
                ans[split_date[2] + "_length"].append(len(
                    final_y_va.flatten()))

        ans = pd.DataFrame(ans)
        ave_acc = None
        length = None

        #generate total average across all half years
        for col in ans.columns.values.tolist():
            if "_acc" in col:
                if ave_acc is None:
                    ave_acc = ans.loc[:, col] * ans.loc[:, col[:-3] + "length"]
                    ave_f1 = ans.loc[:, col[:-3] + "f1_score"] * ans.loc[:, col[:-3] + "length"]
                    length = ans.loc[:, col[:-3] + "length"]
                else:
                    ave_acc = ave_acc + ans.loc[:, col] * ans.loc[:, col[:-3] + "length"]
                    ave_f1 = ave_f1 + ans.loc[:, col[:-3] + "f1_score"] * ans.loc[:, col[:-3] + "length"]
                    length = length + ans.loc[:, col[:-3] + "length"]
        ave_acc = ave_acc / length
        ave_f1 = ave_f1 / length
        ans = pd.concat([ans,
                         pd.DataFrame({
                             "average accuracy": ave_acc,
                             "average_f1": ave_f1
                         })], axis=1)

        #store results in csv
        pd.DataFrame(ans).to_csv(os.path.join(os.getcwd(),'result','validation','logistic',\
                                                        "_".join(["log_reg",self.gt,self.version,str(self.lag),str(self.horizon)+".csv"])))
Example #10
    def test(self):
        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the columns that are required
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            #generate list of dates for today's model training period
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "test")
            split_dates = [train_time, evalidate_date, str(today)]

            #generate the version
            version_params = generate_version_params(self.version)

            metal_id = False
            if gn.even_version(self.version):
                metal_id = True

            #extract copy of data to process
            ts = copy(time_series.loc[start_time:split_dates[2]])

            #load data for use
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon, [self.gt],
                self.lag,
                copy(split_dates),
                version_params,
                metal_id_bool=metal_id,
                live=True)

            train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
            train_X = train_dataframe.loc[:, column_lag_list]
            train_y = pd.DataFrame(final_y_tr, columns=['prediction'])
            test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
            test_X = test_dataframe.loc[:, column_lag_list]
            n_splits = 10
            pos = sum(train_y.values)[0]
            from sklearn.metrics import accuracy_score
            model = xgb.XGBClassifier(
                n_estimators=500,
                silent=True,
                nthread=10,
                colsample_bytree=0.7,
                colsample_bylevel=1,
                reg_alpha=0.0001,
                reg_lambda=1,
                scale_pos_weight=(len(train_y.values) - pos) / pos,
                seed=1440,
                missing=None)
            folds = KFold(n_splits=n_splits)
            fold_predictions = []
            #load the model saved for each fold and collect its test-set probabilities
            for fold_n, (train_index,
                         valid_index) in enumerate(folds.split(train_X)):
                if not gn.even_version(self.version):
                    model = pickle.load(
                        open(
                            os.path.join(
                                "result", "model", "xgboost", split_dates[1] +
                                "_" + self.gt + "_" + str(self.horizon) + "_" +
                                str(self.lag) + "_" + str(fold_n) + "_" +
                                self.version + "_" + 'xgb.model'), "rb"))
                else:
                    model = pickle.load(
                        open(
                            os.path.join(
                                "result", "model", "xgboost", split_dates[1] +
                                "_LME_All_Spot_" + str(self.horizon) + "_" +
                                str(self.lag) + "_" + str(fold_n) + "_" +
                                self.version + "_" + 'xgb.model'), "rb"))
                y_pred = model.predict_proba(
                    test_X, ntree_limit=model.best_ntree_limit)[:, 1]
                fold_predictions.append(y_pred.reshape(-1, 1))

            #stack the per-fold probabilities column-wise for voting
            result = np.concatenate(fold_predictions, axis=1)
            np.savetxt(
                os.path.join(
                    "result", "probability", "xgboost",
                    self.gt + "_h" + str(self.horizon) + "_" + date +
                    "_xgboost" + self.version + ".txt"), result)
            final_list = []
            for j in range(len(result)):
                count_1 = 0
                count_0 = 0
                for item in result[j]:
                    if item > 0.5:
                        count_1 += 1
                    else:
                        count_0 += 1
                if count_1 > count_0:
                    final_list.append(1)
                else:
                    final_list.append(0)
            print("the all folder voting precision is {}".format(
                metrics.accuracy_score(final_y_va, final_list)))
            final_list = pd.DataFrame(final_list,
                                      index=val_dates,
                                      columns=["prediction"])
            final_list.to_csv(
                os.path.join(
                    os.getcwd(), "result", "prediction", "xgboost",
                    "_".join([self.gt, date,
                              str(self.horizon), self.version]) + ".csv"))
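The loop above takes a hard majority vote over the 10 per-fold probabilities. An alternative worth noting is soft voting, which averages the probabilities first and thresholds once; a sketch reusing the result array built above:

mean_prob = result.mean(axis=1)
soft_vote = (mean_prob > 0.5).astype(int)  #soft-voting alternative to the hard count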
Example #11
    def train(self, C=0.01, tol=1e-7, max_iter=100):
        print("begin to train")

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the columns that are required
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            #generate list of dates for today's model training period
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "train")
            split_dates = [train_time, evalidate_date, str(today)]

            #generate the version
            version_params = generate_version_params(self.version)

            print("the train date is {}".format(split_dates[0]))
            print("the test date is {}".format(split_dates[1]))

            #toggle metal id
            metal_id = False
            ground_truth_list = [self.gt]
            if gn.even_version(self.version):
                metal_id = True
                ground_truth_list = [
                    "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot", "LME_Xi_Spot",
                    "LME_Zn_Spot", "LME_Pb_Spot"
                ]

            #extract copy of data to process
            ts = copy(time_series.loc[start_time:split_dates[2]])

            #load data for use
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon,
                ground_truth_list,
                self.lag,
                copy(split_dates),
                version_params,
                metal_id_bool=metal_id)

            pure_LogReg = LogReg(parameters={})
            parameters = {
                "penalty": "l2",
                "C": C,
                "solver": "lbfgs",
                "tol": tol,
                "max_iter": 6 * 4 * config_length * max_iter,
                "verbose": 0,
                "warm_start": False,
                "n_jobs": -1
            }
            pure_LogReg.train(final_X_tr, final_y_tr.flatten(), parameters)
            pure_LogReg.save(self.version, self.gt, self.horizon, self.lag,
                             evalidate_date)
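The saved model is recovered in the matching test routine via pure_LogReg.load with the same arguments. A hypothetical round-trip sketch using the load signature shown in the earlier test example (self.* attributes assumed to be set as above):

model = pure_LogReg.load(self.version, self.gt, self.horizon, self.lag, evalidate_date)
pred = model.predict(final_X_va)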
Example #12
    def train(self, max_depth, learning_rate, gamma, min_child_weight,
              subsample):

        print("begin to train")
        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the columns that are required
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            #generate list of dates for today's model training period
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "train")
            split_dates = [train_time, evalidate_date, str(today)]

            #generate the version
            version_params = generate_version_params(self.version)

            print("the train date is {}".format(split_dates[0]))
            print("the test date is {}".format(split_dates[1]))

            #toggle metal id
            metal_id = False
            ground_truth_list = [self.gt]
            if gn.even_version(self.version):
                metal_id = True
                ground_truth_list = [
                    "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot", "LME_Xi_Spot",
                    "LME_Zn_Spot", "LME_Pb_Spot"
                ]

            #extract copy of data to process
            ts = copy(time_series.loc[start_time:split_dates[2]])

            #load data for use
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon,
                ground_truth_list,
                self.lag,
                copy(split_dates),
                version_params,
                metal_id_bool=metal_id)

            train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
            train_X = train_dataframe.loc[:, column_lag_list]
            train_y = pd.DataFrame(final_y_tr, columns=['result'])

            test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
            test_X = test_dataframe.loc[:, column_lag_list]
            n_splits = 10
            pos = sum(train_y.values)[0]
            from sklearn.metrics import accuracy_score
            model = xgb.XGBClassifier(
                max_depth=max_depth,
                learning_rate=learning_rate,
                n_estimators=500,
                silent=True,
                nthread=10,
                gamma=gamma,
                min_child_weight=min_child_weight,
                subsample=subsample,
                colsample_bytree=0.7,
                colsample_bylevel=1,
                reg_alpha=0.0001,
                reg_lambda=1,
                scale_pos_weight=(len(train_y.values) - pos) / pos,
                seed=1440,
                missing=None)
            folds = KFold(n_splits=n_splits)
            fold_predictions = []

            #save the model
            for fold_n, (train_index,
                         valid_index) in enumerate(folds.split(train_X)):
                X_train, X_valid = train_X[column_lag_list].iloc[
                    train_index], train_X[column_lag_list].iloc[valid_index]
                y_train, y_valid = train_y.iloc[train_index], train_y.iloc[
                    valid_index]
                model.fit(X_train,
                          y_train,
                          eval_metric='error',
                          verbose=True,
                          eval_set=[(X_valid, y_valid)],
                          early_stopping_rounds=5)
                #persist this fold's model (file handle closed via context manager)
                with open(
                        os.path.join(
                            'result', 'model', 'xgboost', split_dates[1] +
                            "_" + self.gt + "_" + str(self.horizon) + "_" +
                            str(self.lag) + "_" + str(fold_n) + "_" +
                            self.version + "_" + 'xgb.model'), "wb") as f:
                    pickle.dump(model, f)
                #collect this fold's test-set probabilities as one column
                y_pred = model.predict_proba(
                    test_X, ntree_limit=model.best_ntree_limit)[:, 1]
                fold_preds.append(y_pred.reshape(-1, 1))
            #calculate the all folder voting
            result = np.concatenate(fold_preds, axis=1)
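The example is cut off here, right after the per-fold probability columns are stacked. A minimal sketch of the majority-vote step that plausibly follows, mirroring the tie-goes-to-0 rule used in the tune method below:

#hedged sketch: majority vote over the stacked fold probabilities; this
#assumes the truncated code votes the same way as tune() further down
votes = (result > 0.5).sum(axis=1)
final_list = (votes > result.shape[1] / 2).astype(int)
print("the all folder voting precision is {}".format(
    metrics.accuracy_score(final_y_va, final_list)))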
Exemple #13
    def tune(self):
        print("begin to choose the parameter")

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the required columns
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        #generate list of list of dates for rolling window
        today = self.date
        length = 5
        if gn.even_version(self.version) and self.horizon > 5:
            length = 4
        start_time, end_time = gn.get_relevant_dates(today, length, "tune")
        split_dates = gn.rolling_half_year(start_time, end_time, length)

        #generate the version parameters (parameters that control the preprocess)
        version_params = generate_version_params(self.version)

        #majority vote across fold probability columns (a tie counts as class 0)
        def vote(fold_columns):
            result = np.concatenate(fold_columns, axis=1)
            return ((result > 0.5).sum(axis=1) > result.shape[1] / 2).astype(int)

        #loop over each half year
        for s, split_date in enumerate(split_dates):

            print("the train date is {}".format(split_date[1]))
            print("the test date is {}".format(split_date[2]))

            #toggle metal id
            metal_id = False
            ground_truth_list = [self.gt]
            if gn.even_version(self.version):
                metal_id = True
                ground_truth_list = [
                    "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot", "LME_Xi_Spot",
                    "LME_Zn_Spot", "LME_Pb_Spot"
                ]

            #extract copy of data to process
            ts = copy(time_series.loc[split_date[0]:split_date[-1]])
            tvt_date = split_date[1:-1]

            #prepare data according to model type and version parameters
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon,
                ground_truth_list,
                self.lag,
                copy(tvt_date),
                version_params,
                metal_id_bool=metal_id)

            train_dataframe = pd.DataFrame(final_X_tr, columns=column_lag_list)
            train_X = train_dataframe.loc[:, column_lag_list]
            train_y = pd.DataFrame(final_y_tr, columns=['result'])

            test_dataframe = pd.DataFrame(final_X_va, columns=column_lag_list)
            test_X = test_dataframe.loc[:, column_lag_list]
            n_splits = 10

            #tune xgboost hyper parameter with grid search
            for max_depth in [3, 4, 5]:
                for learning_rate in [0.6, 0.7, 0.8, 0.9]:
                    for gamma in [0.6, 0.7, 0.8, 0.9]:
                        for min_child_weight in [3, 4, 5, 6]:
                            for subsample in [0.6, 0.7, 0.85, 0.9]:
                                model = xgb.XGBClassifier(
                                    max_depth=max_depth,
                                    learning_rate=learning_rate,
                                    n_estimators=500,
                                    silent=True,
                                    nthread=10,
                                    gamma=gamma,
                                    min_child_weight=min_child_weight,
                                    subsample=subsample,
                                    colsample_bytree=0.7,
                                    colsample_bylevel=1,
                                    reg_alpha=0.0001,
                                    reg_lambda=1,
                                    scale_pos_weight=1,
                                    seed=1440,
                                    missing=None)
                                folds = KFold(n_splits=n_splits)
                                #holder for each fold's test-set probability column
                                fold_preds = []

                                #generate k fold and train xgboost model
                                for fold_n, (train_index, valid_index) in enumerate(
                                        folds.split(train_X)):
                                    X_train = train_X[column_lag_list].iloc[train_index]
                                    X_valid = train_X[column_lag_list].iloc[valid_index]
                                    y_train = train_y.iloc[train_index]
                                    y_valid = train_y.iloc[valid_index]
                                    model.fit(X_train,
                                              y_train,
                                              eval_metric='error',
                                              verbose=True,
                                              eval_set=[(X_valid, y_valid)],
                                              early_stopping_rounds=5)
                                    #collect this fold's test-set probabilities
                                    y_pred = model.predict_proba(
                                        test_X,
                                        ntree_limit=model.best_ntree_limit)[:, 1]
                                    fold_preds.append(y_pred.reshape(-1, 1))

                                #calculate the all folder voting
                                print("the all folder voting precision is {}".
                                      format(
                                          metrics.accuracy_score(
                                              final_y_va, vote(fold_preds))))

                                #calculate the near folder voting (five most
                                #recent folds)
                                print("the near precision is {}".format(
                                    metrics.accuracy_score(
                                        final_y_va, vote(fold_preds[5:]))))

                                #calculate the far folder voting (five earliest
                                #folds)
                                print("the far precision is {}".format(
                                    metrics.accuracy_score(
                                        final_y_va, vote(fold_preds[:5]))))

                                #calculate the same and reverse folder voting;
                                #which alternating folds count as "same" depends
                                #on whether the window starts in January or July
                                if split_date[1].split("-")[1] == '01':
                                    same_folds = fold_preds[0::2]
                                    reverse_folds = fold_preds[1::2]
                                else:
                                    same_folds = fold_preds[1::2]
                                    reverse_folds = fold_preds[0::2]
                                print("the same precision is {}".format(
                                    metrics.accuracy_score(
                                        final_y_va, vote(same_folds))))
                                print("the reverse precision is {}".format(
                                    metrics.accuracy_score(
                                        final_y_va, vote(reverse_folds))))
                                print("the max_depth is {}".format(max_depth))
                                print("the learning_rate is {}".format(
                                    learning_rate))
                                print("the gamma is {}".format(gamma))
                                print("the min_child_weight is {}".format(
                                    min_child_weight))
                                print("the subsample is {}".format(subsample))
            print("the lag is {}".format(self.lag))
            print("the train date is {}".format(split_date[0]))
            print("the test date is {}".format(split_date[1]))
            print("the length is {}".format(len(test_X)))
Exemple #14
    def test(self,
             split=0.9,
             num_epochs=50,
             drop_out=0.0,
             embedding_size=5,
             batch_size=512,
             hidden_state=50,
             lrate=0.001,
             attention_size=2,
             interval=1,
             lambd=0,
             save_loss=0,
             save_prediction=0,
             method=""):
        sys.path[0] = os.curdir
        print(sys.path)
        print("begin to test")

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database, using the configuration file to determine the required columns
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            torch.manual_seed(1)
            np.random.seed(1)
            random.seed(1)

            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "test")
            split_dates = [train_time, evalidate_date, str(today)]

            

            #generate the version parameters
            version_params = generate_version_params(self.version)
            print("the test date is {}".format(split_dates[1]))
            norm_volume = "v1"
            norm_3m_spread = "v1"
            norm_ex = "v1"
            len_ma = 5
            len_update = 30
            tol = 1e-7
            norm_params = {
                'vol_norm': norm_volume, 'ex_spread_norm': norm_ex,
                'spot_spread_norm': norm_3m_spread, 'len_ma': len_ma,
                'len_update': len_update, 'both': 3, 'strength': 0.01,
                'xgboost': False
            }
            tech_params = {
                'strength': 0.01, 'both': 3, 'Win_VSD': [10, 20, 30, 40, 50, 60],
                'Win_EMA': 12, 'Win_Bollinger': 22, 'Fast': 12, 'Slow': 26,
                'Win_NATR': 10, 'Win_VBM': 22, 'acc_initial': 0.02,
                'acc_maximum': 0.2, 'live': None
            }
            #threshold for mapping predicted returns to up/down classes;
            #assumed to be 0.0 since it is not defined in the original snippet
            thresh = 0.0
            #for versions that tune over 6 metals 
            final_X_tr = []
            final_y_tr = []
            final_X_val = []
            final_y_val = []
            final_X_te = []
            final_y_te = []
            final_y_te_class_list = []
            final_y_te_class_top_list = []
            final_y_te_top_ind_list = []
            final_y_te_class_bot_list = []
            final_y_te_bot_ind_list = []
            final_train_X_embedding = []
            final_test_X_embedding = []
            final_val_X_embedding = []

            i = 0
            #toggle metal id
            metal_id = False
            ground_truths_list = ["LME_Cu_Spot","LME_Al_Spot","LME_Ni_Spot","LME_Xi_Spot","LME_Zn_Spot","LME_Pb_Spot"]
            for ground_truth in ground_truths_list:
                new_time_series = copy(time_series)
                spot_list = np.array(new_time_series[ground_truth])
                new_time_series['spot_price'] = spot_list
                ts = new_time_series.loc[start_time:split_dates[2]]

                #load data for use
                X_tr, y_tr, X_va, y_va, val_dates, column_lag_list = gn.prepare_data(
                    ts, LME_dates, self.horizon, [ground_truth], self.lag,
                    copy(split_dates), version_params, metal_id_bool=metal_id,
                    reshape=False, live=True)
                        
                # split validation
                X_ta = X_tr[:int(len(X_tr) * split), :, :]
                y_ta = y_tr[:int(len(y_tr) * split)]

                X_val = X_tr[int(len(X_tr) * split):, :, :]
                y_val = y_tr[int(len(y_tr) * split):]
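                # the split is chronological (no shuffling): the most recent
                # slice of the training window serves as the validation set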

                X_te = X_va
                y_te = y_va

                # generate metal id for embedding lookup
                train_X_id_embedding = [i]*len(X_ta)
                val_X_id_embedding = [i]*len(X_val)
                test_X_id_embedding = [i]*len(X_te)

                if len(final_X_tr) == 0:
                    final_X_tr = copy(X_ta)
                else:
                    final_X_tr = np.concatenate((final_X_tr, X_ta), axis=0)
                if len(final_y_tr) == 0:
                    final_y_tr = copy(y_ta)
                else:
                    final_y_tr = np.concatenate((final_y_tr, y_ta), axis=0)

                if len(final_X_te) == 0:
                    final_X_te = copy(X_te)
                else:
                    final_X_te = np.concatenate((final_X_te, X_te), axis=0)
                if len(final_y_te) == 0:
                    final_y_te = copy(y_te)
                else:
                    final_y_te = np.concatenate((final_y_te, y_te), axis=0)

                y_te_rank = np.argsort(y_te[:,0])
                y_te_class = []
                for item in y_te:
                    if item >= thresh:
                        y_te_class.append(1)
                    else:
                        y_te_class.append(0)
                final_y_te_class_list.append(y_te_class)
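                # rank the realized targets and keep the bottom/top thirds,
                # presumably to evaluate accuracy separately on the largest moves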
                split_position = len(y_te) // 3
                final_y_te_bot_ind_list.append(y_te_rank[:split_position])
                final_y_te_top_ind_list.append(y_te_rank[-split_position:])
                y_te_class = np.array(y_te_class)
                final_y_te_class_bot_list.append(
                    y_te_class[y_te_rank[:split_position]])
                final_y_te_class_top_list.append(
                    y_te_class[y_te_rank[-split_position:]])

                if len(final_X_val) == 0:
                    final_X_val = copy(X_val)
                else:
                    final_X_val = np.concatenate((final_X_val, X_val), axis=0)
                if len(final_y_val) == 0:
                    final_y_val = copy(y_val)
                else:
                    final_y_val = np.concatenate((final_y_val, y_val), axis=0)
                
                final_train_X_embedding += train_X_id_embedding
                final_test_X_embedding += test_X_id_embedding
                final_val_X_embedding += val_X_id_embedding

                # update metal index
                i += 1
            print('Dataset statistic: #examples')
            print('Testing:', len(final_X_te), len(final_y_te), len(final_test_X_embedding))
            # prepare tensors and load the trained model for prediction
            input_dim = final_X_tr.shape[-1]
            window_size = self.lag
            case_number = len(ground_truths_list)
            # begin to predict
            start = time.time()
            test_loss_list = []
            test_X = torch.from_numpy(final_X_te).float()
            test_Y = torch.from_numpy(final_y_te).float()
            var_x_test_id = torch.LongTensor(np.array(final_test_X_embedding))
            net = torch.load(
                os.path.join(
                    'result', 'model', 'alstm', self.version + "_" + method,
                    split_dates[1] + "_" + str(self.horizon) + "_" +
                    str(drop_out) + "_" + str(hidden_state) + "_" +
                    str(embedding_size) + "_" + str(self.lag) + "_" +
                    self.version + "_" + 'alstm.pkl'))
            net.eval()
            test_output = net(test_X, var_x_test_id)
            current_test_pred = list(test_output.detach().view(-1,))

            current_test_class = [1 if ele > thresh else 0 for ele in current_test_pred]
            np.savetxt(
                os.path.join(
                    'result', 'probability', 'alstm',
                    self.version + "_" + method, split_dates[1] + "_" +
                    str(self.horizon) + "_" + self.version + ".txt"),
                current_test_class)

            pred_length = len(current_test_class) // len(ground_truths_list)
            for num, gt in enumerate(ground_truths_list):
                final_list = pd.DataFrame(
                    current_test_class[num * pred_length:(num + 1) * pred_length],
                    index=val_dates,
                    columns=["Prediction"])
                final_list.to_csv(
                    os.path.join(
                        os.getcwd(), "result", "prediction", "alstm",
                        self.version + "_" + method,
                        "_".join([gt, date, str(self.horizon), self.version]) +
                        ".csv"))
            end = time.time()
            print("predict time: {}".format(end-start))
Exemple #15
    def train(self):
        print("begin to train")

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            #generate list of dates for today's model training period
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "train")
            split_dates = [train_time, evalidate_date, str(today)]

            #generate the version
            version_params = generate_version_params(self.version)

            print("the train date is {}".format(split_dates[0]))
            print("the test date is {}".format(split_dates[1]))

            #toggle metal id
            metal_id = False
            ground_truth_list = [self.gt]
            if gn.even_version(self.version):
                metal_id = True
                ground_truth_list = [
                    "LME_Cu_Spot", "LME_Al_Spot", "LME_Ni_Spot", "LME_Xi_Spot",
                    "LME_Zn_Spot", "LME_Pb_Spot"
                ]

            #extract copy of data to process
            ts = copy(time_series.loc[start_time:split_dates[2]])

            #load data for use
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon,
                ground_truth_list,
                self.lag,
                copy(split_dates),
                version_params,
                metal_id_bool=metal_id)

            LR = LinearRegression(n_jobs=-1)
            LR.fit(final_X_tr, final_y_tr[:, 0])
            #save the fitted model; versions tuned over all six metals use
            #"ALL" in place of a single ground truth in the filename
            gt_name = "ALL" if gn.even_version(self.version) else self.gt
            joblib.dump(
                LR,
                os.path.join(
                    os.getcwd(), 'result', 'model', 'linear',
                    "_".join([self.version, gt_name, str(self.horizon),
                              str(self.lag), evalidate_date]) + '.pkl'))
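Since column_lag_list names each lagged feature, the fitted weights can be sanity-checked right after training; a minimal sketch:

#hedged sketch: inspect the largest negative and positive coefficients
coef = pd.Series(LR.coef_, index=column_lag_list).sort_values()
print(coef.head(5))
print(coef.tail(5))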
Exemple #16
    def test(self):
        print("begin to test")

        #identify the configuration file for data based on version
        self.path = gn.generate_config_path(self.version)

        #read the data from the 4E or NExT database
        time_series, LME_dates, config_length = gn.read_data_with_specified_columns(
            self.source, self.path, "2003-11-12")

        for date in self.date.split(","):
            #generate list of dates for today's model training period
            today = date
            length = 5
            if gn.even_version(self.version) and self.horizon > 5:
                length = 4
            start_time, train_time, evalidate_date = gn.get_relevant_dates(
                today, length, "test")
            split_dates = [train_time, evalidate_date, str(today)]

            #load the saved model; the filename mirrors the one used in train
            gt_name = "ALL" if gn.even_version(self.version) else self.gt
            model = joblib.load(
                os.path.join(
                    os.getcwd(), 'result', 'model', 'linear',
                    "_".join([self.version, gt_name, str(self.horizon),
                              str(self.lag), evalidate_date]) + '.pkl'))

            #generate the version
            version_params = generate_version_params(self.version)

            metal_id = False
            if gn.even_version(self.version):
                metal_id = True

            #extract copy of data to process
            ts = copy(time_series.loc[start_time:split_dates[2]])

            #load data for use
            final_X_tr, final_y_tr, final_X_va, final_y_va, val_dates, column_lag_list = gn.prepare_data(
                ts,
                LME_dates,
                self.horizon, [self.gt],
                self.lag,
                copy(split_dates),
                version_params,
                metal_id_bool=metal_id,
                live=True)

            #scale the model output into the final prediction series
            prob = (1 + model.predict(final_X_va)) * final_y_va[:, 1]
            final_dataframe = pd.DataFrame(prob,
                                           columns=['prediction'],
                                           index=val_dates)
            final_dataframe.to_csv(
                os.path.join(
                    "result", "prediction", "linear", self.version,
                    "_".join([self.gt, date,
                              str(self.horizon), self.version]) + ".csv"))