def model_train_predict_test(
        self,
        input_file_regx=r"(DBID)\((\d+)\)_INSTID\([1]\).csv",  # raw string: same value, no invalid-escape warnings
        override=False):
    """Train, predict and evaluate one LSTM model per enterprise training file.

    Scans ``self.training_set_dir`` for per-enterprise CSV files matching
    *input_file_regx*, then for each file: scales the X1..Xn feature columns,
    bins the Y column into classes, splits into train/test/validation sets,
    trains (or reuses) a saved NN model, and prints the validation precision.

    :param input_file_regx: regex used to select training-set CSV files.
    :param override: if True, retrain even when the model file already exists.
    :return: None; side effects are the saved model file(s) under
             ``self.model_save_dir`` and printed precision statistics.
    :raises ValueError: if no file matches the id range, or if
            ``self.scaler`` is neither ``'mm'`` nor ``'norm'``.
    """
    # get training sets for lstm training
    print("Scanning files within select id range ...")
    print(input_file_regx)
    print(self.training_set_dir)
    ids, files = get_ids_and_files_in_dir(inputdir=self.training_set_dir,
                                          range=self.training_set_id_range,
                                          input_file_regx=input_file_regx)
    print("Scanning done! Selected enterprise ids are {}".format(ids))
    if not files:
        raise ValueError(
            "No files selected in current id range. Please check the input training set directory, "
            "input enterprise id range or file format which should be '[0-9]+.csv'"
        )
    # get train, test, validation data
    for id_index, id_file in enumerate(files):
        # store prediction result to prediction directory
        enter_file = self.training_set_dir + "/" + id_file
        print("Processing dataset - enterprise_id is: {}".format(
            ids[id_index]))
        print("Reading from file {}".format(enter_file))
        df = pd.read_csv(enter_file)
        df.index = range(len(df.index))
        # retrieve training X and Y columns. First column is customer_id
        select_col = np.append(
            ['X' + str(i) for i in range(1, 1 + self.training_set_length)],
            ['Y'])
        # .loc replaces the removed DataFrame.ix accessor (label-based here)
        df_selected = df.loc[:, select_col]
        print(df_selected)
        # scale the train columns
        print("Scaling...")
        # BUG FIX: bin_boundary was a `global` that was only assigned in the
        # 'mm' branch, so the 'norm' branch raised NameError (or reused a
        # stale value) at binning time. Initialize it locally instead;
        # assumes binning_date_y derives boundaries itself when given None
        # — TODO confirm against its definition.
        bin_boundary = None
        if self.scaler == 'mm':
            df_scale, minVal, maxVal, bin_boundary = MinMaxScaler(
                df_selected,
                start_col_index=0,
                end_col_index=self.training_set_length)
        elif self.scaler == 'norm':
            df_scale, meanVal, stdVal = NormalDistributionScaler(
                df_selected,
                start_col_index=0,
                end_col_index=self.training_set_length)
        else:
            raise ValueError("Argument scaler must be mm or norm!")
        # bin date y
        df_bin, bin_boundary = binning_date_y(df_scale,
                                              y_col=self.training_set_length,
                                              n_group=5,
                                              bin_boundary=bin_boundary)
        print("Bin boundary is {}".format(bin_boundary))
        # get train and test dataset
        print("Randomly selecting training set and test set...")
        # .iloc replaces DataFrame.ix for these positional selections
        all_data_x = np.asarray(
            df_bin.iloc[:, 0:self.training_set_length]).reshape(
                (len(df_bin.index), 1, self.training_set_length))
        all_data_y = np.asarray(df_bin.iloc[:, self.training_set_length])
        # convert y label to one-hot dummy label
        y_dummy_label = np.asarray(pd.get_dummies(all_data_y))
        # format train, test, validation data
        # NOTE(review): train_test_split returns (X_kept, X_heldout,
        # y_kept, y_heldout); val_train/val_test are the held-out X/y pair.
        sub_train, val_train, sub_test, val_test = train_test_split(
            all_data_x, y_dummy_label, test_size=self.test_size)
        train_x, test_x, train_y, test_y = train_test_split(
            sub_train, sub_test, test_size=self.test_size)
        # create and fit the NN model
        model_save_path = self.model_save_dir + "/" + self.model_file_prefix \
            + "-" + str(ids[id_index]) + ".h5"
        # check if model file exists; skip training unless overridden
        if not os.path.exists(model_save_path) or override:
            self.NN_model_train(train_x,
                                train_y,
                                test_x,
                                test_y,
                                model_save_path=model_save_path)
        # generate prediction for training
        print("Predicting the output of validation set...")
        val_predict_class, val_predict_prob = self.NN_prediction(
            val_train, model_save_path=model_save_path)
        # statistic of discrepancy between expected value and real value
        total_sample_count = len(val_predict_class)
        # one-hot rows back to integer class labels
        val_test_label = np.asarray([list(x).index(1) for x in val_test])
        match_count = (np.asarray(val_predict_class) == np.asarray(
            val_test_label.ravel())).sum()
        print("Precision using validation dataset is {}".format(
            float(match_count) / total_sample_count))  # 0.9178082191780822
def model_train_predict_test(
        self,
        dataX,
        dataY,
        end,
        lookback,
        override=False,
):
    """Scale *dataX*, bin *dataY*, train (or reuse) the NN model and score it.

    NOTE(review): this redefines ``model_train_predict_test`` and shadows the
    earlier file-based overload in the same class — only this version is
    callable at runtime. Consider renaming one of them.

    :param dataX: 3-D feature array (samples, timesteps, features) — assumed
        from the slicing pattern ``dataX[:, :, i]``; confirm with callers.
    :param dataY: 1-D target values, binned into classes before training.
    :param end: forwarded verbatim to ``NN_model_train``.
    :param lookback: forwarded verbatim to ``NN_model_train``.
    :param override: if True, retrain even when the model file exists.
    :return: ``(precision, score)`` on the held-out test set, or the sentinel
        ``(1010, [1010, 1010])`` when the data is single-class or no usable
        split is found after 10 attempts.
    :raises ValueError: if ``self.scaler`` is neither ``'mm'`` nor ``'norm'``.
    """
    # scale the train columns
    print("Scaling...")
    if self.scaler == 'mm':
        # scale the last feature channel, keeping its bin boundary aside
        copy = dataX[:, :, dataX.shape[2] - 1]
        dataX[:, :, dataX.shape[2] - 1], minVal, maxVal, _bin_boundary = \
            MinMaxScaler(copy)
        # scale every remaining feature channel; extra returns discarded
        for i in range(dataX.shape[2] - 1):
            copy = dataX[:, :, i]
            dataX[:, :, i], null1, null2, null3 = MinMaxScaler(copy)
    elif self.scaler == 'norm':
        # normal-distribution scaling intentionally disabled here
        pass
    else:
        raise ValueError("Argument scaler must be mm or norm!")
    # bin date y with fixed percentile-style boundaries
    bin_boundary = [0, 50, 75, 90]
    dataY, bin_boundary = binning_date_y(dataY,
                                         y_col=self.training_set_length,
                                         n_group=5,
                                         bin_boundary=bin_boundary)
    print("Bin boundary is {}".format(bin_boundary))
    # get train and test dataset
    print("Randomly selecting training set and test set...")
    # degenerate data: a single class cannot be split/trained — sentinel out
    if len(set(dataY)) == 1:
        return (1010, [1010, 1010])
    # convert y label to one-hot dummy label
    y_dummy_label = np.asarray(pd.get_dummies(dataY))

    def _onehot_codes(one_hot_rows):
        # collapse each one-hot row into a single integer class code
        codes = []
        for row_index in range(one_hot_rows.shape[0]):
            code = 0
            for col_index in range(one_hot_rows.shape[1]):
                code += one_hot_rows[row_index][col_index] * (col_index + 1)
            codes.append(code)
        return codes

    # resample until the training subset contains more than one class;
    # give up (sentinel) after 10 attempts
    count = 0
    while True:
        x_sub, x_test, y_sub, y_test = train_test_split(
            dataX, y_dummy_label, test_size=self.test_size)
        x_train, x_val, y_train, y_val = train_test_split(
            x_sub, y_sub, test_size=self.test_size)
        if count == 10:
            return (1010, [1010, 1010])
        # BUG FIX: count was never incremented, so the attempt cap above was
        # unreachable and a persistently single-class split looped forever.
        count += 1
        if len(set(_onehot_codes(y_train))) > 1:
            break
    # create and fit the NN model
    model_save_path = self.model_save_dir + "/" + self.model_file_prefix + ".h5"
    # check if model file exists; skip training unless overridden
    if not os.path.exists(model_save_path) or override:
        score = self.NN_model_train(x_train,
                                    y_train,
                                    x_val,
                                    y_val,
                                    model_save_path=model_save_path,
                                    end=end,
                                    lookback=lookback)
        print("Models and their parameters are stored in {}".format(
            model_save_path))
    else:
        # reuse the existing model; sentinel score marks "not retrained"
        score = [1010, 1010]
    # generate prediction for training
    print("Predicting the output of validation set...")
    val_predict_class, val_predict_prob = self.NN_prediction(
        x_test, model_save_path=model_save_path)
    # statistic of discrepancy between expected value and real value
    total_sample_count = len(val_predict_class)
    # one-hot rows back to integer class labels
    val_test_label = np.asarray([list(x).index(1) for x in y_test])
    match_count = (np.asarray(val_predict_class) == np.asarray(
        val_test_label.ravel())).sum()
    return float(match_count) / total_sample_count, score