def main_execute():
    trading_advisor = TradingAdvisor()
    intervals = [20, 60, 120]
    processor = DataProcessor(intervals)
    subset_df = processor.get_dataframe_subset(500)
    trading_advisor.calc_buy_sell(intervals, subset_df)
    processor.plot(subset_df)
    processor.db.close()
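# A minimal entry-point guard sketch: lets the module run as a script while
# staying importable. Assumes main_execute() above is the intended entry point.
if __name__ == "__main__":
    main_execute()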
def get_data(model_args,
             training_args,
             tokenizer,
             text_data_path="../data/test_dataset"):  # path changed to ../data/test_dataset
    """Get the validation data.

    Args:
        model_args: model arguments
        training_args: training arguments
        tokenizer: tokenizer
        text_data_path: Defaults to "../data/test_dataset"

    Returns:
        text_data, val_iter, val_dataset, scores
    """
    text_data = load_from_disk(text_data_path)

    # run_elasticsearch
    if "elastic" in model_args.retrieval_type:
        is_sentence_trainformer = False
        if "sentence_trainformer" in model_args.retrieval_type:
            is_sentence_trainformer = True
        # number of texts to concatenate
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_elasticsearch(text_data, concat_num, model_args,
                                              is_sentence_trainformer)
    elif model_args.retrieval_type == "dense":
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_concat_dense_retrival(text_data, concat_num)

    column_names = text_data["validation"].column_names
    data_collator = DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # tokenize the data (so it can be fed into the MRC model)
    data_processor = DataProcessor(tokenizer)
    val_text = text_data["validation"]
    val_dataset = data_processor.val_tokenzier(val_text, column_names)
    val_iter = DataLoader(val_dataset, collate_fn=data_collator, batch_size=1)

    return text_data, val_iter, val_dataset, scores
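# Why pad_to_multiple_of=8 under fp16 (as used above): half-precision Tensor
# Core kernels are fastest when padded lengths align to multiples of 8. A
# minimal sketch of the rounding the collator performs; `round_up_to_multiple`
# is illustrative, not the transformers API.
def round_up_to_multiple(length, multiple=8):
    return ((length + multiple - 1) // multiple) * multiple

assert round_up_to_multiple(37) == 40
assert round_up_to_multiple(40) == 40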
def load_rf_data(cur_path):
    data_folder = "data\\titanic"
    processed_data_folder = os.path.join(cur_path, data_folder)
    # Note: Not using test.csv as it does not provide whether or not the passenger
    # survived; therefore we cannot assess how well the model performed.
    data_file_path = os.path.join(processed_data_folder, "train.csv")
    data = DataProcessor(data_file_path, processed_data_folder)
    try:
        # Try to load data
        data.load_processed_data()
    except FileNotFoundError:
        # No data found, so process it
        # 10% test, 10% validation, 80% training samples from data
        splits = (0.1, 0.1, 0.8)
        # Only use certain columns
        use_cols = (
            # 0,   # PassengerID
            1,     # Survived
            2,     # Pclass
            # 3,   # Name
            4,     # Sex
            5,     # Age
            6,     # SibSp
            7,     # Parch
            # 8,   # Ticket
            9,     # Fare
            # 10,  # Cabin
            11,    # Embarked
        )
        # Mark features as categorical (so we can one-hot-encode them later)
        # categorical_cols = ()
        categorical_cols = (
            2,   # Pclass
            4,   # Sex
            11,  # Embarked
        )
        # Convert certain columns to float values (so we can use numpy arrays)
        converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
                      11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
        data.process_data(splits=splits,
                          use_cols=use_cols,
                          categorical_cols=categorical_cols,
                          converters=converters,
                          filter_missing=True)
    return data
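# Hedged usage sketch: derive the base path from this file's location (an
# assumption about the project layout) and load or process the Titanic data.
import os

rf_data = load_rf_data(os.path.dirname(os.path.abspath(__file__)))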
def get_data(data_args, training_args, tokenizer):
    """Return the train/validation dataloaders and datasets."""
    if data_args.dataset_name == 'basic':
        if os.path.isdir("../data/train_dataset"):
            dataset = load_from_disk("../data/train_dataset")
        else:
            raise Exception("Set the data path to 'p3-mrc-team-ikyo/data/.'")
    elif data_args.dataset_name == 'preprocessed':
        if os.path.isfile("../data/preprocess_train.pkl"):
            dataset = get_pickle("../data/preprocess_train.pkl")
        else:
            dataset = make_custom_dataset("../data/preprocess_train.pkl")
    elif data_args.dataset_name == 'concat':
        if os.path.isfile("../data/concat_train.pkl"):
            dataset = get_pickle("../data/concat_train.pkl")
        else:
            dataset = make_custom_dataset("../data/concat_train.pkl")
    elif data_args.dataset_name == 'korquad':
        if os.path.isfile("../data/korquad_train.pkl"):
            dataset = get_pickle("../data/korquad_train.pkl")
        else:
            dataset = make_custom_dataset("../data/korquad_train.pkl")
    elif data_args.dataset_name == "question_type":
        if os.path.isfile("../data/question_type.pkl"):
            dataset = get_pickle("../data/question_type.pkl")
        else:
            dataset = make_custom_dataset("../data/question_type.pkl")
    elif data_args.dataset_name == "ai_hub":
        if os.path.isfile("../data/ai_hub_dataset.pkl"):
            dataset = get_pickle("../data/ai_hub_dataset.pkl")
        else:
            dataset = make_custom_dataset("../data/ai_hub_dataset.pkl")
    elif data_args.dataset_name == "only_korquad":
        dataset = load_dataset("squad_kor_v1")
    elif data_args.dataset_name == "random_masking":
        if os.path.isfile("../data/random_mask_train.pkl"):
            dataset = get_pickle("../data/random_mask_train.pkl")
        else:
            dataset = make_custom_dataset("../data/random_mask_train.pkl")
    elif data_args.dataset_name == "token_masking":
        if os.path.isfile("../data/concat_token_mask_top_3.pkl"):
            dataset = get_pickle("../data/concat_token_mask_top_3.pkl")
        else:
            dataset = make_mask_dataset("../data/concat_token_mask_top_3.pkl", tokenizer)
        train_dataset = dataset['train']
        val_dataset = dataset['validation']
    else:
        raise Exception(
            "dataset_name has to be one of ['basic', 'preprocessed', 'concat', 'korquad', "
            "'only_korquad', 'question_type', 'ai_hub', 'random_masking', 'token_masking']"
        )

    if data_args.dataset_name != "token_masking":
        train_dataset = dataset['train']
        val_dataset = dataset['validation']

    train_column_names = train_dataset.column_names
    val_column_names = val_dataset.column_names

    data_processor = DataProcessor(tokenizer, data_args.max_seq_length, data_args.doc_stride)
    train_dataset = data_processor.train_tokenizer(train_dataset, train_column_names)
    val_dataset = data_processor.val_tokenzier(val_dataset, val_column_names)

    data_collator = DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
    train_iter = DataLoader(train_dataset,
                            collate_fn=data_collator,
                            batch_size=training_args.per_device_train_batch_size)
    val_iter = DataLoader(val_dataset,
                          collate_fn=data_collator,
                          batch_size=training_args.per_device_eval_batch_size)

    return dataset, train_iter, val_iter, train_dataset, val_dataset
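# A hedged refactor sketch (not part of the project): the repeated
# "load the pickle if it exists, otherwise build it" branches above could be
# driven by a name -> path table. `DATASET_PATHS` and `load_pickled_dataset`
# are hypothetical names; get_pickle/make_custom_dataset are the module's own
# helpers.
DATASET_PATHS = {
    'preprocessed': "../data/preprocess_train.pkl",
    'concat': "../data/concat_train.pkl",
    'korquad': "../data/korquad_train.pkl",
    'question_type': "../data/question_type.pkl",
    'ai_hub': "../data/ai_hub_dataset.pkl",
    'random_masking': "../data/random_mask_train.pkl",
}

def load_pickled_dataset(name):
    path = DATASET_PATHS[name]
    return get_pickle(path) if os.path.isfile(path) else make_custom_dataset(path)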
def run_gru(s):
    s.print_settings()

    prob = tf.placeholder_with_default(1.0, shape=())  # Retain probability for TF dropout

    start_time = timeit.default_timer()

    if s.implementation == "keras":
        if s.use_binary:
            raise Exception("Binary Keras not implemented")
        input = Input(shape=(1, s.x_dims))
        dense1 = Dense(s.nodes, activation='sigmoid')(input)
        dense2 = Dense(s.nodes, activation='sigmoid')(input)
        dense3 = Dense(s.nodes, activation='tanh')(input)
        mult1 = Multiply()([dense2, dense3])
        act1 = Activation('tanh')(mult1)
        mult2 = Multiply()([dense1, act1])
        reshape = Reshape((s.nodes,))(mult2)
        dropout = Dropout(0.5)(reshape)
        dense_out = Dense(1)(dropout)
        rnn = Model(inputs=[input], outputs=[dense_out])
        opt = adam(lr=s.lr, decay=0.0, epsilon=s.adam_eps)  # , clipvalue=1.)
        # opt = rmsprop(lr=s.lr)
        rnn.compile(loss=s.loss, optimizer=opt)
        if s.max_verbosity > 0:
            print(rnn.summary())
    else:
        raise Exception("Unknown implementation " + s.implementation)

    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s).values
    if s.limit_to:
        sequence = sequence[:s.limit_to]

    # Get rid of unneeded columns
    sequence = sequence[:, 0:s.feature_count]

    """
    We need to leave some values unpredicted in front so that
      - We can fill the lookback window for each prediction
      - We can get the value from 1 season earlier for MASE
    --> Don't use the first `front_buffer` values as prediction
    --> Independent from `prediction_step`, so the first actual value predicted is
        `front_buffer` plus however many steps `prediction_step` is higher than 1.
    In other words, the most recent X-value for the first prediction will be the
    final value in the `front_buffer`.
    """
    first_prediction_index = s.front_buffer + s.predictionStep - 1

    # Grab the targets now to avoid having to denormalize later
    targetInput = sequence[first_prediction_index:, 0].copy()

    dp = DataProcessor()
    if s.normalization_type == 'default':
        (meanSeq, stdSeq) = dp.normalize(sequence,
                                         s.nTrain if s.cutoff_normalize else len(sequence))
    elif s.normalization_type == 'windowed':
        dp.windowed_normalize(sequence, columns=[0])
        if s.feature_count > 1:
            dp.normalize(sequence, s.nTrain, columns=range(1, s.feature_count))
    elif s.normalization_type == 'AN':
        an = AdaptiveNormalizer(s.lookback, s.lookback + s.predictionStep)
        an.set_pruning(False)
        an.set_source_data(sequence, s.nTrain)
        an.do_ma('s')
        an.do_stationary()
        an.remove_outliers()
        seq_norm = an.do_adaptive_normalize()
        if s.feature_count > 1:
            dp.normalize(sequence, s.nTrain, columns=range(1, s.feature_count))
            start = sequence.shape[0] - seq_norm.shape[0] - s.lookback - s.predictionStep + 1
            for i in range(seq_norm.shape[0]):
                seq_norm[i, :, 1:s.feature_count] = \
                    sequence[start + i:start + i + seq_norm.shape[1], 1:s.feature_count]
    else:
        raise Exception("Unsupported normalization type: " + s.normalization_type)

    if s.normalization_type != "AN":
        # Default and windowed normalization change the sequence itself but still
        # require creating lookback frames
        allX = getX(sequence, s)
        allY = sequence[first_prediction_index:, 0]
    else:
        # AN creates a new array but takes care of lookback internally
        allX = seq_norm[:, 0:-s.predictionStep]
        allY = np.reshape(seq_norm[:, -1, 0], (-1,))

    predictedInput = np.full((len(allY),), np.nan)  # Initialize all predictions to NaN

    trainX = np.reshape(allX[:s.nTrain], s.actual_input_shape_train)
    trainY = np.reshape(allY[:s.nTrain], s.actual_output_shape_train)

    if s.implementation == "keras":
        rnn.fit(trainX, trainY,
                epochs=s.epochs,
                batch_size=s.batch_size,
                verbose=min(s.max_verbosity, 2),
                shuffle=not s.stateful)
        if s.stateful:
            rnn_layer.reset_states()

    latestStart = None
    do_non_lookback = True
    latest_onego = 0
    # buffer = s.retrain_interval / 2
    buffer = 0
    for i in tqdm(range(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and s.online and \
                i > s.nTrain + s.predictionStep + buffer:
            do_non_lookback = True
            if s.normalization_type == 'AN':
                # Denormalize the predictions made since the previous retrain,
                # then renormalize the source data from the current position
                predictedInput = np.array(
                    an.do_adaptive_denormalize(predictedInput,
                                               therange=(i - s.retrain_interval, i)))
                latestStart = i
                an.set_ignore_first_n(i - s.nTrain - s.predictionStep)
                an.do_ma('s')
                an.do_stationary()
                an.remove_outliers()
                seq_norm = an.do_adaptive_normalize()
                if s.feature_count > 1:
                    start = sequence.shape[0] - seq_norm.shape[0] - s.lookback \
                        - s.predictionStep + 1
                    for j in range(seq_norm.shape[0]):
                        seq_norm[j, :, 1:s.feature_count] = \
                            sequence[start + j:start + j + seq_norm.shape[1],
                                     1:s.feature_count]
                allX = seq_norm[:, 0:-s.predictionStep]
                allY = np.reshape(seq_norm[:, -1, 0], (-1,))

            trainX = np.reshape(allX[i - s.nTrain - s.predictionStep:i - s.predictionStep],
                                s.actual_input_shape_train)
            trainY = np.reshape(allY[i - s.nTrain - s.predictionStep:i - s.predictionStep],
                                s.actual_output_shape_train)

            if s.implementation == "keras":
                if s.reset_on_retrain:
                    # Rebuild and recompile the model from scratch before retraining
                    input = Input(shape=(1, s.x_dims))
                    dense1 = Dense(s.nodes, activation='sigmoid')(input)
                    dense2 = Dense(s.nodes, activation='sigmoid')(input)
                    dense3 = Dense(s.nodes, activation='tanh')(input)
                    mult1 = Multiply()([dense2, dense3])
                    act1 = Activation('tanh')(mult1)
                    mult2 = Multiply()([dense1, act1])
                    reshape = Reshape((s.nodes,))(mult2)
                    dropout = Dropout(0.5)(reshape)
                    dense_out = Dense(1)(dropout)
                    rnn = Model(inputs=[input], outputs=[dense_out])
                    opt = adam(lr=s.lr, decay=0.0, epsilon=s.adam_eps)
                    rnn.compile(loss=s.loss, optimizer=opt)
                rnn.fit(trainX, trainY,
                        epochs=s.epochs_retrain if s.epochs_retrain else s.epochs,
                        batch_size=s.batch_size,
                        verbose=2,
                        shuffle=not s.stateful)
                if s.stateful:
                    rnn_layer.reset_states()

        if s.lookback:
            if s.implementation == "keras":
                predictedInput[i] = rnn.predict(np.reshape(allX[i], s.predict_input_shape))
        elif do_non_lookback:
            # Predict the whole stretch up to the next retrain point in one go
            do_non_lookback = False
            up_to = min(allX.shape[0], i - (i % s.retrain_interval) + s.retrain_interval) \
                if s.online else allX.shape[0]
            start_time = time.time()
            start = 0 if s.refeed_on_retrain else latest_onego
            new_predicts = rnn.predict(np.reshape(allX[start:up_to], (1, -1, s.x_dims)))
            new_predicts = np.reshape(new_predicts, (new_predicts.shape[1],))
            predictedInput[i:up_to] = new_predicts[-(up_to - i):]
            latest_onego = up_to

    # The training stretch was never predicted
    for i in range(s.nTrain + s.predictionStep):
        predictedInput[i] = np.nan

    if s.normalization_type == 'default':
        predictedInput = dp.denormalize(predictedInput, meanSeq[0], stdSeq[0])
    elif s.normalization_type == 'windowed':
        dp.windowed_denormalize(predictedInput, pred_step=s.predictionStep)
    elif s.normalization_type == 'AN':
        if latestStart:
            predictedInput = np.array(
                an.do_adaptive_denormalize(predictedInput,
                                           therange=(latestStart, len(predictedInput))))
        else:
            predictedInput = np.array(an.do_adaptive_denormalize(predictedInput))
        if an.pruning:
            targetInput = np.delete(targetInput, an.deletes)

    print("Final time", timeit.default_timer() - start_time)

    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)

    for ignore in s.ignore_for_error:
        skipTrain = ignore
        from plot import computeSquareDeviation
        squareDeviation = computeSquareDeviation(predictedInput, targetInput)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
        if s.max_verbosity > 0:
            print(s.nodes, "NRMSE {}".format(nrmse))
        mae = np.nanmean(np.abs(targetInput - predictedInput))
        if s.max_verbosity > 0:
            print("MAE {}".format(mae))
        mape = errors.get_mape(predictedInput, targetInput, skipTrain)
        if s.max_verbosity > 0:
            print("MAPE {}".format(mape))
        mase = errors.get_mase(predictedInput, targetInput,
                               np.roll(targetInput, s.season), skipTrain)
        if s.max_verbosity > 0:
            print("MASE {}".format(mase))

    return mase
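# Hedged, self-contained illustration of the error metrics reported above
# (NRMSE, MAE, MAPE, MASE). The formulas mirror the structure used in run_gru;
# the project's own `errors` module may differ in detail (e.g. skipTrain
# handling), and the arrays here are made-up toy data.
import numpy as np

target = np.array([10.0, 12.0, 13.0, 12.0, 15.0])
predicted = np.array([11.0, 12.5, 12.0, 13.0, 14.0])
season = 1  # lag used for the naive forecast in MASE

square_deviation = (predicted - target) ** 2
nrmse = np.sqrt(np.nanmean(square_deviation)) / np.nanstd(target)
mae = np.nanmean(np.abs(target - predicted))
mape = np.nanmean(np.abs((target - predicted) / target))
# MASE: model MAE relative to the MAE of a seasonal-naive forecast
naive_mae = np.nanmean(np.abs(target[season:] - target[:-season]))
mase = mae / naive_mae
print("NRMSE {:.3f} MAE {:.3f} MAPE {:.3f} MASE {:.3f}".format(nrmse, mae, mape, mase))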
def __init__(self):
    self.engine = Engine()
    self.processor = DataProcessor()