def show_possibilities(self, tickers='all', *args, **kwargs):
    do_print = utils.parse_kwargs("do_print", kwargs, True)
    if tickers == 'all':
        tickers = self.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=self.tickers,
                                           do_print=do_print)
    for ticker in tickers:
        utils.print_issue(None, '=' * 80, do_print=do_print)
        utils.print_issue('INFO', 'Current ticker: {}'.format(ticker),
                          do_print=do_print)
        #check if the last value is NaN:
        last_value_index = -1
        if not np.isnan(self.data[ticker][last_value_index]):
            utils.print_issue('WARNING', 'Last value of data set is not NaN!',
                              do_print=do_print)
            input_message = 'Proceed anyway? '
            if not self._get_answer(input_message=input_message):
                continue
        else:
            last_value_index = -2
        if self.break_values[ticker] is None:
            utils.print_issue('ERROR', 'No break values computed for this ticker!',
                              do_print=do_print)
            continue
        deviation = utils.parse_kwargs('deviation', kwargs, error_arg=.0125)
        bottom_value, top_value = self.break_values[ticker]
        middle_value = (top_value - bottom_value) * .5 + bottom_value
        bottom_value *= (1 - deviation)
        top_value *= (1 + deviation)
        test_values = [bottom_value, middle_value, top_value]
        for value in test_values:
            utils.print_issue(None, '-' * 80, do_print=do_print)
            utils.print_issue('INFO', 'Result for value: {}'.format(value),
                              do_print=do_print)
            #create an imaginary model:
            test_model = self.copy_model()
            #assign the value to the last entry:
            test_model.data[ticker][-1] = value
            #init and evaluate the imaginary model:
            test_model._init_model(do_print=False)
            test_model.eval_model(do_print=False)
            p_range = utils.parse_kwargs('plot_range', kwargs, None)
            p_index = utils.parse_kwargs('plot_from_index', kwargs, None)
            p_date = utils.parse_kwargs('plot_from_date', kwargs, None)
            switch_axes = utils.parse_kwargs('switch_axes', kwargs, False)
            return_plot = utils.parse_kwargs("return_plot", kwargs, False)
            save_figures = utils.parse_kwargs("save_figures", kwargs, False)
            fig_name = "{}_imag_value_{:.2f}".format(ticker, value)
            output_folder = utils.parse_kwargs("output_folder", kwargs, None)
            plotting.plot_model(model=test_model, tickers=ticker,
                                plot_range=p_range, plot_from_index=p_index,
                                plot_from_date=p_date, plot_break_values=True,
                                switch_axes=switch_axes, return_plot=return_plot,
                                output_folder=output_folder,
                                save_figures=save_figures, fig_name=fig_name)
def check_for_nan_values(self, tickers='all', exclude_last_value=True,
                         *args, **kwargs):
    do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
    if tickers == 'all':
        tickers = self.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=self.tickers,
                                           do_print=True)
    for ticker in tickers:
        if exclude_last_value:
            #keep the last entry (usually a NaN placeholder for the next day):
            nan_indices = np.where(np.isnan(self.data[ticker][:-1]))[0]
            valid_indices = np.where(np.isfinite(self.data[ticker][:-1]))[0]
            valid_indices = np.hstack((valid_indices,
                                       self.data[ticker].shape[0] - 1))
        else:
            utils.print_issue('INFO', 'Last value is considered to be removed.',
                              do_print=do_print)
            nan_indices = np.where(np.isnan(self.data[ticker]))[0]
            valid_indices = np.where(np.isfinite(self.data[ticker]))[0]
        if nan_indices.size > 0:
            input_message = 'Remove {} NaN values? '.format(nan_indices.size)
            if self._get_answer(input_message=input_message):
                self.data[ticker] = self.data[ticker][valid_indices]
        else:
            utils.print_issue('INFO', 'No NaN values detected.',
                              do_print=do_print)
def stock_price(cls, iteration_=1, **kwargs):
    """Generate a stock spot price through a stochastic process (geometric Brownian motion)."""
    _rand = rand_norm(0, 1, iteration_)
    _isp, _rate, _div, _vol, _t = parse_kwargs(
        kwargs, ['isp', 'rate', 'div', 'vol', 't'], 0)
    return _isp * exp((_rate - _div - _vol**2 / 2) * _t
                      + _vol * sqrt(_t) * _rand)
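# Hedged illustration: the return expression above is the closed-form
# geometric-Brownian-motion spot, S_T = S_0 * exp((r - q - sigma^2/2) * T
# + sigma * sqrt(T) * Z) with Z ~ N(0, 1). Assuming `rand_norm` wraps
# numpy.random.normal (not confirmed here), a self-contained sketch:
import numpy as np

def gbm_spot_example(isp=100.0, rate=0.05, div=0.0, vol=0.2, t=1.0, n=10000):
    """Draw n terminal spot prices under GBM; a minimal illustration only."""
    z = np.random.normal(0.0, 1.0, n)
    return isp * np.exp((rate - div - vol**2 / 2) * t + vol * np.sqrt(t) * z)

# Sanity check: the sample mean should approach isp * exp((rate - div) * t).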
def setup_devices(ids):
    if ids == '':
        return {'main': -1}
    devices = parse_kwargs(ids)
    for key in devices:
        devices[key] = int(devices[key])
    return devices
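# Hedged usage sketch (the string format is assumed, not confirmed by this
# section): if the device string looks like "main:0,aux:1", parse_kwargs
# presumably splits it into a name -> id map; setup_devices then casts the
# ids to int, with '' meaning CPU-only ({'main': -1}). A standalone
# equivalent parser under that assumption:
def parse_device_ids_example(ids: str) -> dict:
    if ids == '':
        return {'main': -1}
    return {k: int(v) for k, v in (pair.split(':') for pair in ids.split(','))}

# parse_device_ids_example("main:0,aux:1") -> {'main': 0, 'aux': 1}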
def _get_locs(self, ticker, *args, **kwargs):
    do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
    if len(self.local_min) > 1:
        buy_locs = self.local_min[ticker][0] + self.buy_delay
        sell_locs = self.local_max[ticker][0] + self.buy_delay
    else:
        buy_locs = self.local_min[ticker] + self.buy_delay
        sell_locs = self.local_max[ticker] + self.buy_delay
    try:
        if buy_locs[0] > sell_locs[0]:
            sell_locs = sell_locs[1:]
    except IndexError:
        utils.print_issue('INFO', 'First sell position will not be displayed.',
                          do_print=do_print)
    #check locs:
    if buy_locs.shape[0] > sell_locs.shape[0]:
        utils.print_issue('INFO', 'Open position.', do_print=do_print)
    elif buy_locs.shape[0] < sell_locs.shape[0]:
        try:
            sell_locs[0] = buy_locs[0]
        except IndexError:
            utils.print_issue('INFO',
                              'No buy locations occurred. '
                              'Sell locations are set to buy locations.',
                              do_print=do_print)
            sell_locs = buy_locs
    return buy_locs, sell_locs
def _init_model(self, *args, **kwargs):
    '''
    Function to set up the price model.

    The idea is to locate the inflection points of the difference between
    "moving average convergence divergence (macd)" and the "signal line
    (signal_line)". These indicate local up and down trends. The actual buy
    and sell prices are therefore those of the next day, i.e. buy_delay.

    Inputs:
    - periods: days to calculate the macd (first two values) and the
               signal line (last value)
               default: 12, 26, 9
    - buy_delay: delay of buy and sell dates in days
                 default: 1
    - grad_return: return the "gradient" of the model, i.e. the model itself
                   default: True

    Outputs:
    - local_min: buy prices
    - local_max: sell prices
    - grad: "gradient" of the model (optional)
    '''
    do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
    utils.print_issue('INIT',
                      'Initialising model for tickers: {}'.format(self.tickers),
                      do_print=do_print)
    macd = self._calc_ema(self.data, self.periods[0]) \
           - self._calc_ema(self.data, self.periods[1])
    signal_line = self._calc_ema(macd, self.periods[2])
    if len(self.tickers) == 1:
        grad = np.gradient(macd[self.tickers[0]] - signal_line[self.tickers[0]])
    else:
        grad = np.gradient(macd - signal_line)
    local_min, local_max, grad_dict = {}, {}, {}
    if isinstance(grad, list):
        #np.gradient returns one array per axis for 2-D input:
        utils.print_issue('WARNING', 'Ignoring second entry of gradient!',
                          do_print=do_print)
        grad = grad[0].T
        for n in range(grad.shape[0]):
            local_min[self.tickers[n]] = argrelextrema(grad[n], np.less)
            local_max[self.tickers[n]] = argrelextrema(grad[n], np.greater)
    else:
        local_min[self.tickers[0]] = argrelextrema(grad, np.less)[0]
        local_max[self.tickers[0]] = argrelextrema(grad, np.greater)[0]
    #transform grad into a dict:
    if len(grad.shape) == 1:
        grad_dict[self.tickers[0]] = grad
    else:
        for n, ticker in enumerate(self.tickers):
            grad_dict[ticker] = grad[n]
    self.local_min = local_min
    self.local_max = local_max
    self.grad = grad_dict
    utils.print_issue('INIT', 'Successfully initialized model.',
                      do_print=do_print)
    utils.print_issue(None, '*' * 80, do_print=do_print)
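# Hedged illustration of the construction above: it is the standard MACD
# setup, macd = EMA_12(price) - EMA_26(price) and signal = EMA_9(macd);
# extrema of d/dt (macd - signal) mark trend turns. Assuming price data lives
# in a pandas Series (this section does not show _calc_ema), a minimal
# standalone sketch:
import numpy as np
import pandas as pd
from scipy.signal import argrelextrema

def macd_extrema_example(price: pd.Series, periods=(12, 26, 9)):
    """Return candidate buy/sell indices from the MACD-signal gradient."""
    macd = price.ewm(span=periods[0]).mean() - price.ewm(span=periods[1]).mean()
    signal = macd.ewm(span=periods[2]).mean()
    grad = np.gradient((macd - signal).to_numpy())
    buys = argrelextrema(grad, np.less)[0]      # local minima -> up-trend starts
    sells = argrelextrema(grad, np.greater)[0]  # local maxima -> down-trend starts
    return buys, sells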
def append_timedelta(self, timedelta=1, overwrite_data=True, *args, **kwargs):
    do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
    new_entry = self.data.index[-1] + pd.Timedelta(days=timedelta)
    final_entries = list(self.data.index)
    final_entries.append(new_entry)
    idx = pd.DatetimeIndex(final_entries)
    new_data = self.data.reindex(idx)
    if overwrite_data:
        utils.print_issue('INFO', 'New data was appended.', do_print=do_print)
        self.data = new_data
    else:
        return new_data
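# Hedged usage note: reindexing with one extra date appends a NaN-filled row,
# which the break-value routines below treat as a placeholder for the next
# trading day. A minimal standalone demonstration (names are illustrative):
import numpy as np
import pandas as pd

df = pd.DataFrame({'TICKER': [1.0, 2.0, 3.0]},
                  index=pd.date_range('2021-01-01', periods=3))
idx = pd.DatetimeIndex(list(df.index) + [df.index[-1] + pd.Timedelta(days=1)])
print(df.reindex(idx))  # the appended last row is NaN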
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    # load validation data (kept for parity with the other baselines)
    val_data = ehr.EHR("dataset/EHR", "val")
    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    # init model
    model = GradientBoostingClassifier(n_estimators=100,
                                       learning_rate=0.1,
                                       verbose=1,
                                       n_iter_no_change=10,
                                       random_state=10)
    train_feat, train_label = train_data.get_feat_data()
    print("Start Training.")
    model.fit(train_feat, train_label)
    print("Training Finished.")
    # eval on test set: load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_feat, test_label = test_data.get_feat_data()
    test_metric, test_log, test_result = evaluate_clf(model, test_feat,
                                                      test_label,
                                                      top_k_list=[3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data, model_param["batch_size"],
                                   shuffle=True, num_workers=0,
                                   collate_fn=collate_fn)
    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data, model_param["batch_size"],
                                 shuffle=False, num_workers=0,
                                 collate_fn=collate_fn)
    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]
    # init model
    model = TextCNN(**model_param)
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)
    if use_gpu:
        model.cuda()
    print("Model Inited.")
    optimizer = torch.optim.Adam(model.parameters(), lr=model_param["lr"],
                                 weight_decay=0)
    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)
            if use_gpu:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)
            # labels are 1-based (1..27); shift to 0-based for cross entropy
            loss = F.cross_entropy(pred, label - 1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))
        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))
        early_stopper(metric_result["ndcg_5"], model, "textcnn")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break
    # eval on test set: load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data, model_param["batch_size"],
                                  shuffle=False, num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate_clf(model, test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
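# Side note on the `label - 1` shift above: F.cross_entropy expects class
# indices in [0, C-1], while the disease ids here are 1-based (1..27), so the
# shift re-bases them. Minimal self-contained check:
import torch
import torch.nn.functional as F

logits = torch.randn(4, 27)                 # batch of 4, 27 classes
labels_1based = torch.tensor([1, 5, 27, 3])
loss = F.cross_entropy(logits, labels_1based - 1)  # valid 0-based targets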
def train(**kwargs):
    w2v_model_name = "./ckpt/w2v"
    if os.path.exists(w2v_model_name):
        print("load word2vec model from", w2v_model_name)
        # load model directly
        w2v_model = Word2Vec.load(w2v_model_name)
    else:
        # load data
        filename = "./dataset/EHR/train/data.txt"
        with open(filename, "r") as fin:
            corpus = [line.strip().split()[2:] for line in fin.readlines()]
        # learn word2vec model (gensim < 4.0 API; 4.x renamed size to vector_size)
        start_time = time.time()
        w2v_model = Word2Vec(corpus, size=64, window=3, min_count=1,
                             workers=4, sg=1)
        w2v_model.save("./ckpt/w2v")
        print("training done, costs {} secs.".format(time.time() - start_time))
    # start training and testing the MLP model
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data, model_param["batch_size"],
                                   shuffle=True, num_workers=0,
                                   collate_fn=collate_fn)
    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data, model_param["batch_size"],
                                 shuffle=False, num_workers=0,
                                 collate_fn=collate_fn)
    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]
    # build an MLP for prediction on top of the word2vec embeddings
    model_param["w2v_model"] = w2v_model
    model = MLP(**model_param)
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)
    if use_gpu:
        model.cuda()
    print("Model Inited.")
    # note: weight_decay is read from kwargs directly, so it must be passed in
    optimizer = torch.optim.Adam(model.parameters(), lr=model_param["lr"],
                                 weight_decay=kwargs["weight_decay"])
    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)
            if use_gpu:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)
            # labels are 1-based; shift to 0-based for cross entropy
            loss = F.cross_entropy(pred, label - 1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))
        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))
        early_stopper(metric_result["ndcg_5"], model, "med2vec")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break
    # eval on test set: load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data, model_param["batch_size"],
                                  shuffle=False, num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate_clf(model, test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data, model_param["batch_size"],
                                   shuffle=True, num_workers=0,
                                   collate_fn=collate_fn)
    # init model
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]
    gnn = HGNN_SDS(**model_param)
    if model_param["w2v"] is not None:
        # load w2v data
        gnn.load_symp_embed(model_param["w2v"])
    if use_gpu:
        gnn.cuda()
    print("Model Inited.")
    sds_sampler = SDS_sampler("dataset/EHR")
    # load the PMI symptom-symptom matrix
    symp2symp_mat = sp.load_npz(os.path.join("dataset/EHR", "pmi_ss_mat.npz"))
    symp2symp_mat.setdiag(0)
    # total number of symptom batches
    num_total_batch = gnn.num_symp // model_param["batch_size"]
    all_symp_index = np.arange(1, gnn.num_symp + 1)
    # linearly anneal the share of hard negatives over the epochs
    lambda_hard_r = lambda epoch: epoch * model_param["hard_ratio"] \
                                  / model_param["num_epoch"]
    # build pos map (most co-occurring symptom) and hard map (2nd-most)
    symp2symp_hard_map = [0]
    symp2symp_pos_map = [0]
    for k in all_symp_index:
        symp2symp_b_ar = symp2symp_mat[k].toarray().flatten()
        max_index = np.argmax(symp2symp_b_ar)
        if max_index == 0:
            symp2symp_pos_map.append(np.random.randint(1, k))
            symp2symp_hard_map.append(np.random.randint(1, k))
        else:
            symp2symp_pos_map.append(max_index)
            symp2symp_b_ar[max_index] = -1
            max_2nd_index = np.argmax(symp2symp_b_ar)
            if max_2nd_index == 0:
                symp2symp_hard_map.append(np.random.randint(1, k))
            else:
                symp2symp_hard_map.append(max_2nd_index)
    symp2symp_hard_map = np.array(symp2symp_hard_map)
    symp2symp_pos_map = np.array(symp2symp_pos_map)
    print("Pos / Hard symptom map Inited.")
    # note: weight_decay is set to the learning rate here, which may be
    # unintentional (model_param["weight_decay"] would be the usual choice)
    optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"],
                                 weight_decay=model_param["lr"])
    last_total_loss = 1e10
    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        np.random.shuffle(all_symp_index)
        hard_ratio = lambda_hard_r(epoch)
        for idx in range(num_total_batch):
            batch_symp = all_symp_index[idx * model_param["batch_size"]:
                                        (idx + 1) * model_param["batch_size"]]
            # get pos symp and neg symp
            pos_symp = symp2symp_pos_map[batch_symp]
            # sample neg
            neg_symp = np.random.randint(1, gnn.num_symp,
                                         model_param["batch_size"])
            # cope with overlapping pos and neg symps
            overlap_index = (neg_symp == pos_symp)
            overlap_symp = neg_symp[overlap_index]
            neg_symp[overlap_index] = symp2symp_hard_map[overlap_symp]
            if hard_ratio > 0:
                num_hard = int(hard_ratio * model_param["batch_size"])
                neg_symp[:num_hard] = symp2symp_hard_map[neg_symp[:num_hard]]
            batch_symp_ts = torch.LongTensor(batch_symp)
            pos_symp_ts = torch.LongTensor(pos_symp)
            neg_symp_ts = torch.LongTensor(neg_symp)
            if use_gpu:
                batch_symp_ts = batch_symp_ts.cuda()
                pos_symp_ts = pos_symp_ts.cuda()
                neg_symp_ts = neg_symp_ts.cuda()
            # forward batch symp
            batch_symp_data = sds_sampler(batch_symp, 1, 20)
            symp_emb = gnn.forward(batch_symp_ts, batch_symp_data)
            pos_symp_data = sds_sampler(pos_symp, 1, 20)
            pos_emb = gnn.forward(pos_symp_ts, pos_symp_data)
            neg_symp_data = sds_sampler(neg_symp, 1, 20)
            neg_emb = gnn.forward(neg_symp_ts, neg_symp_data)
            # hinge loss with margin 1.0
            scores = symp_emb.mul(pos_emb).sum(1) \
                     - symp_emb.mul(neg_emb).sum(1) + 1.0
            scores[scores < 0] = 0
            loss = scores.mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))
        if total_loss - last_total_loss > 0:
            print("Loss stopped decreasing, converged.")
            break
        last_total_loss = total_loss
    # save model
    torch.save(gnn.state_dict(), "./ckpt/sds_gnn.pt")
    print("Model saved.")
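# Hedged note on the loss above: it computes mean(max(0, s_pos - s_neg + 1)),
# a hinge with margin 1.0 over dot-product scores. The conventional ranking
# hinge that pushes positives ABOVE negatives is max(0, margin - s_pos +
# s_neg); the sign in the code above appears inverted relative to that
# convention and may be worth checking. Illustrative PyTorch form of the
# conventional objective (names are generic):
import torch
import torch.nn.functional as F

def margin_ranking_example(symp_emb, pos_emb, neg_emb, margin=1.0):
    pos_scores = (symp_emb * pos_emb).sum(dim=1)
    neg_scores = (symp_emb * neg_emb).sum(dim=1)
    return F.relu(neg_scores - pos_scores + margin).mean()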
def main(**kwargs):
    # parse parameters
    param = default_config()
    param.update({
        "mode": "sds",
        "top_k": 10,
        "ckpt": "ckpt/gnn.pt",
        "use_gpu": False
    })
    param.update(kwargs)
    # read maps
    symp2id, id2symp = read_symp2id()
    dise2id, id2dise = read_dise2id()
    # read data
    datapath = os.path.join("dataset/EHR/test/data.txt")
    fin = open(datapath, "r", encoding="utf-8")
    lines = fin.readlines()
    data_model = ehr.EHR("dataset/EHR", "train")
    # init retrieval system
    ehr_ret = EHR_retrieval(mode=param["mode"])
    # init and load model
    data_model_param = parse_data_model(data_model)
    param.update(data_model_param)
    param = parse_kwargs(param, kwargs)
    gnn = HGNN(**param)
    if param["use_gpu"]:
        gnn.cuda()
    ckpt_path = param.get("ckpt")
    if ckpt_path is None:
        print("[Warning] ckpt path not set, loading from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, param["use_gpu"])
    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")
    gnn.eval()
    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)
    # init result lists
    before_list = []
    after_list = []
    real_dise_list = []
    init_symp_list = []
    after_symp_list = []
    result_map_bfo = defaultdict(list)
    result_map_aft = defaultdict(list)
    # top_k values for evaluation: p@N, Rec@N, ...
    top_k_list = [1, 5]
    for i, line in enumerate(lines):
        line_data = line.strip().split()
        uid = line_data[0]
        did = line_data[1]
        real_dise_list.append(did)
        symps = line_data[2:]
        # select the first symptom and do inference
        init_symp = symps[0]
        init_symp_list.append(id2symp[init_symp])
        symp_ar = np.array([[init_symp]])
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)
        # calculate statistics before symptom expansion
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_bfo)
        before_list.append(pred_rank[0])
        # retrieve additional symptoms and re-rank
        rank_symp = ehr_ret(symp_idx=init_symp, top_k=param["top_k"])
        after_symp_list.append([id2symp[str(t)] for t in rank_symp])
        symp_ar = [np.concatenate([[init_symp], rank_symp], 0)]
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_aft)
        after_list.append(pred_rank[0])
        ret_symps = ehr_ret(init_symp, param["top_k"])
        ret_symp_list = []
        for sid in ret_symps:
            ret_symp_list.append(id2symp[str(sid)])
        if i % 100 == 0:
            print("[line]:", i)
    # summary
    bf_log = build_result_log(result_map_bfo, top_k_list)
    af_log = build_result_log(result_map_aft, top_k_list)
    print("[before]: {}".format(bf_log))
    print("[after]: {}".format(af_log))
    # write result file
    fout = open("retrieval_result_{}.txt".format(param["mode"]), "w",
                encoding="utf-8")
    fout.write("did\tbefore_pred\tafter_pred\tinit_symp\taftersymp\n")
    for i in range(len(init_symp_list)):
        wrtline = (id2dise[int(real_dise_list[i])] + "\t"
                   + id2dise[int(before_list[i][0])] + "\t"
                   + id2dise[int(after_list[i][0])] + "\t"
                   + init_symp_list[i] + "\t"
                   + "#".join(after_symp_list[i]) + "\n")
        fout.write(wrtline)
    fin.close()
    fout.close()
    df_res = pd.read_table("retrieval_result_{}.txt".format(param["mode"]))
    df_res.to_excel("retrieval_result_{}.xlsx".format(param["mode"]),
                    encoding="utf-8")
    print("Done")
def plot_model(model, tickers='all', plot_range=None, plot_from_index=None,
               plot_from_date=None, plot_break_values=True, switch_axes=False,
               **kwargs):
    '''
    Function to plot a model.

    Inputs:
    - model: model of class MODEL
    - tickers: tickers to plot
               default: all, i.e. tickers in input class MODEL
    - plot_range: range to plot of type pandas.date_range()
                  default: None, i.e. complete data set
    - plot_break_values: if available, plot break_values of input class MODEL
                         default: True
    '''
    do_print = utils.parse_kwargs("do_print", kwargs, True)
    if tickers == 'all':
        tickers = model.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=model.tickers,
                                           do_print=do_print)
    for ticker in tickers:
        if plot_range is not None:
            x_axis = model.data[ticker][plot_range].index
            indices = np.where(np.isin(model.data[ticker].index, plot_range))[0]
        elif plot_from_index is not None:
            x_axis = model.data[ticker].index[plot_from_index:]
            indices = np.arange(plot_from_index,
                                model.data[ticker].index.shape[0], 1)
        elif plot_from_date is not None:
            idx = model.data[ticker].index.get_loc(plot_from_date).start
            x_axis = model.data[ticker].index[idx:]
            indices = np.arange(idx, model.data[ticker].index.shape[0], 1)
        else:
            x_axis = model.data[ticker].index
            indices = np.arange(0, x_axis.shape[0], 1)
        grad = model.grad[ticker][indices]
        min_arg = np.where(model.local_min[ticker] >= indices[0])
        max_arg = np.where(model.local_max[ticker] >= indices[0])
        try:
            local_min = model.local_min[ticker][min_arg]
            local_max = model.local_max[ticker][max_arg]
            in_loop = False
        except TypeError:
            #multi-ticker case: local extrema are stored as tuples
            in_loop = True
            local_min = model.local_min[ticker][0][min_arg[1]]
            local_max = model.local_max[ticker][0][max_arg[1]]
        price = model.data[ticker][indices]
        try:
            buy_dates = model.ticker_df[ticker]['Buy Dates'].values[min_arg[0]]
            if in_loop:
                buy_dates = model.ticker_df[ticker]['Buy Dates'].values[min_arg[1]]
        except IndexError:
            utils.print_issue('INFO',
                              'New buy signal was detected for last value: {}.'.format(
                                  model.data[ticker][-1]),
                              do_print=do_print)
            buy_dates = model.ticker_df[ticker]['Buy Dates'].values[min_arg[0][:-1]]
            if in_loop:
                buy_dates = model.ticker_df[ticker]['Buy Dates'].values[min_arg[1][:-1]]
            buy_dates = np.hstack((buy_dates,
                                   model.data[ticker].index[local_min[-1] + 1].to_numpy()))
        try:
            sell_dates = model.ticker_df[ticker]['Sell Dates'].values[max_arg[0]]
            if in_loop:
                sell_dates = model.ticker_df[ticker]['Sell Dates'].values[max_arg[1]]
        except IndexError:
            utils.print_issue('INFO',
                              'New sell signal was detected for last value: {}.'.format(
                                  model.data[ticker][-1]),
                              do_print=do_print)
            sell_dates = model.ticker_df[ticker]['Sell Dates'].values[max_arg[0][:-1]]
            if in_loop:
                sell_dates = model.ticker_df[ticker]['Sell Dates'].values[max_arg[1][:-1]]
            sell_dates = np.hstack((sell_dates,
                                    model.data[ticker].index[local_max[-1] + 1].to_numpy()))
        #generate plots:
        fig, axs = plt.subplots(2, 1, figsize=(16, 9), sharex=True)
        if switch_axes:
            ax_indices = [1, 0]
        else:
            ax_indices = [0, 1]
        axs[ax_indices[0]].fill_between(x_axis, 0, grad, where=grad > 0,
                                        facecolor='green', interpolate=True,
                                        label='Up Trend')
        axs[ax_indices[0]].fill_between(x_axis, 0, grad, where=grad <= 0,
                                        facecolor='red', interpolate=True,
                                        label='Down Trend')
        axs[ax_indices[0]].vlines(model.data[ticker].index[local_min],
                                  np.min(grad), np.max(grad), color='g',
                                  label='Min Reached')
        axs[ax_indices[0]].vlines(model.data[ticker].index[local_max],
                                  np.min(grad), np.max(grad), color='r',
                                  label='Peak Reached')
        #layout:
        axs[ax_indices[0]].set_title('{} - MODEL'.format(ticker),
                                     fontsize='larger')
        axs[ax_indices[0]].set_ylabel('Gradient [-]', fontsize='larger')
        #subplot 2:
        axs[ax_indices[1]].plot(x_axis, price, label='{}'.format(ticker))
        axs[ax_indices[1]].vlines(buy_dates, np.min(price), np.max(price),
                                  color='g', label='Buy Dates')
        axs[ax_indices[1]].vlines(sell_dates, np.min(price), np.max(price),
                                  color='r', linestyle='--',
                                  label='Sell Dates')
        if plot_break_values:
            if model.break_values is not None:
                axs[ax_indices[1]].hlines(model.break_values[ticker][0],
                                          x_axis[0], x_axis[-1], color='k',
                                          label='Break value {:.5f}'.format(
                                              model.break_values[ticker][0]))
                axs[ax_indices[1]].hlines(model.break_values[ticker][1],
                                          x_axis[0], x_axis[-1], color='c',
                                          label='Break value {:.5f}'.format(
                                              model.break_values[ticker][1]))
        #layout:
        axs[ax_indices[1]].set_title('{} - PRICE'.format(ticker),
                                     fontsize='larger')
        axs[ax_indices[1]].set_ylabel('Price', fontsize='larger')
        #settings for all plots:
        axs[np.sort(ax_indices)[-1]].set_xlabel('Date', fontsize='larger')
        for ax in axs:
            ax.grid()
            ax.legend(loc='upper left')
        save_figures = utils.parse_kwargs(key="save_figures", kwargs=kwargs,
                                          error_arg=False)
        return_plot = utils.parse_kwargs(key="return_plot", kwargs=kwargs,
                                         error_arg=False)
        output_folder = utils.parse_kwargs(key="output_folder", kwargs=kwargs,
                                           error_arg=False)
        fig_name = utils.parse_kwargs(key="fig_name", kwargs=kwargs,
                                      error_arg="{}_evaluation".format(ticker))
        if fig_name is not None:
            plt.suptitle(fig_name)
        if all([save_figures, output_folder, fig_name]):
            fname = "{}/{}.pdf".format(output_folder, fig_name)
            plt.savefig(fname=fname)
            plt.close()
            message = "Exported: %s" % fname
            utils.print_issue("INFO", message, do_print=do_print)
        if return_plot:
            return plt
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    dataset_name = model_param["dataset"]
    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()
    # load training data
    train_data = ehr.EHR("dataset/{}".format(dataset_name), "train")
    train_data_loader = DataLoader(train_data, model_param["batch_size"],
                                   shuffle=True, num_workers=0,
                                   collate_fn=collate_fn)
    # load validation data
    val_data = ehr.EHR("dataset/{}".format(dataset_name), "val")
    val_data_loader = DataLoader(val_data, model_param["batch_size"],
                                 shuffle=False, num_workers=0,
                                 collate_fn=collate_fn)
    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]
    # init model
    gnn = HGNN(**model_param)
    if kwargs["w2v"] is not None:
        if os.path.exists(kwargs["w2v"]):
            # load w2v data
            gnn.load_symp_embed(kwargs["w2v"])
        else:
            from gensim.models import Word2Vec
            # build word2vec embeddings (gensim < 4.0 API)
            filename = "./dataset/EHR/train/data.txt"
            with open(filename, "r") as fin:
                corpus = [line.strip().split()[2:] for line in fin.readlines()]
            # learn word2vec model
            start_time = time.time()
            w2v_model = Word2Vec(corpus, size=64, window=3, min_count=1,
                                 workers=4, sg=1)
            w2v_model.save("./ckpt/w2v")
            print("word2vec training done, costs {} secs.".format(
                time.time() - start_time))
            # note: the freshly trained embeddings are saved to disk but not
            # loaded into gnn in this branch
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)
    if use_gpu:
        gnn.cuda()
    print("Model Inited.")
    # optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"],
    #                              weight_decay=model_param["weight_decay"])
    optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"],
                                 weight_decay=0)
    # init sampler for negative sampling during training
    dsd_sampler = DSD_sampler("dataset/{}".format(dataset_name))
    print("D-S-D Sampler Inited.")
    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)
            bpr_loss = create_bpr_loss(pred, pred_neg)
            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += bpr_loss.item()
        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))
        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))
        early_stopper(metric_result["ndcg_5"], gnn, "gnn")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break
    # eval on test set: load test data
    test_data = ehr.EHR("dataset/{}".format(dataset_name), "test")
    test_data_loader = DataLoader(test_data, model_param["batch_size"],
                                  shuffle=False, num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate(gnn, test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
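# Hedged sketch: create_bpr_loss is not shown in this section; the standard
# Bayesian Personalized Ranking objective it presumably follows is
# -mean(log(sigmoid(pred_pos - pred_neg))), pushing positive-pair scores
# above negative-pair scores. Illustrative PyTorch form:
import torch
import torch.nn.functional as F

def bpr_loss_example(pred_pos, pred_neg):
    """BPR: maximize the score gap between positive and negative pairs."""
    return -F.logsigmoid(pred_pos - pred_neg).mean()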
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)
    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()
    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data, model_param["batch_size"],
                                   shuffle=True, num_workers=0,
                                   collate_fn=collate_fn)
    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data, model_param["batch_size"],
                                 shuffle=False, num_workers=0,
                                 collate_fn=collate_fn)
    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]
    # init model
    gnn = HGNN_DSD(**model_param)
    if kwargs["w2v"] is not None:
        # load w2v data
        gnn.load_symp_embed(kwargs["w2v"])
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)
    if use_gpu:
        gnn.cuda()
    print("Model Inited.")
    # optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"],
    #                              weight_decay=model_param["weight_decay"])
    optimizer = torch.optim.Adam(gnn.parameters(), lr=model_param["lr"],
                                 weight_decay=0)
    # init sampler for negative sampling during training
    dsd_sampler = DSD_sampler("dataset/EHR")
    print("D-S-D Sampler Inited.")
    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)
            bpr_loss = create_bpr_loss(pred, pred_neg)
            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += bpr_loss.item()
        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))
        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))
        early_stopper(metric_result["ndcg_5"], gnn, "gnn_dsd")
        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break
    # eval on test set: load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data, model_param["batch_size"],
                                  shuffle=False, num_workers=0,
                                  collate_fn=collate_fn)
    test_metric, test_log, test_result = evaluate(gnn, test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
def calc_probs(model, time=None, tickers='all', stats_data=None,
               auto_update_tolerances=False, *args, **kwargs):
    """Function to calculate break-value statistics."""
    do_print = utils.parse_kwargs("do_print", kwargs, True)
    if tickers == 'all':
        tickers = model.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=model.tickers)
    timezone = kwargs.get('timezone')
    start = kwargs.get('start')
    for ticker in tickers:
        utils.print_issue(None, '=' * 80)
        utils.print_issue('INFO', 'Current ticker: {}'.format(ticker))
        z_values, tols, means = _create_z_values(model=model, ticker=ticker,
                                                 stats_data=stats_data,
                                                 timezone=timezone,
                                                 start=start,
                                                 auto_update_tolerances=auto_update_tolerances)
        freq_range, frequencies = _create_freq()
        delta_t = model.data.index[-1].to_datetime64() \
                  - pd.Timestamp.now().to_datetime64()
        #note: .seconds ignores full days; use total_seconds() if the horizon
        #can exceed 24 hours
        delta_t = pd.Timedelta(delta_t).seconds / 3600
        arg = np.argsort(tols)
        value_arg = np.argsort(model.break_values[ticker])
        probs = ss.norm.cdf(z_values) * 100
        #flip to the upper tail for positive z-values:
        flip_arg = np.where(z_values > 0)
        probs[flip_arg] = (1 - ss.norm.cdf(z_values[flip_arg])) * 100
        poly_deg = 5
        poly_probs = np.zeros(2)
        fig, axs = plt.subplots(2, 1, figsize=(16, 9), sharex=True,
                                sharey=True)
        for n, ax in enumerate(axs):
            ax.plot(frequencies, probs[n], label='Probability')
            ax.vlines(delta_t, np.min(probs), np.max(probs),
                      label='Time to deadline')
            poly_line = np.poly1d(np.polyfit(freq_range, probs[n], poly_deg))
            ax.plot(frequencies, poly_line(freq_range), 'r',
                    label='Polyfit of deg {}'.format(poly_deg))
            title = 'Ticker: {} - Break Value: {} - Tolerance: {}'.format(
                ticker, model.break_values[ticker][value_arg[n]], tols[arg[n]])
            current_prob = poly_line(delta_t)
            ax.text(x=delta_t - .25,
                    y=(np.max(probs) + np.min(probs)) * .5,
                    s='{:.2f}%'.format(current_prob), fontsize='larger')
            ax.set_title(title, fontsize='large')
            ax.legend()
            ax.grid()
            ax.yaxis.get_label().set_fontsize('larger')
            ax.xaxis.get_label().set_fontsize('larger')
            poly_probs[n] = current_prob
        ax.invert_xaxis()
        plt.setp(axs[-1], xlabel='Time to break value [h]')
        plt.setp(axs, ylabel='Probability [%]')
        prob_between = np.abs(np.diff(poly_probs))[0]
        for n, prob in enumerate(poly_probs):
            utils.print_issue('STATS-EVAL',
                              'Probability for tol={:.5f}: {:.2f}%'.format(
                                  tols[arg][n], prob))
        utils.print_issue('STATS-EVAL',
                          'Probability between: {:.2f}%'.format(prob_between))
        save_figures = utils.parse_kwargs(key="save_figures", kwargs=kwargs,
                                          error_arg=False)
        return_plot = utils.parse_kwargs(key="return_plot", kwargs=kwargs,
                                         error_arg=False)
        output_folder = utils.parse_kwargs(key="output_folder", kwargs=kwargs,
                                           error_arg=False)
        fig_name = "{}_statistics".format(ticker)
        plt.suptitle(fig_name)
        if all([save_figures, output_folder, fig_name]):
            fname = "{}/{}.pdf".format(output_folder, fig_name)
            plt.savefig(fname=fname)
            plt.close()
            message = "Exported: %s" % fname
            utils.print_issue("INFO", message, do_print=do_print)
        if return_plot:
            return plt
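# Note on the tail flip above: ss.norm.cdf(z) is the lower-tail probability;
# for z > 0 the code switches to the upper tail 1 - cdf(z), so it always
# reports the probability of reaching the break value from the current side.
# Minimal check of the symmetry being used:
import scipy.stats as ss

z = 1.25
print(ss.norm.cdf(-z))     # lower tail
print(1 - ss.norm.cdf(z))  # upper tail, equal by symmetry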
def main(**kwargs):
    model_param = default_config()
    model_param.update({"top_k": 3})
    model_param = parse_kwargs(model_param, kwargs)
    print("Start evaluating on top {} predictions.".format(model_param["top_k"]))
    # load map
    dise2id, id2dise = read_dise2id("dataset/EHR")
    # load train data model
    data_model = ehr.EHR("dataset/EHR", "train")
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data, model_param["batch_size"],
                                  shuffle=False, num_workers=0,
                                  collate_fn=collate_fn)
    data_model_param = parse_data_model(data_model)
    model_param.update(data_model_param)
    gnn = HGNN(**model_param)
    if model_param["use_gpu"]:
        gnn.cuda()
    ckpt_path = kwargs.get("ckpt")
    if ckpt_path is None:
        print("[Warning] ckpt path not set, loading from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, model_param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, model_param["use_gpu"])
    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")
    gnn.eval()
    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)
    rank_list = None
    dise_list = None
    for idx, (feat, dise) in enumerate(test_data_loader):
        this_dise_list = parse_rank(dise, id2dise)
        if dise_list is None:
            dise_list = this_dise_list
        else:
            dise_list = np.r_[dise_list, this_dise_list]
        # get symps
        symp_list = []
        for x in feat:
            symp_list.append(x["symp"])
        symp_ar = np.array(symp_list)
        # re-sample user embeddings from their symptoms
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler,
                                   top_k=model_param["top_k"])
        # parse rank for printing
        pred_list = parse_rank(pred_rank, id2dise)
        if rank_list is None:
            rank_list = pred_list
        else:
            rank_list = np.r_[rank_list, pred_list]
    # save results
    res_ar = np.c_[dise_list, rank_list]
    df_res = pd.DataFrame(res_ar)
    col_name = ["GroundTruth"] + ["Pred_" + str(i + 1)
                                  for i in range(rank_list.shape[1])]
    df_res.columns = col_name
    df_res.to_csv("Test_Results.csv", encoding="utf-8")
    print("Test done, results saved in", "Test_Results.csv")
def comp_break_values(self, tickers='all', refactor_step_size=1,
                      append_break_values=False, parallel_computing=True,
                      *args, **kwargs):
    do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
    if tickers == 'all':
        tickers = self.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=self.tickers,
                                           do_print=True)
    imag_model = self.copy_model()
    break_values_dict = dict.fromkeys(tickers)
    current_values = dict.fromkeys(tickers, None)
    tolerances = dict.fromkeys(tickers)
    deviation = .3
    utils.print_issue('INFO',
                      'Compute break values with {:.2%} deviation'.format(deviation),
                      do_print=do_print)
    for ticker in tickers:
        utils.print_issue('INFO', 'Current ticker: {}'.format(ticker),
                          do_print=do_print)
        if np.isnan(self.data[ticker].values[-1]):
            value_index = -2
        else:
            value_index = -1
        current_values[ticker] = self.data[ticker].values[value_index]
        #create the value range to scan:
        start_value = current_values[ticker] * (1 - deviation)
        end_value = current_values[ticker] * (1 + deviation)
        step_size = (current_values[ticker] / 5000) * refactor_step_size
        rng = np.arange(start_value, end_value, step_size)
        try:
            import multiprocessing as mp
        except ModuleNotFoundError:
            utils.print_issue('ERROR', 'Multiprocessing module not available.',
                              do_print=do_print)
            parallel_computing = False
        if not parallel_computing:
            break_values_dict[ticker] = np.sort(self._comp_bvs(model=imag_model,
                                                               rng=rng,
                                                               ticker=ticker))
        else:
            from functools import partial
            n_procs = mp.cpu_count()
            utils.print_issue('INFO', 'Using {} processes.'.format(n_procs),
                              do_print=do_print)
            rng_list = self._do_array_split(rng, n_procs)
            #freeze model and ticker; each process scans one range chunk
            #(keyword binding keeps the argument order of the serial call)
            inputs_partial = partial(self._comp_bvs, imag_model, ticker=ticker)
            with mp.Pool(processes=n_procs) as pool:
                bvs = pool.map(inputs_partial, rng_list)
            bv_final = [None, None]
            for bv_list in bvs:
                for n, bv in enumerate(bv_list):
                    if bv is not None and bv_final[n] is None:
                        bv_final[n] = bv
                if all(bv_final):
                    break
            break_values_dict[ticker] = np.sort(bv_final)
        #break_values_dict must already be sorted here:
        tolerances[ticker] = break_values_dict[ticker] - current_values[ticker]
    self.tolerances = tolerances
    self.break_values = break_values_dict
    if append_break_values:
        utils.print_issue('INFO', 'Appending break values to model data',
                          do_print=do_print)
        for ticker in tickers:
            smal_tol = np.argsort(tolerances[ticker])[0]
            self.data[ticker][-1] = break_values_dict[ticker][smal_tol]
        self._init_model(do_print=False)
    else:
        utils.print_issue('INFO', 'Current values: {}'.format(current_values),
                          do_print=do_print)
        utils.print_issue('INFO', 'Break values: {}'.format(break_values_dict),
                          do_print=do_print)
        utils.print_issue('INFO', 'Tolerances: {}'.format(tolerances),
                          do_print=do_print)
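# Hedged illustration of the parallel pattern above: functools.partial
# freezes the arguments that are constant across chunks so Pool.map only
# varies the range slice. Self-contained toy version (scan is a hypothetical
# stand-in for _comp_bvs):
import multiprocessing as mp
import numpy as np
from functools import partial

def scan(offset, rng):
    """Stand-in worker: scan a value range with fixed context."""
    return [offset + v for v in rng]

if __name__ == '__main__':
    chunks = np.array_split(np.arange(10.0), mp.cpu_count())
    with mp.Pool(processes=mp.cpu_count()) as pool:
        results = pool.map(partial(scan, 100.0), chunks)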
def eval_model(self, tickers='all', entry_money=200, fees=(1.0029, .9954),
               tax=.25, visualize=False, *args, **kwargs):
    '''
    Function to evaluate the price model predictions.

    Inputs:
    - data: price data of asset
    - locs: buy and sell locations, i.e. return of function price_model()
    - entry_money: initial investment
                   default = 200
    - fees: fees on buy and sell prices, i.e. buy the asset at the broker
            for a slightly higher price than the actual asset price, and
            vice versa for sells
            default = (1.0029, .9954), i.e. ~.3% higher buy price and
            ~.5% lower sell price
    - tax: German tax payments for annual wins > 800 EUR
           default = .25, i.e. 25%
    - df_return: return model evaluation as pandas DataFrame
                 default = True

    Outputs:
    - net_income: net income/win after entry_money (and possibly tax)
                  is subtracted
    - df_return: model evaluation as pandas DataFrame
    '''
    do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
    if tickers == 'all':
        valid_tickers = self.tickers
    else:
        valid_tickers = utils.check_ticker_input(tickers_input=tickers,
                                                 tickers_avail=self.tickers,
                                                 do_print=do_print)
    utils.print_opening(ticker=valid_tickers,
                        start_date=self.data.index[0].strftime('%D'),
                        end_date=self.data.index[-1].strftime('%D'),
                        initial_investment_per_ticker=entry_money,
                        do_print=do_print)
    if any([self.local_min is None, self.local_max is None, self.grad is None]):
        self._init_model(do_print=do_print)
    for ticker in valid_tickers:
        utils.print_issue('TICKER', ticker, do_print=do_print)
        buy_locs, sell_locs = self._get_locs(ticker=ticker, do_print=do_print)
        buy_prices = self.data[ticker][buy_locs]
        buy_dates = self.data[ticker].index.values[buy_locs]
        sell_prices = self.data[ticker][sell_locs]
        sell_dates = self.data[ticker].index.values[sell_locs]
        buy_prices *= fees[0]
        sell_prices *= fees[1]
        #check for NaNs in prices:
        #TODO:
        '''
        nan_indices = np.isnan(sell_prices)
        sell_prices = sell_prices[~nan_indices]
        buy_prices = buy_prices[~nan_indices]
        nan_indices = np.isnan(buy_prices)
        sell_prices = sell_prices[~nan_indices]
        buy_prices = buy_prices[~nan_indices]
        '''
        n_calls = sell_prices.shape[0]
        if buy_prices.shape > sell_prices.shape:
            #must use to_numpy() since the dates are still stored in the
            #prices as index labels -> pandas divides by matching dates, and
            #buy and sell dates obviously differ, hence pandas would return
            #NaN all the time
            ratios = sell_prices.to_numpy() / buy_prices.to_numpy()[:-1]
        else:
            ratios = sell_prices.to_numpy() / buy_prices.to_numpy()
        trade_rewards = entry_money * np.cumprod(ratios)
        #calculate trade wins
        trade_wins = np.diff(trade_rewards)
        #insert first win
        try:
            trade_wins = np.insert(trade_wins, 0,
                                   trade_rewards[0] - entry_money)
        except IndexError:
            #case where there is one buy but no sell yet
            pass
        #evaluate calls
        good_calls = np.where(trade_wins > 0)
        bad_calls = np.where(trade_wins < 0)
        try:
            efficiency = good_calls[0].shape[0] / n_calls
        except ZeroDivisionError:
            efficiency = np.nan
        #TODO: error handling here:
        win_loss = trade_wins / (trade_rewards - trade_wins)
        average_win = np.mean(win_loss[np.where(win_loss > 0)])
        average_loss = np.mean(win_loss[np.where(win_loss < 0)])
        if np.sum(trade_wins) > 800:
            tax_pays = np.sum(trade_wins) * tax
            utils.print_issue('INFO', '{:.2f} tax was paid.'.format(tax_pays),
                              do_print=do_print)
            net_income = (trade_rewards[-1] - entry_money) * (1 - tax)
        else:
            utils.print_issue('INFO', 'No tax paid.', do_print=do_print)
            net_income = np.sum(trade_wins)
        #create final DataFrame
        sell_grad = self.grad[ticker][sell_locs - self.buy_delay]
        buy_grad = self.grad[ticker][buy_locs - self.buy_delay]
        #be aware that buy_dates can be 1 entry longer than sell_dates!
        if buy_dates.shape[0] > sell_dates.shape[0]:
            if sell_dates.shape[0] > 0:
                utils.print_issue('INFO',
                                  'Last entry of "Sell Dates" will be '
                                  'assigned equally to the penultimate one.',
                                  do_print=do_print)
                sell_dates = np.append(sell_dates, sell_dates[-1])
            else:
                utils.print_issue('INFO',
                                  'First entry of "Sell Dates" will be the '
                                  'first entry of "Buy Dates".',
                                  do_print=do_print)
                sell_dates = buy_dates[0]
            try:
                sell_prices.loc[pd.Timestamp.max] = np.nan
            except Exception:
                #catching OverflowError alone did not work reliably here
                sell_prices.loc[buy_prices.index[-1]] = np.nan
            trade_rewards = np.append(trade_rewards, np.nan)
            trade_wins = np.append(trade_wins, np.nan)
            win_loss = np.append(win_loss, np.nan)
            sell_grad = np.append(sell_grad, np.nan)
        grad_diff = sell_grad - buy_grad
        final_df = pd.DataFrame(data={'Buy Dates': buy_dates,
                                      'Sell Dates': sell_dates,
                                      'Buy Prices': buy_prices.to_numpy(),
                                      'Sell Prices': sell_prices.to_numpy(),
                                      'Trade Reward': trade_rewards,
                                      'Trade Win': trade_wins,
                                      'Trade Efficiency': win_loss,
                                      'Grad at Buy': buy_grad,
                                      'Grad at Sell': sell_grad,
                                      'Grad Difference': grad_diff})
        self.ticker_df[ticker] = final_df
        utils.print_issue(None, '-' * 80, do_print=do_print)
        utils.print_issue('SUMMARY',
                          'Average trade win: {:.10%}'.format(average_win),
                          do_print=do_print)
        utils.print_issue('SUMMARY',
                          'Average trade loss: {:.10%}'.format(average_loss),
                          do_print=do_print)
        utils.print_issue('SUMMARY', 'Efficiency: {:.2%}'.format(efficiency),
                          do_print=do_print)
        utils.print_issue('SUMMARY', 'NET WIN: {:.2f}'.format(net_income),
                          do_print=do_print)
        utils.print_issue(None, '=' * 80, do_print=do_print)
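# Worked example of the bookkeeping above: each round trip multiplies the
# stake by sell/buy (fees already folded into the prices), so the running
# reward is entry_money * cumprod(ratios) and per-trade wins are its diffs,
# with the first win inserted against the initial stake. Numbers are
# illustrative only:
import numpy as np

entry_money = 200.0
buy = np.array([10.0, 11.0])
sell = np.array([12.0, 10.5])
ratios = sell / buy                          # [1.2, 0.9545...]
rewards = entry_money * np.cumprod(ratios)   # [240.0, 229.09...]
wins = np.insert(np.diff(rewards), 0, rewards[0] - entry_money)
# wins -> [40.0, -10.909...]: one good call, one bad call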