Example 1
 def show_possibilities(self, tickers='all', *args, **kwargs):
     do_print = utils.parse_kwargs("do_print", kwargs, True)
     if tickers == 'all':
         tickers = self.tickers
     else:
         tickers = utils.check_ticker_input(tickers_input=tickers,
                                            tickers_avail=self.tickers,
                                            do_print=do_print)
     for ticker in tickers:
         utils.print_issue(None, '=' * 80, do_print=do_print)
         utils.print_issue('INFO', 'Current ticker: {}'.format(ticker), do_print=do_print)
         #check if last value is nan:
         last_value_index = -1
         if not np.isnan(self.data[ticker][last_value_index]):
             utils.print_issue('WARNING', 'Last value of data set is not NaN!',
                               do_print=do_print)
             input_message = 'Proceed anyways? '
             if not self._get_answer(input_message=input_message):
                 continue
         else:
             last_value_index = -2
         if self.break_values[ticker] is None:
             utils.print_issue('ERROR', 'No break values computed for this ticker!',
                               do_print=do_print)
             continue
         deviation = utils.parse_kwargs('deviation', kwargs, error_arg=.0125)
         bottom_value, top_value = self.break_values[ticker]
         middle_value = (top_value - bottom_value)*.5 + bottom_value
         bottom_value *= (1 - deviation)
         top_value *= (1 + deviation)
         test_values = [bottom_value, middle_value, top_value]
         for value in test_values:
             utils.print_issue(None, '-' * 80, do_print=do_print)
             utils.print_issue('INFO', 'Result for value: {}'.format(value),
                               do_print=do_print)
             #create an imag_model:
             test_model = self.copy_model()
             #assign the value to the last entry:
             test_model.data[ticker][-1] = value
             #init model
             test_model._init_model(do_print=False)
             test_model.eval_model(do_print=False)
             p_range = utils.parse_kwargs('plot_range', kwargs, None)
             p_index = utils.parse_kwargs('plot_from_index', kwargs, None)
             p_date = utils.parse_kwargs('plot_from_date', kwargs, None)
             switch_axes = utils.parse_kwargs('switch_axes', kwargs, False)
             return_plot = utils.parse_kwargs("return_plot", kwargs, False)
             save_figures = utils.parse_kwargs("save_figures", kwargs, False)
             fig_name = "{}_imag_value_{:.2f}".format(ticker, value)
             output_folder = utils.parse_kwargs("output_folder", kwargs, None)
             plotting.plot_model(model=test_model,
                                 tickers=ticker,
                                 plot_range=p_range,
                                 plot_from_index=p_index,
                                 plot_from_date=p_date,
                                 plot_break_values=True,
                                 switch_axes=switch_axes,
                                 return_plot=return_plot,
                                 output_folder=output_folder,
                                 save_figures=save_figures,
                                 fig_name=fig_name)
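
Note: nearly every snippet in this section leans on utils.parse_kwargs(key, kwargs, error_arg). The helper itself is never shown; the following is a minimal sketch of what it plausibly does, judging from the call sites (an assumption, not the actual utils implementation):

def parse_kwargs(key, kwargs, error_arg=None):
    #return the value stored under `key`, falling back to `error_arg`
    #when the caller did not supply it
    return kwargs.get(key, error_arg)

#e.g. parse_kwargs('deviation', kwargs, error_arg=.0125) yields the caller's
#deviation if given, otherwise the default 0.0125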
Example 2
    def check_for_nan_values(self, tickers='all', exclude_last_value=True,
                             *args, **kwargs):
        #TODO: CODE THIS FUNCTION !
        do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
        if tickers == 'all':
            tickers = self.tickers
        else:
            tickers = utils.check_ticker_input(tickers_input=tickers,
                                               tickers_avail=self.tickers,
                                               do_print=do_print)
        for ticker in tickers:
            if exclude_last_value:
                #keep the (possibly NaN) last value, it may be an appended slot:
                nan_indices = np.where(np.isnan(self.data[ticker][:-1]))[0]
            else:
                utils.print_issue('INFO', 'Last value is also considered for removal.',
                                  do_print=do_print)
                nan_indices = np.where(np.isnan(self.data[ticker]))[0]
            if nan_indices.size > 0:
                input_message = 'Remove {} NaN values? '.format(nan_indices.size)
                if self._get_answer(input_message=input_message):
                    #nan_indices holds integer positions, not a boolean mask,
                    #so build the mask explicitly before filtering:
                    mask = np.ones(self.data[ticker].shape[0], dtype=bool)
                    mask[nan_indices] = False
                    self.data[ticker] = self.data[ticker][mask]
            else:
                utils.print_issue('INFO', 'No NaN values detected.',
                                  do_print=do_print)
Example 3
 def stock_price(cls, iteration_=1, **kwargs):
     """generate stock spot through stochastic process"""
     _rand = rand_norm(0, 1, iteration_)
     _isp, _rate, _div, _vol, _t = parse_kwargs(
         kwargs, ['isp', 'rate', 'div', 'vol', 't'], 0)
     return _isp * exp((_rate - _div - _vol**2 / 2) * _t +
                       _vol * sqrt(_t) * _rand)
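
The returned expression is the closed-form geometric Brownian motion step, S_T = isp * exp((rate - div - vol**2/2) * t + vol * sqrt(t) * Z) with Z ~ N(0, 1). A self-contained NumPy sketch of the same formula (the rand_norm and parse_kwargs helpers above are assumed to come from the surrounding codebase):

import numpy as np

def stock_price(iteration_=1, isp=100.0, rate=0.05, div=0.0, vol=0.2, t=1.0):
    #sample terminal spot prices under risk-neutral geometric Brownian motion
    z = np.random.normal(0.0, 1.0, iteration_)
    drift = (rate - div - vol ** 2 / 2) * t
    return isp * np.exp(drift + vol * np.sqrt(t) * z)

#Monte Carlo sanity check: the sample mean approaches isp * exp((rate - div) * t)
print(stock_price(iteration_=100000).mean())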
Example 4
def setup_devices(ids):
    if ids == '':
        return {'main': -1}
    devices = parse_kwargs(ids)
    for key in devices:
        devices[key] = int(devices[key])
    return devices
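
Here parse_kwargs is a different helper again: it turns a device-id string into a dict. A hypothetical parser consistent with the usage above, assuming comma-separated name:id pairs (the actual format is not shown in the source):

def parse_kwargs(ids):
    #hypothetical: 'main:0,aux:1' -> {'main': '0', 'aux': '1'}
    return dict(pair.split(':') for pair in ids.split(','))

#setup_devices('main:0,aux:1') would then return {'main': 0, 'aux': 1},
#while setup_devices('') short-circuits to the CPU marker {'main': -1}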
Example 5
 def _get_locs(self, ticker, *args, **kwargs):
     do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
     if len(self.local_min) > 1:
         buy_locs = self.local_min[ticker][0] + self.buy_delay
         sell_locs = self.local_max[ticker][0] + self.buy_delay
     else:
         buy_locs = self.local_min[ticker] + self.buy_delay
         sell_locs = self.local_max[ticker] + self.buy_delay
     try:
         if buy_locs[0] > sell_locs[0]:
             sell_locs = sell_locs[1:]
     except IndexError:
         utils.print_issue('INFO', 'First sell position will not be displayed.',
                            do_print=do_print)
     #check locs:
     if buy_locs.shape[0] > sell_locs.shape[0]:
         utils.print_issue('INFO', 'Open position.', do_print=do_print)
     elif buy_locs.shape[0] < sell_locs.shape[0]:
         try:
             sell_locs[0] = buy_locs[0]
         except IndexError:
             utils.print_issue('INFO', 'No buy locations occurred. '
                               'Sell locations are set to buy locations.',
                               do_print=do_print)
             sell_locs = buy_locs
     return buy_locs, sell_locs
Example 6
    def _init_model(self, *args, **kwargs):
        '''
        Function to set up the price model. The idea is to locate the inflection
        points of the difference of "moving average converging diverging (macd)"
        and "Signal Line (signal_line)". These indicate local up and down trends.
        The actual buy and sell prices are therefore taken on the following
        day, i.e. shifted by buy_delay.
        Inputs:
            - periods: days to calculate the macd (first two values)
            and Signal Line (last value).
                default: 12, 26, 9
            - buy_delay: days to shift the buy and sell dates past an inflection point
                default: 1
            - grad_return: return the "gradient" of the model, i.e. the model itself
                default: True
        Outputs:
            - local_min: Buy prices
            - local_max: Sell prices
            - grad: "gradient" of the model (optionally)
        '''
        do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
        utils.print_issue('INIT', 'Initialising model for tickers: {}'.format(self.tickers),
                          do_print=do_print)
        macd = self._calc_ema(self.data, self.periods[0]) - self._calc_ema(self.data, self.periods[1])
        signal_line = self._calc_ema(macd, self.periods[2])
        if len(self.tickers) == 1:
            grad = np.gradient(macd[self.tickers[0]] -
                               signal_line[self.tickers[0]])
        else:
            grad = np.gradient(macd - signal_line)
        local_min, local_max, grad_dict = {}, {}, {}
        if isinstance(grad, list):
            utils.print_issue('WARNING', 'Ignoring second entry of gradient!',
                               do_print=do_print)
            grad = grad[0].T
            for n in range(grad.shape[0]):
                local_min[self.tickers[n]] = argrelextrema(grad[n], np.less)
                local_max[self.tickers[n]] = argrelextrema(grad[n], np.greater)
        else:
            local_min[self.tickers[0]] = argrelextrema(grad, np.less)[0]
            local_max[self.tickers[0]] = argrelextrema(grad, np.greater)[0]
        #store the gradient per ticker in a dict:
        if len(grad.shape) == 1:
            grad_dict[self.tickers[0]] = grad
        else:
            for n, ticker in enumerate(self.tickers):
                grad_dict[ticker] = grad[n]

        self.local_min = local_min
        self.local_max = local_max
        self.grad = grad_dict
        utils.print_issue('INIT', 'Successfully initialized model.',
                          do_print=do_print)
        utils.print_issue(None, '*' * 80,
                          do_print=do_print)
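
_init_model builds on _calc_ema, which is not shown. A minimal sketch of a compatible exponential moving average, assuming the price data is held in a pandas DataFrame (an assumption, not the class's actual implementation):

import pandas as pd

def _calc_ema(data, period):
    #exponentially weighted moving average over `period` days
    return data.ewm(span=period, adjust=False).mean()

#MACD as used above: EMA(12) - EMA(26), then smoothed with a 9-day EMA
#to obtain the signal line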
Example 7
 def append_timedelta(self, timedelta=1, overwrite_data=True, *args, **kwargs):
     do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
     new_entry = self.data.index[-1] + pd.Timedelta(days=timedelta)
     final_entries = list(self.data.index)
     final_entries.append(new_entry)
     idx = pd.DatetimeIndex(final_entries)
     new_data = self.data.reindex(idx)
     if overwrite_data:
         utils.print_issue('INFO', 'New data was appended.',
                            do_print=do_print)
         self.data = new_data
     else:
         return new_data
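
The reindex call appends exactly one NaN row at the new date; that empty slot is what show_possibilities later overwrites with test values. The mechanism in isolation:

import pandas as pd

data = pd.DataFrame({'TICKER': [1.0, 2.0, 3.0]},
                    index=pd.date_range('2021-01-01', periods=3))
new_entry = data.index[-1] + pd.Timedelta(days=1)
new_data = data.reindex(pd.DatetimeIndex(list(data.index) + [new_entry]))
print(new_data)  #the appended last row is NaN until a test value is assigned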
Example 8
def train(**kwargs):
    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)

    # init model
    model = GradientBoostingClassifier(n_estimators=100,
                                       learning_rate=0.1,
                                       verbose=1,
                                       n_iter_no_change=10,
                                       random_state=10)

    train_feat, train_label = train_data.get_feat_data()

    print("Start Training.")
    model.fit(train_feat, train_label)

    print("Training Finished.")

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_feat, test_label = test_data.get_feat_data()

    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_feat,
                                                      test_label,
                                                      top_k_list=[3, 5, 10])

    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 9
def train(**kwargs):
    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    model = TextCNN(**model_param)
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if model_param["use_gpu"]:
        model.cuda()

    print("Model Inited.")
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)

            if model_param["use_gpu"]:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)

            # labels are 1-indexed (1, 2, ..., 27); shift to 0-indexed
            # for the multi-class cross-entropy loss
            loss = F.cross_entropy(pred, label - 1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], model, "textcnn")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 10
def train(**kwargs):
    w2v_model_name = "./ckpt/w2v"

    if os.path.exists(w2v_model_name):
        print("load word2vec model from", w2v_model_name)
        # load model directly
        w2v_model = Word2Vec.load(w2v_model_name)

    else:
        # load data
        filename = "./dataset/EHR/train/data.txt"
        fin = open(filename, "r")
        corpus = []
        for line in fin.readlines():
            corpus.append(line.strip().split()[2:])

        # learn word2vec model
        start_time = time.time()
        w2v_model = Word2Vec(corpus,
                             size=64,
                             window=3,
                             min_count=1,
                             workers=4,
                             sg=1)
        w2v_model.save("./ckpt/w2v")
        print("training done, costs {} secs.".format(time.time() - start_time))

    # start training and testing the MLP model
    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # let's build a MLP for prediction
    model_param["w2v_model"] = w2v_model
    model = MLP(**model_param)

    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if model_param["use_gpu"]:
        model.cuda()

    print("Model Inited.")
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=model_param["weight_decay"])

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        model.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred = model.forward(feat)

            if model_param["use_gpu"]:
                label = torch.LongTensor(dise).cuda()
            else:
                label = torch.LongTensor(dise)

            # labels are 1-indexed (1, 2, ..., 27); shift to 0-indexed
            # for the multi-class cross-entropy loss
            loss = F.cross_entropy(pred, label - 1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg
        metric_result, eval_log, eval_result = evaluate_clf(
            model, val_data_loader, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], model, "med2vec")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate_clf(model,
                                                      test_data_loader,
                                                      top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 11
def train(**kwargs):
    setup_seed(2020)
    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # init model
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    gnn = HGNN_SDS(**model_param)
    if model_param["w2v"] is not None:
        # load w2v data
        gnn.load_symp_embed(model_param["w2v"])

    if use_gpu:
        gnn.cuda()

    print("Model Inited.")

    sds_sampler = SDS_sampler("dataset/EHR")

    # load pmi ss mat
    symp2symp_mat = sp.load_npz(os.path.join("dataset/EHR", "pmi_ss_mat.npz"))
    symp2symp_mat.setdiag(0)

    # number of batches covering all symptoms
    num_total_batch = gnn.num_symp // model_param["batch_size"]
    all_symp_index = np.arange(1, gnn.num_symp + 1)

    # linearly ramp the hard-negative ratio over the training epochs
    def lambda_hard_r(epoch):
        return epoch * model_param["hard_ratio"] / model_param["num_epoch"]

    # build hard map and pos map
    symp2symp_hard_map = [0]
    symp2symp_pos_map = [0]
    for k in all_symp_index:
        symp2symp_b_ar = symp2symp_mat[k].toarray().flatten()
        max_index = np.argmax(symp2symp_b_ar)
        if max_index == 0:
            symp2symp_pos_map.append(np.random.randint(1, k))
            symp2symp_hard_map.append(np.random.randint(1, k))

        else:
            symp2symp_pos_map.append(max_index)
            symp2symp_b_ar[max_index] = -1
            max_2nd_index = np.argmax(symp2symp_b_ar)
            if max_2nd_index == 0:
                symp2symp_hard_map.append(np.random.randint(1, k))
            else:
                symp2symp_hard_map.append(max_2nd_index)

    symp2symp_hard_map = np.array(symp2symp_hard_map)
    symp2symp_pos_map = np.array(symp2symp_pos_map)
    print("Pos / Hard symptom map Inited.")

    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=model_param["weight_decay"])
    last_total_loss = 1e10

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()
        np.random.shuffle(all_symp_index)

        hard_ratio = lambda_hard_r(epoch)

        for idx in range(num_total_batch):
            batch_size = model_param["batch_size"]
            batch_symp = all_symp_index[idx * batch_size:(idx + 1) * batch_size]

            # get pos symp and neg symp
            pos_symp = symp2symp_pos_map[batch_symp]

            # sample neg
            neg_symp = np.random.randint(1, gnn.num_symp,
                                         model_param["batch_size"])

            # cope with overlapping in pos and neg symps
            overlap_index = (neg_symp == pos_symp)
            overlap_symp = neg_symp[overlap_index]
            neg_symp[overlap_index] = symp2symp_hard_map[overlap_symp]

            if hard_ratio > 0:
                num_hard = int(hard_ratio * model_param["batch_size"])
                neg_symp[:num_hard] = symp2symp_hard_map[neg_symp[:num_hard]]

            batch_symp_ts = torch.LongTensor(batch_symp)
            pos_symp_ts = torch.LongTensor(pos_symp)
            neg_symp_ts = torch.LongTensor(neg_symp)

            if model_param["use_gpu"]:
                batch_symp_ts = batch_symp_ts.cuda()
                pos_symp_ts = pos_symp_ts.cuda()
                neg_symp_ts = neg_symp_ts.cuda()

            # forward batch symp
            batch_symp_data = sds_sampler(batch_symp, 1, 20)
            symp_emb = gnn.forward(batch_symp_ts, batch_symp_data)

            pos_symp_data = sds_sampler(pos_symp, 1, 20)
            pos_emb = gnn.forward(pos_symp_ts, pos_symp_data)

            neg_symp_data = sds_sampler(neg_symp, 1, 20)
            neg_emb = gnn.forward(neg_symp_ts, neg_symp_data)

            # create loss: margin ranking (hinge), the positive pair should
            # score at least 1.0 higher than the negative pair
            scores = torch.clamp(1.0 + symp_emb.mul(neg_emb).sum(1) -
                                 symp_emb.mul(pos_emb).sum(1), min=0)
            loss = scores.mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        if total_loss - last_total_loss > 0:
            print("Loss stops to decrease, converge.")
            break

        last_total_loss = total_loss

    # save model
    torch.save(gnn.state_dict(), "./ckpt/sds_gnn.pt")
    print("Model saved.")
Example 12
def main(**kwargs):
    # parse parameters
    param = default_config()
    param.update({
        "mode": "sds",
        "top_k": 10,
        "ckpt": "ckpt/gnn.pt",
        "use_gpu": False
    })

    param.update(kwargs)

    # read maps
    symp2id, id2symp = read_symp2id()
    dise2id, id2dise = read_dise2id()

    # read data
    datapath = os.path.join("dataset/EHR/test/data.txt")
    fin = open(datapath, "r", encoding="utf-8")
    lines = fin.readlines()

    data_model = ehr.EHR("dataset/EHR", "train")

    # init retrieval system
    ehr_ret = EHR_retrieval(mode=param["mode"])

    # init and load model
    data_model_param = parse_data_model(data_model)
    param.update(data_model_param)
    param = parse_kwargs(param, kwargs)
    gnn = HGNN(**param)

    if param["use_gpu"]:
        gnn.cuda()

    ckpt_path = param.get("ckpt")
    if ckpt_path is None:
        print("[Warning] ckpt path not set, loading from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, param["use_gpu"])

    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")

    gnn.eval()

    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)

    # init result list
    before_list = []
    after_list = []
    real_dise_list = []
    init_symp_list = []
    after_symp_list = []

    result_map_bfo = defaultdict(list)
    result_map_aft = defaultdict(list)
    # this is top_k for evaluation p@N, Rec@N, ...
    top_k_list = [1, 5]

    for i, line in enumerate(lines):
        line_data = line.strip().split()
        uid = line_data[0]
        did = line_data[1]
        real_dise_list.append(did)
        symps = line_data[2:]

        # select the first symptom and do inference
        init_symp = symps[0]
        init_symp_list.append(id2symp[init_symp])

        symp_ar = np.array([[init_symp]])

        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)

        # calculate statistics
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_bfo)

        # print("true did:", did)
        # print("before:", pred_rank)
        before_list.append(pred_rank[0])

        rank_symp = ehr_ret(symp_idx=init_symp, top_k=param["top_k"])
        after_symp_list.append([id2symp[str(t)] for t in rank_symp])
        symp_ar = [np.concatenate([[init_symp], rank_symp], 0)]

        # symp_ar = np.array([symps])
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=5)
        for top_k in top_k_list:
            pred_top_k = pred_rank[0][:top_k]
            calculate_rec_ndcg(pred_top_k, int(did), top_k, result_map_aft)

        # print("after:", pred_rank)
        after_list.append(pred_rank[0])

        if i % 100 == 0:
            print("[line]:", i)

    # summary
    bf_log = build_result_log(result_map_bfo, top_k_list)
    af_log = build_result_log(result_map_aft, top_k_list)

    print("[before]: {}".format(bf_log))
    print("[after]: {}".format(af_log))

    # to result csv
    fout = open("retrieval_result_{}.txt".format(param["mode"]),
                "w",
                encoding="utf-8")
    fout.write("did\tbefore_pred\tafter_pred\tinit_symp\taftersymp\n")
    for i in range(len(init_symp_list)):
        wrtline = "\t".join([
            id2dise[int(real_dise_list[i])],
            id2dise[int(before_list[i][0])],
            id2dise[int(after_list[i][0])],
            init_symp_list[i],
            "#".join(after_symp_list[i]),
        ]) + "\n"
        fout.write(wrtline)

    fin.close()
    fout.close()

    df_res = pd.read_table("retrieval_result_{}.txt".format(param["mode"]))
    df_res.to_excel("retrieval_result_{}.xlsx".format(param["mode"]),
                    encoding="utf-8")
    print("Done")
Example 13
def plot_model(model,
               tickers='all',
               plot_range=None,
               plot_from_index=None,
               plot_from_date=None,
               plot_break_values=True,
               switch_axes=False,
               **kwargs):
    '''
    Function to plot a model.
    Inputs:
        - model: model of class MODEL
        - tickers: tickers to plot
            default: all, i.e. tickers in input class MODEL
        - plot_range: range to plot of type pandas.date_range()
            default: None, i.e. complete data set
        - plot_from_index: integer index to start the plot from
            default: None
        - plot_from_date: date to start the plot from
            default: None
        - plot_break_values: if available, plot break_values of input class MODEL
            default: True
        - switch_axes: swap the gradient and price subplots
            default: False
    '''
    do_print = utils.parse_kwargs("do_print", kwargs, True)
    if tickers == 'all':
        tickers = model.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=model.tickers,
                                           do_print=do_print)
    for ticker in tickers:
        if plot_range is not None:
            x_axis = model.data[ticker][plot_range].index
            indices = np.where(np.isin(model.data[ticker].index,
                                       plot_range))[0]
        elif plot_from_index is not None:
            x_axis = model.data[ticker].index[plot_from_index:]
            indices = np.arange(plot_from_index,
                                model.data[ticker].index.shape[0], 1)
        elif plot_from_date is not None:
            idx = model.data[ticker].index.get_loc(plot_from_date).start
            x_axis = model.data[ticker].index[idx:]
            indices = np.arange(idx, model.data[ticker].index.shape[0], 1)
        else:
            x_axis = model.data[ticker].index
            indices = np.arange(0, x_axis.shape[0], 1)

        grad = model.grad[ticker][indices]
        min_arg = np.where(model.local_min[ticker] >= indices[0])
        max_arg = np.where(model.local_max[ticker] >= indices[0])
        try:
            local_min = model.local_min[ticker][min_arg]
            local_max = model.local_max[ticker][max_arg]
            in_loop = False
        except TypeError:
            #loop over tickers:
            in_loop = True
            local_min = model.local_min[ticker][0][min_arg[1]]
            local_max = model.local_max[ticker][0][max_arg[1]]
        price = model.data[ticker][indices]
        try:
            buy_dates = model.ticker_df[ticker]['Buy Dates'].values[min_arg[0]]
            if in_loop:
                buy_dates = model.ticker_df[ticker]['Buy Dates'].values[
                    min_arg[1]]
        except IndexError:
            utils.print_issue(
                'INFO',
                'New buy signal was detected for last value: {}.'.format(
                    model.data[ticker][-1]),
                do_print=do_print)
            buy_dates = model.ticker_df[ticker]['Buy Dates'].values[min_arg[0]
                                                                    [:-1]]
            if in_loop:
                buy_dates = model.ticker_df[ticker]['Buy Dates'].values[
                    min_arg[1][:-1]]
            buy_dates = np.hstack(
                (buy_dates,
                 model.data[ticker].index[local_min[-1] + 1].to_numpy()))
        try:
            sell_dates = model.ticker_df[ticker]['Sell Dates'].values[
                max_arg[0]]
            if in_loop:
                sell_dates = model.ticker_df[ticker]['Sell Dates'].values[
                    max_arg[1]]
        except IndexError:
            utils.print_issue(
                'INFO',
                'New sell signal was detected for last value: {}.'.format(
                    model.data[ticker][-1]),
                do_print=do_print)
            sell_dates = model.ticker_df[ticker]['Sell Dates'].values[
                max_arg[0][:-1]]
            if in_loop:
                sell_dates = model.ticker_df[ticker]['Sell Dates'].values[
                    max_arg[1][:-1]]
            sell_dates = np.hstack(
                (sell_dates,
                 model.data[ticker].index[local_max[-1] + 1].to_numpy()))

        #Generating plots:
        fig, axs = plt.subplots(2, 1, figsize=(16, 9), sharex=True)
        if switch_axes:
            ax_indices = [1, 0]
        else:
            ax_indices = [0, 1]
        axs[ax_indices[0]].fill_between(x_axis,
                                        0,
                                        grad,
                                        where=grad > 0,
                                        facecolor='green',
                                        interpolate=True,
                                        label='Up Trend')
        axs[ax_indices[0]].fill_between(x_axis,
                                        0,
                                        grad,
                                        where=grad <= 0,
                                        facecolor='red',
                                        interpolate=True,
                                        label='Down Trend')
        axs[ax_indices[0]].vlines(model.data[ticker].index[local_min],
                                  np.min(grad),
                                  np.max(grad),
                                  color='g',
                                  label='Min Reached')
        axs[ax_indices[0]].vlines(model.data[ticker].index[local_max],
                                  np.min(grad),
                                  np.max(grad),
                                  color='r',
                                  label='Peak Reached')
        #layout:
        axs[ax_indices[0]].set_title('{} - MODEL'.format(ticker),
                                     fontsize='larger')
        axs[ax_indices[0]].set_ylabel('Gradient [-]', fontsize='larger')
        #subplot 2:
        axs[ax_indices[1]].plot(x_axis, price, label='{}'.format(ticker))
        axs[ax_indices[1]].vlines(buy_dates,
                                  np.min(price),
                                  np.max(price),
                                  color='g',
                                  label='Buy Dates')
        axs[ax_indices[1]].vlines(sell_dates,
                                  np.min(price),
                                  np.max(price),
                                  color='r',
                                  linestyle='--',
                                  label='Sell dates')

        if plot_break_values:
            if model.break_values is not None:
                axs[ax_indices[1]].hlines(model.break_values[ticker][0],
                                          x_axis[0],
                                          x_axis[-1],
                                          color='k',
                                          label='Break value {:.5f}'.format(
                                              model.break_values[ticker][0]))
                axs[ax_indices[1]].hlines(model.break_values[ticker][1],
                                          x_axis[0],
                                          x_axis[-1],
                                          color='c',
                                          label='Break value {:.5f}'.format(
                                              model.break_values[ticker][1]))
        #layout:
        axs[ax_indices[1]].set_title('{} - PRICE'.format(ticker),
                                     fontsize='larger')
        axs[ax_indices[1]].set_ylabel('Price', fontsize='larger')
        #settings for all plots:
        axs[np.sort(ax_indices)[-1]].set_xlabel('Date', fontsize='larger')
        for ax in axs:
            ax.grid()
            ax.legend(loc='upper left')
        save_figures = utils.parse_kwargs(key="save_figures",
                                          kwargs=kwargs,
                                          error_arg=False)
        return_plot = utils.parse_kwargs(key="return_plot",
                                         kwargs=kwargs,
                                         error_arg=False)
        output_folder = utils.parse_kwargs(key="output_folder",
                                           kwargs=kwargs,
                                           error_arg=False)
        fig_name = utils.parse_kwargs(key="fig_name",
                                      kwargs=kwargs,
                                      error_arg="{}_evaluation".format(ticker))
        if fig_name is not None:
            plt.suptitle(fig_name)
        if all([save_figures, output_folder, fig_name]):
            fname = "{}/{}.pdf".format(output_folder, fig_name)
            plt.savefig(fname=fname)
            plt.close()
            message = "Exported: %s" % fname
            utils.print_issue("INFO", message, do_print=do_print)
            #return
        if return_plot:
            return plt
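
A typical call, assuming a fitted MODEL instance (the variable names here are hypothetical):

plot_model(model=my_model,
           tickers='all',
           plot_from_date='2021-06-01',
           save_figures=True,
           output_folder='figures')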
Example 14
def train(**kwargs):

    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    dataset_name = model_param["dataset"]

    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()

    # load training data
    train_data = ehr.EHR("dataset/{}".format(dataset_name), "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/{}".format(dataset_name), "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    gnn = HGNN(**model_param)
    if kwargs["w2v"] is not None:
        if os.path.exists(kwargs["w2v"]):
            # load w2v data
            gnn.load_symp_embed(kwargs["w2v"])
        else:
            from gensim.models import Word2Vec
            # build word2vec embeddings
            filename = "./dataset/EHR/train/data.txt"
            fin = open(filename, "r")
            corpus = []
            for line in fin.readlines():
                corpus.append(line.strip().split()[2:])
            # learn word2vec model
            start_time = time.time()
            w2v_model = Word2Vec(corpus,
                                 size=64,
                                 window=3,
                                 min_count=1,
                                 workers=4,
                                 sg=1)
            w2v_model.save("./ckpt/w2v")
            print("word2vec training done, costs {} secs.".format(time.time() -
                                                                  start_time))

    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if use_gpu:
        gnn.cuda()

    print("Model Inited.")

    # optimizer = torch.optim.Adam(gnn.parameters(),lr=model_param["lr"],weight_decay=model_param["weight_decay"])

    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    # init sampler for negative sampling during training.
    dsd_sampler = DSD_sampler("dataset/{}".format(dataset_name))
    print("D-S-D Sampler Inited.")

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)

            bpr_loss = create_bpr_loss(pred, pred_neg)

            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            # loss = bpr_loss

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            total_loss += bpr_loss.item()
            # print(idx,total_loss)

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg

        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], gnn, "gnn")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/{}".format(dataset_name), "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate(gnn,
                                                  test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 15
def train(**kwargs):

    setup_seed(2020)

    model_param = default_config()
    model_param = parse_kwargs(model_param, kwargs)

    # load hard maps
    if model_param["hard_ratio"] > 0:
        model_param["hard_map"] = np.load("dataset/hard_dise.npy",
                                          allow_pickle=True).item()

    # load training data
    train_data = ehr.EHR("dataset/EHR", "train")
    train_data_loader = DataLoader(train_data,
                                   model_param["batch_size"],
                                   shuffle=True,
                                   num_workers=0,
                                   collate_fn=collate_fn)

    # load validation data
    val_data = ehr.EHR("dataset/EHR", "val")
    val_data_loader = DataLoader(val_data,
                                 model_param["batch_size"],
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # use data model to update model_param
    data_model_param = parse_data_model(train_data)
    model_param.update(data_model_param)
    use_gpu = model_param["use_gpu"]

    # init model
    gnn = HGNN_DSD(**model_param)
    if kwargs["w2v"] is not None:
        # load w2v data
        gnn.load_symp_embed(kwargs["w2v"])
    early_stopper = EarlyStopping(patience=model_param["early_stop"],
                                  larger_better=True)

    if use_gpu:
        gnn.cuda()

    print("Model Inited.")

    # optimizer = torch.optim.Adam(gnn.parameters(),lr=model_param["lr"],weight_decay=model_param["weight_decay"])

    optimizer = torch.optim.Adam(gnn.parameters(),
                                 lr=model_param["lr"],
                                 weight_decay=0)

    # init sampler for negative sampling during training.
    dsd_sampler = DSD_sampler("dataset/EHR")
    print("D-S-D Sampler Inited.")

    for epoch in range(model_param["num_epoch"]):
        total_loss = 0
        gnn.train()

        for idx, (feat, dise) in enumerate(train_data_loader):
            pred, pred_neg, emb_user, emb_dise, neg_emb_dise = gnn.forward(
                feat, dise, dsd_sampler)

            bpr_loss = create_bpr_loss(pred, pred_neg)

            l2_loss = create_l2_loss(emb_user, emb_dise, neg_emb_dise)
            loss = bpr_loss + model_param["weight_decay"] * l2_loss
            # loss = bpr_loss

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            total_loss += bpr_loss.item()
            # print(idx,total_loss)

        print("{} Epoch {}/{}: train loss: {:.6f}".format(
            now(), epoch + 1, model_param["num_epoch"], total_loss))

        # do evaluation on recall and ndcg

        metric_result, eval_log, eval_result = evaluate(
            gnn, val_data_loader, dsd_sampler, [5])
        print("{} Epoch {}/{}: [Val] {}".format(now(), epoch + 1,
                                                model_param["num_epoch"],
                                                eval_log))

        early_stopper(metric_result["ndcg_5"], gnn, "gnn_dsd")

        if early_stopper.early_stop:
            print("[Early Stop] {} Epoch {}/{}: {}".format(
                now(), epoch + 1, model_param["num_epoch"], eval_log))
            break

    # eval on test set
    # load test data
    test_data = ehr.EHR("dataset/EHR", "test")
    test_data_loader = DataLoader(test_data,
                                  model_param["batch_size"],
                                  shuffle=False,
                                  num_workers=0,
                                  collate_fn=collate_fn)

    test_metric, test_log, test_result = evaluate(gnn,
                                                  test_data_loader,
                                                  dsd_sampler,
                                                  top_k_list=[1, 3, 5, 10])
    print("[Test] {}: {}".format(now(), test_log))
    print("Training Done.")
Example 16
def calc_probs(model, time=None, tickers='all', stats_data=None,
               auto_update_tolerances=False, *args, **kwargs):
    """Function to calculate statistics."""
    do_print = utils.parse_kwargs("do_print", kwargs, True)
    if tickers == 'all':
        tickers = model.tickers
    else:
        tickers = utils.check_ticker_input(tickers_input=tickers,
                                           tickers_avail=model.tickers)
    timezone = kwargs.get('timezone')
    start = kwargs.get('start')
    for ticker in tickers:
        utils.print_issue(None, '=' * 80, do_print=do_print)
        utils.print_issue('INFO', 'Current ticker: {}'.format(ticker), do_print=do_print)
        z_values, tols, means = _create_z_values(model=model, ticker=ticker,
                                                 stats_data=stats_data, timezone=timezone,
                                                 start=start,
                                                 auto_update_tolerances=auto_update_tolerances)

        freq_range, frequencies = _create_freq()
        delta_t = model.data.index[-1].to_datetime64() - pd.Timestamp.now().to_datetime64()
        delta_t = pd.Timedelta(delta_t).total_seconds() / 3600

        arg = np.argsort(tols)
        value_arg = np.argsort(model.break_values[ticker])
        probs = ss.norm.cdf(z_values) * 100
        # for positive z-values take the upper tail: P(Z > z) = 1 - CDF(z)
        flip_arg = np.where(z_values > 0)
        probs[flip_arg] = (1 - ss.norm.cdf(z_values[flip_arg])) * 100
        poly_deg = 5
        poly_probs = np.zeros(2)
        fig, axs = plt.subplots(2, 1, figsize=(16, 9), sharex=True, sharey=True)
        for n, ax in enumerate(axs):
            ax.plot(frequencies, probs[n],
                    label='Probability')
            ax.vlines(delta_t, np.min(probs), np.max(probs), label='Time to deadline')
            poly_line = np.poly1d(np.polyfit(freq_range, probs[n], poly_deg))
            ax.plot(frequencies, poly_line(freq_range), 'r', label='Polyfit of deg {}'.format(poly_deg))
            title = 'Ticker: {} - Break Value: {} - Tolerance: {}'.format(
                ticker, model.break_values[ticker][value_arg[n]], tols[arg[n]])
            current_prob = poly_line(delta_t)
            ax.text(x=delta_t - .25, y=(np.max(probs) + np.min(probs))*.5,
                    s='{:.2f}%'.format(current_prob), fontsize='larger')
            ax.set_title(title, fontsize='large')
            ax.legend()
            ax.grid()
            ax.yaxis.get_label().set_fontsize('larger')
            ax.xaxis.get_label().set_fontsize('larger')
            poly_probs[n] = current_prob

        ax.invert_xaxis()
        plt.setp(axs[-1], xlabel='Time to break value [h]')
        plt.setp(axs, ylabel='Probability [%]')
        prob_between = np.abs(np.diff(poly_probs))[0]
        for n, prob in enumerate(poly_probs):
            utils.print_issue('STATS-EVAL',
                               'Probability for tol={:.5f}: {:.2f}%'.format(tols[arg][n], prob))

        utils.print_issue('STATS-EVAL',
                           'Probability between: {:.2f}%'.format(prob_between))
    
        save_figures = utils.parse_kwargs(key="save_figures",
                                          kwargs=kwargs,
                                          error_arg=False)
        return_plot = utils.parse_kwargs(key="return_plot",
                                         kwargs=kwargs,
                                         error_arg=False)
        output_folder = utils.parse_kwargs(key="output_folder",
                                           kwargs=kwargs,
                                           error_arg=False)
        fig_name = "{}_statistics".format(ticker)
        if fig_name is not None:
            plt.suptitle(fig_name)
        if all([save_figures, output_folder, fig_name]):
            fname = "{}/{}.pdf".format(output_folder, fig_name)
            plt.savefig(fname=fname)
            plt.close()
            message = "Exported: %s" %fname
            utils.print_issue("INFO", message, do_print=do_print)
            #return
        if return_plot:
            return plt
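
The tail flip above converts z-scores into crossing probabilities for both signs of z. The same logic in isolation:

import numpy as np
import scipy.stats as ss

z_values = np.array([-1.5, 0.5, 2.0])
probs = ss.norm.cdf(z_values) * 100
flip = z_values > 0
probs[flip] = (1 - ss.norm.cdf(z_values[flip])) * 100
print(probs)  #tail probability in percent for each z-score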
Example 17
def main(**kwargs):
    model_param = default_config()
    model_param.update({"top_k":3})

    model_param = parse_kwargs(model_param, kwargs)


    print("Start evaluating on top {} predictions.".format(model_param["top_k"]))

    # load map
    dise2id, id2dise = read_dise2id("dataset/EHR")

    # load train data model
    data_model = ehr.EHR("dataset/EHR","train")

    test_data = ehr.EHR("dataset/EHR","test")
    test_data_loader  = DataLoader(test_data, 
            model_param["batch_size"], shuffle=False, num_workers=0, collate_fn=collate_fn)

    data_model_param = parse_data_model(data_model)
    model_param.update(data_model_param)

    gnn = HGNN(**model_param)
    if model_param["use_gpu"]:
        gnn.cuda()

    ckpt_path = kwargs.get("ckpt")

    if ckpt_path is None:
        print("[Warning] ckpt path not set, loading from the default path.")
        load_ckpt("ckpt/checkpoint.pt", gnn, model_param["use_gpu"])
    else:
        load_ckpt(ckpt_path, gnn, model_param["use_gpu"])

    dsd_sampler = DSD_sampler("dataset/EHR")
    usu_sampler = USU_sampler("dataset/EHR")

    gnn.eval()

    emb_dise = gnn.gen_all_dise_emb(dsd_sampler)

    rank_list = None
    dise_list = None

    for idx, (feat, dise) in enumerate(test_data_loader):

        this_dise_list = parse_rank(dise, id2dise)

        if dise_list is None:
            dise_list = this_dise_list
        else:
            dise_list = np.r_[dise_list, this_dise_list]

        # get symps
        symp_list = []
        for x in feat:
            symp_list.append(x["symp"])

        symp_ar = np.array(symp_list)

        # re-sampling users embeddings by their symptoms
        pred_rank = gnn.rank_query(symp_ar, emb_dise, usu_sampler, top_k=model_param["top_k"])

        # parse rank for print
        pred_list = parse_rank(pred_rank, id2dise)

        if rank_list is None:
            rank_list = pred_list
        else:
            rank_list = np.r_[rank_list, pred_list]

    # save results
    res_ar = np.c_[dise_list, rank_list]
    df_res = pd.DataFrame(res_ar)
    col_name = ["GroundTruth"] + ["Pred_"+str(i+1) for i in range(rank_list.shape[1])]
    df_res.columns = col_name
    df_res.to_csv("Test_Results.csv", encoding="utf-8")

    print("Test done, save results in", "Test_Results.csv")
Example 18
    def comp_break_values(self, tickers='all', refactor_step_size=1,
                          append_break_values=False, parallel_computing=True,
                          *args, **kwargs):
        do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
        if tickers == 'all':
            tickers = self.tickers
        else:
            tickers = utils.check_ticker_input(tickers_input=tickers,
                                               tickers_avail=self.tickers,
                                               do_print=do_print)
        imag_model = self.copy_model()
        break_values_dict = dict.fromkeys(tickers)
        current_values = dict.fromkeys(tickers, None)
        tolerances = dict.fromkeys(tickers)
        deviation = .3
        utils.print_issue('INFO', 'Compute break values with {:.2%} deviation'.format(deviation),
                           do_print=do_print)

        for ticker in tickers:
            utils.print_issue('INFO', 'Current ticker: {}'.format(ticker),
                               do_print=do_print)
            break_values = [None, None]
            if np.isnan(self.data[ticker].values[-1]):
                value_index = -2
            else:
                value_index = -1
            current_values[ticker] = self.data[ticker].values[value_index]
            #create range:
            start_value = current_values[ticker] * (1 - deviation)
            end_value = current_values[ticker] * (1 + deviation)
            step_size = (current_values[ticker] / 5000) * refactor_step_size
            rng = np.arange(start_value, end_value, step_size)
            try:
                import multiprocessing as mp
            except ModuleNotFoundError:
                utils.print_issue('ERROR', 'Multiprocessing module not available.',
                                   do_print=do_print)
                parallel_computing = False
            if not parallel_computing:
                break_values_dict[ticker] = np.sort(self._comp_bvs(model=imag_model,
                                                                   rng=rng,
                                                                   ticker=ticker))
            else:
                n_procs = mp.cpu_count()
                utils.print_issue('INFO', 'Using {} processes.'.format(n_procs),
                                   do_print=do_print)
                rng_list = self._do_array_split(rng, n_procs)
                from functools import partial
                #bind the fixed arguments; pool.map then supplies the rng chunks:
                inputs_partial = partial(self._comp_bvs, imag_model, ticker=ticker)
                with mp.Pool(processes=n_procs) as pool:
                    bvs = pool.map(inputs_partial, rng_list)
                bv_final = [None, None]
                for bv_list in bvs:
                    for n, bv in enumerate(bv_list):
                        if bv is not None and bv_final[n] is None:
                            bv_final[n] = bv
                        if all(bv_final):
                            break
                break_values_dict[ticker] = np.sort(bv_final)
            #break_values_dict[ticker] is already sorted above:
            tolerances[ticker] = break_values_dict[ticker] - current_values[ticker]

        self.tolerances = tolerances
        self.break_values = break_values_dict
        if append_break_values:
            utils.print_issue('INFO', 'Appending break values to model data',
                              do_print=do_print)
            for ticker in tickers:
                smal_tol = np.argsort(tolerances[ticker])[0]
                self.data[ticker][-1] = break_values_dict[ticker][smal_tol]
                self._init_model(do_print=False)
        else:
            utils.print_issue('INFO', 'Current values: {}'.format(current_values),
                               do_print=do_print)
            utils.print_issue('INFO', 'Break values: {}'.format(break_values_dict),
                               do_print=do_print)
            utils.print_issue('INFO', 'Tolerances: {}'.format(tolerances),
                               do_print=do_print)
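
The functools.partial pattern used for the pool workers, reduced to a standalone example (function and argument names are hypothetical):

from functools import partial
from multiprocessing import Pool

def comp_bvs(model, rng, ticker):
    #stand-in worker: the real _comp_bvs scans `rng` for break values
    return (ticker, sum(rng))

if __name__ == '__main__':
    worker = partial(comp_bvs, 'imag_model', ticker='XYZ')  #rng stays free
    with Pool(processes=2) as pool:
        print(pool.map(worker, [range(3), range(5)]))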
Example 19
    def eval_model(self, tickers='all', entry_money=200, fees=(1.0029, .9954),
                   tax=.25, visualize=False, *args, **kwargs):
        '''
        Function to evaluate the price model predictions
        Inputs:
            - data: price data of asset
            - locs: buy and sell locations, i.e. the return of function price_model()
            - entry_money: initial investment
                default = 200
            - fees: fees on buy and sell prices, i.e. buy the asset at the broker
              for a slightly higher price than the actual asset price, and vice
              versa for sells
                default = (1.0029, .9954), i.e. ~.3% higher buy and ~.5% lower sell price
            - tax: German tax payments for annual wins > 800€
                default = .25, i.e. 25%
            - df_return: return model evaluation as pandas DataFrame
                default = True
        Outputs:
            - net_income: Net Income/win after entry_money (and possibly tax) subtracted
            - df_return: model evaluation as pandas DataFrame
        '''
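        #Worked example for the fee factors (prices are illustrative): with
        #fees = (1.0029, .9954), a nominal buy at 100.00 costs 100.29 and a
        #nominal sell at 110.00 nets 109.49, so the round-trip ratio is
        #109.49 / 100.29 ≈ 1.0918 instead of the frictionless 1.10.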
        do_print = utils.parse_kwargs('do_print', kwargs, error_arg=True)
        if tickers == 'all':
            valid_tickers = self.tickers
        else:
            valid_tickers = utils.check_ticker_input(tickers_input=tickers,
                                                     tickers_avail=self.tickers,
                                                     do_print=do_print)
        utils.print_opening(ticker=valid_tickers,
                            start_date=self.data.index[0].strftime('%D'),
                            end_date=self.data.index[-1].strftime('%D'),
                            initial_investment_per_ticker=entry_money,
                            do_print=do_print)

        if any([self.local_min is None, self.local_max is None, self.grad is None]):
            self._init_model(do_print=do_print)

        for ticker in valid_tickers:
            utils.print_issue('TICKER', ticker, do_print=do_print)
            buy_locs, sell_locs = self._get_locs(ticker=ticker,
                                                 do_print=do_print)
            buy_prices = self.data[ticker][buy_locs]
            buy_dates = self.data[ticker].index.values[buy_locs]
            sell_prices = self.data[ticker][sell_locs]
            sell_dates = self.data[ticker].index.values[sell_locs]

            buy_prices *= fees[0]
            sell_prices *= fees[1]
            #check if nan in prices:
            #TODO:
            '''
            nan_indices = np.isnan(sell_prices)
            sell_prices = sell_prices[~nan_indices]
            buy_prices = buy_prices[~nan_indices]
            nan_indices = np.isnan(buy_prices)
            sell_prices = sell_prices[~nan_indices]
            buy_prices = buy_prices[~nan_indices]
            '''
            n_calls = sell_prices.shape[0]
            if buy_prices.shape[0] > sell_prices.shape[0]:
                #must use to_numpy() since the dates are still stored in the
                #price Series as the index -> pandas divides entries with
                #matching index labels; buy and sell dates obviously differ,
                #hence pandas would return NaN everywhere:
                ratios = sell_prices.to_numpy() / buy_prices.to_numpy()[:-1]
            else:
                ratios = sell_prices.to_numpy() / buy_prices.to_numpy()
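            #each completed round trip compounds the stake by its sell/buy
            #ratio, so cumprod gives the running account value: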
            trade_rewards = entry_money * np.cumprod(ratios)
            #Calculate trade wins
            trade_wins = np.diff(trade_rewards)
            #Insert first win
            try:
                trade_wins = np.insert(trade_wins, 0, trade_rewards[0] - entry_money)
            except IndexError:
                #case with a single buy and no sell yet:
                pass
            #Evaluate Calls
            good_calls = np.where(trade_wins > 0)
            bad_calls = np.where(trade_wins < 0)
            try:
                efficiency = good_calls[0].shape[0] / n_calls
            except ZeroDivisionError:
                efficiency = np.nan
            #TODO: Error handling here:
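            #(trade_rewards - trade_wins) equals the stake before each trade,
            #so win_loss is the per-trade relative return: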
            win_loss = trade_wins / (trade_rewards - trade_wins)
            average_win = np.mean(win_loss[np.where(win_loss > 0)])
            average_loss = np.mean(win_loss[np.where(win_loss < 0)])
            if np.sum(trade_wins) > 800:
                tax_pays = np.sum(trade_wins) * tax
                utils.print_issue('INFO', '{:.2f} tax was paid.'.format(tax_pays),
                                   do_print=do_print)
                net_income = (trade_rewards[-1] - entry_money) * (1 - tax)
            else:
                utils.print_issue('INFO', 'No tax paid.',
                                   do_print=do_print)
                net_income = np.sum(trade_wins)
            #create final DataFrame
            sell_grad = self.grad[ticker][sell_locs - self.buy_delay]
            buy_grad = self.grad[ticker][buy_locs - self.buy_delay]
            #be aware that buy_dates can be one entry longer than sell_dates!
            if buy_dates.shape[0] > sell_dates.shape[0]:
                if sell_dates.shape[0] > 0:
                    utils.print_issue('INFO', 'Last entry of "Sell Dates" will \
be assigned equally as the penultimate one.', do_print=do_print)
                    sell_dates = np.append(sell_dates, sell_dates[-1])
                else:
                    utils.print_issue('INFO', 'First entry of "Sell Dates" \
will be first entry of "Buy Dates".', do_print=do_print)
                    #keep an array (not a scalar) so the DataFrame columns
                    #line up:
                    sell_dates = buy_dates[:1]
                try:
                    sell_prices.loc[pd.Timestamp.max] = np.nan
                except Exception:
                    #intended to catch OverflowError, but the exact exception
                    #type proved unreliable, hence the broad catch:
                    sell_prices.loc[buy_prices.index[-1]] = np.nan
                trade_rewards = np.append(trade_rewards, np.nan)
                trade_wins = np.append(trade_wins, np.nan)
                win_loss = np.append(win_loss, np.nan)
                sell_grad = np.append(sell_grad, np.nan)
            grad_diff = sell_grad - buy_grad
            final_df = pd.DataFrame(data = {'Buy Dates': buy_dates,
                                            'Sell Dates': sell_dates,
                                            'Buy Prices': buy_prices.to_numpy(),
                                            'Sell Prices': sell_prices.to_numpy(),
                                            'Trade Reward': trade_rewards,
                                            'Trade Win': trade_wins,
                                            'Trade Efficiency': win_loss,
                                            'Grad at Buy': buy_grad,
                                            'Grad at Sell': sell_grad,
                                            'Grad Difference': grad_diff})
            self.ticker_df[ticker] = final_df
            utils.print_issue(None, '-' * 80, do_print=do_print)
            utils.print_issue('SUMMARY',
                               'Average trade win: {:.10%}'.format(average_win),
                               do_print=do_print)
            utils.print_issue('SUMMARY',
                               'Average trade loss: {:.10%}'.format(average_loss),
                               do_print=do_print)
            utils.print_issue('SUMMARY',
                               'Efficiency: {:.2%}'.format(efficiency),
                               do_print=do_print)
            utils.print_issue('SUMMARY',
                               'NET WIN: {:.2f}'.format(net_income),
                               do_print=do_print)
            utils.print_issue(None, '=' * 80, do_print=do_print)
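
For reference, a minimal standalone sketch of the reward bookkeeping used in eval_model above, with made-up prices (all numbers are illustrative only; the fee factors are the method's defaults):

import numpy as np

entry_money = 200
buy_prices = np.array([100.0, 105.0]) * 1.0029   #buy fee applied
sell_prices = np.array([110.0, 103.0]) * .9954   #sell fee applied

#per-round-trip growth factors and the running account value:
ratios = sell_prices / buy_prices
trade_rewards = entry_money * np.cumprod(ratios)
#per-trade wins: differences of the running value, with the first
#win measured against the initial stake:
trade_wins = np.insert(np.diff(trade_rewards), 0, trade_rewards[0] - entry_money)
#sum of wins equals final reward minus the initial stake:
assert np.isclose(np.sum(trade_wins), trade_rewards[-1] - entry_money)
print(trade_rewards, trade_wins)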