Exemple #1
0
def testdl():
    """Smoke-test the training data loader.

    Builds the train and validation ``DataLoader`` objects from an
    ``InputGenJ`` dataset and iterates over the training loader, stopping
    once the batch at index 100 has been fetched. Nothing is returned; the
    function only exercises the loading pipeline.
    """
    # Bookkeeping values mirrored from the training entry points; they are
    # not consumed by the loop below.
    total_epochs = 5
    iter_per_epoch = int(saturation / param_bs)
    optim = None
    starting_epoch = 0
    starting_iteration = 0

    num_workers = 32
    ig = InputGenJ(no_underlying=True)
    trainds = ig.get_train()
    validds = ig.get_valid()

    # Settings common to both loaders.
    common = dict(batch_size=param_bs, collate_fn=pad_collate)
    traindl = DataLoader(dataset=trainds,
                         num_workers=num_workers,
                         pin_memory=True,
                         **common)
    # The validation loader gets half the workers and no pinned memory.
    validdl = DataLoader(dataset=validds,
                         num_workers=num_workers // 2,
                         pin_memory=False,
                         **common)

    for batch_index, batch in enumerate(traindl):
        # Each batch is expected to unpack into exactly three items.
        features, target, loss_type = batch
        if batch_index == 100:
            break
Exemple #2
0
    def __init__(self):
        """Precompute a constant prediction from death-code frequencies.

        Walks every row of ``ig.death``, keeps the rows whose first index
        element is one of ``ig.death_rep_person_id``, maps each row's
        ``code`` through ``ig.death_code_dict`` and counts the resulting
        indices. Each count is divided by the number of entries in
        ``ig.death_rep_person_id`` and the result is stored, prefixed with a
        single 0, in ``self.pred``.
        """
        super(Dumb, self).__init__()

        ig = InputGenJ(no_underlying=True, death_only=True)
        series = ig.get_series("death", "code")  # result is not used below
        code_to_index = ig.death_code_dict
        # Denominator for the proportions.
        total_patients = len(ig.death_rep_person_id)
        # Hash-based container for O(1) membership tests inside the loop.
        known_patients = set(ig.death_rep_person_id)

        index_list = [
            code_to_index[row['code']]
            for index, row in tqdm.tqdm(ig.death.iterrows())
            if index[0] in known_patients
        ]

        counter = Counter(index_list)
        # NOTE(review): Counter.values() follows first-seen order of the
        # indices, not ascending index order — confirm that consumers of
        # self.pred expect this layout.
        code_proportion = [
            count / total_patients for count in counter.values()
        ]

        self.pred = np.array([0] + code_proportion)
Exemple #3
0
def plottoe():
    """Write the per-patient time-to-event values to ``toe_frequency.csv``.

    The time to event is the difference between ``death_date`` (from
    ``ig.death``) and ``latest`` (from ``ig.earla``), converted with
    ``astype("timedelta64[M]")`` and then to ``int``. Negative values are
    replaced with 0 before the series is saved.
    """
    ig = InputGenJ(elim_rare_code=True,
                   no_underlying=True,
                   death_only=True,
                   debug=True)
    deaths = ig.death["death_date"].to_frame()
    latest_visit = ig.earla["latest"].to_frame()

    # Keep one death record per rep_person_id and discard the 'id' column
    # that reset_index() exposes.
    deaths = (deaths.reset_index()
              .drop_duplicates(subset="rep_person_id")
              .drop(['id'], axis=1))

    # Left join on the patient id, then drop patients with a missing value.
    merged = pd.merge(deaths,
                      latest_visit,
                      how='left',
                      on=['rep_person_id']).dropna()

    for column in ("death_date", "latest"):
        merged[column] = merged[column].apply(
            lambda stamp: stamp.to_datetime64())

    merged["toe"] = merged["death_date"] - merged["latest"]
    toe = merged["toe"].astype("timedelta64[M]").astype("int")
    # A death recorded before the latest visit is clamped to zero.
    toe[toe < 0] = 0
    toe.to_csv("toe_frequency.csv")
Exemple #4
0
def plotcod():
    """Export the code dictionaries of every (frame, column) pair to CSV.

    For each pair in ``ig.bar_separated + ig.no_bar`` three files are written
    under ``igstats/``: ``*_dict.csv`` (code -> index), ``*_count.csv``
    (code -> count) and ``*_sort.csv`` (code -> place). Prints ``Done`` when
    all pairs have been processed.
    """
    ig = InputGenJ(elim_rare_code=True,
                   no_underlying=True,
                   death_only=True,
                   debug=True)

    for dfn, col in ig.bar_separated + ig.no_bar:
        # Fetch all three mappings before anything is written.
        index_by_code = ig.get_dict(dfn, col)
        count_by_code = ig.get_count_dict(dfn, col)
        place_by_code = ig.get_sort_dic(dfn, col)

        stem = "igstats/" + dfn + "_" + col
        exports = (
            ("_dict.csv", "index", index_by_code),
            ("_count.csv", "count", count_by_code),
            ("_sort.csv", "place", place_by_code),
        )
        for suffix, value_column, mapping in exports:
            frame = pd.DataFrame({
                "code": list(mapping.keys()),
                value_column: list(mapping.values()),
            })
            frame.to_csv(stem + suffix)

    print("Done")
Exemple #5
0
def calcexpcod():
    """Print the mean BCE loss of predicting the prior for every batch.

    The prior vector returned by ``get_death_code_proportion(ig)`` is repeated
    into a ``(bs, n)`` prediction and scored with ``nn.BCELoss`` against
    ``target[:, 1:]`` of every batch of the cached training set. The average
    of the per-batch losses is printed.

    Requires a CUDA device. Raises ``ZeroDivisionError`` if the loader yields
    no batches.
    """
    ig = InputGenJ()
    bs = 32
    tr = ig.get_train_cached()
    bce = nn.BCELoss()
    losses = []
    prior_probability = get_death_code_proportion(ig)
    output = torch.from_numpy(prior_probability).float().cuda()
    output = output.unsqueeze(0).repeat(bs, 1)
    tr = DataLoader(dataset=tr,
                    batch_size=bs,
                    num_workers=8,
                    collate_fn=pad_collate)

    for data in tqdm.tqdm(tr):
        _, target, _, _ = data
        # presumably column 0 of target is not a cause-of-death label —
        # TODO confirm against the dataset definition
        cause_of_death_target = target[:, 1:].cuda().float()
        # The last batch may hold fewer than `bs` samples. Trim the repeated
        # prior to the actual batch size up front instead of waiting for
        # BCELoss to raise on a size mismatch; for full batches the slice is
        # the whole tensor.
        batch_output = output[:cause_of_death_target.shape[0]]
        cod_loss = bce(batch_output, cause_of_death_target)
        losses.append(cod_loss.item())

    # previously observed result: 0.09312667474150657
    print(sum(losses) / len(losses))
Exemple #6
0
def calcexptoe():
    """Print baseline statistics for the time-to-event target.

    Computes the time to event as ``death_date - latest`` (converted with
    ``astype("timedelta64[M]")`` and then to ``int``), then prints, in order:
    its mean, the mean smooth-L1 distance to that mean, ``"we done"``, the
    mean smooth-L1 distance to zero, and ``"DONE"``. ``onedgeomedian`` is
    called on the raw values and again after negative values are set to 0.
    """

    def smooth_l1(distance):
        # Quadratic below 1, linear from 1 upwards.
        if distance < 1:
            return 0.5 * distance * distance
        return distance - 0.5

    ig = InputGenJ(elim_rare_code=True,
                   no_underlying=True,
                   death_only=True,
                   debug=True)
    deaths = ig.death["death_date"].to_frame()
    latest_visit = ig.earla["latest"].to_frame()

    # Keep one death record per rep_person_id and discard the 'id' column
    # that reset_index() exposes.
    deaths = (deaths.reset_index()
              .drop_duplicates(subset="rep_person_id")
              .drop(['id'], axis=1))

    # Left join on the patient id, then drop patients with a missing value.
    merged = pd.merge(deaths,
                      latest_visit,
                      how='left',
                      on=['rep_person_id']).dropna()

    for column in ("death_date", "latest"):
        merged[column] = merged[column].apply(
            lambda stamp: stamp.to_datetime64())

    merged["toe"] = merged["death_date"] - merged["latest"]
    toe = merged["toe"].astype("timedelta64[M]").astype("int")

    # previously observed mean: 7.233
    print(toe.mean())
    # smooth-L1 loss of always predicting the mean
    # previously observed: 11.276917505612655
    print((toe - toe.mean()).abs().apply(smooth_l1).mean())
    print("we done")

    onedgeomedian(toe)
    # The geometric median was observed to be 0, so the loss of predicting it
    # is the smooth-L1 of the values themselves.
    # previously observed: 7.423658275034469
    print(toe.abs().apply(smooth_l1).mean())

    # Clamp negative values and recompute the median (was still zero).
    toe[toe < 0] = 0
    onedgeomedian(toe)

    print("DONE")
Exemple #7
0
def main(load, savestr='default', lr=1e-3, beta=0.01, kill_time=True):
    """Build a ``PriorDNC`` model and hand it to ``train``.

    :param load: when truthy, restore the model, optimizer, epoch and
        iteration through ``load_model`` before training
    :param savestr: tag used in the log file name and passed on to
        ``load_model`` and ``train``
    :param lr: learning rate of the Adam optimizer created when no optimizer
        was loaded
    :param beta: forwarded unchanged to ``train``
    :param kill_time: forwarded unchanged to ``train``
    :return: None
    """
    total_epochs = 40
    iter_per_epoch = int(saturation / param_bs)
    optim = None
    starting_epoch = 0
    starting_iteration = 0
    logfile = "log/dnc_" + savestr + "_" + datetime_filename() + ".txt"

    num_workers = 32
    ig = InputGenJ(no_underlying=True, death_only=True, debug=True)
    input_dim = ig.input_dim
    output_dim = ig.output_dim
    target_dim = output_dim
    trainds = ig.get_train()
    validds = ig.get_valid()

    # Both loaders share everything except the worker count.
    shared_loader_args = dict(batch_size=param_bs,
                              collate_fn=pad_collate,
                              pin_memory=True)
    traindl = DataLoader(dataset=trainds,
                         num_workers=num_workers,
                         **shared_loader_args)
    validdl = DataLoader(dataset=validds,
                         num_workers=num_workers // 2,
                         **shared_loader_args)

    print("Generating prior")
    prior_probability = get_death_code_proportion(ig)

    print("Using", num_workers, "workers for training set")
    computer = PriorDNC(x=input_dim,
                        h=param_h,
                        L=param_L,
                        v_t=output_dim,
                        W=param_W,
                        R=param_R,
                        N=param_N,
                        prior=prior_probability)

    if load:
        print("loading model")
        restored = load_model(computer, optim, starting_epoch,
                              starting_iteration, savestr)
        computer, optim, starting_epoch, starting_iteration = restored

    computer = computer.cuda()

    # Reuse a restored optimizer when there is one, otherwise start a fresh
    # Adam on the model parameters.
    optimizer = optim
    if optimizer is None:
        optimizer = torch.optim.Adam(computer.parameters(), lr=lr)
    else:
        for group in optimizer.param_groups:
            print("Currently using a learing rate of ", group["lr"])

    # A positive-class weighting built from a pickled label-count vector was
    # tried earlier and is disabled; the plain unweighted loss is used.
    real_criterion = TOELoss()
    binary_criterion = nn.BCEWithLogitsLoss()

    train(computer, optimizer, real_criterion, binary_criterion,
          traindl, validdl, int(starting_epoch), total_epochs,
          int(starting_iteration), iter_per_epoch, target_dim, savestr, beta,
          logfile, kill_time)
Exemple #8
0
# make latex table
# no, this does not work.
from death.post.inputgen_planJ import InputGenJ
# Build the input generator with cached=False.
ig = InputGenJ(cached=False)

# ig.dfn is iterated as a collection of attribute names; each named attribute
# is fetched from ig and rendered with its to_latex() method.
for dfn in ig.dfn:
    df = getattr(ig, dfn)
    # Print the attribute name first so each table in the output is labelled.
    print(dfn)
    print(df.to_latex())
Exemple #9
0
def main(load, savestr, lr=1e-3, beta=1e-3):
    """Build an ``lstmwrapperJ`` model and hand it to ``train``.

    :param load: when truthy, restore the model, optimizer, epoch and
        iteration through ``load_model`` before training
    :param savestr: tag used in the log file name and passed on to
        ``load_model`` and ``train``
    :param lr: learning rate of the Adam optimizer created when no optimizer
        was loaded
    :param beta: forwarded unchanged to ``train``
    :return: None
    """
    total_epochs = 10
    iter_per_epoch = 2019
    optim = None
    starting_epoch = 0
    starting_iteration = 0

    logfile = "log/lstm_" + savestr + "_" + datetime_filename() + ".txt"

    num_workers = 16
    ig = InputGenJ()
    trainds = ig.get_train()
    validds = ig.get_valid()
    testds = ig.get_test()  # fetched but not used by this entry point

    # Both loaders share everything except the worker count.
    shared_loader_args = dict(batch_size=8,
                              collate_fn=pad_collate,
                              shuffle=True)
    validdl = DataLoader(dataset=validds,
                         num_workers=num_workers // 2,
                         **shared_loader_args)
    traindl = DataLoader(dataset=trainds,
                         num_workers=num_workers,
                         **shared_loader_args)

    print("Using", num_workers, "workers for training set")
    lstm = lstmwrapperJ(input_size=ig.input_dim, output_size=ig.output_dim)

    if load:
        print("loading model")
        restored = load_model(lstm, optim, starting_epoch,
                              starting_iteration, savestr)
        lstm, optim, starting_epoch, starting_iteration = restored

    lstm = lstm.cuda()

    # Reuse a restored optimizer when there is one, otherwise start a fresh
    # Adam on the model parameters.
    optimizer = optim
    if optimizer is None:
        optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)
    else:
        for group in optimizer.param_groups:
            print("Currently using a learing rate of ", group["lr"])

    real_criterion = TOELoss()
    # A positive-class weighting built from a pickled label-count vector was
    # tried earlier and is disabled; the plain unweighted loss is used.
    binary_criterion = nn.BCEWithLogitsLoss()

    train(lstm, optimizer, real_criterion, binary_criterion, traindl, validdl,
          int(starting_epoch), total_epochs, int(starting_iteration),
          iter_per_epoch, savestr, beta, ig.output_dim, logfile)
Exemple #10
0
        """
        beforep = torch.sigmoid(beforepipe)
        afterp = torch.sigmoid(afterpipe)
        beforep = beforep.clamp(1e-6, 1 - 1e-6)
        afterp = afterp.clamp(1e-6, 1 - 1e-6)
        pos = beforepipe * torch.log(beforep / afterp)
        neg = (1 - beforep) * torch.log((1 - beforep) / (1 - afterp))
        return pos + neg


if __name__ == '__main__':
    # Smoke test: collate two training samples, move them to the GPU and run
    # a single one_pass() of the model on them.
    from death.post.inputgen_planJ import InputGenJ, pad_collate
    from death.final.losses import TOELoss

    real_criterion = TOELoss()
    binary_criterion = DiscreteCrossEntropy()

    igj = InputGenJ()
    trainig = igj.get_train()
    d1, d2 = trainig[10], trainig[11]

    # pad_collate is expected to yield exactly four tensors.
    collated = pad_collate((d1, d2))
    on_gpu = [tensor.cuda() for tensor in collated]
    input, target, loss_type, time_length = on_gpu

    model = TransformerMixedAttnSoftmax(
        binary_criterion=binary_criterion,
        real_criterion=real_criterion,
        input_size=7298,
        output_size=435,
        prior=None,
    )
    model = model.cuda()

    results = model.one_pass(input, time_length, target, loss_type)
    all_loss, lml, lat, lem, lvat, toe_loss, output = results
Exemple #11
0
        self.w_1 = nn.Linear(d_in, d_hid)  # position-wise
        self.w_2 = nn.Linear(d_hid, d_in)  # position-wise
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply ``w_1`` -> ReLU -> ``w_2`` -> dropout to ``x``.

        The input is not added back to the result. An ``AssertionError`` is
        raised if a NaN is present after the second linear layer or after
        dropout.
        """
        hidden = F.relu(self.w_1(x))
        projected = self.w_2(hidden)
        # NaN is the only value that compares unequal to itself.
        assert not (projected != projected).any()

        dropped = self.dropout(projected)
        assert not (dropped != dropped).any()
        return dropped


if __name__ == '__main__':
    # Smoke test: push one sample through a freshly constructed Simple model
    # and print the output.
    from death.post.inputgen_planJ import InputGenJ

    ig = InputGenJ(elim_rare_code=True,
                   no_underlying=True,
                   death_only=True,
                   debug=True)
    # Only the first two of the four items of the sample are used.
    i, t, _, _ = ig[4069]
    simple = Simple(7298, 435)

    # Convert to tensors and prepend a dimension of size 1.
    i = torch.from_numpy(i).unsqueeze(0)
    t = torch.from_numpy(t).unsqueeze(0)

    o = simple(i)
    print(o)