def testdl():
    """Smoke-test the training DataLoader by pulling batches from it.

    Builds the train and validation loaders from ``InputGenJ`` and iterates
    the training loader, stopping once the batch with index 100 has been
    fetched. Nothing is returned; the function only exercises the pipeline.
    """
    num_workers = 32
    ig = InputGenJ(no_underlying=True)
    # ig = InputGenH(no_underlying=True)
    trainds = ig.get_train()
    validds = ig.get_valid()
    traindl = DataLoader(dataset=trainds,
                         batch_size=param_bs,
                         num_workers=num_workers,
                         collate_fn=pad_collate,
                         pin_memory=True)
    # Built but never iterated in this function.
    validdl = DataLoader(dataset=validds,
                         batch_size=param_bs,
                         num_workers=num_workers // 2,
                         collate_fn=pad_collate,
                         pin_memory=False)
    # Each batch is unpacked into three items so a collate function that
    # yields a different arity fails loudly here.
    for i, (_inputs, _target, _loss_type) in enumerate(traindl):
        if i == 100:
            break
def __init__(self):
    """Precompute the constant prediction vector ``self.pred``.

    Counts how often each death code index occurs among the rows of
    ``ig.death`` whose first index level is in ``ig.death_rep_person_id``,
    divides every count by the number of such patients, and stores the
    result, prefixed with a single ``0``, as a NumPy array.
    """
    super(Dumb, self).__init__()
    ig = InputGenJ(no_underlying=True, death_only=True)
    code_to_index = ig.death_code_dict
    # equals to the intersection
    total_patients = len(ig.death_rep_person_id)
    # A set gives the same O(1) membership test as the former dict of zeros.
    known_patients = set(ig.death_rep_person_id)

    counter = Counter(
        code_to_index[row['code']]
        for index, row in tqdm.tqdm(ig.death.iterrows())
        if index[0] in known_patients
    )

    # NOTE(review): Counter.values() is ordered by first appearance of each
    # index, not by the index itself, and indices that never occur are
    # absent. If position k of ``pred`` is meant to line up with code index
    # k, this needs an index-addressed array instead -- confirm with callers.
    code_proportion = [count / total_patients for count in counter.values()]
    self.pred = np.array([0] + code_proportion)
def plottoe():
    """Write each patient's time-to-event in months to ``toe_frequency.csv``.

    The time-to-event is the death date minus the ``latest`` date from
    ``ig.earla``, converted to whole months; negative values are set to 0
    before the series is written.
    """
    ig = InputGenJ(elim_rare_code=True, no_underlying=True, death_only=True,
                   debug=True)

    # One row per rep_person_id (ensure uniqueness), without the 'id' column.
    deaths = (
        ig.death["death_date"]
        .to_frame()
        .reset_index()
        .drop_duplicates(subset="rep_person_id")
        .drop(['id'], axis=1)
    )
    latest = ig.earla["latest"].to_frame()

    # 19583, thank god
    frame = pd.merge(deaths, latest, how='left',
                     on=['rep_person_id']).dropna()

    for column in ("death_date", "latest"):
        frame[column] = frame[column].apply(lambda x: x.to_datetime64())
    frame["toe"] = frame["death_date"] - frame["latest"]

    toe = frame["toe"].astype("timedelta64[M]").astype("int")
    toe[toe < 0] = 0
    toe.to_csv("toe_frequency.csv")
def plotcod():
    """Dump the code dictionaries of every coded column to ``igstats/``.

    For each ``(dfn, col)`` pair in ``ig.bar_separated + ig.no_bar`` three
    CSV files are written:

    * ``<dfn>_<col>_dict.csv``  -- the mapping from ``ig.get_dict``
    * ``<dfn>_<col>_count.csv`` -- the number of times the key has appeared
      in the final ig (``ig.get_count_dict``)
    * ``<dfn>_<col>_sort.csv``  -- keys sorted by the order of frequency
      (``ig.get_sort_dic``)

    The output directory is created if it does not exist yet.
    """
    import os

    ig = InputGenJ(elim_rare_code=True, no_underlying=True, death_only=True,
                   debug=True)

    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs("igstats", exist_ok=True)

    for dfn, col in ig.bar_separated + ig.no_bar:
        # (file suffix, name of the value column, mapping to export)
        exports = (
            ("dict", "index", ig.get_dict(dfn, col)),
            ("count", "count", ig.get_count_dict(dfn, col)),
            ("sort", "place", ig.get_sort_dic(dfn, col)),
        )
        for suffix, value_column, mapping in exports:
            df = pd.DataFrame({"code": list(mapping.keys()),
                               value_column: list(mapping.values())})
            df.to_csv("igstats/" + dfn + "_" + col + "_" + suffix + ".csv")
    print("Done")
def calcexpcod():
    """Print the cause-of-death BCE loss of always predicting the prior.

    The vector returned by ``get_death_code_proportion`` is used as the
    prediction for every sample of the cached training set. The printed
    number is the unweighted mean of the per-batch ``nn.BCELoss`` values.
    """
    ig = InputGenJ()
    bs = 32
    train_set = ig.get_train_cached()
    bce = nn.BCELoss()
    prior_probability = get_death_code_proportion(ig)
    prior = torch.from_numpy(prior_probability).float().cuda()
    # One identical row of prior probabilities per sample in a full batch.
    output = prior.unsqueeze(0).repeat(bs, 1)
    loader = DataLoader(dataset=train_set, batch_size=bs, num_workers=8,
                        collate_fn=pad_collate)

    losses = []
    for _inputs, target, _loss_type, _time_length in tqdm.tqdm(loader):
        # Column 0 of the target is skipped; the rest is compared with the
        # prior.
        cause_of_death_target = target[:, 1:].cuda().float()
        # The last batch may hold fewer than ``bs`` samples, so trim the
        # prediction to the batch instead of waiting for BCELoss to raise.
        batch_output = output[:cause_of_death_target.shape[0]]
        cod_loss = bce(batch_output, cause_of_death_target)
        losses.append(cod_loss.item())

    # 0.09312667474150657
    print(sum(losses) / len(losses))
def calcexptoe():
    """Print baseline statistics of the time-to-event, measured in months.

    The time-to-event is the death date minus the ``latest`` date from
    ``ig.earla``. Printed, in order: its mean, the mean smooth-L1 distance
    to that mean, and the mean smooth-L1 distance to zero. The series is
    also handed to ``onedgeomedian`` before and after negative values are
    set to 0.
    """

    def smooth_l1(x):
        # Quadratic below 1, linear from 1 upwards.
        return 0.5 * x * x if x < 1 else x - 0.5

    ig = InputGenJ(elim_rare_code=True, no_underlying=True, death_only=True,
                   debug=True)

    # One row per rep_person_id (ensure uniqueness), without the 'id' column.
    deaths = (
        ig.death["death_date"]
        .to_frame()
        .reset_index()
        .drop_duplicates(subset="rep_person_id")
        .drop(['id'], axis=1)
    )
    latest = ig.earla["latest"].to_frame()

    # 19583, thank god
    frame = pd.merge(deaths, latest, how='left',
                     on=['rep_person_id']).dropna()

    for column in ("death_date", "latest"):
        frame[column] = frame[column].apply(lambda x: x.to_datetime64())
    frame["toe"] = frame["death_date"] - frame["latest"]
    toe = frame["toe"].astype("timedelta64[M]").astype("int")

    mean_toe = toe.mean()
    # 7.233
    print(mean_toe)
    # smoothl1loss
    # 11.276917505612655
    print((toe - mean_toe).abs().apply(smooth_l1).mean())
    print("we done")

    onedgeomedian(toe)
    # the geo median is 0
    # 7.423658275034469
    print(toe.abs().apply(smooth_l1).mean())

    # Clamp in place only now: the figures above use the unclamped values.
    toe[toe < 0] = 0
    onedgeomedian(toe)
    # still zero. well.
    print("DONE")
def main(load, savestr='default', lr=1e-3, beta=0.01, kill_time=True):
    """Build the prior-initialised DNC and hand it to ``train``.

    :param load: when truthy, model, optimizer and progress counters are
        restored through ``load_model`` using ``savestr``.
    :param savestr: tag used in the log file name and passed on to
        ``load_model`` and ``train``.
    :param lr: learning rate of the Adam optimizer that is created when no
        optimizer was restored.
    :param beta: passed on to ``train`` unchanged.
    :param kill_time: passed on to ``train`` unchanged.
    :return: None
    """
    total_epochs = 40
    iter_per_epoch = int(saturation / param_bs)
    optim = None
    starting_epoch = 0
    starting_iteration = 0
    logfile = "log/dnc_" + savestr + "_" + datetime_filename() + ".txt"
    num_workers = 32

    ig = InputGenJ(no_underlying=True, death_only=True, debug=True)
    # ig = InputGenH(no_underlying=True)
    param_x = ig.input_dim
    param_v_t = ig.output_dim
    target_dim = param_v_t

    trainds = ig.get_train()
    validds = ig.get_valid()
    # Settings shared by both loaders; only dataset and worker count differ.
    loader_options = dict(batch_size=param_bs, collate_fn=pad_collate,
                          pin_memory=True)
    traindl = DataLoader(dataset=trainds, num_workers=num_workers,
                         **loader_options)
    validdl = DataLoader(dataset=validds, num_workers=num_workers // 2,
                         **loader_options)

    print("Generating prior")
    prior_probability = get_death_code_proportion(ig)

    print("Using", num_workers, "workers for training set")
    computer = PriorDNC(x=param_x, h=param_h, L=param_L, v_t=param_v_t,
                        W=param_W, R=param_R, N=param_N,
                        prior=prior_probability)

    # load model:
    if load:
        print("loading model")
        restored = load_model(computer, optim, starting_epoch,
                              starting_iteration, savestr)
        computer, optim, starting_epoch, starting_iteration = restored

    computer = computer.cuda()

    if optim is not None:
        # An Adadelta optimizer was tried here at some point; the restored
        # optimizer is used as it is.
        optimizer = optim
        # Report the learning rate carried by the restored optimizer.
        for group in optimizer.param_groups:
            print("Currently using a learing rate of ", group["lr"])
    else:
        # does adamax perform better with sparse output?
        optimizer = torch.optim.Adam(computer.parameters(), lr=lr)

    # A per-label positive weight (negatives / positives, derived from the
    # pickled death-code counts in dcc.pkl) used to be built here for a
    # weighted BCE loss. It is disabled: pos_weight does not appear in
    # PyTorch 0.3.1, so the plain BCEWithLogitsLoss is used.
    real_criterion = TOELoss()
    binary_criterion = nn.BCEWithLogitsLoss()

    # starting with the epoch after the loaded one
    train(computer, optimizer, real_criterion, binary_criterion,
          traindl, validdl, int(starting_epoch), total_epochs,
          int(starting_iteration), iter_per_epoch, target_dim, savestr,
          beta, logfile, kill_time)
# make latex table # no, this does not work. from death.post.inputgen_planJ import InputGenJ ig = InputGenJ(cached=False) for dfn in ig.dfn: df = getattr(ig, dfn) print(dfn) print(df.to_latex())
def main(load, savestr, lr=1e-3, beta=1e-3):
    """Build the LSTM baseline and hand it to ``train``.

    :param load: when truthy, model, optimizer and progress counters are
        restored through ``load_model`` using ``savestr``.
    :param savestr: tag used in the log file name and passed on to
        ``load_model`` and ``train``.
    :param lr: learning rate of the Adam optimizer that is created when no
        optimizer was restored.
    :param beta: passed on to ``train`` unchanged.
    :return: None
    """
    total_epochs = 10
    iter_per_epoch = 2019
    optim = None
    starting_epoch = 0
    starting_iteration = 0
    logfile = "log/lstm_" + savestr + "_" + datetime_filename() + ".txt"
    num_workers = 16

    ig = InputGenJ()
    trainds = ig.get_train()
    validds = ig.get_valid()
    # NOTE(review): the test split is fetched but not used in this function.
    testds = ig.get_test()

    # Settings shared by both loaders; only dataset and worker count differ.
    loader_options = dict(batch_size=8, collate_fn=pad_collate, shuffle=True)
    validdl = DataLoader(dataset=validds, num_workers=num_workers // 2,
                         **loader_options)
    traindl = DataLoader(dataset=trainds, num_workers=num_workers,
                         **loader_options)

    print("Using", num_workers, "workers for training set")
    lstm = lstmwrapperJ(input_size=ig.input_dim, output_size=ig.output_dim)

    # load model:
    if load:
        print("loading model")
        restored = load_model(lstm, optim, starting_epoch,
                              starting_iteration, savestr)
        lstm, optim, starting_epoch, starting_iteration = restored

    lstm = lstm.cuda()

    if optim is not None:
        # An Adadelta optimizer was tried here at some point; the restored
        # optimizer is used as it is.
        optimizer = optim
        # Report the learning rate carried by the restored optimizer.
        for group in optimizer.param_groups:
            print("Currently using a learing rate of ", group["lr"])
    else:
        optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)

    real_criterion = TOELoss()

    # A per-label positive weight (negatives / positives, derived from the
    # pickled death-code counts in dcc.pkl) used to be built here for a
    # weighted BCE loss. It is disabled in favour of plain BCEWithLogitsLoss.
    binary_criterion = nn.BCEWithLogitsLoss()

    # starting with the epoch after the loaded one
    train(lstm, optimizer, real_criterion, binary_criterion,
          traindl, validdl, int(starting_epoch), total_epochs,
          int(starting_iteration), iter_per_epoch, savestr, beta,
          ig.output_dim, logfile)
""" beforep = torch.sigmoid(beforepipe) afterp = torch.sigmoid(afterpipe) beforep = beforep.clamp(1e-6, 1 - 1e-6) afterp = afterp.clamp(1e-6, 1 - 1e-6) pos = beforepipe * torch.log(beforep / afterp) neg = (1 - beforep) * torch.log((1 - beforep) / (1 - afterp)) return pos + neg if __name__ == '__main__': from death.post.inputgen_planJ import InputGenJ, pad_collate from death.final.losses import TOELoss real_criterion = TOELoss() binary_criterion = DiscreteCrossEntropy() igj = InputGenJ() trainig = igj.get_train() d1 = trainig[10] d2 = trainig[11] # input=torch.empty(64,400,7298).uniform_() # target=torch.empty(64,435).uniform_() input, target, loss_type, time_length = [ t.cuda() for t in pad_collate((d1, d2)) ] model = TransformerMixedAttnSoftmax(binary_criterion=binary_criterion, real_criterion=real_criterion, input_size=7298, output_size=435, prior=None).cuda() all_loss, lml, lat, lem, lvat, toe_loss, output = model.one_pass( input, time_length, target, loss_type)
self.w_1 = nn.Linear(d_in, d_hid) # position-wise self.w_2 = nn.Linear(d_hid, d_in) # position-wise self.dropout = nn.Dropout(dropout) def forward(self, x): residual = x # output = x.transpose(1, 2) # this generates nan output = self.w_2(F.relu(self.w_1(x))) assert (output == output).all() # output = output.transpose(1, 2) output = self.dropout(output) assert (output == output).all() return output if __name__ == '__main__': from death.post.inputgen_planJ import InputGenJ ig = InputGenJ(elim_rare_code=True, no_underlying=True, death_only=True, debug=True) i, t, _, _ = ig[4069] simple = Simple(7298, 435) i = torch.from_numpy(i) t = torch.from_numpy(t) i = i.unsqueeze(0) t = t.unsqueeze(0) o = simple(i) print(o)