def getTrainedModel1(self):
    # Build a matrix of LF votes for each comment ticket.
    LF_matrix = self.make_Ls_matrix(self.LF_set['comments'], self.LFs)

    # Get true labels for the LF set.
    Y_LF_set = np.array(self.LF_set['resolution'])

    display(
        lf_summary(sparse.csr_matrix(LF_matrix),
                   Y=Y_LF_set,
                   lf_names=self.LF_names.values()))
    # label_coverage returns a float, so convert before concatenating.
    print("label coverage: " + str(label_coverage(LF_matrix)))

    # Baseline: majority vote over the LF outputs.
    mv = MajorityLabelVoter()
    Y_train_majority_votes = mv.predict(LF_matrix)
    print("classification report:\n" +
          classification_report(Y_LF_set, Y_train_majority_votes))

    Ls_train = self.make_Ls_matrix(self.train, self.LFs)

    # You can tune the learning rate and class balance.
    model = LabelModel(k=2, seed=123)
    model.train_model(Ls_train,
                      n_epochs=2000,
                      print_every=1000,
                      lr=0.0001,
                      class_balance=np.array([0.2, 0.8]))
    Y_train = model.predict(Ls_train)

    print('Trained Label Model Metrics:')
    # Score on the LF set, the only split with gold labels available here.
    scores = model.score((LF_matrix, Y_LF_set),
                         metric=['accuracy', 'precision', 'recall', 'f1'])
    print(scores)
    return model, Y_train
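# `make_Ls_matrix` is defined elsewhere on this class. A minimal sketch of
# what it plausibly does, assuming each LF is a callable returning a vote in
# {0, 1, 2} (0 = abstain) for a single example; illustrative only, not the
# original implementation.
import numpy as np

def make_Ls_matrix_sketch(examples, LFs):
    votes = np.zeros((len(examples), len(LFs)), dtype=int)
    for i, example in enumerate(examples):
        for j, lf in enumerate(LFs):
            votes[i, j] = lf(example)  # one vote per (example, LF) pair
    return votes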
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # from get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # Labelling function analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # Train the label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    # Evaluate the label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # Compare against a majority vote over the LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # Probabilistic training labels for the end model, shape (377*50, 2)
    Ytrain_p = label_model.predict_proba(L["train"])

    # End Model
    # Create datasets and dataloaders; the loaders hold 18850 train, 1500
    # dev, and 1000 test examples, split into batches of args.batch_size.
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size,
                                  args.num_workers)

    # Define input encoder
    cnn_encoder = FrameEncoderOC

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction="attention",
        encoder_class=cnn_encoder,
    )

    # Define end model
    end_model = EndModel(
        input_module=lstm_module,
        layer_out_dims=[hidden_size, num_classes],
        optimizer="adam",
        batchnorm=True,
        seed=123,
        verbose=False,
        device=device,
    )

    dropout = 0.4

    # Train end model
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.45, 0.55],
        batchnorm=True,
        input_dropout=dropout,
        middle_dropout=dropout,
    )

    # Evaluate end model
    end_model.score(data_loader["dev"],
                    verbose=True,
                    metric=['accuracy', 'precision', 'recall', 'f1'])
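# A possible driver for train_model, assuming the `args.*` attributes used
# above come from an argparse parser. The defaults here are illustrative
# assumptions, not values from the original code.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train label and end models")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--weight_decay", type=float, default=1e-4)
    parser.add_argument("--lr", type=float, default=1e-3)
    parser.add_argument("--n_epochs", type=int, default=10)
    train_model(parser.parse_args())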
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 108  # from get_frm_output_size()

    device = torch.device('cuda:0') if torch.cuda.is_available() else 'cpu'

    L, Y = load_labels(args)

    # Label Model
    # Labelling function analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # Majority vote over the LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # Train the label model (no temporal modelling)
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    # Evaluate the label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # Naive label model without temporal modelling.
    # L["train"] is (18850, 5), L["dev"] is (1500, 5), Y["dev"] is (1500,).
    m_per_task = L["train"].todense().shape[1]  # 5
    MRI_data_naive = {
        'Li_train':
        torch.FloatTensor(np.array(L["train"].todense().astype('int_'))),
        'Li_dev': torch.FloatTensor(np.array(L["dev"].todense())),
        'R_dev': Y["dev"]
    }
    MRI_data_naive['class_balance'] = torch.FloatTensor([0.5, 0.5]).to(device)

    # Train the naive model
    naive_model = DPLabelModel(
        m=m_per_task,
        T=1,
        edges=[],
        coverage_sets=[[0]] * m_per_task,
        mu_sharing=[[i] for i in range(m_per_task)],
        phi_sharing=[],
        device=device,
        # class_balance=MRI_data_naive['class_balance'],
        seed=0)
    optimize(naive_model,
             L_hat=MRI_data_naive['Li_train'],
             num_iter=300,
             lr=1e-3,
             momentum=0.8,
             clamp=True,
             seed=0)

    # Evaluate the naive model
    R_pred = naive_model.predict(MRI_data_naive['Li_dev']).data.numpy()
    R_pred = 2 - R_pred  # flip predictions into the {1, 2} label convention
    for metric in ['accuracy', 'f1', 'recall', 'precision']:
        score = metric_score(MRI_data_naive['R_dev'], R_pred, metric)
        print(f"{metric.capitalize()}: {score:.3f}")

    # Label model with temporal modelling.
    # Reshape the dataset to (patients, frames, LFs).
    num_frames = 50
    n_patients_train = round(L["train"].todense().shape[0] / num_frames)  # 377
    n_patients_dev = round(L["dev"].todense().shape[0] / num_frames)  # 30
    Ltrain = np.reshape(np.array(L["train"].todense()),
                        (n_patients_train, num_frames, -1))  # (377, 50, 5)
    Ldev = np.reshape(np.array(L["dev"].todense()),
                      (n_patients_dev, num_frames, -1))  # (30, 50, 5)
    Ydev = np.reshape(Y["dev"], (n_patients_dev, num_frames))  # (30, 50)

    # Subsample frames 3, 13, 23, 33, 43
    indices = np.linspace(2, 42, 5).astype(int)
    m_per_task = 5
    T = 5
    Ltrain_small = Ltrain[:, indices, :]  # shape (377, 5, 5)
    Ldev_small = Ldev[:, indices, :]  # shape (30, 5, 5)
    Ydev_small = Ydev[:, indices]  # shape (30, 5)
    Ltrain_small = np.reshape(
        Ltrain_small, ((n_patients_train * T), m_per_task))  # shape (1885, 5)
    Ldev_small = np.reshape(
        Ldev_small, ((n_patients_dev * T), m_per_task))  # shape (150, 5)
    Ydev_small = np.reshape(Ydev_small, ((n_patients_dev * T),))  # shape (150,)

    MRI_data_temporal = {
        'Li_train':
        torch.LongTensor(Ltrain_small).view(n_patients_train,
                                            (m_per_task * T)),
        'Li_dev':
        torch.LongTensor(Ldev_small).view(n_patients_dev, (m_per_task * T)),
        'R_dev':
        torch.LongTensor(Ydev_small)[::T] * (2**T - 1),
        'm':
        m_per_task * T,
        'T':
        T
    }
    MRI_data_temporal['class_balance'] = normalize(
        (MRI_data_temporal['R_dev'].unsqueeze(1) ==
         torch.arange(2**T, device=device).unsqueeze(0)).sum(0).float(),
        dim=0,
        p=1)

    # Train one Markov model per seed and keep them all
    max_seed = 10
    temporal_models = [None] * max_seed
    for seed in range(max_seed):
        markov_model = DPLabelModel(
            m=m_per_task * T,
            T=T,
            edges=[(i, i + m_per_task) for i in range((T - 1) * m_per_task)],
            coverage_sets=[[t] for t in range(T) for _ in range(m_per_task)],
            mu_sharing=[[t * m_per_task + i for t in range(T)]
                        for i in range(m_per_task)],
            phi_sharing=[[(t * m_per_task + i, (t + 1) * m_per_task + i)
                          for t in range(T - 1)] for i in range(m_per_task)],
            device=device,
            class_balance=MRI_data_temporal['class_balance'],
            seed=seed)
        optimize(markov_model,
                 L_hat=MRI_data_temporal['Li_train'],
                 num_iter=1000,
                 lr=1e-5,
                 momentum=0.8,
                 clamp=True,
                 verbose=False,
                 seed=seed)
        temporal_models[seed] = markov_model

    # Evaluate each seed's model on the dev set
    for seed, model in enumerate(temporal_models):
        R_pred = model.predict(MRI_data_temporal['Li_dev'].cpu())
        F1 = metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                          R_pred.cpu() > 0, 'f1')
        accuracy = metric_score(MRI_data_temporal['R_dev'].cpu(),
                                R_pred.cpu(), 'accuracy')
        print(f"seed={seed} accuracy={accuracy:.3f} F1={F1:.3f}")
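    # A possible follow-up (not in the original code): keep the Markov model
    # whose seed gives the best dev-set F1 for downstream use.
    f1_scores = [
        metric_score(MRI_data_temporal['R_dev'].cpu() > 0,
                     model.predict(MRI_data_temporal['Li_dev'].cpu()).cpu() > 0,
                     'f1') for model in temporal_models
    ]
    best_seed = int(np.argmax(f1_scores))
    best_model = temporal_models[best_seed]
    print(f"best seed by dev F1: {best_seed}")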
plt.plot(
    list(acc_df.transpose().index),
    acc_df.transpose()[0],
    "bo-",
    label="DaG",
)
plt.legend()

# In[23]:

label_model.train_model(correct_L[:, 0:7],
                        n_epochs=1000,
                        print_every=200,
                        seed=100,
                        lr=0.01,
                        l2=1.08)
label_model.score(
    (correct_L_train[:, 0:7], candidate_dfs['train']['curated_dsh'].apply(
        lambda x: 1 if x > 0 else 2).values))

# In[24]:

lf_stats = zip(lf_names,
               range(0, label_model.mu.detach().clone().numpy().shape[0], 2))
# pd.np is deprecated; call numpy directly.
estimated_param = np.clip(label_model.mu.detach().clone().numpy(), 0.01, 0.99)
value_type = ["P(L=1|Y=1)", "P(L=1|Y=2)", "P(L=2|Y=1)", "P(L=2|Y=2)"]
data = []
for lf_name, lf_index in lf_stats:
    data += list(
        zip([lf_name] * len(value_type),
            estimated_param[lf_index:lf_index + 2, :].flatten(), value_type))
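# In[ ]:

# A possible follow-up (not in the original notebook): gather the estimated
# LF parameters into a DataFrame for easier inspection or plotting. The
# column names are illustrative assumptions.
param_df = pd.DataFrame(data, columns=["lf_name", "estimate", "value_type"])
print(param_df.head())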
# `balance` was computed before this excerpt; judging by the prints below it
# holds the sorted (label, count) pairs for the test set.
balance = sorted(Counter(Y_test).items())
balance2 = Counter(Y_test).values()
new_balance = []
for elem in balance:
    new_balance.append(elem[1] / sum(balance2))
print(sorted(Counter(Y_test).items()))
print(balance)
print(new_balance)

# Train the label model with the empirical class balance as a prior
label_model = LabelModel(k=2, seed=123)
label_model.train_model(Ls[0],
                        class_balance=new_balance,
                        n_epochs=500,
                        log_train_every=50)
score = label_model.score((Ls[1], Ys[1]))

print('Trained Label Model Metrics:')
scores = label_model.score((Ls[1], Ys[1]),
                           metric=['accuracy', 'precision', 'recall', 'f1'])

mv = MajorityLabelVoter(seed=123)
print('Majority Label Voter Metrics:')
scores = mv.score((Ls[1], Ys[1]),
                  metric=['accuracy', 'precision', 'recall', 'f1'])

Y_train_ps = label_model.predict_proba(Ls[0])
Y_dev_p = label_model.predict(Ls[1])
"""
mv2 = MajorityClassVoter()
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # from get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # Labelling function analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # Train the label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=500,
                            log_train_every=50)

    # Evaluate the label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # Compare against a majority vote over the LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # Probabilistic training labels for the end model, shape (377*50, 2)
    Ytrain_p = label_model.predict_proba(L["train"])

    # End Model
    # Create datasets and dataloaders; the loaders hold 18850 train, 1500
    # dev, and 1000 test examples, split into batches of args.batch_size.
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size,
                                  args.num_workers)

    # Define input encoder
    cnn_encoder = FrameEncoderOC

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction="attention",
        encoder_class=cnn_encoder,
    )

    # Arguments for the hyperparameter search
    train_args = [data_loader["train"]]
    train_kwargs = {
        'seed': args.seed,
        'progress_bar': True,
        'log_train_every': 1
    }
    init_args = [[hidden_size, num_classes]]
    init_kwargs = {
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": True,
        "use_cuda": torch.cuda.is_available(),
        'checkpoint_dir': args.checkpoint_dir,
        'seed': args.seed,
        'device': device
    }
    search_space = {
        'n_epochs': [10],
        'batchnorm': [True],
        'dropout': [0.1, 0.25, 0.4],
        'lr': {'range': [1e-3, 1e-2], 'scale': 'log'},
        'l2': {'range': [1e-5, 1e-4], 'scale': 'log'},
    }
    log_config = {"log_dir": "./run_logs", "run_name": 'cnn_lstm_oc'}
    max_search = 5
    tuner_config = {"max_search": max_search}
    validation_metric = 'accuracy'

    # Set up logger and searcher
    tuner = RandomSearchTuner(EndModel,
                              **log_config,
                              log_writer_class=TensorBoardWriter,
                              validation_metric=validation_metric,
                              seed=1701)

    # Run the random hyperparameter search
    disc_model = tuner.search(
        search_space,
        valid_data=data_loader["dev"],
        train_args=train_args,
        init_args=init_args,
        init_kwargs=init_kwargs,
        train_kwargs=train_kwargs,
        max_search=tuner_config["max_search"],
        clean_up=False,
    )

    # Evaluate the best end model found by the search
    disc_model.score(data_loader["dev"],
                     verbose=True,
                     metric=['accuracy', 'precision', 'recall', 'f1'])
val_ground = remap_labels(loader.val_ground)

# Remap the label values stored in the sparse LF matrices, then transpose to
# (num examples, num LFs).
L_train_sparse = sparse.csc_matrix(
    (remap_labels(L_train_sparse.data), L_train_sparse.indices,
     L_train_sparse.indptr)).T
L_val_sparse = sparse.csc_matrix(
    (remap_labels(L_val_sparse.data), L_val_sparse.indices,
     L_val_sparse.indptr)).T

print('\n\n####### Running METAL Label Model ########')
label_model = LabelModel()
label_model.train_model(L_train_sparse,
                        n_epochs=200,
                        print_every=50,
                        seed=123,
                        verbose=False)
train_marginals = label_model.predict_proba(L_train_sparse)
label_model.score((L_train_sparse, train_ground), metric=metrics)

####### METAL with Exact Class Balance ########
print(
    '\n\n####### Running METAL Label Model with exact class balance ########')
train_class_balance = np.array([
    np.sum(train_ground == 1) / loader.train_num,
    np.sum(train_ground == 2) / loader.train_num
])
val_class_balance = np.array([
    np.sum(val_ground == 1) / loader.val_num,
    np.sum(val_ground == 2) / loader.val_num
])
print('Train set class balance:', train_class_balance)
print('Val set class balance:', val_class_balance)

label_model2 = LabelModel(seed=123)
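# `remap_labels` is defined outside this excerpt. A minimal stand-in, assuming
# the raw LF votes arrive as {-1, +1} and MeTaL expects the categorical
# convention {1, 2} with 0 reserved for abstains; this is an assumption, not
# the original helper.
import numpy as np

def remap_labels_sketch(y):
    y = np.asarray(y)
    out = np.zeros_like(y)
    out[y == 1] = 1   # positive votes -> class 1
    out[y == -1] = 2  # negative votes -> class 2
    return out        # zeros (abstains) stay 0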
def train_model(args):
    hidden_size = 128
    num_classes = 2
    encode_dim = 1000  # from get_frm_output_size()

    L, Y = load_labels(args)

    # Label Model
    # Labelling function analysis
    print(lf_summary(L["dev"], Y=Y["dev"]))

    # Train the label model
    label_model = LabelModel(k=num_classes, seed=123)
    label_model.train_model(L["train"],
                            Y["dev"],
                            n_epochs=2000,
                            log_train_every=100)

    # Evaluate the label model
    print('Trained Label Model Metrics:')
    label_model.score((L["dev"], Y["dev"]),
                      metric=['accuracy', 'precision', 'recall', 'f1'])

    # Compare against a majority vote over the LFs
    mv = MajorityLabelVoter(seed=123)
    print('Majority Label Voter Metrics:')
    mv.score((L["dev"], Y["dev"]),
             metric=['accuracy', 'precision', 'recall', 'f1'])

    # Probabilistic training labels for the end model, shape (377*50, 2)
    Ytrain_p = label_model.predict_proba(L["train"])

    # End Model
    # Create datasets and dataloaders; the loaders hold 18850 train, 1500
    # dev, and 1000 test examples, split into batches of args.batch_size.
    train, dev, test = load_dataset(args, Ytrain_p, Y["dev"], Y["test"])
    data_loader = get_data_loader(train, dev, test, args.batch_size,
                                  args.num_workers)

    # Define input encoder
    cnn_encoder = FrameEncoderOCDense

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Define LSTM module
    lstm_module = LSTMModule(
        encode_dim,
        hidden_size,
        bidirectional=False,
        verbose=False,
        lstm_reduction=args.lstm_reduction,
        encoder_class=cnn_encoder,
        encoder_kwargs={"requires_grad": args.requires_grad})

    # Define end model
    init_kwargs = {
        "layer_out_dims": [hidden_size, num_classes],
        "input_module": lstm_module,
        "optimizer": "adam",
        "verbose": False,
        "input_batchnorm": False,
        "use_cuda": torch.cuda.is_available(),
        'seed': args.seed,
        'device': device
    }
    end_model = EndModel(**init_kwargs)

    # Save the init kwargs alongside the checkpoints so the model can be
    # reconstructed later.
    if not os.path.exists(args.checkpoint_dir):
        os.mkdir(args.checkpoint_dir)
    with open(args.checkpoint_dir + '/init_kwargs.pickle', "wb") as f:
        pickle.dump(init_kwargs, f, protocol=pickle.HIGHEST_PROTOCOL)

    dropout = 0.4

    # Train end model
    end_model.train_model(
        train_data=data_loader["train"],
        valid_data=data_loader["dev"],
        l2=args.weight_decay,
        lr=args.lr,
        n_epochs=args.n_epochs,
        log_train_every=1,
        verbose=True,
        progress_bar=True,
        loss_weights=[0.55, 0.45],
        input_dropout=0.1,
        middle_dropout=dropout,
        checkpoint_dir=args.checkpoint_dir,
    )

    # Evaluate end model
    print("Dev Set Performance")
    end_model.score(
        data_loader["dev"],
        verbose=True,
        metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
    print("Test Set Performance")
    end_model.score(
        data_loader["test"],
        verbose=True,
        metric=['accuracy', 'precision', 'recall', 'f1', 'roc-auc', 'ndcg'])
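# A possible counterpart (not in the original code): reload the pickled init
# kwargs to rebuild the end model architecture from a checkpoint directory;
# trained weights would still need to be restored from the checkpoints
# themselves.
import pickle

def load_end_model_sketch(checkpoint_dir):
    with open(checkpoint_dir + '/init_kwargs.pickle', "rb") as f:
        init_kwargs = pickle.load(f)
    return EndModel(**init_kwargs)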