def test_shapes(self, batch_size=1):
    models = utils.build_all_models()
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    train_dataset = utils.construct_sample_dataloader()
    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    def _convert(item: object) -> torch.Tensor:
        if isinstance(item, torch.Tensor):
            return item
        return torch.Tensor(item)

    for batch_i, (s_ids, x, x_chars, y, weight) in enumerate(data_loader):
        outs = [_convert(model(x, x_chars)) for model in models]
        for i in range(1, len(outs)):
            # all models should output the same dimensions
            assert outs[i].shape == outs[i - 1].shape
def _test_single_model_eval(self, model: nn.Module):
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    train_dataset = utils.construct_sample_dataloader()
    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=0,
    )

    f1_data, _ = ner_utils.compute_f1_dataloader(
        model=model,
        data_loader=data_loader,
        tag_vocab=tag_vocab,
    )

    ner_utils.estimate_f1(
        model=model,
        data_loader=data_loader,
        tag_vocab=tag_vocab,
        threshold=1,
    )

    # compute the average F1 across tags
    avg = ner_utils.compute_avg_f1(f1_data)
def _test_single_model_train(self, model: nn.Module):
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    train_dataset = utils.construct_sample_dataloader()
    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=0,
    )

    # the dictionary model has no trainable parameters, so it gets no optimizer
    if not isinstance(model, dictionary_model.DictionaryModel):
        optim = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-9)
    else:
        optim = None

    # iterate over epochs
    for e in range(1):
        loss_sum = 0
        with tqdm(data_loader) as pbar:
            for i, (s_ids, x, x_chars, y, weight) in enumerate(pbar):
                if isinstance(model, dictionary_model.DictionaryModel):
                    # dictionary models learn by memorizing examples
                    model.add_example(x.long(), y.long())
                    continue
                model.zero_grad()
                model_loss = model.compute_mle(x, x_chars, y)
                loss = torch.mean(model_loss)
                loss.backward()  # backpropagate
                optim.step()  # update parameters
                loss_sum += loss.item()
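# A hedged sketch (not in the original file): the two helpers above could be
# driven from a single test that exercises every model returned by
# utils.build_all_models(); the test name below is an assumption.
def test_all_models(self):
    for model in utils.build_all_models():
        self._test_single_model_train(model)
        self._test_single_model_eval(model)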
def force_eval(self):
    f1_data = ner.utils.compute_f1_dataloader(
        self.model,
        conlldataloader.get_data_loader(
            self.vocab,
            self.tag_vocab,
            self.test_data,
            1,      # batch_size
            False,  # shuffle
            0,      # num_workers
            label_fn=lambda data, index: (data[index][0], data[index][1]),
        ),
        tag_vocab=self.tag_vocab,
    )
    return f1_data
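# A hedged companion method (assumption, not in the original class): summarize
# force_eval's per-tag F1 scores with compute_avg_f1, which other tests in this
# repo apply to the same dictionary structure. The method name is illustrative.
def report_avg_f1(self):
    f1_data = self.force_eval()
    avg_f1 = ner.utils.compute_avg_f1(f1_data)
    print(f'average test-set F1: {avg_f1}')
    return avg_f1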
def test_one_hot_iteration(self):
    '''
    Ensure that iterating over the data loader with one-hot encoding enabled
    completes without errors.
    '''
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    train_dataset = utils.construct_sample_dataloader()
    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_dataset,
        batch_size=4,
        shuffle=False,
        num_workers=0,
        one_hot=True,
    )

    for i, entry in enumerate(data_loader):
        # simply iterating is enough; any collation error would raise here
        pass
def test_elmo_from_raw_sentence(self):
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    train_dataset = utils.construct_sample_dataloader()
    embedding_dim = 4
    hidden_dim = 4
    batch_size = 4

    elmo_model = elmo_bilstm_crf.ELMo_BiLSTM_CRF(vocab, tag_vocab, hidden_dim, batch_size)

    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    for batch_i, (s_ids, x, x_chars, y, weight) in enumerate(data_loader):
        # ensure the ELMo reconstruction from raw sentences runs without errors
        elmo_model(x, None)
        break
def test_analyze(self):
    models = utils.build_all_models()
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    train_dataset = utils.construct_sample_dataloader()
    data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_dataset,
        batch_size=1,
        shuffle=False,
        num_workers=0,
    )

    for model in models:
        ner_utils.analyze_predictions(
            model=model,
            data_loader=data_loader,
            vocab=vocab,
            tag_vocab=tag_vocab,
            log_file=utils.MockFile(),
            csv_file=None,  # no need to output to CSV
        )
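# A hedged variant (assumption, not in the original tests): the same analysis
# call with CSV output enabled, so per-prediction results are written to a file
# instead of being discarded. The test name and file name are illustrative.
def test_analyze_with_csv(self):
    model = utils.build_all_models()[0]
    vocab = utils.build_sample_vocab()
    tag_vocab = utils.build_sample_tag_vocab()
    data_loader = conlldataloader.get_data_loader(
        vocab, tag_vocab, utils.construct_sample_dataloader(),
        batch_size=1, shuffle=False, num_workers=0,
    )
    with open('analysis.csv', 'w') as csv_f:
        ner_utils.analyze_predictions(
            model=model,
            data_loader=data_loader,
            vocab=vocab,
            tag_vocab=tag_vocab,
            log_file=utils.MockFile(),
            csv_file=csv_f,
        )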
def load_session_data(session_dir: str, ner_class: str):
    tag_vocab = ner_vocab.build_output_vocab(
        [f'B-{ner_class}', f'I-{ner_class}', 'O'])

    with open(os.path.join(session_dir, "vocab.pkl"), 'rb') as f:
        vocab = pickle.load(f)
    with open(os.path.join(session_dir, "entry_to_sentences.pkl"), 'rb') as f:
        entry_to_sentences = pickle.load(f)
    with open(os.path.join(session_dir, "database.pkl"), 'rb') as f:
        database = pickle.load(f)

    users = get_users(session_dir)

    gold_data = SCIERCDataset(os.path.join(session_dir, "gold_set.txt"))
    gold_data.parse_file()

    user_data = {}
    with open('output.csv', 'w') as f:
        csv_writer = csv.writer(f, delimiter=',')
        row_header = [
            'user_name',
            'train_f1', 'train_prec', 'train_rec', 'train_acc',  # train metrics
            'valid_f1', 'valid_prec', 'valid_rec', 'valid_acc',  # valid metrics
            'gold_f1', 'gold_prec', 'gold_rec', 'gold_acc',      # gold metrics
        ]
        csv_writer.writerow(row_header)

        for user_name in users:
            model = load_model(
                vocab=vocab,
                tag_vocab=tag_vocab,
                file_name=os.path.join(session_dir, user_name, "model.ckpt"),
                session_dir=session_dir,
            )
            if model is None:
                continue

            train_dataset, valid_dataset = get_data_loaders(
                session_dir=session_dir, user_name=user_name)

            # start the row with the user name so it lines up with row_header
            row_data = [user_name]
            for dataset, dataset_name in zip(
                    [train_dataset, valid_dataset, gold_data],
                    ['train', 'valid', 'gold']):
                data_loader = conlldataloader.get_data_loader(
                    vocab,
                    tag_vocab,
                    dataset,
                    1,      # batch_size
                    False,  # shuffle
                    0,      # num_workers
                )
                f1_data, acc = compute_f1_data(model, data_loader, tag_vocab)
                user_data[user_name] = {
                    'model': model,
                    'f1_data': f1_data,
                    'acc': acc,
                }
                row_data.extend([
                    f1_data[ner_class]['f1'],
                    f1_data[ner_class]['precision'],
                    f1_data[ner_class]['recall'],
                    acc,
                ])
            csv_writer.writerow(row_data)
            ner.utils.print_f1_summary(f1_data)

    return user_data
def train(model, train_data, vocab, tag_vocab):
    trainer = ner.trainer.Trainer(
        model=model,
        learning_rate=0.01,
        weight_decay=1e-4,
        momentum=0,
        optimizer_type='SGD',
        vocab=vocab,
        tags=tag_vocab,
        batch_size=1,
        shuffle=True,
        num_workers=0,
        train_dataset=train_data,
        logger=None,
        device='cpu',
        verbose_print=True,
        verbose_log=False,
        test_dataset=[],
        train_label_fn=lambda data, index: (data[index][0], data[index][1][0], data[index][1][1]),
        test_label_fn=lambda data, index: (data[index][0], data[index][1][0], data[index][1][1]),
        epoch_comparator=None,
    )

    train_data_loader = conlldataloader.get_data_loader(
        vocab,
        tag_vocab,
        train_data,
        1,      # batch_size
        False,  # shuffle
        0,      # num_workers
        label_fn=lambda data, index: (data[index][0], data[index][1][0], data[index][1][1]),
    )

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-9, momentum=0)
    loss_sum = 0.0

    # model.train()
    print('starting epoch')
    with tqdm(train_data_loader) as pbar:
        for i, (s_ids, x, x_chars, y) in enumerate(pbar):
            print(f'starting iteration: {i}')
            model.zero_grad()
            print(f'running forward pass: {i}')
            model_loss = model.compute_mle(x, x_chars, y, s_ids=s_ids)
            print(f'computed loss at iteration: {i}')
            loss = torch.mean(model_loss)
            loss.backward()  # backpropagate
            optimizer.step()  # update parameters

            loss_sum += loss.item()

            # update the TQDM progress bar
            pbar.set_postfix(
                loss_avg=loss_sum / (i + 1),
                epoch="{}/{}".format(0 + 1, 1),
            )
            pbar.refresh()
            print(f'finishing iteration: {i}')
    # model.eval()
    # model.train()

    loss_sum /= len(train_data_loader)
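# A hedged usage sketch (assumption, not in the original script): a __main__
# guard wiring the helpers above together. The session directory and NER class
# are illustrative placeholders; calling train() would additionally require a
# model and a dataset indexable as (sentence_id, (tokens, tags)) to match its
# label_fn.
if __name__ == '__main__':
    user_data = load_session_data(
        session_dir='sessions/example_session',
        ner_class='Method',
    )
    for user_name, data in user_data.items():
        print(user_name, data['acc'])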