def train_and_eval(thread_ids, posts, labels, max_posts=20, max_words=400,
                   frac=[0.8, 0.1, 0.1], seed=0, batch_size=9, embedding='glove',
                   max_epoch=500, validate=False, result_dir=None):
    # preliminary check
    if len(thread_ids) != len(posts) or \
            len(thread_ids) != len(labels) or \
            len(posts) != len(labels):
        raise Exception('Invalid length of data.')

    if len(frac) != 3 or frac[0] + frac[1] + frac[2] != 1:
        raise Exception('Invalid value of frac.')

    if frac[0] <= 0 or frac[1] <= 0 or frac[2] <= 0:
        raise Exception('Invalid value(s) for one or more frac element(s).')

    if embedding not in ['glove']:
        raise Exception('Invalid embedding.')

    train_texts, train_labels, test_texts, test_labels, val_texts, val_labels = \
        utils.filter_and_shuffle_data(thread_ids, posts, labels,
                                      max_words, max_posts, seed, frac)

    print(f'''----------
    Data Split Result:
    Train data = {len(train_texts)}
    Test data  = {len(test_texts)}
    Val data   = {len(val_texts)}
    ----------''')

    # from here on is glove-specific implementation (may need to extract to a function)
    print('Init embedding')
    glove = Glove()
    glove.create_custom_embedding(
        [item for sublist in train_texts for item in sublist])
    glove.add_to_embedding(['.', '!', '?'])

    print('Padding and packing data into data loader')
    # convert every post into a fixed-length sequence of word indices
    for i, thread in enumerate(train_texts):
        for j, post_text in enumerate(thread):
            train_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(test_texts):
        for j, post_text in enumerate(thread):
            test_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(val_texts):
        for j, post_text in enumerate(thread):
            val_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)

    # padding at the post level: pad every thread (and its labels) to max_posts entries
    post_padding = [glove.word2idx[glove.pad_token]] * max_words
    for posts in [train_texts, test_texts, val_texts]:
        for sublist in posts:
            if len(sublist) < max_posts:
                sublist.extend([post_padding] * (max_posts - len(sublist)))

    for labels in [train_labels, test_labels, val_labels]:
        for sublist in labels:
            if len(sublist) < max_posts:
                sublist.extend([0] * (max_posts - len(sublist)))

    train_loader = utils.to_data_loader(batch_size, train_texts, train_labels)
    test_loader = utils.to_data_loader(batch_size, test_texts, test_labels)
    val_loader = utils.to_data_loader(batch_size, val_texts, val_labels)

    print('Creating model')
    embedding = create_emb_layer(
        torch.from_numpy(glove.weights_matrix).float().to(utils.get_device()))
    model = hLSTM(input_size=glove.emb_dim,
                  hidden_size=glove.emb_dim,
                  output_size=glove.emb_dim,
                  batch_size=batch_size,
                  num_layers=1,
                  bidirectional=False,
                  embedding=embedding,
                  drop_prob=0.5,
                  max_output=max_posts,
                  device=utils.get_device())

    # weight the BCE loss by the training-set class balance
    labels = [label for sublist in train_labels for label in sublist]
    intervention_ratio = len([label for label in labels if label == 1]) / len(labels)
    loss_fn = WeightedBCELoss(zero_weight=intervention_ratio,
                              one_weight=1 - intervention_ratio)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    writer = None
    if result_dir is not None:
        writer = SummaryWriter(f'runs/{result_dir}')
        if not os.path.exists(f'models/{result_dir}'):
            os.makedirs(f'models/{result_dir}')

    if not validate:
        val_loader = None

    print('Start training model')
    train_model(model, train_loader, max_epoch, loss_fn, optimizer, val_loader, writer)

    print('Evaluating model')
    f1, precision, recall = eval_model(model, test_loader, False)
    print(f'''
    Test results:
    F1        = {f1}
    Precision = {precision}
    Recall    = {recall}
    ''')

    if result_dir is not None:
        print('Saving final model')
        torch.save(model.state_dict(), f'models/{result_dir}/final_model.pth')

    print('DONE :)))')
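# Hypothetical usage sketch (not part of the original module): illustrates the
# expected shapes of the parallel thread_ids / posts / labels lists passed to
# train_and_eval. The toy_* names and values are illustrative assumptions only;
# a real run needs enough threads for the train/test/val split to be non-empty.
def _example_train_and_eval():
    toy_thread_ids = [101, 102]                                # one id per thread
    toy_posts = [['first post', 'a reply', 'another reply'],   # post texts per thread
                 ['a single post']]
    toy_labels = [[0, 1, 0],                                   # 0/1 intervention label per post
                  [0]]
    train_and_eval(toy_thread_ids, toy_posts, toy_labels,
                   max_posts=5, max_words=20, frac=[0.8, 0.1, 0.1],
                   seed=0, batch_size=1, embedding='glove',
                   max_epoch=2, validate=False, result_dir=None)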
def val_dataloader(self):
    v = self.val_tuple
    return utils.to_data_loader(v[0], v[1], v[2], v[3], self.batch_size)
def train_and_eval_crf(thread_ids, posts, labels, max_posts=20, max_words=400,
                       frac=[0.8, 0.1, 0.1], seed=0, batch_size=9, embedding='glove',
                       max_epoch=500, validate=False, result_dir=None):
    # preliminary check
    if len(thread_ids) != len(posts) or \
            len(thread_ids) != len(labels) or \
            len(posts) != len(labels):
        raise Exception('Invalid length of data.')

    if len(frac) != 3 or frac[0] + frac[1] + frac[2] != 1:
        raise Exception('Invalid value of frac.')

    if frac[0] <= 0 or frac[1] <= 0 or frac[2] <= 0:
        raise Exception('Invalid value(s) for one or more frac element(s).')

    if embedding not in ['glove']:
        raise Exception('Invalid embedding.')

    train_texts, train_labels, test_texts, test_labels, val_texts, val_labels = \
        utils.filter_and_shuffle_data(thread_ids, posts, labels,
                                      max_words, max_posts, seed, frac)

    # from here on is glove-specific implementation (may need to extract to a function)
    print('Init embedding')
    glove = Glove()
    glove.create_custom_embedding(
        [item for sublist in train_texts for item in sublist])
    glove.add_to_embedding(['.', '!', '?'])

    print('Padding and packing data into data loader')
    for i, thread in enumerate(train_texts):
        for j, post_text in enumerate(thread):
            train_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(test_texts):
        for j, post_text in enumerate(thread):
            test_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)
    for i, thread in enumerate(val_texts):
        for j, post_text in enumerate(thread):
            val_texts[i][j] = glove.sentence_to_indices(post_text, seq_len=max_words)

    # padding at the post level
    post_padding = [glove.word2idx[glove.pad_token]] * max_words
    for posts in [train_texts, test_texts, val_texts]:
        for sublist in posts:
            if len(sublist) < max_posts:
                sublist.extend([post_padding] * (max_posts - len(sublist)))

    # CRF masks: 1 for real posts, 0 for padding posts.
    # e.g. a thread with 3 real posts and max_posts=5 gets the mask [1, 1, 1, 0, 0]
    train_masks, test_masks, val_masks = [], [], []

    def get_to_append(ones):
        to_append = [1] * ones
        if len(to_append) < max_posts:
            to_append.extend([0] * (max_posts - ones))
        return to_append

    for labels in train_labels:
        train_masks.append(get_to_append(len(labels)))
    for labels in test_labels:
        test_masks.append(get_to_append(len(labels)))
    for labels in val_labels:
        val_masks.append(get_to_append(len(labels)))

    for labels in [train_labels, test_labels, val_labels]:
        for sublist in labels:
            if len(sublist) < max_posts:
                sublist.extend([0] * (max_posts - len(sublist)))

    train_loader = utils.to_data_loader(batch_size, train_texts, train_labels, train_masks)
    test_loader = utils.to_data_loader(batch_size, test_texts, test_labels, test_masks)
    val_loader = utils.to_data_loader(batch_size, val_texts, val_labels, val_masks)

    print('Creating model')
    embedding = create_emb_layer(
        torch.from_numpy(glove.weights_matrix).float().to(utils.get_device()))
    model = hLSTM_CRF(num_tags=2,
                      input_size=glove.emb_dim,
                      hidden_size=glove.emb_dim,
                      output_size=glove.emb_dim,
                      batch_size=batch_size,
                      num_layers=1,
                      bidirectional=False,
                      embedding=embedding,
                      drop_prob=0.5,
                      max_output=max_posts,
                      device=utils.get_device())

    labels = [label for sublist in train_labels for label in sublist]
    intervention_ratio = len([label for label in labels if label == 1]) / len(labels)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    writer = None
    if result_dir is not None:
        writer = SummaryWriter(f'runs/{result_dir}')
        if not os.path.exists(f'models/{result_dir}'):
            os.makedirs(f'models/{result_dir}')

    if not validate:
        val_loader = None

    print('Start training model')
    model.zero_grad()
    model.train()
    running_loss = 0.0
    for epoch in range(max_epoch):
        if (epoch + 1) % 20 == 0:
            print(f'Training model ({epoch + 1} / {max_epoch})')

        for i, (inputs, labels, masks) in enumerate(train_loader):
            inputs = inputs.to(utils.get_device())
            labels = labels.to(utils.get_device())
            masks = masks.to(utils.get_device())

            # reset gradients before the backward pass of this mini-batch
            optimizer.zero_grad()
            loss = model.loss(inputs, labels, masks)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()

            running_loss += loss.item()
            if i % 1000 == 999:  # every 1000 mini-batches
                if writer is not None:
                    writer.add_scalar('training loss',
                                      running_loss / 1000,
                                      epoch * len(train_loader) + i)
                running_loss = 0.0
                if val_loader is not None:
                    f1, _, _ = eval_model(model, val_loader)
                    if writer is not None:
                        writer.add_scalar('validation f1', f1,
                                          epoch * len(train_loader) + i)

    print('Evaluating model')
    f1, precision, recall = eval_model(model, test_loader, False)
    print(f'''
    Test results:
    F1        = {f1}
    Precision = {precision}
    Recall    = {recall}
    ''')

    if result_dir is not None:
        print('Saving final model')
        torch.save(model.state_dict(), f'models/{result_dir}/final_model.pth')

    print('DONE :)))')
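# Hypothetical usage sketch for the CRF variant (an assumption, not from the
# original source): same parallel-list inputs as train_and_eval. Setting
# result_dir writes TensorBoard logs under runs/<result_dir> and the final
# checkpoint under models/<result_dir>/final_model.pth; validate=True enables
# periodic validation-F1 logging. The directory name below is illustrative.
def _example_train_and_eval_crf(thread_ids, posts, labels):
    train_and_eval_crf(thread_ids, posts, labels,
                       max_posts=20, max_words=400, frac=[0.8, 0.1, 0.1],
                       seed=0, batch_size=9, embedding='glove',
                       max_epoch=500, validate=True,
                       result_dir='hlstm_crf_glove')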
def train_dataloader(self):
    t = self.train_tuple
    return utils.to_data_loader(t[0], t[1], t[2], t[3], self.batch_size)
writer_t = SummaryWriter(f'runs/{TB_FOLDER}_train')

train_texts = list(train.posts)

print('Init GloVe embedding')
glove = Glove()
glove.create_custom_embedding(
    [word for text in train_texts for word in text.split()])
print(len(glove.word2idx))

print('Padding and packing data into data loader')
train_indices, train_labels, train_wl, train_pl = utils.process_data(
    train, glove, MAX_WORDS, MAX_POSTS)
test_indices, test_labels, test_wl, test_pl = utils.process_data(
    test, glove, MAX_WORDS, MAX_POSTS)
val_indices, val_labels, val_wl, val_pl = utils.process_data(
    val, glove, MAX_WORDS, MAX_POSTS)

train_loader = utils.to_data_loader(train_indices, train_labels, train_wl, train_pl, BATCH_SIZE)
test_loader = utils.to_data_loader(test_indices, test_labels, test_wl, test_pl, BATCH_SIZE)
val_loader = utils.to_data_loader(val_indices, val_labels, val_wl, val_pl, BATCH_SIZE)

print('Creating model')
embedding = create_emb_layer(torch.from_numpy(glove.weights_matrix).float().to(device))
criterion = WeightedBCELoss(zero_weight=INTERVENED_RATIO, one_weight=1 - INTERVENED_RATIO)
model = HierarchicalModel(input_dim=glove.emb_dim,
                          hidden_dim1=glove.emb_dim,
                          embedding=embedding,
                          criterion=criterion)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

model.zero_grad()
model.train()