def train(self):
    """Run the full training loop and return the best scores seen.

    Trains for ``self.epochs`` epochs.  Once the epoch index reaches
    ``self.train_roberta_epoch`` the parameter groups in
    ``self.b_parameters`` (the RoBERTa encoder) are frozen so only the
    task head keeps learning.  ``self.evaluate()`` runs after every epoch
    and is responsible for tracking the best checkpoint.

    Returns:
        tuple: ``(best_micro_f1, best_cm, best_matres)`` as maintained by
        ``self.evaluate()``.
    """
    total_t0 = time.time()
    for i in range(0, self.epochs):
        # Freeze the encoder after the configured number of warm epochs.
        if i >= self.train_roberta_epoch:
            for group in self.b_parameters:
                for param in group['params']:
                    param.requires_grad = False
        print("")
        print('======== Epoch {:} / {:} ========'.format(i + 1, self.epochs))
        t0 = time.time()
        self.model.train()
        self.model.zero_grad()
        self.train_loss = 0.0
        for step, batch in tqdm.tqdm(enumerate(self.train_dataloader),
                                     desc="Training process",
                                     total=len(self.train_dataloader)):
            x_sent, y_sent, x_position, y_position, x_sent_pos, y_sent_pos, flag, xy = batch[2:]
            if CUDA:
                x_sent = x_sent.cuda()
                y_sent = y_sent.cuda()
                x_position = x_position.cuda()
                y_position = y_position.cuda()
                xy = xy.cuda()
                flag = flag.cuda()
                x_sent_pos = x_sent_pos.cuda()
                y_sent_pos = y_sent_pos.cuda()
            logits, loss = self.model(x_sent, y_sent, x_position, y_position,
                                      xy, flag, x_sent_pos, y_sent_pos)
            self.train_loss += loss.item()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            # BUG FIX: gradients were only zeroed once per epoch (above),
            # so every batch accumulated gradients from all previous
            # batches.  Reset them after each optimizer step.
            self.model.zero_grad()
        epoch_training_time = format_time(time.time() - t0)
        print(" Total training loss: {0:.2f}".format(self.train_loss))
        self.evaluate()
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))
    print("Best micro F1:{}".format(self.best_micro_f1))
    print("Best confusion matrix: ")
    for cm in self.best_cm:
        print(cm)
    return self.best_micro_f1, self.best_cm, self.best_matres
def parser_first_page_article(html, video_id, url):
    """Extract article posts from the first page of an iQIYI "paopao" feed.

    Args:
        html: raw HTML of the feed page.
        video_id: id of the program this feed belongs to (stored as
            ``program_id``).
        url: source page url, stored with each article.

    For every feed block the function scrapes id / author / timestamp /
    title / content / images / counters and stores them through
    ``self_base_parser.add_article``.  When ``add_article`` reports the
    article as new, its comments are fetched as well; otherwise parsing
    stops, because everything older has already been collected.

    Note: regex patterns are raw strings — the originals relied on invalid
    escape sequences like ``"\\["`` in plain strings (a DeprecationWarning,
    and a SyntaxWarning on newer CPython).  The pattern values are unchanged.
    """
    # One capture per feed block, up to the "comment list end" marker.
    regex = r'(<div class="m-feedSection clearfix.*?)<!-- 评论列表 end-->'
    content_blocks = tools.get_info(html, regex)
    for content_block in content_blocks:
        regex = r'data-paopao-feedId="(.*?)"'
        article_id = tools.get_info(content_block, regex, fetch_one=True)

        regex = r'<img width="50".*?"(http.*?)"'
        head_url = tools.get_info(content_block, regex, fetch_one=True)

        regex = r'<a.*?data-paopao-ele="userUrl".*?title="(.*?)"'
        name = tools.get_info(content_block, regex, fetch_one=True)

        regex = r'<p class="feed_por_time">(.*?)</p>'
        release_time = tools.get_info(content_block, regex, fetch_one=True)
        release_time = tools.format_time(release_time)
        release_time = tools.format_date(release_time)

        regex = r'<h3 class="title_icon_right" title="(.*?)">'
        title = tools.get_info(content_block, regex, fetch_one=True)

        # NOTE(review): "dispalyContent" appears misspelled, presumably
        # matching the site's own markup — do not "fix" the pattern.
        regex = r'<span data-paopao-ele="dispalyContent.*?">(.*?)</span>'
        content = tools.get_info(content_block, regex, fetch_one=True)

        regex = r'<img width="100%" height="100%" data-lazy="(.*?)"'
        image_urls = tools.get_info(content_block, regex, split=',')

        regex = r'<em data-paopao-uvCnt=.*?>(.*?)</em>'
        watch_count = tools.get_info(content_block, regex, fetch_one=True)
        watch_count = tools.get_int(watch_count)

        regex = r'<em data-paopao-agreeCnt="(.*?)">'
        up_count = tools.get_info(content_block, regex, fetch_one=True)

        regex = r'<em data-paopao-commentCnt="(.*?)">'
        comment_count = tools.get_info(content_block, regex, fetch_one=True)

        log.debug('''
            id:       %s
            节目id    %s
            头像地址: %s
            名字:     %s
            发布时间: %s
            标题:     %s
            内容:     %s
            图片地址: %s
            观看量:   %s
            点赞量:   %s
            评论量:   %s
            ''' % (article_id, video_id, head_url, name, release_time, title,
                   content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time,
                                        title, content, image_urls, watch_count,
                                        up_count, comment_count,
                                        program_id=video_id,
                                        gender=random.randint(0, 1),
                                        url=url, info_type=3,
                                        emotion=random.randint(0, 2),
                                        collect=0, source='爱奇艺'):
            # New article: also parse its comments (wall id is page-global).
            regex = r'''\['wallId'\] = "(.*?)"'''
            wall_id = tools.get_info(html, regex, fetch_one=True)
            parser_comment(article_id, wall_id)
        else:
            # Already stored — older articles were collected earlier; stop.
            break
def parser(url_info):
    """Crawl one Sina Weibo (m.weibo.cn) user's timeline and store each post.

    Args:
        url_info: task dict with 'url' (timeline API root) and a 'remark'
            dict carrying user_id / head_url / user_name / gender /
            program_id.

    Walks up to ``page_count`` pages of the timeline API.  Stops early when
    a page returns fewer than two cards, or when ``add_article`` reports an
    already-seen post (everything older was collected before).  Marks the
    task done via ``base_parser.update_url`` in both exit paths.
    """
    url_info['_id'] = str(url_info['_id'])
    log.debug('处理 \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    # FIX: the page counter was named ``i`` and the same name was reused by
    # the inner image-index loop below (shadowing); renamed to ``page`` and
    # the image loop replaced with a comprehension.
    for page in range(0, page_count + 1):
        if not is_continue:
            break

        weibo_content_url = root_url + '&page=%d' % page
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)
        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            # Exhausted the timeline; mark the task done and stop.
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, _ = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # 精确到具体时分秒 需进入到article_url
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')

            regexs = [r'"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            # Raw string: the original '\[' was an invalid escape sequence.
            regexs = [r'"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                image_url = ','.join(
                    'http://wx2.sinaimg.cn/large/' + pic_id + '.jpg'
                    for pic_id in image_url.split(','))

            regexs = [r'"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                原文地址: %s
                博主ID:   %s
                文章id    %s
                发布时间: %s
                来自:     %s
                内容:     %s
                图片地址: %s
                视频地址: %s
                评论数:   %s
                转发数:   %s
                点赞数:   %s
                ''' % (article_url, user_id, article_id, release_time,
                       come_from, content, image_url, video_url,
                       comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id, head_url, user_name,
                                            release_time, None, content,
                                            image_url, None, praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender, url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0, source='新浪微博'):
                if comments_count > 0:
                    parser_comment(article_id)
            else:
                # Already stored — stop crawling this timeline.
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
def emit_train_embeddings(dataloader, train_dataset, model, device, args):
    """Run the model over the training set and stream every layer's hidden
    states, plus labels and indices, into three h5py files on disk.

    Args:
        dataloader: batched loader over ``train_dataset``; batches are dicts
            with 'input_ids', 'attention_mask', 'token_type_ids', 'labels',
            'idx'.
        train_dataset: dataset object (used only for its length).
        model: BERT-style model returning ``out['hidden_states']``.
        device: torch device to move batches to.
        args: namespace with ``checkpoint``, ``embed_batch_size`` and
            ``max_seq_length``; ``n_layers``/``n_features`` are set here as
            a side effect.

    Returns:
        None.  Side effect: writes ``mnli_bert_embeds.h5``,
        ``mnli_labels.h5`` and ``mnli_idx.h5`` under the save location.
    """
    # timing metrics
    t0 = time.time()
    batch_num = args.embed_batch_size
    num_documents = len(train_dataset)

    # set file location and layer / feature information
    # NOTE(review): hard-coded Windows paths — consider making configurable.
    if args.checkpoint == 'bert-base-uncased':
        save_location = 'C:\\w266\\data\\h5py_embeds\\'
        args.n_layers = 13
        args.n_features = 768
    else:
        save_location = 'C:\\w266\\data\\h5py_embeds\\bert_large\\'
        args.n_layers = 25
        args.n_features = 1024

    # create the dirs
    os.makedirs(save_location, exist_ok=True)

    # Pre-create the three datasets; they are re-opened in append mode per
    # batch below.  Chunk size matches the write granularity.
    with h5py.File(save_location + 'mnli_bert_embeds.h5', 'w') as f:
        # [num_documents, layers, tokens, features]
        dset = f.create_dataset(
            'embeds',
            shape=(num_documents, args.n_layers, args.max_seq_length, args.n_features),
            maxshape=(None, args.n_layers, args.max_seq_length, args.n_features),
            chunks=(args.embed_batch_size, args.n_layers, args.max_seq_length, args.n_features),
            dtype=np.float32)

    with h5py.File(save_location + 'mnli_labels.h5', 'w') as l:
        # [num_documents]
        label_dset = l.create_dataset('labels', shape=(num_documents,),
                                      maxshape=(None,),
                                      chunks=(args.embed_batch_size,),
                                      dtype=np.int64)

    with h5py.File(save_location + 'mnli_idx.h5', 'w') as i:
        # [num_documents]
        idx_dset = i.create_dataset('idx', shape=(num_documents,),
                                    maxshape=(None,),
                                    chunks=(args.embed_batch_size,),
                                    dtype=np.int64)

    print('Generating embeddings for all {:,} documents...'.format(
        len(train_dataset)))

    for step, batch in enumerate(dataloader):
        # send necessary items to GPU
        input_ids, attn_mask, token_type_ids, label, idx = (
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device),
            batch['labels'].to(device),
            batch['idx'].to(device))

        if step % 20 == 0 and not batch_num == 0:
            # calc elapsed time
            elapsed = format_time(time.time() - t0)
            # FIX (naming): this quantity is seconds *per document*, not
            # documents per second; the math below was already correct.
            secs_per_doc = (time.time() - t0) / batch_num
            remaining_sec = secs_per_doc * (num_documents - batch_num)
            remaining = format_time(remaining_sec)
            # report progress
            print('Documents {:>7,} of {:>7,}. Elapsed: {:}. Remaining: {:}'.
                  format(batch_num, num_documents, elapsed, remaining))

        # get embeddings with no gradient calcs
        with torch.no_grad():
            out = model(input_ids=input_ids.squeeze(1),
                        attention_mask=attn_mask.squeeze(1),
                        token_type_ids=token_type_ids.squeeze(1),
                        labels=label)

        # ['hidden_states'] is embeddings for all layers
        # stack embeddings [layers, batch_sz, tokens, features]
        embeddings = torch.stack(out['hidden_states']).float()  # float32
        # swap to [batch_sz, layers, tokens, features] so that h5 rows
        # correspond to documents and can be emitted in batches later
        embeddings = embeddings.permute(1, 0, 2, 3).cpu().numpy()

        # add embeds to ds
        with h5py.File(save_location + 'mnli_bert_embeds.h5', 'a') as f:
            dset = f['embeds']
            # offset of this chunk of rows
            start = step * args.embed_batch_size
            dset[start:start + args.embed_batch_size, :, :, :] = embeddings[:, :, :, :]
            # last_index records how far the file has been filled
            dset.attrs['last_index'] = (step + 1) * args.embed_batch_size

        # add labels to ds
        with h5py.File(save_location + 'mnli_labels.h5', 'a') as l:
            label_dset = l['labels']
            start = step * args.embed_batch_size
            label_dset[start:start + args.embed_batch_size] = label.cpu().numpy()
            label_dset.attrs['last_index'] = (step + 1) * args.embed_batch_size

        # add idx to ds
        with h5py.File(save_location + 'mnli_idx.h5', 'a') as i:
            idx_dset = i['idx']
            start = step * args.embed_batch_size
            idx_dset[start:start + args.embed_batch_size] = idx.cpu().numpy()
            idx_dset.attrs['last_index'] = (step + 1) * args.embed_batch_size

        batch_num += args.embed_batch_size
        torch.cuda.empty_cache()

    # check data
    with h5py.File(save_location + 'mnli_bert_embeds.h5', 'r') as f:
        print('last embed batch entry', f['embeds'].attrs['last_index'])
        print('embed shape', f['embeds'].shape)
        print('last entry:', f['embeds'][-1, :, :, :])

    with h5py.File(save_location + 'mnli_labels.h5', 'r') as l:
        print('last embed batch entry', l['labels'].attrs['last_index'])
        print('embed shape', l['labels'].shape)
        print('last entry:', l['labels'][len(train_dataset) - 10:len(train_dataset)])

    return None
def train(self):
    """Train for ``self.epochs`` epochs, evaluating after every epoch.

    Uses the return flag of ``self.evaluate`` to track the best epoch, and
    prints/writes (to ``self.file``) a summary of the best dev scores at
    the end.

    Returns:
        tuple: (MATRES best micro F1, HiEve best F1, I2B2 best micro F1).
    """
    total_t0 = time.time()
    for epoch_i in range(0, self.epochs):
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, self.epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        self.model.train()
        self.total_train_loss = 0.0
        for step, batch in enumerate(self.train_dataloader):
            # Progress update every 50 batches.
            if step % 50 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                # Report progress.
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(self.train_dataloader), elapsed))
            x_sent = batch[3].to(self.cuda)
            y_sent = batch[4].to(self.cuda)
            z_sent = batch[5].to(self.cuda)
            x_position = batch[6].to(self.cuda)
            y_position = batch[7].to(self.cuda)
            z_position = batch[8].to(self.cuda)
            xy = batch[12].to(self.cuda)
            yz = batch[13].to(self.cuda)
            xz = batch[14].to(self.cuda)
            flag = batch[15].to(self.cuda)
            if self.finetune:
                # BUG FIX: gradients were never zeroed, so every step
                # accumulated gradients from all previous batches/epochs.
                self.optimizer.zero_grad()
                alpha_logits, beta_logits, gamma_logits, loss = self.model(
                    x_sent, y_sent, z_sent, x_position, y_position,
                    z_position, xy, yz, xz, flag, loss_out=True)
                self.total_train_loss += loss.item()
                loss.backward()
                self.optimizer.step()
        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        print("")
        print(" Total training loss: {0:.2f}".format(self.total_train_loss))
        print(" Training epoch took: {:}".format(training_time))
        if self.dataset in ["HiEve", "MATRES", "I2B2"]:
            flag = self.evaluate(self.dataset)
        else:
            # Joint training: evaluate all three datasets.
            # NOTE(review): only the last flag (I2B2) decides best_epoch
            # here — confirm this is the intended behavior.
            flag = self.evaluate("HiEve")
            flag = self.evaluate("MATRES")
            flag = self.evaluate("I2B2")
        if flag == 1:
            self.best_epoch = epoch_i
    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))
    if self.dataset in ["MATRES", "Joint"]:
        print(" MATRES best micro F1: {0:.3f}".format(self.MATRES_best_micro_F1))
        print(" MATRES best confusion matrix:\n", self.MATRES_best_cm)
        print(" Dev best:", file=self.file)
        print(" MATRES best micro F1: {0:.3f}".format(self.MATRES_best_micro_F1), file=self.file)
        print(" MATRES best confusion matrix:", file=self.file)
        print(self.MATRES_best_cm, file=self.file)
    if self.dataset in ["I2B2", "Joint"]:
        print(" I2B2 best micro F1: {0:.3f}".format(self.I2B2_best_micro_F1))
        print(" I2B2 best confusion matrix:\n", self.I2B2_best_cm)
        print(" Dev best:", file=self.file)
        print(" I2B2 best micro F1: {0:.3f}".format(self.I2B2_best_micro_F1), file=self.file)
        print(" I2B2 best confusion matrix:", file=self.file)
        print(self.I2B2_best_cm, file=self.file)
    if self.dataset in ["HiEve", "Joint"]:
        print(" HiEve best F1_PC_CP_avg: {0:.3f}".format(self.HiEve_best_F1))
        print(" HiEve best precision_recall_fscore_support:\n", self.HiEve_best_prfs)
        print(" Dev best:", file=self.file)
        print(" HiEve best F1_PC_CP_avg: {0:.3f}".format(self.HiEve_best_F1), file=self.file)
        print(" HiEve best precision_recall_fscore_support:", file=self.file)
        print(self.HiEve_best_prfs, file=self.file)
    return self.MATRES_best_micro_F1, self.HiEve_best_F1, self.I2B2_best_micro_F1
def evaluate(self, eval_data, test=False):
    """Evaluate the model on the validation or test split of ``eval_data``.

    Args:
        eval_data: "MATRES", "I2B2" or "HiEve" — selects the dataloader and
            the metric/report used.
        test: when True, reload the best saved checkpoint for ``eval_data``
            (or ``self.load_model_path``) and run on the test set; otherwise
            evaluate the live model on the validation set.

    Returns:
        int: 1 when a new validation best was achieved (and the model was
        saved); 0 otherwise.
    """
    t0 = time.time()
    if test:
        # Reload the best checkpoint for this dataset before testing.
        if self.load_model_path:
            self.model = torch.load(self.load_model_path + self.model_name + ".pt")
        elif eval_data == "HiEve":
            self.model = torch.load(self.HiEve_best_PATH)
        elif eval_data == "I2B2":
            # BUG FIX: this branch was the typo "IB2B", so an I2B2 test run
            # silently loaded the MATRES checkpoint instead.
            self.model = torch.load(self.I2B2_best_PATH)
        else:  # MATRES
            self.model = torch.load(self.MATRES_best_PATH)
        self.model.to(self.cuda)
        print("")
        print("loaded " + eval_data + " best model:" + self.model_name + ".pt")
        print("(from epoch " + str(self.best_epoch) + " )")
        print("Running Evaluation on " + eval_data + " Test Set...")
        if eval_data == "MATRES":
            dataloader = self.test_dataloader_MATRES
        elif eval_data == "I2B2":
            dataloader = self.test_dataloader_I2B2
        else:
            dataloader = self.test_dataloader_HIEVE
    else:
        # Evaluation
        print("")
        print("Running Evaluation on Validation Set...")
        if eval_data == "MATRES":
            dataloader = self.valid_dataloader_MATRES
        elif eval_data == "I2B2":
            # BUG FIX: this was a plain "if", so the MATRES case fell into
            # the else and evaluated on the HiEve validation loader.
            dataloader = self.valid_dataloader_I2B2
        else:
            dataloader = self.valid_dataloader_HIEVE

    self.model.eval()
    y_pred = []
    y_gold = []

    # Evaluate data for one epoch
    for batch in dataloader:
        x_sent = batch[3].to(self.cuda)
        y_sent = batch[4].to(self.cuda)
        z_sent = batch[5].to(self.cuda)
        x_position = batch[6].to(self.cuda)
        y_position = batch[7].to(self.cuda)
        z_position = batch[8].to(self.cuda)
        xy = batch[12].to(self.cuda)
        yz = batch[13].to(self.cuda)
        xz = batch[14].to(self.cuda)
        flag = batch[15].to(self.cuda)
        with torch.no_grad():
            if self.finetune:
                alpha_logits, beta_logits, gamma_logits = self.model(
                    x_sent, y_sent, z_sent, x_position, y_position,
                    z_position, xy, yz, xz, flag, loss_out=None)
            else:
                # Pre-compute frozen embeddings first (the redundant nested
                # torch.no_grad() from the original was removed — we are
                # already inside a no_grad context).
                x_sent_e = self.my_func(x_sent)
                y_sent_e = self.my_func(y_sent)
                z_sent_e = self.my_func(z_sent)
                alpha_logits, beta_logits, gamma_logits = self.model(
                    x_sent_e, y_sent_e, z_sent_e, x_position, y_position,
                    z_position, xy=xy, yz=yz, xz=xz, flag=flag, loss_out=None)
        # Move logits and labels to CPU
        label_ids = xy.to('cpu').numpy()
        y_predict = torch.max(alpha_logits, 1).indices.cpu().numpy()
        y_pred.extend(y_predict)
        y_gold.extend(label_ids)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    print("Eval took: {:}".format(validation_time))

    if eval_data == "MATRES":
        Acc, P, R, F1, CM = metric(y_gold, y_pred)
        print(" P: {0:.3f}".format(P))
        print(" R: {0:.3f}".format(R))
        print(" F1: {0:.3f}".format(F1))
        if test:
            print("Test result:", file=self.file)
            print(" P: {0:.3f}".format(P), file=self.file)
            print(" R: {0:.3f}".format(R), file=self.file)
            print(" F1: {0:.3f}".format(F1), file=self.file)
            print(" Confusion Matrix", file=self.file)
            print(CM, file=self.file)
        if not test:
            if F1 > self.MATRES_best_micro_F1 or path.exists(self.MATRES_best_PATH) == False:
                self.MATRES_best_micro_F1 = F1
                self.MATRES_best_cm = CM
                ### save model parameters to .pt file ###
                torch.save(self.model, self.MATRES_best_PATH)
                return 1

    if eval_data == "I2B2":
        Acc, P, R, F1, CM = metric(y_gold, y_pred)
        print(" P: {0:.3f}".format(P))
        print(" R: {0:.3f}".format(R))
        print(" F1: {0:.3f}".format(F1))
        if test:
            print("Test result:", file=self.file)
            print(" P: {0:.3f}".format(P), file=self.file)
            print(" R: {0:.3f}".format(R), file=self.file)
            print(" F1: {0:.3f}".format(F1), file=self.file)
            print(" Confusion Matrix", file=self.file)
            print(CM, file=self.file)
        if not test:
            # BUG FIX: this branch compared and stored the *MATRES* best
            # score/confusion matrix and overwrote the MATRES checkpoint —
            # it now tracks the I2B2 best state.
            if F1 > self.I2B2_best_micro_F1 or path.exists(self.I2B2_best_PATH) == False:
                self.I2B2_best_micro_F1 = F1
                self.I2B2_best_cm = CM
                ### save model parameters to .pt file ###
                torch.save(self.model, self.I2B2_best_PATH)
                return 1

    if eval_data == "HiEve":
        # Report the final accuracy for this validation run.
        cr = classification_report(y_gold, y_pred, output_dict=True)
        rst = classification_report(y_gold, y_pred)
        F1_PC = cr['0']['f1-score']
        F1_CP = cr['1']['f1-score']
        F1_coref = cr['2']['f1-score']
        F1_NoRel = cr['3']['f1-score']
        F1_PC_CP_avg = (F1_PC + F1_CP) / 2.0
        print(rst)
        print(" F1_PC_CP_avg: {0:.3f}".format(F1_PC_CP_avg))
        if test:
            print(" rst:", file=self.file)
            print(rst, file=self.file)
            print(" F1_PC_CP_avg: {0:.3f}".format(F1_PC_CP_avg), file=self.file)
        if not test:
            if F1_PC_CP_avg > self.HiEve_best_F1 or path.exists(self.HiEve_best_PATH) == False:
                self.HiEve_best_F1 = F1_PC_CP_avg
                self.HiEve_best_prfs = rst
                torch.save(self.model, self.HiEve_best_PATH)
                return 1
    return 0
def objective(trial):
    """Optuna objective: sample hyper-parameters, train on ``dataset`` and
    return the three F1 scores.

    Relies on module-level globals: ``dataset``, ``batch_size``, ``epochs``,
    ``add_loss``, ``finetune``, ``rst_file_name``, the ``*_best_PATH``
    constants, ``CUDA`` and the ``interaction`` trial counter.

    Args:
        trial: an Optuna ``Trial`` used to sample the search space.

    Returns:
        tuple: (MATRES F1, HiEve F1, I2B2 F1) from the trained model.
    """
    params = {
        "downsample": trial.suggest_float("downsample", 0.01, 0.2),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True),
        'lambda_annoT': trial.suggest_float('lambda_annoT', 0.0, 1.0),
        'lambda_annoH': trial.suggest_float('lambda_annoH', 0.0, 1.0),
        'lambda_transT': trial.suggest_float('lambda_transT', 0.0, 1.0),
        'lambda_transH': trial.suggest_float('lambda_transH', 0.0, 1.0),
        'lambda_cross': trial.suggest_float('lambda_cross', 0.0, 1.0),
        'MLP_size': trial.suggest_categorical("MLP_size", [512, 256, 768]),
        'num_layers': trial.suggest_int("num_layers", 1, 3),
        'lstm_hidden_size': trial.suggest_categorical("lstm_hidden_size", [512, 256]),
        'roberta_hidden_size': trial.suggest_categorical("roberta_hidden_size", [768]),
        'lstm_input_size': 768,
    }
    global interaction
    interaction += 1
    start = timer()

    (train_dataloader, valid_dataloader_MATRES, test_dataloader_MATRES,
     valid_dataloader_HIEVE, test_dataloader_HIEVE,
     valid_dataloader_I2B2, test_dataloader_I2B2,
     num_classes) = joint_constrained_loader(dataset, params['downsample'], batch_size)

    model = roberta_mlp(num_classes, dataset, add_loss, params)
    if CUDA:
        model.cuda()
    model.zero_grad()
    print("# of parameters:", count_parameters(model))
    model_name = rst_file_name.replace(".rst", "")  # to be designated after finding the best parameters

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader) * epochs
    print("Total steps: [number of batches] x [number of epochs] =", total_steps)

    if dataset == "MATRES":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =", total_steps)
        matres_exp = EXP(model, epochs, params['learning_rate'],
                         train_dataloader, valid_dataloader_MATRES,
                         test_dataloader_MATRES, None, None, None, None,
                         finetune, dataset, MATRES_best_PATH, None, None,
                         None, model_name)
        T_F1, H_F1, I_F1 = matres_exp.train()
        matres_exp.evaluate(eval_data="MATRES", test=True)
    elif dataset == "I2B2":
        # BUG FIX: this was a plain "if", so a MATRES run fell through this
        # second chain and raised the ValueError below after training.
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =", total_steps)
        i2b2_exp = EXP(model, epochs, params['learning_rate'],
                       train_dataloader, None, None,
                       valid_dataloader_I2B2, test_dataloader_I2B2,
                       valid_dataloader_HIEVE, test_dataloader_HIEVE,
                       finetune, dataset, None, I2B2_best_PATH, None,
                       None, model_name)
        T_F1, H_F1, I_F1 = i2b2_exp.train()
        i2b2_exp.evaluate(eval_data="I2B2", test=True)
    elif dataset == "HiEve":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =", total_steps)
        hieve_exp = EXP(model, epochs, params['learning_rate'],
                        train_dataloader, None, None, None, None,
                        valid_dataloader_HIEVE, test_dataloader_HIEVE,
                        finetune, dataset, None, None, HiEve_best_PATH,
                        None, model_name)
        T_F1, H_F1, I_F1 = hieve_exp.train()
        hieve_exp.evaluate(eval_data="HiEve", test=True)
    elif dataset == "Joint":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =", total_steps)
        joint_exp = EXP(model, epochs, params['learning_rate'],
                        train_dataloader, valid_dataloader_MATRES,
                        test_dataloader_MATRES, valid_dataloader_I2B2,
                        test_dataloader_I2B2, valid_dataloader_HIEVE,
                        test_dataloader_HIEVE, finetune, dataset,
                        MATRES_best_PATH, I2B2_best_PATH, HiEve_best_PATH,
                        None, model_name)
        T_F1, H_F1, I_F1 = joint_exp.train()
        joint_exp.evaluate(eval_data="HiEve", test=True)
        joint_exp.evaluate(eval_data="MATRES", test=True)
        joint_exp.evaluate(eval_data="I2B2", test=True)
    else:
        raise ValueError("Currently not supporting this dataset! -_-'")

    print(
        f'Iteration {interaction} result: MATRES F1: {T_F1}; HiEve F1: {H_F1}; I2B2 F1: {I_F1}'
    )
    run_time = format_time(timer() - start)  # kept for the csv logging below
    # Write to the csv file ('a' means append)
    return T_F1, H_F1, I_F1