Example 1
    def train(self):
        total_t0 = time.time()
        for i in range(0, self.epochs):
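            # After train_roberta_epoch epochs, freeze the (presumably RoBERTa) parameter
            # groups in self.b_parameters so only the remaining parameters keep updating.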
            if i >= self.train_roberta_epoch:
                for group in self.b_parameters:
                    for param in group['params']:
                        param.requires_grad = False

            print("")
            print('======== Epoch {:} / {:} ========'.format(
                i + 1, self.epochs))

            t0 = time.time()
            self.model.train()
            self.model.zero_grad()
            self.train_loss = 0.0
            for step, batch in tqdm.tqdm(enumerate(self.train_dataloader),
                                         desc="Training process",
                                         total=len(self.train_dataloader)):
                x_sent, y_sent, x_position, y_position, x_sent_pos, y_sent_pos, flag, xy = batch[
                    2:]
                if CUDA:
                    x_sent = x_sent.cuda()
                    y_sent = y_sent.cuda()
                    x_position = x_position.cuda()
                    y_position = y_position.cuda()
                    xy = xy.cuda()
                    flag = flag.cuda()
                    x_sent_pos = x_sent_pos.cuda()
                    y_sent_pos = y_sent_pos.cuda()

                logits, loss = self.model(x_sent, y_sent, x_position,
                                          y_position, xy, flag, x_sent_pos,
                                          y_sent_pos)
                self.train_loss += loss.item()
                loss.backward()
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()  # clear gradients so they do not accumulate across batches

                # if step%50==0 and not step==0:
                #     elapsed = format_time(time.time() - t0)
                #     print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(self.train_dataloader), elapsed))
                #     print("LR: {} - {}".format(self.optimizer.param_groups[0]['lr'], self.optimizer.param_groups[-1]['lr']))

            epoch_training_time = format_time(time.time() - t0)
            print("  Total training loss: {0:.2f}".format(self.train_loss))
            print("  Training epoch took: {:}".format(epoch_training_time))
            self.evaluate()

        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(
            format_time(time.time() - total_t0)))
        print("Best micro F1:{}".format(self.best_micro_f1))
        print("Best confusion matrix: ")
        for cm in self.best_cm:
            print(cm)
        return self.best_micro_f1, self.best_cm, self.best_matres
Example 2
def parser_first_page_article(html, video_id, url):
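    # Walk every feed block in the iQiyi Paopao page HTML, pull the article fields out
    # with regexes, store the article, then parse its comments; stop as soon as
    # add_article returns a falsy value. Assumes the project helpers (tools, log,
    # self_base_parser, parser_comment) and random are imported elsewhere in this file.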
    regex = '(<div class="m-feedSection clearfix.*?)<!-- 评论列表 end-->'
    content_blocks = tools.get_info(html, regex)

    for content_block in content_blocks:
        regex = 'data-paopao-feedId="(.*?)"'
        article_id = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<img width="50".*?"(http.*?)"'
        head_url = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<a.*?data-paopao-ele="userUrl".*?title="(.*?)"'
        name = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<p class="feed_por_time">(.*?)</p>'
        release_time = tools.get_info(content_block, regex, fetch_one = True)
        release_time = tools.format_time(release_time)
        release_time = tools.format_date(release_time)

        regex = '<h3 class="title_icon_right" title="(.*?)">'
        title = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<span data-paopao-ele="dispalyContent.*?">(.*?)</span>'
        content = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<img width="100%" height="100%" data-lazy="(.*?)"'
        image_urls = tools.get_info(content_block, regex, split = ',')

        regex = '<em data-paopao-uvCnt=.*?>(.*?)</em>'
        watch_count = tools.get_info(content_block, regex, fetch_one = True)
        watch_count = tools.get_int(watch_count)

        regex = '<em data-paopao-agreeCnt="(.*?)">'
        up_count = tools.get_info(content_block, regex, fetch_one = True)

        regex = '<em data-paopao-commentCnt="(.*?)">'
        comment_count = tools.get_info(content_block, regex, fetch_one = True)

        log.debug('''
            id:            %s
            program id:    %s
            avatar url:    %s
            name:          %s
            release time:  %s
            title:         %s
            content:       %s
            image urls:    %s
            watch count:   %s
            like count:    %s
            comment count: %s
            '''%(article_id, video_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time, title, content, image_urls, watch_count, up_count, comment_count, program_id = video_id, gender = random.randint(0,1), url = url, info_type = 3, emotion = random.randint(0,2), collect = 0, source = '爱奇艺'):

            # Parse the comments
            regex = "\['wallId'\] = \"(.*?)\""
            wall_id = tools.get_info(html, regex, fetch_one = True)
            parser_comment(article_id, wall_id)
        else:
            break
Example 3
def parser(url_info):
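    # Page through a Sina Weibo (m.weibo.cn) user's timeline, extract each post and hand
    # it to add_article; stop early if add_article returns a falsy value, then mark the
    # root URL as done. Assumes the project helpers (tools, log, base_parser,
    # self_base_parser, Constance, parser_comment) and random are imported elsewhere.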
    url_info['_id'] = str(url_info['_id'])
    log.debug('Processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue: break

        weibo_content_url = root_url + '&page=%d' % i

        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)

        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id

            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For a release time precise to hour/minute/second you would need to fetch article_url
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')
            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = ['"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('\"', '').replace('\\n', '')
            if image_url:
                # build full image urls without reusing the outer page-loop variable
                image_url = ','.join('http://wx2.sinaimg.cn/large/' + pic_id + '.jpg'
                                     for pic_id in image_url.split(','))

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))
            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                article url:    %s
                user id:        %s
                article id:     %s
                release time:   %s
                source:         %s
                content:        %s
                image url:      %s
                video url:      %s
                comment count:  %s
                repost count:   %s
                like count:     %s
                ''' % (article_url, user_id, article_id, release_time,
                       come_from, content, image_url, video_url,
                       comments_count, transpond_count, praise_count))

            if self_base_parser.add_article(article_id,
                                            head_url,
                                            user_name,
                                            release_time,
                                            None,
                                            content,
                                            image_url,
                                            None,
                                            praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender,
                                            url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0,
                                            source='新浪微博'):

                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
Example 4
def emit_train_embeddings(dataloader, train_dataset, model, device, args):
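    # Run the model over the full training set once and stream every layer's hidden
    # states, the labels, and the example indices into three chunked HDF5 files.
    # Assumes time, os, h5py, numpy (np), torch and a format_time helper are imported
    # at module level in the original file.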
    # timing metrics
    t0 = time.time()
    batch_num = args.embed_batch_size
    num_documents = len(train_dataset)

    # set file location and layer / feature information
    if args.checkpoint == 'bert-base-uncased':
        save_location = 'C:\\w266\\data\\h5py_embeds\\'
        args.n_layers = 13
        args.n_features = 768
    else:
        save_location = 'C:\\w266\\data\\h5py_embeds\\bert_large\\'
        args.n_layers = 25
        args.n_features = 1024
    # create the dirs
    os.makedirs(save_location, exist_ok=True)

    with h5py.File(save_location + 'mnli_bert_embeds.h5', 'w') as f:
        # create empty data set; [batch_sz, layers, tokens, features]
        dset = f.create_dataset('embeds',
                                shape=(num_documents, args.n_layers,
                                       args.max_seq_length, args.n_features),
                                maxshape=(None, args.n_layers,
                                          args.max_seq_length,
                                          args.n_features),
                                chunks=(args.embed_batch_size, args.n_layers,
                                        args.max_seq_length, args.n_features),
                                dtype=np.float32)

    with h5py.File(save_location + 'mnli_labels.h5', 'w') as l:
        # create empty data set; [batch_sz]
        label_dset = l.create_dataset('labels',
                                      shape=(num_documents, ),
                                      maxshape=(None, ),
                                      chunks=(args.embed_batch_size, ),
                                      dtype=np.int64)

    with h5py.File(save_location + 'mnli_idx.h5', 'w') as i:
        # create empty data set; [batch_sz]
        idx_dset = i.create_dataset('idx',
                                    shape=(num_documents, ),
                                    maxshape=(None, ),
                                    chunks=(args.embed_batch_size, ),
                                    dtype=np.int64)

    print('Generating embeddings for all {:,} documents...'.format(
        len(train_dataset)))
    for step, batch in enumerate(dataloader):
        # send necessary items to GPU
        input_ids, attn_mask, token_type_ids, label, idx = (
            batch['input_ids'].to(device), batch['attention_mask'].to(device),
            batch['token_type_ids'].to(device), batch['labels'].to(device),
            batch['idx'].to(device))

        if step % 20 == 0 and batch_num != 0:
            # calc elapsed time
            elapsed = format_time(time.time() - t0)
            # calc time remaining (elapsed seconds per document x documents left)
            secs_per_doc = (time.time() - t0) / batch_num
            remaining_sec = secs_per_doc * (num_documents - batch_num)
            remaining = format_time(remaining_sec)
            # report progress
            print('Documents {:>7,} of {:>7,}. Elapsed: {:}. Remaining: {:}'.
                  format(batch_num, num_documents, elapsed, remaining))

        # get embeddings with no gradient calcs
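        # note: out['hidden_states'] is only present when the model was loaded with
        # output_hidden_states=True (assumed here)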
        with torch.no_grad():

            out = model(input_ids=input_ids.squeeze(1),
                        attention_mask=attn_mask.squeeze(1),
                        token_type_ids=token_type_ids.squeeze(1),
                        labels=label)

        # ['hidden_states'] is embeddings for all layers
        # stack embeddings [layers, batch_sz, tokens, features]
        embeddings = torch.stack(out['hidden_states']).float()  # float32
        # swap the order to: [batch_sz, layers, tokens, features]
        # we need to do this to emit batches from h5 dataset later
        embeddings = embeddings.permute(1, 0, 2, 3).cpu().numpy()

        # add embeds to ds
        with h5py.File(save_location + 'mnli_bert_embeds.h5', 'a') as f:
            # initialize dset
            dset = f['embeds']
            # counter to add chunk of rows
            start = step * args.embed_batch_size
            # add to the dset              [batch_sz, layer, tokens, features]
            dset[start:start +
                 args.embed_batch_size, :, :, :] = embeddings[:, :, :, :]
            # create attribute with last_index value
            dset.attrs['last_index'] = (step + 1) * args.embed_batch_size

        # add labels to ds
        with h5py.File(save_location + 'mnli_labels.h5', 'a') as l:
            # initialize dset
            label_dset = l['labels']
            # counter to add chunk of rows
            start = step * args.embed_batch_size
            # add to the dset              [batch_sz, ]
            label_dset[start:start +
                       args.embed_batch_size] = label.cpu().numpy()
            # create attribute with last_index value
            label_dset.attrs['last_index'] = (step + 1) * args.embed_batch_size

        # add idx to ds
        with h5py.File(save_location + 'mnli_idx.h5', 'a') as i:
            # initialize dset
            idx_dset = i['idx']
            # counter to add chunk of rows
            start = step * args.embed_batch_size
            # [batch_sz, ]
            idx_dset[start:start + args.embed_batch_size] = idx.cpu().numpy()
            # create attribute with last_index value
            idx_dset.attrs['last_index'] = (step + 1) * args.embed_batch_size

        batch_num += args.embed_batch_size
        torch.cuda.empty_cache()

    # check data
    with h5py.File(save_location + 'mnli_bert_embeds.h5', 'r') as f:
        print('last embed batch entry', f['embeds'].attrs['last_index'])
        print('embed shape', f['embeds'].shape)
        print('last entry:', f['embeds'][-1, :, :, :])

    with h5py.File(save_location + 'mnli_labels.h5', 'r') as l:
        print('last embed batch entry', l['labels'].attrs['last_index'])
        print('embed shape', l['labels'].shape)
        print('last entry:',
              l['labels'][len(train_dataset) - 10:len(train_dataset)])

    return None
Example 5
    def train(self):
        total_t0 = time.time()
        for epoch_i in range(0, self.epochs):
            # ========================================
            #               Training
            # ========================================

            # Perform one full pass over the training set.
            print("")
            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, self.epochs))
            print('Training...')

            # Measure how long the training epoch takes.
            t0 = time.time()
            self.model.train()
            self.total_train_loss = 0.0
            for step, batch in enumerate(self.train_dataloader):
                # Progress update every 50 batches.
                if step % 50 == 0 and step != 0:
                    # Calculate elapsed time in minutes.
                    elapsed = format_time(time.time() - t0)
                    # Report progress.
                    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(self.train_dataloader), elapsed))
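                # each batch carries three sentences (x, y, z), their position indices,
                # the pairwise labels xy / yz / xz, and a dataset flag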
                x_sent = batch[3].to(self.cuda)
                #print(x_sent)
                y_sent = batch[4].to(self.cuda)
                z_sent = batch[5].to(self.cuda)
                x_position = batch[6].to(self.cuda)
                y_position = batch[7].to(self.cuda)
                z_position = batch[8].to(self.cuda)
                xy = batch[12].to(self.cuda)
                yz = batch[13].to(self.cuda)
                xz = batch[14].to(self.cuda)
                flag = batch[15].to(self.cuda)
                # loss is only produced in finetuning mode; this trainer assumes self.finetune is True
                if self.finetune:
                    alpha_logits, beta_logits, gamma_logits, loss = self.model(x_sent, y_sent, z_sent, x_position, y_position, z_position, xy, yz, xz, flag, loss_out=True)
                self.total_train_loss += loss.item()
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()  # clear gradients so they do not accumulate across batches
            
            # Measure how long this epoch took.
            training_time = format_time(time.time() - t0)
            print("")
            print("  Total training loss: {0:.2f}".format(self.total_train_loss))
            print("  Training epoch took: {:}".format(training_time))
            if self.dataset in ["HiEve", "MATRES", "I2B2"]:
                flag = self.evaluate(self.dataset)
            else:
                flag = self.evaluate("HiEve")
                flag = self.evaluate("MATRES")
                flag = self.evaluate("I2B2")
            if flag == 1:
                self.best_epoch = epoch_i

        print("")
        print("Training complete!")
        print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
        if self.dataset in ["MATRES", "Joint"]:
            print("  MATRES best micro F1: {0:.3f}".format(self.MATRES_best_micro_F1))
            print("  MATRES best confusion matrix:\n", self.MATRES_best_cm)
            print("  Dev best:", file = self.file)
            print("  MATRES best micro F1: {0:.3f}".format(self.MATRES_best_micro_F1), file = self.file)
            print("  MATRES best confusion matrix:", file = self.file)
            print(self.MATRES_best_cm, file = self.file)
        if self.dataset in ["I2B2", "Joint"]:
            print("  I2B2 best micro F1: {0:.3f}".format(self.I2B2_best_micro_F1))
            print("  I2B2 best confusion matrix:\n", self.I2B2_best_cm)
            print("  Dev best:", file = self.file)
            print("  I2B2 best micro F1: {0:.3f}".format(self.I2B2_best_micro_F1), file = self.file)
            print("  I2B2 best confusion matrix:", file = self.file)
            print(self.I2B2_best_cm, file = self.file)
        if self.dataset in ["HiEve", "Joint"]:
            print("  HiEve best F1_PC_CP_avg: {0:.3f}".format(self.HiEve_best_F1))
            print("  HiEve best precision_recall_fscore_support:\n", self.HiEve_best_prfs)
            print("  Dev best:", file = self.file)
            print("  HiEve best F1_PC_CP_avg: {0:.3f}".format(self.HiEve_best_F1), file = self.file)
            print("  HiEve best precision_recall_fscore_support:", file = self.file)
            print(self.HiEve_best_prfs, file = self.file)
        return self.MATRES_best_micro_F1, self.HiEve_best_F1, self.I2B2_best_micro_F1
Example 6
 def evaluate(self, eval_data, test = False):
     # ========================================
     #             Validation / Test
     # ========================================
     # After the completion of each training epoch, measure our performance on
     # our validation set.
     # Also applicable to test set.
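     # In test mode the best checkpoint is reloaded with torch.load (the whole model was
     # pickled with torch.save); in validation mode the current in-memory model is used.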
     t0 = time.time()
         
     if test:
         if self.load_model_path:
             self.model = torch.load(self.load_model_path + self.model_name + ".pt")
         elif eval_data == "HiEve":
             self.model = torch.load(self.HiEve_best_PATH)
         elif eval_data == "IB2B":
             self.model = torch.load(self.I2B2_best_PATH)
         else: # MATRES
             self.model = torch.load(self.MATRES_best_PATH)
         self.model.to(self.cuda)
         print("")
         print("loaded " + eval_data + " best model:" + self.model_name + ".pt")
         print("(from epoch " + str(self.best_epoch) + " )")
         print("Running Evaluation on " + eval_data + " Test Set...")
         if eval_data == "MATRES":
             dataloader = self.test_dataloader_MATRES
         elif eval_data == "I2B2":
             dataloader = self.test_dataloader_I2B2
         else:
             dataloader = self.test_dataloader_HIEVE
     else:
         # Evaluation
         print("")
         print("Running Evaluation on Validation Set...")
         if eval_data == "MATRES":
             dataloader = self.valid_dataloader_MATRES
         if eval_data == "I2B2":
             dataloader = self.valid_dataloader_I2B2
         else:
             dataloader = self.valid_dataloader_HIEVE
         
     self.model.eval()
     
     y_pred = []
     y_gold = []
     # Evaluate data for one epoch
     for batch in dataloader:
         x_sent = batch[3].to(self.cuda)
         y_sent = batch[4].to(self.cuda)
         z_sent = batch[5].to(self.cuda)
         x_position = batch[6].to(self.cuda)
         y_position = batch[7].to(self.cuda)
         z_position = batch[8].to(self.cuda)
         xy = batch[12].to(self.cuda)
         yz = batch[13].to(self.cuda)
         xz = batch[14].to(self.cuda)
         flag = batch[15].to(self.cuda)
         with torch.no_grad():
             if self.finetune:
                 alpha_logits, beta_logits, gamma_logits = self.model(x_sent, y_sent, z_sent, x_position, y_position, z_position, xy, yz, xz, flag, loss_out = None)
             else:
                 with torch.no_grad():
                     x_sent_e = self.my_func(x_sent)
                     y_sent_e = self.my_func(y_sent)
                     z_sent_e = self.my_func(z_sent)
                 alpha_logits, beta_logits, gamma_logits = self.model(x_sent_e, y_sent_e, z_sent_e, x_position, y_position, z_position, xy = xy, yz = yz, xz = xz, flag = flag, loss_out = None)
         # Move logits and labels to CPU
         label_ids = xy.to('cpu').numpy()
         y_predict = torch.max(alpha_logits, 1).indices.cpu().numpy()
         y_pred.extend(y_predict)
         y_gold.extend(label_ids)
         
     # Measure how long the validation run took.
     validation_time = format_time(time.time() - t0)
     print("Eval took: {:}".format(validation_time))
     
     if eval_data == "MATRES":
         Acc, P, R, F1, CM = metric(y_gold, y_pred)
         print("  P: {0:.3f}".format(P))
         print("  R: {0:.3f}".format(R))
         print("  F1: {0:.3f}".format(F1))
         if test:
             print("Test result:", file = self.file)
             print("  P: {0:.3f}".format(P), file = self.file)
             print("  R: {0:.3f}".format(R), file = self.file)
             print("  F1: {0:.3f}".format(F1), file = self.file)
             print("  Confusion Matrix", file = self.file)
             print(CM, file = self.file)
         if not test:
             if F1 > self.MATRES_best_micro_F1 or not path.exists(self.MATRES_best_PATH):
                 self.MATRES_best_micro_F1 = F1
                 self.MATRES_best_cm = CM
                 ### save model parameters to .pt file ###
                 torch.save(self.model, self.MATRES_best_PATH)
                 return 1
     
     if eval_data == "I2B2":
         Acc, P, R, F1, CM = metric(y_gold, y_pred)
         print("  P: {0:.3f}".format(P))
         print("  R: {0:.3f}".format(R))
         print("  F1: {0:.3f}".format(F1))
         if test:
             print("Test result:", file = self.file)
             print("  P: {0:.3f}".format(P), file = self.file)
             print("  R: {0:.3f}".format(R), file = self.file)
             print("  F1: {0:.3f}".format(F1), file = self.file)
             print("  Confusion Matrix", file = self.file)
             print(CM, file = self.file)
         if not test:
             if F1 > self.I2B2_best_micro_F1 or not path.exists(self.I2B2_best_PATH):
                 self.I2B2_best_micro_F1 = F1
                 self.I2B2_best_cm = CM
                 ### save model parameters to .pt file ###
                 torch.save(self.model, self.I2B2_best_PATH)
                 return 1
     
     if eval_data == "HiEve":
         # Report the final accuracy for this validation run.
         cr = classification_report(y_gold, y_pred, output_dict = True)
         rst = classification_report(y_gold, y_pred)
         F1_PC = cr['0']['f1-score']
         F1_CP = cr['1']['f1-score']
         F1_coref = cr['2']['f1-score']
         F1_NoRel = cr['3']['f1-score']
         F1_PC_CP_avg = (F1_PC + F1_CP) / 2.0
         print(rst)
         print("  F1_PC_CP_avg: {0:.3f}".format(F1_PC_CP_avg))
         if test:
             print("  rst:", file = self.file)
             print(rst, file = self.file)
             print("  F1_PC_CP_avg: {0:.3f}".format(F1_PC_CP_avg), file = self.file)
         if not test:
             if F1_PC_CP_avg > self.HiEve_best_F1 or not path.exists(self.HiEve_best_PATH):
                 self.HiEve_best_F1 = F1_PC_CP_avg
                 self.HiEve_best_prfs = rst
                 torch.save(self.model, self.HiEve_best_PATH)
                 return 1
     return 0 
def objective(trial):
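    # Optuna objective: sample hyperparameters, build the RoBERTa + MLP model, train it
    # on the selected dataset(s), evaluate on the corresponding test set(s), and return
    # the best dev F1 scores. Assumes the surrounding script defines dataset, batch_size,
    # epochs, add_loss, finetune, CUDA, rst_file_name, the *_best_PATH variables, and the
    # EXP / roberta_mlp / joint_constrained_loader helpers.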
    params = {
        "downsample":
        trial.suggest_float("downsample", 0.01, 0.2),
        "learning_rate":
        trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True),
        'lambda_annoT':
        trial.suggest_float('lambda_annoT', 0.0, 1.0),
        'lambda_annoH':
        trial.suggest_float('lambda_annoH', 0.0, 1.0),
        'lambda_transT':
        trial.suggest_float('lambda_transT', 0.0, 1.0),
        'lambda_transH':
        trial.suggest_float('lambda_transH', 0.0, 1.0),
        'lambda_cross':
        trial.suggest_float('lambda_cross', 0.0, 1.0),
        'MLP_size':
        trial.suggest_categorical("MLP_size", [512, 256, 768]),
        'num_layers':
        trial.suggest_int("num_layers", 1, 3),
        'lstm_hidden_size':
        trial.suggest_categorical("lstm_hidden_size", [512, 256]),
        'roberta_hidden_size':
        trial.suggest_categorical("roberta_hidden_size", [768]),
        'lstm_input_size':
        768,
    }

    global interaction
    interaction += 1
    start = timer()
    train_dataloader, valid_dataloader_MATRES, test_dataloader_MATRES, valid_dataloader_HIEVE, test_dataloader_HIEVE, valid_dataloader_I2B2, test_dataloader_I2B2, num_classes = joint_constrained_loader(
        dataset, params['downsample'], batch_size)

    model = roberta_mlp(num_classes, dataset, add_loss, params)
    if CUDA:
        model.cuda()
    model.zero_grad()
    print("# of parameters:", count_parameters(model))
    model_name = rst_file_name.replace(
        ".rst", "")  # to be designated after finding the best parameters
    total_steps = len(train_dataloader) * epochs
    print("Total steps: [number of batches] x [number of epochs] =",
          total_steps)

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    if dataset == "MATRES":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =",
              total_steps)
        matres_exp = EXP(model, epochs, params['learning_rate'],
                         train_dataloader, valid_dataloader_MATRES,
                         test_dataloader_MATRES, None, None, None, None,
                         finetune, dataset, MATRES_best_PATH, None, None, None,
                         model_name)
        T_F1, H_F1, I_F1 = matres_exp.train()
        matres_exp.evaluate(eval_data="MATRES", test=True)
    if dataset == "I2B2":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =",
              total_steps)
        i2b2_exp = EXP(model, epochs, params['learning_rate'],
                       train_dataloader, None, None, valid_dataloader_I2B2,
                       test_dataloader_I2B2, valid_dataloader_HIEVE,
                       test_dataloader_HIEVE, finetune, dataset, None,
                       I2B2_best_PATH, None, None, model_name)
        T_F1, H_F1, I_F1 = i2b2_exp.train()
        i2b2_exp.evaluate(eval_data="I2B2", test=True)
    elif dataset == "HiEve":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =",
              total_steps)
        hieve_exp = EXP(model, epochs, params['learning_rate'],
                        train_dataloader, None, None, None, None,
                        valid_dataloader_HIEVE, test_dataloader_HIEVE,
                        finetune, dataset, None, None, HiEve_best_PATH, None,
                        model_name)
        T_F1, H_F1, I_F1 = hieve_exp.train()
        hieve_exp.evaluate(eval_data="HiEve", test=True)
    elif dataset == "Joint":
        total_steps = len(train_dataloader) * epochs
        print("Total steps: [number of batches] x [number of epochs] =",
              total_steps)
        joint_exp = EXP(model, epochs, params['learning_rate'],
                        train_dataloader, valid_dataloader_MATRES,
                        test_dataloader_MATRES, valid_dataloader_I2B2,
                        test_dataloader_I2B2, valid_dataloader_HIEVE,
                        test_dataloader_HIEVE, finetune, dataset,
                        MATRES_best_PATH, I2B2_best_PATH, HiEve_best_PATH,
                        None, model_name)
        T_F1, H_F1, I_F1 = joint_exp.train()
        joint_exp.evaluate(eval_data="HiEve", test=True)
        joint_exp.evaluate(eval_data="MATRES", test=True)
        joint_exp.evaluate(eval_data="I2B2", test=True)
    else:
        raise ValueError("Currently not supporting this dataset! -_-'")

    print(
        f'Iteration {interaction} result: MATRES F1: {T_F1}; HiEve F1: {H_F1}; I2B2 F1: {I_F1}'
    )

    run_time = format_time(timer() - start)
    print(f'Iteration {interaction} run time: {run_time}')

    return T_F1, H_F1, I_F1