Example #1
async def default(self, message, args):
    group = get_group(args, message.author.id)
    ics = await get_group_ics(message, group)
    if not ics:
        return

    # For testing only (e.g. on weekends, when today() is empty):
    # now = datetime.datetime.now().replace(tzinfo=pytz.UTC)
    # events = ics.timeline.start_after(now)
    events = list(ics.timeline.today())
    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    exists = any(events)

    embed = discord.Embed(title="It seems you are free today!"
                          if not exists else f"Today's {group} lessons are",
                          colour=BOT_COLOR,
                          timestamp=now)

    if exists:
        embed.set_thumbnail(url=ICON)
        embed.set_footer(text="Momento", icon_url=ICON)

    for event in events:
        start = datetime.datetime.fromisoformat(str(event.begin))
        end = datetime.datetime.fromisoformat(str(event.end))
        lesson_link = BASE_LESSON + get_group_id(event)

        desc = ""
        delim = "__"
        if now >= start:
            if now < end:
                fmt = format_time(get_time_diff(now - start))
                desc += f":teacher: [Started since]({lesson_link}) {fmt}\n"
            else:
                # if the lesson is over
                delim = "~~"
        else:
            fmt = format_time(get_time_diff(start - now))
            desc += f"⏲ [Starts]({lesson_link}) in {fmt}\n"

        desc += f"From **{start:%H:%M}** to **{end:%H:%M}** "
        desc += f"with **{get_teacher(event)}**\n"

        embed.add_field(name=f"{delim}{event.name}{delim}",
                        value=desc,
                        inline=False)

    msg = await message.channel.send(embed=embed)
    await msg.add_reaction(emoji='❌')
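Examples #1 and #6 feed a datetime.timedelta into get_time_diff and then format_time. Neither helper is shown in the source; a minimal sketch of what they might look like, assuming get_time_diff splits a timedelta into (hours, minutes) and format_time renders that pair (both behaviors are assumptions):

import datetime

def get_time_diff(delta):
    # Hypothetical: split a timedelta into whole hours and minutes.
    total_minutes = int(delta.total_seconds() // 60)
    return divmod(total_minutes, 60)

def format_time(hours_minutes):
    # Hypothetical: render (hours, minutes) as a short human-readable string.
    hours, minutes = hours_minutes
    return f"{hours}h{minutes:02d}" if hours else f"{minutes} min"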
Example #2
    def predict(self, model, test_dataset, ids, coef=None):
        model.load_state_dict(torch.load(self.config.model_save_path))
        model.eval()
        start_time = time.time()
        if coef is None:
            coef = [1.0, 1.0, 1.0]
        torch_coef = torch.tensor(coef, device=self.config.device).view(-1, 3)
        predicts_all = []
        # Run inference without tracking gradients (as Example #13 does).
        with torch.no_grad():
            for inputs, _ in tqdm(
                    DataLoader(dataset=TensorDataset(test_dataset.dataset,
                                                     test_dataset.labels),
                               batch_size=self.config.batch_size,
                               shuffle=False)):
                outputs = model(inputs)
                outputs = F.softmax(outputs, dim=1)
                outputs = outputs * torch_coef
                # Shift argmax indices {0, 1, 2} back to labels {-1, 0, 1}.
                predicts = list(
                    torch.max(outputs, dim=1)[1].cpu().numpy() - 1)
                predicts_all.extend(predicts)

        time_dif = get_time_diff(start_time)
        print("Time usage:", time_dif)
        result_pd = pd.DataFrame({'id': ids, 'y': predicts_all})
        result_pd.to_csv('predict_ans.csv', index=False)
        print("finish !")
Example #3
    def train_model(self, data_loader, model):
        # Train the model.
        start_time = time.time()  # record the start time
        optimizer = optim.Adam(model.parameters(), lr=self.config.learn_rate, betas=(0.9, 0.999))
        model.train()
        criterion = nn.CrossEntropyLoss()
        total_batch = 0
        for data_batch in data_loader:
            total_batch += 1
            sentences1 = data_batch[0]
            sentences2 = data_batch[1]
            labels = data_batch[2]
            if torch.cuda.is_available():  # move tensors to the configured device
                sentences1 = sentences1.to(self.config.device)
                sentences2 = sentences2.to(self.config.device)
                labels = labels.to(self.config.device)
            optimizer.zero_grad()
            outputs = model(sentences1, sentences2)
            # compute the loss
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if total_batch % 100 == 0:
                true_label = labels.data.cpu().numpy()
                predict = torch.max(outputs, dim=1)[1].cpu().numpy()  # predicted class indices
                train_acc = metrics.accuracy_score(true_label, predict)
                time_diff = get_time_diff(start_time)
                msg = 'Iter:{0:>6} Train loss: {1:>5.3} Train acc:{2:>6.2%} Time:{3}'
                print(msg.format(total_batch, loss.item(), train_acc, time_diff))
Example #4
def print_end_message(start_time):
    """Prints message to indicate end of the run, plus it shows time passed
    since the beginning"""
    
    end_time = utils.get_time()
    time_lapsed = utils.get_time_diff(start_time, end_time)
    print(f"Bachelor thesis code ended at {end_time}."
          f" Total time lapsed: {time_lapsed}")
Example #5
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        predict = self.model.predict_classes(self.x_val, batch_size=self.batch_size)
        # Average precision
        weighted_prec = precision_score(self.y_val, predict, average='weighted') * 100
        self.avg_precision_weighted.append(weighted_prec)
        micro_prec = precision_score(self.y_val, predict, average='micro') * 100
        self.avg_precision_micro.append(micro_prec)
        macro_prec = precision_score(self.y_val, predict, average='macro') * 100
        self.avg_precision_macro.append(macro_prec)

        weighted_recall = recall_score(self.y_val, predict, average='weighted') * 100
        self.avg_recall_weighted.append(weighted_recall)
        micro_recall = recall_score(self.y_val, predict, average='micro') * 100
        self.avg_recall_micro.append(micro_recall)
        macro_recall = recall_score(self.y_val, predict, average='macro') * 100
        self.avg_recall_macro.append(macro_recall)

        weighted_fscore = f1_score(self.y_val, predict, average='weighted') * 100
        self.avg_f1score_weighted.append(weighted_fscore)
        micro_fscore = f1_score(self.y_val, predict, average='micro') * 100
        self.avg_f1score_micro.append(micro_fscore)
        macro_fscore = f1_score(self.y_val, predict, average='macro') * 100
        self.avg_f1score_macro.append(macro_fscore)

        positive_prec = precision_score(self.y_val, predict, average='binary') * 100
        neg_prec = precision_score(self.y_val, predict, average='binary', pos_label=0) * 100
        self.pos_precision.append(positive_prec)
        self.neg_precision.append(neg_prec)

        pos_recall = recall_score(self.y_val, predict, average='binary') * 100
        neg_recall = recall_score(self.y_val, predict, average='binary', pos_label=0) * 100
        self.pos_recall.append(pos_recall)
        self.neg_recall.append(neg_recall)

        pos_fscore = f1_score(self.y_val, predict, average='binary') * 100
        neg_fscore = f1_score(self.y_val, predict, average='binary', pos_label=0) * 100
        self.pos_f1_score.append(pos_fscore)
        self.neg_f1_score.append(neg_fscore)
        self.loss.append(logs.get('loss'))
        acc = accuracy_score(self.y_val, predict) * 100
        self.accuracy.append(acc)

        done = time.time()

        elapsed_formatted = get_time_diff(self.start_time, done)
        elapsed_time = int((done - self.start_time) / 60)  # in minutes
        print(elapsed_formatted)
        print(elapsed_time)
        self.elapsed_time.append(elapsed_time)
        self.elapsed_time_formatted.append(elapsed_formatted)
        with open(self.out_csv, "a+") as result_csv:
            csv_writer = csv.writer(result_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            csv_writer.writerow(
                [epoch, acc, logs.get('loss'), weighted_prec, weighted_recall, weighted_fscore,
                 positive_prec, neg_prec, pos_recall, neg_recall,
                 pos_fscore, neg_fscore, elapsed_time])
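A note on the scorer calls above: in scikit-learn, the third positional parameter of precision_score, recall_score and f1_score is labels, not average, so the averaging mode has to be passed by keyword as done here. A quick check:

from sklearn.metrics import precision_score

y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]
# The keyword form selects the averaging strategy explicitly.
print(precision_score(y_true, y_pred, average='weighted'))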
Example #6
async def next(self, message, args):
    group = get_group(args, message.author.id)
    ics = await get_group_ics(message, group)
    if not ics:
        return

    now = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC)
    # The builtin next() is shadowed by this command's name,
    # so call __next__() directly.
    event = ics.timeline.start_after(now).__next__()

    start = datetime.datetime.fromisoformat(str(event.begin))
    end = datetime.datetime.fromisoformat(str(event.end))
    lesson_link = BASE_LESSON + get_group_id(event)

    desc = ""
    if now >= start:
        if now < end:
            fmt = format_time(get_time_diff(now - start))
            desc += f":teacher: [Started since]({lesson_link}) {fmt} "
    else:
        fmt = format_time(get_time_diff(start - now))
        desc += f"⏲ [Starts]({lesson_link}) in {fmt} "

    fmt = "%A %d %B"
    desc += f"on **{start.strftime(fmt)}**\n"
    desc += f"From **{start:%H:%M}** to **{end:%H:%M}** "
    desc += f"with **{get_teacher(event)}**\n"

    embed = discord.Embed(title=f"Next lesson for {group}",
                          colour=BOT_COLOR,
                          timestamp=now)

    embed.add_field(name=f"__{str(event.name)}__", value=desc, inline=False)

    try:
        url = PROFESSORS[get_teacher(event).lower()]
    except KeyError:
        url = ICON
    embed.set_thumbnail(url=url)
    embed.set_footer(text="Momento", icon_url=ICON)

    await message.channel.send(embed=embed)
Example #7
 def evaluate_model(self, model, dev_loader, fold_num):
     start_time = time.time()
     dev_acc, dev_loss, dev_report, dev_confusion, f1_score = self.evaluate(
         model, dev_loader, fold_num, flag=True)
     print('Without metric optimization')
     msg = "Test Loss:{0:>5.2}, Test Acc:{1:>6.2%}"
     print(msg.format(dev_loss, dev_acc))
     print("Precision, Recall and F1-Score...")
     print(dev_report)
     print("Confusion Matrix...")
     print(dev_confusion)
     time_diff = get_time_diff(start_time)
     print("Time usage:", time_diff)
Example #8
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        predict = self.model.predict_classes(self.x_val, batch_size=self.batch_size)
        # Average precision
        weighted_prec = precision_score(self.y_val, predict, average='weighted') * 100
        self.avg_precision_weighted.append(weighted_prec)
        micro_prec = precision_score(self.y_val, predict, average='micro') * 100
        self.avg_precision_micro.append(micro_prec)
        macro_prec = precision_score(self.y_val, predict, average='macro') * 100
        self.avg_precision_macro.append(macro_prec)

        weighted_recall = recall_score(self.y_val, predict, average='weighted') * 100
        self.avg_recall_weighted.append(weighted_recall)
        micro_recall = recall_score(self.y_val, predict, average='micro') * 100
        self.avg_recall_micro.append(micro_recall)
        macro_recall = recall_score(self.y_val, predict, average='macro') * 100
        self.avg_recall_macro.append(macro_recall)

        weighted_fscore = f1_score(self.y_val, predict, average='weighted') * 100
        self.avg_f1score_weighted.append(weighted_fscore)
        micro_fscore = f1_score(self.y_val, predict, average='micro') * 100
        self.avg_f1score_micro.append(micro_fscore)
        macro_fscore = f1_score(self.y_val, predict, average='macro') * 100
        self.avg_f1score_macro.append(macro_fscore)

        positive_prec = precision_score(self.y_val, predict, average='binary') * 100
        neg_prec = precision_score(self.y_val, predict, average='binary', pos_label=0) * 100
        self.pos_precision.append(positive_prec)
        self.neg_precision.append(neg_prec)

        pos_recall = recall_score(self.y_val, predict, average='binary') * 100
        neg_recall = recall_score(self.y_val, predict, average='binary', pos_label=0) * 100
        self.pos_recall.append(pos_recall)
        self.neg_recall.append(neg_recall)

        pos_fscore = f1_score(self.y_val, predict, average='binary') * 100
        neg_fscore = f1_score(self.y_val, predict, average='binary', pos_label=0) * 100
        self.pos_f1_score.append(pos_fscore)
        self.neg_f1_score.append(neg_fscore)
        self.loss.append(logs.get('loss'))
        acc = accuracy_score(self.y_val, predict) * 100
        self.accuracy.append(acc)

        done = time.time()

        elapsed_formatted = get_time_diff(self.start_time, done)
        elapsed_time = done - self.start_time  # in seconds
        self.elapsed_time.append(elapsed_time)
        self.elapsed_time_formatted.append(elapsed_formatted)
Example #9
    def process(self, vals):
        self.event_count += 1

        event = vals[8]      # event name field
        curr_time = vals[1]  # timestamp field

        if event in ["PRACTICE_SEARCH_TASK_COMMENCED", "SEARCH_TASK_COMMENCED"]:
            self.session_start_time = curr_time

        if self.session_start_time is not None and event in [
            "PRACTICE_SEARCH_TASK_COMPLETED",
            "SESSION_COMPLETED",
            "EXPERIMENT_TIMEOUT",
            "SNIPPET_POSTTASK_SURVEY_STARTED",
            "SEARCH_TASK_COMPLETED",
        ]:
            self.session_time = get_time_diff(self.session_start_time, curr_time)
            self.session_start_time = None

        if event in ["DOC_CLICKED"]:
            self.doc_clicked_time = curr_time

        if event in ["DOC_MARKED_VIEWED", "DOC_MARKED_RELEVANT"]:
            self.doc_lag_time += get_time_diff(self.doc_clicked_time, curr_time)
Example #10
def test(config, model, test_iter):
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config,
                                                                model,
                                                                test_iter,
                                                                test=True)
    msg = "Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}"
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_diff(start_time)
    print("Time usage:", time_dif)
Example #11
def _load_data_into_dataframe(filename, is_train):
    """
    Load the training or prediction data into a DataFrame.
    """
    data_path = os.path.join(conf.DATA_DIR,
                             filename) if is_train else conf.PRED_DATA_DIR
    logger.info('Loading dataset: %s' % data_path)
    start_time = time()
    if not is_train:
        disk_smart_df = _get_pred_data(data_path)

    # the store format for pre-processed train data is h5
    else:
        disk_smart_df = pd.read_hdf(
            data_path,
            columns=SELECTED_CONT_COLS + SELECTED_INDEX_COLS +
            SELECTED_CATE_COLS + SELECTED_LABEL_COLS,
        )
    logger.info('Columns used: %s' % disk_smart_df.columns)
    end_time = time()
    logger.info('Dataset loaded, total time: %s' % get_time_diff(start_time, end_time))
    return disk_smart_df
Example #12
 def end_query_session(self, end_time):
     # DMAX added in this condition to take the first event only.
     # Subsequent events add overhead to the time - that isn't strictly part of the session.
     if self.session_end_time is None:
         # Added by David on December 13, 2016
         # Last events (e.g. EXPERIMENT_TIMEOUT) should not be considered as the final session event.
         # Some people in the experiment walk away (or something) for several minutes, meaning that times are way out.
         # In this case, we roll back to the last interaction event - where the event is not EXPERIMENT_TIMEOUT or SESSION_COMPLETED.
         end_time = self.last_interaction_time
         self.session_end_time = end_time

         # print("END EVENT: {0}".format(self.last_interaction_event))
         # print("END TIME: {0}".format(self.last_interaction_time))

         self.session_time = get_time_diff(self.session_start_time, end_time)
         # print("session time", self.session_time)

         self.update_times(end_time)
         # if self.last_event == 'VIEW_SEARCH_RESULTS_PAGE':
         #     self.snippet_time = self.snippet_time + get_time_diff(self.view_serp_time, end_time)

     # Adding some code to work out probabilities for clicking!
     relevant_count = 0

     # Clamp the hover depth to the number of results actually returned,
     # so neither loop below can index past the end of the result list.
     depth = min(self.hover_depth, len(self.query_response.results))

     for i in range(0, depth):
         if self.qrel_handler.get_value(self.topic, self.query_response.results[i].docid) > 0:
             relevant_count = relevant_count + 1

     for i in range(0, depth):
         docid_at_rank = self.query_response.results[i].docid

         if is_relevant(self.qrel_handler, self.topic, docid_at_rank) == 0:
             self.hover_trec_nonrel_count = self.hover_trec_nonrel_count + 1
         else:
             self.hover_trec_rel_count = self.hover_trec_rel_count + 1
Example #13
    def predict_k_fold(self, model, test_loader, ids, fold_index, coef=None):
        model.load_state_dict(
            torch.load(self.config.model_save_path + "-fold" +
                       str(fold_index)))
        model.eval()
        start_time = time.time()
        if coef is None:
            coef = [1.0, 1.0, 1.0]
        torch_coef = torch.tensor(coef, device=self.config.device).view(-1, 3)
        # Placeholder row so np.concatenate has an initial array; deleted after the loop.
        predicts_all = np.random.randn(1, 3)
        with torch.no_grad():
            for data_batch in tqdm(test_loader):
                input_ids = data_batch[0].clone().detach().to(
                    self.config.device)
                attention_masks = data_batch[1].clone().detach().to(
                    self.config.device)
                token_type_ids = data_batch[2].clone().detach().to(
                    self.config.device)
                model_inputs = (input_ids, attention_masks, token_type_ids)
                outputs = model(model_inputs)
                outputs = F.softmax(outputs, dim=1)
                outputs = outputs * torch_coef
                predicts = outputs.cpu().numpy()
                predicts_all = np.concatenate((predicts_all, predicts), axis=0)

        time_dif = get_time_diff(start_time)
        print("Time usage:", time_dif)
        predicts_all = np.delete(predicts_all, 0, axis=0)  # drop the placeholder row
        result_pd = pd.DataFrame(
            {
                '-1': predicts_all.T[0],
                '0': predicts_all.T[1],
                '1': predicts_all.T[2],
            },
            index=ids)
        save_name = self.config.predict_save_path + "-_fold" + str(
            fold_index) + ".csv"
        result_pd.to_csv(save_name)
Example #14
def test(config, model, test_iter):
    """
    Test the model.
    :param config:
    :param model:
    :param test_iter:
    :return:
    """
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config,
                                                                model,
                                                                test_iter,
                                                                test=True)
    msg = 'Test Loss:{0:>5.2}, Test Acc:{1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score")
    print(test_report)
    print("Confusion Maxtrix")
    print(test_confusion)
    time_diff = utils.get_time_diff(start_time)
    print("使用时间:", time_diff)
Example #15
 def get_time_diff(self):
     return utils.get_time_diff(self)
Example #16
def train(config, model, train_iter, dev_iter, test_iter=None):
    start_time = time.time()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)
    total_batch = 0
    dev_best_loss = float("inf")
    last_improve = 0
    flag = False
    writer = SummaryWriter(
        log_dir=os.path.join(config.log_path, time.strftime('%H_%M_%S')))
    for epoch in range(config.num_epoches):
        print("Epoch [{}/{}]".format(epoch + 1, config.num_epoches))
        # scheduler.step()
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = "*"
                    last_improve = total_batch
                else:
                    improve = ""
                time_dif = get_time_diff(start_time)
                msg = "Iter: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%},  Time: {5} {6}"
                print(
                    msg.format(
                        total_batch,
                        loss.item(),
                        train_acc,
                        dev_loss,
                        dev_acc,
                        time_dif,
                        improve,
                    ))
                writer.add_scalar("loss/train", loss.item(), total_batch)
                writer.add_scalar("loss/dev", dev_loss, total_batch)
                writer.add_scalar("acc/train", train_acc, total_batch)
                writer.add_scalar("acc/dev", dev_acc, total_batch)
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    writer.close()
    if test_iter:
        test(config, model, test_iter)
    else:
        test(config, model, dev_iter)
Example #17
    default='SkyerERNIEDPCNN',
    help='choose a model: SkyerBert, SkyerBertCNN, SkyerBertRNN, SkyerBertRCNN, SkyerBertDPCNN, SkyerERNIE, SkyerERNIEDPCNN'
)
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'THUCNews'  # dataset directory
    model_name = args.model
    x = import_module('models.' + model_name)
    config = x.Config(dataset)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(4)
    torch.backends.cudnn.deterministic = True  # ensure reproducible results across runs

    start_time = time.time()
    print('Loading dataset')
    train_data, dev_data, test_data = utils.build_dataset(config)
    train_iter = utils.build_iterator(train_data, config)
    dev_iter = utils.build_iterator(dev_data, config)
    test_iter = utils.build_iterator(test_data, config)

    time_diff = utils.get_time_diff(start_time)
    print("模型开始之前,准备数据时间:", time_diff)

    # Model training, evaluation and testing
    model = x.Model(config).to(config.device)
    train.train(config, model, train_iter, dev_iter, test_iter)
    #train.test(config, model, test_iter)
Example #18
def train(config, model, train_iter, dev_iter, test_iter):
    """
    Model training procedure.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :param test_iter:
    :return:
    """
    start_time = time.time()
    # enable BatchNormalization and dropout (training mode)
    model.train()
    # collect all of the model's named parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not receive weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)

    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # best validation loss so far
    last_improve = 0  # batch count at the last validation-loss improvement
    flag = False  # whether to stop training for lack of improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward(retain_graph=False)
            optimizer.step()
            if total_batch % 100 == 0:  # periodically report metrics on the train and validation sets
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_diff = utils.get_time_diff(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                print(
                    msg.format(total_batch, loss.item(), train_acc, dev_loss,
                               dev_acc, time_diff, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                # Stop when the validation loss has not dropped for require_improvement batches.
                print('No improvement on the validation set for a long time; stopping training automatically')
                flag = True
                break

        if flag:
            break
    test(config, model, test_iter)
Example #19
        if not label.startswith('.'))

    config = x.Config(data_dir, class_list, vocab_path)
    config.pad_size = pad_size
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True

    start_time = time.time()
    print("Loading data...")
    vocab, train_data, dev_data, test_data = build_dataset(config, args.word)
    train_iter = build_iterator(train_data, config)
    dev_iter = build_iterator(dev_data, config)
    test_iter = build_iterator(test_data, config)
    time_dif = get_time_diff(start_time)
    print("Time usage:", time_dif)

    config.num_vocab = len(vocab)
    model = x.Model(config).to(config.device)
    init_network(model)
    print(model)  # print the model architecture
    train(config, model, train_iter, dev_iter)

    predictions = predict(config, model, test_iter)

    if not os.path.exists('result'):
        os.mkdir('result')

    with open(os.path.join(
            'result',
Example #20
 def train(self,
           model,
           train_loader,
           dev_loader,
           fold_num,
           use_weight=False):
     start_time = time.time()
     t_total = len(train_loader) * self.config.num_epochs
     # Prepare optimizer and schedule (linear warmup and decay)
     no_decay = ['bias', 'LayerNorm.weight']
     optimizer_grouped_parameters = [{
         'params': [
             p for n, p in model.named_parameters()
             if not any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         self.config.weight_decay
     }, {
         'params': [
             p for n, p in model.named_parameters()
             if any(nd in n for nd in no_decay)
         ],
         'weight_decay':
         0.0
     }]
     warmup_steps = int(t_total * 0.1)
     optimizer = AdamW(optimizer_grouped_parameters,
                       lr=self.config.learn_rate,
                       eps=1e-8)
     scheduler = get_linear_schedule_with_warmup(
         optimizer,
         num_warmup_steps=warmup_steps,
         num_training_steps=t_total)
     total_batch = 0
     if use_weight:
         weight = torch.tensor([2, 1, 1.5],
                               dtype=torch.float).to(self.config.device)
         criterion = nn.CrossEntropyLoss(weight=weight)
     else:
         criterion = nn.CrossEntropyLoss()
     dev_per_batch = 500
     #dev_best_loss = float('inf')
     f1_score_best = 0
     last_improve = 0
     model.train()
     total_loss = 0
     if self.config.adv_type == 'fgm':
         fgm = FGM(model)
     for epoch in range(self.config.num_epochs):
         print('epoch [{}/{}]'.format(epoch + 1, self.config.num_epochs))
         for data_batch in train_loader:
             # Move the batch tensors to the configured device.
             input_ids = data_batch[0].clone().detach().to(
                 self.config.device)
             attention_masks = data_batch[1].clone().detach().to(
                 self.config.device)
             token_type_ids = data_batch[2].clone().detach().to(
                 self.config.device)
             labels = data_batch[3].clone().detach().long().to(
                 self.config.device)
             model_inputs = (input_ids, attention_masks, token_type_ids)
             total_batch += 1
             model.zero_grad()
             outputs = model(model_inputs)
             loss = criterion(outputs, labels.view(-1))
             loss.backward()
             # Adversarial training (FGM)
             if self.config.adv_type == 'fgm':
                 fgm.attack()  # perturb embeddings for the adversarial step
                 adv_outputs = model(model_inputs)
                 loss_adv = criterion(adv_outputs, labels.view(-1))
                 loss_adv.backward()
                 fgm.restore()
             torch.nn.utils.clip_grad_norm_(model.parameters(),
                                            self.config.max_grad_norm)
             optimizer.step()
             scheduler.step()  # Update learning rate schedule
             if total_batch % dev_per_batch == 0:
                 true_labels = labels.data.cpu()
                 predicts = torch.max(outputs.data, dim=1)[1].cpu().numpy()
                 train_acc = metrics.accuracy_score(true_labels, predicts)
                 time_dif = get_time_diff(start_time)
                 dev_acc, dev_loss, report, confusion, f1_score = self.evaluate(
                     model, dev_loader, fold_num)
                 model.train()
                 if f1_score_best < f1_score:
                     f1_score_best = f1_score
                     improve = '*'
                     torch.save(
                         model.state_dict(), self.config.model_save_path +
                         "-fold" + str(fold_num))
                     torch.save(model.state_dict(), 'saveModel/temp')
                 else:
                     improve = ' '
                     if f1_score_best - 0.02 > f1_score:
                         improve = '-'
                         model.load_state_dict(torch.load('saveModel/temp'))
                     else:
                         torch.save(model.state_dict(), 'saveModel/temp')
                 msg = 'Epoch:{0:>2} Iter: {1:>6}, Train Loss: {2:>5.2}, Train Acc: {3:>6.3%},' \
                             ' Dev Loss: {4:>5.2}, Dev Acc: {5:>6.3%}, f1_score: {6:>8.7}, Time: {7} {8}'
                 print(
                     msg.format(epoch + 1, total_batch, loss.item(),
                                train_acc, dev_loss, dev_acc, f1_score,
                                time_dif, improve))
     self.evaluate_model(model, dev_loader, fold_num)
Example #21
    def update_times(self, curr_time):

        # print(curr_time, self.last_time, get_time_diff(self.last_time, curr_time))

        if self.curr_event == 'DELAY_RESULTS_PAGE':
            self.serp_lag = get_time_diff(self.session_start_time, curr_time)
            self.last_query_delay_time = curr_time

        if self.curr_event == 'QUERY_COMPLETE':  # Was VIEW_SEARCH_RESULTS_PAGE
            if self.last_event == 'DELAY_RESULTS_PAGE':
                self.imposed_query_delay = get_time_diff(self.last_query_delay_time, curr_time)
            #if self.last_event == 'QUERY_END':  # Was QUERY_ISSUED
            #    self.serp_lag = get_time_diff(self.session_start_time, curr_time)
        
        if self.system_query_delay == 0.0 and self.curr_event == 'QUERY_END' and self.last_event == 'QUERY_START':
            self.system_query_delay = self.system_query_delay + get_time_diff(self.last_time, curr_time)

        if self.curr_event == 'DOCUMENT_DELAY_VIEW':
            # Document delay occurred, so track the time this happened at.
            self.last_document_delay_time = curr_time
            self.view_serp_time = self.view_serp_time + get_time_diff(self.last_time, curr_time)

        if self.curr_event == 'DOC_MARKED_VIEWED':
            if self.last_document_delay_time:
                if get_time_diff(self.last_document_delay_time, curr_time) < 10.0:
                    self.imposed_document_delay += get_time_diff(self.last_document_delay_time, curr_time)
                else:
                    self.view_serp_time += get_time_diff(self.last_time, curr_time)
            else:
                self.view_serp_time += get_time_diff(self.last_time, curr_time)

        if self.curr_event in ['DOCUMENT_HOVER_OUT', 'DOCUMENT_HOVER_IN', 'QUERY_FOCUS','VIEW_SAVED_DOCS','VIEW_TASK' ]:
            self.view_serp_time = self.view_serp_time + get_time_diff(self.last_time, curr_time)

        # This could be more robust.
        # What if the searcher were to view the list of documents marked, or view the task, whilst viewing a document?
        # Maybe this functionality should be disabled while a document is being viewed.
        # Commented out by DMAX on June 8th 2016 - replaced with more robust document time measures (see below).
        #if self.last_event in ['DOC_MARKED_VIEWED','DOC_MARKED_RELEVANT','DOC_MARKED_NONRELEVANT']:
        #    self.document_time = self.document_time + get_time_diff(self.last_time, curr_time)

        # DMAX - Added new document time measures (June 8th 2016)
        # self.doc_click_time contains the document click time. Set to False otherwise.
        if not self.doc_click_time and self.curr_event == 'DOC_CLICKED':
            self.doc_click_time = curr_time

        # Added in VIEW_SAVED_DOCS to cater for the event where a searcher flips to the saved document screen instead.
        if self.doc_click_time and self.curr_event in ['QUERY_START', 'VIEW_SAVED_DOCS', 'PRACTICE_SEARCH_TASK_COMPLETED', 'SESSION_COMPLETED', 'EXPERIMENT_TIMEOUT', 'SNIPPET_POSTTASK_SURVEY_STARTED', 'SEARCH_TASK_COMPLETED']:
            self.document_time = self.document_time + get_time_diff(self.doc_click_time, curr_time)
            self.doc_click_time = False
        # DMAX - End new document time measures

        # DMAX - Adding in new SERP details
        if not self.last_serp_event and self.curr_event == 'VIEW_SEARCH_RESULTS_PAGE':
            self.last_serp_event = curr_time
        #elif self.last_serp_event and self.curr_event == 'QUERY_FOCUS':
        #    print('QF', curr_time)
        elif self.last_serp_event and self.curr_event not in ['DOCUMENT_HOVER_IN', 'DOCUMENT_HOVER_OUT']:
            self.new_total_serp = self.new_total_serp + get_time_diff(self.last_serp_event, curr_time)
            self.last_serp_event = None
        # DMAX - End new SERP details

        # DMAX - Updated SERP lag time
        if self.curr_event == 'QUERY_END' and self.last_event == 'QUERY_START':
            self.serp_lag = self.serp_lag + get_time_diff(self.last_time, curr_time)
Example #22
    def process(self, vals):
        self.event_count += 1

        # We want to measure query time from the last QUERY_FOCUS event.
        # We could do it from the first, but we decided this could be too unreliable...
        # So every time we see a new QUERY_FOCUS, we override what we had before and update the time accordingly.
        # (A previous "if self.last_query_focus_time is None:" guard was removed so the time is overwritten.)

        if ('QUERY_FOCUS' in vals):
            self.last_query_focus_time = '{date} {time}'.format(date=vals[0],time=vals[1])

        if self.last_query_focus_time is None:
            if ('VIEW_SEARCH_BOX' in vals):
                self.last_query_focus_time = '{date} {time}'.format(date=vals[0],time=vals[1])
        
        if ('QUERY_ISSUED' in vals):
            # new query, create a query log entry
            if self.current_query:
                if self.last_query_focus_time:
                    lqft = self.last_query_focus_time
                else:
                    lqft = self.last_event_time  # We didn't see a FOCUS or VIEW_SEARCH_BOX, so fallback to last event time.

                self.current_query.end_query_session(lqft)

            #print "QUERY ISSUED:", vals[8:]
            #print self.last_query_focus_time, ':::', vals[1], ':::', get_time_diff(self.last_query_focus_time, vals[1])
            #print
            if self.last_query_focus_time is None:
                self.last_query_focus_time = self.last_event_time
            
            self.current_query = QueryLogEntry(self.key, vals, self.qrel_handler, self.engine, get_time_diff(self.last_query_focus_time, '{date} {time}'.format(date=vals[0],time=vals[1])))
            self.last_query_focus_time = None
            self.query_ended_previously = False
            self.queries.append(self.current_query)
        else:
            if self.current_query:
                # process result under this query object
                self.current_query.process(vals)
        
        # probably should put a condition on this (start task, doc viewed, view serp, etc, ) not all/any
        self.last_event_time = '{date} {time}'.format(date=vals[0],time=vals[1])

        event = vals[8]
        if event in ['PRACTICE_SEARCH_TASK_COMPLETED','SESSION_COMPLETED','EXPERIMENT_TIMEOUT','SNIPPET_POSTTASK_SURVEY_STARTED','SEARCH_TASK_COMPLETED']:
            #print 'search task complete - event'
            if self.current_query and not self.query_ended_previously:
                #print "end of search session"
                self.current_query.end_query_session('{date} {time}'.format(date=vals[0],time=vals[1]))
                self.query_ended_previously = True
        
        # Code for removing documents that were previously marked, but are then reselected as non-relevant.
        all_docs_unmarked = []
        
        for query_object in self.queries:
            all_docs_unmarked = all_docs_unmarked + query_object.doc_unmarked_list
            query_object.doc_unmarked_list = []
        
        for query_object in self.queries:
            for docid in all_docs_unmarked:
                if docid in query_object.doc_marked_list:
                    topic = self.key.split(' ')[4]
                    
                    query_object.doc_marked_list.remove(docid)
                    query_object.doc_rel_count = query_object.doc_rel_count - 1

                    if is_relevant(self.qrel_handler, topic, docid) == 0:
                        query_object.doc_clicked_trec_nonrel_count = query_object.doc_clicked_trec_nonrel_count - 1
                    else:
                        query_object.doc_clicked_trec_rel_count = query_object.doc_clicked_trec_rel_count - 1
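The log-processing examples (#9, #12, #21, #22) call get_time_diff with two timestamps (for example the '{date} {time}' strings built in #22) and accumulate the result as seconds. A hedged sketch of that string-based variant; the timestamp format is an assumption:

from datetime import datetime

def get_time_diff(start_str, end_str, fmt='%Y-%m-%d %H:%M:%S'):
    # Hypothetical: parse two 'date time' strings and return elapsed seconds.
    start = datetime.strptime(start_str, fmt)
    end = datetime.strptime(end_str, fmt)
    return (end - start).total_seconds()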