Example 1
File: runCSN.py Project: DaoD/CSN
def evaluate(model, X_test, best_result, patience, is_test=False):
    y_pred, y_label = predict(model, X_test)
    metrics = Metrics(args.score_file_path)

    with open(args.score_file_path, 'w') as output:
        for score, label in zip(y_pred, y_label):
            output.write(str(score) + '\t' + str(label) + '\n')

    result = metrics.evaluate_all_metrics()

    if not is_test and result[0] + result[1] + result[2] > best_result[
            0] + best_result[1] + best_result[2]:
        # tqdm.write("save model!!!")
        best_result = result
        tqdm.write("Best Result: R1: %.4f R2: %.4f R5: %.4f" %
                   (best_result[0], best_result[1], best_result[2]))
        logger.info("Best Result: R1: %.4f R2: %.4f R5: %.4f" %
                    (best_result[0], best_result[1], best_result[2]))
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), args.save_path)
    else:
        patience += 1

    if is_test:
        print("Best Result: R1: %.4f R2: %.4f R5: %.4f" %
              (best_result[0], best_result[1], best_result[2]))

    return best_result, patience
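
Example 1 drives early stopping through the (best_result, patience) pair it returns. Below is a minimal sketch of how such a helper is typically called from a training loop; train_one_epoch, dev_data, test_data, and args.patience are illustrative assumptions and are not taken from runCSN.py.

def run_training(model, dev_data, test_data, args):
    # Hypothetical driver for evaluate() above; train_one_epoch and args.patience are assumptions.
    best_result, patience = [0.0, 0.0, 0.0], 0
    for epoch in range(args.epochs):
        train_one_epoch(model)                         # assumed per-epoch trainer
        best_result, patience = evaluate(model, dev_data, best_result, patience)
        if patience >= args.patience:                  # stop once R1+R2+R5 stops improving
            break
    evaluate(model, test_data, best_result, patience, is_test=True)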
Example 2
class NeuralNetwork(nn.Module):

    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.patience = 0
        self.init_clip_max_norm = 2.0  # training collapses with BERT AdamW otherwise
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def forward(self):
        raise NotImplementedError

    def convert_examples_to_features(self, X_train_utterances, X_train_responses, tokenizer, Utterance_len, Response_len):
        """ Loads a data file into a list of `InputBatch`s.
            `cls_token_at_end` defines the location of the CLS token:
                - False (default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` defines the segment id associated with the CLS token (0 for BERT, 2 for XLNet).
        """
        maxbertlen = 256
        features = []
        for (ex_index, (utterances, response, length_utter)) in enumerate(zip(X_train_utterances, X_train_responses, Utterance_len)):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d" % (ex_index, len(X_train_utterances)))

            # Read the token-index utterances, append an _eos_ marker after each one,
            # and wrap the whole context with [CLS] ... [SEP].
            tokens_a = []
            utterlen = []
            for utterance in utterances:
                utterlen.append(len(utterance))
                tokens_a = tokens_a + utterance + [tokenizer.convert_tokens_to_ids("_eos_")]
            tokens_a = [tokenizer.cls_token_id] + tokens_a + [tokenizer.sep_token_id]
            tokens_b = response + [tokenizer.sep_token_id]
            utterlen.append(len(response))
            if len(utterlen) != 11:  # left-pad with zeros when the context has fewer than 10 utterances
                utterlen = [0] * (11 - len(utterlen)) + utterlen


            input_ids = tokens_a + tokens_b
            if len(input_ids)>maxbertlen:
                input_ids=[tokenizer.cls_token_id]+input_ids[-maxbertlen+1:]
            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            segment_ids = [0] * (len(input_ids) - len(tokens_b))  # context segment
            segment_ids += [1] * len(tokens_b)  # response segment

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = maxbertlen - len(input_ids)

            if padding_length > 0:
                input_ids = input_ids + ([tokenizer.pad_token_id] * padding_length)
                input_mask = input_mask + ([0] * padding_length)  # padded positions are masked out
                segment_ids = segment_ids + ([0] * padding_length)  # padding uses segment id 0

            assert len(input_ids) == maxbertlen
            assert len(input_mask) == maxbertlen
            assert len(segment_ids) == maxbertlen

            if ex_index < 1:
                logger.info("*** Example ***")
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              utterlen=utterlen))
        batch_length = []
        for Ui_len, ri_len in zip(Utterance_len, Response_len):
            length = []
            for uij_len in Ui_len:
                length += uij_len
            length += ri_len
            if len(length) > 300:  # keep the most recent 300 word lengths
                length = length[-300:]
            length += [0] * (300 - len(length))  # zero-pad to a fixed width of 300
            batch_length.append(length)

        return features, batch_length

    def train_step(self, i, data):
        with torch.no_grad():
            batch_ids,batch_mask,batch_seg,batch_utterlen,batch_y,batch_word_len = (item.cuda(device=self.device) for item in data)

        self.optimizer.zero_grad()

        logits = self.forward([batch_ids,batch_mask,batch_seg,batch_utterlen,batch_word_len])

        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        if i%10==0:
            print('Batch[{}] - loss: {:.6f}  batch_size:{}'.format(i, loss.item(),batch_y.size(0)) )  # , accuracy, corrects
        return loss


    def fit(self, X_train_utterances, X_train_responses, y_train,  # main training entry point
            X_dev_utterances, X_dev_responses, y_dev, tokenizer, B_train_utterances_len, B_train_responses_len, B_dev_utterances_len, B_dev_responses_len):

        if torch.cuda.is_available(): self.cuda()

        features,length =self.convert_examples_to_features(X_train_utterances, X_train_responses,tokenizer,B_train_utterances_len,B_train_responses_len)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_utterlen_ids = torch.tensor([f.utter_len for f in features], dtype=torch.long)  # per-example utterance/response boundary lengths
        y_labels = torch.FloatTensor(y_train)
        length = torch.IntTensor(length)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_utterlen_ids, y_labels,length)

        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, shuffle=True)
        self.loss_func = nn.BCELoss()

        if self.args.no_bert is True:
            optimizer_grouped_parameters = [
                {'params': [p for n, p in self.named_parameters() if 'bert_model' not in n]
                 }
            ]
            print("bert 동결 함")
            self.optimizer = AdamW(optimizer_grouped_parameters, lr=1e-3,weight_decay=self.args.l2_reg, correct_bias=True)
        else:
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in self.named_parameters() if not any(nd in n for nd in no_decay)],
                 'weight_decay': self.args.l2_reg},
                {'params': [p for n, p in self.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
            print("bert 학습중")
            self.optimizer=AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, weight_decay=self.args.l2_reg, correct_bias=True)

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch+1, "/", self.args.epochs)
            avg_loss = 0

            self.train()
            for i, data in enumerate(dataloader):  # original batch size was 200

                loss = self.train_step(i, data)


                if i > 0 and i % 500000 == 0:  # 200 * 500 = 100k steps; effectively disabled with the current batch size of 16
                    self.evaluate(X_dev_utterances, X_dev_responses, y_dev, tokenizer,B_dev_utterances_len, B_dev_responses_len)
                    self.train()

                if epoch >= 2 and self.patience >= 1:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    if self.args.no_bert is True:
                        self.adjust_learning_rate(0.6)
                    else:
                        self.adjust_learning_rate(0.8)
                    self.patience = 0

                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)

                avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss/cnt))

            self.evaluate(X_dev_utterances, X_dev_responses, y_dev,tokenizer,B_dev_utterances_len, B_dev_responses_len)


    def adjust_learning_rate(self, decay_rate=.8):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)


    def evaluate(self, X_dev_utterances, X_dev_responses, y_dev,tokenizer,B_dev_utterances_len,B_dev_responses_len,is_test=False,):
        y_pred = self.predict(X_dev_utterances, X_dev_responses,tokenizer,B_dev_utterances_len,B_dev_responses_len)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev):
                output.write(
                    str(score) + '\t' +
                    str(label) + '\n'
                )

        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n",
              "MAP:", result[0], "\t",
              "MRR:", result[1], "\t",
              "P@1:", result[2], "\t",
              "R1:",  result[3], "\t",
              "R2:",  result[4], "\t",
              "R5:",  result[5])

        if not is_test and result[3] + result[4] + result[5] > self.best_result[3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n",
                  "MAP:", self.best_result[0], "\t",
                  "MRR:", self.best_result[1], "\t",
                  "P@1:", self.best_result[2], "\t",
                  "R1:",  self.best_result[3], "\t",
                  "R2:",  self.best_result[4], "\t",
                  "R5:",  self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1


    def predict(self, X_dev_utterances, X_dev_responses,tokenizer,B_dev_utterances_len,B_dev_responses_len):
        self.eval()
        y_pred = []
        features ,length= self.convert_examples_to_features(X_dev_utterances, X_dev_responses, tokenizer,B_dev_utterances_len,B_dev_responses_len)

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_utterlen_ids = torch.tensor([f.utter_len for f in features], dtype=torch.long)  # per-example utterance/response boundary lengths
        length = torch.IntTensor(length)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_utterlen_ids, length)  # also adjusted to the 10k dev examples here
        #dataset = DialogueDataset(X_dev_utterances, X_dev_responses)

        dataloader = DataLoader(dataset, batch_size=128)

        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_ids, batch_mask, batch_seg, batch_utterlen,batch_word_len= (item.cuda() for item in data)
            with torch.no_grad():
                logits = self.forward([batch_ids, batch_mask, batch_seg, batch_utterlen,batch_word_len])
            if i % 10==0:
                print('Batch[{}] batch_size:{}'.format(i, batch_ids.size(0)))  # , accuracy, corrects
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred


    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available(): self.cuda()
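
Example 2 builds the [CLS] context [SEP] response [SEP] encoding and its segment ids by hand. For comparison, here is a minimal sketch of the same convention using the Hugging Face tokenizer call directly; the checkpoint name and sentences are illustrative and not taken from the CSN code.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
enc = tokenizer("is this jacksonville ?", "no it is not .",
                padding="max_length", max_length=16, truncation=True)
print(enc["input_ids"])       # [CLS] A [SEP] B [SEP] followed by padding
print(enc["token_type_ids"])  # 0 over the first segment, 1 over the second
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding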
Example 3
class NeuralNetwork(nn.Module):
    def __init__(self, args):
        super(NeuralNetwork, self).__init__()
        self.args = args
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            args.model_type]

        self.bert_config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            finetuning_task="classification",
            num_labels=1)

        self.bert_tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        special_tokens_dict = {'eos_token': '[eos]'}
        num_added_toks = self.bert_tokenizer.add_special_tokens(
            special_tokens_dict)

        self.bert_model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=self.bert_config)
        self.bert_model.resize_token_embeddings(len(self.bert_tokenizer))

        self.bert_model = self.bert_model.cuda()
        # multihop
        self.transformer_utt = TransformerBlock(input_size=768)
        self.transformer_eu = TransformerBlock(input_size=768)
        self.transformer_ru = TransformerBlock(input_size=768)

        self.transformer_ett = TransformerBlock(input_size=768)
        self.transformer_ue = TransformerBlock(input_size=768)
        self.transformer_re = TransformerBlock(input_size=768)

        self.transformer_rtt = TransformerBlock(input_size=768)
        self.transformer_ur = TransformerBlock(input_size=768)
        self.transformer_er = TransformerBlock(input_size=768)

        self._projection = nn.Sequential(nn.Linear(4 * 768, 200), nn.ReLU())

        self.rnn2 = nn.GRU(input_size=200,
                           hidden_size=200,
                           num_layers=1,
                           batch_first=True,
                           bidirectional=True)

        self._classification = nn.Sequential(nn.Dropout(p=0.2),
                                             nn.Linear(2 * 6 * 200, 200),
                                             nn.Tanh(), nn.Dropout(p=0.2),
                                             nn.Linear(200, 1))

    def forward(self):
        raise NotImplementedError

    def get_Matching_Map(self, bU_embedding, bE_embedding, bR_embedding, umask,
                         emask, rmask):
        '''
        :param bU_embedding: (batch_size*max_utterances, max_u_words, embedding_dim)
        :param bR_embedding: (batch_size*max_utterances, max_r_words, embedding_dim)
        :return: E: (bsz*max_utterances, max_u_words, max_r_words)
        '''
        # Four views per pair: self-attention, cross-attention, their difference, and their element-wise product.
        Hutt = self.transformer_utt(bU_embedding, bU_embedding, bU_embedding)
        Hue = self.transformer_ue(bU_embedding, bE_embedding, bE_embedding)
        Hur = self.transformer_ur(bU_embedding, bR_embedding, bR_embedding)

        Hett = self.transformer_ett(bE_embedding, bE_embedding, bE_embedding)
        Heu = self.transformer_eu(bE_embedding, bU_embedding, bU_embedding)
        Her = self.transformer_er(bE_embedding, bR_embedding, bR_embedding)

        Hrtt = self.transformer_rtt(bR_embedding, bR_embedding, bR_embedding)
        Hru = self.transformer_ru(bR_embedding, bU_embedding, bU_embedding)
        Hre = self.transformer_re(bR_embedding, bE_embedding, bE_embedding)

        #utterance
        ue_input = torch.cat((Hutt, Hue, Hutt - Hue, Hutt * Hue), dim=-1)
        ur_input = torch.cat((Hutt, Hur, Hutt - Hur, Hutt * Hur), dim=-1)
        #evidence
        eu_input = torch.cat((Hett, Heu, Hett - Heu, Hett * Heu), dim=-1)
        er_input = torch.cat((Hett, Her, Hett - Her, Hett * Her), dim=-1)
        #response
        ru_input = torch.cat((Hrtt, Hru, Hrtt - Hru, Hrtt * Hru), dim=-1)
        re_input = torch.cat((Hrtt, Hre, Hrtt - Hre, Hrtt * Hre), dim=-1)

        ue_input = self._projection(ue_input)
        ur_input = self._projection(ur_input)
        eu_input = self._projection(eu_input)
        er_input = self._projection(er_input)
        ru_input = self._projection(ru_input)
        re_input = self._projection(re_input)

        ue_output, _ = self.rnn2(ue_input)
        ur_output, _ = self.rnn2(ur_input)
        eu_output, _ = self.rnn2(eu_input)
        er_output, _ = self.rnn2(er_input)
        ru_output, _ = self.rnn2(ru_input)
        re_output, _ = self.rnn2(re_input)
        maxue, _ = ue_output.max(dim=1)
        maxur, _ = ur_output.max(dim=1)
        maxeu, _ = eu_output.max(dim=1)
        maxer, _ = er_output.max(dim=1)
        maxru, _ = ru_output.max(dim=1)
        maxre, _ = re_output.max(dim=1)

        umask = umask.sum(dim=1, keepdim=True)
        emask = emask.sum(dim=1, keepdim=True)
        rmask = rmask.sum(dim=1, keepdim=True)

        meanue = ue_output.sum(dim=1) / umask
        meanur = ur_output.sum(dim=1) / umask
        meaneu = eu_output.sum(dim=1) / emask
        meaner = er_output.sum(dim=1) / emask
        meanru = ru_output.sum(dim=1) / rmask
        meanre = re_output.sum(dim=1) / rmask

        v = torch.cat(
            [
                maxue + maxur, meanue + meanur, maxeu + maxer, meaneu + meaner,
                maxru + maxre, meanru + meanre
            ],
            dim=1)  # (batch_size, 6 * 2 * hidden_size)

        logits = self._classification(v)
        return logits.squeeze()

    def batch_att_cal(self, bertoutput, lenidx):
        batchsize = lenidx.shape[0]
        output = torch.zeros(batchsize)
        c_arr = torch.zeros((batchsize, 256, 768), dtype=torch.float32)
        e_arr = torch.zeros((batchsize, 250, 768), dtype=torch.float32)
        r_arr = torch.zeros((batchsize, 150, 768), dtype=torch.float32)
        c_mask = torch.zeros((batchsize, 256), dtype=torch.float32)
        e_mask = torch.zeros((batchsize, 250), dtype=torch.float32)
        r_mask = torch.zeros((batchsize, 150), dtype=torch.float32)

        #context = ho[0:lenidx[:][0]]
        for i in range(batchsize):
            c_arr[i, :lenidx[i][0] - 1] = bertoutput[i, 1:lenidx[i][0]]
            c_mask[i, :lenidx[i][0] - 1] = 1
            e_arr[i, :lenidx[i][1] -
                  lenidx[i][0]] = bertoutput[i, lenidx[i][0]:lenidx[i][1]]
            e_mask[i, :lenidx[i][1] - lenidx[i][0]] = 1
            r_arr[i, :lenidx[i][2] - lenidx[i][1] -
                  1] = bertoutput[i, lenidx[i][1] + 1:lenidx[i][2]][:150]
            r_mask[i, :lenidx[i][2] - lenidx[i][1] - 1] = 1

        c_arr, e_arr, r_arr = c_arr.cuda(), e_arr.cuda(), r_arr.cuda()
        c_mask, e_mask, r_mask = c_mask.cuda(), e_mask.cuda(), r_mask.cuda()

        logit = self.get_Matching_Map(c_arr, e_arr, r_arr, c_mask, e_mask,
                                      r_mask)

        return logit

    def train_step(self, i, data):
        with torch.no_grad():
            batch_ids, batch_mask, batch_seg, batch_y, batch_len = (item.cuda(
                device=self.device) for item in data)

        self.optimizer.zero_grad()

        output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
        output = self.batch_att_cal(output, batch_len)

        logits = torch.sigmoid(output)

        loss = self.loss_func(logits, target=batch_y)

        loss.backward()

        self.optimizer.step()
        if i % 100 == 0:
            print('Batch[{}] - loss: {:.6f}  batch_size:{}'.format(
                i, loss.item(), batch_y.size(0)))  # , accuracy, corrects
        return loss

    def fit(self, train, dev, train_evi, dev_evi):  # main training entry point

        if torch.cuda.is_available(): self.cuda()

        dataset = BERTDataset(self.args, train, train_evi, self.bert_tokenizer)
        sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                sampler=sampler)

        self.loss_func = nn.BCELoss()
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in self.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            correct_bias=True
        )  #weight_decay=self.args.l2_reg, correct_bias=False)

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0

            self.train()
            for i, data in tqdm(enumerate(dataloader)):  # original batch size was 200
                # torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)

                if epoch >= 2 and self.patience >= 3:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0

                loss = self.train_step(i, data)

                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(),
                                          max_norm=self.init_clip_max_norm)

                avg_loss += loss.item()
            cnt = len(train['y']) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))

            self.evaluate(dev, dev_evi)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, dev, dev_evi, is_test=False):
        y_pred = self.predict(dev, dev_evi)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, dev['y']):
                output.write(str(score) + '\t' + str(label) + '\n')

        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n", "MAP:", result[0], "\t", "MRR:",
              result[1], "\t", "P@1:", result[2], "\t", "R1:", result[3], "\t",
              "R2:", result[4], "\t", "R5:", result[5])

        if not is_test and result[3] + result[4] + result[5] > self.best_result[
                3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n", "MAP:", self.best_result[0], "\t", "MRR:",
                  self.best_result[1], "\t", "P@1:", self.best_result[2], "\t",
                  "R1:", self.best_result[3], "\t", "R2:", self.best_result[4],
                  "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, dev, dev_evi):
        self.eval()
        y_pred = []
        dataset = BERTDataset(self.args, dev, dev_evi, self.bert_tokenizer)
        dataloader = DataLoader(dataset, batch_size=128)

        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_ids, batch_mask, batch_seg, batch_y, batch_len = (
                    item.cuda() for item in data)
            with torch.no_grad():
                output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
                output = self.batch_att_cal(output, batch_len)
                logits = torch.sigmoid(output)

            if i % 100 == 0:
                print('Batch[{}] batch_size:{}'.format(
                    i, batch_ids.size(0)))  # , accuracy, corrects
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available(): self.cuda()
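
The pooling at the end of get_Matching_Map combines a max over the token dimension with a length-normalized sum. Here is a self-contained sketch of that pattern; the shapes are illustrative, and unlike the snippet above this version also zeroes padded positions before summing.

import torch

def max_and_masked_mean(x, mask):
    # x: (batch, seq_len, dim); mask: (batch, seq_len) with 1 for real tokens
    max_pooled, _ = x.max(dim=1)                               # (batch, dim)
    lengths = mask.sum(dim=1, keepdim=True).clamp(min=1)       # avoid division by zero
    mean_pooled = (x * mask.unsqueeze(-1)).sum(dim=1) / lengths
    return torch.cat([max_pooled, mean_pooled], dim=-1)        # (batch, 2 * dim)

x = torch.randn(2, 5, 8)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=torch.float)
print(max_and_masked_mean(x, mask).shape)  # torch.Size([2, 16])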
Example 4
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.all_preds = []

    def forward(self):
        raise NotImplementedError

    def train_step(self, i, data):
        with torch.no_grad():
            batch_u, batch_r, batch_y = (item.to(self.device) for item in data)

        self.optimizer.zero_grad()
        logits = self.forward(batch_u, batch_r)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        print('Batch[{}] - loss: {:.6f}  batch_size:{}'.format(
            i, loss.item(), batch_y.size(0)))  # , accuracy, corrects
        return loss

    def fit(self, X_train_utterances, X_train_responses, y_train,
            X_dev_utterances, X_dev_responses, y_dev):

        self.to(self.device)

        dataset = DialogueDataset(X_train_utterances, X_train_responses,
                                  y_train)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                shuffle=True)

        self.loss_func = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(),
                                    lr=self.args.learning_rate,
                                    weight_decay=self.args.l2_reg)

        for epoch in range(int(self.args.epochs)):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0

            self.train()
            for i, data in enumerate(dataloader):
                loss = self.train_step(i, data)

                if i > 0 and i % 500 == 0:
                    self.evaluate(X_dev_utterances, X_dev_responses, y_dev)
                    self.train()

                if epoch >= 2 and self.patience >= 3:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0

                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(),
                                          max_norm=self.init_clip_max_norm)

                avg_loss += loss.item()

            cnt = len(y_train) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, y_dev)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self,
                 X_dev_utterances,
                 X_dev_responses,
                 y_dev,
                 is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n", "MAP:", result[0], "\t", "MRR:",
              result[1], "\t", "P@1:", result[2], "\t", "R1:", result[3], "\t",
              "R2:", result[4], "\t", "R5:", result[5])

        args = vars(self.args)
        if not os.path.isdir(args['output_predictions_folder']):
            os.makedirs(args['output_predictions_folder'])

        with open(
                os.path.join(args['output_predictions_folder'], 'config.json'),
                'w') as f:
            args['ranker'] = "MSN"
            args['seed'] = str(args['seed'])
            args_dict = {}
            args_dict['args'] = args

            f.write(json.dumps(args_dict, indent=4, sort_keys=True))

        df = pd.DataFrame(self.all_preds,
                          columns=[
                              'prediction_' + str(i)
                              for i in range(len(self.all_preds[0]))
                          ])
        df.to_csv(args['output_predictions_folder'] + "/predictions.csv",
                  index=False)

        if not is_test and result[3] + result[4] + result[5] > self.best_result[
                3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n", "MAP:", self.best_result[0], "\t", "MRR:",
                  self.best_result[1], "\t", "P@1:", self.best_result[2], "\t",
                  "R1:", self.best_result[3], "\t", "R2:", self.best_result[4],
                  "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, X_dev_utterances, X_dev_responses):
        self.eval()
        y_pred = []
        dataset = DialogueDataset(X_dev_utterances, X_dev_responses)
        dataloader = DataLoader(dataset, batch_size=51)

        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_u, batch_r = (item.to(self.device) for item in data)

            logits = self.forward(batch_u, batch_r)
            self.all_preds.append(logits.data.cpu().numpy().tolist())
            y_pred += logits.data.cpu().numpy().tolist()

        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        # if torch.cuda.is_available(): self.cuda()
        self.to(self.device)
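
Example 4 additionally writes the run configuration and the raw per-batch predictions to disk. A standalone sketch of that bookkeeping follows; the output folder, seed, and prediction values are made up for illustration.

import json
import os
import pandas as pd

out_dir = "output/run_0"  # illustrative path
os.makedirs(out_dir, exist_ok=True)

with open(os.path.join(out_dir, "config.json"), "w") as f:
    json.dump({"args": {"ranker": "MSN", "seed": "42"}}, f, indent=4, sort_keys=True)

all_preds = [[0.91, 0.12, 0.05], [0.33, 0.80, 0.44]]  # one row per evaluation batch
df = pd.DataFrame(all_preds, columns=["prediction_" + str(i) for i in range(len(all_preds[0]))])
df.to_csv(os.path.join(out_dir, "predictions.csv"), index=False)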
Example 5
class NeuralNetwork(nn.Module):
    def __init__(self, args):
        super(NeuralNetwork, self).__init__()
        self.args = args
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

        config_class, model_class, tokenizer_class = MODEL_CLASSES[
            args.model_type]

        self.bert_config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            finetuning_task="classification",
            num_labels=1)

        self.bert_tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        special_tokens_dict = {
            'eos_token': '[eos]',
            'additional_special_tokens': ['[soe]', '[eoe]']
        }
        num_added_toks = self.bert_tokenizer.add_special_tokens(
            special_tokens_dict)

        self.bert_model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=self.bert_config)
        self.bert_model.resize_token_embeddings(len(self.bert_tokenizer))

        self.bert_model = self.bert_model.cuda()

        self.attn = nn.Linear(768, 768)
        self.rnn = nn.GRU(input_size=768,
                          hidden_size=200,
                          num_layers=1,
                          batch_first=True,
                          bidirectional=False)
        self.bilinear = nn.Bilinear(768, 768, 1)

    def forward(self):
        raise NotImplementedError

    def forward_attn(self, x1, x2):
        """
        attention
        x1=T D
        x2=T D -> D
        """
        max_len = x1.size(0)  #T D
        x2 = x2.mean(dim=0)
        attn = self.attn(x1)  # T,D
        attn_energies = attn.mm(x2.unsqueeze(1))  #T,D * D,1 --> T,1
        alpha = F.softmax(attn_energies, dim=0)  # T,1
        alpha = alpha.transpose(0, 1)  #1,T
        weighted_attn = alpha.mm(x1)  # 1,T * T D= 1 D

        return weighted_attn

    def batch_att_cal(self, bertoutput, lenidx):
        #hid_out,_=self.rnn(bertoutput)
        batchsize = lenidx.shape[0]
        output = torch.zeros(batchsize)
        #context = ho[0:lenidx[:][0]]
        for i in range(batchsize):
            #context_evidence=bertoutput[i,1:lenidx[i][1]]
            context_evidence = torch.cat(
                (bertoutput[i, 1:lenidx[i][0]],
                 bertoutput[i, lenidx[i][0] + 1:lenidx[i][1]]),
                dim=0)
            response = bertoutput[i, lenidx[i][1] + 1:lenidx[i][2]]

            ceattn = self.forward_attn(context_evidence, response)
            rattn = self.forward_attn(response, context_evidence)
            output[i] = self.bilinear(ceattn, rattn)
            #if torch.isnan(output[i])==True:
            #   print("nan")
        return output.cuda()

    def train_step(self, i, data):
        with torch.no_grad():
            batch_ids, batch_mask, batch_seg, batch_y, batch_len = (item.cuda(
                device=self.device) for item in data)

        self.optimizer.zero_grad()

        output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
        output = self.batch_att_cal(output, batch_len)

        logits = torch.sigmoid(output)

        loss = self.loss_func(logits, target=batch_y)

        loss.backward()

        self.optimizer.step()
        if i % 100 == 0:
            print('Batch[{}] - loss: {:.6f}  batch_size:{}'.format(
                i, loss.item(), batch_y.size(0)))  # , accuracy, corrects
        return loss

    def fit(self, train, dev, train_evi, dev_evi):  # main training entry point

        if torch.cuda.is_available(): self.cuda()

        dataset = BERTDataset(self.args, train, train_evi, self.bert_tokenizer)
        sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                sampler=sampler)

        self.loss_func = nn.BCELoss()
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in self.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params': [
                p for n, p in self.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]
        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            correct_bias=True
        )  #weight_decay=self.args.l2_reg, correct_bias=False)

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0

            self.train()
            for i, data in tqdm(enumerate(dataloader)):  # original batch size was 200
                # torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)

                if epoch >= 2 and self.patience >= 3:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0

                loss = self.train_step(i, data)

                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(),
                                          max_norm=self.init_clip_max_norm)

                avg_loss += loss.item()
            cnt = len(train['y']) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))

            self.evaluate(dev, dev_evi)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, dev, dev_evi, is_test=False):
        y_pred = self.predict(dev, dev_evi)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, dev['y']):
                output.write(str(score) + '\t' + str(label) + '\n')

        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n", "MAP:", result[0], "\t", "MRR:",
              result[1], "\t", "P@1:", result[2], "\t", "R1:", result[3], "\t",
              "R2:", result[4], "\t", "R5:", result[5])

        if not is_test and result[3] + result[4] + result[5] > self.best_result[
                3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n", "MAP:", self.best_result[0], "\t", "MRR:",
                  self.best_result[1], "\t", "P@1:", self.best_result[2], "\t",
                  "R1:", self.best_result[3], "\t", "R2:", self.best_result[4],
                  "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, dev, dev_evi):
        self.eval()
        y_pred = []
        dataset = BERTDataset(self.args, dev, dev_evi, self.bert_tokenizer)
        dataloader = DataLoader(dataset, batch_size=128)

        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_ids, batch_mask, batch_seg, batch_y, batch_len = (
                    item.cuda() for item in data)
            with torch.no_grad():
                output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
                output = self.batch_att_cal(output, batch_len)
                #for out in torch.isnan(output):
                #   if out == True:
                #      print(out)
                logits = torch.sigmoid(output)

            if i % 100 == 0:
                print('Batch[{}] batch_size:{}'.format(
                    i, batch_ids.size(0)))  # , accuracy, corrects
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available(): self.cuda()
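
forward_attn above is single-query attention: project the tokens of one sequence, score them against the mean vector of the other sequence, softmax, and return the weighted sum. Below is a minimal sketch of the same computation with explicit shapes; the dimensions are illustrative.

import torch
import torch.nn as nn
import torch.nn.functional as F

dim = 768
attn = nn.Linear(dim, dim)

def single_query_attention(x1, x2):
    # x1: (T1, dim) tokens to pool; x2: (T2, dim) sequence providing the query
    query = x2.mean(dim=0)                             # (dim,)
    scores = attn(x1).mm(query.unsqueeze(1))           # (T1, 1)
    alpha = F.softmax(scores, dim=0).transpose(0, 1)   # (1, T1)
    return alpha.mm(x1)                                # (1, dim) weighted summary of x1

out = single_query_attention(torch.randn(7, dim), torch.randn(5, dim))
print(out.shape)  # torch.Size([1, 768])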
Example 6
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

    def forward(self):
        raise NotImplementedError

    def train_step(self, data):
        with torch.no_grad():
            batch_u, batch_r, batch_p, batch_y = (item.cuda(device=self.device)
                                                  for item in data)

        self.optimizer.zero_grad()
        logits = self.forward(batch_u, batch_r, batch_p)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        # print('Batch[{}] - loss: {:.6f}  batch_size:{}'.format(i, loss.item(), batch_y.size(0)))  # , accuracy, corrects
        return loss, batch_y.size(0)

    def fit(self, X_train_utterances, X_train_responses, X_train_personas,
            y_train, X_dev_utterances, X_dev_responses, X_dev_personas, y_dev):
        if torch.cuda.is_available():
            self.cuda()

        dataset = Dataset(X_train_utterances, X_train_responses,
                          X_train_personas, y_train)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                shuffle=True)

        self.loss_func = nn.CrossEntropyLoss()
        self.optimizer = optim.AdamW(self.parameters(),
                                     lr=self.args.learning_rate)

        for epoch in range(self.args.epochs):
            self.epoch = epoch
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0

            self.train()
            with tqdm(total=len(y_train), ncols=90) as pbar:
                for i, data in enumerate(dataloader):
                    loss, batch_size = self.train_step(data)
                    pbar.set_postfix(lr=self.args.learning_rate,
                                     loss=loss.item())

                    if i > 0 and i % 500 == 0:
                        self.evaluate(X_dev_utterances, X_dev_responses,
                                      X_dev_personas, y_dev)
                        self.train()

                    if epoch >= 1 and self.patience >= 3:
                        # tqdm.write("Reload the best model...")
                        self.load_state_dict(torch.load(self.args.save_path))
                        self.adjust_learning_rate()
                        self.patience = 0

                    if self.init_clip_max_norm is not None:
                        utils.clip_grad_norm_(self.parameters(),
                                              max_norm=self.init_clip_max_norm)

                    pbar.update(batch_size)
                    avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            tqdm.write("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, X_dev_personas,
                          y_dev)
            tqdm.write("Best Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                       (self.best_result[0], self.best_result[1],
                        self.best_result[2]))

    def adjust_learning_rate(self, decay_rate=0.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        # tqdm.write("Decay learning rate to: " + str(self.args.learning_rate))

    def evaluate(self,
                 X_dev_utterances,
                 X_dev_responses,
                 X_dev_personas,
                 y_dev,
                 is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses,
                              X_dev_personas)
        y_dev_one_hot = np.zeros((len(y_dev), 20), dtype=int)
        for i in range(len(y_dev)):
            y_dev_one_hot[i][y_dev[i]] = 1
        y_dev_one_hot = y_dev_one_hot.reshape(-1)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev_one_hot):
                output.write(str(score) + '\t' + str(label) + '\n')

        result = self.metrics.evaluate_all_metrics()

        if not is_test and result[0] + result[1] + result[2] > self.best_result[
                0] + self.best_result[1] + self.best_result[2]:
            # tqdm.write("save model!!!")
            self.best_result = result
            tqdm.write("Best Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                       (self.best_result[0], self.best_result[1],
                        self.best_result[2]))
            self.logger.info("Best Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                             (self.best_result[0], self.best_result[1],
                              self.best_result[2]))
            self.patience = 0
            torch.save(self.state_dict(), self.args.save_path)
        else:
            self.patience += 1

        if is_test:
            print("Evaluation Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                  (result[0], result[1], result[2]))

    def predict(self, X_dev_utterances, X_dev_responses, X_dev_personas):
        self.eval()
        y_pred = []
        dataset = Dataset(X_dev_utterances, X_dev_responses, X_dev_personas)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size)
        with torch.no_grad():
            for i, data in enumerate(dataloader):
                batch_u, batch_r, batch_l = (item.cuda() for item in data)
                logits = self.forward(batch_u, batch_r, batch_l)
                y_pred.append(logits.data.cpu().numpy().reshape(-1))
        y_pred = np.concatenate(y_pred, axis=0).tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available():
            self.cuda()
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.patience = 0
        self.init_clip_max_norm = 10.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

    def forward(self):
        raise NotImplementedError

    def train_step(self, i, data):
        with torch.no_grad():
            batch_u, batch_r, batch_key_r, batch_key_mask_r, batch_y = (
                item.cuda(device=self.device) for item in data)

        self.optimizer.zero_grad()
        logits = self.forward(batch_u, batch_r, batch_key_r, batch_key_mask_r)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        if i % 100 == 0:
            print('Batch[{}] - loss: {:.6f}  batch_size:{}'.format(
                i, loss.item(), batch_y.size(0)))  # , accuracy, corrects
        return loss

    def fit(self, X_train_utterances, X_train_responses, y_train,
            X_dev_utterances, X_dev_responses, y_dev, key_r, key_mask_r,
            dev_key_r, dev_key_mask_r):

        if torch.cuda.is_available(): self.cuda()

        dataset = DialogueDataset(X_train_utterances, X_train_responses, key_r,
                                  key_mask_r, y_train)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                shuffle=True)

        self.loss_func = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(),
                                    lr=self.args.learning_rate,
                                    weight_decay=self.args.l2_reg)

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0

            self.train()
            for i, data in enumerate(dataloader):

                if epoch >= 2 and self.patience >= 2:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0

                loss = self.train_step(i, data)

                if i > 0 and i % 1000000 == 0:
                    self.evaluate(X_dev_utterances, X_dev_responses, dev_key_r,
                                  dev_key_mask_r, y_dev)
                    self.train()

                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(),
                                          max_norm=self.init_clip_max_norm)

                avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, dev_key_r,
                          dev_key_mask_r, y_dev)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self,
                 X_dev_utterances,
                 X_dev_responses,
                 dev_key_r,
                 dev_key_mask_r,
                 y_dev,
                 is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses, dev_key_r,
                              dev_key_mask_r)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev):
                output.write(str(score) + '\t' + str(label) + '\n')

        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n", "MAP:", result[0], "\t", "MRR:",
              result[1], "\t", "P@1:", result[2], "\t", "R1:", result[3], "\t",
              "R2:", result[4], "\t", "R5:", result[5])

        if not is_test and result[3] + result[4] + result[5] > self.best_result[
                3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n", "MAP:", self.best_result[0], "\t", "MRR:",
                  self.best_result[1], "\t", "P@1:", self.best_result[2], "\t",
                  "R1:", self.best_result[3], "\t", "R2:", self.best_result[4],
                  "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, X_dev_utterances, X_dev_responses, dev_key_r,
                dev_key_mask_r):
        self.eval()
        y_pred = []
        dataset = DialogueDataset(X_dev_utterances, X_dev_responses, dev_key_r,
                                  dev_key_mask_r)
        dataloader = DataLoader(dataset, batch_size=400)

        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_u, batch_r, batch_key_r, batch_key_masked_r = (
                    item.cuda() for item in data)

                logits = self.forward(batch_u, batch_r, batch_key_r,
                                      batch_key_masked_r)
                y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available(): self.cuda()
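
In the persona-based variant (the first class of Example 6) each dialogue has 20 candidate responses and y_dev stores the index of the correct one, so evaluation first expands the indices into a flattened one-hot vector aligned with the flattened prediction scores. A small sketch of that expansion; the candidate count of 20 comes from the snippet, while the labels are made up.

import numpy as np

y_dev = [3, 0, 19]  # index of the correct candidate for each dialogue
num_candidates = 20
one_hot = np.zeros((len(y_dev), num_candidates), dtype=int)
one_hot[np.arange(len(y_dev)), y_dev] = 1
flat_labels = one_hot.reshape(-1)  # lines up with the flattened list of predicted scores
print(flat_labels[:25])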