Code Example #1
 def evaluate(self, iterator):
     self.model.eval()
     epoch_loss = 0
     epoch_acc = 0
     with torch.no_grad():
         for _, batch in enumerate(iterator):
             label = batch["label"]
             text = batch["text"]
             input_ids, token_type_ids = convert_text_to_ids(
                 self.tokenizer, text)
             input_ids = seq_padding(self.tokenizer, input_ids)
             token_type_ids = seq_padding(self.tokenizer, token_type_ids)
             label = label.unsqueeze(1)
             input_ids, token_type_ids, label = input_ids.long(
             ), token_type_ids.long(), label.long()
             input_ids, token_type_ids, label = input_ids.to(
                 self.device), token_type_ids.to(self.device), label.to(
                     self.device)
             output = self.model(input_ids=input_ids,
                                 token_type_ids=token_type_ids,
                                 labels=label)
             y_pred_label = output[1].argmax(dim=1)
             loss = output[0]
             acc = ((y_pred_label == label.view(-1)).sum()).item()
             epoch_loss += loss.item()
             epoch_acc += acc
     return epoch_loss / len(iterator), epoch_acc / len(
         iterator.dataset.dataset)
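
The seq_padding helper called above is not included in these snippets. A minimal sketch of what the two-argument variant used here might look like, assuming it pads every id list in the batch to the batch maximum length with the tokenizer's pad token id and returns a torch.Tensor (the callers then cast it with .long() and move it with .to(device)):

    import torch

    # Hedged sketch, not the original implementation.
    def seq_padding(tokenizer, ids_batch):
        # assumes a HuggingFace-style tokenizer exposing pad_token_id
        pad_id = tokenizer.pad_token_id
        max_len = max(len(ids) for ids in ids_batch)
        padded = [ids + [pad_id] * (max_len - len(ids)) for ids in ids_batch]
        return torch.tensor(padded)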
Code Example #2
 def get_embedding(self, text1):
     x1token = ['[CLS]'] + self.tokenizer.tokenize(text1) + ['[SEP]']
     x1mask = [1] * len(x1token)
     x1mask = FloatTensor(
         utils.seq_padding(np.array([x1mask]), self.para.bert_maxlen))
     x1ids = self.tokenizer.convert_tokens_to_ids(x1token)
     x1ids = LongTensor(
         utils.seq_padding(np.array([x1ids]), self.para.bert_maxlen))
     x1 = self.my_model.bert_embedding([x1ids, x1mask])
     x1 = x1 / torch.sqrt(torch.sum(x1 * x1, -1, keepdim=True))
     print('x1 shape: ', np.shape(x1[0]))
     return x1[0].cpu().detach().numpy()
Code Example #3
 def get_embedding_list(self, text1):
     x1token = ['[CLS]'] + self.tokenizer.tokenize(text1) + ['[SEP]']
     x1mask = [1] * len(x1token)
     x1mask = FloatTensor(
         utils.seq_padding(np.array([x1mask]), self.para.bert_maxlen))
     x1ids = self.tokenizer.convert_tokens_to_ids(x1token)
     x1ids = LongTensor(
         utils.seq_padding(np.array([x1ids]), self.para.bert_maxlen))
     x1 = self.my_model.bert_embedding.bert_embedding_model(
         x1ids, attention_mask=x1mask)
     x1 = x1[0][-2:]
     x1 = torch.cat(x1, -1)
     x1mask = x1mask.view(-1, self.para.bert_maxlen, 1)
     x1mask = x1mask.expand(-1, -1, 2048)
     x1 = x1 * x1mask  # zero out the vectors at padded (mask = 0) positions
     return x1[0, :len(x1token), :].cpu().detach().numpy()
Code Example #4
 def predict(self, sentence):
     input_ids, token_type_ids = convert_text_to_ids(
         self.tokenizer, sentence)
     input_ids = seq_padding(self.tokenizer, [input_ids])
     token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
     # cast to LongTensor
     input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
     # zero out gradients
     self.optimizer.zero_grad()
     # move to the GPU
     input_ids, token_type_ids = input_ids.to(
         self.device), token_type_ids.to(self.device)
     output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
     y_pred_prob = output[0]
     y_pred_label = y_pred_prob.argmax(dim=1)
     print(y_pred_label)
Code Example #5
 def get_batch_embedding(self, text_batch):
     X1ids = []
     X1mask = []
     for text1 in text_batch:
         x1token = ['[CLS]'] + self.tokenizer.tokenize(text1) + ['[SEP]']
         x1mask = [1] * len(x1token)
         x1ids = self.tokenizer.convert_tokens_to_ids(x1token)
         X1ids.append(x1ids)
         X1mask.append(x1mask)
     X1ids = LongTensor(
         utils.seq_padding(np.array(X1ids), self.para.bert_maxlen))
     X1mask = FloatTensor(
         utils.seq_padding(np.array(X1mask), self.para.bert_maxlen))
     X1 = self.my_model.bert_embedding([X1ids, X1mask])
     X1 = X1 / torch.sqrt(torch.sum(X1 * X1, -1, keepdim=True))
     # print('X1 shape: ', np.shape(X1))
     return X1
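
Because the returned embeddings are L2-normalized along the last dimension, a pairwise cosine-similarity matrix for a batch can be computed with a single matrix product. A hedged usage sketch, assuming the object is called evaluator and the embeddings have shape (batch, dim):

    import torch

    texts = ["first sentence", "second sentence", "third sentence"]
    emb = evaluator.get_batch_embedding(texts)            # (batch, dim), unit length per row
    sim_matrix = torch.matmul(emb, emb.transpose(0, 1))   # (batch, batch) cosine similarities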
Code Example #6
    def splitBatch(self, en, cn, batch_size, shuffle=True):
        idx_list = np.arange(0, len(en), batch_size)
        if shuffle:
            np.random.shuffle(idx_list)
        batch_indexs = []
        for idx in idx_list:
            batch_indexs.append(np.arange(idx, min(idx + batch_size, len(en))))

        batches = []
        for batch_index in batch_indexs:
            batch_en = [en[index] for index in batch_index]
            batch_cn = [cn[index] for index in batch_index]
            batch_cn = seq_padding(batch_cn)
            batch_en = seq_padding(batch_en)
            batches.append(Batch(batch_en, batch_cn))

        return batches
Code Example #7
    def train_an_epoch(self, iterator):
        self.model_setup()
        epoch_loss = 0
        epoch_acc = 0

        for i, batch in enumerate(iterator):
            label = batch["label"]
            text = batch["text"]
            input_ids, token_type_ids = convert_text_to_ids(
                self.tokenizer, text)
            input_ids = seq_padding(self.tokenizer, input_ids)
            token_type_ids = seq_padding(self.tokenizer, token_type_ids)
            # label has shape (batch_size, 1)
            label = label.unsqueeze(1)
            # cast to LongTensor
            input_ids, token_type_ids, label = input_ids.long(
            ), token_type_ids.long(), label.long()
            # zero out gradients
            self.optimizer.zero_grad()
            # move to the GPU
            input_ids, token_type_ids, label = input_ids.to(
                self.device), token_type_ids.to(self.device), label.to(
                    self.device)
            output = self.model(input_ids=input_ids,
                                token_type_ids=token_type_ids,
                                labels=label)
            y_pred_prob = output[1]
            y_pred_label = y_pred_prob.argmax(dim=1)
            # compute the loss
            # (this loss is the same as output[0])
            loss = self.criterion(y_pred_prob.view(-1, 2), label.view(-1))
            #loss = output[0]
            # compute the accuracy
            acc = ((y_pred_label == label.view(-1)).sum()).item()
            # backpropagate
            loss.backward()
            self.optimizer.step()
            # accumulate loss and acc over the epoch
            epoch_loss += loss.item()
            epoch_acc += acc
            if i % 200 == 0:
                print("current loss:", epoch_loss / (i + 1), "\t",
                      "current acc:", epoch_acc / ((i + 1) * len(label)))
        return epoch_loss / len(iterator), epoch_acc / len(
            iterator.dataset.dataset)
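
A hedged sketch of how train_an_epoch and the evaluate method from Code Example #1 might be driven from an outer loop; the trainer object, iterator names, and epoch count are assumptions, not part of the original code:

    # Hypothetical outer training loop.
    for epoch in range(num_epochs):
        train_loss, train_acc = trainer.train_an_epoch(train_iterator)
        valid_loss, valid_acc = trainer.evaluate(valid_iterator)
        print(f"epoch {epoch}: train loss {train_loss:.4f}, acc {train_acc:.4f} | "
              f"valid loss {valid_loss:.4f}, acc {valid_acc:.4f}")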
Code Example #8
 def __get_testbatch__(self, num):
     start_idx = num * self.batch_size
     end_idx = min(start_idx + self.batch_size, len(self.data))
     idxs = [start_idx + i for i in range(end_idx - start_idx)]
     X1, X2, Y = [], [], []
     for i in idxs:
         d = self.data[i]
         text = d[0][:self.pad_size]
         x1, x2 = tokenizer.encode(first=text)
         X1.append(x1)
         X2.append(x2)
         label = [0] * 202
         for p in d[1]:
             label[self.tag2id[p]] = 1
         Y.append(label)
     #print(X1)
     X1 = np.array(seq_padding(X1))
     X2 = np.array(seq_padding(X2))
     Y = np.array(seq_padding(Y))
     return [X1, X2], Y
Code Example #9
    def splitBatch(self, en, cn, batch_size, shuffle=True):
        """
        将以单词id列表表示的翻译前(英文)数据和翻译后(中文)数据
        按照指定的batch_size进行划分
        如果shuffle参数为True,则会对这些batch数据顺序进行随机打乱

        排序之后,一个batch深入,填充的位置会变少
        """
        # From the list of data indices [0, 1, ..., len(en)-1],
        # take an index every batch_size positions as the start index of a batch
        idx_list = np.arange(0, len(en), batch_size)
        # If shuffle is True, shuffle the order of these batch start indices
        if shuffle:
            np.random.shuffle(idx_list)
        # Holds the arrays of sentence indices for each batch
        batch_indexs = []
        for idx in idx_list:
            # Note: the batch with the largest start index may run past the
            # end of the data, so its end index is capped at the data size
            """
            e.g. [array([4, 5, 6, 7]),
                  array([0, 1, 2, 3]),
                  array([8, 9, 10, 11]),
                  ...]
            """
            batch_indexs.append(np.arange(idx, min(idx + batch_size, len(en))))

        # Using each batch's sentence indices, build the actual batches of word-id sentences
        batches = []
        for batch_index in batch_indexs:
            # Select the word-id sentences for the current batch (array fancy indexing)
            batch_en = [en[index] for index in batch_index]
            batch_cn = [cn[index] for index in batch_index]
            # Pad every sentence in the current batch to the same length
            # Overall shape: num_batches x batch_size x max sentence length per batch
            batch_cn = seq_padding(batch_cn)
            batch_en = seq_padding(batch_en)
            # Append the current batch's English and Chinese data to the list of all batches
            batches.append(Batch(batch_en, batch_cn))

        return batches
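
The single-argument seq_padding used by splitBatch is likewise not shown. A minimal sketch under the assumption that it pads each id list with 0 up to the longest sentence in the batch and returns a NumPy array:

    import numpy as np

    # Hedged sketch, not the original implementation.
    def seq_padding(batch, padding_value=0):
        max_len = max(len(seq) for seq in batch)
        return np.array([list(seq) + [padding_value] * (max_len - len(seq)) for seq in batch])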
Code Example #10
 def __iter__(self):
     train_data = self.data
     while True:
         idxs = [i for i in range(len(train_data))]
         np.random.shuffle(idxs)
         X1, X2, Y = [], [], []
         for i in idxs:
             #print(i)
             d = train_data[i]
             text = d[0][:self.pad_size]
             x1, x2 = tokenizer.encode(first=text)
             X1.append(x1)
             X2.append(x2)
             label = [0] * 202
             for p in d[1]:
                 label[self.tag2id[p]] = 1
             Y.append(label)
             if len(X1) == self.batch_size or i == idxs[-1]:
                 X1 = np.array(seq_padding(X1))
                 X2 = np.array(seq_padding(X2))
                 Y = np.array(seq_padding(Y))
                 yield [X1, X2], Y
                 X1, X2, Y = [], [], []
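
This generator yields ([X1, X2], Y) batches indefinitely, which matches the Keras generator interface. A hedged usage sketch, assuming a compiled two-input Keras model and a wrapping class named DataGenerator (both names are illustrative, not from the original code):

    # Hypothetical usage with a compiled Keras model.
    train_gen = DataGenerator(train_data, batch_size=32)   # the class defining __iter__ above
    model.fit_generator(
        iter(train_gen),
        steps_per_epoch=len(train_data) // 32,
        epochs=5,
    )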
Code Example #11
    def predict(self, sentence):
        self.model.setup()
        self.model.eval()
        # convert the text to token ids, then pad
        input_ids, token_type_ids = convert_text_to_ids(
            self.tokenizer, sentence)
        input_ids = seq_padding(self.tokenizer, [input_ids])
        token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
        # cast to LongTensor
        input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
        # zero out gradients
        self.optimizer.zero_grad()
        # move to the GPU
        input_ids, token_type_ids = input_ids.to(
            self.device), token_type_ids.to(self.device)
        output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
        # y_pred_prob: probability of each class
        y_pred_prob = output[0]
        # take the label with the highest probability
        y_pred_label = y_pred_prob.argmax(dim=1)

        # convert the torch.Tensor back to a plain int
        return y_pred_label.item()
Code Example #12
    def get_sim(self, text1, text2):
        x1token = ['[CLS]'] + tokenizer.tokenize(text1) + ['[SEP]']
        x2token = ['[CLS]'] + tokenizer.tokenize(text2) + ['[SEP]']

        x1mask = [1] * len(x1token)
        x2mask = [1] * len(x2token)
        x1mask = FloatTensor(
            utils.seq_padding(np.array([x1mask]), para.bert_maxlen))
        x2mask = FloatTensor(
            utils.seq_padding(np.array([x2mask]), para.bert_maxlen))
        x1ids = tokenizer.convert_tokens_to_ids(x1token)
        x2ids = tokenizer.convert_tokens_to_ids(x2token)
        x1ids = LongTensor(
            utils.seq_padding(np.array([x1ids]), para.bert_maxlen))
        x2ids = LongTensor(
            utils.seq_padding(np.array([x2ids]), para.bert_maxlen))
        x1 = self.my_model.bert_embedding([x1ids, x1mask])
        x2 = self.my_model.bert_embedding([x2ids, x2mask])

        x1 = x1 / torch.sqrt(torch.sum(x1 * x1, -1, keepdim=True))
        x2 = x2 / torch.sqrt(torch.sum(x2 * x2, -1, keepdim=True))
        x1x2 = torch.sum(x1 * x2, -1)
        return x1x2[0].item()
Code Example #13
    def get_batch(self):
        files = self.files
        file_type = self.file_type
        batch_size = self.batch_size

        product_id_list = []
        boxes_list = []
        images_features_list = []
        idx_class_labels_list = []
        idx_class_labels_mask_list = []
        idx_query_list = []
        query_id_list = []
        label_list = []
        mask_query_list = []
        mask_idx_query_list = []
        mask_label_list = []

        epoch_num = 0.0
        while 1:
            random.shuffle(files)
            for filename in files:
                with open(os.path.join(KDD_DATA, filename),
                          'r',
                          encoding='utf-8') as f:
                    lines = f.readlines()

                index_list = [i for i in range(len(lines))]
                random.shuffle(index_list)

                for i, index in enumerate(index_list):
                    try:
                        line = lines[index]
                        if "product_id" in line:
                            continue
                        product_id, boxes, images_features, idx_class_labels, idx_class_labels_mask, \
                            idx_query, query_id, query, class_label, mask_query, mask_idx_query, mask_label = read_line(line, self.dict_multimodal_labels, self.tokenizer)

                        label_list.append(1)

                        product_id_list.append(product_id)
                        boxes_list.append(boxes)
                        images_features_list.append(images_features)
                        idx_class_labels_list.append(idx_class_labels)
                        idx_class_labels_mask_list.append(
                            idx_class_labels_mask)
                        idx_query_list.append(idx_query)
                        query_id_list.append(query_id)
                        mask_query_list.append(mask_query)
                        mask_idx_query_list.append(mask_idx_query)
                        mask_label_list.append(mask_label)

                        if len(product_id_list) == batch_size or i == (
                                len(index_list) - 1):
                            np_boxes, _ = seq_padding_2(boxes_list,
                                                        maxlen=MAX_BOX_NUM,
                                                        padding_value=0)
                            np_images_features, np_images_features_mask = seq_padding_2(
                                images_features_list,
                                maxlen=MAX_BOX_NUM,
                                padding_value=0)
                            np_idx_class_labels, _ = seq_padding_2(
                                idx_class_labels_list,
                                maxlen=MAX_BOX_NUM,
                                padding_value=0)
                            np_idx_class_labels_mask, _ = seq_padding_2(
                                idx_class_labels_mask_list,
                                maxlen=MAX_BOX_NUM,
                                padding_value=0)
                            np_idx_query, np_idx_query_mask = seq_padding(
                                idx_query_list,
                                maxlen=MAX_LENGTH,
                                padding_value=0)
                            np_mask_idx_query, np_mask_idx_query_mask = seq_padding(
                                mask_idx_query_list,
                                maxlen=MAX_LENGTH,
                                padding_value=0)
                            np_mask_label, _ = seq_padding(mask_label_list,
                                                           maxlen=MAX_LENGTH,
                                                           padding_value=-1)

                            yield product_id_list, np_boxes, np_images_features, np_images_features_mask, \
                                np_idx_class_labels, np_idx_class_labels_mask, \
                                query_id_list, np_idx_query, np_idx_query_mask, \
                                np_mask_idx_query, np_mask_idx_query_mask, np_mask_label, \
                                np.array(label_list)

                            product_id_list = []
                            boxes_list = []
                            images_features_list = []
                            idx_class_labels_list = []
                            idx_class_labels_mask_list = []
                            idx_query_list = []
                            query_id_list = []
                            label_list = []
                            mask_query_list = []
                            mask_idx_query_list = []
                            mask_label_list = []

                    except Exception as e:
                        import traceback
                        traceback.print_exc()
                        continue
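
The seq_padding used in this loader has yet another signature: it takes a maxlen and a padding_value and returns both the padded array and a mask. A minimal sketch of that variant for 1-D id sequences, assuming the mask marks real tokens with 1 and padding with 0 (seq_padding_2 would presumably do the same along the box axis for 2-D feature arrays):

    import numpy as np

    # Hedged sketch, not the original implementation.
    def seq_padding(sequences, maxlen, padding_value=0):
        padded = np.full((len(sequences), maxlen), padding_value, dtype=np.int64)
        mask = np.zeros((len(sequences), maxlen), dtype=np.float32)
        for i, seq in enumerate(sequences):
            length = min(len(seq), maxlen)
            padded[i, :length] = seq[:length]
            mask[i, :length] = 1.0
        return padded, mask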
Code Example #14
    print(evaluator.get_embedding(text1))

    for epoch in range(para.epoch):
        loss_list = []
        for step, data in tqdm(enumerate(train_loader)):
            X1, X2 = data
            X1ids, X2ids, X1mask, X2mask = [], [], [], []
            for i in range(len(X1)):
                x1token = ['[CLS]'] + tokenizer.tokenize(X1[i]) + ['[SEP]']
                x2token = ['[CLS]'] + tokenizer.tokenize(X2[i]) + ['[SEP]']
                X1ids.append(tokenizer.convert_tokens_to_ids(x1token))
                X2ids.append(tokenizer.convert_tokens_to_ids(x2token))
                X1mask.append([1] * len(x1token))
                X2mask.append([1] * len(x2token))
            X1ids = LongTensor(
                utils.seq_padding(np.array(X1ids), para.bert_maxlen))
            X2ids = LongTensor(
                utils.seq_padding(np.array(X2ids), para.bert_maxlen))
            X1mask = FloatTensor(
                utils.seq_padding(np.array(X1mask), para.bert_maxlen))
            X2mask = FloatTensor(
                utils.seq_padding(np.array(X2mask), para.bert_maxlen))

            # when using DataParallel, the two lines below need to go through .module
            X1embed = model.bert_embedding([X1ids, X1mask])
            X2embed = model.bert_embedding([X2ids, X2mask])

            T1embed, T2embed = \
                utils.get_pairs([X1embed.cpu().data.numpy(), X2embed.cpu().data.numpy()])

            T1embed = FloatTensor(T1embed)