def random_wrong(text):
    """Corrupt `text` by replacing one randomly chosen character with a random vocab token."""
    tokenizer = Tokenizer(VocabPath)
    length = len(text)
    position = random.randint(0, length - 1)
    # pick a random token id from a fixed range of the vocabulary
    number = random.randint(672, 7992)
    text = list(text)
    text[position] = tokenizer.id_to_token(number)
    text = ''.join(text)
    return text
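# Quick sanity check (not from the repository): the sentence and loop count are
# illustrative. Each call replaces exactly one character of the input with a
# token decoded from a random id in the 672-7992 range of the vocabulary.
if __name__ == '__main__':
    original = '今天天气不错'
    for _ in range(3):
        print(random_wrong(original))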
def __init__(self, test_path):
    self.tokenizer = Tokenizer(VocabPath)
    self.test_path = test_path
    self.test_lines = []
    self.label_lines = []
    # read the test data: each line is "label_text-***-test_text"
    with open(self.test_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line:
                line = line.strip()
                line_list = line.split('-***-')
                self.test_lines.append(line_list[1])
                self.label_lines.append(line_list[0])
def __init__(self,
             vocab_size=VocabSize,
             hidden=HiddenSize,
             max_len=SentenceLength,
             num_hidden_layers=HiddenLayerNum,
             attention_heads=AttentionHeadNum,
             dropout_prob=DropOut,
             intermediate_size=IntermediateSize):
    super(SMBertMlm, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden
    self.max_len = max_len
    self.num_hidden_layers = num_hidden_layers
    self.attention_head_num = attention_heads
    self.dropout_prob = dropout_prob
    self.attention_head_size = hidden // attention_heads
    self.tokenizer = Tokenizer(VocabPath)
    self.intermediate_size = intermediate_size
    # declare the network
    self.smbert_emd = SMBbertEmbeddings(vocab_size=self.vocab_size,
                                        max_len=self.max_len,
                                        hidden_size=self.hidden_size)
    self.bi_gru = BiGRU(self.hidden_size, self.hidden_size)
    self.sigmoid = nn.Sigmoid()
    self.transformer_blocks = nn.ModuleList(
        Transformer(hidden_size=self.hidden_size,
                    attention_head_num=self.attention_head_num,
                    attention_head_size=self.attention_head_size,
                    intermediate_size=self.intermediate_size).to(device)
        for _ in range(self.num_hidden_layers))
    self.mlm = Mlm(self.hidden_size, self.vocab_size)
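# Usage sketch (not from the repository): SMBertMlm is constructed entirely from
# the config constants above, so instantiation takes no arguments. The concrete
# values mentioned here are assumptions for illustration: with hidden=768 and
# attention_heads=12, attention_head_size = 768 // 12 = 64, i.e. the hidden state
# is split evenly across the attention heads.
model = SMBertMlm().to(device)   # device is the same global used for the Transformer blocks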
class RobertaTestSet(Dataset):
    def __init__(self, test_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.test_path = test_path
        self.test_lines = []
        self.label_lines = []
        self.labels = []
        # read the test data: each line is "test_text-***-label_text-***-0 1 0 ..."
        with open(self.test_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line_list = line.strip().split('-***-')
                    self.test_lines.append(line_list[0].strip())
                    self.label_lines.append(line_list[1].strip())
                    label = line_list[2].strip()
                    labels = [int(i) for i in label if i != ' ']
                    # add the cls/sep positions, truncate, then pad to SentenceLength
                    labels = [0] + labels[:min(len(labels), SentenceLength - 2)] + [0]
                    pad_label_len = SentenceLength - len(labels)
                    labels = labels + [0] * pad_label_len
                    self.labels.append(labels)

    def __len__(self):
        return len(self.label_lines)

    def __getitem__(self, item):
        output = {}
        test_text = self.test_lines[item]
        label_text = self.label_lines[item]
        labels = self.labels[item]
        test_token = self.__gen_token(test_text)
        label_token = self.__gen_token(label_text)
        # single-sentence input, so every segment id is 0
        segment_ids = [0] * len(label_token)
        output['input_token_ids'] = test_token
        output['token_ids_labels'] = label_token
        output['segment_ids'] = segment_ids
        output['label'] = labels
        instance = {k: torch.tensor(v, dtype=torch.long) for k, v in output.items()}
        return instance

    def __gen_token(self, tokens):
        tar_token_ids = [101]
        tokens = list(tokens)
        tokens = tokens[:(SentenceLength - 2)]
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            tar_token_ids.append(token_id)
        tar_token_ids.append(102)
        # pad to SentenceLength with 0
        if len(tar_token_ids) < SentenceLength:
            for i in range(SentenceLength - len(tar_token_ids)):
                tar_token_ids.append(0)
        return tar_token_ids
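# Usage sketch; the file path and batch size are assumptions. RobertaTestSet pads
# every field to SentenceLength, so items can be batched directly by a DataLoader.
# Each input line is expected to look like "wrong_text-***-correct_text-***-0 1 0 ..."
# as parsed in __init__ above.
from torch.utils.data import DataLoader

test_set = RobertaTestSet('data/test.txt')       # hypothetical path
test_loader = DataLoader(test_set, batch_size=16, shuffle=False)
for batch in test_loader:
    input_ids = batch['input_token_ids']         # [batch, SentenceLength]
    gold_ids = batch['token_ids_labels']         # token ids of the corrected sentence
    err_flags = batch['label']                   # per-position 0/1 error labels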
class SMBertEvalSet(Dataset):
    def __init__(self, eval_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.eval_path = eval_path
        self.eval_lines = []
        self.label_lines = []
        self.__load_data()

    def __load_data(self):
        # read the eval data: each line is "label_text-***-eval_text"
        with open(self.eval_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line_list = line.split('-***-')
                    self.eval_lines.append(line_list[1])
                    self.label_lines.append(line_list[0])

    def __gen_token(self, tokens):
        # note: unlike RobertaTestSet, sequences here are not padded to SentenceLength
        tar_token_ids = [101]
        tokens = list(tokens)
        tokens = tokens[:(SentenceLength - 2)]
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            tar_token_ids.append(token_id)
        tar_token_ids.append(102)
        return tar_token_ids

    def __len__(self):
        return len(self.label_lines)

    def __getitem__(self, item):
        output = {}
        eval_text = self.eval_lines[item]
        label_text = self.label_lines[item]
        eval_token = self.__gen_token(eval_text)
        label_token = self.__gen_token(label_text)
        position_ids = list(range(len(eval_token)))
        segment_ids = [1 if x else 0 for x in label_token]
        output['eval_token'] = eval_token
        output['eval_position'] = position_ids
        output['eval_segment'] = segment_ids
        output['eval_label'] = label_token
        instance = {k: torch.tensor(v, dtype=torch.long) for k, v in output.items()}
        return instance
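# Usage sketch; the file path is an assumption. Because __gen_token here does not
# pad to SentenceLength, samples have variable length and the default collate
# function cannot stack them; batch_size=1 (or a custom collate_fn) sidesteps that.
from torch.utils.data import DataLoader

eval_set = SMBertEvalSet('data/eval.txt')        # hypothetical path
eval_loader = DataLoader(eval_set, batch_size=1, shuffle=False)
for batch in eval_loader:
    eval_token = batch['eval_token']             # [1, seq_len]
    eval_label = batch['eval_label']             # token ids of the corrected sentence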
class DataFactory(object):
    def __init__(self):
        self.tokenizer = Tokenizer(VocabPath)
        self.seg = pkuseg.pkuseg()
        self.vocab_size = self.tokenizer._vocab_size
        self.token_pad_id = self.tokenizer._token_pad_id
        self.token_cls_id = self.tokenizer._token_start_id
        self.token_sep_id = self.tokenizer._token_end_id
        self.token_mask_id = self.tokenizer._token_mask_id

    def __token_process(self, token_id):
        """
        Replace the token with [MASK] with probability 0.8, keep it unchanged
        with probability 0.1, and replace it with a random token with probability 0.1.
        """
        rand = np.random.random()
        if rand <= 0.8:
            return self.token_mask_id
        elif rand <= 0.9:
            return token_id
        else:
            return np.random.randint(0, self.vocab_size)

    def texts_to_ids(self, texts):
        texts_ids = []
        for text in texts:
            # process each sentence character by character
            for word in text:
                # the first and last positions of text_ids are cls and sep; strip them here
                word_tokes = self.tokenizer.tokenize(text=word)[1:-1]
                words_ids = self.tokenizer.tokens_to_ids(word_tokes)
                texts_ids.append(words_ids)
        return texts_ids

    def ids_to_mask(self, texts_ids):
        instances = []
        total_ids = []
        total_masks = []
        # draw one probability per character/word to decide whether it gets masked
        mask_rates = np.random.random(len(texts_ids))
        for i, word_id in enumerate(texts_ids):
            total_ids.extend(word_id)
            if mask_rates[i] < MaskRate:
                # word_id may hold a single character or a multi-character word
                for sub_id in word_id:
                    total_masks.append(self.__token_process(sub_id))
            else:
                total_masks.extend([0] * len(word_id))

        # each instance holds at most SentenceLength tokens, so split long paragraphs;
        # SentenceLength - 2 leaves room for cls and sep
        for i in range(math.ceil(len(total_ids) / (SentenceLength - 2))):
            tmp_ids = [self.token_cls_id]
            tmp_masks = [self.token_pad_id]
            tmp_ids.extend(
                total_ids[i * (SentenceLength - 2):min((i + 1) * (SentenceLength - 2), len(total_ids))])
            tmp_masks.extend(
                total_masks[i * (SentenceLength - 2):min((i + 1) * (SentenceLength - 2), len(total_masks))])
            # append the sep token, then pad anything shorter than SentenceLength
            diff = SentenceLength - len(tmp_ids)
            tmp_ids.append(self.token_sep_id)
            tmp_masks.append(self.token_pad_id)
            tmp_ids.extend([self.token_pad_id] * (diff - 1))
            tmp_masks.extend([self.token_pad_id] * (diff - 1))
            instances.append([tmp_ids, tmp_masks])
        return instances

    def ids_all_mask(self, texts_ids, tokenid2count):
        instances = []
        tmp_ids = [101]
        # flatten the ids and truncate to SentenceLength - 2 content tokens
        for token_ids in texts_ids:
            if isinstance(token_ids, list):
                for token_id in token_ids:
                    tmp_ids.append(token_id)
                    if len(tmp_ids) == SentenceLength - 1:
                        break
            else:
                tmp_ids.append(token_ids)
                if len(tmp_ids) == SentenceLength - 1:
                    break
            if len(tmp_ids) == SentenceLength - 1:
                break
        tmp_ids.append(102)
        input_length = len(tmp_ids) - 2
        # pad to SentenceLength with 0
        if len(tmp_ids) < SentenceLength:
            for i in range(SentenceLength - len(tmp_ids)):
                tmp_ids.append(0)
        for i in range(1, input_length + 1):
            # if a character occurs rarely, forcibly generate extra training samples for it
            if tokenid2count[tmp_ids[i]] < WordGenTimes:
                for j in range(WordGenTimes - tokenid2count[tmp_ids[i]]):
                    tmp_masks = [0] * SentenceLength
                    rand_num = np.random.randint(672, 7992)
                    tmp_masks[i] = rand_num
                    instances.append([tmp_ids, tmp_masks])
            tmp_masks = [0] * SentenceLength
            if random.random() < RanWrongDivisor:
                rand_num = np.random.randint(672, 7992)
                tmp_masks[i] = rand_num
            else:
                tmp_masks[i] = tmp_ids[i]
            instances.append([tmp_ids, tmp_masks])
        return instances
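# End-to-end sketch of the MLM data pipeline above; the sentences are illustrative.
# texts_to_ids maps each character to its id list, and ids_to_mask draws a mask
# decision per position with probability MaskRate, applying the 80/10/10 rule from
# __token_process. Assuming the pad id is 0, a mask value of 0 means the position
# is not a prediction target; any non-zero value is the corrupted id for that slot.
factory = DataFactory()
texts_ids = factory.texts_to_ids(['今天天气不错', '明天去爬山'])
for token_ids, mask_ids in factory.ids_to_mask(texts_ids):
    targets = [pos for pos, mid in enumerate(mask_ids) if mid != 0]
    print(len(token_ids), len(mask_ids), targets)   # both lists are padded to SentenceLength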