def _load_dataset(self): """ 加载数据集,并构建词汇表,word embedding :return: """ file_name = self.data_name.split('.')[0] # 如果pkl 文件不存在:1.加载训练集 2.构建训练集上的词汇表 3.构建训练集上的word embedding if not os.path.exists(os.path.join(self.data_dir, file_name + '.pkl')): pretrained_wordembedding = load_pretrained_wordembedding(self.word_embedding_path) # 加载预训练的word embedding examples = [] # 根据训练数据构建词汇表 'token':id stoi = {} itos = {} stoi['UNK'] = 0 stoi['PAD'] = 1 itos[0] = 'UNK' itos[1] = 'PAD' # 根据训练数据构建word_embedding id:-1.08563066e+00 9.97345448e-01 2.82978505e-01 -1.50629473e+00........ vectors = [] vectors.append(pretrained_wordembedding['UNK']) vectors.append(pretrained_wordembedding['PAD']) raw_data = pd.read_csv(os.path.join(self.data_dir, self.data_name), header=0, names=['label', 'text']) for item in tqdm(raw_data.iterrows()): weibo_example = WeiboExample(item[1]['text'], item[1]['label']) # 使用词向量 weibo_example.tokens = [*jieba.lcut(weibo_example.text)] # 使用字向量 # weibo_example.tokens = list(weibo_example.text) for token in weibo_example.tokens: if token in pretrained_wordembedding: # 如果token在预训练的word embedding 词汇表中 if token not in stoi: stoi[token] = len(stoi) itos[len(stoi)] = token vectors.append(pretrained_wordembedding[token]) weibo_example.tokens_ids.append(stoi[token]) else: # 如果token 不在预训练的word embedding 词汇表中 weibo_example.tokens_ids.append(stoi['UNK']) examples.append(weibo_example) word_embedding = WordEmbedding(stoi, itos) word_embedding.vectors = np.array(vectors) # 保存成pkl文件,方便加载 with open(os.path.join(self.data_dir, file_name + '.pkl'), 'wb') as f: pickle.dump(examples, f) with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'wb') as f: pickle.dump(word_embedding, f) else: with open(os.path.join(self.data_dir, file_name + '.pkl'), 'rb') as f: examples = pickle.load(f) with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'rb') as f: word_embedding = pickle.load(f) return examples, word_embedding
def _load_dataset(self, file_name):
    """
    Load a dataset split; for the training split, also build the vocabulary and word embedding.
    :return: examples (and word_embedding for the training split)
    """
    # The word embedding is built on the training set only.
    if 'train' in file_name:
        # If the cached pkl does not exist: 1. load the training set,
        # 2. build the vocabulary on it, 3. build the word embedding on it.
        if not os.path.exists(os.path.join(self.data_dir, file_name.rsplit('.', 1)[0] + '_char.pkl')):
            examples = []
            # Load the pretrained word embedding
            pretrained_wordembedding = load_pretrained_wordembedding(self.word_embedding_path)
            stoi = {'UNK': 0, 'PAD': 1}
            itos = {0: 'UNK', 1: 'PAD'}
            # Vector table built from the training data; row i is the embedding of itos[i],
            # e.g. -1.08563066e+00 9.97345448e-01 2.82978505e-01 -1.50629473e+00 ...
            vectors = [pretrained_wordembedding['UNK'], pretrained_wordembedding['PAD']]
            with open(os.path.join(self.data_dir, file_name), 'r', encoding='utf-8') as f:
                for line in tqdm(f):
                    label, text = line.strip().split('\t', 1)
                    if len(text) > self.max_sent_len:
                        text = text[:self.max_sent_len]
                    cnews_example = CnewsExample(text, label)
                    # Character-level tokens
                    cnews_example.tokens = list(cnews_example.text)
                    # Word-level tokens
                    # cnews_example.tokens = [*jieba.lcut(cnews_example.text)]
                    for token in cnews_example.tokens:
                        if token in pretrained_wordembedding:
                            if token not in stoi:
                                idx = len(stoi)
                                stoi[token] = idx
                                itos[idx] = token
                                vectors.append(pretrained_wordembedding[token])
                            cnews_example.tokens_ids.append(stoi[token])
                        else:
                            cnews_example.tokens_ids.append(stoi['UNK'])
                    examples.append(cnews_example)
            word_embedding = WordEmbedding(stoi, itos)
            word_embedding.vectors = np.array(vectors)
            # Cache as pkl files for faster loading next time
            with open(os.path.join(self.data_dir, file_name.rsplit('.', 1)[0] + '_char.pkl'), 'wb') as f:
                pickle.dump(examples, f)
            with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'wb') as f:
                pickle.dump(word_embedding, f)
        else:
            with open(os.path.join(self.data_dir, file_name.rsplit('.', 1)[0] + '_char.pkl'), 'rb') as f:
                examples = pickle.load(f)
            with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'rb') as f:
                word_embedding = pickle.load(f)
        return examples, word_embedding
    else:
        if not os.path.exists(os.path.join(self.data_dir, file_name.rsplit('.', 1)[0] + '_char.pkl')):
            examples = []
            # Load the word embedding built on the training set
            with open(os.path.join(self.data_dir, 'word_embedding.pkl'), 'rb') as f:
                word_embedding = pickle.load(f)
            with open(os.path.join(self.data_dir, file_name), 'r', encoding='utf-8') as f:
                for line in tqdm(f):
                    label, text = line.strip().split('\t', 1)
                    if len(text) > self.max_sent_len:
                        text = text[:self.max_sent_len]
                    cnews_example = CnewsExample(text, label)
                    # Character-level tokens
                    cnews_example.tokens = list(cnews_example.text)
                    # Word-level tokens
                    # cnews_example.tokens = [*jieba.lcut(cnews_example.text)]
                    for token in cnews_example.tokens:
                        if token in word_embedding.stoi:
                            cnews_example.tokens_ids.append(word_embedding.stoi[token])
                        else:
                            cnews_example.tokens_ids.append(word_embedding.stoi['UNK'])
                    examples.append(cnews_example)
            # Cache as a pkl file for faster loading next time
            with open(os.path.join(self.data_dir, file_name.rsplit('.', 1)[0] + '_char.pkl'), 'wb') as f:
                pickle.dump(examples, f)
        else:
            with open(os.path.join(self.data_dir, file_name.rsplit('.', 1)[0] + '_char.pkl'), 'rb') as f:
                examples = pickle.load(f)
        return examples
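# The loaders above toggle between character-level and word-level tokens via the
# two commented lines. A tiny self-contained illustration of the difference,
# using jieba (already a dependency of this module); the sample sentence is made up
# and the exact word segmentation may vary with the jieba version and dictionary.
import jieba

sample = "今天天气很好"
char_tokens = list(sample)        # character-level: ['今', '天', '天', '气', '很', '好']
word_tokens = jieba.lcut(sample)  # word-level, e.g. ['今天', '天气', '很', '好']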
def _load_dataset(self): """ 加载源数据集,并构建词嵌入 :return: 数据集,词嵌入 """ # 数据集文件名 file_name = self.data_name.split('.')[0] # 数据集pkl,不存在 # 1.加载训练集、构建训练集上的词汇表 # 2.构建训练集上的word embedding if not self.is_mlp: examples_path = os.path.join(self.data_dir, file_name + '_examples.pkl') else: examples_path = os.path.join(self.data_dir, file_name + '_examples_mlp.pkl') if not os.path.exists(examples_path): # 1、加载预训练词向量文件 pretrained_wordembedding = load_pretrained_wordembedding( self.word_embedding_path) # 2、初始化词汇表、词向量 """ stoi: 字典,token键,index值;'UNK': 0 itos: 也是字典,index键,token值;0: 'UNK' vectors: 词向量集合 word_embedding id: -1.08563066e+00 9.97345448e-01 ...... """ stoi = {} itos = {} stoi['UNK'] = 0 stoi['PAD'] = 1 itos[0] = 'UNK' itos[1] = 'PAD' vectors = [] vectors.append(pretrained_wordembedding['UNK']) vectors.append(pretrained_wordembedding['PAD']) # 3、根据词汇表、词向量,构建数据集实例 # 数据集 examples = [] # 读取源数据集csv,为raw_data """ header,数据开始行数 names,列名列表 """ raw_data = pd.read_csv(os.path.join(self.data_dir, self.data_name), header=0, names=['label', 'text']) # encoding = 'utf-8', dtype = str # .astype(str) # 遍历raw_data.iterrows(),作item,行迭代 for item in tqdm(raw_data.iterrows()): # new一个数据集对象 hotspring_example = HotspringExample(item[1]['text'], item[1]['label']) # 使用词向量 """ 分词, jieba.lcut(),返回列表 jieba.cut(),返回迭代器 """ hotspring_example.tokens = [ *jieba.lcut(str(hotspring_example.text)) ] # 使用字向量 """ 直接转成列表 """ # hotspring_example.tokens = list(hotspring_example.text) # 遍历文本的tokens,为token for token in hotspring_example.tokens: # 如果token在预训练词向量中 if token in pretrained_wordembedding: # 如果不在stoi,加到词汇表 if token not in stoi: # stoi、itos———— 放进 stoi[token] = len(stoi) itos[len(stoi)] = token # 根据token找pretrained_wordembedding对象中对应的一条词向量,放到vectors,用于构造word_embedding对象 vectors.append(pretrained_wordembedding[token]) # hotspring_example对象的tokens_ids(list),,根据{'UNK' : 0}添加0 hotspring_example.tokens_ids.append(stoi[token]) # 如果token不在 else: # hotspring_example对象的tokens_ids(list),,直接添加0 hotspring_example.tokens_ids.append(stoi['UNK']) # 如果是mlp,padding到260 """ 也可以在mlp_model的时候再补齐的, 这样就做到和其它神经网络传入模型前的数据部分都统一了 """ if self.is_mlp: tokens_ids_len = len(hotspring_example.tokens_ids) if tokens_ids_len < self.needed_by_mlp_max_seq_len: tokens_ids_len_need = self.needed_by_mlp_max_seq_len - tokens_ids_len for i in range(tokens_ids_len_need): hotspring_example.tokens_ids.append(stoi['PAD']) # hotspring_example(object,一个评论),添加到examples(list,n个评论) examples.append(hotspring_example) # 4、根据词汇表、词向量,构建词嵌入实例 # new一个词嵌入对象,参数stoi、itos、vectors word_embedding = WordEmbedding(stoi, itos) word_embedding.vectors = np.array(vectors) # 5、数据集tokens_ids、词嵌入两种字典+向量,保存成pkl文件,方便加载 """ 对于_examples.pkl,mlp和其它神经网络是不一样的; 对于_word_embedding.pkl,mlp和其它神经网络应该是一样的, 但是在测试集中,不知道为什么, 用mlp的原始_word_embedding.pkl,测试其它神经网络,auc一样, 用其它神经网络的原始_word_embedding.pkl,测试mlp络,auc偏低0.001量级一左右, 再说,先用着分别保存 """ if not self.is_mlp: with open( os.path.join(self.data_dir, file_name + '_examples.pkl'), 'wb') as f: pickle.dump(examples, f) with open( os.path.join(self.data_dir, file_name + '_word_embedding.pkl'), 'wb') as f: pickle.dump(word_embedding, f) else: with open( os.path.join(self.data_dir, file_name + '_examples_mlp.pkl'), 'wb') as f: pickle.dump(examples, f) with open( os.path.join(self.data_dir, file_name + '_word_embedding_mlp.pkl'), 'wb') as f: pickle.dump(word_embedding, f) # 额外保存一下原始词典,较为方便,其实word_embedding对象里面有 with open("data/dictionary/word2idx.json", 'w+', encoding='utf-8') as f: f.write(json.dumps(stoi, ensure_ascii=False)) # 数据集pkl,存在 else: if not self.is_mlp: # 读取数据集pkl with 
open( os.path.join(self.data_dir, file_name + '_examples.pkl'), 'rb') as f: examples = pickle.load(f) # 读取词向量pkl with open( os.path.join(self.data_dir, file_name + '_word_embedding.pkl'), 'rb') as f: word_embedding = pickle.load(f) else: with open( os.path.join(self.data_dir, file_name + '_examples_mlp.pkl'), 'rb') as f: examples = pickle.load(f) # 读取词向量pkl with open( os.path.join(self.data_dir, file_name + '_word_embedding_mlp.pkl'), 'rb') as f: word_embedding = pickle.load(f) return examples, word_embedding
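# The loaders above treat load_pretrained_wordembedding() as a dict-like map from
# token to vector that already contains 'UNK' and 'PAD' entries. The real helper is
# defined elsewhere in the repo; this is only an assumed sketch of that contract,
# for a whitespace-separated "token v1 v2 ... vn" embedding file.
import numpy as np

def load_pretrained_wordembedding_sketch(path, dim=300):
    """Assumed sketch: read a text embedding file into a {token: np.ndarray} dict."""
    embedding = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != dim + 1:  # skip header or malformed lines
                continue
            embedding[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    # Special tokens the loaders rely on: a random UNK vector and an all-zero PAD vector
    embedding.setdefault('UNK', np.random.uniform(-0.1, 0.1, dim).astype(np.float32))
    embedding.setdefault('PAD', np.zeros(dim, dtype=np.float32))
    return embedding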
def _load_dataset(self):
    file_name = self.data_name.split('.')[0]
    if not self.is_mlp:
        examples_path = os.path.join(self.data_dir, file_name + '_examples.pkl')
    else:
        examples_path = os.path.join(self.data_dir, file_name + '_examples_mlp.pkl')
    if not os.path.exists(examples_path):
        # The pretrained_wordembedding loaded here is effectively the raw corpus's corpus_word_embedding.pkl
        pretrained_wordembedding = load_pretrained_wordembedding(self.word_embedding_path)
        # Load the original vocabulary.
        # It could also be loaded from the saved json dictionary:
        #   with open("data/dictionary/word2idx.json", "r", encoding="utf-8") as f:
        #       stoi = json.load(f)
        with open(os.path.join(self.model_word_embedding_path), 'rb') as f:
            word_embedding_for_stoi = pickle.load(f)
        stoi = word_embedding_for_stoi.stoi
        # Build itos from stoi: {"zero": 0, "one": 1} -> {0: 'zero', 1: 'one'}
        itos = {index: token for token, index in stoi.items()}
        # Vector table aligned with the fixed vocabulary: row i is the embedding of itos[i]
        # (including the UNK and PAD rows at indices 0 and 1); tokens missing from the
        # pretrained embedding fall back to the UNK vector.
        vectors = [
            pretrained_wordembedding[token] if token in pretrained_wordembedding
            else pretrained_wordembedding['UNK']
            for token in stoi
        ]
        examples = []
        raw_data = pd.read_csv(os.path.join(self.data_dir, self.data_name),
                               header=0,
                               names=['label', 'text'])
        for item in tqdm(raw_data.iterrows()):
            hotspring_example = HotspringExample(item[1]['text'], item[1]['label'])
            # Word-level tokens
            hotspring_example.tokens = [*jieba.lcut(str(hotspring_example.text))]
            # Character-level tokens
            # hotspring_example.tokens = list(hotspring_example.text)
            # One sentence
            for token in hotspring_example.tokens:
                # One token
                # Token is in the fixed vocabulary stoi: translate it to its id for this example
                if token in stoi:
                    hotspring_example.tokens_ids.append(stoi[token])
                # Token is not in the vocabulary: append the id of the unknown token
                else:
                    hotspring_example.tokens_ids.append(stoi['UNK'])
            if self.is_mlp:
                # For the MLP, pad or truncate every example to the fixed sequence length
                tokens_ids_len = len(hotspring_example.tokens_ids)
                if tokens_ids_len < self.needed_by_mlp_max_seq_len:
                    tokens_ids_len_need = self.needed_by_mlp_max_seq_len - tokens_ids_len
                    for i in range(tokens_ids_len_need):
                        hotspring_example.tokens_ids.append(stoi['PAD'])
                else:
                    hotspring_example.tokens_ids = hotspring_example.tokens_ids[:self.needed_by_mlp_max_seq_len]
            examples.append(hotspring_example)
        word_embedding = WordEmbedding(stoi, itos)
        word_embedding.vectors = np.array(vectors)
        if not self.is_mlp:
            with open(os.path.join(self.data_dir, file_name + '_examples.pkl'), 'wb') as f:
                pickle.dump(examples, f)
            with open(os.path.join(self.data_dir, file_name + '_word_embedding.pkl'), 'wb') as f:
                pickle.dump(word_embedding, f)
        else:
            with open(os.path.join(self.data_dir, file_name + '_examples_mlp.pkl'), 'wb') as f:
                pickle.dump(examples, f)
            with open(os.path.join(self.data_dir, file_name + '_word_embedding_mlp.pkl'), 'wb') as f:
                pickle.dump(word_embedding, f)
    else:
        if not self.is_mlp:
            with open(os.path.join(self.data_dir, file_name + '_examples.pkl'), 'rb') as f:
                examples = pickle.load(f)
            with open(os.path.join(self.data_dir, file_name + '_word_embedding.pkl'), 'rb') as f:
                word_embedding = pickle.load(f)
        else:
            with open(os.path.join(self.data_dir, file_name + '_examples_mlp.pkl'), 'rb') as f:
                examples = pickle.load(f)
            with open(os.path.join(self.data_dir, file_name + '_word_embedding_mlp.pkl'), 'rb') as f:
                word_embedding = pickle.load(f)
    return examples, word_embedding
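# Downstream, examples and word_embedding are typically combined by looking up each
# example's tokens_ids in the embedding matrix. A minimal numpy-only sketch; the
# actual model-side code is not shown here, and max_seq_len=260 is only the value
# mentioned in the MLP padding comment above.
import numpy as np

def example_to_matrix(example, word_embedding, max_seq_len=260):
    """Turn one example's tokens_ids into a (max_seq_len, dim) float matrix."""
    pad_id = word_embedding.stoi['PAD']
    ids = example.tokens_ids[:max_seq_len]
    ids = ids + [pad_id] * (max_seq_len - len(ids))  # pad on the right with PAD ids
    return word_embedding.vectors[np.asarray(ids)]   # row lookup in the embedding table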