def torchLoad(config):
    TEXT = data.Field(sequential=True, fix_length=config.kwargs['padding_size'])
    LABEL = data.Field(sequential=True, use_vocab=False)

    train = GrandDataset(config.kwargs['raw_train_path'], text_field=TEXT, label_field=LABEL,
                         config=config, test=False)
    val = GrandDataset(config.kwargs['raw_vali_path'], text_field=TEXT, label_field=LABEL,
                       config=config, test=False)
    test = GrandDataset(config.kwargs['raw_test_path'], text_field=TEXT, label_field=None,
                        config=config, test=True)

    cache = '../cache/'
    # Load pretrained word2vec vectors
    embedding_path = config.kwargs['embedding_path']
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors = Vectors(name=embedding_path, cache=cache)
    vectors.unk_init = init.xavier_uniform_  # initializer for out-of-vocabulary tokens

    print('building {} vocabulary......'.format('Word'))
    TEXT.build_vocab(train, val, test, min_freq=1, vectors=vectors)

    # device=-1 keeps batches on the CPU (legacy torchtext convention)
    train_iter = data.Iterator(dataset=train, batch_size=config.kwargs['batch_size'],
                               sort=False, shuffle=True, repeat=False, device=-1)
    val_iter = data.Iterator(dataset=val, batch_size=config.kwargs['batch_size'],
                             shuffle=False, sort=False, repeat=False, device=-1)
    test_iter = data.Iterator(dataset=test, batch_size=config.kwargs['batch_size'],
                              shuffle=False, sort=False, repeat=False, device=-1)

    numerical_dict = TEXT.vocab.stoi
    return train_iter, val_iter, test_iter, TEXT.vocab.vectors, numerical_dict

# torchLoad(config)
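# A minimal usage sketch: torchLoad assumes the legacy torchtext API
# (torchtext<=0.8, or torchtext.legacy in 0.9+), a custom GrandDataset class,
# and a config object exposing a kwargs dict. The Config class and all paths
# below are hypothetical, reconstructed from the keys the function reads.
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init

class Config:
    def __init__(self):
        self.kwargs = {
            'padding_size': 100,                       # hypothetical values
            'raw_train_path': '../data/train.csv',
            'raw_vali_path': '../data/val.csv',
            'raw_test_path': '../data/test.csv',
            'embedding_path': '../data/w2v_300d.txt',
            'batch_size': 64,
        }

# train_iter, val_iter, test_iter, emb, stoi = torchLoad(Config())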
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        lower = True
    logging.info("preprocessing csv......")
    prepare_csv()

    TEXT = Field(sequential=True, fix_length=fix_length, tokenize=tokenizer,
                 pad_first=True, lower=lower)
    LABEL = Field(sequential=False, use_vocab=False)

    train_datafields = [("id", None),
                        ("comment_text", TEXT),
                        ("toxic", LABEL), ("severe_toxic", LABEL),
                        ("threat", LABEL), ("obscene", LABEL),
                        ("insult", LABEL), ("identity_hate", LABEL)]
    logging.info("reading train.csv......")
    train, val = TabularDataset.splits(path='cache', train='train.csv', validation="val.csv",
                                       format='csv', skip_header=True, fields=train_datafields)
    logging.info("reading test.csv......")
    test = TabularDataset(path='cache/test.csv', format='csv', skip_header=True,
                          fields=[('id', None), ('comment_text', TEXT)])

    logging.info('loading GloVe vectors......')
    # vectors = GloVe(name='6B', dim=300)  # would download the vectors
    # Read vectors from a local file instead
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    vectors = Vectors(name='/home/sunyan/quora/input/embeddings/glove.840B.300d.txt',
                      cache=cache, max_vectors=200000)
    vectors.unk_init = init.xavier_uniform_  # initializer for out-of-vocabulary tokens

    logging.info('building vocabulary......')
    TEXT.build_vocab(train, test, max_size=20000, min_freq=50, vectors=vectors)
    print(TEXT.vocab.freqs.most_common(10))
    logging.info("preprocessing done!")
    return (train, val, test), TEXT
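# get_dataset relies on two helpers that are not shown: tokenizer and
# prepare_csv. The sketches below are plausible minimal versions, assuming
# raw Jigsaw-style files under data/ -- the paths, the split logic and the
# tokenization are assumptions, not the original implementations.
import os
import re
import logging
import pandas as pd
from torchtext.data import Field, TabularDataset
from torchtext.vocab import Vectors
from torch.nn import init

logging.basicConfig(level=logging.INFO)

def tokenizer(text):
    # crude word/punctuation split; the original may use spaCy or similar
    return re.findall(r"\w+|[^\w\s]", text)

def prepare_csv(seed=999, val_ratio=0.1):
    # shuffle the raw training file and write the cache/train.csv,
    # cache/val.csv and cache/test.csv splits that get_dataset expects
    os.makedirs('cache', exist_ok=True)
    df = pd.read_csv('data/train.csv').sample(frac=1, random_state=seed)
    n_val = int(len(df) * val_ratio)
    df.iloc[n_val:].to_csv('cache/train.csv', index=False)
    df.iloc[:n_val].to_csv('cache/val.csv', index=False)
    pd.read_csv('data/test.csv').to_csv('cache/test.csv', index=False)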
def load_test_data(option):
    # ======
    Text_field = data.Field(sequential=True, fix_length=option.max_text_len)
    # Label_field = data.Field(sequential=False, use_vocab=False)

    # ======
    test_path = option.data_path + option.text_type + '/test_set.csv'
    # if option.aug:
    #     print('make augmentation datasets!')
    test = buildDataset(test_path, text_field=Text_field, label_field=None,
                        text_type=option.text_type, test=True)

    # ======
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}_.txt'.format(option.embedding_path, option.text_type,
                                            option.emb_size)
    print('embedding_path:', embedding_path)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    # Set the initializer for missing (OOV) vectors before passing them to
    # build_vocab: vectors.unk_init = init.xavier_uniform_
    vectors.unk_init = init.xavier_uniform_

    # ====== build vocab
    print('building {} vocabulary......'.format(option.text_type))
    Text_field.build_vocab(test, min_freq=option.min_freq, vectors=vectors)
    print('vocabulary has been made!\n')

    # ====== build iterator
    '''
    1. For test_iter, shuffle, sort and repeat must all be False, otherwise
       torchtext will scramble the sample order.
    2. For variable-length input, set sort_within_batch=True so the samples in
       each batch are sorted in descending order by sort_key.
    '''
    print('building {} Iterator......'.format(option.text_type))
    test_iter = data.Iterator(dataset=test, batch_size=option.batch_size, shuffle=False,
                              sort=False, repeat=False, device=option.device)
    print('Iterator has been made!\n')
    return test_iter
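# A usage sketch: because shuffle, sort and repeat are all False, batches come
# out in the row order of test_set.csv, so predictions can be written back
# positionally. The model and the batch attribute name `text` are assumptions
# about code not shown here (buildDataset's field registration).
import torch

def predict(model, test_iter):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in test_iter:
            logits = model(batch.text)            # text: (max_text_len, batch)
            preds.extend(logits.argmax(dim=1).tolist())
    return preds                                  # aligned with the CSV rows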
def load_data(opt):
    # fix_length pads/truncates every example to max_text_len
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # word- or char-level
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load from word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    # the hard-coded paths below override the ones computed above
    train_path = 'util/word/train_set.csv'
    test_path = 'util/word/test_set.csv'
    val_path = 'util/word/val_set.csv'

    # aug enables data augmentation
    if opt.aug:
        print('make augmentation datasets!')

    train = GrandDataset(train_path, text_field=TEXT, label_field=LABEL,
                         text_type=opt.text_type, test=False, aug=opt.aug)
    val = GrandDataset(val_path, text_field=TEXT, label_field=LABEL,
                       text_type=opt.text_type, test=False)
    test = GrandDataset(test_path, text_field=TEXT, label_field=None,
                        text_type=opt.text_type, test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}_.txt'.format(opt.embedding_path, opt.text_type,
                                            opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initializer for tokens missing from the vectors

    # build vocab
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # build iterators
    # For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext
    # will scramble the sample order.
    # For variable-length input, set sort_within_batch=True so the samples in each
    # batch are sorted in descending order by sort_key.
    train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size, shuffle=True,
                                     sort_within_batch=False, repeat=False, device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size,
    #                                sort_within_batch=False, repeat=False, device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True,
    #                            repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val, batch_size=opt.batch_size, shuffle=False,
                             sort=False, repeat=False, device=opt.device)
    test_iter = data.Iterator(dataset=test, batch_size=opt.batch_size, shuffle=False,
                              sort=False, repeat=False, device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
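# A minimal training-loop sketch showing how the iterators returned by
# load_data might be consumed. The attribute names batch.text and batch.label
# are assumptions about the fields GrandDataset registers; everything else is
# standard PyTorch.
import torch.nn.functional as F

def train_one_epoch(model, train_iter, optimizer):
    model.train()
    for batch in train_iter:
        optimizer.zero_grad()
        logits = model(batch.text)                  # text: (fix_length, batch_size)
        loss = F.cross_entropy(logits, batch.label)
        loss.backward()
        optimizer.step()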
def __init__(self, data_fields, train_file, valid_file, batch_size, device, skip_header,
             delimiter, pre_embeddings, vector_cache, min_freq=2, extend_vocab=True,
             pre_vocab_size=200000, use_pre_embedding=False):
    # fields: x is the tokenized text (with lengths), y the label(s)
    self.x_field = Field(sequential=True, tokenize=self.word_tokenize,
                         batch_first=True, include_lengths=True)
    self.y_field = LabelField(batch_first=True)
    self.train_fields, self.x_var, self.y_vars = self.parse_fields(
        data_fields, self.x_field, self.y_field)

    self.train_ds = TabularDataset(train_file, fields=self.train_fields,
                                   skip_header=skip_header, format="csv",
                                   csv_reader_params={"delimiter": delimiter})
    self.valid_ds = TabularDataset(valid_file, fields=self.train_fields,
                                   skip_header=skip_header, format="csv",
                                   csv_reader_params={"delimiter": delimiter})

    # build the text vocab from the training set, optionally extending it with
    # tokens from the pretrained vectors before loading them
    self.x_field.build_vocab(self.train_ds, min_freq=min_freq)
    if use_pre_embedding:
        vectors = Vectors(pre_embeddings, vector_cache)
        if extend_vocab:
            self.extend_vocab_with_vectors(self.x_field.vocab, vectors, pre_vocab_size)
        vectors.unk_init = partial(init_unk, vocab_size=len(self.x_field.vocab))
        self.x_field.vocab.load_vectors(vectors)
    self.y_field.build_vocab(self.train_ds)

    self.train_iter, self.valid_iter = BucketIterator.splits(
        (self.train_ds, self.valid_ds),
        batch_size=batch_size,
        device=device,
        sort=False,
        sort_key=lambda sample: len(getattr(sample, self.x_var)),
        sort_within_batch=False,
        shuffle=True,
        repeat=False,
    )

    self.vocab = self.x_field.vocab
    self.vocab_size = len(self.x_field.vocab)
    self.num_labels = len(self.y_vars)
    self.num_classes = len(self.y_field.vocab)
    self.classes = list(self.y_field.vocab.stoi.values())
    self.unk_token = self.x_field.unk_token
    self.pad_token = self.x_field.pad_token
    self.unk_idx = self.x_field.vocab.stoi[self.unk_token]
    self.pad_idx = self.x_field.vocab.stoi[self.pad_token]
    self.train_wrapper = BatchWrapper(self.train_iter, self.x_var, self.y_vars)
    self.valid_wrapper = BatchWrapper(self.valid_iter, self.x_var, self.y_vars)
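# BatchWrapper is referenced above but not shown. A plausible minimal sketch,
# not necessarily the original implementation: it yields (x, y) pairs from a
# torchtext iterator, where x is the (token_ids, lengths) tuple produced by
# include_lengths=True and y stacks the label variables column-wise.
import torch

class BatchWrapper:
    def __init__(self, iterator, x_var, y_vars):
        self.iterator = iterator
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        for batch in self.iterator:
            x = getattr(batch, self.x_var)   # (token_ids, lengths)
            y = torch.stack([getattr(batch, v) for v in self.y_vars], dim=1)
            yield x, y

    def __len__(self):
        return len(self.iterator)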
test_dataset_torchtext = MyDataset("data/tnews/test.json", TEXT, LABEL, True)
for batch_data in test_dataset_torchtext[:3]:
    print(batch_data.text)

# **Embedding**
# The following shows how to use pretrained word vectors in torchtext and pass
# them on to a neural-network model for training.

# In[25]:

# Load the pretrained word vectors
vectors = Vectors(name="data/tnews_jieba_tencent_embeddings.txt")
# Specify the initializer for missing (OOV) vectors
vectors.unk_init = nn.init.uniform_
# Build the vocabulary from the training set only
TEXT.build_vocab(train_dataset_torchtext, vectors=vectors)

# In[26]:

# Count word frequencies
TEXT.vocab.freqs.most_common(10)

# **Iterators**
#
# * **Iterator**: builds batches while preserving sample order; suited to the test set
#
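# A sketch of the two iterator types for the tnews datasets above. Iterator
# keeps sample order (appropriate for the test set), while BucketIterator
# groups examples of similar length to reduce padding during training. The
# batch size and sort_key below are illustrative assumptions.
from torchtext.data import Iterator, BucketIterator

train_iter = BucketIterator(train_dataset_torchtext, batch_size=64, shuffle=True,
                            sort_key=lambda ex: len(ex.text), sort_within_batch=True)
test_iter = Iterator(test_dataset_torchtext, batch_size=64,
                     shuffle=False, sort=False, repeat=False)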