def load_data(train_dir, test_dir):
    """Read the train/test CSV files and split a validation set off the training data.

    Args:
        train_dir: Path of the training CSV (its header row is skipped).
        test_dir: Path of the test CSV (its header row is skipped).

    Returns:
        ``(train_data, valid_data, test_data, TEXT, LABEL)`` where the first
        two come from ``split(split_ratio=0.8)`` on the training file and the
        last two are the fields bound to the ``text`` and ``label`` columns.
    """
    spacy_model = spacy.load('en_core_web_sm')

    def tokenize(sentence):
        # Keep every spaCy token except tokens that are exactly one space.
        tokens = []
        for token in spacy_model.tokenizer(sentence):
            if token.text != " ":
                tokens.append(token.text)
        return tokens

    text_field = data.Field(sequential=True,
                            batch_first=True,
                            lower=True,
                            fix_length=50,
                            tokenize=tokenize)
    label_field = data.Field(sequential=False, batch_first=True)
    columns = [('text', text_field), ('label', label_field)]

    full_train = TabularDataset(path=train_dir,
                                skip_header=True,
                                format='csv',
                                fields=columns)
    test_split = TabularDataset(path=test_dir,
                                skip_header=True,
                                format='csv',
                                fields=columns)

    train_split, valid_split = full_train.split(split_ratio=0.8)
    return train_split, valid_split, test_split, text_field, label_field
def load_dataset_from_csv(params, device):
    """Build the text field, vocabulary and bucket iterators from the filtered CSVs.

    ``filtered_train.csv`` and ``filtered_test.csv`` are read from
    ``params.DATA_PATH``; only the ``text`` and ``label`` columns are kept.

    Notes:
        tokenizer      : Breaks sentences into a list of words (spaCy
                         ``English`` tokenizer).
        Field          : Stores how a column is preprocessed.
        fix_length     : TorchText can pad each batch dynamically to its
                         longest sequence; here ``fix_length=128`` is used so
                         every sequence has a fixed length of 128.
        build_vocab    : Maps the words of the given datasets to indices and,
                         when ``params.VOCAB_USE_GLOVE`` is set, attaches GloVe
                         (6B, 300d) vectors to those indices.
        vocab.vectors  : Tensor of shape (vocab_size x embedding_dim) holding
                         the pre-trained embeddings when vectors were loaded.
        BucketIterator : Batches examples of similar length together to reduce
                         padding.

    Args:
        params: Object exposing ``DATA_PATH``, ``VOCAB_USE_GLOVE``,
            ``VOCAB_MIN_FREQ`` and ``TRAIN_BATCH_SIZE``.
        device: Device handed to the iterators.

    Returns:
        ``(TEXT, word_embeddings, train_iter, test_iter)`` where
        ``word_embeddings`` is ``TEXT.vocab.vectors``.
    """
    spacy_english = English()

    def tokenize(sentence):
        return [token.text for token in spacy_english.tokenizer(sentence)]

    text_field = Field(sequential=True,
                       tokenize=tokenize,
                       lower=True,
                       eos_token='<eos>',
                       batch_first=True,
                       fix_length=128)
    label_field = LabelField()
    # Columns mapped to None are read but discarded.
    column_fields = [('Unnamed: 0', None),
                     ('text', text_field),
                     ('conf', None),
                     ('label', label_field)]

    data_root = params.DATA_PATH
    train_csv = os.path.join(data_root, "filtered_train.csv")
    test_csv = os.path.join(data_root, "filtered_test.csv")

    train_data = TabularDataset(path=train_csv,
                                format='csv',
                                skip_header=True,
                                fields=column_fields)
    test_data = TabularDataset(path=test_csv,
                               format='csv',
                               skip_header=True,
                               fields=column_fields)

    # The text vocabulary is built from both splits in either branch.
    if not params.VOCAB_USE_GLOVE:
        text_field.build_vocab(train_data, test_data,
                               min_freq=params.VOCAB_MIN_FREQ)
    else:
        text_field.build_vocab(train_data, test_data,
                               min_freq=params.VOCAB_MIN_FREQ,
                               vectors=GloVe(name='6B', dim=300))
        vector_shape = str(text_field.vocab.vectors.size())
        logging.info("Loaded Glove embedding, Vector size of Text Vocabulary: " + vector_shape)

    label_field.build_vocab(train_data)
    word_embeddings = text_field.vocab.vectors
    logging.info("Length of Text Vocabulary: " + str(len(text_field.vocab)))

    batch_size = params.TRAIN_BATCH_SIZE
    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_sizes=(batch_size, batch_size),
        sort_key=lambda example: len(example.text),
        repeat=False,
        shuffle=True,
        device=device)

    # Shuffling is only wanted for training batches.
    test_iter.shuffle = False

    return text_field, word_embeddings, train_iter, test_iter
def build_dataset(self, field):
    """Load the train, validation and test TSV files with a single ``sent`` column.

    Args:
        field: Field used to process the ``sent`` column of every split.

    Returns:
        ``(train, valid, test)`` datasets read from ``self.train_path``,
        ``self.valid_path`` and ``self.test_path``.
    """
    columns = [('sent', field)]
    split_paths = (self.train_path, self.valid_path, self.test_path)
    train, valid, test = [
        TabularDataset(path=split_path, format='tsv', fields=columns)
        for split_path in split_paths
    ]
    return train, valid, test
def create_tabular_dataset(data_info, **args):
    """Create train/validation tabular datasets and a text vocabulary with pretrained vectors.

    Args:
        data_info: Passed to ``analyze_datainfo_paths`` to resolve the train
            and validation CSV paths.
        **args: Optional settings:
            lang (str): Language prefix of the spaCy model; the model loaded
                is ``'{lang}_core_web_sm'``. Defaults to ``'en'``.
            pretrained_emb (str): Vectors alias handed to ``build_vocab``.
                Defaults to ``'glove.6B.300d'``.

    Returns:
        ``(tabular_train, tabular_valid, TEXT.vocab)``.
    """
    # Only the tokenizer is used below, so the other pipeline components are
    # disabled. Each name must be its own list element: a missing comma would
    # silently fuse two names into one string.
    disable = [
        'tagger', 'parser', 'ner', 'textcat', 'entity_ruler', 'sentencizer',
        'merge_noun_chunks', 'merge_entities', 'merge_subtokens'
    ]

    lang = args.get('lang', 'en')
    pretrained_emb = args.get('pretrained_emb', 'glove.6B.300d')
    model_name = f'{lang}_core_web_sm'

    _, path_train_dataset, path_valid_dataset = analyze_datainfo_paths(
        data_info)

    try:
        spacy_en = spacy.load(model_name, disable=disable)
    except OSError:
        # spaCy raises OSError when the model package is not installed.
        import importlib
        import subprocess
        import sys

        log(f"Download {lang}")
        # Download the exact package imported below, with the interpreter that
        # is running this code, and without passing `lang` through a shell.
        # Best effort: a failed download surfaces as an import error below.
        subprocess.run(
            [sys.executable, '-m', 'spacy', 'download', model_name],
            check=False)
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
        spacy_en = importlib.import_module(model_name).load(disable=disable)

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Creating field for text and label
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)

    print('Preprocessing the text...')
    # clean the text
    TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

    print('Creating tabular datasets...It might take a while to finish!')
    train_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_train = TabularDataset(path=path_train_dataset,
                                   format='csv',
                                   skip_header=True,
                                   fields=train_datafield)

    valid_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_valid = TabularDataset(path=path_valid_dataset,
                                   format='csv',
                                   skip_header=True,
                                   fields=valid_datafield)

    print('Building vocaulary...')
    # Both vocabularies are built from the training split only.
    TEXT.build_vocab(tabular_train, vectors=pretrained_emb)
    LABEL.build_vocab(tabular_train)

    return tabular_train, tabular_valid, TEXT.vocab
def build_dataset(self, ORIG, PARA):
    """Load the paraphrase TSV files and split the training file into train/val.

    Args:
        ORIG: Field for the ``orig`` column.
        PARA: Field for the ``para`` column.

    Returns:
        ``(train, val, test)``; train/val come from
        ``split(split_ratio=0.8)`` on ``self.train_path`` and test is read
        from ``self.test_path``.
    """
    pair_fields = [('orig', ORIG), ('para', PARA)]

    full_train = TabularDataset(path=self.train_path,
                                format='tsv',
                                fields=pair_fields)
    train_part, val_part = full_train.split(split_ratio=0.8)

    # FIXME: test data is too large!
    test_part = TabularDataset(path=self.test_path,
                               format='tsv',
                               fields=pair_fields)

    return train_part, val_part, test_part
def get_datasets(path_to_data: PathOrStr,
                 len_context_vocab: int,
                 len_title_vocab: int,
                 len_aut_vocab: int,
                 split_dir: PathOrStr = '/home/maria/input',
                 train_file: str = 'mag_subset2_train.csv',
                 valid_file: str = 'mag_subset2_valid.csv',
                 test_file: str = 'mag_test.csv') -> BaseData:
    """
    Initializes torchtext Field and TabularDataset objects used for training.

    The vocab of the author, context and title fields is built on the full
    dataset found at `path_to_data`. The train, valid and test sets are NOT
    split off that dataset; they are read from three pre-split .csv files
    inside `split_dir`.

    ## Parameters:

    - **path_to_data** *(PathOrStr)*: Path object or string to the .csv dataset
      used for building the vocabularies.
    - **len_context_vocab** *(int)*: Maximum size of the context vocab before
      adding special tokens.
    - **len_title_vocab** *(int)*: Maximum size of the title vocab before
      adding special tokens.
    - **len_aut_vocab** *(int)*: Maximum size of the author vocab before
      adding special tokens.
    - **split_dir** *(PathOrStr)*: Directory holding the pre-split .csv files.
    - **train_file** *(str)*: File name of the training split in `split_dir`.
    - **valid_file** *(str)*: File name of the validation split in `split_dir`.
    - **test_file** *(str)*: File name of the test split in `split_dir`.

    ## Output:

    - **data** *(BaseData)*: Container holding CNTXT (*Field*), TTL (*Field*),
      AUT (*Field*), train (*TabularDataset*), valid (*TabularDataset*),
      test (*TabularDataset*) objects.
    """
    # Seed the global RNG, as this function has always done.
    random.seed(SEED)

    logger.info("Getting fields...")
    CNTXT, TTL, AUT = get_fields()

    # Citing and cited authors share the AUT field, hence one author vocab.
    fields = [("context", CNTXT), ("authors_citing", AUT),
              ("title_cited", TTL), ("authors_cited", AUT)]

    # A single dataset over all rows is needed to build proper vocabularies.
    logger.info("Loading dataset...")
    dataset = TabularDataset(str(path_to_data), "CSV", fields,
                             skip_header=True)

    train, valid, test = TabularDataset.splits(path=str(split_dir),
                                               train=train_file,
                                               validation=valid_file,
                                               test=test_file,
                                               format='csv',
                                               fields=fields,
                                               skip_header=True)

    # Vocabularies come from the full dataset, not from the training split.
    logger.info("Building vocab...")
    TTL.build_vocab(dataset, max_size=len_title_vocab)
    AUT.build_vocab(dataset, max_size=len_aut_vocab)
    CNTXT.build_vocab(dataset, max_size=len_context_vocab)

    return BaseData(cntxt=CNTXT, ttl=TTL, aut=AUT,
                    train=train, valid=valid, test=test)
def get_train_loader(self, batch_size):
    """Build the vocabularies from both CSV files and return a training iterator.

    ``train_set.csv`` and ``test_set.csv`` are read with the ``line`` and
    ``label`` columns; the test set is used only for vocabulary building.

    Args:
        batch_size: Batch size handed to the ``Iterator``.

    Returns:
        An ``Iterator`` over the training set.
    """
    columns = [('line', self.text_field), ('label', self.label_field)]
    train_set, test_set = [
        TabularDataset(csv_name, 'csv', columns)
        for csv_name in ('train_set.csv', 'test_set.csv')
    ]

    for field in (self.text_field, self.label_field):
        field.build_vocab(train_set, test_set)

    return Iterator(train_set, batch_size)
def build_dataset(self, DOCS, SUMM):
    """Load the JSON dataset and split it into train, validation and test sets.

    Args:
        DOCS: Iterable of fields; the i-th one (1-based) handles key ``doc{i}``.
        SUMM: Field for the ``summ`` key.

    Returns:
        ``(train, valid, test)`` datasets from a 0.8/0.1/0.1 split of
        ``self.data_path``.
    """
    fields = {}
    for i, doc_field in enumerate(DOCS, 1):
        key = 'doc{}'.format(i)
        fields[key] = (key, doc_field)
    fields['summ'] = ('summ', SUMM)

    data = TabularDataset(path=self.data_path, format='json', fields=fields)

    # torchtext reads the ratio list as [train, test, valid] but RETURNS the
    # splits as (train, valid, test), so unpack in that order.
    train, valid, test = data.split(split_ratio=[0.8, 0.1, 0.1])
    return train, valid, test
def __init__(self,
             path=DATA_PATH,
             device=torch.device('cpu'),
             batch_size=DEFAULT_BATCH_SIZE,
             train_test_val_ratio=TRAIN_TEST_VAL_RATIO):
    """Load the query/response dataset, split it and set up bucket iterators.

    One shared field processes both CSV columns. Its vocabulary is built from
    all loaded examples after the iterators have been created.

    Args:
        path (str, optional): Path to the dataset file.
            Default: constants.DATA_PATH.
        device (torch.device, optional): Torch device where tensors
            will be created. Default: torch.device('cpu').
        batch_size (int, optional): Size of batch.
        train_test_val_ratio (iterable, optional): Ratio handed to
            ``Dataset.split``, in train, test, validation order.
    """
    print(colorize('\nLoading dataset'))

    self._batch_size = batch_size
    self._device = device
    self._field = Field(tokenize='revtok', lower=True, batch_first=True)

    self.data = TabularDataset(path=path,
                               format='csv',
                               fields=[('query', self._field),
                                       ('response', self._field)])
    self._train, self._val, self._test = self.data.split(
        train_test_val_ratio)

    def by_interleaved_length(example):
        # Bucket on query and response length together.
        return interleave_keys(len(example.query), len(example.response))

    iterator_options = dict(batch_size=self._batch_size,
                            repeat=False,
                            sort_key=by_interleaved_length,
                            device=self._device)

    split_iterators = BucketIterator.splits(
        datasets=(self._train, self._val, self._test), **iterator_options)
    self.train_iter, self.validation_iter, self.test_iter = split_iterators

    # Iterator over the complete, unsplit dataset.
    self.iterator = BucketIterator(dataset=self.data, **iterator_options)

    print(colorize(' • Building vocabulary', color='yellow'))
    self._field.build_vocab(self.data)
    self.vocab = self._field.vocab
def binary_classification(obj):
    """Prepare GloVe-backed iterators for the Trump-percentage regression data.

    Args:
        obj: Configuration object providing ``fix_length``, ``days``,
            ``data_path``, ``Glove_name``, ``embedding_dim``, ``device``,
            ``train_batch_size`` and ``test_batch_size``.

    Returns:
        ``(TEXT, vocab_size, word_embeddings, train_iter_, test_iter_)``; the
        two iterators are ``BatchWrapper`` objects around the torchtext
        iterators.
    """
    def whitespace_tokenize(sentence):
        return sentence.split()

    text_field = Field(sequential=True,
                       tokenize=whitespace_tokenize,
                       lower=True,
                       batch_first=True,
                       fix_length=obj.fix_length)
    # Labels are float values used directly, so no vocabulary is built.
    label_field = Field(sequential=False,
                        dtype=torch.float,
                        batch_first=True,
                        use_vocab=False)
    columns = [
        ('id', None),
        ('content', text_field),
        ('trump_percentage', label_field),
    ]

    train_csv = 'twitter_pollster_' + str(
        obj.days) + '_days_train_trump_percentage.csv'
    test_csv = 'twitter_pollster_' + str(
        obj.days) + '_days_test_trump_percentage.csv'

    train_dataset = TabularDataset(path=obj.data_path + '/' + train_csv,
                                   format='csv',
                                   skip_header=True,
                                   fields=columns)
    test_dataset = TabularDataset(path=obj.data_path + '/' + test_csv,
                                  format='csv',
                                  skip_header=True,
                                  fields=columns)

    glove = GloVe(name=obj.Glove_name, dim=obj.embedding_dim)
    text_field.build_vocab(train_dataset, vectors=glove)
    vocab_size = len(text_field.vocab)
    word_embeddings = text_field.vocab.vectors
    print("vector size of text vocabulary: ", text_field.vocab.vectors.size())

    raw_train_iter, raw_test_iter = Iterator.splits(
        (train_dataset, test_dataset),
        sort_key=lambda example: len(example.content),
        batch_sizes=(obj.train_batch_size, obj.test_batch_size),
        device=torch.device(obj.device),
        sort_within_batch=True,
        repeat=False)

    wrapped_train = BatchWrapper(raw_train_iter, 'content',
                                 ['trump_percentage'])
    wrapped_test = BatchWrapper(raw_test_iter, 'content',
                                ['trump_percentage'])

    return text_field, vocab_size, word_embeddings, wrapped_train, wrapped_test
def loadDataset():
    """Load the train and test JSON datasets and build a merged fastText vocabulary.

    Uses the module-level fields ``Q``, ``A`` and ``Ans_Sen``. Their
    vocabularies are built from the training set only and then merged with
    ``merge_vocabs``.

    Returns:
        ``(train_data, test_data, QnA_vocab)``.
    """
    # JSON key -> (attribute name on each example, field)
    fields_tuple = {
        "question": ('Q', Q),
        "answer": ('A', A),
        "answer_sentence": ('Ans_Sen', Ans_Sen)
    }

    print("==> Loading Training Set")
    train_data = TabularDataset(path='data/Final_Dataset_Train.json',
                                format='json',
                                fields=fields_tuple)
    print("Size of Training Set : {}".format(len(train_data)))
    print("Training Set Example: {}".format(train_data[0].__dict__))

    print("==> Loading Test Set")
    test_data = TabularDataset(path='data/Final_Dataset_Test.json',
                               format='json',
                               fields=fields_tuple)
    print("Size of Test Set : {}".format(len(test_data)))
    # Show an example from the TEST set (this used to print a training
    # example); an empty test set prints an empty dict instead of failing.
    test_example = test_data[0].__dict__ if len(test_data) else {}
    print("Test Set Example: {}".format(test_example))

    print("==> Building Vocabulary using Fasttext")
    Q.build_vocab(train_data, specials=['<sep>'], vectors='fasttext.en.300d')
    A.build_vocab(train_data, vectors='fasttext.en.300d')
    Ans_Sen.build_vocab(train_data, vectors='fasttext.en.300d')
    QnA_vocab = merge_vocabs([Q.vocab, A.vocab, Ans_Sen.vocab])

    return train_data, test_data, QnA_vocab
def load_data(train_dir, test_dir):
    """Read three-turn conversation CSVs and split a validation set off the training data.

    Args:
        train_dir: Path of the training CSV (its header row is skipped).
        test_dir: Path of the test CSV (its header row is skipped).

    Returns:
        ``(train_data, valid_data, test_data, text, label)`` where train/valid
        come from ``split(split_ratio=0.8)`` on the training file, ``text`` is
        the field shared by the three turn columns and ``label`` is the label
        field.
    """
    spacy_model = spacy.load('en_core_web_sm')

    def tokenize(sentence):
        # Keep every spaCy token except tokens that are exactly one space.
        tokens = []
        for token in spacy_model.tokenizer(sentence):
            if token.text != " ":
                tokens.append(token.text)
        return tokens

    text_field = data.Field(sequential=True,
                            batch_first=True,
                            lower=True,
                            fix_length=50,
                            tokenize=tokenize)
    label_field = data.LabelField()

    columns = [('turn1', text_field),
               ('turn2', text_field),
               ('turn3', text_field),
               ('label', label_field)]

    full_train = TabularDataset(path=train_dir,
                                skip_header=True,
                                format='csv',
                                fields=columns)
    test_split = TabularDataset(path=test_dir,
                                skip_header=True,
                                format='csv',
                                fields=columns)

    train_split, valid_split = full_train.split(split_ratio=0.8)
    return train_split, valid_split, test_split, text_field, label_field
def get_iterators_and_fields(self) -> Tuple:
    """
    Builds train/test iterators, datasets and the source/target fields.

    The ``text``/``title`` columns of ``self.result_df`` are written to CSV as
    ``src``/``trg``, split with ``train_test_split`` and read back as
    torchtext datasets.

    Returns:
        train/test iterators, train/test datasets and the SRC/TRG fields
    """
    pairs = self.result_df[['text', 'title']].rename(
        columns={'text': 'src', 'title': 'trg'})
    pairs.to_csv('data/all_data.csv', index=False)

    SRC = Field(tokenize=self.tokenize,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                batch_first=True)
    TRG = Field(tokenize=self.tokenize,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                batch_first=True)
    data_fields = [('src', SRC), ('trg', TRG)]

    # to_csv writes a header row, so it must be skipped on load; otherwise
    # the literal "src"/"trg" header becomes an example.
    all_data = TabularDataset(path='data/all_data.csv',
                              format='csv',
                              fields=data_fields,
                              skip_header=True)

    train_df, val_df = train_test_split(pairs, test_size=self.test_size)
    split_dir = self.path_to_save_data
    train_df.to_csv(split_dir.joinpath('train.csv'), index=False)
    val_df.to_csv(split_dir.joinpath('val.csv'), index=False)

    # Read the splits back from the directory they were just written to.
    train_data, test_data = TabularDataset.splits(path=str(split_dir),
                                                  train='train.csv',
                                                  validation='val.csv',
                                                  format='csv',
                                                  fields=data_fields,
                                                  skip_header=True)

    # Due to the limited amount of data the vocab is built on all data,
    # otherwise there are too many <unk> tokens.
    SRC.build_vocab(all_data, min_freq=self.min_freq)
    TRG.build_vocab(all_data, min_freq=self.min_freq)

    train_iterator, test_iterator = BucketIterator.splits(
        (train_data, test_data),
        batch_size=self.batch_size,
        device=self.device,
        sort=True,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src))

    return train_iterator, test_iterator, train_data, test_data, SRC, TRG
def get_dataset(fix_length=100, lower=False, vectors=None):
    """Prepare the cached CSV splits and build a GloVe-backed text vocabulary.

    Args:
        fix_length: Fixed sequence length of the text field.
        lower: Whether to lowercase the text; forced to True when ``vectors``
            is not None.
        vectors: Only checked against None to force lowercasing. The vectors
            actually used are always loaded from the local GloVe file below.

    Returns:
        ``((train, val, test), TEXT)``.
    """
    if vectors is not None:
        lower = True

    logging.info("预处理 csv......")
    prepare_csv()

    text_field = Field(sequential=True,
                       fix_length=fix_length,
                       tokenize=tokenizer,
                       pad_first=True,
                       lower=lower)
    label_field = Field(sequential=False, use_vocab=False)

    label_columns = ("toxic", "severe_toxic", "threat", "obscene", "insult",
                     "identity_hate")
    train_datafields = [("id", None), ("comment_text", text_field)]
    train_datafields += [(column, label_field) for column in label_columns]

    logging.info("读取 train.csv......")
    train, val = TabularDataset.splits(path='cache',
                                       train='train.csv',
                                       validation="val.csv",
                                       format='csv',
                                       skip_header=True,
                                       fields=train_datafields)

    logging.info("读取 test.csv......")
    test = TabularDataset(path='cache/test.csv',
                          format='csv',
                          skip_header=True,
                          fields=[('id', None), ('comment_text', text_field)])

    logging.info('读取glove词向量......')
    # GloVe(name='6B', dim=300) would download the vectors; read the local
    # copy instead.
    cache_dir = '.vector_cache'
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    vectors = Vectors(
        name='/home/sunyan/quora/input/embeddings/glove.840B.300d.txt',
        cache=cache_dir,
        max_vectors=200000)
    vectors.unk_init = init.xavier_uniform_

    logging.info('构建词表......')
    text_field.build_vocab(train, test,
                           max_size=20000,
                           min_freq=50,
                           vectors=vectors)
    print(text_field.vocab.freqs.most_common(10))

    logging.info("预处理结束!")
    return (train, val, test), text_field
def load_tabular_set(file_path,format,fields,split_ratio=None,split_seed=None,skip_header=False,save_vocab_path=os.getcwd(),**args):
    """Load a tabular dataset, build or restore each field's vocabulary, and optionally split it.

    :param file_path: path of the data file handed to ``TabularDataset``
    :param format: file format handed to ``TabularDataset``
    :param fields: iterable of field specs exposing ``name``, ``field``,
        ``vocab`` (path of a saved stoi JSON, or None), ``max_size``,
        ``min_freq``, ``vectors``, ``unk_init`` and ``vectors_cache``
    :param split_ratio: ratio handed to ``Dataset.split``; no split when None
    :param split_seed: int seed, or a state as returned by
        ``random.getstate()``, or None
    :param skip_header: whether the first row of the file is skipped
    :param save_vocab_path: directory where ``<name>.json`` stoi files are
        written for freshly built vocabularies
    :param args: extra keyword arguments forwarded to ``TabularDataset``
    :return: the dataset, or the result of ``dataset.split`` when
        ``split_ratio`` is given
    """
    import random  # local import keeps this block self-contained

    os.makedirs(save_vocab_path, exist_ok=True)

    dataset_fields = [(spec.name, spec.field) for spec in fields]
    dataset = TabularDataset(file_path, format, dataset_fields,
                             skip_header=skip_header, **args)

    for spec in fields:
        field = spec.field
        if spec.vocab is None:
            field.build_vocab(dataset,
                              max_size=spec.max_size,
                              min_freq=spec.min_freq,
                              vectors=spec.vectors,
                              unk_init=spec.unk_init,
                              vectors_cache=spec.vectors_cache)
            vocab_file = os.path.join(save_vocab_path,
                                      "{}.json".format(spec.name))
            with open(vocab_file, "w") as jfile:
                json.dump(field.vocab.stoi, jfile, sort_keys=True)
        else:
            with open(spec.vocab, "r") as jfile:
                saved_stoi = json.load(jfile)
            field.build_vocab()
            # Refill the existing mapping in place instead of replacing it
            # with a plain dict, so whatever unknown-token fallback the vocab
            # set up on ``stoi`` is preserved.
            stoi = field.vocab.stoi
            stoi.clear()
            stoi.update(saved_stoi)
            # Keep index -> token in sync so len(vocab) and reverse lookups
            # match the restored mapping.
            field.vocab.itos = sorted(saved_stoi, key=saved_stoi.get)

    if split_ratio is not None:
        random_state = split_seed
        if isinstance(split_seed, int):
            # Dataset.split expects a state tuple as produced by
            # random.getstate(), not a bare integer seed.
            random_state = random.Random(split_seed).getstate()
        dataset = dataset.split(split_ratio, random_state=random_state)

    return dataset
def load_data(preprocessing=None):
    """Load the TSV dataset, split it, build GloVe vocabularies and bucket iterators.

    Reads module-level ``path``, ``tokenizer``, ``SEED``,
    ``TRAIN_VAL_TEST_SPLIT`` and ``BATCH_SIZE``.

    Args:
        preprocessing: Optional preprocessing pipeline handed to the text
            field.

    Returns:
        ``(TEXT, train_itr, valid_itr, test_itr)``.
    """
    # Field for the review text and field for its label.
    text_field = Field(sequential=True,
                       tokenize=tokenizer,
                       lower=True,
                       preprocessing=preprocessing)
    label_field = LabelField(dtype=torch.float)

    # Load the entire dataset, then split it.
    dataset = TabularDataset(path=path,
                             format='tsv',
                             fields=[('text', text_field),
                                     ('label', label_field)])

    # random.seed() returns None, so the old `random_state=random.seed(SEED)`
    # was reproducible only through its side effect. Seed first, then pass
    # the resulting state explicitly; the split produced is unchanged.
    random.seed(SEED)
    # NOTE(review): torchtext reads a 3-element ratio as [train, test, valid]
    # while returning (train, valid, test) — confirm TRAIN_VAL_TEST_SPLIT is
    # ordered accordingly.
    train_data, valid_data, test_data = dataset.split(
        split_ratio=TRAIN_VAL_TEST_SPLIT, random_state=random.getstate())

    print('Size of train set: ' + str(len(train_data.examples)))
    print('Size of val / test: ' + str(len(valid_data.examples)))

    # Build the vocabularies from the training split, with pre-trained
    # "glove.6B.100d" vectors for the text.
    text_field.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
    label_field.build_vocab(train_data)

    # Sanity checks
    print('Size of the vocab: ' + str(len(text_field.vocab)))
    print("Vector size of Text Vocabulary: ", text_field.vocab.vectors.size())
    print("Label Length: " + str(len(label_field.vocab)))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_itr, valid_itr, test_itr = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text))

    return text_field, train_itr, valid_itr, test_itr
def generate_data(config):
    """Create character-level train/validation/test iterators with pretrained vectors.

    Args:
        config: Object providing ``sen_max_length``, ``data_ori``,
            ``train_path``, ``valid_path``, ``test_path``, ``embedding_path``,
            ``vocab_maxsize``, ``vocab_minfreq``, ``batch_size`` and
            ``device``.

    Returns:
        ``(train_iter, val_iter, test_iter, TEXT)``.
    """
    # Define how each column is processed.
    def split_into_items(sequence):
        return list(sequence)

    # The fixed sentence length was noted to directly affect accuracy.
    text_field = Field(sequential=True,
                       tokenize=split_into_items,
                       fix_length=config.sen_max_length)
    # Labels are taken as-is, without a vocabulary.
    label_field = Field(sequential=False, use_vocab=False)

    columns = [("context", text_field), ("label_id", label_field)]

    train_file, valid_file = TabularDataset.splits(
        path=config.data_ori,
        train=config.train_path,
        validation=config.valid_path,
        format="csv",
        skip_header=True,
        fields=columns)
    test_file = TabularDataset(path=config.data_ori + config.test_path,
                               format="csv",
                               skip_header=True,
                               fields=columns)

    # Build the vocabulary.
    vectors = Vectors(name=config.data_ori + config.embedding_path,
                      cache="./")
    text_field.build_vocab(train_file,
                           max_size=config.vocab_maxsize,
                           min_freq=config.vocab_minfreq,
                           vectors=vectors)
    text_field.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

    batch_size = config.batch_size
    # sort_within_batch=True is required when pack_padded_sequence is used
    # downstream.
    train_iter, val_iter = BucketIterator.splits(
        (train_file, valid_file),
        batch_sizes=(batch_size, batch_size),
        device=config.device,
        sort_key=lambda example: len(example.context),
        sort_within_batch=True,
        repeat=False)

    test_iter = Iterator(test_file,
                         batch_size=config.batch_size,
                         device=config.device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False)

    return train_iter, val_iter, test_iter, text_field
def predict_text_cnn(model_path, file_path, vocab_path, batch_size=64):
    """Score every sentence of a TSV file with a saved model.

    Args:
        model_path: Path of the model saved with ``torch.save``.
        file_path: TSV file with a header row and one ``sentence`` column.
        vocab_path: Path of the pickled vocabulary for the text field.
        batch_size: Batch size used for inference.

    Returns:
        List of floats: the softmax probability of class index 1 for each
        sentence, in file order.
    """
    text_field = data.Field(sequential=True, lower=True, batch_first=True)
    test_data = TabularDataset(path=file_path,
                               format='tsv',
                               skip_header=True,
                               fields=[('sentence', text_field)])

    # NOTE(review): pickle.load (and torch.load below) can execute arbitrary
    # code; only load vocabulary/model files from trusted sources.
    with open(vocab_path, 'rb') as handle:
        text_field.vocab = pickle.load(handle)

    # Use the first GPU when present, otherwise fall back to the CPU instead
    # of crashing on CUDA-less machines.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         shuffle=False,
                         device=device)

    # Keep the model on the same device as the batches.
    model = torch.load(model_path, map_location=device)
    model.to(device)
    model.eval()

    sentiments = []
    with torch.no_grad():
        for batch in test_iter:
            logits = model(batch.sentence)
            sentiments.extend(torch.softmax(logits, dim=-1)[:, 1].tolist())

    return sentiments
def load_question_dataset(batch_size, dataset, device=0):
    """Build bucket iterators over the answer/question CSV splits.

    Args:
        batch_size: Batch size for all three iterators.
        dataset: Passed to ``QGenDataset``; the resulting object is not used
            further in this function.
        device: Device handed to the iterators.

    Returns:
        ``(train_iter, val_iter, test_iter, inp_lang, opt_lang)``.
    """
    spacy_model = spacy.load('en')

    def tokenize_en(text):
        return [token.text for token in spacy_model.tokenizer(text)]

    def make_field():
        return Field(tokenize=tokenize_en,
                     init_token='<sos>',
                     eos_token='<eos>')

    answer_field = make_field()
    question_field = make_field()

    dataset = QGenDataset(dataset)

    # 'ans' column -> input field, 'que' column -> output field
    columns = [('ans', answer_field), ('que', question_field)]
    train, val, test = TabularDataset.splits(path='./.data/',
                                             train='train.csv',
                                             validation='val.csv',
                                             test="test.csv",
                                             format='csv',
                                             fields=columns)

    for field in (answer_field, question_field):
        field.build_vocab(train, val, test)

    def by_question_length(example):
        return len(example.que)

    # Only the training iterator sets repeat=False explicitly.
    train_iter = BucketIterator(train,
                                batch_size=batch_size,
                                device=device,
                                repeat=False,
                                sort_key=by_question_length,
                                shuffle=True)
    val_iter = BucketIterator(val,
                              batch_size=batch_size,
                              device=device,
                              sort_key=by_question_length,
                              shuffle=True)
    test_iter = BucketIterator(test,
                               batch_size=batch_size,
                               device=device,
                               sort_key=by_question_length,
                               shuffle=True)

    return train_iter, val_iter, test_iter, answer_field, question_field
def prepare_vocab():
    '''
    Load the required vocabularies and iterators.

    :return: (PREV, NEXT, train_iter, val_iter)
    '''
    # Define the fields. Further options such as pad_token, unk_token or
    # stop_words could be passed here.
    def make_field():
        return Field(tokenize=chi_tokenizer,
                     init_token='<bos>',
                     eos_token='<eos>')

    prev_field = make_field()
    next_field = make_field()

    # Pair each column with its field.
    columns = [('prev', prev_field), ('next', next_field)]

    # Note skip_header: the first row of each file is skipped.
    train, val = TabularDataset.splits(path='data',
                                       train='train.csv',
                                       validation='test.csv',
                                       format='csv',
                                       fields=columns,
                                       skip_header=True)

    # Both the train and the validation data are used for the vocabularies.
    # PREV and NEXT each get their own, separate vocabulary.
    for field in (prev_field, next_field):
        field.build_vocab(train, val)

    def by_prev_length(example):
        return len(example.prev)

    # Define the batch iterators.
    train_iter, val_iter = [
        BucketIterator(split,
                       batch_size=bc.batch_size,
                       sort_key=by_prev_length,
                       sort_within_batch=True,
                       shuffle=True)
        for split in (train, val)
    ]

    return prev_field, next_field, train_iter, val_iter
def make_dataset(train_csv, val_csv, test_csv):
    '''
    Builds the training, validation and testing datasets as torchtext objects,
    normalizing the text tokens while loading.

    Inputs:
        train_csv(str): name of training data csv
        val_csv(str): name of validation data csv
        test_csv(str): name of testing data csv

    Outputs (in this order):
        train: tabular dataset obj representing the training data
        test: tabular dataset obj representing the testing data
        val: tabular dataset obj representing the validation data
        text: torchtext field obj describing how text is processed and stored
        label: torchtext labelfield obj describing how labels are processed
            and stored
    '''
    text_field = Field(sequential=True,
                       tokenize=word_tokenize,
                       preprocessing=normalize_tokens)
    label_field = LabelField(dtype=torch.float)

    # Only the text and the binary decision are kept; columns mapped to None
    # are dropped.
    columns = [('dab_id', None),
               ('alj_id', None),
               ('alj_text', text_field),
               ('decision_binary', label_field),
               ('dab_year', None)]

    splits = TabularDataset.splits(path='',
                                   train=train_csv,
                                   validation=val_csv,
                                   test=test_csv,
                                   format='csv',
                                   fields=columns,
                                   skip_header=True)
    train_set, val_set, test_set = splits

    # Note the return order: test comes before val.
    return train_set, test_set, val_set, text_field, label_field
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    """Load the train/dev/test TSV files and create one iterator per split.

    Args:
        root_dir: Directory containing ``train.tsv``, ``dev.tsv`` and
            ``test.tsv``.
        batch_size: Batch size of every iterator.
        use_vector: When True, attach the vectors from ``mr_vocab.txt`` to the
            text vocabulary.
    """
    self.TEXT = Field(sequential=True,
                      use_vocab=True,
                      tokenize='spacy',
                      lower=True,
                      batch_first=True)
    self.LABEL = LabelField(tensor_type=torch.FloatTensor)

    dataset_path = os.path.join(root_dir, '{}.tsv')
    targets = ['train', 'dev', 'test']

    self.dataset = {}
    self.dataloader = {}

    for target in targets:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)]
        )

    # Build the vocabularies exactly once, from the training split. They used
    # to be rebuilt for every split, so the final vocabulary came from the
    # test set and changed underneath the already created train iterator.
    if use_vector:
        vectors = Vectors(name='mr_vocab.txt', cache='./')
        self.TEXT.build_vocab(self.dataset['train'],
                              max_size=25000,
                              vectors=vectors)
    else:
        self.TEXT.build_vocab(self.dataset['train'], max_size=25000)
    self.LABEL.build_vocab(self.dataset['train'])

    for target in targets:
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)
def preprocess(config: dict) -> None:
    """Build and save the vocabulary and its pretrained embedding matrix.

    Reads ``train.tsv`` from ``config['base_path']`` and writes ``vocab.pkl``
    and ``embedding.npy`` into the same directory.

    Args:
        config: Mapping with the keys ``base_path`` and ``glove_source_path``.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%d-%b-%y %H:%M:%S')
    logger = logging.getLogger(__name__)

    base_path = config['base_path']
    vocab_path = os.path.join(base_path, 'vocab.pkl')
    embedding_path = os.path.join(base_path, 'embedding.npy')
    glove_source_path = config['glove_source_path']

    text_field = data.Field(sequential=True, lower=True, batch_first=True)
    train_data = TabularDataset(path=os.path.join(base_path, 'train.tsv'),
                                format='tsv',
                                skip_header=True,
                                fields=[('sentence', text_field)])

    logger.info('build vocabulary')
    text_field.build_vocab(train_data,
                           specials=[UNK, PAD, SOS, EOS],
                           max_size=50000)
    vocab = text_field.vocab
    vocab_size = len(vocab.itos)
    logger.info('vocab_size: %d', vocab_size)

    logger.info('save vocabulary')
    with open(vocab_path, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)

    logger.info('load pretrained embedding')
    embedding = load_glove(glove_source_path, vocab_size, vocab.stoi)

    logger.info('save pretrained embedding')
    np.save(embedding_path, embedding)

    logger.info('finish')
def load_data_loader(args, mode='train'):
    """Create a bucket iterator plus source/target fields for one data split.

    The raw ``.en``/``.de`` files of the split are converted with
    ``load_raw_data_to_csv``, loaded from ``./<mode>.csv``, and the temporary
    CSV files are removed afterwards.

    Args:
        args: Object providing ``data_path`` and ``batch_size``; also passed
            to ``create_fields``.
        mode: ``'train'`` or ``'test'``.

    Returns:
        ``(data_loader, source, target)``.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    if mode not in ('train', 'test'):
        # This used to surface later as an UnboundLocalError on `path`.
        raise ValueError(
            "mode must be 'train' or 'test', got {!r}".format(mode))

    source_file = os.path.join(args.data_path, '{0}/{0}.en'.format(mode))
    target_file = os.path.join(args.data_path, '{0}/{0}.de'.format(mode))
    # presumably load_raw_data_to_csv writes ./<mode>.csv — verify against it
    path = './{}.csv'.format(mode)

    try:
        load_raw_data_to_csv(source_file, target_file, mode=mode)

        source, target = create_fields(args)
        data = TabularDataset(path=path,
                              format='csv',
                              fields=[('source', source), ('target', target)])
        data_loader = BucketIterator(data,
                                     batch_size=args.batch_size,
                                     sort_key=lambda x: len(x.source),
                                     shuffle=True)

        source.build_vocab(data)
        target.build_vocab(data)
    finally:
        # Remove the temporary CSV files even when loading fails.
        for leftover in ('train.csv', 'test.csv'):
            if os.path.isfile(leftover):
                os.remove(leftover)

    return data_loader, source, target
def _process_data(self, filepath, train_dev_ratio):
    """
    Load the tab-delimited dataset, split it and print per-split statistics.

    Args:
        filepath: string, the path of dataset
        train_dev_ratio: a float, the ratio to split train and dev dataset

    Returns:
        A tuple of torchtext.data.Dataset objects: (train, dev)
    """
    full_set = TabularDataset(path=filepath,
                              format='csv',
                              fields=[('text', self.text_field),
                                      ('label', self.label_field)],
                              csv_reader_params=dict(delimiter='\t'))
    train, dev = full_set.split(split_ratio=train_dev_ratio)

    # Token counts and integer labels of every example in each split.
    train_words = [len(example.text) for example in train.examples]
    train_labels = [int(example.label) for example in train.examples]
    dev_words = [len(example.text) for example in dev.examples]
    dev_labels = [int(example.label) for example in dev.examples]

    rule = '----------------------------------------------------------'
    print(rule)
    print('train: min words={}, max words={}, counter={}'.format(
        min(train_words), max(train_words), str(Counter(train_labels))))
    print('dev: min words={}, max words={}, counter={}'.format(
        min(dev_words), max(dev_words), str(Counter(dev_labels))))
    print(rule)
    print('\n')

    return train, dev
def make_small_imdb(batch_size=8, device=-1, vectors=None):
    """Build bucket iterators over ``train.csv`` and ``cv.csv`` in the current directory.

    Args:
        batch_size: Not used; both iterators use a batch size of 128.
        device: Not used; the device is chosen from CUDA availability.
        vectors: Pretrained vectors handed to the text vocabulary.

    Returns:
        ``(train_iter, test_iter, TEXT, LABEL)``.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    text_field = data.Field(tokenize=get_tokenizer("basic_english"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True,
                            batch_first=False)
    label_field = data.LabelField()
    columns = [('text', text_field), ('label', label_field)]

    # The validation file plays the role of the test split here.
    train, test = TabularDataset.splits(path='.',
                                        train='train.csv',
                                        validation='cv.csv',
                                        format='csv',
                                        skip_header=True,
                                        fields=columns)

    # Both vocabularies are built from both splits.
    text_field.build_vocab(train, test, vectors=vectors, max_size=30000)
    label_field.build_vocab(train, test)

    def by_text_length(example):
        return len(example.text)

    train_iter, test_iter = BucketIterator.splits(
        (train, test),
        batch_sizes=(128, 128),
        device=device,
        sort_key=by_text_length,
        sort_within_batch=False,
        repeat=False)

    return train_iter, test_iter, text_field, label_field
def preprocess(**kwargs):
    """Build character-level iterators for the Weibo train/test files.

    One shared field handles the ``text``, ``label`` and ``target`` columns;
    its vocabulary is built from the training split with ``min_freq=5``.
    The iterators are placed on the module-level ``device``.

    Args:
        **kwargs: Must contain ``batch_size`` for the training iterator.

    Returns:
        ``(train_iter, test_iter, SRC)``.
    """
    def split_characters(text):
        return list(text.strip())

    char_field = Field(include_lengths=False,
                       init_token=None,
                       pad_token="<pad>",
                       unk_token="<unk>",
                       lower=True,
                       batch_first=False,
                       tokenize=split_characters)

    columns = [(column, char_field)
               for column in ("text", "label", "target")]

    train_set, test_set = TabularDataset.splits(
        path="data",
        root="data",
        train="train.weibo.txt",
        test="test.weibo.txt",
        format='tsv',
        skip_header=False,
        fields=columns,
        csv_reader_params={"quoting": csv.QUOTE_NONE})

    char_field.build_vocab(train_set, min_freq=5)

    def by_text_length(example):
        return len(example.text)

    train_iter = BucketIterator(train_set,
                                batch_size=kwargs["batch_size"],
                                train=True,
                                sort_within_batch=True,
                                sort_key=by_text_length,
                                repeat=False,
                                device=device)
    # The test split is iterated one example at a time.
    test_iter = BucketIterator(test_set,
                               batch_size=1,
                               train=False,
                               device=device)

    return train_iter, test_iter, char_field
def preprocess_data_for_RNN(vectors, batch_size, train_tagged_sentences,
                            max_vocab_size, min_frequency):
    """
    Write the tagged training sentences to CSV, load them as a torchtext
    dataset and prepare a BucketIterator plus vocabularies for training.

    Args:
        vectors: Pretrained vectors handed to the text vocabulary.
        batch_size: Batch size of the iterator.
        train_tagged_sentences: Passed to ``build_corpus_text_df``.
        max_vocab_size: ``max_size`` of both vocabularies.
        min_frequency: ``min_freq`` of both vocabularies.

    Returns:
        ``(data_iter, pad_index, tag_pad_index, text_field, tags_field)``.
    """
    csv_path = 'train_text_data.csv'
    corpus_frame = build_corpus_text_df(train_tagged_sentences)
    corpus_frame.to_csv(csv_path, index=False)

    text_field = Field(lower=True, batch_first=True)
    tags_field = Field(batch_first=True)

    train_data = TabularDataset(path=csv_path,
                                format='CSV',
                                fields=[('text', text_field),
                                        ('tags', tags_field)],
                                skip_header=True)

    data_iter = BucketIterator(train_data, batch_size=batch_size)

    # Both vocabularies share the same frequency and size limits.
    vocab_limits = dict(min_freq=min_frequency, max_size=max_vocab_size)
    text_field.build_vocab(train_data, vectors=vectors, **vocab_limits)
    tags_field.build_vocab(train_data, **vocab_limits)

    def pad_index_of(field):
        return field.vocab.stoi[field.pad_token]

    return (data_iter, pad_index_of(text_field), pad_index_of(tags_field),
            text_field, tags_field)
def build_and_cache_dataset(config: Config, mode='train'):
    """
    Return the Field of every attribute together with the loaded dataset.

    The file ``<config.dataset_dir>/<mode>.csv`` is read as tab-delimited
    text.

    Returns:
        ``((ID, CATEGORY, NEWS), dataset)``: a tuple of three fields and the
        ``TabularDataset``.
    """
    # The id column is used as-is, without a vocabulary.
    id_field = Field(sequential=False, use_vocab=False)
    category_field = LabelField(sequential=False,
                                use_vocab=True,
                                is_target=True)
    news_field = Field(sequential=True,
                       tokenize=jieba.lcut,
                       include_lengths=True)

    # The second column is mapped to (None, None) and therefore ignored.
    columns = [
        ('id', id_field),
        (None, None),
        ('category', category_field),
        ('news', news_field),
    ]

    logger.info("从当前目录创建特征 %s", config.dataset_dir)

    csv_path = os.path.join(config.dataset_dir, f'{mode}.csv')
    # Columns are separated by tabs.
    dataset = TabularDataset(csv_path,
                             format='csv',
                             fields=columns,
                             csv_reader_params={'delimiter': '\t'})

    return (id_field, category_field, news_field), dataset
def create_dataset(self):
    """Load the JSON train/validation files with pre-indexed source/target fields.

    Both fields skip vocabulary lookup (``use_vocab=False``). The source
    field also returns sequence lengths and pads with 0; the target field
    pads with -1.

    Returns:
        ``(train, valid)`` datasets read from ``self.train_path`` and
        ``self.valid_path`` under ``config.ROOT_DIR``.
    """
    # Set fix_length for static padding; it should be larger than the
    # longest text.
    shared_options = dict(sequential=True,
                          tokenize=x_tokenize,
                          use_vocab=False,
                          batch_first=True,
                          fix_length=self.fix_length,
                          eos_token=None,
                          init_token=None)

    source_field = Field(include_lengths=True, pad_token=0, **shared_options)
    target_field = Field(include_lengths=False, pad_token=-1,
                         **shared_options)

    # JSON key -> (attribute name on each example, field)
    json_fields = {
        'source': ('source', source_field),
        'target': ('target', target_field),
    }

    train, valid = TabularDataset.splits(path=config.ROOT_DIR,
                                         train=self.train_path,
                                         validation=self.valid_path,
                                         format="json",
                                         skip_header=False,
                                         fields=json_fields)
    return train, valid
def pad_under_five(toknized): """ 모델에서 5-gram 단위 필터를 사용하기 때문에 5-gram이 안되는 문장에 <pad>로 채워준다 """ if len(toknized) < 5: toknized.extend(["<pad>"]*(5-len(toknized))) return toknized TEXT = Field(tokenize=tagger.morphs,lower=True,include_lengths=False,batch_first=True,preprocessing=pad_under_five) LABEL = Field(sequential=False,use_vocab=True,unk_token=None) train_data, test_data = TabularDataset.splits(path=DATA_PATH+'/nsmc/', train='ratings_train.txt', test='ratings_test.txt', format='tsv', skip_header=True, fields=[('id',None),('text',TEXT),('label',LABEL)], filter_pred = lambda x: True if len(x.text) > 1 else False) # 토큰 레벨 문장의 길이가 1 이상인 경우만 허용 TEXT.build_vocab(train_data,min_freq=2) LABEL.build_vocab(train_data) # print (TEXT.vocab) # print (len(TEXT.vocab),len(LABEL.vocab)) # print (TEXT.vocab.itos[:5]) # print (LABEL.vocab.itos) train_loader, test_loader = BucketIterator.splits((train_data,test_data),sort_key=lambda x:len(x.text), sort_within_batch=True,