def create_dataset(path_to_dataset, batch_size, split_ratio=0.7, min_vocab_freq=10, max_vocab_size=4000):
    text_field = Field(tokenize="spacy", tokenizer_language="en", batch_first=True,
                       init_token="<sos>", eos_token="<eos>", lower=True)

    def transform(caption):
        caption = caption.strip().lower().split()
        return caption

    dataset = CocoCaptions(annFile=os.path.join(path_to_dataset, "captions_train2014.json"),
                           text_field=text_field, transform=transform)
    train, val = dataset.split(split_ratio=split_ratio)
    test = CocoCaptions(annFile=os.path.join(path_to_dataset, "captions_val2014.json"),
                        text_field=text_field, transform=transform)
    print("Dataset loaded")
    print("Train set size:", len(train))

    text_field.build_vocab(dataset.text, min_freq=min_vocab_freq, max_size=max_vocab_size)
    SOS_TOKEN = text_field.vocab.stoi['<sos>']
    EOS_TOKEN = text_field.vocab.stoi['<eos>']
    UNK_TOKEN = text_field.vocab.stoi['<unk>']
    PAD_TOKEN = text_field.vocab.stoi['<pad>']
    print("Vocabulary built")
    print("Vocabulary statistics")
    print("\nMost common words in the vocabulary:\n", text_field.vocab.freqs.most_common(10))
    print("Size of the vocabulary:", len(text_field.vocab))
    print("Max sequence length:", dataset.max_seq_len)

    train_iter, val_iter = BucketIterator.splits((train, val), repeat=False, batch_size=batch_size)
    test_iter = BucketIterator(test, batch_size=batch_size, repeat=False, train=False)

    vocab_dict = text_field.vocab.stoi
    return {"data_iters": (train_iter, val_iter, test_iter),
            "fields": text_field,
            "word_to_num_vocab": vocab_dict,
            "num_to_word_vocab": {y: x for x, y in vocab_dict.items()},
            "num_classes": len(text_field.vocab),
            "tokens": (SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN),
            "max_seq_len": dataset.max_seq_len}
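# Usage sketch (an assumption, not part of the original source): shows how the dict returned
# by create_dataset above might be unpacked by a caller. The path "data/coco" is purely
# illustrative, and _example_create_dataset_usage is a hypothetical helper name.
def _example_create_dataset_usage():
    data = create_dataset("data/coco", batch_size=32)
    train_iter, val_iter, test_iter = data["data_iters"]
    sos_idx, eos_idx, unk_idx, pad_idx = data["tokens"]
    vocab_size = data["num_classes"]
    return train_iter, vocab_size, pad_idx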
def load_dataloaders(args):
    logger.info("Preparing dataloaders...")
    FR = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, init_token="<sos>",
                              eos_token="<eos>", batch_first=True)
    EN = torchtext.data.Field(tokenize=dum_tokenizer, lower=True, batch_first=True)

    train_path = os.path.join("./data/", "df.csv")
    if not os.path.isfile(train_path):
        tokenize_data(args)
    train = torchtext.data.TabularDataset(train_path, format="csv",
                                          fields=[("EN", EN), ("FR", FR)])
    FR.build_vocab(train)
    EN.build_vocab(train)

    # examples are accessed by attribute, not by subscript
    train_iter = BucketIterator(train, batch_size=args.batch_size, repeat=False,
                                sort_key=lambda x: (len(x.EN), len(x.FR)),
                                shuffle=True, train=True)
    train_length = len(train)
    logger.info("Loaded dataloaders.")
    return train_iter, FR, EN, train_length
def eval(self, test_set):
    print("======== test only ========")
    test_iter = BucketIterator(test_set, batch_size=self.args.batch, train=False,
                               shuffle=False, device=self.model.device,
                               sort_key=lambda x: len(x.TOKENS))
    with torch.no_grad():
        self.run_a_epoch("final test", test_iter, need_backward=False, epoch_num=0,
                         save_output=os.path.join(self.args.output_path, "check"),
                         max_step=ceil(len(test_set) / self.args.batch))
    print(self.model.argumentRoleClsLayer.arg_mask_init.tolist())
    print(self.model.argumentRoleClsLayer.arg_mask.tolist())
def train(self, epochs, save_path, load_previous=True, clip=10, batch_size=128):
    save_dir = os.path.split(save_path)[0]
    os.makedirs(save_dir, exist_ok=True)

    if load_previous and os.path.exists(save_path):
        self._logger.debug(f'Loading model state from {save_path}')
        self.model.load_state_dict(torch.load(save_path))

    train_iterator, test_iterator = BucketIterator.splits(
        (self.train_data, self.test_data),
        batch_size=batch_size,
        device=self.device,
        sort_key=lambda x: len(x.src))

    optimizer = optim.Adam(self.model.parameters())
    trg_pad_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.PAD_TOKEN]
    criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

    # Training loop
    self._logger.debug(f'Beginning training for {epochs} epoch(s)')
    for epoch in range(epochs):
        train_loss = self._train_iter(train_iterator, optimizer, criterion, clip)
        test_loss = self._evaluate_iter(test_iterator, criterion)

        if test_loss < self.best_test_loss:
            # Save model if better
            self.best_test_loss = test_loss
            torch.save(self.model.state_dict(), save_path)

        self._logger.debug(
            f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} '
            f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

    # Save model
    torch.save(self.model.state_dict(), save_path)
    self._logger.info(save_path)
    return self.best_test_loss
def load_naive_iterators(args, path, fields):
    train, valid, test = NaiveDatasetClassification.splits(
        exts=(args.src_ext, args.trg_ext), fields=fields, root=path)

    # Some stats
    print("Stats for dataset in", path)

    # Train
    count = Counter()
    for e in train.examples:
        count.update([e.label])
    train_d = dict(count)
    train_d['name'] = "train"

    # Val
    count = Counter()
    for e in valid.examples:
        count.update([e.label])
    val_d = dict(count)
    val_d['name'] = 'val'

    # Test
    count = Counter()
    for e in test.examples:
        count.update([e.label])
    test_d = dict(count)
    test_d['name'] = 'test'

    d_list = [train_d, val_d, test_d]
    df = data_stats.dicts_to_pandas(d_list)
    md = tabulate(df, headers='keys', tablefmt='pipe')
    print(md)

    fields[1].build_vocab(train)

    # Data iterators
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train, valid, test),
        batch_size=args.batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.text),
        device=args.device)

    return train_iterator, valid_iterator, test_iterator, fields[1]
def create(cls, config):
    src_field = Field(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', include_lengths=True)
    trg_field = Field(init_token='<sos>', eos_token='<eos>', pad_token='<pad>', lower=True, include_lengths=True)

    train = TranslationDataset(path=config.train_prefix, exts=config.exts, fields=(src_field, trg_field))
    valid = TranslationDataset(path=config.valid_prefix, exts=config.exts, fields=(src_field, trg_field))
    test = TranslationDataset(path=config.test_prefix, exts=config.exts, fields=(src_field, trg_field))

    train_it, valid_it, test_it = BucketIterator.splits(
        [train, valid, test],
        batch_sizes=config.batch_sizes,
        sort_key=TranslationDataset.sort_key,
        device=-1)

    src_field.build_vocab(train, min_freq=10)
    trg_field.build_vocab(train, min_freq=10)
    src_voc = src_field.vocab
    trg_voc = trg_field.vocab

    model = Seq2Seq.create(src_voc, trg_voc, config)
    if config.use_cuda:
        model = model.cuda()

    return Trainer(model, train_it, valid_it, test_it, config.valid_step,
                   config.checkpoint_path, config.pool_size)
def load_dataset(data_dir, word_dir, batch_size, device):
    TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)
    LABEL = data.LabelField(batch_first=True)
    fields = {'sentence1': ('premise', TEXT),
              'sentence2': ('hypothesis', TEXT),
              'gold_label': ('label', LABEL)}

    trainDataset, valDataset, testDataset = data.TabularDataset.splits(
        path=data_dir,
        format='json',
        train='snli_1.0_train.jsonl',
        validation='snli_1.0_dev.jsonl',
        test='snli_1.0_test.jsonl',
        fields=fields,
        filter_pred=lambda x: x.label != '-'
    )

    vectors = Vectors('glove.6B.200d.txt', word_dir)
    TEXT.build_vocab(trainDataset, vectors=vectors, unk_init=nn.init.xavier_uniform)
    LABEL.build_vocab(trainDataset)

    train_iter, val_iter = BucketIterator.splits(
        datasets=(trainDataset, valDataset),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.premise) + len(x.hypothesis),
        sort_within_batch=True,
        repeat=False,
        shuffle=True
    )
    test_iter = Iterator(
        dataset=testDataset,
        batch_size=batch_size,
        device=device,
        sort=False,
        repeat=False,
        shuffle=False
    )
    return TEXT, LABEL, train_iter, val_iter, test_iter
def score(self, sentences: List[str]) -> np.ndarray:
    results = []
    fields = [('data', self.field)]
    examples = [Example.fromlist([s], fields) for s in sentences]
    dataset = Dataset(examples, fields)
    dataloader = BucketIterator(dataset, self.config.metrics.classifier.batch_size,
                                repeat=False, shuffle=False,
                                device=self.config.device_name)

    for batch in dataloader:
        scores = self.model(batch.data)
        scores = torch.sigmoid(scores)
        scores = scores.detach().cpu().numpy().tolist()
        results.extend(scores)

    return np.mean(results)
def splits(cls, train, valid, batch_size=2, device="cuda"):
    train_dataset = train.get("Dataset")
    valid_dataset = valid.get("Dataset")
    field_names = [field[0] for field in train.get("Field")]

    train_iter, valid_iter = BucketIterator.splits(
        (train_dataset, valid_dataset),
        batch_size=batch_size,
        device=device,
        sort_key=lambda x: len(vars(x)[field_names[0]]),
        sort_within_batch=True)

    train_dataloader = MiniBatchWrapper(train_iter, field_names[0], field_names[1])
    valid_dataloader = MiniBatchWrapper(valid_iter, field_names[0], field_names[1])
    return train_dataloader, valid_dataloader
def prepare_data(self):
    self.text_field = Field(sequential=True, fix_length=200, include_lengths=True)
    self.label_field = LabelField()

    train_val, test = IMDB.splits(self.text_field, self.label_field)
    random.seed(42)
    train, val = train_val.split(random_state=random.getstate())

    self.text_field.build_vocab(train, vectors=GloVe())  # vectors=FastText('simple')
    self.label_field.build_vocab(train)

    self.train_iter, self.test_iter, self.val_iter = BucketIterator.splits(
        (train, test, val), batch_size=self.batch_size)
    self.train_iter.sort_within_batch = True
    self.val_iter.sort_within_batch = True
def load_or_generate_dataset(batch_size=128):
    download_spacy_models()  # TODO check if need to download then do it
    spacy_en, spacy_de = load_tokenize_models()

    SRC = Field(tokenize=partial(tokenize_de, spacy_de=spacy_de),
                init_token=START_TOKEN, eos_token=END_TOKEN, lower=True)
    TRG = Field(tokenize=partial(tokenize_en, spacy_en=spacy_en),
                init_token=START_TOKEN, eos_token=END_TOKEN, lower=True)

    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))
    logging.debug(f"Number of training examples: {len(train_data.examples)}")
    logging.debug(f"Number of validation examples: {len(valid_data.examples)}")
    logging.debug(f"Number of testing examples: {len(test_data.examples)}")

    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)
    logging.debug(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
    logging.debug(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        device=get_available_device())

    # TODO class
    return {
        'train_data': train_iterator,
        'valid_data': valid_iterator,
        'test_data': test_iterator,
        'n_src_words': len(SRC.vocab),
        'n_trg_words': len(TRG.vocab),
        'trg_pad_idx': TRG.vocab.stoi[TRG.pad_token],
        'src_vocab': SRC.vocab,
        'trg_vocab': TRG.vocab,
        'src_field': SRC,
        'trg_field': TRG,
    }
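# Usage sketch (an assumption, not part of the original source): iterating the Multi30k
# iterators returned by load_or_generate_dataset above. batch.src / batch.trg are the
# attribute names produced by Multi30k.splits with (SRC, TRG) fields; the helper name
# _example_translation_batches is hypothetical.
def _example_translation_batches():
    bundle = load_or_generate_dataset(batch_size=64)
    pad_idx = bundle['trg_pad_idx']
    for batch in bundle['train_data']:
        src, trg = batch.src, batch.trg  # (seq_len, batch_size) tensors; batch_first is not set
        break
    return src.shape, trg.shape, pad_idx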
def load_iters(batch_size=1, device="cpu", data_path='data'):
    TEXT = data.Field(batch_first=True, include_lengths=True, lower=True)
    LABEL = data.LabelField(batch_first=True)
    fields = {
        'sentence1': ('premise', TEXT),
        'sentence2': ('hypothesis', TEXT),
        'gold_label': ('label', LABEL)
    }

    train_data, dev_data, test_data = data.TabularDataset.splits(
        path=data_path,
        train='snli_1.0_train.jsonl',
        validation='snli_1.0_dev.jsonl',
        test='snli_1.0_test.jsonl',
        format='json',
        fields=fields,
        filter_pred=lambda ex: ex.label != '-'  # filter out examples whose label is '-' (i.e. unlabeled)
    )

    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)
    # TEXT.build_vocab(train_data, vectors=vectors, unk_init=torch.Tensor.normal_)

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.premise) + len(x.hypothesis),  # sort by combined text length
        sort_within_batch=True,  # convenient for pack/pad with the PyTorch LSTM later
        repeat=False,
        shuffle=True)
    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False,
                         shuffle=False)
    return train_iter, dev_iter, test_iter, TEXT, LABEL
def init(model_config, device='cpu'):
    logging.critical("[CRITICAL] %s device is selected" % device)
    logging.info('[INFO] Using directory %s for the translation pair with filename %s'
                 % (os.path.abspath(model_config['global']['dataset_path']),
                    model_config['global']['translate_pair']))

    # initialize the field for the src language
    src_field = Field(tokenize=english_tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)
    # initialize the field for the trg language
    trg_field = Field(tokenize=hindi_tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)

    train_data, valid_data, test_data = load_datasets(model_config['global']['dataset_path'],
                                                      model_config['global']['dataset_file_names'],
                                                      model_config['global']['translate_pair'],
                                                      model_config['global']['lang_extensions'],
                                                      [src_field, trg_field])

    # initialize the vocabularies
    src_field.build_vocab(train_data, min_freq=1)
    trg_field.build_vocab(train_data, min_freq=1)

    # display dataset stats
    print_dataset_statistics(train_data, valid_data, test_data,
                             model_config['global']['lang_extensions'],
                             [src_field, trg_field])

    model = create_seq2seq_model(model_config, len(src_field.vocab), len(trg_field.vocab), device)
    optimizer = optim.Adam(model.parameters())

    # define the loss function
    loss_function = nn.CrossEntropyLoss(ignore_index=trg_field.vocab.stoi[trg_field.pad_token])

    logging.info(model.apply(init_weights))
    logging.info('[INFO] Model has %s trainable parameters' % (count_parameters(model)))
    logging.info('[INFO] About to start the primary training loop')

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=model_config['global']['batch_size'],
        device=device)

    cache_file_name = "%s-%s-%s-epoch-%s.pt" % (model_config['global']['name'],
                                                model_config['global']['lang_extensions'][0],
                                                model_config['global']['lang_extensions'][1],
                                                model_config['global']['epochs'])
    cache_file_path = os.path.join(model_config['global']['cache_path'], cache_file_name)

    stats = execute_training_loop(model, train_iterator, valid_iterator, loss_function, optimizer,
                                  model_config['global']['clip_value'], src_field, trg_field,
                                  epochs=model_config['global']['epochs'],
                                  model_cache_path=os.path.abspath(cache_file_path))

    stats_file_name = "%s-%s-%s-epoch-%s-stats.pickle" % (model_config['global']['name'],
                                                          model_config['global']['lang_extensions'][0],
                                                          model_config['global']['lang_extensions'][1],
                                                          model_config['global']['epochs'])
    store_object(stats, os.path.join(model_config['global']['cache_path'], stats_file_name))

    logging.info("[INFO] loading the model %s" % cache_file_name)
    model.load_state_dict(torch.load(os.path.abspath(cache_file_path)))

    test_loss, test_bleu = evaluate_model(model, test_iterator, loss_function, src_field, trg_field)
    logging.info(f'[INFO] | Test Loss: {test_loss:.3f} Test Bleu: {test_bleu:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
def main():
    train, test, field = dataset_reader(train=True, stop=900000)
    evl, _ = dataset_reader(train=False, fields=field)
    field.build_vocab(train, evl)
    _, evl_iter = BucketIterator.splits((train, evl), batch_sizes=(1024, 1024),
                                        device=device, sort_within_batch=False,
                                        repeat=False, sort=False)

    model = RNNCNN(num_embeddings=len(field.vocab), embedding_dim=256).to(device)
    model.load_state_dict(torch.load('model/rnn_cnn_model.pkl'))

    # clear the result file before appending predictions
    with open('data/rnn_cnn_result.txt', 'w+') as f:
        f.write('')

    model.eval()
    with torch.no_grad():
        for i, data in tqdm.tqdm(enumerate(evl_iter), total=len(evl_iter)):
            inputs = torch.cat((
                data.plat_form, data.biz_type, data.payed_day, data.payed_hour,
                data.cate1_id, data.cate2_id, data.cate3_id,
                data.preselling_shipped_day, data.seller_uid_field,
                data.company_name, data.rvcr_prov_name, data.rvcr_city_name,
            ), dim=1)
            outputs = model(inputs, 'test', field)
            day = outputs * 3 + 3
            with open('data/rnn_cnn_result.txt', 'a+') as f:
                for b in range(day.size(0)):
                    sign_day = str(float(day[b]))
                    f.write(sign_day + '\n')
def main():
    setup_seed(2020)

    POST = Field(tokenize=tokenize_post, init_token='<sos>', eos_token='<eos>')
    QUERY = Field(tokenize=tokenize_query, init_token='<sos>', eos_token='<eos>')
    RESP = Field(tokenize=tokenize_resp, init_token='<sos>', eos_token='<eos>')

    # the first 'post' is the key in the loaded json, the second 'post' is the key in the batch
    fields = {
        'post': ('post', POST),
        'query': ('query', QUERY),
        'resp': ('resp', RESP)
    }

    train_data, valid_data, test_data = TabularDataset.splits(
        path='./data',
        train='train.json',
        validation='valid.json',
        test='test.json',
        format='json',
        fields=fields)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    POST.build_vocab(train_data, min_freq=1)
    QUERY.build_vocab(train_data, min_freq=1)
    RESP.build_vocab(train_data, min_freq=1)
    # print(POST.vocab.__dict__)

    batch_size = 10
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_sizes=(batch_size, batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.post),
        sort_within_batch=True,  # sort by length within each batch, for padding in the LSTM
        repeat=False)

    cnt = 0
    for i, batch in enumerate(train_iter):
        if cnt == 0:
            post = batch.post
            print(post.size())
            print(batch.post)
        cnt += 1
def eval_dataset(model, dataset, batch_size, loss_fn, device, text_embeds,
                 optimizer, stage, csv_file, update_grad=False):
    cols = ['loss', 'acc']
    (iterator,) = BucketIterator.splits(datasets=(dataset,),
                                        batch_sizes=[batch_size],
                                        device=device,
                                        shuffle=True)
    metrics = []
    for batch in iterator:
        (prem_embeds, prem_lens, hyp_embeds, hyp_lens, labels) = batch_cols(batch, text_embeds)
        predictions = model.forward(prem_embeds, prem_lens, hyp_embeds, hyp_lens)
        loss = loss_fn(predictions, labels)
        acc = accuracy(predictions, labels)
        vals = [loss, acc]
        stats = get_stats(cols, vals)
        metrics.append(stats)
        print(yaml.dump({
            stage: {k: round(i, 3) if isinstance(i, float) else i
                    for k, i in stats.items()}
        }))
        if update_grad:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    df = pd.DataFrame(metrics, columns=cols)
    df.to_csv(csv_file)
    (loss, acc) = list(df.mean())
    return (loss, acc, df)
def _set_data(self, dataset: str, batch_size: int):
    train_ds, valid_ds, test_ds, TEXT = load_real_dataset(dataset)
    self._reverse = lambda x: TEXT.reverse(x)
    self.first_token = TEXT.vocab.stoi["<sos>"]
    self.vocab_size = len(TEXT.vocab)
    self.length = TEXT.max_length

    self.train_batchmanager, self.val_batchmanager, self.test_batchmanager = BucketIterator.splits(
        (train_ds, valid_ds, test_ds),
        batch_sizes=(batch_size, 2 * batch_size, 2 * batch_size),
        device="cpu",
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        repeat=False)

    if self.evaluate_test:
        self.eval_test = MetricsEval(test_ds, TEXT, "Test")
    if self.evaluate_valid:
        self.eval_valid = MetricsEval(valid_ds, TEXT, "Valid")
def __init__(self, data_path, batch_size, device, embedding=None):
    self.TEXT = Field(sequential=True, tokenize=self.tokenize, lower=True, include_lengths=True)
    self.LABEL = Field(sequential=False, use_vocab=False)
    self.datafield = [("id", None), ("sentence", self.TEXT), ("label", self.LABEL)]

    train_data, dev_data_a, test_data_a = TabularDataset.splits(
        path=data_path, train='train.csv', validation="dev_a.csv", test="test_a.csv",
        format='csv', skip_header=True, fields=self.datafield)
    dev_data_b, test_data_b = TabularDataset.splits(
        path=data_path, validation="dev_b.csv", test="test_b.csv",
        format='csv', skip_header=True, fields=self.datafield)

    self.TEXT.build_vocab(train_data)
    if embedding:
        self.TEXT.vocab.load_vectors(embedding)
        self.embedding = self.TEXT.vocab.vectors.to(device)
    else:
        self.embedding = None

    # self.train_iter, self.val_iter = BucketIterator(train_data, batch_size=batch_size, device=device,
    #                                                 sort_key=lambda x: len(x.sentence), sort_within_batch=True)
    self.train_iter, self.dev_a_iter, self.test_a_iter, self.dev_b_iter, self.test_b_iter = \
        BucketIterator.splits((train_data, dev_data_a, test_data_a, dev_data_b, test_data_b),
                              batch_size=batch_size,
                              sort_within_batch=True,
                              sort_key=lambda x: len(x.sentence),
                              device=device)
def predict(self, input_text):
    input_fields = [('id', self.ID), ('text', self.TEXT), ('label', self.LABEL)]
    input_example = Example.fromlist([1, input_text, 1], input_fields)
    input_dataset = Dataset([input_example], input_fields)
    input_iter = BucketIterator(input_dataset, batch_size=1, device=self.device, repeat=False)

    pred, attn, prob = test_model(self.encoder, self.classifier, 1, input_iter)
    pred = pred[0][1]
    prob = prob[0][1]
    attn = attn[0].tolist()
    attn = [i[0] for i in attn]
    return pred, prob, attn
def get_iterator(datafields, data_dir, bs):
    trn, vld = TabularDataset.splits(
        path=data_dir,
        train='train.csv',
        validation="val.csv",
        format='csv',
        skip_header=True,
        fields=datafields)

    _, label_field = datafields[0]
    _, text_field = datafields[1]
    text_field.build_vocab(trn, vectors="glove.6B.100d")
    label_field.build_vocab(trn)

    train_iterator, valid_iterator = BucketIterator.splits(
        (trn, vld),
        batch_size=bs,
        sort_key=lambda x: len(x.text),
        device=device)

    return train_iterator, valid_iterator
def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None):
    TEXT = data.Field(lower=True, batch_first=True, include_lengths=True)
    LABEL = data.LabelField(batch_first=True)
    train_fields = [(None, None), (None, None), ('text', TEXT), ('label', LABEL)]
    test_fields = [(None, None), (None, None), ('text', TEXT)]

    train_data = data.TabularDataset.splits(
        path=data_path,
        train='train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True)[0]  # splits() returns a tuple
    test_data = data.TabularDataset.splits(
        path='data',
        train='test.tsv',
        format='tsv',
        fields=test_fields,
        skip_header=True)[0]

    TEXT.build_vocab(train_data.text, vectors=vectors)
    LABEL.build_vocab(train_data.label)
    train_data, dev_data = train_data.split([0.8, 0.2])

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True)
    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False,
                         shuffle=False)
    return train_iter, dev_iter, test_iter, TEXT, LABEL
def getdata():
    europarl_en = open('europarl-v7.cs-en.en', encoding='utf-8').read().split('\n')
    europarl_cs = open('europarl-v7.cs-en.cs', encoding='utf-8').read().split('\n')

    EN_TEXT = Field(tokenize=tokenize_en)
    CS_TEXT = Field(tokenize=tokenize_cs, init_token="<sos>", eos_token="<eos>")

    raw_data = {
        'English': [line for line in europarl_en],
        'Czech': [line for line in europarl_cs]
    }
    df = pd.DataFrame(raw_data, columns=["English", "Czech"])

    # remove very long sentences and sentence pairs whose lengths differ too much
    df['en_len'] = df['English'].str.count(' ')
    df['cs_len'] = df['Czech'].str.count(' ')
    df = df.query('cs_len < 80 & en_len < 80')
    df = df.query('cs_len < en_len * 1.5 & cs_len * 1.5 > en_len')

    train, val = train_test_split(df, test_size=0.1)
    train.to_csv("train.csv", index=False)
    val.to_csv("val.csv", index=False)

    data_fields = [('English', EN_TEXT), ('Czech', CS_TEXT)]
    train, val = TabularDataset.splits(path='./', train='train.csv', validation='val.csv',
                                       format='csv', fields=data_fields)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    EN_TEXT.build_vocab(train, min_freq=2)
    CS_TEXT.build_vocab(train, min_freq=2)

    BATCH_SIZE = 16
    INPUT_DIM = len(EN_TEXT.vocab)
    OUTPUT_DIM = len(CS_TEXT.vocab)
    PAD_IDX = EN_TEXT.vocab.stoi['<pad>']

    train_iterator, valid_iterator = BucketIterator.splits(
        (train, val), batch_size=BATCH_SIZE, device=device)

    return train_iterator, valid_iterator, INPUT_DIM, OUTPUT_DIM, PAD_IDX
def load_dataset(db_name, batch_size):
    """
    Load the csv datasets into torchtext files.

    Inputs:
        db_name (string): The name of the dataset. This name must correspond to the folder name.
        batch_size: The batch size.
    """
    print("Loading " + db_name + "...")
    i = 1
    print('num', i)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)

    tv_datafields = [("sentence", TEXT), ("label", LABEL)]
    trn, vld = TabularDataset.splits(
        path=DATA_ROOT + db_name,  # the root directory where the data lies
        train='train.csv',
        validation="test.csv",
        format='csv',
        skip_header=False,
        fields=tv_datafields)

    TEXT.build_vocab(trn)
    print("vocab size: %i" % len(TEXT.vocab))

    train_iter, val_iter = BucketIterator.splits(
        (trn, vld),
        batch_sizes=(batch_size, batch_size),
        device=-1,  # -1 means don't use the GPU
        sort_key=lambda x: len(x.sentence),  # sort the sentences by length
        sort_within_batch=False,
        repeat=False)

    return train_iter, val_iter, len(TEXT.vocab)
def get_iterators(opt, device=None):
    """
    Get dataset iterators and necessary fields information.

    :param opt: opt from argparser.
    :param device: device to create the data on.
    :return: train_iter, test_iter, dataset.fields
    """
    import random
    random.seed(42)

    dataset = load_dataset(opt.use_cws)
    dataset.fields["text"].build_vocab(dataset)
    dataset.fields["author"].build_vocab(dataset)
    dataset.fields["book"].build_vocab(dataset)

    train, test = dataset.split(split_ratio=0.7)
    train_iter, test_iter = BucketIterator.splits(
        (train, test),  # the first one defaults to the train set (shuffled each epoch)
        batch_sizes=(opt.train_batch_size, opt.eval_batch_size),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    return train_iter, test_iter, dataset.fields
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')

    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)

    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
def create_iterator(
    corpus: Corpus,
    batch_size: int = 1,
    device: str = "cpu",
    sort: Optional[bool] = None,
) -> BucketIterator:
    if sort is not None:
        sort_key: Optional[Callable] = lambda e: len(e.sen)
    else:
        sort_key = None

    iterator = BucketIterator(
        dataset=corpus,
        batch_size=batch_size,
        device=device,
        shuffle=False,
        sort=sort,
        sort_key=sort_key,
    )

    return iterator
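# Usage sketch (an assumption, not part of the original source): create_iterator above wraps
# a Corpus in a non-shuffling BucketIterator, with sorting only enabled when `sort` is given.
# The field name `sen` is inferred from the sort_key above; _example_corpus_batches is a
# hypothetical helper name.
def _example_corpus_batches(corpus: Corpus):
    it = create_iterator(corpus, batch_size=32, device="cpu", sort=True)
    for batch in it:
        sentences = batch.sen  # assumed field name, matching len(e.sen) in the sort_key
        break
    return sentences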
def data_iter(train_data, valid_data, TEXT, LABEL):
    train = MyDataset(train_data, text_field=TEXT, label_field=LABEL, test=False)
    valid = MyDataset(valid_data, text_field=TEXT, label_field=LABEL, test=False)
    TEXT.build_vocab(train)

    train_iter, val_iter = BucketIterator.splits(
        (train, valid),  # the datasets to build iterators for
        batch_sizes=(100, 100),
        device=device,  # if using a GPU, replace -1 here with the GPU id
        # the BucketIterator needs to be told what function it should use to group the data
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        repeat=False)
    return train_iter, val_iter
def load_dataset(batch_size, device):
    """
    Load the dataset from the files into iterators and initialize the vocabulary.

    :param batch_size:
    :param device:
    :return: the source field and the (train, valid, test) iterators
    """
    source = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

    train_data, valid_data, test_data = TranslationDataset.splits(
        path=DATA_FOLDER,
        exts=(POSITIVE_FILE_EXTENSION, NEGATIVE_FILE_EXTENSION),
        fields=(source, source))

    source.build_vocab(train_data, min_freq=5)

    return source, BucketIterator.splits((train_data, valid_data, test_data),
                                          shuffle=True,
                                          batch_size=batch_size,
                                          device=device)
def get_data_iterator(batch_size, device):
    SRC = Field(tokenize=tokenize_de,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)
    TRG = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True)

    train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        device=device)

    return train_iterator, valid_iterator, test_iterator, SRC, TRG
def load_dataset(batch_size, device=0):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    DE = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src)
    EN.build_vocab(train.trg)

    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, device=device, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
def load_dataset(batch_size, debug=True, shuffle_dataset=True):
    spacy_en = spacy.load('en')

    def tokenize_en(line):
        return [token.text for token in spacy_en.tokenizer(line)]

    def tokenize_zh(line):
        return [token for token in jieba.cut(line)]

    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    ZH = Field(tokenize=tokenize_zh, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')

    exts = ['.en', '.zh']
    fields = [('src', EN), ('trg', ZH)]
    train_dataset = TranslationDataset(train_sentence_path, exts=exts, fields=fields)
    val_dataset = TranslationDataset(val_sentence_path, exts=exts, fields=fields)
    print('Datasets Built!')

    EN.build_vocab(train_dataset.src, min_freq=2)
    ZH.build_vocab(train_dataset.trg, max_size=100000)
    print('Vocabularies Built!')

    val_iter, *_ = BucketIterator.splits(
        (val_dataset,),
        shuffle=shuffle_dataset,
        batch_size=batch_size,
        repeat=False,
        sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))
    print('Validation Iterator Built!')

    return val_iter, val_dataset, ZH, EN
# multi30k dataloader
train, val, test = datasets.Multi30k.splits(exts=(".en", ".de"), fields=(EN, DE), root=data_path)

# wmt14 dataloader (better than using datasets.WMT14.splits since it's slow)
# train, val, test = datasets.TranslationDataset.splits(exts=(".en", ".de"), fields=[("src", EN), ("trg", DE)],
#                                                       path=os.path.join(data_path, "wmt14"),
#                                                       train="train.tok.clean.bpe.32000",
#                                                       validation="newstest2013.tok.bpe.32000",
#                                                       test="newstest2014.tok.bpe.32000")

print("Dataset loaded")

EN.build_vocab(train.src, min_freq=3)
DE.build_vocab(train.trg, max_size=50000)
print("Vocabularies built")

train_iter, val_iter = BucketIterator.splits((train, val), batch_size=3)
test_iter = BucketIterator(test, batch_size=3)

print("Start iterating through data")
for i, batch in enumerate(train_iter):
    print(batch.src)  # the source language
    print(batch.trg)  # the target language
    break

for i, batch in enumerate(val_iter):
    print(batch.src)  # the source language
    print(batch.trg)  # the target language
    break

for i, batch in enumerate(test_iter):
    print(batch.src)  # the source language
    print(batch.trg)  # the target language
    break
    skip_header=True,
    fields=[('id', None), ('text', TEXT), ('label', LABEL)],
    filter_pred=lambda x: len(x.text) > 1)  # only keep examples whose token-level sentence length is greater than 1

TEXT.build_vocab(train_data, min_freq=2)
LABEL.build_vocab(train_data)

# print(TEXT.vocab)
# print(len(TEXT.vocab), len(LABEL.vocab))
# print(TEXT.vocab.itos[:5])
# print(LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits((train_data, test_data),
                                                  sort_key=lambda x: len(x.text),
                                                  sort_within_batch=True,
                                                  repeat=False,
                                                  shuffle=True,
                                                  batch_size=32,
                                                  device=DEVICE)

for batch in train_loader:
    break

EPOCH = 5
BATCH_SIZE = 32
EMBED = 300
KERNEL_SIZES = [3, 4, 5]
KERNEL_DIM = 100
LR = 0.001

# model = CNNClassifier(len(TEXT.vocab), EMBED, 1, KERNEL_DIM, KERNEL_SIZES)
model = RNN(len(TEXT.vocab), EMBED, KERNEL_DIM, 1, bidirec=False)