def _get_train_iter(
    self, shard_dataset: textdata.Dataset, batch_size: int, world_size: int = 1
) -> BatchIterator:
    """
    Generate a data batch iterator for training data. If distributed training
    is enabled, the dataset will be partitioned first. We use BucketIterator
    here to pool together examples of similar length, which reduces the
    padding required for each batch.

    Args:
        shard_dataset (textdata.Dataset): sharded training or evaluation dataset
        batch_size (int): batch size
        world_size (int): used for distributed training, total number of GPUs
    """
    # Compute the per-worker batch size; fall back to the full batch size if
    # the division would round down to zero.
    batch_size = batch_size // world_size or batch_size

    return BatchIterator(
        textdata.BucketIterator(
            shard_dataset,
            batch_size=batch_size,
            device="cuda:{}".format(torch.cuda.current_device())
            if cuda.CUDA_ENABLED
            else "cpu",
            sort_within_batch=self.sort_within_batch,
            repeat=False,
            sort_key=self.sort_key,
            shuffle=self.shuffle,
        ),
        self._postprocess_batch,
        num_batches=math.ceil(len(shard_dataset) / float(batch_size)),
    )
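# The `batch_size // world_size or batch_size` line above silently falls back
# to the global batch size when the per-worker share would round down to zero.
# A minimal standalone sketch of that behaviour (the helper name is
# illustrative, not part of the class above):
def per_worker_batch_size(batch_size: int, world_size: int = 1) -> int:
    # Integer division distributes the global batch across workers; if the
    # result is 0 (batch_size < world_size), keep the original batch size.
    return batch_size // world_size or batch_size

assert per_worker_batch_size(128, 4) == 32
assert per_worker_batch_size(2, 8) == 2  # falls back instead of returning 0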
def __init__(self, config, w2v_stoi, w2v_vectors, device):
    self.config = config
    self.w2v_stoi = w2v_stoi
    self.w2v_vectors = w2v_vectors
    print(self.w2v_vectors.shape)
    self.device = device

    self.SRC = data.Field(
        tokenize=lambda x: x.split(),
        unk_token='<unk>',
        pad_token='<pad>',
        lower=True,
        batch_first=True,
        include_lengths=True,
    )
    self.TRG = data.Field(
        tokenize=lambda x: x.split(),
        unk_token='<unk>',
        pad_token='<pad>',
        lower=True,
        batch_first=True,
    )

    self.train_data = TranslationDataset(
        path='dataset/klue-ner-v1_train_cleaned_tokenized',
        exts=('.src', '.trg'),
        fields=(self.SRC, self.TRG),
    )
    self.test_data = TranslationDataset(
        path='dataset/klue-ner-v1_dev_cleaned_tokenized',
        exts=('.src', '.trg'),
        fields=(self.SRC, self.TRG),
    )

    self.build_vocab()

    print('number of training data : {}'.format(len(self.train_data)))
    print('number of test data : {}'.format(len(self.test_data)))

    self.train_iterator = data.BucketIterator(
        self.train_data,
        batch_size=self.config['batch_size'],
        device=device,
        sort_key=lambda x: len(x.src),
        sort_within_batch=True,
    )
    self.test_iterator = data.BucketIterator(
        self.test_data,
        batch_size=self.config['batch_size'],
        device=device,
        sort_key=lambda x: len(x.src),
        sort_within_batch=True,
    )
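# self.build_vocab() is defined elsewhere in this class; a rough sketch of
# what such a method could do, assuming it builds both vocabularies from the
# training split and attaches the stored word2vec vectors to the source
# vocabulary (function name and min_freq are assumptions):
def build_vocab_sketch(data_module):
    data_module.SRC.build_vocab(data_module.train_data, min_freq=2)
    data_module.TRG.build_vocab(data_module.train_data, min_freq=2)
    # torchtext's Vocab.set_vectors(stoi, vectors, dim) copies pretrained
    # vectors into the vocabulary in vocab-index order.
    data_module.SRC.vocab.set_vectors(
        data_module.w2v_stoi,
        torch.as_tensor(data_module.w2v_vectors, dtype=torch.float),
        dim=data_module.w2v_vectors.shape[1],
    )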
def main():
    global WORD
    WORD = data.Field(include_lengths=True, batch_first=True, eos_token=None, init_token=None)
    LABEL = data.Field(sequential=False, batch_first=True)
    TREE = data.RawField(postprocessing=ListOpsDataset.tree_field(WORD))
    TREE.is_target = False

    train = ListOpsDataset(
        "data/train_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < config["train_len"],
    )
    WORD.build_vocab(train)
    LABEL.build_vocab(train)

    valid = ListOpsDataset(
        "data/test_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < 150,
    )

    train_iter = TokenBucket(train, batch_size=1500, device="cuda:0", key=lambda x: len(x.word))
    train_iter.repeat = False
    # Iterate over the validation split (the original passed `train` here,
    # which left `valid` unused).
    valid_iter = data.BucketIterator(valid, batch_size=50, train=False, sort=False, device="cuda:0")

    NT = 1
    T = len(WORD.vocab)
    V = T

    if True:
        tree_lstm = TreeLSTM(config["H"], len(WORD.vocab) + 100, len(LABEL.vocab)).cuda()
        for p in tree_lstm.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
        model = SpanLSTM(NT, len(WORD.vocab), config["H"]).cuda()
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
        wandb.watch((model, tree_lstm))
        print(wandb.config)
        tree = run_train(train_iter, valid_iter, model, tree_lstm, V)
    else:
        print("loading")
        model, tree_lstm = torch.load("cp.yoyo.model")
    print(valid_sup(valid_iter, model, tree_lstm, V))
def load_dataset(config, train_pos='train.hh', train_neg='train.fb',
                 dev_pos='dev.hh', dev_neg='dev.fb',
                 test_pos='test.hh', test_neg='test.fb'):
    logger = logging.getLogger(__name__)
    root = config.data_path

    TEXT = data.Field(batch_first=True, eos_token='<eos>')

    dataset_fn = lambda name: data.TabularDataset(
        path=root + name,
        format='tsv',
        fields=[('text', TEXT)],
    )

    train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()
        vectors = torchtext.vocab.GloVe('6B', dim=config.embed_size, cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        # Pass the size as a lazy formatting argument instead of an extra
        # positional (the original call raised a logging formatting error).
        logger.info('vectors %s', TEXT.vocab.vectors.size())
        logger.info('load embedding took {:.2f} s.'.format(time.time() - start))

    vocab = TEXT.vocab

    dataiter_fn = lambda dataset, train: data.BucketIterator(
        dataset=dataset,
        batch_size=config.batch_size,
        shuffle=train,
        repeat=train,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device=config.device,
    )

    train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True), [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False), [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False), [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)

    return train_iters, dev_iters, test_iters, vocab
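# DatasetIterator is defined elsewhere in this codebase; a minimal sketch of a
# paired iterator, under the assumption that it simply yields aligned
# (positive, negative) batches from the two underlying BucketIterators:
class DatasetIteratorSketch:
    def __init__(self, pos_iter, neg_iter):
        self.pos_iter = pos_iter
        self.neg_iter = neg_iter

    def __iter__(self):
        # Yield one batch from each iterator at a time; in this simplified
        # version the shorter side determines the epoch length.
        for pos_batch, neg_batch in zip(iter(self.pos_iter), iter(self.neg_iter)):
            yield pos_batch, neg_batch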
def make_data_iter(dataset: Dataset,
                   batch_size: int,
                   batch_type: str = "sentence",
                   train: bool = False,
                   shuffle: bool = False) -> Iterator:
    """
    Returns a torchtext iterator for a torchtext dataset.

    :param dataset: torchtext dataset containing src and optionally trg
    :param batch_size: size of the batches the iterator prepares
    :param batch_type: measure batch size by sentence count or by token count
    :param train: whether it's training time; when turned off, bucketing,
        sorting within batches and shuffling are disabled
    :param shuffle: whether to shuffle the data before each epoch
        (no effect when train is False)
    :return: torchtext iterator
    """
    batch_size_fn = token_batch_size_fn if batch_type == "token" else None

    if train:
        # optionally shuffle and sort during training
        data_iter = data.BucketIterator(
            repeat=False, sort=False, dataset=dataset,
            batch_size=batch_size, batch_size_fn=batch_size_fn,
            train=True, sort_within_batch=True,
            sort_key=lambda x: len(x.src), shuffle=shuffle)
    else:
        # don't sort/shuffle for validation/inference
        data_iter = data.BucketIterator(
            repeat=False, dataset=dataset,
            batch_size=batch_size, batch_size_fn=batch_size_fn,
            train=False, sort=False)

    return data_iter
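# token_batch_size_fn is defined elsewhere; with batch_type == "token" the
# BucketIterator sizes batches by token count rather than sentence count.
# A rough sketch of such a function using torchtext's
# batch_size_fn(new, count, size_so_far) convention; the closure-based state
# and the padded-token-count heuristic are assumptions, not the library's API:
def make_token_batch_size_fn():
    state = {"max_len": 0}

    def token_batch_size_fn(new, count, size_so_far):
        # BucketIterator cuts a batch once the returned value reaches
        # batch_size.  Here the effective size is the padded token count:
        # (number of examples so far) * (longest src seen so far).
        if count == 1:  # first example of a new batch
            state["max_len"] = 0
        state["max_len"] = max(state["max_len"], len(new.src))
        return count * state["max_len"]

    return token_batch_size_fn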
def train(self):
    """Reads .csv files for train and test sets and trains a Transformer
    architecture. Performs testing by calling the private _test() method and
    saves results with metrics in the runs folder."""
    data_path = "Datasets"
    data_file_names = ["train_dataset.csv.gz", "test_dataset.csv.gz"]

    for i, data_file_name in enumerate(data_file_names):
        data_file_name = os.path.join(data_path, data_file_name)
        uncompressed_data_file_name = ".".join(data_file_name.split(".")[:-1])
        if data_file_name.split(".")[-1] == "gz" and not os.path.exists(
            uncompressed_data_file_name
        ):
            print("Uncompressing data")
            with gzip.open(data_file_name, "rb") as f_in:
                with open(uncompressed_data_file_name, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
        data_file_names[i] = uncompressed_data_file_name.split("/")[1]

    print("Extracting datasets")
    train_val_dataset, test_dataset = data.TabularDataset.splits(
        data_path,
        train="train_dataset.csv",
        test="test_dataset.csv",
        fields=(("label", self.label_field), ("tweet", self.text_field)),
        format="csv",
        skip_header=True,
    )
    train_dataset, val_dataset = train_val_dataset.split(0.9)

    optimizer = torch.optim.Adam(lr=self.lr, params=self.model.parameters())
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda i: min(i / (self.lr_warmup / self.batch_size), 1.0),
    )

    print("Creating batch iterators")
    val_data_iter = data.BucketIterator(
        val_dataset,
        batch_size=self.test_batch_size,
        device=self.device,
        shuffle=True,
    )
    train_data_iter = data.BucketIterator(
        train_dataset,
        batch_size=self.batch_size,
        device=self.device,
        shuffle=True,
    )
    test_data_iter = data.BucketIterator(
        test_dataset,
        batch_size=self.test_batch_size,
        device=self.device,
        shuffle=True,
    )

    avg_val_loss, avg_val_rec = self._test(0, val_data_iter)
    print(
        f"AvgRec: {round(avg_val_rec, 4)},\tavg loss: {round(avg_val_loss, 6)},\tepoch: 0\n"
    )

    log_count = 0
    step_loss = 0
    tot_loss = 0
    for epoch in range(self.epochs):
        self.model.train()
        for batch in tqdm(train_data_iter):
            optimizer.zero_grad()
            label = batch.label - 2
            output = self.model(batch.tweet)
            loss = F.nll_loss(output, label[0])
            step_loss += loss.item()
            loss.backward()
            if self.gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.gradient_clipping
                )
            optimizer.step()
            scheduler.step()

            log_count += len(batch)
            if log_count >= self.log_step:
                step_avg_loss = step_loss / log_count
                tot_loss += step_loss
                print(f"Train loss: {round(step_avg_loss, 6)}\tEpoch: {epoch}")
                log_count = 0
                step_loss = 0.0

        torch.save(self.model.state_dict(), self.model_name)
        avg_train_loss = tot_loss / len(train_dataset)
        self.writer.add_scalar("train_avg_loss", avg_train_loss, global_step=epoch)
        print(f"Train avg loss: {round(avg_train_loss, 6)}")

        avg_val_loss, avg_val_rec = self._test(epoch + 1, val_data_iter)
        self.writer.add_scalar("avg_val_rec", avg_val_rec, global_step=epoch)
        self.writer.add_scalar("avg_val_loss", avg_val_loss, global_step=epoch)
        print(
            f"AvgRec: {round(avg_val_rec, 4)},\tavg loss: {round(avg_val_loss, 6)},\tepoch: {epoch}\n"
        )
        tot_loss = 0
        step_avg_loss = 0
        log_count = 0

    print("Performance on test dataset")
    test_avg_loss, test_avg_rec = self._test(-1, test_data_iter)
    test_avg_loss = round(test_avg_loss, 6)
    self.writer.add_text("results", f"test_avg_loss {test_avg_loss}")
    self.writer.add_text("results", f"test_avg_rec {test_avg_rec}")
    print(
        f"AvgRec: {round(test_avg_rec, 4)},\tavg loss: {test_avg_loss},\tepoch: {epoch}\n"
    )
    print(
        f"Best val AvgRec:{round(self.avg_rec[1], 3)} at epoch:{self.avg_rec[0]}"
    )
def caption_iterator(cfg, batch_size, phase):
    print(f'Constructing caption_iterator for "{phase}" phase')
    spacy_en = spacy.load('en')

    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(
        tokenize='spacy', init_token=cfg.start_token, eos_token=cfg.end_token,
        pad_token=cfg.pad_token, lower=True, batch_first=True, is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=cfg.train_meta_path, format='tsv', skip_header=True, fields=fields,
    )
    CAPTION.build_vocab(dataset.caption, min_freq=cfg.min_freq_caps, vectors=cfg.word_emb_caps)
    train_vocab = CAPTION.vocab

    if phase == 'val_1':
        dataset = data.TabularDataset(path=cfg.val_1_meta_path, format='tsv', skip_header=True, fields=fields)
    elif phase == 'val_2':
        dataset = data.TabularDataset(path=cfg.val_2_meta_path, format='tsv', skip_header=True, fields=fields)
    elif phase == 'learned_props':
        dataset = data.TabularDataset(path=cfg.val_prop_meta_path, format='tsv', skip_header=True, fields=fields)

    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(y.caption))
    datasetloader = data.BucketIterator(dataset, batch_size, sort_key=lambda x: 0,
                                        device=torch.device(cfg.device), repeat=False, shuffle=True)
    return train_vocab, datasetloader
def __init__(
    self,
    root,
    train_path,
    test_path,
    predict_path,
    batch_size=64,
    valid_ratio=.2,
    max_vocab=999999,
    min_freq=1,
    use_eos=False,
    shuffle=True,
    rm=re.compile('[:;\'\"\[\]\(\)\.,@]')  # special characters to strip
):
    super().__init__()
    # All preprocessing happens here.

    # Define the data fields.
    self.id = data.Field(  # column not used for training
        sequential=False,
        use_vocab=False,
        unk_token=None
    )
    self.text = data.Field(
        use_vocab=True,
        tokenize=word_tokenize,
        batch_first=True,
        include_lengths=False,
        eos_token='<EOS>' if use_eos else None
    )
    self.label = data.Field(
        sequential=False,  # 0 or 1
        use_vocab=False,
        unk_token=None,
        is_target=True
    )

    # Load the data.
    # ratings_train.txt : train + valid
    train, valid = data.TabularDataset(
        path=root + train_path,
        format='tsv',
        fields=[
            ('id', self.id),
            ('text', self.text),
            ('label', self.label)],
        skip_header=True
    ).split(split_ratio=(1 - valid_ratio))
    # ratings_test.txt : test
    test = data.TabularDataset(
        path=root + test_path,
        format='tsv',
        fields=[
            ('id', self.id),
            ('text', self.text),
            ('label', self.label)],
        skip_header=True
    )
    # ko_data.csv : Kaggle submission
    predict = data.TabularDataset(
        path=root + predict_path,
        format='csv',
        fields=[
            ('id', self.id),
            ('text', self.text)],
        skip_header=True
    )

    # Batchify (wrap in data loaders).
    # train + valid loaders
    self.train_loader, self.valid_loader = data.BucketIterator.splits(
        (train, valid),
        batch_size=batch_size,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        shuffle=shuffle,
        sort_key=lambda x: len(x.text),  # sort by length before forming batches
        sort_within_batch=True,          # sort within each mini-batch
    )
    # test loader
    self.test_loader = data.BucketIterator(
        test,
        batch_size=batch_size,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        shuffle=False,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
    )
    # predict loader
    self.predict_loader = data.BucketIterator(
        predict,
        batch_size=batch_size,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        shuffle=False
    )

    self.label.build_vocab(train)
    self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)  # build the vocabulary
def main():
    print("Using device: {}"
          "\n".format(str(device)))

    # Load the training dataset, and create a dataloader to generate a batch.
    textField = data.Field(lower=True, include_lengths=True, batch_first=True,
                           tokenize=student.tokenise,
                           preprocessing=student.preprocessing,
                           postprocessing=student.postprocessing,
                           stop_words=student.stopWords)
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset('train.json', 'json', {
        'reviewText': ('reviewText', textField),
        'rating': ('rating', labelField),
        'businessCategory': ('businessCategory', labelField)
    })

    textField.build_vocab(dataset, vectors=student.wordVectors)

    # Allow training on the entire dataset, or split it for training and validation.
    if student.trainValSplit == 1:
        trainLoader = data.BucketIterator(dataset, shuffle=True,
                                          batch_size=student.batchSize,
                                          sort_key=lambda x: len(x.reviewText),
                                          sort_within_batch=True)
    else:
        train, validate = dataset.split(split_ratio=student.trainValSplit)
        trainLoader, valLoader = data.BucketIterator.splits(
            (train, validate), shuffle=True, batch_size=student.batchSize,
            sort_key=lambda x: len(x.reviewText), sort_within_batch=True)

    # Get model and optimiser from student.
    net = student.net.to(device)
    lossFunc = student.lossFunc
    optimiser = student.optimiser

    # Train.
    for epoch in range(student.epochs):
        runningLoss = 0
        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
            length = batch.reviewText[1].to(device)
            rating = batch.rating.to(device)
            businessCategory = batch.businessCategory.to(device)

            # PyTorch calculates gradients by accumulating contributions to them
            # (useful for RNNs). Hence we must manually set them to zero before
            # calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            ratingOutput, categoryOutput = net(inputs, length)
            loss = lossFunc(ratingOutput, categoryOutput, rating, businessCategory)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            runningLoss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f"
                      % (epoch + 1, i + 1, runningLoss / 32))
                runningLoss = 0

    # Save model.
    torch.save(net.state_dict(), 'savedModel.pth')
    print("\n"
          "Model saved to savedModel.pth")

    # Test on validation data if it exists.
    if student.trainValSplit != 1:
        net.eval()

        correctRatingOnlySum = 0
        correctCategoryOnlySum = 0
        bothCorrectSum = 0
        with torch.no_grad():
            for batch in valLoader:
                # Get a batch and potentially send it to GPU memory.
                inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
                length = batch.reviewText[1].to(device)
                rating = batch.rating.to(device)
                businessCategory = batch.businessCategory.to(device)

                # Convert network output to integer values.
                ratingOutputs, categoryOutputs = student.convertNetOutput(
                    *net(inputs, length))

                # Calculate performance.
                correctRating = rating == ratingOutputs.flatten()
                correctCategory = businessCategory == categoryOutputs.flatten()

                correctRatingOnlySum += torch.sum(correctRating & ~correctCategory).item()
                correctCategoryOnlySum += torch.sum(correctCategory & ~correctRating).item()
                bothCorrectSum += torch.sum(correctRating & correctCategory).item()

        correctRatingOnlyPercent = correctRatingOnlySum / len(validate)
        correctCategoryOnlyPercent = correctCategoryOnlySum / len(validate)
        bothCorrectPercent = bothCorrectSum / len(validate)
        neitherCorrectPer = 1 - correctRatingOnlyPercent \
                              - correctCategoryOnlyPercent \
                              - bothCorrectPercent

        score = 100 * (bothCorrectPercent
                       + 0.5 * correctCategoryOnlyPercent
                       + 0.1 * correctRatingOnlyPercent)

        print("\n"
              "Rating incorrect, business category incorrect: {:.2%}\n"
              "Rating correct, business category incorrect: {:.2%}\n"
              "Rating incorrect, business category correct: {:.2%}\n"
              "Rating correct, business category correct: {:.2%}\n"
              "\n"
              "Weighted score: {:.2f}".format(neitherCorrectPer,
                                              correctRatingOnlyPercent,
                                              correctCategoryOnlyPercent,
                                              bothCorrectPercent, score))
train_fileds = [("text", TEXT), ("label", LABEL)] train_data = data.TabularDataset(path=r"./imdb_data.csv", format='csv', skip_header=True, fields=train_fileds) train_data_real, val_data_real = train_data.split(split_ratio=0.7) vec = Vectors("glove.6B.100d.txt", "./Emotion") # 将训练集转换为词向量 TEXT.build_vocab(train_data_real, max_size=20000, vectors=vec) LABEL.build_vocab(train_data_real) # print(TEXT.vocab.freqs.most_common(n=10)) # print("类别标签情况: ", LABEL.vocab.freqs) # print("词典个数: ", len(TEXT.vocab.itos)) # 定义加载器 train_iter = data.BucketIterator(train_data_real, batch_size=BATCH_SIZE) val_iter = data.BucketIterator(val_data_real, batch_size=BATCH_SIZE) INPUT_DIM = len(TEXT.vocab) # 词典数量 PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] model = TextCNN(INPUT_DIM, EMBEDDING_DIM, N_FILITERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX) # 将导入的词向量作为embedding.weight的初值 pretrained_embedding = TEXT.vocab.vectors model.embedding.weight.data.copy_(pretrained_embedding) UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)