def get_dataset(self, dataset, n_workers=4, dataset_args=None):
    """Load data and return a Dataset object for training or validation.

    Args:
        dataset (list): Samples to preprocess.
        n_workers (int): Number of worker processes for preprocessing.
        dataset_args (dict): Extra keyword arguments passed to CorpusDataset.
    """
    if dataset_args is None:  # avoid a mutable default argument
        dataset_args = {}
    self.logging.info('preprocessing data...')
    results = [None] * n_workers
    with Pool(processes=n_workers) as pool:
        for i in range(n_workers):
            # Split the samples into roughly equal slices, one per worker;
            # the last worker picks up the remainder.
            batch_start = (len(dataset) // n_workers) * i
            if i == n_workers - 1:
                batch_end = len(dataset)
            else:
                batch_end = (len(dataset) // n_workers) * (i + 1)
            batch = dataset[batch_start:batch_end]
            results[i] = pool.apply_async(self.preprocess_samples, [batch])
        pool.close()
        pool.join()
    processed = []
    for result in results:
        processed += result.get()
    padding = self.words_dict["<PAD>"]
    sp_tag = [self.words_dict["<SOS>"], self.words_dict["<EOS>"]]
    return CorpusDataset(processed, padding=padding, sp_tag=sp_tag, **dataset_args)
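# `preprocess_samples` (dispatched to the workers above) is not shown in this
# excerpt. A minimal sketch of what such a method might look like on the same
# class, assuming samples are whitespace-tokenized strings and that words_dict
# carries an "<UNK>" entry; both are assumptions, not the project's actual
# implementation.
def preprocess_samples(self, batch):
    processed = []
    for sample in batch:
        # Map each token to its vocabulary id, falling back to <UNK>.
        ids = [self.words_dict.get(word, self.words_dict["<UNK>"])
               for word in sample.split()]
        processed.append(ids)
    return processed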
def main():
    random.seed(420)
    parser = argparse.ArgumentParser(
        description='Evaluate accuracy of trained model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--model', default='checkpoint.pt',
                        help='model to use')
    parser.add_argument('--data', default='corpus.pt',
                        help='preprocessed data file')
    parser.add_argument(
        '--device',
        default='cuda:0' if torch.cuda.is_available() else 'cpu',
        help='device to use')
    parser.add_argument('--batch', default=64, type=int, help='batch size')
    args = parser.parse_args()

    cp = CorpusPreprocessor()
    cp.load(args.data)
    net = Net(len(cp.alphabet), cp.max_sentence_length, cp.max_word_length)
    net.load_state_dict(torch.load(args.model, map_location=args.device))
    net.to(args.device)

    _, testset = CorpusDataset.split(cp, 0.8)
    testloader = DataLoader(testset, batch_size=args.batch, num_workers=4)
    accuracy = evaluate(net, args.device, testloader)
    print('Model accuracy: {}'.format(accuracy))
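# `evaluate` is not defined in this excerpt. A minimal sketch, assuming the
# loader yields (inputs, labels) batches and the net returns class logits;
# the project's real helper may differ.
def evaluate(net, device, loader):
    net.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = net(inputs).argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.numel()
    return correct / total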
def get_dataloader(data_path: str, transform: Callable[[List, List], Tuple],
                   batch_size: int) -> DataLoader:
    """Create a dataloader.

    Args:
        data_path: Path to the dataset.
        transform: Function that converts raw samples into input features.
        batch_size: Dataloader batch size.

    Returns:
        dataloader
    """
    dataset = CorpusDataset(data_path, transform)
    print(dataset[0])  # sanity check: inspect the first transformed sample
    dataloader = DataLoader(dataset, batch_size=batch_size)
    return dataloader
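# Hypothetical usage of get_dataloader above. `encode_fn` and the data path
# are placeholders, not part of the original code, and the batch layout
# assumes the dataset yields (features, labels) pairs.
def encode_fn(tokens, labels):
    # Convert raw tokens/labels into model-ready tensors (illustrative only).
    return torch.tensor(tokens), torch.tensor(labels)

loader = get_dataloader("data/train.txt", transform=encode_fn, batch_size=32)
for features, labels in loader:
    break  # take one batch to verify shapes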
print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Datasets ###########################

if "dataset" not in config:
    config["dataset"] = None

tokenizer = RobertaTokenizerFast.from_pretrained(
    '/code/roberta-base', model_max_length=model_config["max_seq_len"])
tokenizer.model_max_length = model_config["max_seq_len"]
tokenizer.init_kwargs['model_max_length'] = model_config["max_seq_len"]
model_config["vocab_size"] = len(tokenizer.get_vocab())

if not args.use_data:
    dataset = CorpusDataset(folder_path=data_folder, file_json="train.json",
                            option=config["dataset"])
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    data_loader = DataLoader(dataset,
                             batch_size=pretraining_config["batch_size"],
                             shuffle=True, collate_fn=data_collator)
    pretrain_dataloader_iter = enumerate(data_loader)

########################### Loading Model ###########################

model = ModelForMaskedLM(model_config)
print(model)
def get_dataloader(data_path, transform, batch_size):
    dataset = CorpusDataset(data_path, transform)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    return dataloader
device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")
print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Dataset ###########################

tokenizer = utils.get_tokenizer(model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

if "dataset" not in config:
    config["dataset"] = None

dataset = CorpusDataset(folder_path=data_folder, file_json="dev.json",
                        files_per_batch=128, option=config["dataset"])
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
data_loader = DataLoader(dataset,
                         batch_size=pretraining_config["batch_size"],
                         collate_fn=data_collator)
pretrain_dataloader_iter = enumerate(data_loader)

########################### Loading Model ###########################

model = ModelForMaskedLM(model_config)
print(model)
model = model.cuda()
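# The enumerate-wrapped loader above is typically consumed step by step in the
# pretraining loop. A minimal sketch, assuming the model exposes an HF-style
# output with a .loss attribute and that pretraining_config has a "num_steps"
# entry; both are assumptions, not confirmed by this excerpt.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
for step, batch in pretrain_dataloader_iter:
    batch = {k: v.cuda() for k, v in batch.items()}  # move tensors to GPU
    loss = model(**batch).loss                       # masked-LM loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if step + 1 >= pretraining_config["num_steps"]:
        break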