def load_tester(
        config: Dict[str, Dict[str, Union[str, int]]],
        args  # argparse.Namespace
) -> Tuple[Any, Any, Any]:
    # build model architecture first
    if config["arguments"]["model_name"] == "CNN":
        model = CNN(d_emb=config["arguments"]["d_emb"],
                    embeddings=config["arguments"]["vocab_size"],
                    kernel_widths=config["params"]["KernelWidths"],
                    n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "LSTM":
        model = SelfAttentionLSTM(d_emb=config["arguments"]["d_emb"],
                                  d_hid=config["arguments"]["d_hid"],
                                  embeddings=config["arguments"]["vocab_size"],
                                  n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "Transformer":
        model = TransformerEncoder(
            d_emb=config["arguments"]["d_emb"],
            embeddings=config["arguments"]["vocab_size"],
            max_seq_len=config["arguments"]["max_seq_len"],
            n_class=config["arguments"]["n_class"])
    else:
        raise KeyError(
            f'Unknown model name: {config["arguments"]["model_name"]}')

    # setup device
    if args.gpu and torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
        device = torch.device(f'cuda:{args.gpu}')
    else:
        device = torch.device('cpu')

    # load state dict
    state_dict = torch.load(args.model, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)

    # setup data_loader instance for evaluation
    path = "debug" if args.debug else "documents"
    word_to_id = load_vocabulary(config[path]["vocabulary"])
    test_data_loader = MyDataLoader(
        config[path]["test"],
        config[path]["labels"],
        config["arguments"]["delimiter"],
        word_to_id,
        config["arguments"]["max_seq_len"],
        batch_size=config["arguments"]["batch_size"],
        shuffle=True,
        num_workers=2)

    return model, device, test_data_loader
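# A minimal usage sketch for load_tester, assuming the config dict is read from a
# JSON file and the command-line flags (--config, --model, --gpu, --debug) mirror the
# attributes accessed above; the file name "config.json" and the parser below are
# illustrative assumptions, not part of the original code.
import argparse
import json


def _parse_test_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config.json", help="path to the config file (assumed name)")
    parser.add_argument("--model", required=True, help="path to a saved state_dict")
    parser.add_argument("--gpu", default=None, help="GPU id, e.g. '0'; CPU is used if omitted")
    parser.add_argument("--debug", action="store_true", help="use the small 'debug' data section")
    return parser.parse_args()


if __name__ == "__main__":
    _args = _parse_test_args()
    with open(_args.config) as f:
        _config = json.load(f)
    model, device, test_data_loader = load_tester(_config, _args)
    model.eval()  # switch off dropout etc. before evaluation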
def run(config):
    def _print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    _print_config(config)

    # remove any handlers left over from a previous run
    for handler in logging.getLogger().handlers[:]:  # iterate over a copy of the list
        logging.getLogger().removeHandler(handler)

    if not config.save_path and config.dict_path:
        all_subdir = [
            int(s) for s in os.listdir(config.dict_path)
            if os.path.isdir(os.path.join(config.dict_path, str(s)))
        ]
        max_dir_num = 0
        if all_subdir:
            max_dir_num = max(all_subdir)
        max_dir_num += 1
        config.save_path = os.path.join(config.dict_path, str(max_dir_num))
        os.mkdir(config.save_path)

    logging.basicConfig(filename=os.path.join(config.save_path, 'train_log'),
                        level=tools.LOGFILE_LEVEL,
                        filemode='w')
    console = logging.StreamHandler()
    console.setLevel(tools.CONSOLE_LEVEL)
    logging.getLogger().addHandler(console)

    logging.info("##################### Start Training")
    logging.debug(vars(config))

    # load data loader
    logging.info("##################### Load DataLoader")
    loader = MyDataLoader(train_path=config.train_path,
                          valid_path=config.valid_path,
                          dict_path=config.dict_path,
                          batch_size=config.batch_size,
                          max_sent_len=config.max_sent_len,
                          max_svo_len=config.max_svo_len)
    train, valid, label_list = loader.get_train_valid()
    num_class = len(label_list)
    logging.info("##################### Train Dataset size : [" + str(len(train)) + "]")
    logging.info("##################### Valid Dataset size : [" + str(len(valid)) + "]")
    logging.info("##################### class size : [" + str(num_class) + "]")
    config.num_class = num_class
    config.class_info = label_list

    dict_size = loader.get_dict_size()
    word_vec_dim = loader.get_dict_vec_dim()
    embedding = loader.get_embedding()

    logging.info("##################### Load 'NTN attention' Model")
    model = DocumentNTN(dictionary_size=dict_size,
                        embedding_size=word_vec_dim,
                        tensor_dim=config.tensor_dim,
                        num_class=config.num_class,
                        hidden_size=config.hidden_size,
                        attention_size=config.atten_size,
                        n_layers=config.n_layers,
                        dropout_p=config.dropout_p,
                        device=config.device)
    model.set_embedding(embedding)
    model.to(config.device)

    crit = nn.NLLLoss()
    trainer = Trainer(model=model, crit=crit, config=config, device=config.device)
    history = trainer.train(train, valid)

    return history
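# Note: nn.NLLLoss expects log-probabilities, so DocumentNTN's forward pass is
# assumed to end with a log_softmax over the class dimension. The minimal sketch
# below shows the pairing; the tensor shapes are illustrative only.
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 3)                 # (batch, num_class) raw scores
log_probs = F.log_softmax(logits, dim=-1)  # what the model is assumed to return
target = torch.tensor([0, 2, 1, 2])        # gold class indices
loss = nn.NLLLoss()(log_probs, target)     # equivalent to CrossEntropyLoss applied to the raw logits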
def load_setting(
        config: Dict[str, Dict[str, Union[str, int]]],
        args  # argparse.Namespace
) -> Tuple[Any, Any, Any, Any, Any]:
    torch.manual_seed(config["arguments"]["seed"])
    path = "debug" if args.debug else "documents"
    word_to_id = load_vocabulary(config[path]["vocabulary"])
    w2v = KeyedVectors.load_word2vec_format(config[path]["w2v"], binary=True)
    embeddings = ids_to_embeddings(word_to_id, w2v)
    config["arguments"]["vocab_size"] = len(embeddings)

    if config["arguments"]["model_name"] == "CNN":
        model = CNN(d_emb=config["arguments"]["d_emb"],
                    embeddings=embeddings,
                    kernel_widths=[1, 3, 5],
                    n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "LSTM":
        model = SelfAttentionLSTM(d_emb=config["arguments"]["d_emb"],
                                  d_hid=config["arguments"]["d_hid"],
                                  embeddings=embeddings,
                                  n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "Transformer":
        model = TransformerEncoder(
            d_emb=config["arguments"]["d_emb"],
            embeddings=embeddings,
            max_seq_len=config["arguments"]["max_seq_len"],
            n_class=config["arguments"]["n_class"])
    else:
        raise KeyError(
            f'Unknown model name: {config["arguments"]["model_name"]}')

    # setup device
    if args.gpu and torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
        device = torch.device(f'cuda:{args.gpu}')
    else:
        device = torch.device('cpu')
    model.to(device)

    # setup data_loader instances
    train_data_loader = MyDataLoader(
        config[path]["train"],
        config[path]["labels"],
        config["arguments"]["delimiter"],
        word_to_id,
        config["arguments"]["max_seq_len"],
        batch_size=config["arguments"]["batch_size"],
        shuffle=True,
        num_workers=2)
    valid_data_loader = MyDataLoader(
        config[path]["valid"],
        config[path]["labels"],
        config["arguments"]["delimiter"],
        word_to_id,
        config["arguments"]["max_seq_len"],
        batch_size=config["arguments"]["batch_size"],
        shuffle=False,
        num_workers=2)

    # build optimizer
    if config["arguments"]["model_name"] == "Transformer":
        # filter(lambda x: x.requires_grad, model.parameters()) extracts only the parameters to be updated
        optimizer = ScheduledOptimizer(
            torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                             betas=(0.9, 0.98),
                             eps=1e-09),
            config["arguments"]["d_emb"],
            warmup_steps=4000)
    else:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config["arguments"]["learning_rate"])

    return model, device, train_data_loader, valid_data_loader, optimizer
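# ScheduledOptimizer is not defined in this snippet; given that it receives d_emb and
# warmup_steps=4000, it is assumed to implement the Noam learning-rate schedule from
# "Attention Is All You Need". A minimal sketch under that assumption (the class name
# NoamSchedule is hypothetical):
class NoamSchedule:
    """Wraps an optimizer and sets its learning rate to
    d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)."""

    def __init__(self, optimizer, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        self.step_num = 0

    def step(self):
        self.step_num += 1
        lr = self.d_model ** -0.5 * min(self.step_num ** -0.5,
                                        self.step_num * self.warmup_steps ** -1.5)
        for group in self.optimizer.param_groups:
            group["lr"] = lr
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()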
hidden_size = 512  # number of features in the hidden state of the RNN decoder
num_epochs = 10    # number of training epochs

# Define a transform to pre-process the training images
transform_train = transforms.Compose([
    transforms.Resize(256),             # smaller edge of image resized to 256
    transforms.RandomCrop(224),         # get 224x224 crop from a random location
    transforms.RandomHorizontalFlip(),  # horizontally flip image with probability 0.5
    transforms.ToTensor(),              # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),   # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build the training data loader, applying the transforms
train_loader = MyDataLoader(transform=transform_train,
                            mode='train',
                            batch_size=batch_size,
                            vocab_threshold=vocab_threshold,
                            vocab_from_file=vocab_from_file)

# Validation images are only resized and center-cropped (no augmentation)
transform_val = transforms.Compose([
    transforms.Resize(256),             # smaller edge of image resized to 256
    transforms.CenterCrop(224),         # get 224x224 crop from the center
    transforms.ToTensor(),              # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),   # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

val_loader = MyDataLoader(transform=transform_val,
                          mode='val',
                          batch_size=batch_size,
                          vocab_threshold=vocab_threshold,
                          vocab_from_file=True)
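# The Normalize mean/std above are the standard ImageNet statistics expected by
# torchvision's pre-trained CNN encoders. The quick sanity check below assumes
# MyDataLoader yields (images, captions) batches, which is not shown in this
# snippet and may differ in the actual implementation.
images, captions = next(iter(train_loader))
print(images.shape)    # expected: (batch_size, 3, 224, 224) after the 224x224 crop
print(captions.shape)  # expected: (batch_size, max_caption_length) of token ids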
def run(config):
    def _print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    _print_config(config)

    # remove any handlers left over from a previous run
    for handler in logging.getLogger().handlers[:]:  # iterate over a copy of the list
        logging.getLogger().removeHandler(handler)

    if not os.path.isdir(config.save_path):
        os.mkdir(config.save_path)
    all_subdir = [
        int(s) for s in os.listdir(config.save_path)
        if os.path.isdir(os.path.join(config.save_path, str(s)))
    ]
    max_dir_num = 0
    if all_subdir:
        max_dir_num = max(all_subdir)
    max_dir_num += 1
    config.save_path = os.path.join(config.save_path, str(max_dir_num))
    os.mkdir(config.save_path)

    logging.basicConfig(filename=os.path.join(config.save_path, 'train_log'),
                        level=tools.LOGFILE_LEVEL,
                        filemode='w')
    console = logging.StreamHandler()
    console.setLevel(tools.CONSOLE_LEVEL)
    logging.getLogger().addHandler(console)

    logging.info("##################### Start Training")
    logging.debug(vars(config))

    logging.info("##################### Start Load BERT MODEL")
    if config.bert_name == 'kobert':
        from kobert_modified_utills import get_kobert_model_and_tokenizer
        bert, tokenizer = get_kobert_model_and_tokenizer()
    else:
        tokenizer = BertTokenizer.from_pretrained(config.bert_name)
        bert = BertModel.from_pretrained(config.bert_name)
    bert.to(config.device)

    # load data loader
    logging.info("##################### Load DataLoader")
    loader = MyDataLoader(train_path=config.train_path,
                          valid_path=config.valid_path,
                          max_length=config.max_length,
                          tokenizer=tokenizer)
    train, valid, num_class = loader.get_train_valid_data()
    logging.info("##################### Train Dataset size : [" + str(len(train)) + "]")
    logging.info("##################### Valid Dataset size : [" + str(len(valid)) + "]")
    logging.info("##################### class size : [" + str(num_class) + "]")

    # adjust batch size for gradient accumulation
    logging.info("##################### Accumulation batch size : [" + str(config.batch_size) + "]")
    config.batch_size = config.batch_size // config.gradient_accumulation_steps
    logging.info("##################### Modified batch size : [" + str(config.batch_size) + "]")

    logging.info("##################### Load 'BERT Classifier' Model")
    model = MyClassifier(bert=bert,
                         num_class=num_class,
                         bert_finetuning=config.bert_finetuning,
                         dropout_p=config.dropout_p,
                         device=config.device)
    model.to(config.device)

    crit = nn.NLLLoss()
    trainer = Trainer(model=model,
                      crit=crit,
                      config=config,
                      boost=config.boost,
                      device=config.device)

    # If BERT fine-tuning is not required, pre-compute BERT vectors for the whole
    # dataset once so the rest of training runs faster
    if config.boost and not config.bert_finetuning:
        logging.info("##################### Transform Dataset into Vectors by using BERT")
        train = loader.convert_ids_to_vector(data=train,
                                             model=model,
                                             batch_size=config.batch_size,
                                             device=config.device)
        valid = loader.convert_ids_to_vector(data=valid,
                                             model=model,
                                             batch_size=config.batch_size,
                                             device=config.device)
        train = DataLoader(dataset=train, batch_size=config.batch_size, shuffle=True)
        valid = DataLoader(dataset=valid, batch_size=config.batch_size, shuffle=True)

    history = trainer.train(train, valid)

    return history
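# config.batch_size is divided by gradient_accumulation_steps above, so the effective
# batch size is recovered by accumulating gradients over several micro-batches before
# each optimizer step. Trainer is assumed to do something along these lines; the
# helper below is an illustrative sketch, not the actual Trainer code.
def accumulation_step(model, crit, optimizer, batches, accumulation_steps):
    optimizer.zero_grad()
    for i, (x, y) in enumerate(batches):
        loss = crit(model(x), y) / accumulation_steps  # scale so the summed gradient matches a full batch
        loss.backward()                                # gradients accumulate across micro-batches
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()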
from data_loader import MyDataLoader
from data_looper import MyDataLooper
from torch_utils import save_model, load_model

if __name__ == "__main__":
    args = get_args()
    set_seed(args.seed)

    os.makedirs("logzero", exist_ok=True)
    logzero.loglevel(20)
    logzero.logfile(os.path.join("logzero", args.timestamp + ".txt"), loglevel=20)
    logzero.logger.info("args: " + str(args))

    model = SSM(args)
    train_loader = MyDataLoader("train", args)
    test_loader = MyDataLoader("test", args)
    train_looper = MyDataLooper(model, train_loader, args)
    test_looper = MyDataLooper(model, test_loader, args)

    # resume from a saved checkpoint if --load_epoch is given
    if args.load_epoch:
        resume_epoch = args.load_epoch + 1
        load_model(model, args.load_epoch)
    else:
        resume_epoch = 1

    for epoch in range(resume_epoch, args.epochs + 1):
        train_looper(epoch)
        test_looper(epoch)
        if epoch % 10 == 0:
            # checkpoint every 10 epochs (assumed: save_model mirrors load_model's signature)
            save_model(model, epoch)
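# save_model/load_model are imported from torch_utils but not shown here. A minimal
# sketch consistent with the call sites above (load_model(model, epoch)); the
# "checkpoints" directory name and the "epoch_<N>.pt" file pattern are assumptions.
import os
import torch


def save_model(model, epoch, ckpt_dir="checkpoints"):
    os.makedirs(ckpt_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(ckpt_dir, f"epoch_{epoch}.pt"))


def load_model(model, epoch, ckpt_dir="checkpoints"):
    state_dict = torch.load(os.path.join(ckpt_dir, f"epoch_{epoch}.pt"),
                            map_location="cpu")
    model.load_state_dict(state_dict)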