def load_tester(
        config: Dict[str, Dict[str, Union[str, int]]],
        args  # argparse.Namespace
) -> Tuple[Any, Any, Any]:
    # build model architecture first
    if config["arguments"]["model_name"] == "CNN":
        model = CNN(d_emb=config["arguments"]["d_emb"],
                    embeddings=config["arguments"]["vocab_size"],
                    kernel_widths=config["params"]["KernelWidths"],
                    n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "LSTM":
        model = SelfAttentionLSTM(d_emb=config["arguments"]["d_emb"],
                                  d_hid=config["arguments"]["d_hid"],
                                  embeddings=config["arguments"]["vocab_size"],
                                  n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "Transformer":
        model = TransformerEncoder(
            d_emb=config["arguments"]["d_emb"],
            embeddings=config["arguments"]["vocab_size"],
            max_seq_len=config["arguments"]["max_seq_len"],
            n_class=config["arguments"]["n_class"])
    else:
        raise KeyError(
            f'Unknown model name: {config["arguments"]["model_name"]}')

    # setup device
    if args.gpu and torch.cuda.is_available():
        # once CUDA_VISIBLE_DEVICES is restricted to args.gpu, that GPU is index 0
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    # load state dict
    state_dict = torch.load(args.model, map_location=device)
    model.load_state_dict(state_dict)

    model.to(device)

    # setup data_loader instances
    path = "debug" if args.debug else "documents"
    word_to_id = load_vocabulary(config[path]["vocabulary"])

    test_data_loader = MyDataLoader(
        config[path]["test"],
        config[path]["labels"],
        config["arguments"]["delimiter"],
        word_to_id,
        config["arguments"]["max_seq_len"],
        batch_size=config["arguments"]["batch_size"],
        shuffle=False,  # no need to shuffle the test set
        num_workers=2)

    return model, device, test_data_loader
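
# A minimal evaluation sketch (not part of the original example), assuming each
# batch from test_data_loader is an (inputs, labels) pair of tensors and that
# the model returns one row of class logits per example.
def evaluate(config, args):
    model, device, test_data_loader = load_tester(config, args)
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, labels in test_data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            predictions = model(inputs).argmax(dim=-1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    return correct / total
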
def run(config):
    def _print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    _print_config(config)

    # remove any handlers left over from a previous run; the root logger
    # always exists, so no None check is needed
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:  # iterate over a copy of the list
        root_logger.removeHandler(handler)

    if not config.save_path and config.dict_path:
        # continue numbering from the highest existing run sub-directory
        all_subdir = [
            int(s) for s in os.listdir(config.dict_path)
            if s.isdigit() and os.path.isdir(os.path.join(config.dict_path, s))
        ]
        max_dir_num = max(all_subdir) if all_subdir else 0
        max_dir_num += 1
        config.save_path = os.path.join(config.dict_path, str(max_dir_num))
        os.mkdir(config.save_path)
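        # For illustration (not in the original code): if dict_path already
        # contains sub-directories "1", "2" and "3", this run is saved under
        # dict_path/4, so every run gets its own numbered directory.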

    logging.basicConfig(filename=os.path.join(config.save_path, 'train_log'),
                        level=tools.LOGFILE_LEVEL,
                        filemode='w')

    console = logging.StreamHandler()
    console.setLevel(tools.CONSOLE_LEVEL)
    logging.getLogger().addHandler(console)

    logging.info("##################### Start Training")
    logging.debug(vars(config))

    # load data loader
    logging.info("##################### Load DataLoader")
    loader = MyDataLoader(train_path=config.train_path,
                          valid_path=config.valid_path,
                          dict_path=config.dict_path,
                          batch_size=config.batch_size,
                          max_sent_len=config.max_sent_len,
                          max_svo_len=config.max_svo_len)

    train, valid, label_list = loader.get_train_valid()
    num_class = len(label_list)
    logging.info("##################### Train Dataset size : [" +
                 str(len(train)) + "]")
    logging.info("##################### Valid Dataset size : [" +
                 str(len(valid)) + "]")
    logging.info("##################### class size : [" + str(num_class) + "]")

    config.__setattr__("num_class", num_class)
    config.__setattr__("class_info", label_list)

    dict_size = loader.get_dict_size()
    word_vec_dim = loader.get_dict_vec_dim()
    embedding = loader.get_embedding()

    logging.info("##################### Load 'NTN attention' Model")
    model = DocumentNTN(dictionary_size=dict_size,
                        embedding_size=word_vec_dim,
                        tensor_dim=config.tensor_dim,
                        num_class=config.num_class,
                        hidden_size=config.hidden_size,
                        attention_size=config.atten_size,
                        n_layers=config.n_layers,
                        dropout_p=config.dropout_p,
                        device=config.device)

    model.set_embedding(embedding)
    model.to(config.device)

    crit = nn.NLLLoss()
    trainer = Trainer(model=model,
                      crit=crit,
                      config=config,
                      device=config.device)
    history = trainer.train(train, valid)
    return history
def load_setting(
        config: Dict[str, Dict[str, Union[str, int]]],
        args  # argparse.Namespace
) -> Tuple[Any, Any, Any, Any, Any]:
    torch.manual_seed(config["arguments"]["seed"])

    path = "debug" if args.debug else "documents"
    word_to_id = load_vocabulary(config[path]["vocabulary"])
    w2v = KeyedVectors.load_word2vec_format(config[path]["w2v"], binary=True)
    embeddings = ids_to_embeddings(word_to_id, w2v)
    config["arguments"]["vocab_size"] = len(embeddings)

    if config["arguments"]["model_name"] == "CNN":
        model = CNN(d_emb=config["arguments"]["d_emb"],
                    embeddings=embeddings,
                    kernel_widths=[1, 3, 5],
                    n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "LSTM":
        model = SelfAttentionLSTM(d_emb=config["arguments"]["d_emb"],
                                  d_hid=config["arguments"]["d_hid"],
                                  embeddings=embeddings,
                                  n_class=config["arguments"]["n_class"])
    elif config["arguments"]["model_name"] == "Transformer":
        model = TransformerEncoder(
            d_emb=config["arguments"]["d_emb"],
            embeddings=embeddings,
            max_seq_len=config["arguments"]["max_seq_len"],
            n_class=config["arguments"]["n_class"])
    else:
        raise KeyError(
            f'Unknown model name: {config["arguments"]["model_name"]}')

    # setup device
    if args.gpu and torch.cuda.is_available():
        # once CUDA_VISIBLE_DEVICES is restricted to args.gpu, that GPU is index 0
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    model.to(device)

    # setup data_loader instances
    train_data_loader = MyDataLoader(
        config[path]["train"],
        config[path]["labels"],
        config["arguments"]["delimiter"],
        word_to_id,
        config["arguments"]["max_seq_len"],
        batch_size=config["arguments"]["batch_size"],
        shuffle=True,
        num_workers=2)
    valid_data_loader = MyDataLoader(
        config[path]["valid"],
        config[path]["labels"],
        config["arguments"]["delimiter"],
        word_to_id,
        config["arguments"]["max_seq_len"],
        batch_size=config["arguments"]["batch_size"],
        shuffle=False,
        num_workers=2)

    # build optimizer
    if config["arguments"]["model_name"] == "Transformer":
        # pass only the parameters that require gradients to the optimizer
        optimizer = ScheduledOptimizer(
            torch.optim.Adam(filter(lambda x: x.requires_grad,
                                    model.parameters()),
                             betas=(0.9, 0.98),
                             eps=1e-09),
            config["arguments"]["d_emb"],
            warmup_steps=4000)
    else:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=config["arguments"]["learning_rate"])

    return model, device, train_data_loader, valid_data_loader, optimizer
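
# ScheduledOptimizer is not defined in this snippet. Assuming it implements the
# standard Transformer warm-up schedule from "Attention Is All You Need", the
# learning rate it applies at each step would look like this sketch; this is an
# assumption for illustration, not code from the project.
def noam_lr(step: int, d_emb: int, warmup_steps: int = 4000) -> float:
    # lr = d_emb ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return d_emb ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)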
Example #4
from torchvision import transforms

hidden_size = 512       # number of features in the hidden state of the RNN decoder
num_epochs = 10         # number of training epochs

# Define a transform to pre-process the training images
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Build the training data loader, applying the transform defined above
train_loader = MyDataLoader(transform=transform_train,
                            mode='train',
                            batch_size=batch_size,
                            vocab_threshold=vocab_threshold,
                            vocab_from_file=vocab_from_file)

transform_val = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.CenterCrop(224),                      # get 224x224 crop from the center
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

val_loader = MyDataLoader(transform=transform_val,
                          mode='val',
                          batch_size=batch_size,
                          vocab_threshold=vocab_threshold,
                          vocab_from_file=True)
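
# A quick sanity check (a sketch, not from the original snippet): applying
# transform_train to any RGB PIL image yields a normalized 3x224x224 float
# tensor, the input size the pre-trained model expects.
from PIL import Image

dummy_image = Image.new("RGB", (640, 480))
print(transform_train(dummy_image).shape)  # torch.Size([3, 224, 224])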
Example #5
def run(config):
    def _print_config(config):
        import pprint
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(vars(config))

    _print_config(config)

    # remove any handlers left over from a previous run; the root logger
    # always exists, so no None check is needed
    root_logger = logging.getLogger()
    for handler in root_logger.handlers[:]:  # iterate over a copy of the list
        root_logger.removeHandler(handler)

    if not os.path.isdir(config.save_path):
        os.mkdir(config.save_path)
    # continue numbering from the highest existing run sub-directory
    all_subdir = [
        int(s) for s in os.listdir(config.save_path)
        if s.isdigit() and os.path.isdir(os.path.join(config.save_path, s))
    ]
    max_dir_num = max(all_subdir) if all_subdir else 0
    max_dir_num += 1
    config.save_path = os.path.join(config.save_path, str(max_dir_num))
    os.mkdir(config.save_path)

    logging.basicConfig(filename=os.path.join(config.save_path, 'train_log'),
                        level=tools.LOGFILE_LEVEL,
                        filemode='w')

    console = logging.StreamHandler()
    console.setLevel(tools.CONSOLE_LEVEL)
    logging.getLogger().addHandler(console)

    logging.info("##################### Start Training")
    logging.debug(vars(config))

    logging.info("##################### Start Load BERT MODEL")
    if config.bert_name == 'kobert':
        from kobert_modified_utills import get_kobert_model_and_tokenizer
        bert, tokenizer = get_kobert_model_and_tokenizer()
    else:
        tokenizer = BertTokenizer.from_pretrained(config.bert_name)
        bert = BertModel.from_pretrained(config.bert_name)
    bert.to(config.device)

    # load data loader
    logging.info("##################### Load DataLoader")
    loader = MyDataLoader(train_path=config.train_path,
                          valid_path=config.valid_path,
                          max_length=config.max_length,
                          tokenizer=tokenizer)

    train, valid, num_class = loader.get_train_valid_data()
    logging.info("##################### Train Dataset size : [" +
                 str(len(train)) + "]")
    logging.info("##################### Valid Dataset size : [" +
                 str(len(valid)) + "]")
    logging.info("##################### class size : [" + str(num_class) + "]")

    # adjust the per-step batch size for gradient accumulation
    logging.info("##################### Accumulation batch size : [" +
                 str(config.batch_size) + "]")
    config.batch_size = config.batch_size // config.gradient_accumulation_steps
    logging.info("##################### Modified batch size : [" +
                 str(config.batch_size) + "]")
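    # Worked example (illustration only, not from the original code): with
    # batch_size=64 and gradient_accumulation_steps=4, the data loaders below
    # use mini-batches of 16 and the trainer is expected to accumulate
    # gradients over 4 mini-batches before each optimizer step, keeping the
    # effective batch size at 64.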

    logging.info("##################### Load 'BERT Classifier' Model")
    model = MyClassifier(bert=bert,
                         num_class=num_class,
                         bert_finetuning=config.bert_finetuning,
                         dropout_p=config.dropout_p,
                         device=config.device)
    model.to(config.device)
    crit = nn.NLLLoss()
    trainer = Trainer(model=model,
                      crit=crit,
                      config=config,
                      boost=config.boost,
                      device=config.device)

    # If BERT fine-tuning is not needed, pre-compute the BERT vectors for the
    # text once, which makes the rest of training much faster.
    if config.boost and not config.bert_finetuning:
        logging.info(
            "##################### Transform Dataset into Vectors by using BERT"
        )
        train = loader.convert_ids_to_vector(data=train,
                                             model=model,
                                             batch_size=config.batch_size,
                                             device=config.device)
        valid = loader.convert_ids_to_vector(data=valid,
                                             model=model,
                                             batch_size=config.batch_size,
                                             device=config.device)

    train = DataLoader(dataset=train,
                       batch_size=config.batch_size,
                       shuffle=True)
    valid = DataLoader(dataset=valid,
                       batch_size=config.batch_size,
                       shuffle=True)

    history = trainer.train(train, valid)
    return history
Example #6
import os

import logzero

from data_loader import MyDataLoader
from data_looper import MyDataLooper
from torch_utils import save_model, load_model

if __name__ == "__main__":
    args = get_args()
    set_seed(args.seed)

    os.makedirs("logzero", exist_ok=True)
    logzero.loglevel(20)
    logzero.logfile(os.path.join("logzero", args.timestamp + ".txt"),
                    loglevel=20)
    logzero.logger.info("args: " + str(args))

    model = SSM(args)
    train_loader = MyDataLoader("train", args)
    test_loader = MyDataLoader("test", args)
    train_looper = MyDataLooper(model, train_loader, args)
    test_looper = MyDataLooper(model, test_loader, args)

    if args.load_epoch:
        resume_epoch = args.load_epoch + 1
        load_model(model, args.load_epoch)
    else:
        resume_epoch = 1

    for epoch in range(resume_epoch, args.epochs + 1):
        train_looper(epoch)
        test_looper(epoch)

        if epoch % 10 == 0: