Example #1
def main():
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2020)
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg   = config.RunConfig(args["run_config"], eval=True)
    output, save_prefix = set_output(args, "evaluate_model_log")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    torch.zeros(1).to(device)  # touch the device once (presumably to initialize the CUDA context up front)

    ## Loading datasets
    start = Print(" ".join(['start loading datasets']), output)
    dataset_idxs, datasets, iterators = data_cfg.path.keys(), [], []
    for idx in dataset_idxs:
        dataset = get_dataset_from_configs(data_cfg, idx)
        iterator = torch.utils.data.DataLoader(dataset, run_cfg.batch_size, shuffle=False, pin_memory=True, num_workers=4)
        datasets.append(dataset)
        iterators.append(iterator)
        end = Print(" ".join(['loaded', str(len(dataset)), idx, 'samples']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    model, params = get_model(model_cfg, data_cfg.with_esa)
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    trainer = Trainer(model)
    trainer.load_model(args["checkpoint"], output)
    trainer.set_device(device)
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    for idx, dataset, iterator in zip(dataset_idxs, datasets, iterators):
        Print(" ".join(['processing', idx]), output)

        ### validation
        for B, batch in enumerate(iterator):
            trainer.evaluate(batch, device)
            if B % 5 == 0: print('# {} {:.1%}'.format(idx, B / len(iterator)), end='\r', file=sys.stderr)
        print(' ' * 150, end='\r', file=sys.stderr)

        ### save outputs
        trainer.aggregate(dataset.set_labels)
        trainer.save_outputs(idx, save_prefix)

    end = Print('end evaluating a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    if output is not sys.stdout: output.close()
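
All of these examples time each stage with a Print helper that both writes a log line and returns a timestamp, which is why end - start yields an elapsed time. The helper itself is not shown; the following is a minimal sketch of the behavior the call sites imply (the timestamp format and the newline handling are assumptions):

from datetime import datetime

def Print(string, output, newline=False):
    # Write a timestamped log line and return the timestamp, so callers
    # can compute elapsed time as end - start (a datetime.timedelta).
    time = datetime.now()
    print('%s\t%s' % (time.strftime('%m-%d %H:%M:%S'), string), file=output)
    if newline: print('', file=output)
    output.flush()
    return time
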
Example #2
def main():
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2021)
    model_cfg = config.ModelConfig(args["model_config"])
    args["output_path"] = "%s/%s/" % (args["data_path"], model_cfg.embedder)
    output, save_prefix = set_output(args, "embed_data_log")

    embedder = get_embedder(model_cfg.embedder)
    for file in sorted(os.listdir(args["data_path"] + "/FASTA")):
        if not file.endswith(".fasta"): continue
        data_idx = os.path.splitext(file)[0]
        os.makedirs(save_prefix + "/%s/" % (data_idx), exist_ok=True)

        FILE = open(args["data_path"] + "/FASTA/%s.fasta" % data_idx, "r")
        lines = FILE.readlines()
        FILE.close()

        start = Print('start embedding %s' % data_idx, output)
        for i, line in enumerate(lines):
            if line.startswith(">"): continue
            elif not os.path.exists(save_prefix + "/%s/%d.npy" %
                                    (data_idx, i // 2)):
                seq = line.strip().upper()
                if model_cfg.embedder == "ESM": seq = seq.replace("J", "X")

                e = embedder.embed(seq)
                if model_cfg.embedder == "SeqVec": e = np.sum(e, axis=0)
                elif model_cfg.embedder == "UniRep": e = e[1:]
                np.save(save_prefix + "/%s/%d.npy" % (data_idx, i // 2), e)

            if (i // 2) % 10 == 0:
                print('# {} {:.1%}'.format(data_idx,
                                           (i // 2) / ((len(lines) - 1) // 2)),
                      end='\r',
                      file=sys.stderr)
        print(' ' * 150, end='\r', file=sys.stderr)
        end = Print('end embedding %s' % data_idx, output)
        Print(" ".join(['elapsed time:', str(end - start)]),
              output,
              newline=True)

    if output is not sys.stdout: output.close()
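
Example #2 indexes embeddings by i // 2, which assumes a strictly two-line FASTA layout: each record is exactly one ">" header line followed by one sequence line. A wrapped (multi-line) FASTA would silently misalign the indices; a small pre-check along these lines (the function name is illustrative) can catch that early:

def assert_two_line_fasta(path):
    # Validate the layout Example #2 relies on: even lines are '>' headers,
    # odd lines are sequences, so record k lives on lines 2k and 2k + 1.
    with open(path) as f:
        for i, line in enumerate(f):
            if (i % 2 == 0) != line.startswith(">"):
                raise ValueError("line %d of %s breaks the two-line FASTA layout" % (i, path))
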
Example #3
def main():
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2020)
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"],
                               eval=True,
                               sanity_check=args["sanity_check"])
    output, writer, save_prefix = set_output(args, "eval_wrn_log")
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, [model_cfg, run_cfg], device, output)

    ## Loading datasets
    start = Print(" ".join(['start loading datasets:', args["dataset"]]),
                  output)
    dataset_test, dataset_info = get_dataset(args["dataset"],
                                             test=True,
                                             sanity_check=args["sanity_check"])
    iterator_test = torch.utils.data.DataLoader(dataset_test,
                                                run_cfg.batch_size_eval,
                                                shuffle=True,
                                                num_workers=2)
    end = Print(" ".join(['loaded', str(len(dataset_test)), 'dataset_test samples']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    model_cfg.set_num_channels_classes(dataset_info["num_channels"],
                                       dataset_info["num_classes"])
    model_cfg.set_dropout_rate(run_cfg.dropout_rate)
    model = WideResNet(model_cfg)
    end = Print('end initializing a model', output)
    Print("".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    if not data_parallel: model = model.to(device)
    else: model = nn.DataParallel(model.to(device))
    criterion = nn.CrossEntropyLoss(reduction="none")
    run_cfg.set_adv(dataset_info, device)
    trainer = Trainer(model, criterion, run_cfg, std=True, adv=True, test=True)
    trainer.load(args["checkpoint"], save_prefix, device, output)
    end = Print('end setting trainer configurations', output)
    Print("".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    Print(trainer.get_headline(), output)
    ### test
    for B, batch in enumerate(iterator_test):
        batch = [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]
        trainer.std_evaluate(batch)
        trainer.adv_evaluate(batch)
        if B % 2 == 0:
            print('# test {:.1%}'.format(B / len(iterator_test)),
                  end='\r',
                  file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)

    ### print log and save models
    trainer.log(output, writer)

    end = Print('end evaluating a model', output)
    Print("".join(['elapsed time:', str(end - start)]), output, newline=True)
    if output is not sys.stdout: output.close()
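
The list comprehension in the test loop of Example #3 moves only the tensor elements of a batch to the device, leaving labels and other metadata untouched. A slightly more general helper (a sketch; the recursion into nested containers goes beyond what the original code needs):

import torch

def to_device(batch, device):
    # Recursively move tensors in (possibly nested) lists/tuples to device;
    # non-tensor entries pass through unchanged.
    if isinstance(batch, torch.Tensor):
        return batch.to(device)
    if isinstance(batch, (list, tuple)):
        return type(batch)(to_device(t, device) for t in batch)
    return batch
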
Example #4
def main():
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2021)
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"],
                               eval=True,
                               sanity_check=args["sanity_check"])
    output, save_prefix = set_output(args, "evaluate_model_log")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    torch.zeros(1).to(device)  # touch the device once (presumably to initialize the CUDA context up front)

    ## Loading a dataset
    start = Print(" ".join(['start loading a dataset']), output)
    dataset_test = get_dataset_from_configs(data_cfg,
                                            "test",
                                            model_cfg.embedder,
                                            sanity_check=args["sanity_check"])
    iterator_test = torch.utils.data.DataLoader(dataset_test,
                                                run_cfg.batch_size,
                                                shuffle=False,
                                                pin_memory=True,
                                                num_workers=4)
    end = Print(" ".join(['loaded', str(len(dataset_test)), 'dataset_test samples']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    model, params = get_model(model_cfg, run_cfg)
    get_profile(model, dataset_test, output)
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    trainer = Trainer(model)
    trainer.load_model(args["checkpoint"], output)
    trainer.set_device(device)
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    trainer.headline(output)
    ### test
    for B, batch in enumerate(iterator_test):
        trainer.evaluate(batch, device)
        if B % 5 == 0:
            print('# {:.1%}'.format(B / len(iterator_test)),
                  end='\r',
                  file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)

    ### save outputs and print log
    trainer.save_outputs(save_prefix)
    trainer.log(data_cfg.data_idx, output)

    end = Print('end evaluating a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    if output is not sys.stdout: output.close()
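
Examples #4 and #5 call a get_profile helper right after building the model; judging by its arguments it reports model statistics before evaluation or training starts. A minimal sketch of the parameter-counting part (everything beyond the counts, including whether the real helper also reports FLOPs, is an assumption):

def get_profile(model, dataset, output):
    # Report the number of trainable parameters and the dataset size.
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('# trainable parameters: %d' % num_params, file=output)
    print('# samples:              %d' % len(dataset), file=output)
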
Example #5
def main():
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2021)
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"],
                               eval=False,
                               sanity_check=args["sanity_check"])
    output, save_prefix = set_output(args, "train_model_log")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    torch.zeros(1).to(device)  # touch the device once (presumably to initialize the CUDA context up front)

    ## Loading a dataset
    start = Print(" ".join(['start loading a dataset']), output)
    dataset_train = get_dataset_from_configs(data_cfg,
                                             "train",
                                             model_cfg.embedder,
                                             sanity_check=args["sanity_check"])
    iterator_train = torch.utils.data.DataLoader(dataset_train,
                                                 run_cfg.batch_size,
                                                 shuffle=True,
                                                 pin_memory=True,
                                                 num_workers=4)
    end = Print(" ".join(['loaded', str(len(dataset_train)), 'dataset_train samples']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    model, params = get_model(model_cfg, run_cfg)
    get_profile(model, dataset_train, output)
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    trainer = Trainer(model)
    trainer.load_model(args["checkpoint"], output)
    trainer.set_class_weight(dataset_train.labels, run_cfg)
    trainer.set_device(device)
    trainer.set_optim_scheduler(run_cfg, params)
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## train a model
    start = Print('start training a model', output)
    trainer.headline(output)
    for epoch in range(int(trainer.epoch), run_cfg.num_epochs):
        ### train
        for B, batch in enumerate(iterator_train):
            trainer.train(batch, device)
            if B % 5 == 0:
                print('# epoch [{}/{}] train {:.1%}'.format(
                    epoch + 1, run_cfg.num_epochs, B / len(iterator_train)),
                      end='\r',
                      file=sys.stderr)
        print(' ' * 150, end='\r', file=sys.stderr)

        ### print log and save models
        trainer.epoch += 1
        trainer.save_model(save_prefix)
        trainer.log(data_cfg.data_idx, output)

    end = Print('end training a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    if output is not sys.stdout: output.close()
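
Example #5 starts its epoch loop at int(trainer.epoch) rather than 0, so a run reloaded through trainer.load_model resumes from the checkpointed epoch. A minimal sketch of the bookkeeping this implies inside Trainer (the checkpoint keys and file name are assumptions):

import torch

class Trainer:
    def __init__(self, model):
        self.model = model
        self.epoch = 0  # advanced by the caller after each finished epoch

    def save_model(self, save_prefix):
        # Persist the weights together with the epoch counter so training
        # can later resume from range(int(self.epoch), num_epochs).
        torch.save({"model": self.model.state_dict(), "epoch": self.epoch},
                   save_prefix + "/checkpoint.pt")

    def load_model(self, checkpoint, output):
        if checkpoint is None: return
        state = torch.load(checkpoint, map_location="cpu")
        self.model.load_state_dict(state["model"])
        self.epoch = state["epoch"]
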
Example #6
import numpy as np

from src.torch_models.training import Trainer
from src.torch_models.models import *
from src.torch_models.utils import convert_to_torch_input

from src.utils import _LOGGER, parse_args, check_args
from src.data_provider import data_provider
from src.ld_preprocessing import compute_label_distributions
from src.Variables import multilabel_datasets

if __name__ == '__main__':
    args = parse_args()
    check_args(args)

    # Generate data splits and write them to disk
    if args.generate_splits:
        _LOGGER.info('Generating {n} data splits...'.format(n=args.num_splits))
        for _, _, _, _, _, _, _, _, _ in data_provider(args):
            pass
        args.generate_splits = False

    res = {m: [] for m in args.models}
    for y_train, y_val, y_test, train_mask, val_mask, test_mask, g, adj, labels in data_provider(
            args):

        _LOGGER.info('Pre-computing label distributions...')
        ld_train_x, ld_dev_x, ld_test_x, sparse_pprs = compute_label_distributions(
            dataset=args.dataset,
            base_path=args.base_path,
            labels=labels,