def main():
    """Evaluate a checkpointed model on every dataset named in the data config.

    Reads the command-line arguments, builds the data/model/run configs,
    loads one dataset and one non-shuffling DataLoader per key of
    ``data_cfg.path``, restores the model from ``args["checkpoint"]`` and
    then, per dataset, feeds every batch to ``trainer.evaluate`` before
    aggregating and saving the outputs under ``save_prefix``.

    ``Print`` is assumed to return a timestamp-like value (its results are
    subtracted to report elapsed time) -- TODO confirm against its definition.
    """
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2020)

    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"], eval=True)
    output, save_prefix = set_output(args, "evaluate_model_log")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    # NOTE(review): looks like a device warm-up; the tensor itself is discarded.
    torch.zeros(1).to(device)

    ## Loading datasets
    t_begin = Print("start loading datasets", output)
    dataset_idxs = data_cfg.path.keys()
    datasets = []
    iterators = []
    for idx in dataset_idxs:
        dataset = get_dataset_from_configs(data_cfg, idx)
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=run_cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            num_workers=4,
        )
        datasets.append(dataset)
        iterators.append(loader)
        t_done = Print(
            "loaded " + str(len(dataset)) + " " + idx + " samples", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## initialize a model
    t_begin = Print("start initializing a model", output)
    model, _ = get_model(model_cfg, data_cfg.with_esa)
    t_done = Print("end initializing a model", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## setup trainer configurations
    t_begin = Print("start setting trainer configurations", output)
    trainer = Trainer(model)
    trainer.load_model(args["checkpoint"], output)
    trainer.set_device(device)
    t_done = Print("end setting trainer configurations", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## evaluate a model
    t_begin = Print("start evaluating a model", output)
    for idx, dataset, loader in zip(dataset_idxs, datasets, iterators):
        Print("processing " + idx, output)

        ### validation
        for step, batch in enumerate(loader):
            trainer.evaluate(batch, device)
            if step % 5 != 0:
                continue
            # Progress indicator on stderr, overwritten in place via '\r'.
            print(f"# {idx} {step / len(loader):.1%}",
                  end="\r", file=sys.stderr)
        # Blank out the progress line once the dataset is done.
        print(" " * 150, end="\r", file=sys.stderr)

        ### save outputs
        trainer.aggregate(dataset.set_labels)
        trainer.save_outputs(idx, save_prefix)

    t_done = Print("end evaluating a model", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    # Only close the log when it is a real file rather than stdout.
    if output != sys.stdout:
        output.close()
def main():
    """Embed every FASTA file under ``<data_path>/FASTA`` and save the results.

    For each file whose name ends in ``fasta`` the sequences are embedded
    with the embedder selected by the model config and written to
    ``<save_prefix>/<data_idx>/<record>.npy``, where ``record`` is the line
    index divided by two. That indexing assumes the file alternates one
    header line (starting with ``>``) and one sequence line. Records whose
    ``.npy`` file already exists are skipped, so an interrupted run can be
    resumed.

    ``Print`` is assumed to return a timestamp-like value (its results are
    subtracted to report elapsed time) -- TODO confirm against its definition.
    """
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2021)

    model_cfg = config.ModelConfig(args["model_config"])
    args["output_path"] = "%s/%s/" % (args["data_path"], model_cfg.embedder)
    output, save_prefix = set_output(args, "embed_data_log")
    embedder = get_embedder(model_cfg.embedder)

    fasta_dir = args["data_path"] + "/FASTA"
    for file_name in sorted(os.listdir(fasta_dir)):
        if not file_name.endswith("fasta"):
            continue
        data_idx = os.path.splitext(file_name)[0]
        os.makedirs(save_prefix + "/%s/" % (data_idx), exist_ok=True)

        # Open the listed file itself (rather than rebuilding its name) and
        # let the context manager close it even if reading fails.
        with open(os.path.join(fasta_dir, file_name), "r") as fasta:
            lines = fasta.readlines()

        # Index of the last record; clamped to 1 so a single-record file
        # does not cause a division by zero in the progress display.
        last_record = max(1, (len(lines) - 1) // 2)

        start = Print('start embedding %s' % data_idx, output)
        for i, line in enumerate(lines):
            if line.startswith(">"):
                continue

            record = i // 2
            npy_path = save_prefix + "/%s/%d.npy" % (data_idx, record)
            if not os.path.exists(npy_path):
                seq = line.strip().upper()
                if model_cfg.embedder == "ESM":
                    # presumably "J" is not in the ESM vocabulary -- TODO confirm
                    seq = seq.replace("J", "X")
                e = embedder.embed(seq)
                if model_cfg.embedder == "SeqVec":
                    # NOTE(review): collapses axis 0 (presumably the layer axis)
                    e = np.sum(e, axis=0)
                elif model_cfg.embedder == "UniRep":
                    # NOTE(review): drops the first row (presumably a start token)
                    e = e[1:]
                np.save(npy_path, e)

            if record % 10 == 0:
                # Progress indicator on stderr, overwritten in place via '\r'.
                print('# {} {:.1%}'.format(data_idx, record / last_record),
                      end='\r', file=sys.stderr)
        # Blank out the progress line once the file is done.
        print(' ' * 15, end='\r', file=sys.stderr)

        end = Print('end embedding %s' % data_idx, output)
        Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    # Only close the log when it is a real file rather than stdout.
    if output is not sys.stdout:
        output.close()
def main():
    """Run standard and adversarial evaluation of a WideResNet checkpoint.

    Reads the command-line arguments, restricts the visible CUDA devices to
    ``args["device"]`` (none when it is ``None``), loads the test split of
    ``args["dataset"]``, builds a ``WideResNet`` (wrapped in
    ``nn.DataParallel`` when more than one GPU is visible), restores the
    checkpoint, and then runs ``trainer.std_evaluate`` and
    ``trainer.adv_evaluate`` on every test batch before logging the results.

    ``Print`` is assumed to return a timestamp-like value (its results are
    subtracted to report elapsed time) -- TODO confirm against its definition.
    """
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2020)

    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"], eval=True,
                               sanity_check=args["sanity_check"])
    output, writer, save_prefix = set_output(args, "eval_wrn_log")

    # Must be set before the CUDA queries below so they see the restriction.
    os.environ['CUDA_VISIBLE_DEVICES'] = (
        args["device"] if args["device"] is not None else "")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, [model_cfg, run_cfg], device, output)

    ## Loading datasets
    start = Print(" ".join(['start loading datasets:', args["dataset"]]), output)
    dataset_test, dataset_info = get_dataset(
        args["dataset"], test=True, sanity_check=args["sanity_check"])
    # NOTE(review): shuffle=True is kept from the original even though this
    # is an evaluation loader -- confirm it is intentional.
    iterator_test = torch.utils.data.DataLoader(
        dataset_test, run_cfg.batch_size_eval, shuffle=True, num_workers=2)
    end = Print(
        " ".join(['loaded', str(len(dataset_test)), 'dataset_test samples']),
        output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    model_cfg.set_num_channels_classes(dataset_info["num_channels"],
                                       dataset_info["num_classes"])
    model_cfg.set_dropout_rate(run_cfg.dropout_rate)
    model = WideResNet(model_cfg)
    end = Print('end initializing a model', output)
    # " ".join (not "".join) so the message matches the first elapsed-time line.
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    model = model.to(device)
    if data_parallel:
        model = nn.DataParallel(model)
    criterion = nn.CrossEntropyLoss(reduction="none")
    run_cfg.set_adv(dataset_info, device)
    trainer = Trainer(model, criterion, run_cfg, std=True, adv=True, test=True)
    trainer.load(args["checkpoint"], save_prefix, device, output)
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    Print(trainer.get_headline(), output)

    ### test
    for B, batch in enumerate(iterator_test):
        # Move tensors (including Tensor subclasses) to the device; leave
        # any non-tensor batch members untouched.
        batch = [t.to(device) if isinstance(t, torch.Tensor) else t
                 for t in batch]
        trainer.std_evaluate(batch)
        trainer.adv_evaluate(batch)
        if B % 2 == 0:
            # Progress indicator on stderr, overwritten in place via '\r'.
            print('# test {:.1%}'.format(B / len(iterator_test)),
                  end='\r', file=sys.stderr)
    # Blank out the progress line once evaluation is done.
    print(' ' * 150, end='\r', file=sys.stderr)

    ### print log
    trainer.log(output, writer)
    end = Print('end evaluating a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    # Only close the log when it is a real file rather than stdout.
    if output is not sys.stdout:
        output.close()
def main():
    """Evaluate a checkpointed model on the "test" split.

    Reads the command-line arguments, builds the data/model/run configs,
    loads the test dataset with a non-shuffling DataLoader, builds and
    profiles the model, restores it from ``args["checkpoint"]``, feeds every
    batch to ``trainer.evaluate`` and finally saves the outputs and logs the
    results.

    ``Print`` is assumed to return a timestamp-like value (its results are
    subtracted to report elapsed time) -- TODO confirm against its definition.
    """
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2021)

    sanity_check = args["sanity_check"]
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"], eval=True,
                               sanity_check=sanity_check)
    output, save_prefix = set_output(args, "evaluate_model_log")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    # NOTE(review): looks like a device warm-up; the tensor itself is discarded.
    torch.zeros(1).to(device)

    ## Loading a dataset
    t_begin = Print("start loading a dataset", output)
    test_set = get_dataset_from_configs(
        data_cfg, "test", model_cfg.embedder, sanity_check=sanity_check)
    test_loader = torch.utils.data.DataLoader(
        test_set,
        batch_size=run_cfg.batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=4,
    )
    t_done = Print(f"loaded {len(test_set)} dataset_test samples", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## initialize a model
    t_begin = Print("start initializing a model", output)
    model, _ = get_model(model_cfg, run_cfg)
    get_profile(model, test_set, output)
    t_done = Print("end initializing a model", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## setup trainer configurations
    t_begin = Print("start setting trainer configurations", output)
    trainer = Trainer(model)
    trainer.load_model(args["checkpoint"], output)
    trainer.set_device(device)
    t_done = Print("end setting trainer configurations", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## evaluate a model
    t_begin = Print("start evaluating a model", output)
    trainer.headline(output)

    ### validation
    for step, batch in enumerate(test_loader):
        trainer.evaluate(batch, device)
        if step % 5 != 0:
            continue
        # Progress indicator on stderr, overwritten in place via '\r'.
        print(f"# {step / len(test_loader):.1%}", end="\r", file=sys.stderr)
    # Blank out the progress line once evaluation is done.
    print(" " * 150, end="\r", file=sys.stderr)

    ### print log
    trainer.save_outputs(save_prefix)
    trainer.log(data_cfg.data_idx, output)
    t_done = Print("end evaluating a model", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    # Only close the log when it is a real file rather than stdout.
    if output != sys.stdout:
        output.close()
def main():
    """Train a model on the "train" split, checkpointing after every epoch.

    Reads the command-line arguments, builds the data/model/run configs,
    loads the training dataset with a shuffling DataLoader, builds and
    profiles the model, restores any state from ``args["checkpoint"]``, and
    configures class weights, device, optimizer and scheduler. Training
    starts at ``trainer.epoch`` and runs up to ``run_cfg.num_epochs``; after
    each epoch the model is saved and the results are logged.

    ``Print`` is assumed to return a timestamp-like value (its results are
    subtracted to report elapsed time) -- TODO confirm against its definition.
    """
    args = vars(parser.parse_args())
    check_args(args)
    set_seeds(2021)

    sanity_check = args["sanity_check"]
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"])
    run_cfg = config.RunConfig(args["run_config"], eval=False,
                               sanity_check=sanity_check)
    output, save_prefix = set_output(args, "train_model_log")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    # NOTE(review): looks like a device warm-up; the tensor itself is discarded.
    torch.zeros(1).to(device)

    ## Loading a dataset
    t_begin = Print("start loading a dataset", output)
    train_set = get_dataset_from_configs(
        data_cfg, "train", model_cfg.embedder, sanity_check=sanity_check)
    train_loader = torch.utils.data.DataLoader(
        train_set,
        batch_size=run_cfg.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=4,
    )
    t_done = Print(f"loaded {len(train_set)} dataset_train samples", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## initialize a model
    t_begin = Print("start initializing a model", output)
    model, params = get_model(model_cfg, run_cfg)
    get_profile(model, train_set, output)
    t_done = Print("end initializing a model", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## setup trainer configurations
    t_begin = Print("start setting trainer configurations", output)
    trainer = Trainer(model)
    trainer.load_model(args["checkpoint"], output)
    trainer.set_class_weight(train_set.labels, run_cfg)
    trainer.set_device(device)
    trainer.set_optim_scheduler(run_cfg, params)
    t_done = Print("end setting trainer configurations", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    ## train a model
    t_begin = Print("start training a model", output)
    trainer.headline(output)
    # The range is built once from the restored epoch counter; incrementing
    # trainer.epoch below does not affect the iteration.
    for epoch in range(int(trainer.epoch), run_cfg.num_epochs):
        ### train
        for step, batch in enumerate(train_loader):
            trainer.train(batch, device)
            if step % 5 != 0:
                continue
            # Progress indicator on stderr, overwritten in place via '\r'.
            print(
                f"# epoch [{epoch + 1}/{run_cfg.num_epochs}] "
                f"train {step / len(train_loader):.1%}",
                end="\r", file=sys.stderr)
        # Blank out the progress line once the epoch is done.
        print(" " * 150, end="\r", file=sys.stderr)

        ### print log and save models
        trainer.epoch += 1
        trainer.save_model(save_prefix)
        trainer.log(data_cfg.data_idx, output)

    t_done = Print("end training a model", output)
    Print(f"elapsed time: {t_done - t_begin!s}", output, newline=True)

    # Only close the log when it is a real file rather than stdout.
    if output != sys.stdout:
        output.close()
import numpy as np from src.torch_models.training import Trainer from src.torch_models.models import * from src.torch_models.utils import convert_to_torch_input from src.utils import _LOGGER, parse_args, check_args from src.data_provider import data_provider from src.ld_preprocessing import compute_label_distributions from src.Variables import multilabel_datasets if __name__ == '__main__': args = parse_args() check_args(args) # Generate data splits and write them to disk if args.generate_splits: _LOGGER.info('Generating {n} data splits...'.format(n=args.num_splits)) for _, _, _, _, _, _, _, _, _ in data_provider(args): pass args.generate_splits = False res = {m: [] for m in args.models} for y_train, y_val, y_test, train_mask, val_mask, test_mask, g, adj, labels in data_provider( args): _LOGGER.info('Pre-computing label distributions...') ld_train_x, ld_dev_x, ld_test_x, sparse_pprs = compute_label_distributions( dataset=args.dataset, base_path=args.base_path, labels=labels,