def _to_device(batch, device):
    """Return *batch* with every ``torch.Tensor`` moved to *device*; other items are kept as-is."""
    return [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]


def _evaluate_pass(trainer, iterator, device, trainer_args, tag):
    """Run ``trainer.evaluate`` on every batch of *iterator*.

    A progress line labelled with *tag* (e.g. ``"cls"`` or ``"lm"``) is written
    to stderr every 10 batches and blanked once the pass is finished.
    """
    for b, batch in enumerate(iterator):
        trainer.evaluate(_to_device(batch, device), trainer_args)
        if b % 10 == 0:
            print('# {} {:.1%} loss={:.4f}'.format(tag, b / len(iterator), trainer.loss_eval),
                  end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)


def main():
    """Evaluate a model on the stability test set.

    Parses the command-line arguments, builds the data/model/run
    configurations, loads the test split, instantiates the model(s), runs the
    "cls" evaluation pass (and an "lm" pass when no separate LM model is
    configured) and writes the log through ``Print`` before closing the output.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())

    ## build configurations
    alphabet = Protein()
    cfgs = []
    data_cfg = config.DataConfig(args["data_config"])
    cfgs.append(data_cfg)
    if args["lm_model_config"] is None:
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet), num_classes=1)
        cfgs += [model_cfg]
    else:
        lm_model_cfg = config.ModelConfig(args["lm_model_config"], idx="lm_model_config",
                                          input_dim=len(alphabet))
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet),
                                       lm_dim=lm_model_cfg.num_layers * lm_model_cfg.hidden_dim * 2,
                                       num_classes=1)
        cfgs += [model_cfg, lm_model_cfg]
    if model_cfg.model_type == "RNN":
        # RNN models get an extra MLP prediction head with its own config
        pr_model_cfg = config.ModelConfig(args["pr_model_config"], idx="pr_model_config",
                                          model_type="MLP", num_classes=1)
        if pr_model_cfg.projection:
            pr_model_cfg.set_input_dim(model_cfg.embedding_dim)
        else:
            pr_model_cfg.set_input_dim(model_cfg.hidden_dim * 2)
        cfgs.append(pr_model_cfg)
    run_cfg = config.RunConfig(args["run_config"], sanity_check=args["sanity_check"])
    cfgs.append(run_cfg)

    output, save_prefix = set_output(args, "eval_stability_log", test=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, cfgs, device, output)
    flag_rnn = (model_cfg.model_type == "RNN")
    flag_lm_model = (args["lm_model_config"] is not None)

    ## load test datasets
    start = Print(" ".join(['start loading a test dataset', data_cfg.path["test"]]), output)
    dataset_test = ss.load_stability(data_cfg, "test", alphabet, args["sanity_check"])
    dataset_test = dataset.Seq_dataset(*dataset_test, alphabet, run_cfg, flag_rnn, model_cfg.max_len)
    collate_fn = dataset.collate_sequences if flag_rnn else None
    iterator_test = torch.utils.data.DataLoader(dataset_test, run_cfg.batch_size_eval,
                                                collate_fn=collate_fn)
    end = Print(" ".join(['loaded', str(len(dataset_test)), 'sequences']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
    ### model
    if not flag_rnn:
        model = plus_tfm.PLUS_TFM(model_cfg)
    elif not flag_lm_model:
        model = plus_rnn.PLUS_RNN(model_cfg)
    else:
        model = p_elmo.P_ELMo(model_cfg)
    models_list.append([model, "", flag_lm_model, flag_rnn, False])
    ### lm_model
    if flag_lm_model:
        lm_model = p_elmo.P_ELMo_lm(lm_model_cfg)
        models_list.append([lm_model, "lm", True, False, False])
    ### pr_model
    if flag_rnn:
        pr_model = mlp.MLP(pr_model_cfg, per_seq=True)
        models_list.append([pr_model, "pr", False, True, False])
    # No optimizer is created during evaluation, so trainable parameters are not collected here.
    load_models(args, models_list, device, data_parallel, output, tfm_cls=flag_rnn)
    get_loss = plus_rnn.get_loss if flag_rnn else plus_tfm.get_loss
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    tasks_list = []  # list of lists [idx, metrics_train, metrics_eval]
    tasks_list.append(["cls", [], ["rho", "r"]])
    if not flag_lm_model:
        tasks_list.append(["lm", [], ["acc"]])
    trainer = Trainer(models_list, get_loss, run_cfg, tasks_list)
    trainer_args = {}
    trainer_args["data_parallel"] = data_parallel
    trainer_args["paired"] = False
    if flag_rnn:
        trainer_args["projection"] = pr_model_cfg.projection
    trainer_args["regression"] = True
    if flag_rnn:
        trainer_args["evaluate_cls"] = plus_rnn.evaluate_cls_protein
    else:
        trainer_args["evaluate_cls"] = plus_tfm.evaluate_cls_protein
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    Print(trainer.get_headline(test=True), output)

    ### evaluate cls
    dataset_test.set_augment(False)
    trainer.set_exec_flags(["cls", 'lm'], [True, False])
    _evaluate_pass(trainer, iterator_test, device, trainer_args, "cls")

    ### evaluate lm
    if not flag_lm_model:
        dataset_test.set_augment(True)
        trainer.set_exec_flags(["cls", 'lm'], [False, True])
        _evaluate_pass(trainer, iterator_test, device, trainer_args, "lm")

    Print(trainer.get_log(test_idx="Stability", args=trainer_args), output)
    trainer.reset()

    end = Print('end evaluating a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    output.close()
def _to_device(batch, device):
    """Return *batch* with every ``torch.Tensor`` moved to *device*; other items are kept as-is."""
    return [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]


def _evaluate_pass(trainer, iterator, device, trainer_args, tag):
    """Run ``trainer.evaluate`` on every batch of *iterator*.

    A progress line labelled with *tag* (e.g. ``"cls"`` or ``"lm"``) is written
    to stderr every 10 batches and blanked once the pass is finished.
    """
    for b, batch in enumerate(iterator):
        trainer.evaluate(_to_device(batch, device), trainer_args)
        if b % 10 == 0:
            print('# {} {:.1%} loss={:.4f}'.format(tag, b / len(iterator), trainer.loss_eval),
                  end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)


def main():
    """Train a model on the transmembrane task.

    Parses the command-line arguments, builds the configurations, loads the
    train and dev splits, instantiates the model(s) and an Adam optimizer with
    separate learning rates for the main and prediction ("pr") parameters, then
    trains for up to ``run_cfg.num_epochs`` epochs. After each epoch the dev set
    is evaluated, the models are saved and a log line is written; training
    stops early when ``trainer.patience`` reaches 0.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())

    ## build configurations
    alphabet = Protein()
    cfgs = []
    data_cfg = config.DataConfig(args["data_config"])
    cfgs.append(data_cfg)
    if args["lm_model_config"] is None:
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet), num_classes=3)
        cfgs += [model_cfg]
    else:
        lm_model_cfg = config.ModelConfig(args["lm_model_config"], idx="lm_model_config",
                                          input_dim=len(alphabet))
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet),
                                       lm_dim=lm_model_cfg.num_layers * lm_model_cfg.hidden_dim * 2,
                                       num_classes=3)
        cfgs += [model_cfg, lm_model_cfg]
    if model_cfg.model_type == "RNN":
        # RNN models get an extra MLP prediction head with its own config
        pr_model_cfg = config.ModelConfig(args["pr_model_config"], idx="pr_model_config",
                                          model_type="MLP", num_classes=3)
        if pr_model_cfg.projection:
            pr_model_cfg.set_input_dim(model_cfg.embedding_dim)
        else:
            pr_model_cfg.set_input_dim(model_cfg.hidden_dim * 2)
        cfgs.append(pr_model_cfg)
    run_cfg = config.RunConfig(args["run_config"], sanity_check=args["sanity_check"])
    cfgs.append(run_cfg)

    output, save_prefix = set_output(args, "train_transmembrane_log")
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, cfgs, device, output)
    flag_rnn = (model_cfg.model_type == "RNN")
    flag_lm_model = (args["lm_model_config"] is not None)
    flag_lm_loss = (run_cfg.lm_loss_lambda != -1)

    ## load a train dataset
    start = Print(" ".join(['start loading train datasets', data_cfg.path["train"]]), output)
    dataset_train = transmembrane.load_transmembrane(data_cfg, "train", alphabet, args["sanity_check"])
    dataset_train = dataset.Seq_dataset(*dataset_train, alphabet, run_cfg, flag_rnn,
                                        model_cfg.max_len, truncate=False)
    collate_fn = dataset.collate_sequences if flag_rnn else None
    iterator_train = torch.utils.data.DataLoader(dataset_train, run_cfg.batch_size_train,
                                                 collate_fn=collate_fn, shuffle=True)
    end = Print(" ".join(['loaded', str(len(dataset_train)), 'sequences']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## load a dev dataset
    start = Print(" ".join(['start loading dev datasets', data_cfg.path["dev"]]), output)
    dataset_dev = transmembrane.load_transmembrane(data_cfg, "dev", alphabet, args["sanity_check"])
    dataset_dev = dataset.Seq_dataset(*dataset_dev, alphabet, run_cfg, flag_rnn,
                                      model_cfg.max_len, truncate=False)
    iterator_dev = torch.utils.data.DataLoader(dataset_dev, run_cfg.batch_size_eval,
                                               collate_fn=collate_fn)
    end = Print(" ".join(['loaded', str(len(dataset_dev)), 'sequences']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
    ### model
    if not flag_rnn:
        model = plus_tfm.PLUS_TFM(model_cfg)
    elif not flag_lm_model:
        model = plus_rnn.PLUS_RNN(model_cfg)
    else:
        model = p_elmo.P_ELMo(model_cfg)
    models_list.append([model, "", flag_lm_model, flag_rnn, False])
    ### lm_model
    if flag_lm_model:
        lm_model = p_elmo.P_ELMo_lm(lm_model_cfg)
        models_list.append([lm_model, "lm", True, False, False])
    ### pr_model
    if flag_rnn:
        pr_model = mlp.MLP(pr_model_cfg)
        models_list.append([pr_model, "pr", False, False, False])
    # split trainable parameters of non-frozen models into main and "pr" groups
    params, pr_params = [], []
    for net, idx, frz, _, _ in models_list:
        if frz:
            continue
        trainable = [p for p in net.parameters() if p.requires_grad]
        if idx == "pr":
            pr_params += trainable
        else:
            params += trainable
    load_models(args, models_list, device, data_parallel, output, tfm_cls=flag_rnn)
    get_loss = plus_rnn.get_loss if flag_rnn else plus_tfm.get_loss
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    optim = torch.optim.Adam([
        {'params': params, 'lr': run_cfg.learning_rate},
        {'params': pr_params, 'lr': run_cfg.pr_learning_rate},
    ])
    tasks_list = []  # list of lists [idx, metrics_train, metrics_eval]
    tasks_list.append(["cls", [], ["acc", "acc_p"]])
    if flag_lm_loss:
        tasks_list.append(["lm", [], ["acc"]])
    trainer = Trainer(models_list, get_loss, run_cfg, tasks_list, optim)
    trainer_args = {}
    trainer_args["data_parallel"] = data_parallel
    trainer_args["paired"] = False
    if flag_rnn:
        trainer_args["projection"] = pr_model_cfg.projection
    if flag_rnn:
        trainer_args["evaluate_cls"] = plus_rnn.evaluate_transmembrane
    else:
        trainer_args["evaluate_cls"] = plus_tfm.evaluate_cls_amino
    trainer_args["evaluate"] = ["cls", plus_tfm.evaluate_transmembrane]
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## train a model
    start = Print('start training a model', output)
    Print(trainer.get_headline(), output)
    for epoch in range(run_cfg.num_epochs):
        ### train
        dataset_train.set_augment(flag_lm_loss)
        for B, batch in enumerate(iterator_train):
            trainer.train(_to_device(batch, device), trainer_args)
            if B % 10 == 0:
                print('# epoch [{}/{}] train {:.1%} loss={:.4f}'.format(
                    epoch + 1, run_cfg.num_epochs, B / len(iterator_train), trainer.loss_train),
                    end='\r', file=sys.stderr)
        print(' ' * 150, end='\r', file=sys.stderr)

        ### evaluate cls
        dataset_dev.set_augment(False)
        trainer.set_exec_flags(["cls", 'lm'], [True, False])
        _evaluate_pass(trainer, iterator_dev, device, trainer_args, "cls")

        ### evaluate lm
        if flag_lm_loss:
            dataset_dev.set_augment(True)
            trainer.set_exec_flags(["cls", 'lm'], [False, True])
            _evaluate_pass(trainer, iterator_dev, device, trainer_args, "lm")

        ### print log and save models
        trainer.save(save_prefix)
        Print(trainer.get_log(epoch + 1, args=trainer_args), output)
        trainer.set_exec_flags(["cls", "lm"], [True, True])
        trainer.reset()
        if trainer.patience == 0:
            break

    end = Print('end training a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    # release the log handle, as the other entry points do
    output.close()
def _to_device(batch, device):
    """Return *batch* with every ``torch.Tensor`` moved to *device*; other items are kept as-is."""
    return [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]


def _evaluate_pass(trainer, iterator, device, trainer_args, tag):
    """Run ``trainer.evaluate`` on every batch of *iterator*.

    A progress line labelled with *tag* (e.g. ``"cls"`` or ``"lm"``) is written
    to stderr every 10 batches and blanked once the pass is finished.
    """
    for b, batch in enumerate(iterator):
        trainer.evaluate(_to_device(batch, device), trainer_args)
        if b % 10 == 0:
            print('# {} {:.1%} loss={:.4f}'.format(tag, b / len(iterator), trainer.loss_eval),
                  end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)


def main():
    """Evaluate a model on the homology test pair sets.

    Every key of ``data_cfg.path`` containing ``"pairs"`` is loaded as a
    separate test set. For each of them the "cls" (and "cm") pass is run,
    followed by an "lm" pass when no LM model is used, and one log line is
    written through ``Print``.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())

    ## build configurations
    alphabet = Protein()
    cfgs = []
    data_cfg = config.DataConfig(args["data_config"])
    cfgs.append(data_cfg)
    if args["lm_model_config"] is None:
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet), num_classes=5)
        cfgs += [model_cfg]
    else:
        lm_model_cfg = config.ModelConfig(args["lm_model_config"], idx="lm_model_config",
                                          input_dim=len(alphabet))
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet),
                                       lm_dim=lm_model_cfg.num_layers * lm_model_cfg.hidden_dim * 2,
                                       num_classes=5)
        cfgs += [model_cfg, lm_model_cfg]
    run_cfg = config.RunConfig(args["run_config"], eval=True, sanity_check=args["sanity_check"])
    cfgs.append(run_cfg)

    output, save_prefix = set_output(args, "eval_homology_log", test=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, cfgs, device, output)
    flag_rnn = (model_cfg.model_type == "RNN")
    # NOTE(review): lm_model_cfg only exists when args["lm_model_config"] is given, while this
    # flag follows args["pretrained_lm_model"] — confirm the two are always supplied together.
    flag_lm_model = (args["pretrained_lm_model"] is not None)
    flag_cm_model = (args["pretrained_cm_model"] is not None)

    ## load test datasets
    idxs_test = [key for key in data_cfg.path if "pairs" in key]
    datasets_test, iterators_test = [], []
    start = Print(" ".join(['start loading test datasets'] + idxs_test), output)
    end = start  # keeps the elapsed-time report valid when no "pairs" set is configured
    collate_fn = dataset.collate_paired_sequences if flag_rnn else None
    for idx_test in idxs_test:
        dataset_test = homology.load_homology_pairs(data_cfg, idx_test, alphabet, flag_cm_model,
                                                    args["sanity_check"])
        dataset_test = dataset.PairedHomology_dataset(*dataset_test, alphabet, run_cfg, flag_rnn,
                                                      model_cfg.max_len)
        iterator_test = torch.utils.data.DataLoader(dataset_test, run_cfg.batch_size_eval,
                                                    collate_fn=collate_fn)
        datasets_test.append(dataset_test)
        iterators_test.append(iterator_test)
        end = Print(" ".join(['loaded', str(len(dataset_test)), 'sequence pairs']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
    ### model
    if not flag_rnn:
        model = plus_tfm.PLUS_TFM(model_cfg)
    elif not flag_lm_model:
        model = plus_rnn.PLUS_RNN(model_cfg)
    else:
        model = p_elmo.P_ELMo(model_cfg)
    models_list.append([model, "", True, False, False])
    ### lm_model
    if flag_lm_model:
        lm_model = p_elmo.P_ELMo_lm(lm_model_cfg)
        models_list.append([lm_model, "lm", True, False, False])
    ### cm_model
    if flag_cm_model:
        cm_model = cnn.ConvNet2D(model_cfg.embedding_dim)
        models_list.append([cm_model, "cm", True, False, False])
    load_models(args, models_list, device, data_parallel, output)
    get_loss = plus_rnn.get_loss if flag_rnn else plus_tfm.get_loss
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    tasks_list = []  # list of lists [idx, metrics_train, metrics_eval]
    tasks_list.append(["cls", [], ["acc", "r", "rho", "aupr_cl", "aupr_fo", "aupr_sf", "aupr_fa"]])
    if not flag_lm_model:
        tasks_list.append(["lm", [], ["acc"]])
    if flag_cm_model:
        tasks_list.append(["cm", [], ["pr", "re", "f1"]])
    trainer = Trainer(models_list, get_loss, run_cfg, tasks_list)
    trainer_args = {}
    trainer_args["data_parallel"] = data_parallel
    trainer_args["paired"] = True
    if flag_rnn:
        trainer_args["evaluate_cls"] = plus_rnn.evaluate_homology
    else:
        trainer_args["evaluate_cls"] = plus_tfm.evaluate_homology
    trainer_args["evaluate"] = ["cls", homology.evaluate_homology]
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    Print(trainer.get_headline(test=True), output)
    for idx_test, dataset_test, iterator_test in zip(idxs_test, datasets_test, iterators_test):
        ### evaluate cls and cm
        dataset_test.set_augment(False)
        trainer.set_exec_flags(["cls", 'lm', "cm"], [True, False, True])
        _evaluate_pass(trainer, iterator_test, device, trainer_args, "cls")

        ### evaluate lm
        if not flag_lm_model:
            dataset_test.set_augment(True)
            trainer.set_exec_flags(["cls", 'lm', "cm"], [False, True, False])
            _evaluate_pass(trainer, iterator_test, device, trainer_args, "lm")

        Print(trainer.get_log(test_idx=idx_test, args=trainer_args), output)
        trainer.reset()

    end = Print('end evaluating a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    output.close()
def _to_device(batch, device):
    """Return *batch* with every ``torch.Tensor`` moved to *device*; other items are kept as-is."""
    return [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]


def _evaluate_pass(trainer, iterator, device, trainer_args, tag):
    """Run ``trainer.evaluate`` on every batch of *iterator*.

    A progress line labelled with *tag* (e.g. ``"cls"`` or ``"lm"``) is written
    to stderr every 10 batches and blanked once the pass is finished.
    """
    for b, batch in enumerate(iterator):
        trainer.evaluate(_to_device(batch, device), trainer_args)
        if b % 10 == 0:
            print('# {} {:.1%} loss={:.4f}'.format(tag, b / len(iterator), trainer.loss_eval),
                  end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)


def main():
    """Train a model on Pfam.

    Parses the command-line arguments, loads the train split and either the
    paired ("testpairs") or plain ("test") evaluation split, builds the model
    and its optimizer, and trains for ``run_cfg.num_epochs`` epochs. Whenever
    ``trainer.global_step`` is a multiple of 20000 (or on every step when
    ``sanity_check`` is set) the enabled tasks are evaluated, the models are
    saved and a log line is written.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())

    ## build configurations
    alphabet = Protein()
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet), num_classes=2)
    run_cfg = config.RunConfig(args["run_config"], sanity_check=args["sanity_check"])

    output, save_prefix = set_output(args, "train_pfam_log")
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    flag_rnn = (model_cfg.model_type == "RNN")
    flag_paired = ("testpairs" in data_cfg.path)

    ## load a train dataset
    start = Print(" ".join(['start loading a train dataset:', data_cfg.path["train"]]), output)
    dataset_train = pfam.load_pfam(data_cfg, "train", alphabet, args["sanity_check"])
    dataset_train = dataset.Pfam_dataset(*dataset_train, alphabet, run_cfg, flag_rnn,
                                         model_cfg.max_len, random_pairing=flag_paired,
                                         sanity_check=args["sanity_check"])
    if flag_rnn and flag_paired:
        collate_fn = dataset.collate_paired_sequences
    elif flag_rnn:
        collate_fn = dataset.collate_sequences_pelmo
    else:
        collate_fn = None
    iterator_train = torch.utils.data.DataLoader(dataset_train, run_cfg.batch_size_train,
                                                 collate_fn=collate_fn, shuffle=True)
    end = Print(" ".join(['loaded', str(len(dataset_train)), 'sequences']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## load a test dataset
    start = Print(" ".join(['start loading a test dataset:',
                            data_cfg.path["testpairs" if flag_paired else "test"]]), output)
    if flag_paired:
        dataset_test = pfam.load_pfam_pairs(data_cfg, "testpairs", alphabet, args["sanity_check"])
        dataset_test = dataset.PairedPfam_dataset(*dataset_test, alphabet, run_cfg, flag_rnn,
                                                  model_cfg.max_len)
    else:
        dataset_test = pfam.load_pfam(data_cfg, "test", alphabet, args["sanity_check"])
        dataset_test = dataset.Pfam_dataset(*dataset_test, alphabet, run_cfg, flag_rnn,
                                            model_cfg.max_len, random_pairing=flag_paired,
                                            sanity_check=args["sanity_check"])
    iterator_test = torch.utils.data.DataLoader(dataset_test, run_cfg.batch_size_eval,
                                                collate_fn=collate_fn)
    end = Print(" ".join(['loaded', str(len(dataset_test)), 'sequence(pair)s']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
    if not flag_rnn:
        model = plus_tfm.PLUS_TFM(model_cfg)
        run_cfg.set_total_steps(len(dataset_train))
    elif model_cfg.rnn_type == "B":
        model = plus_rnn.PLUS_RNN(model_cfg)
    else:
        model = p_elmo.P_ELMo_lm(model_cfg)
    models_list.append([model, "", False, flag_rnn, flag_rnn and flag_paired])
    # trainable parameters of every non-frozen model
    params = [p
              for net, _, frz, _, _ in models_list if not frz
              for p in net.parameters() if p.requires_grad]
    load_models(args, models_list, device, data_parallel, output)
    get_loss = plus_rnn.get_loss if flag_rnn else plus_tfm.get_loss
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    if flag_rnn:
        optim = torch.optim.Adam(params, lr=run_cfg.learning_rate)
    else:
        optim = get_BertAdam_optimizer(run_cfg, models_list[0][0])
    tasks_list = []  # list of lists [idx, metrics_train, metrics_eval]
    if run_cfg.lm_loss_lambda != -1:
        tasks_list.append(["lm", [], ["acc"]])
    if run_cfg.cls_loss_lambda != -1:
        tasks_list.append(["cls", [], ["acc"]])
    trainer = Trainer(models_list, get_loss, run_cfg, tasks_list, optim)
    trainer_args = {}
    trainer_args["data_parallel"] = data_parallel
    trainer_args["paired"] = flag_paired
    if flag_paired and flag_rnn:
        trainer_args["evaluate_cls"] = plus_rnn.evaluate_sfp
    elif flag_paired:
        trainer_args["evaluate_cls"] = plus_tfm.evaluate_sfp
    else:
        trainer_args["num_alphabets"] = len(alphabet)
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## train a model
    start = Print('start training a model', output)
    Print(trainer.get_headline(), output)
    for epoch in range(run_cfg.num_epochs):
        ### train
        for B, batch in enumerate(iterator_train):
            trainer.train(_to_device(batch, device), trainer_args)
            if B % 10 == 0:
                print('# epoch [{}/{}] train {:.1%} loss={:.4f}'.format(
                    epoch + 1, run_cfg.num_epochs, B / len(iterator_train), trainer.loss_train),
                    end='\r', file=sys.stderr)

            # NOTE(review): the periodic evaluation is assumed to live inside the batch loop
            # (it is keyed on trainer.global_step) — confirm against the intended schedule.
            if trainer.global_step % 20000 == 0 or args["sanity_check"]:
                print(' ' * 150, end='\r', file=sys.stderr)

                ### evaluate lm
                if run_cfg.lm_loss_lambda != -1:
                    if flag_paired:
                        dataset_test.set_augment(True)
                    trainer.set_exec_flags(["lm", "cls"], [True, False])
                    _evaluate_pass(trainer, iterator_test, device, trainer_args, "lm")

                ### evaluate cls
                if run_cfg.cls_loss_lambda != -1:
                    dataset_test.set_augment(False)
                    trainer.set_exec_flags(["lm", "cls"], [False, True])
                    _evaluate_pass(trainer, iterator_test, device, trainer_args, "cls")

                ### print log and save models
                trainer.save(save_prefix)
                Print(trainer.get_log(epoch + 1, args=trainer_args), output)
                trainer.set_exec_flags(["lm", "cls"], [True, True])
                trainer.reset()

    end = Print('end training a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    output.close()
def _to_device(batch, device):
    """Return *batch* with every ``torch.Tensor`` moved to *device*; other items are kept as-is."""
    return [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]


def _evaluate_pass(trainer, iterator, device, trainer_args, tag):
    """Run ``trainer.evaluate`` on every batch of *iterator*.

    A progress line labelled with *tag* (e.g. ``"cls"`` or ``"lm"``) is written
    to stderr every 10 batches and blanked once the pass is finished.
    """
    for b, batch in enumerate(iterator):
        trainer.evaluate(_to_device(batch, device), trainer_args)
        if b % 10 == 0:
            print('# {} {:.1%} loss={:.4f}'.format(tag, b / len(iterator), trainer.loss_eval),
                  end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)


def main():
    """Train a model on the homology task.

    Parses the command-line arguments, builds the configurations, loads the
    train split (drawn through ``dataset.HomolgySampler``) and the "devpairs"
    split, instantiates the model(s) and an Adam optimizer, and trains for up
    to ``run_cfg.num_epochs`` epochs. After each epoch the dev pairs are
    evaluated, the models are saved and a log line is written; training stops
    early when ``trainer.patience`` reaches 0.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())

    ## build configurations
    alphabet = Protein()
    cfgs = []
    data_cfg = config.DataConfig(args["data_config"])
    cfgs.append(data_cfg)
    if args["lm_model_config"] is None:
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet), num_classes=5)
        cfgs += [model_cfg]
    else:
        lm_model_cfg = config.ModelConfig(args["lm_model_config"], idx="lm_model_config",
                                          input_dim=len(alphabet))
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet),
                                       lm_dim=lm_model_cfg.num_layers * lm_model_cfg.hidden_dim * 2,
                                       num_classes=5)
        cfgs += [model_cfg, lm_model_cfg]
    run_cfg = config.RunConfig(args["run_config"], sanity_check=args["sanity_check"])
    cfgs.append(run_cfg)

    output, save_prefix = set_output(args, "train_homology_log")
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, cfgs, device, output)
    flag_rnn = (model_cfg.model_type == "RNN")
    flag_lm_model = (args["lm_model_config"] is not None)
    flag_lm_loss = (run_cfg.lm_loss_lambda != -1)
    flag_cm_loss = (run_cfg.cm_loss_lambda != -1)

    ## load a train dataset
    start = Print(" ".join(['start loading a train dataset:', data_cfg.path["train"]]), output)
    dataset_train = homology.load_homology(data_cfg, "train", alphabet, flag_cm_loss,
                                           args["sanity_check"])
    dataset_train = dataset.Homology_dataset(*dataset_train, alphabet, run_cfg, flag_rnn,
                                             model_cfg.max_len)
    sampler = dataset.HomolgySampler(dataset_train.labels, run_cfg)
    collate_fn = dataset.collate_paired_sequences if flag_rnn else None
    iterator_train = torch.utils.data.DataLoader(dataset_train, run_cfg.batch_size_train,
                                                 collate_fn=collate_fn, sampler=sampler)
    # the reported count is the square root of the dataset length
    # (presumably the dataset enumerates all sequence pairs — TODO confirm)
    end = Print(" ".join(['loaded', str(int(np.sqrt(len(dataset_train)))), 'sequences']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## load a dev dataset
    start = Print(" ".join(['start loading a dev dataset:', data_cfg.path["devpairs"]]), output)
    dataset_test = homology.load_homology_pairs(data_cfg, "devpairs", alphabet, flag_cm_loss,
                                                args["sanity_check"])
    dataset_test = dataset.PairedHomology_dataset(*dataset_test, alphabet, run_cfg, flag_rnn,
                                                  model_cfg.max_len)
    iterator_test = torch.utils.data.DataLoader(dataset_test, run_cfg.batch_size_eval,
                                                collate_fn=collate_fn)
    end = Print(" ".join(['loaded', str(len(dataset_test)), 'sequence pairs']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
    ### model
    if not flag_rnn:
        model = plus_tfm.PLUS_TFM(model_cfg)
    elif not flag_lm_model:
        model = plus_rnn.PLUS_RNN(model_cfg)
    else:
        model = p_elmo.P_ELMo(model_cfg)
    models_list.append([model, "", False, flag_rnn, flag_rnn])
    ### lm_model
    if flag_lm_model:
        lm_model = p_elmo.P_ELMo_lm(lm_model_cfg)
        models_list.append([lm_model, "lm", True, False, False])
    ### cm_model
    if flag_cm_loss:
        cm_model = cnn.ConvNet2D(model_cfg.embedding_dim)
        models_list.append([cm_model, "cm", False, False, True])
    # trainable parameters of every non-frozen model
    params = [p
              for net, _, frz, _, _ in models_list if not frz
              for p in net.parameters() if p.requires_grad]
    load_models(args, models_list, device, data_parallel, output, tfm_cls=flag_rnn)
    get_loss = plus_rnn.get_loss if flag_rnn else plus_tfm.get_loss
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    optim = torch.optim.Adam(params, lr=run_cfg.learning_rate)
    tasks_list = []  # list of lists [idx, metrics_train, metrics_eval]
    tasks_list.append(["cls", [], ["acc", "r", "rho"]])
    if flag_lm_loss:
        tasks_list.append(["lm", [], ["acc"]])
    if flag_cm_loss:
        tasks_list.append(["cm", [], ["pr", "re", "f1"]])
    trainer = Trainer(models_list, get_loss, run_cfg, tasks_list, optim)
    trainer_args = {}
    trainer_args["data_parallel"] = data_parallel
    trainer_args["paired"] = True
    if flag_rnn:
        trainer_args["evaluate_cls"] = plus_rnn.evaluate_homology
    else:
        trainer_args["evaluate_cls"] = plus_tfm.evaluate_homology
    trainer_args["evaluate"] = ["cls", homology.evaluate_homology]
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## train a model
    start = Print('start training a model', output)
    Print(trainer.get_headline(), output)
    for epoch in range(run_cfg.num_epochs):
        ### train
        for B, batch in enumerate(iterator_train):
            trainer.train(_to_device(batch, device), trainer_args)
            if B % 10 == 0:
                print('# epoch [{}/{}] train {:.1%} loss={:.4f}'.format(
                    epoch + 1, run_cfg.num_epochs, B / len(iterator_train), trainer.loss_train),
                    end='\r', file=sys.stderr)
        print(' ' * 150, end='\r', file=sys.stderr)

        ### evaluate cls and cm
        dataset_test.set_augment(False)
        trainer.set_exec_flags(["cls", 'lm', "cm"], [True, False, flag_cm_loss])
        _evaluate_pass(trainer, iterator_test, device, trainer_args, "cls")

        ### evaluate lm
        if flag_lm_loss:
            dataset_test.set_augment(True)
            trainer.set_exec_flags(["cls", 'lm', "cm"], [False, True, False])
            _evaluate_pass(trainer, iterator_test, device, trainer_args, "lm")

        ### print log and save models
        trainer.save(save_prefix)
        Print(trainer.get_log(epoch + 1, args=trainer_args), output)
        trainer.set_exec_flags(["cls", "lm", "cm"], [True, True, True])
        trainer.reset()
        if trainer.patience == 0:
            break

    end = Print('end training a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    output.close()
def _to_device(batch, device):
    """Return *batch* with every ``torch.Tensor`` moved to *device*; other items are kept as-is."""
    return [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]


def _evaluate_pass(trainer, iterator, device, trainer_args, tag):
    """Run ``trainer.evaluate`` on every batch of *iterator*.

    A progress line labelled with *tag* (e.g. ``"cls"`` or ``"lm"``) is written
    to stderr every 10 batches and blanked once the pass is finished.
    """
    for b, batch in enumerate(iterator):
        trainer.evaluate(_to_device(batch, device), trainer_args)
        if b % 10 == 0:
            print('# {} {:.1%} loss={:.4f}'.format(tag, b / len(iterator), trainer.loss_eval),
                  end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)


def main():
    """Evaluate a model on the Pfam test set.

    Loads the paired ("testpairs") split when it is present in
    ``data_cfg.path`` and the plain "test" split otherwise, builds the model,
    runs the "lm" evaluation pass (plus a "cls" pass for paired data) and
    writes the log through ``Print`` before closing the output.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())

    ## build configurations
    alphabet = Protein()
    data_cfg = config.DataConfig(args["data_config"])
    model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet), num_classes=2)
    run_cfg = config.RunConfig(args["run_config"], eval=True, sanity_check=args["sanity_check"])

    output, save_prefix = set_output(args, "eval_pfam_log", test=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_parallel = torch.cuda.device_count() > 1
    config.print_configs(args, [data_cfg, model_cfg, run_cfg], device, output)
    flag_rnn = (model_cfg.model_type == "RNN")
    flag_paired = ("testpairs" in data_cfg.path)

    ## load a test dataset
    start = Print(" ".join(['start loading a test dataset:',
                            data_cfg.path["testpairs" if flag_paired else "test"]]), output)
    if flag_paired:
        dataset_test = pfam.load_pfam_pairs(data_cfg, "testpairs", alphabet, args["sanity_check"])
        dataset_test = dataset.PairedPfam_dataset(*dataset_test, alphabet, run_cfg, flag_rnn,
                                                  model_cfg.max_len)
    else:
        dataset_test = pfam.load_pfam(data_cfg, "test", alphabet, args["sanity_check"])
        dataset_test = dataset.Pfam_dataset(*dataset_test, alphabet, run_cfg, flag_rnn,
                                            model_cfg.max_len, random_pairing=flag_paired,
                                            sanity_check=args["sanity_check"])
    if flag_rnn and flag_paired:
        collate_fn = dataset.collate_paired_sequences
    elif flag_rnn:
        collate_fn = dataset.collate_sequences_pelmo
    else:
        collate_fn = None
    iterator_test = torch.utils.data.DataLoader(dataset_test, run_cfg.batch_size_eval,
                                                collate_fn=collate_fn)
    end = Print(" ".join(['loaded', str(len(dataset_test)), 'sequence(pair)s']), output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## initialize a model
    start = Print('start initializing a model', output)
    models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
    if not flag_rnn:
        model = plus_tfm.PLUS_TFM(model_cfg)
    elif model_cfg.rnn_type == "B":
        model = plus_rnn.PLUS_RNN(model_cfg)
    else:
        model = p_elmo.P_ELMo_lm(model_cfg)
    models_list.append([model, "", True, False, False])
    load_models(args, models_list, device, data_parallel, output)
    get_loss = plus_rnn.get_loss if flag_rnn else plus_tfm.get_loss
    end = Print('end initializing a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## setup trainer configurations
    start = Print('start setting trainer configurations', output)
    tasks_list = []  # list of lists [idx, metrics_train, metrics_eval]
    tasks_list.append(["lm", [], ["acc"]])
    if flag_paired:
        tasks_list.append(["cls", [], ["acc"]])
    trainer = Trainer(models_list, get_loss, run_cfg, tasks_list)
    trainer_args = {}
    trainer_args["data_parallel"] = data_parallel
    trainer_args["paired"] = flag_paired
    if flag_paired and flag_rnn:
        trainer_args["evaluate_cls"] = plus_rnn.evaluate_sfp
    elif flag_paired:
        trainer_args["evaluate_cls"] = plus_tfm.evaluate_sfp
    else:
        trainer_args["num_alphabets"] = len(alphabet)
    end = Print('end setting trainer configurations', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

    ## evaluate a model
    start = Print('start evaluating a model', output)
    Print(trainer.get_headline(test=True), output)

    ### evaluate lm
    if flag_paired:
        dataset_test.set_augment(True)
    trainer.set_exec_flags(["lm", "cls"], [True, False])
    _evaluate_pass(trainer, iterator_test, device, trainer_args, "lm")

    ### evaluate cls (the "cls" task is only registered for paired data)
    if flag_paired:
        dataset_test.set_augment(False)
        trainer.set_exec_flags(["lm", "cls"], [False, True])
        _evaluate_pass(trainer, iterator_test, device, trainer_args, "cls")

    Print(trainer.get_log(test_idx="Pfam", args=trainer_args), output)

    end = Print('end evaluating a model', output)
    Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    output.close()