def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
    """Yield one embedding array for every sequence in ``batch``.

    Each sequence is encoded to bytes, upper-cased and mapped through
    ``self._alphabet``; the resulting tensors are wrapped in an
    ``Embedding_dataset`` and fed to a ``Trainer`` in mini-batches of
    ``self._run_cfg.batch_size_eval``.

    :param batch: the sequences to embed, as plain strings
    :return: generator over the entries of ``embeddings[1]``, each converted
        to a numpy array
    """
    encoded = [
        torch.from_numpy(self._alphabet.encode(raw.encode().upper())).long()
        for raw in batch
    ]
    dataset = Embedding_dataset(encoded, self._alphabet, self._run_cfg, True)
    loader = DataLoader(
        dataset,
        self._run_cfg.batch_size_eval,
        collate_fn=collate_sequences_for_embedding,
    )

    trainer = Trainer(
        [[self._model, "", True, False, False]],
        get_embedding,
        self._run_cfg,
        [["", [], []]],  # list of lists [idx, metrics_train, metrics_eval]
    )

    for tokens, lengths in loader:
        # Only the tokens are moved to the device; the lengths are passed on
        # untouched, see https://github.com/pytorch/pytorch/issues/43227
        on_device = (tokens.to(self._device), lengths)
        trainer.embed(on_device, {"data_parallel": False})

    embeddings = trainer.tasks_dict["results_eval"][0]["embeddings"]
    # 1 is d_h with 1024 dimensions
    count = len(embeddings[0])
    for index in range(count):
        yield embeddings[1][index].numpy()

    trainer.reset()
def embed_batch(self, batch: List[str]) -> Generator[ndarray, None, None]:
    """Embed a batch of sequences and yield one numpy array per sequence.

    The sequences are encoded to bytes, upper-cased and mapped through
    ``self._alphabet``, wrapped in an ``Embedding_dataset`` and run through a
    ``Trainer`` in mini-batches of ``self._run_cfg.batch_size_eval``.

    :param batch: the sequences to embed, as plain strings
    :return: generator over the entries of ``embeddings[1]``, each converted
        to a numpy array
    """
    sequences = [
        self._alphabet.encode(sequence.encode().upper()) for sequence in batch
    ]
    test_dataset = Embedding_dataset(
        [torch.from_numpy(sequence).long() for sequence in sequences],
        self._alphabet,
        self._run_cfg,
        True,
    )
    iterator_test = DataLoader(
        test_dataset,
        self._run_cfg.batch_size_eval,
        collate_fn=collate_sequences_for_embedding,
    )

    model_list = [self._model, "", True, False, False]
    tasks_list = [["", [], []]]  # list of lists [idx, metrics_train, metrics_eval]
    trainer = Trainer([model_list], get_embedding, self._run_cfg, tasks_list)

    # Assumes the collate function yields (tokens, lengths) pairs.
    for tokens, lengths in iterator_test:
        # Move only the tokens to the device. A lengths tensor handed to
        # pack_padded_sequence has to stay on the CPU, so moving every tensor
        # of the batch fails on CUDA devices with recent PyTorch versions:
        # https://github.com/pytorch/pytorch/issues/43227
        device_batch = (tokens.to(self._device), lengths)
        trainer.embed(device_batch, {"data_parallel": False})

    embeddings = trainer.tasks_dict["results_eval"][0]["embeddings"]
    # TODO: Should this be `embeddings[0]` or `embeddings[1]`?
    # 0 is projection dimension d_z (100)
    # 1 is d_h (1024)
    # The count is taken from entry 0 while the values come from entry 1.
    for i in range(len(embeddings[0])):
        yield embeddings[1][i].numpy()

    trainer.reset()
def main():
    """Embed the sequences of the configured "test" FASTA set and save them.

    Reads the command-line arguments from the module-level ``parser``, builds
    the data/model/run configs, loads the "test" split with ``load_fasta``,
    instantiates the model(s) and restores them through ``load_models``, runs
    ``Trainer.embed`` over every batch and finally writes the result with
    ``Trainer.save_embeddings``. Progress is logged through ``Print`` to the
    stream returned by ``set_output``; that stream is closed on exit, even
    when a step fails.
    """
    set_seeds(2020)
    args = vars(parser.parse_args())
    alphabet = Protein()

    # Every config object is collected so they can be printed together below.
    cfgs = []
    data_cfg = config.DataConfig(args["data_config"])
    cfgs.append(data_cfg)
    flag_lm_model = args["lm_model_config"] is not None
    if not flag_lm_model:
        model_cfg = config.ModelConfig(args["model_config"], input_dim=len(alphabet))
        cfgs.append(model_cfg)
    else:
        lm_model_cfg = config.ModelConfig(
            args["lm_model_config"], idx="lm_model_config", input_dim=len(alphabet)
        )
        model_cfg = config.ModelConfig(
            args["model_config"],
            input_dim=len(alphabet),
            lm_dim=lm_model_cfg.num_layers * lm_model_cfg.hidden_dim * 2,
        )
        cfgs += [model_cfg, lm_model_cfg]
    run_cfg = config.RunConfig(args["run_config"], sanity_check=args["sanity_check"])
    cfgs.append(run_cfg)

    output, save_prefix = set_output(args, "embedding_log", embedding=True)
    try:
        os.environ['CUDA_VISIBLE_DEVICES'] = args["device"] if args["device"] is not None else ""
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        data_parallel = torch.cuda.device_count() > 1
        config.print_configs(args, cfgs, device, output)
        flag_rnn = model_cfg.model_type == "RNN"

        ## load test datasets
        start = Print(" ".join(['start loading a dataset:', data_cfg.path["test"]]), output)
        test_dataset = load_fasta(data_cfg, "test", alphabet, sanity_check=args["sanity_check"])
        test_dataset = dataset.Embedding_dataset(test_dataset, alphabet, run_cfg, flag_rnn)
        # Only the RNN path uses the custom collate function.
        collate_fn = dataset.collate_sequences_for_embedding if flag_rnn else None
        iterator_test = torch.utils.data.DataLoader(
            test_dataset, run_cfg.batch_size_eval, collate_fn=collate_fn
        )
        end = Print(" ".join(['loaded', str(len(test_dataset)), 'sequences']), output)
        Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

        ## initialize a model
        start = Print('start initializing a model', output)
        models_list = []  # list of lists [model, idx, flag_frz, flag_clip_grad, flag_clip_weight]
        ### model
        if not flag_rnn:
            model = plus_tfm.PLUS_TFM(model_cfg)
        elif not flag_lm_model:
            model = plus_rnn.PLUS_RNN(model_cfg)
        else:
            model = p_elmo.P_ELMo(model_cfg)
        models_list.append([model, "", True, False, False])
        ### lm_model
        if flag_lm_model:
            lm_model = p_elmo.P_ELMo_lm(lm_model_cfg)
            models_list.append([lm_model, "lm", True, False, False])
        # NOTE(review): tfm_cls receives flag_rnn, not its negation - confirm
        # that this is intended.
        load_models(args, models_list, device, data_parallel, output, tfm_cls=flag_rnn)
        get_loss = plus_rnn.get_embedding if flag_rnn else plus_tfm.get_embedding
        end = Print('end initializing a model', output)
        Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

        ## setup trainer configurations
        start = Print('start setting trainer configurations', output)
        tasks_list = [["", [], []]]  # list of lists [idx, metrics_train, metrics_eval]
        trainer = Trainer(models_list, get_loss, run_cfg, tasks_list)
        trainer_args = {"data_parallel": data_parallel}
        end = Print('end setting trainer configurations', output)
        Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)

        ## evaluate a model
        start = Print('start embedding protein sequences', output)
        ### evaluate cls
        num_batches = len(iterator_test)
        for b, batch in enumerate(iterator_test):
            # NOTE(review): every tensor of the batch is moved to the device.
            # If the RNN path hands sequence lengths to pack_padded_sequence,
            # recent PyTorch versions need them on the CPU - confirm.
            batch = [t.to(device) if isinstance(t, torch.Tensor) else t for t in batch]
            trainer.embed(batch, trainer_args)
            if b % 10 == 0:
                print('# cls {:.1%} loss={:.4f}'.format(
                    b / num_batches, trainer.loss_eval), end='\r', file=sys.stderr)
        # Blank out the progress line on stderr.
        print(' ' * 150, end='\r', file=sys.stderr)

        trainer.save_embeddings(save_prefix)
        trainer.reset()

        end = Print('end embedding protein sequences', output)
        Print(" ".join(['elapsed time:', str(end - start)]), output, newline=True)
    finally:
        # Close the log stream even when one of the steps above raised.
        output.close()