def read_datasets():
    """Load the train/valid/holdout splits for masked language modeling."""
    train_dataset = setup_dataset(
        task='masked_language_modeling',
        data_dir='/export/home/tape/data/',
        split='train',
        tokenizer='iupac')
    valid_dataset = setup_dataset(
        task='masked_language_modeling',
        data_dir='/export/home/tape/data/',
        split='valid',
        tokenizer='iupac')
    holdout_dataset = setup_dataset(
        task='masked_language_modeling',
        data_dir='/export/home/tape/data/',
        split='holdout',
        tokenizer='iupac')
    return train_dataset, valid_dataset, holdout_dataset
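
# Usage sketch (illustrative, not from the original script): load the three
# splits and report their sizes. Assumes setup_dataset is importable as in the
# other scripts in this collection.
if __name__ == '__main__':
    train, valid, holdout = read_datasets()
    for name, split_dataset in [('train', train), ('valid', valid),
                                ('holdout', holdout)]:
        print(f'{name}: {len(split_dataset)} sequences')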
def run_transformer(model_path, split, batch_size, no_cuda):
    """Evaluate a trained transformer on secondary structure prediction."""
    local_rank = -1  # single-process evaluation
    device, n_gpu, is_master = utils.setup_distributed(local_rank, no_cuda)
    model = registry.get_task_model('transformer', 'secondary_structure', None, model_path)
    model = model.to(device)
    runner = ForwardRunner(model, device, n_gpu)
    runner.initialize_distributed_model()

    valid_dataset = utils.setup_dataset(
        'secondary_structure', '/export/home/tape/data/', split, 'iupac')
    num_sequences = len(valid_dataset)
    valid_loader = utils.setup_loader(valid_dataset, batch_size, local_rank, n_gpu, 1, 1)
    acc_fn = registry.get_metric('accuracy')

    computation_start = time.time()
    save_outputs = run_eval_epoch(valid_loader, runner, is_master)
    time_elapsed = time.time() - computation_start
    time_per_sequence = time_elapsed / num_sequences

    acc = acc_fn(save_outputs)
    print(f'Mean accuracy: {acc:.4f}')
    # The transformer does no explicit search step, so search time is zero.
    print('Search time per sequence: 0.00')
    print(f'Prediction time per sequence: {time_per_sequence:.2f}')
    print(f'Total time per sequence: {time_per_sequence:.2f}')
    return acc, 0.0, time_per_sequence, time_per_sequence
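
# Usage sketch (illustrative): evaluate a saved checkpoint on the validation
# split. The model_path below is a hypothetical checkpoint directory; the
# signature and return values come from the definition above.
if __name__ == '__main__':
    acc, search_time, pred_time, total_time = run_transformer(
        model_path='/export/home/tape/results/transformer_ss',  # hypothetical
        split='valid',
        batch_size=32,
        no_cuda=False)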
def test_datasets():
    for split in ['train', 'holdout', 'valid']:
        dataset = setup_dataset(
            task='profile_prediction',
            data_dir='/export/home/tape/data/alignment/',
            split=split,
            tokenizer='iupac')
        print(dataset.data[len(dataset) - 1])
        print(f'Split {split} has {len(dataset)} elements')
def get_data_iter(task, data_dir, split, tokenizer, batch_size,
                  num_workers, precomputed_key_file):
    """Return a batched DataLoader if batch_size is given, else the raw dataset."""
    dataset = setup_dataset(task=task, data_dir=data_dir, split=split,
                            tokenizer=tokenizer)
    if batch_size is not None:
        data_loader = setup_loader(
            dataset=dataset,
            batch_size=batch_size,
            local_rank=-1,
            n_gpu=1,
            gradient_accumulation_steps=1,
            num_workers=num_workers,
            precomputed_key_file=precomputed_key_file)
        return data_loader
    else:
        return dataset
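
# Usage sketch (illustrative): with a batch_size, get_data_iter yields a
# DataLoader; with batch_size=None it returns the raw dataset. Passing
# precomputed_key_file=None is an assumption that the loader accepts None.
mlm_loader = get_data_iter(
    task='masked_language_modeling',
    data_dir='/export/home/tape/data/',
    split='valid',
    tokenizer='iupac',
    batch_size=32,
    num_workers=2,
    precomputed_key_file=None)
mlm_dataset = get_data_iter(
    task='masked_language_modeling',
    data_dir='/export/home/tape/data/',
    split='valid',
    tokenizer='iupac',
    batch_size=None,
    num_workers=0,
    precomputed_key_file=None)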
def main(args=None):
    if args is None:
        parser = create_parser()
        args = parser.parse_args()
    dataset = setup_dataset(
        task=args.task,
        data_dir=args.data_dir,
        split=args.split,
        tokenizer=args.tokenizer)
    family_dataset = None
    if args.restrict_id:
        # Load the Pfam family ID map for this split.
        family_data_file = os.path.join(args.id_map_dir, f'pfam_{args.split}.lmdb')
        family_dataset = LMDBDataset(data_file=family_data_file)
    write_dataset_as_fasta(args.task, args.split, dataset, args.output_file,
                           args.pfam_id, family_dataset)
def main(args):
    model = registry.get_task_model(
        model_name=args.model_type,
        task_name='masked_language_modeling',
        config_file=args.model_config_file,
        load_dir=args.load_dir)
    valid_dataset = utils.setup_dataset(
        task='masked_language_modeling',
        data_dir=args.data_dir,
        split=args.split,
        tokenizer=args.tokenizer)
    valid_loader = utils.setup_loader(
        dataset=valid_dataset,
        batch_size=args.batch_size,
        local_rank=-1,
        n_gpu=1,
        gradient_accumulation_steps=1,
        num_workers=8)
    tokenizer = valid_dataset.tokenizer
    # Count prediction/target token mismatches and save the matrix to disk.
    counts_matrix = compute_matrix_mismatches(model, valid_loader, tokenizer)
    np.save(args.output_file, counts_matrix)
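
# Usage sketch (illustrative): the args namespace this main() expects can be
# built directly for testing. All values below are placeholders, not paths
# from the original repo (except data_dir, which matches the other scripts).
from argparse import Namespace

main(Namespace(
    model_type='transformer',
    model_config_file=None,
    load_dir='/export/home/tape/results/mlm_checkpoint',  # hypothetical
    data_dir='/export/home/tape/data/',
    split='valid',
    tokenizer='iupac',
    batch_size=32,
    output_file='mismatch_counts.npy'))  # saved via np.save above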
def main():
    dataset = setup_dataset(
        task='profile_prediction',
        data_dir='/export/home/tape/data/alignment/pfam/test/',
        split='train',
        tokenizer='iupac')
    loader = setup_loader(
        dataset=dataset,
        batch_size=4,
        local_rank=-1,
        n_gpu=1,
        gradient_accumulation_steps=1,
        num_workers=1)
    model = registry.get_task_model(
        model_name='transformer', task_name='profile_prediction')
    for i, batch in enumerate(loader):
        outputs = model(**batch)
        print(f'----Batch {i}----')
        print(batch['input_ids'].shape)
        print(batch['targets'].shape)
        print(outputs[0], outputs[1].shape)
        if i > 3:
            break
import os
import sys

sys.path[0] = '/export/home/tape/'

import torch
from tape.datasets import *
from tape.utils import setup_dataset, setup_loader
from tape.utils._sampler import *
from bisect import bisect_left
from tqdm import tqdm

dataset = setup_dataset(
    task='masked_language_modeling',
    split='valid',
    tokenizer='iupac',
    data_dir='/export/home/tape/data/')
loader = setup_loader(
    dataset=dataset,
    batch_size=400,
    local_rank=-1,
    n_gpu=1,
    gradient_accumulation_steps=1,
    num_workers=2,
    max_sequence_length=300)

# Track the longest padded batch the loader produces (renamed from `max`
# to avoid shadowing the builtin).
max_len = 0
for batch in tqdm(loader):
    if batch['input_ids'].shape[1] > max_len:
        max_len = batch['input_ids'].shape[1]
print(max_len)