Example #1
def read_datasets():
    train_dataset = setup_dataset(task='masked_language_modeling',
                                  data_dir='/export/home/tape/data/',
                                  split='train',
                                  tokenizer='iupac')
    valid_dataset = setup_dataset(task='masked_language_modeling',
                                  data_dir='/export/home/tape/data/',
                                  split='valid',
                                  tokenizer='iupac')
    holdout_dataset = setup_dataset(task='masked_language_modeling',
                                    data_dir='/export/home/tape/data/',
                                    split='holdout',
                                    tokenizer='iupac')
    return train_dataset, valid_dataset, holdout_dataset
Example #2
def run_transformer(model_path, split, batch_size, no_cuda):
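    # local_rank = -1: run in a single process, without torch.distributed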
    local_rank = -1

    device, n_gpu, is_master = utils.setup_distributed(local_rank, no_cuda)
    model = registry.get_task_model('transformer', 'secondary_structure', None,
                                    model_path)
    model = model.to(device)

    runner = ForwardRunner(model, device, n_gpu)
    runner.initialize_distributed_model()

    valid_dataset = utils.setup_dataset('secondary_structure',
                                        '/export/home/tape/data/', split,
                                        'iupac')
    num_sequences = len(valid_dataset)
    valid_loader = utils.setup_loader(valid_dataset, batch_size, local_rank,
                                      n_gpu, 1, 1)

    acc_fn = registry.get_metric('accuracy')
    computation_start = time.time()
    save_outputs = run_eval_epoch(valid_loader, runner, is_master)
    time_elapsed = time.time() - computation_start
    time_per_sequence = time_elapsed / num_sequences

    acc = acc_fn(save_outputs)

    print(f'Mean accuracy: {acc:.4f}')
    print('Search time per sequence: 0.00')  # no search step here; reported as a constant zero
    print(f'Prediction time per sequence: {time_per_sequence:.2f}')
    print(f'Total time per sequence: {time_per_sequence:.2f}')

    return acc, 0.0, time_per_sequence, time_per_sequence
Example #3
def test_datasets():
    for split in ['train', 'holdout', 'valid']:
        dataset = setup_dataset(task='profile_prediction',
                                data_dir='/export/home/tape/data/alignment/',
                                split=split,
                                tokenizer='iupac')
        print(dataset.data[len(dataset) - 1])
        print(f'Split {split} has {len(dataset)} elements')
Example #4
def get_data_iter(task, data_dir, split, tokenizer, batch_size, num_workers,
                  precomputed_key_file):
    dataset = setup_dataset(task=task,
                            data_dir=data_dir,
                            split=split,
                            tokenizer=tokenizer)
    if batch_size is not None:
        data_loader = setup_loader(dataset=dataset,
                                   batch_size=batch_size,
                                   local_rank=-1,
                                   n_gpu=1,
                                   gradient_accumulation_steps=1,
                                   num_workers=num_workers,
                                   precomputed_key_file=precomputed_key_file)
        return data_loader
    else:
        return dataset
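
A minimal usage sketch for get_data_iter; the batch size, worker count, and the choice of passing precomputed_key_file=None are illustrative assumptions, not taken from the source:

# Hypothetical call; the argument values below are assumptions.
loader = get_data_iter(task='masked_language_modeling',
                       data_dir='/export/home/tape/data/',
                       split='valid',
                       tokenizer='iupac',
                       batch_size=32,
                       num_workers=4,
                       precomputed_key_file=None)
# Passing batch_size=None instead would return the raw dataset rather than a loader.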
Example #5
def main(args=None):
    if args is None:
        parser = create_parser()
        args = parser.parse_args()

    dataset = setup_dataset(task=args.task,
                            data_dir=args.data_dir,
                            split=args.split,
                            tokenizer=args.tokenizer)

    family_dataset = None
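    # Optionally load the per-split Pfam id map so the FASTA output can be restricted to one family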
    if args.restrict_id:
        family_data_file = os.path.join(args.id_map_dir,
                                        f'pfam_{args.split}.lmdb')
        family_dataset = LMDBDataset(data_file=family_data_file)

    write_dataset_as_fasta(args.task, args.split, dataset, args.output_file,
                           args.pfam_id, family_dataset)
Example #6
def main(args):
    model = registry.get_task_model(model_name=args.model_type,
                                    task_name='masked_language_modeling',
                                    config_file=args.model_config_file,
                                    load_dir=args.load_dir)
    valid_dataset = utils.setup_dataset(task='masked_language_modeling',
                                        data_dir=args.data_dir,
                                        split=args.split,
                                        tokenizer=args.tokenizer)
    valid_loader = utils.setup_loader(dataset=valid_dataset,
                                      batch_size=args.batch_size,
                                      local_rank=-1,
                                      n_gpu=1,
                                      gradient_accumulation_steps=1,
                                      num_workers=8)
    tokenizer = valid_dataset.tokenizer

    counts_matrix = compute_matrix_mismatches(model, valid_loader, tokenizer)
    np.save(args.output_file, counts_matrix)
Example #7
def main():
    dataset = setup_dataset(
        task='profile_prediction',
        data_dir='/export/home/tape/data/alignment/pfam/test/',
        split='train',
        tokenizer='iupac')
    loader = setup_loader(dataset=dataset,
                          batch_size=4,
                          local_rank=-1,
                          n_gpu=1,
                          gradient_accumulation_steps=1,
                          num_workers=1)
    model = registry.get_task_model(model_name='transformer',
                                    task_name='profile_prediction')

    for i, batch in enumerate(loader):
        outputs = model(**batch)
        print(f'----Batch {i}----')
        print(batch['input_ids'].shape)
        print(batch['targets'].shape)
        print(outputs[0], outputs[1].shape)
        if i > 3:
            break
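
The standalone script below exercises setup_dataset and setup_loader directly: it iterates the masked language modeling validation split (batch size 400, max_sequence_length=300) and prints the longest padded batch it encounters.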
import os
import sys
sys.path[0] = '/export/home/tape/'
import torch
from tape.datasets import *
from tape.utils import setup_dataset, setup_loader
from tape.utils._sampler import *
from bisect import bisect_left
from tqdm import tqdm

dataset = setup_dataset(task='masked_language_modeling',
                        split='valid',
                        tokenizer='iupac',
                        data_dir='/export/home/tape/data/')
loader = setup_loader(dataset=dataset,
                      batch_size=400,
                      local_rank=-1,
                      n_gpu=1,
                      gradient_accumulation_steps=1,
                      num_workers=2,
                      max_sequence_length=300)

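# Scan the loader and record the longest padded batch (input_ids.shape[1]) seen.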
max_len = 0
for batch in tqdm(loader):
    if batch['input_ids'].shape[1] > max_len:
        max_len = batch['input_ids'].shape[1]
        print(max_len)