Example #1
def run_transformer(model_path, split, batch_size, no_cuda):
    # Evaluate a pretrained transformer on the secondary-structure task and
    # report accuracy plus per-sequence prediction time.
    local_rank = -1  # single-process evaluation (no distributed setup)

    device, n_gpu, is_master = utils.setup_distributed(local_rank, no_cuda)
    model = registry.get_task_model('transformer', 'secondary_structure', None,
                                    model_path)
    model = model.to(device)

    runner = ForwardRunner(model, device, n_gpu)
    runner.initialize_distributed_model()

    valid_dataset = utils.setup_dataset('secondary_structure',
                                        '/export/home/tape/data/', split,
                                        'iupac')
    num_sequences = len(valid_dataset)
    valid_loader = utils.setup_loader(valid_dataset, batch_size, local_rank,
                                      n_gpu, 1, 1)

    acc_fn = registry.get_metric('accuracy')
    computation_start = time.time()
    save_outputs = run_eval_epoch(valid_loader, runner, is_master)
    time_elapsed = time.time() - computation_start
    time_per_sequence = time_elapsed / num_sequences

    acc = acc_fn(save_outputs)

    print(f'Mean accuracy: {acc:.4f}')
    print(f'Search time per sequence: 0.00')
    print(f'Prediction time per sequence: {time_per_sequence:.2f}')
    print(f'Total time per sequence: {time_per_sequence:.2f}')

    return acc, 0.0, time_per_sequence, time_per_sequence
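
For reference, a minimal sketch of how this function might be invoked; the checkpoint path, split, and batch size below are placeholders, not values from the example itself.

# Hypothetical call to run_transformer; path and sizes are illustrative only.
acc, search_time, pred_time, total_time = run_transformer(
    model_path='results/secondary_structure_transformer',
    split='valid',
    batch_size=32,
    no_cuda=False)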
Example #2
    # __init__ of a data-handling class (the class definition is not part of
    # this snippet); it builds TAPE remote-homology train/validation loaders.
    def __init__(self,
                 data_subdir='tape',
                 max_seq_len=1024,
                 train_batch_size=2,
                 test_batch_size=8,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)

        self.max_seq_len = max_seq_len

        data_dir = f'data/{data_subdir}'
        train_dataset = TAPERemoteHomologyDataset(data_dir, 'train')
        val_dataset = TAPERemoteHomologyDataset(data_dir, 'valid')

        self.d_train = utils.setup_loader(train_dataset, train_batch_size, -1,
                                          1, 1, 0)
        self.d_test = utils.setup_loader(val_dataset, test_batch_size, -1, 1,
                                         1, 0)

        self.train_enum = enumerate(self.d_train)
        self.test_enum = enumerate(self.d_test)
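
Since the __init__ above is only a method excerpt, here is a minimal sketch of how the stored enumerators might be consumed by the surrounding class; the method name and restart logic are assumptions, not part of the original.

    # Hypothetical companion method: fetch the next training batch and restart
    # the pass over d_train once the loader is exhausted.
    def next_train_batch(self):
        try:
            _, batch = next(self.train_enum)
        except StopIteration:
            self.train_enum = enumerate(self.d_train)
            _, batch = next(self.train_enum)
        return batch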
Example #3
def get_data_iter(task, data_dir, split, tokenizer, batch_size, num_workers,
                  precomputed_key_file):
    # Build the dataset; wrap it in a DataLoader only when a batch size is given.
    dataset = setup_dataset(task=task,
                            data_dir=data_dir,
                            split=split,
                            tokenizer=tokenizer)
    if batch_size is not None:
        data_loader = setup_loader(dataset=dataset,
                                   batch_size=batch_size,
                                   local_rank=-1,
                                   n_gpu=1,
                                   gradient_accumulation_steps=1,
                                   num_workers=num_workers,
                                   precomputed_key_file=precomputed_key_file)
        return data_loader
    else:
        return dataset
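
A possible call to get_data_iter, with placeholder task, paths, and sizes that are not taken from the original:

# Hypothetical usage; every argument value here is a placeholder.
train_loader = get_data_iter(task='secondary_structure',
                             data_dir='data/',
                             split='train',
                             tokenizer='iupac',
                             batch_size=16,
                             num_workers=4,
                             precomputed_key_file=None)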
Example #4
def main(args):
    model = registry.get_task_model(model_name=args.model_type,
                                    task_name='masked_language_modeling',
                                    config_file=args.model_config_file,
                                    load_dir=args.load_dir)
    valid_dataset = utils.setup_dataset(task='masked_language_modeling',
                                        data_dir=args.data_dir,
                                        split=args.split,
                                        tokenizer=args.tokenizer)
    valid_loader = utils.setup_loader(dataset=valid_dataset,
                                      batch_size=args.batch_size,
                                      local_rank=-1,
                                      n_gpu=1,
                                      gradient_accumulation_steps=1,
                                      num_workers=8)
    tokenizer = valid_dataset.tokenizer

    counts_matrix = compute_matrix_mismatches(model, valid_loader, tokenizer)
    np.save(args.output_file, counts_matrix)
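
main expects an argparse-style namespace; a minimal parser sketch covering the attributes accessed above (all default values are assumptions for illustration):

# Hypothetical argument parser matching the attributes used in main().
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_type', default='transformer')
parser.add_argument('--model_config_file', default=None)
parser.add_argument('--load_dir', default=None)
parser.add_argument('--data_dir', default='data/')
parser.add_argument('--split', default='valid')
parser.add_argument('--tokenizer', default='iupac')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--output_file', default='mismatch_counts.npy')
main(parser.parse_args())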
Example #5
def main():
    dataset = setup_dataset(
        task='profile_prediction',
        data_dir='/export/home/tape/data/alignment/pfam/test/',
        split='train',
        tokenizer='iupac')
    loader = setup_loader(dataset=dataset,
                          batch_size=4,
                          local_rank=-1,
                          n_gpu=1,
                          gradient_accumulation_steps=1,
                          num_workers=1)
    model = registry.get_task_model(model_name='transformer',
                                    task_name='profile_prediction')

    # Push a few batches through the freshly built model as a quick smoke test.
    for i, batch in enumerate(loader):
        outputs = model(**batch)
        print(f'----Batch {i}----')
        print(batch['input_ids'].shape)
        print(batch['targets'].shape)
        print(outputs[0], outputs[1].shape)
        if i > 3:
            break
Example #6
def run_train(task: str,
              num_hidden_layers: int,
              one_vs_all_label: str = None,
              attention_probe: bool = False,
              label_scheme: str = None,
              learning_rate: float = 1e-4,
              batch_size: int = 1024,
              num_train_epochs: int = 10,
              num_log_iter: int = 20,
              fp16: bool = False,
              warmup_steps: int = 10000,
              gradient_accumulation_steps: int = 1,
              loss_scale: int = 0,
              max_grad_norm: float = 1.0,
              exp_name: typing.Optional[str] = None,
              log_dir: str = './logs',
              eval_freq: int = 1,
              save_freq: typing.Union[int, str] = 1,
              no_cuda: bool = False,
              seed: int = 42,
              local_rank: int = -1,
              num_workers: int = 0,
              debug: bool = False,
              log_level: typing.Union[str, int] = logging.INFO,
              patience: int = -1,
              max_seq_len: typing.Optional[int] = None) -> None:
    # SETUP AND LOGGING CODE #
    input_args = locals()
    device, n_gpu, is_master = utils.setup_distributed(local_rank, no_cuda)

    data_dir = get_data_path()
    output_dir = data_dir / 'probing'
    exp_dir = f'{(exp_name + "_") if exp_name else ""}{task}_{(one_vs_all_label + "_") if one_vs_all_label else ""}' \
              f'{"attn_" if attention_probe else ""}{num_hidden_layers}'
    save_path = Path(output_dir) / exp_dir

    if is_master:
        # save all the hidden parameters.
        save_path.mkdir(parents=True, exist_ok=True)
        with (save_path / 'args.json').open('w') as f:
            json.dump(input_args, f)

    utils.barrier_if_distributed()
    utils.setup_logging(local_rank, save_path, log_level)
    utils.set_random_seeds(seed, n_gpu)

    if task == 'secondary':
        num_labels = 2
        if attention_probe:
            model = ProteinBertForLinearSequenceToSequenceProbingFromAttention.from_pretrained(
                'bert-base',
                num_hidden_layers=num_hidden_layers,
                num_labels=num_labels)
        else:
            model = ProteinBertForLinearSequenceToSequenceProbing.from_pretrained(
                'bert-base',
                num_hidden_layers=num_hidden_layers,
                num_labels=num_labels)
        if label_scheme == 'ss4':
            label = int(one_vs_all_label)
        else:
            label = one_vs_all_label
        train_dataset = SecondaryStructureOneVsAllDataset(
            data_dir, 'train', label_scheme, label)
        valid_dataset = SecondaryStructureOneVsAllDataset(
            data_dir, 'valid', label_scheme, label)
    elif task == 'binding_sites':
        num_labels = 2
        if attention_probe:
            model = ProteinBertForLinearSequenceToSequenceProbingFromAttention.from_pretrained(
                'bert-base',
                num_hidden_layers=num_hidden_layers,
                num_labels=num_labels)
        else:
            model = ProteinBertForLinearSequenceToSequenceProbing.from_pretrained(
                'bert-base',
                num_hidden_layers=num_hidden_layers,
                num_labels=num_labels)
        train_dataset = BindingSiteDataset(data_dir, 'train')
        valid_dataset = BindingSiteDataset(data_dir, 'valid')
    elif task == 'contact_map':
        num_labels = 2
        if attention_probe:
            model = ProteinBertForContactPredictionFromAttention.from_pretrained(
                'bert-base', num_hidden_layers=num_hidden_layers)
        else:
            model = ProteinBertForContactProbing.from_pretrained(
                'bert-base', num_hidden_layers=num_hidden_layers)
        train_dataset = ProteinnetDataset(data_dir,
                                          'train',
                                          max_seq_len=max_seq_len)
        valid_dataset = ProteinnetDataset(data_dir,
                                          'valid',
                                          max_seq_len=max_seq_len)
    else:
        raise NotImplementedError

    model = model.to(device)
    optimizer = utils.setup_optimizer(model, learning_rate)
    # viz = visualization.get(log_dir, exp_dir, local_rank, debug=debug)
    # viz.log_config(input_args)
    # viz.log_config(model.config.to_dict())
    # viz.watch(model)
    viz = None

    train_loader = utils.setup_loader(train_dataset, batch_size, local_rank,
                                      n_gpu, gradient_accumulation_steps,
                                      num_workers)
    valid_loader = utils.setup_loader(valid_dataset, batch_size, local_rank,
                                      n_gpu, gradient_accumulation_steps,
                                      num_workers)

    num_train_optimization_steps = utils.get_num_train_optimization_steps(
        train_dataset, batch_size, num_train_epochs)

    logger.info(f"device: {device} "
                f"n_gpu: {n_gpu}, "
                f"distributed_training: {local_rank != -1}, "
                f"16-bits training: {fp16}")

    runner = BackwardRunner(model, optimizer, gradient_accumulation_steps,
                            device, n_gpu, fp16, local_rank, max_grad_norm,
                            warmup_steps, num_train_optimization_steps)

    runner.initialize_fp16()

    start_epoch = 0
    runner.initialize_distributed_model()

    num_train_optimization_steps = utils.get_num_train_optimization_steps(
        train_dataset, batch_size, num_train_epochs)
    is_master = local_rank in (-1, 0)

    if isinstance(save_freq, str) and save_freq != 'improvement':
        raise ValueError(
            f"Only recongized string value for save_freq is 'improvement'"
            f", received: {save_freq}")

    if save_freq == 'improvement' and eval_freq <= 0:
        raise ValueError(
            "Cannot set save_freq to 'improvement' and eval_freq < 0")

    num_trainable_parameters = sum(p.numel() for p in model.parameters()
                                   if p.requires_grad)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", batch_size)
    logger.info("  Num epochs = %d", num_train_epochs)
    logger.info("  Num train steps = %d", num_train_optimization_steps)
    logger.info("  Num parameters = %d", num_trainable_parameters)

    best_val_loss = float('inf')
    num_evals_no_improvement = 0

    def do_save(epoch_id: int, num_evals_no_improvement: int) -> bool:
        if not is_master:
            return False
        if isinstance(save_freq, int):
            return ((epoch_id + 1) % save_freq == 0) or ((epoch_id + 1)
                                                         == num_train_epochs)
        else:
            return num_evals_no_improvement == 0

    utils.barrier_if_distributed()

    metrics = ['accuracy', 'precision', 'recall', 'f1']
    metric_functions = [accuracy, precision, recall, f1]

    # ACTUAL TRAIN/EVAL LOOP #
    with utils.wrap_cuda_oom_error(local_rank, batch_size, n_gpu,
                                   gradient_accumulation_steps):
        for epoch_id in range(start_epoch, num_train_epochs):
            run_train_epoch(epoch_id, train_loader, runner, viz, num_log_iter,
                            gradient_accumulation_steps)
            if eval_freq > 0 and (epoch_id + 1) % eval_freq == 0:
                val_loss, metric = run_valid_epoch(epoch_id, valid_loader,
                                                   runner, viz, is_master)
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    num_evals_no_improvement = 0
                    if task == 'contact_map':
                        outputs, seq_lens = run_eval_epoch(
                            valid_loader, runner, get_sequence_lengths=True)
                    else:
                        outputs = run_eval_epoch(valid_loader,
                                                 runner,
                                                 get_sequence_lengths=False)
                    target = [el['target'] for el in outputs]
                    prediction = [el['prediction'] for el in outputs]

                    if task == 'contact_map':
                        # Reshape 2d to 1d
                        # Shape batch_size, seq_len, seq_len
                        prediction = [
                            torch.tensor(prediction_matrix).view(-1,
                                                                 2).tolist()
                            for prediction_matrix in prediction
                        ]
                        target = [
                            torch.tensor(target_matrix).view(-1).tolist()
                            for target_matrix in target
                        ]

                    metrics_to_save = {
                        name: metric(target, prediction)
                        for name, metric in zip(metrics, metric_functions)
                    }
                    if task == 'contact_map':
                        ks = [int(round(seq_len / 5)) for seq_len in seq_lens]
                        metrics_to_save['precision_at_k'] = precision_at_ks(
                            ks, target, prediction)
                    elif task == 'binding_sites':
                        seq_lens = []
                        for target_array in target:
                            mask = target_array != -1
                            seq_lens.append(mask.sum())
                        ks = [int(round(seq_len / 20)) for seq_len in seq_lens]
                        metrics_to_save['precision_at_k'] = precision_at_ks(
                            ks, target, prediction)
                    print(metrics_to_save)
                    metrics_to_save['loss'] = val_loss
                else:
                    num_evals_no_improvement += 1

            # Save trained model
            if do_save(epoch_id, num_evals_no_improvement):
                logger.info("** ** * Saving trained model ** ** * ")
                # Only save the model itself
                runner.save_state(save_path, epoch_id)
                logger.info(f"Saving model checkpoint to {save_path}")

            utils.barrier_if_distributed()
            if patience > 0 and num_evals_no_improvement >= patience:
                logger.info(
                    f"Finished training at epoch {epoch_id} because no "
                    f"improvement for {num_evals_no_improvement} epochs.")
                logger.log(35, f"Best Val Loss: {best_val_loss}")
                if local_rank != -1:
                    # If you're distributed, raise this error. It sends a signal to
                    # the master process which lets it kill other processes and terminate
                    # without actually reporting an error. See utils/distributed_utils.py
                    # for the signal handling code.
                    raise errors.EarlyStopping
                else:
                    break
    logger.info(f"Finished training after {num_train_epochs} epochs.")
    if best_val_loss != float('inf'):
        logger.log(35, f"Best Val Loss: {best_val_loss}")

    with open(save_path / 'results.json', 'w') as outfile:
        json.dump(metrics_to_save, outfile)

    del model
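
A possible entry point for run_train; the task and hyperparameter values below are placeholders chosen to match one of the branches above, not values from the original.

# Hypothetical invocation of run_train; all argument values are illustrative.
if __name__ == '__main__':
    run_train(task='secondary',
              num_hidden_layers=12,
              one_vs_all_label='0',
              label_scheme='ss4',
              batch_size=64,
              num_train_epochs=5,
              patience=3)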
Example #7
import os
import sys
sys.path[0] = '/export/home/tape/'
import torch
from tape.datasets import *
from tape.utils import setup_dataset, setup_loader
from tape.utils._sampler import *
from bisect import bisect_left
from tqdm import tqdm

dataset = setup_dataset(task='masked_language_modeling',
                        split='valid',
                        tokenizer='iupac',
                        data_dir='/export/home/tape/data/')
loader = setup_loader(dataset=dataset,
                      batch_size=400,
                      local_rank=-1,
                      n_gpu=1,
                      gradient_accumulation_steps=1,
                      num_workers=2,
                      max_sequence_length=300)

# Track the longest tokenized sequence length seen across all batches.
max_len = 0
for batch in tqdm(loader):
    if batch['input_ids'].shape[1] > max_len:
        max_len = batch['input_ids'].shape[1]
        print(max_len)