Example 1
def test_interpret_wo_bias():
    import numpy as np
    from bpnet.metrics import RegressionMetrics, ClassificationMetrics, PeakPredictionProfileMetric
    from bpnet.seqmodel import SeqModel
    from bpnet.heads import ScalarHead, ProfileHead  # BinaryClassificationHead is assumed to live here too
    from concise.preprocessing import encodeDNA
    # NOTE: BaseNet, TopDense and TopConv (the small toy architectures used below)
    # are assumed to be defined alongside this test.
    # test the model
    seqs = encodeDNA(['ACAGA'] * 100)

    inputs = {"seq": seqs, "bias/a/profile": np.random.randn(100, 5, 2)}

    # random targets for the classification, count and profile heads
    targets = {
        "a/class": np.random.randint(low=0, high=2,
                                     size=(100, 1)).astype(float),
        "a/counts": 1 + np.ceil(np.abs(np.random.randn(100))),
        "a/profile": 1 + np.ceil(np.abs(np.random.randn(100, 5, 2))),
    }

    import keras.backend as K
    # K.clear_session()
    # bias is used only for the profile head below
    m = SeqModel(
        body=BaseNet('relu'),
        heads=[
            BinaryClassificationHead('{task}/class',
                                     net=TopDense(pool_size=2),
                                     use_bias=False),
            ScalarHead('{task}/counts',
                       loss='mse',
                       metric=RegressionMetrics(),
                       net=TopDense(pool_size=2),
                       use_bias=False),
            ProfileHead(
                '{task}/profile',
                loss='mse',
                metric=PeakPredictionProfileMetric(neg_max_threshold=0.05,
                                                   required_min_pos_counts=0),
                net=TopConv(n_output=2),
                use_bias=True,
                bias_shape=(5, 2)
            ),  # NOTE: the shape currently has to be hard-coded to the sequence length
        ],
        tasks=['a'])
    m.model.fit(inputs, targets)

    o = m.contrib_score_all(seqs)
    assert 'a/profile/wn' in o
    assert o['a/profile/wn'].shape == seqs.shape

    # evaluate the dataset -> set up an in-memory array dataset (NumpyDataset)
    from bpnet.data import NumpyDataset
    ds = NumpyDataset({"inputs": inputs, "targets": targets})
    o = m.evaluate(ds)
    assert 'avg/counts/mad' in o
Example 2
def binary_seq_model(tasks,
                     net_body,
                     net_head,
                     lr=0.004,
                     seqlen=None):
    """NOTE: This doesn't work with gin-train since
    the classes injected by gin-config can't be pickled.

    Instead, I created `basset_seq_model`

    ```
    Can't pickle <class 'bpnet.layers.BassetConv'>: it's not the same
    object as bpnet.layers.BassetConv
    ```

    """
    from bpnet.seqmodel import SeqModel
    from bpnet.heads import ScalarHead, ProfileHead
    from bpnet.metrics import ClassificationMetrics
    from keras.optimizers import Adam
    # Heads -------------------------------------------------
    heads = [ScalarHead(target_name='{task}/class',
                        net=net_head,
                        activation='sigmoid',
                        loss='binary_crossentropy',
                        metric=ClassificationMetrics(),
                        )]
    # -------------------------------------------------
    m = SeqModel(
        body=net_body,
        heads=heads,
        tasks=tasks,
        optimizer=Adam(lr=lr),
        seqlen=seqlen,
    )
    return m
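
A minimal usage sketch (values and layer choices are illustrative, not taken from this file); the body/head classes and their keyword arguments are the ones that appear in Example 7, assuming their remaining constructor arguments have defaults:

```
from bpnet.layers import DilatedConv1D, GlobalAvgPoolFCN

# Illustrative hyper-parameters only; seqlen must match the input sequence length.
m = binary_seq_model(tasks=['a', 'b'],
                     net_body=DilatedConv1D(filters=64,
                                            conv1_kernel_size=25,
                                            n_dil_layers=9),
                     net_head=GlobalAvgPoolFCN(n_tasks=1),
                     lr=0.004,
                     seqlen=1000)
m.model.summary()  # the compiled Keras model is exposed as `m.model`
```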
Example 3
def list_contrib(model_dir):
    """List the available contribution scores for a particular model. This can be useful
    in combination with `--contrib-wildcard` flag of the `bpnet contrib` command.
    """
    import os
    from bpnet.seqmodel import SeqModel

    # don't use any GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    seqmodel = SeqModel.from_mdir(model_dir)
    print("Available interpretation targets:")
    for name, _ in seqmodel.get_intp_tensors(preact_only=False):
        print(name)
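
The printed names are exactly what the `--contrib-wildcard` patterns are matched against; a hedged sketch (hypothetical paths) of using them with `bpnet_contrib` from Example 6:

```
# Hypothetical model/output paths; the wildcard keeps only the profile scores.
list_contrib('output/my_model')
bpnet_contrib(model_dir='output/my_model',
              output_file='output/my_model/contrib.profile.h5',
              contrib_wildcard='*/profile/wn',
              gpu=None)  # None -> run on the CPU (see Example 6)
```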
Example 4
@classmethod
def from_mdir(cls, model_dir):
    import os
    from bpnet.seqmodel import SeqModel
    # also figure out the fasta_file, if present (from the dataspec)
    from bpnet.dataspecs import DataSpec
    ds_path = os.path.join(model_dir, "dataspec.yml")
    if os.path.exists(ds_path):
        ds = DataSpec.load(ds_path)
        fasta_file = ds.fasta_file
    else:
        fasta_file = None
    return cls(SeqModel.from_mdir(model_dir), fasta_file=fasta_file)
Example 5
def test_output_files_model_w_bias(trained_model_w_bias):
    import os
    import keras.backend as K
    from bpnet.seqmodel import SeqModel
    from concise.preprocessing import encodeDNA

    K.clear_session()
    output_files = os.listdir(str(trained_model_w_bias))
    expected_files = [
        'config.gin',
        'config.gin.json',
        'bpnet-train.kwargs.json',
        'dataspec.yml',
        'evaluate.ipynb',
        'evaluate.html',
        'evaluation.valid.json',
        'history.csv',
        'model.h5',
        'seq_model.pkl',
        'note_params.json',
    ]
    for f in expected_files:
        assert f in output_files

    m = SeqModel.load(trained_model_w_bias / 'seq_model.pkl')
    m.predict(encodeDNA(["A" * 200]))
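
Assuming the file names mean what they suggest (per-epoch Keras history in `history.csv`, validation metrics in `evaluation.valid.json`), the remaining training artifacts can be inspected like this:

```
import json
import pandas as pd

model_dir = trained_model_w_bias  # the pathlib.Path provided by the fixture above
history = pd.read_csv(model_dir / 'history.csv')         # per-epoch training metrics
with open(model_dir / 'evaluation.valid.json') as f:
    valid_metrics = json.load(f)                         # validation-set evaluation
```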
Example 6
def bpnet_contrib(
        model_dir,
        output_file,
        method="grad",
        dataspec=None,
        regions=None,
        fasta_file=None,  # alternative to dataspec
        shuffle_seq=False,
        shuffle_regions=False,
        max_regions=None,
        # reference='zeroes', # Currently the only option
        # peak_width=1000,  # automatically inferred from 'config.gin.json'
        # seq_width=None,
        contrib_wildcard='*/profile/wn,*/counts/pre-act',  # specifies which contrib. scores to compute
        batch_size=512,
        gpu=0,
        memfrac_gpu=0.45,
        num_workers=10,
        storage_chunk_size=512,
        exclude_chr='',
        include_chr='',
        overwrite=False,
        skip_bias=False):
    """Run contribution scores for a BPNet model
    """
    from bpnet.extractors import _chrom_sizes
    add_file_logging(os.path.dirname(output_file), logger, 'bpnet-contrib')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(
                f"File exists {output_file}. Use overwrite=True to overwrite it"
            )

    config = read_json(os.path.join(model_dir, 'config.gin.json'))
    seq_width = config['seq_width']
    peak_width = config['seq_width']

    # NOTE - seq_width has to be the same for the input and the target
    #
    # infer from the command line
    # if seq_width is None:
    #     logger.info("Using seq_width = peak_width")
    #     seq_width = peak_width

    # # make sure these are int's
    # seq_width = int(seq_width)
    # peak_width = int(peak_width)

    # Split
    contrib_wildcards = contrib_wildcard.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    logger.info("Creating the dataset")
    from bpnet.datasets import StrandedProfile, SeqClassification
    if fasta_file is not None:
        if regions is None:
            raise ValueError(
                "fasta_file specified. Expecting regions to be specified as well"
            )
        dl_valid = SeqClassification(
            fasta_file=fasta_file,
            intervals_file=regions,
            incl_chromosomes=include_chr,
            excl_chromosomes=exclude_chr,
            auto_resize_len=seq_width,
        )
        chrom_sizes = _chrom_sizes(fasta_file)
    else:
        if dataspec is None:
            logger.info("Using dataspec used to train the model")
            # Specify dataspec
            dataspec = model_dir / "dataspec.yml"

        ds = DataSpec.load(dataspec)
        dl_valid = StrandedProfile(ds,
                                   incl_chromosomes=include_chr,
                                   excl_chromosomes=exclude_chr,
                                   intervals_file=regions,
                                   peak_width=peak_width,
                                   shuffle=False,
                                   seq_width=seq_width)
        chrom_sizes = _chrom_sizes(ds.fasta_file)

    # Setup contribution score trimming (not required currently)
    if seq_width > peak_width:
        # Trim
        # make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [
        name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
        if fnmatch_any(name, contrib_wildcards)
    ]
    logger.info(f"Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    if max_regions is not None:
        if len(dl_valid) > max_regions:
            logger.info(
                f"Using {max_regions} regions instead of the original {len(dl_valid)}"
            )
        else:
            logger.info(
                f"--max-regions={max_regions} is larger than the dataset size: {len(dl_valid)}. "
                "Using the dataset size for max-regions")
            max_regions = len(dl_valid)
    else:
        max_regions = len(dl_valid)

    max_batches = np.ceil(max_regions / batch_size)

    writer = HDF5BatchWriter(output_file, chunk_size=storage_chunk_size)
    for i, batch in enumerate(
            tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                     shuffle=shuffle_regions,
                                     num_workers=num_workers),
                 total=max_batches)):
        # store the original batch containing 'inputs' and 'targets'
        if skip_bias:
            batch['inputs'] = {
                'seq': batch['inputs']['seq']
            }  # ignore all other inputs

        if max_batches > 0:
            if i > max_batches:
                break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(
                batch['inputs']['seq'])

        for name in intp_names:
            hyp_contrib = seqmodel.contrib_score(
                batch['inputs']['seq'],
                name=name,
                method=method,
                batch_size=None)  # don't second-batch

            # put contribution scores to the dictionary
            # also trim the contribution scores appropriately so that
            # the output will always be w.r.t. the peak center
            batch[f"/hyp_contrib/{name}"] = hyp_contrib[:, trim_start:trim_end]

        # also trim the one-hot sequence so that it matches the trimmed contribution scores
        batch['inputs']['seq'] = batch['inputs']['seq'][:, trim_start:trim_end]

        # ? maybe it would be better to have an explicit ContribFileWriter;
        # that way the written schema would be fixed
        writer.batch_write(batch)

    # add chromosome sizes
    writer.f.attrs['chrom_sizes'] = json.dumps(chrom_sizes)
    writer.close()
    logger.info(f"Done. Contribution score file was saved to: {output_file}")
Example 7
def bpnet_model(tasks,
                filters,
                n_dil_layers,
                conv1_kernel_size,
                tconv_kernel_size,
                b_loss_weight=1,
                c_loss_weight=1,
                p_loss_weight=1,
                c_splines=0,
                b_splines=20,
                merge_profile_reg=False,
                lr=0.004,
                tracks_per_task=2,
                padding='same',
                batchnorm=False,
                use_bias=False,
                n_bias_tracks=2,
                profile_metric=None,
                count_metric=None,
                profile_bias_window_sizes=[1, 50],
                seqlen=None,
                skip_type='residual'):
    """Setup the BPNet model architecture

    Args:
      tasks: list of tasks
      filters: number of convolutional filters to use at each layer
      n_dil_layers: number of dilated convolutional filters to use
      conv1_kernel_size: kernel_size of the first convolutional layer
      tconv_kernel_size: kernel_size of the transpose/de-convolutional final layer
      b_loss_weight: binary classification weight
      c_loss_weight: total count regression weight
      p_loss_weight: profile regression weight
      c_splines: number of splines to use in the total count regression output head (0=None)
      b_splines: number of splines to use in the binary classification output head
      merge_profile_reg: if True, total count and profile prediction will be part of
        a single profile output head
      lr: learning rate of the Adam optimizer
      tracks_per_task: number of output tracks (e.g. strands) per task
      padding: padding in the convolutional layers
      batchnorm: if True, add Batchnorm after every layer. Note: this may mess up the
        DeepLIFT contribution scores downstream
      use_bias: if True, correct for the bias
      n_bias_tracks: how many bias tracks to expect (for both total count and profile regression)
      profile_metric, count_metric: metrics for the profile and total count heads
        (package defaults are used when None)
      profile_bias_window_sizes: window sizes of the moving-average bias net of the profile head
      seqlen: sequence length.
      skip_type: skip connection type ('residual' or 'dense')

    Returns:
      bpnet.seqmodel.SeqModel
    """
    from bpnet.seqmodel import SeqModel
    from bpnet.layers import DilatedConv1D, DeConv1D, GlobalAvgPoolFCN, MovingAverages
    from bpnet.metrics import BPNetMetricSingleProfile, default_peak_pred_metric
    from bpnet.heads import ScalarHead, ProfileHead
    from bpnet.metrics import ClassificationMetrics, RegressionMetrics
    from bpnet.losses import multinomial_nll, CountsMultinomialNLL
    import bpnet.losses as bloss
    from bpnet.activations import clipped_exp
    from bpnet.functions import softmax
    from keras.optimizers import Adam

    assert p_loss_weight >= 0
    assert c_loss_weight >= 0
    assert b_loss_weight >= 0

    # import ipdb
    # ipdb.set_trace()

    # TODO is it possible to re-instantiate the class to get rid of gin train?

    if profile_metric is None:
        print("Using the default profile prediction metric")
        profile_metric = default_peak_pred_metric

    if count_metric is None:
        print("Using the default regression prediction metrics")
        count_metric = RegressionMetrics()

    # Heads -------------------------------------------------
    heads = []
    # Profile prediction
    if p_loss_weight > 0:
        if not merge_profile_reg:
            heads.append(ProfileHead(target_name='{task}/profile',
                                     net=DeConv1D(n_tasks=tracks_per_task,
                                                  filters=filters,
                                                  tconv_kernel_size=tconv_kernel_size,
                                                  padding=padding,
                                                  n_hidden=0,
                                                  batchnorm=batchnorm
                                                  ),
                                     loss=multinomial_nll,
                                     loss_weight=p_loss_weight,
                                     postproc_fn=softmax,
                                     use_bias=use_bias,
                                     bias_input='bias/{task}/profile',
                                     bias_shape=(None, n_bias_tracks),
                                     bias_net=MovingAverages(window_sizes=profile_bias_window_sizes),
                                     metric=profile_metric
                                     ))
        else:
            heads.append(ProfileHead(target_name='{task}/profile',
                                     net=DeConv1D(n_tasks=tracks_per_task,
                                                  filters=filters,
                                                  tconv_kernel_size=tconv_kernel_size,
                                                  padding=padding,
                                                  n_hidden=1,  # use 1 hidden layer in that case
                                                  batchnorm=batchnorm
                                                  ),
                                     activation=clipped_exp,
                                     loss=CountsMultinomialNLL(c_task_weight=c_loss_weight),
                                     loss_weight=p_loss_weight,
                                     bias_input='bias/{task}/profile',
                                     use_bias=use_bias,
                                     bias_shape=(None, n_bias_tracks),
                                     bias_net=MovingAverages(window_sizes=profile_bias_window_sizes),
                                     metric=BPNetMetricSingleProfile(count_metric=count_metric,
                                                                     profile_metric=profile_metric)
                                     ))
            c_loss_weight = 0  # don't need to use the other count loss

    # Count regression
    if c_loss_weight > 0:
        heads.append(ScalarHead(target_name='{task}/counts',
                                net=GlobalAvgPoolFCN(n_tasks=tracks_per_task,
                                                     n_splines=c_splines,
                                                     batchnorm=batchnorm),
                                activation=None,
                                loss='mse',
                                loss_weight=c_loss_weight,
                                bias_input='bias/{task}/counts',
                                use_bias=use_bias,
                                bias_shape=(n_bias_tracks, ),
                                metric=count_metric,
                                ))

    # Binary classification
    if b_loss_weight > 0:
        heads.append(ScalarHead(target_name='{task}/class',
                                net=GlobalAvgPoolFCN(n_tasks=1,
                                                     n_splines=b_splines,
                                                     batchnorm=batchnorm),
                                activation='sigmoid',
                                loss='binary_crossentropy',
                                loss_weight=b_loss_weight,
                                metric=ClassificationMetrics(),
                                ))
    # -------------------------------------------------
    m = SeqModel(
        body=DilatedConv1D(filters=filters,
                           conv1_kernel_size=conv1_kernel_size,
                           n_dil_layers=n_dil_layers,
                           padding=padding,
                           batchnorm=batchnorm,
                           skip_type=skip_type),
        heads=heads,
        tasks=tasks,
        optimizer=Adam(lr=lr),
        seqlen=seqlen,
    )
    return m
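
For orientation, a hedged instantiation sketch of `bpnet_model`; the hyper-parameter values are illustrative only, and the input/target dictionaries would follow the same `{task}/profile`, `{task}/counts`, `{task}/class` naming used by the heads above (compare Example 1):

```
# Illustrative values only; seqlen must match the length of the one-hot input sequences.
m = bpnet_model(tasks=['Oct4', 'Sox2'],
                filters=64,
                n_dil_layers=9,
                conv1_kernel_size=25,
                tconv_kernel_size=25,
                seqlen=1000)
m.model.summary()  # m.model is the compiled Keras model (see m.model.fit in Example 1)
```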