def test_interpret_wo_bias():
    import numpy as np
    from bpnet.seqmodel import SeqModel
    from bpnet.heads import ScalarHead, ProfileHead, BinaryClassificationHead
    from bpnet.metrics import RegressionMetrics, ClassificationMetrics, PeakPredictionProfileMetric
    from concise.preprocessing import encodeDNA
    # BaseNet, TopDense and TopConv are small helper nets defined elsewhere
    # in this test module

    # test the model
    seqs = encodeDNA(['ACAGA'] * 100)
    inputs = {"seq": seqs,
              "bias/a/profile": np.random.randn(100, 5, 2)}

    # Let's use regression
    targets = {
        "a/class": np.random.randint(low=0, high=2, size=(100, 1)).astype(float),
        "a/counts": 1 + np.ceil(np.abs(np.random.randn(100))),
        "a/profile": 1 + np.ceil(np.abs(np.random.randn(100, 5, 2))),
    }
    import keras.backend as K
    # K.clear_session()

    # use bias
    m = SeqModel(
        body=BaseNet('relu'),
        heads=[
            BinaryClassificationHead('{task}/class',
                                     net=TopDense(pool_size=2),
                                     use_bias=False),
            ScalarHead('{task}/counts',
                       loss='mse',
                       metric=RegressionMetrics(),
                       net=TopDense(pool_size=2),
                       use_bias=False),
            ProfileHead('{task}/profile',
                        loss='mse',
                        metric=PeakPredictionProfileMetric(neg_max_threshold=0.05,
                                                           required_min_pos_counts=0),
                        net=TopConv(n_output=2),
                        use_bias=True,
                        # NOTE: the bias shape currently has to be hard-coded
                        # to the sequence length
                        bias_shape=(5, 2)),
        ],
        tasks=['a']
    )
    m.model.fit(inputs, targets)

    o = m.contrib_score_all(seqs)
    assert 'a/profile/wn' in o
    assert o['a/profile/wn'].shape == seqs.shape

    # evaluate the dataset -> set up an array dataset (NumpyDataset)
    from bpnet.data import NumpyDataset
    ds = NumpyDataset({"inputs": inputs, "targets": targets})
    o = m.evaluate(ds)
    assert 'avg/counts/mad' in o
def binary_seq_model(tasks, net_body, net_head, lr=0.004, seqlen=None):
    """NOTE: This doesn't work with gin-train since the classes injected
    by gin-config can't be pickled. Instead, I created `basset_seq_model`

    ```
    Can't pickle <class 'bpnet.layers.BassetConv'>: it's not the same object as bpnet.layers.BassetConv
    ```
    """
    from keras.optimizers import Adam
    from bpnet.seqmodel import SeqModel
    from bpnet.heads import ScalarHead, ProfileHead
    from bpnet.metrics import ClassificationMetrics

    # Heads -------------------------------------------------
    heads = [ScalarHead(target_name='{task}/class',
                        net=net_head,
                        activation='sigmoid',
                        loss='binary_crossentropy',
                        metric=ClassificationMetrics(),
                        )]
    # -------------------------------------------------
    m = SeqModel(
        body=net_body,
        heads=heads,
        tasks=tasks,
        optimizer=Adam(lr=lr),
        seqlen=seqlen,
    )
    return m
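# Hedged usage sketch for `binary_seq_model` (not from the original source;
# assumes DilatedConv1D / GlobalAvgPoolFCN accept these arguments, as they do
# in `bpnet_model` below; task names are illustrative):
def _example_binary_seq_model():
    from bpnet.layers import DilatedConv1D, GlobalAvgPoolFCN
    m = binary_seq_model(tasks=['Oct4', 'Sox2'],
                         net_body=DilatedConv1D(filters=64,
                                                conv1_kernel_size=25,
                                                n_dil_layers=9),
                         net_head=GlobalAvgPoolFCN(n_tasks=1),
                         seqlen=1000)
    m.model.summary()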
def list_contrib(model_dir):
    """List the available contribution scores for a particular model. This can
    be useful in combination with the `--contrib-wildcard` flag of the
    `bpnet contrib` command.
    """
    # don't use any gpu
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    seqmodel = SeqModel.from_mdir(model_dir)
    print("Available interpretation targets:")
    for name, _ in seqmodel.get_intp_tensors(preact_only=False):
        print(name)
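# Hedged usage sketch (the model directory path is hypothetical): the printed
# names (e.g. 'Oct4/profile/wn', 'Oct4/counts/pre-act') are the values that
# can be passed to `bpnet contrib --contrib-wildcard`.
def _example_list_contrib():
    list_contrib("output/bpnet_model")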
@classmethod
def from_mdir(cls, model_dir):
    from bpnet.seqmodel import SeqModel
    # also figure out the fasta_file if present (from the dataspec)
    from bpnet.dataspecs import DataSpec
    ds_path = os.path.join(model_dir, "dataspec.yml")
    if os.path.exists(ds_path):
        ds = DataSpec.load(ds_path)
        fasta_file = ds.fasta_file
    else:
        fasta_file = None
    return cls(SeqModel.from_mdir(model_dir), fasta_file=fasta_file)
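# Hedged usage sketch: `from_mdir` above is a classmethod of its wrapper class
# (in bpnet this wrapper is `BPNetSeqModel`); the model directory path is
# hypothetical.
def _example_from_mdir():
    from bpnet.BPNet import BPNetSeqModel
    bp = BPNetSeqModel.from_mdir("output/bpnet_model")
    return bp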
def test_output_files_model_w_bias(trained_model_w_bias):
    import os
    import keras.backend as K
    from concise.preprocessing import encodeDNA
    from bpnet.seqmodel import SeqModel

    K.clear_session()
    output_files = os.listdir(str(trained_model_w_bias))
    expected_files = [
        'config.gin',
        'config.gin.json',
        'bpnet-train.kwargs.json',
        'dataspec.yml',
        'evaluate.ipynb',
        'evaluate.html',
        'evaluation.valid.json',
        'history.csv',
        'model.h5',
        'seq_model.pkl',
        'note_params.json',
    ]
    for f in expected_files:
        assert f in output_files

    m = SeqModel.load(trained_model_w_bias / 'seq_model.pkl')
    m.predict(encodeDNA(["A" * 200]))
def bpnet_contrib(model_dir,
                  output_file,
                  method="grad",
                  dataspec=None,
                  regions=None,
                  fasta_file=None,  # alternative to dataspec
                  shuffle_seq=False,
                  shuffle_regions=False,
                  max_regions=None,
                  # reference='zeroes',  # Currently the only option
                  # peak_width=1000,  # automatically inferred from 'config.gin.json'
                  # seq_width=None,
                  contrib_wildcard='*/profile/wn,*/counts/pre-act',  # specifies which contrib. scores to compute
                  batch_size=512,
                  gpu=0,
                  memfrac_gpu=0.45,
                  num_workers=10,
                  storage_chunk_size=512,
                  exclude_chr='',
                  include_chr='',
                  overwrite=False,
                  skip_bias=False):
    """Compute contribution scores for a BPNet model."""
    from bpnet.extractors import _chrom_sizes
    add_file_logging(os.path.dirname(output_file), logger, 'bpnet-contrib')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPUs
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    config = read_json(os.path.join(model_dir, 'config.gin.json'))
    seq_width = config['seq_width']
    peak_width = config['seq_width']
    # NOTE - seq_width has to be the same for the input and the target

    # Split the contribution-score wildcards
    contrib_wildcards = contrib_wildcard.split(",")

    # Allow chromosome inclusion / exclusion
    exclude_chr = exclude_chr.split(",") if exclude_chr else None
    include_chr = include_chr.split(",") if include_chr else None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    logger.info("Creating the dataset")
    from bpnet.datasets import StrandedProfile, SeqClassification
    if fasta_file is not None:
        if regions is None:
            raise ValueError("fasta_file specified. Expecting regions to be specified as well")
        dl_valid = SeqClassification(fasta_file=fasta_file,
                                     intervals_file=regions,
                                     incl_chromosomes=include_chr,
                                     excl_chromosomes=exclude_chr,
                                     auto_resize_len=seq_width)
        chrom_sizes = _chrom_sizes(fasta_file)
    else:
        if dataspec is None:
            logger.info("Using the dataspec used to train the model")
            dataspec = model_dir / "dataspec.yml"
        ds = DataSpec.load(dataspec)
        dl_valid = StrandedProfile(ds,
                                   incl_chromosomes=include_chr,
                                   excl_chromosomes=exclude_chr,
                                   intervals_file=regions,
                                   peak_width=peak_width,
                                   shuffle=False,
                                   seq_width=seq_width)
        chrom_sizes = _chrom_sizes(ds.fasta_file)

    # Set up contribution score trimming (not required currently)
    if seq_width > peak_width:
        # make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names and
    # make sure they match the specified glob
    intp_names = [name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
                  if fnmatch_any(name, contrib_wildcards)]
    logger.info("Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    if max_regions is not None:
        if len(dl_valid) > max_regions:
            logging.info(f"Using {max_regions} regions instead of the original {len(dl_valid)}")
        else:
            logging.info(f"--max-regions={max_regions} is larger than the dataset size: {len(dl_valid)}. "
                         "Using the dataset size for max-regions")
            max_regions = len(dl_valid)
    else:
        max_regions = len(dl_valid)

    max_batches = int(np.ceil(max_regions / batch_size))

    writer = HDF5BatchWriter(output_file, chunk_size=storage_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                                       shuffle=shuffle_regions,
                                                       num_workers=num_workers),
                                   total=max_batches)):
        # store the original batch containing 'inputs' and 'targets'
        if skip_bias:
            batch['inputs'] = {'seq': batch['inputs']['seq']}  # ignore all other inputs

        if max_batches > 0:
            if i > max_batches:
                break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])

        for name in intp_names:
            hyp_contrib = seqmodel.contrib_score(batch['inputs']['seq'],
                                                 name=name,
                                                 method=method,
                                                 batch_size=None)  # don't second-batch
            # put the contribution scores into the dictionary, trimming them
            # so that the output is always w.r.t. the peak center
            batch[f"/hyp_contrib/{name}"] = hyp_contrib[:, trim_start:trim_end]

        # Trim the sequence as well
        batch['inputs']['seq'] = batch['inputs']['seq'][:, trim_start:trim_end]

        # ? maybe it would be better to have an explicit ContribFileWriter;
        # that way the written schema would be fixed
        writer.batch_write(batch)

    # add chromosome sizes
    writer.f.attrs['chrom_sizes'] = json.dumps(chrom_sizes)
    writer.close()
    logger.info(f"Done. Contribution score file was saved to: {output_file}")
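# Hedged usage sketch for `bpnet_contrib` (not from the original source; the
# model directory and output paths are hypothetical):
def _example_bpnet_contrib():
    # Compute */profile/wn contribution scores for a trained model on CPU,
    # capping the number of regions and dropping the bias inputs.
    bpnet_contrib(model_dir="output/bpnet_model",
                  output_file="output/bpnet_model/contrib.h5",
                  method="grad",
                  contrib_wildcard='*/profile/wn',
                  max_regions=10000,
                  gpu=None,  # gpu=None -> run on CPU
                  skip_bias=True)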
def bpnet_model(tasks,
                filters,
                n_dil_layers,
                conv1_kernel_size,
                tconv_kernel_size,
                b_loss_weight=1,
                c_loss_weight=1,
                p_loss_weight=1,
                c_splines=0,
                b_splines=20,
                merge_profile_reg=False,
                lr=0.004,
                tracks_per_task=2,
                padding='same',
                batchnorm=False,
                use_bias=False,
                n_bias_tracks=2,
                profile_metric=None,
                count_metric=None,
                profile_bias_window_sizes=[1, 50],
                seqlen=None,
                skip_type='residual'):
    """Set up the BPNet model architecture

    Args:
      tasks: list of tasks
      filters: number of convolutional filters to use at each layer
      n_dil_layers: number of dilated convolutional layers
      conv1_kernel_size: kernel_size of the first convolutional layer
      tconv_kernel_size: kernel_size of the transposed/de-convolutional final layer
      b_loss_weight: binary classification weight
      c_loss_weight: total count regression weight
      p_loss_weight: profile regression weight
      c_splines: number of splines to use in the total count regression head (0=None)
      b_splines: number of splines to use in the binary classification output head
      merge_profile_reg: if True, total count and profile prediction will be
        part of a single profile output head
      lr: learning rate of the Adam optimizer
      tracks_per_task: number of output tracks (e.g. strands) per task
      padding: padding in the convolutional layers
      batchnorm: if True, add Batchnorm after every layer. Note: this may mess up the
        DeepLIFT contribution scores downstream
      use_bias: if True, correct for the bias
      n_bias_tracks: how many bias tracks to expect (for both total count and
        profile regression)
      profile_metric: metric for the profile head (default: default_peak_pred_metric)
      count_metric: metric for the total count head (default: RegressionMetrics())
      profile_bias_window_sizes: moving-average window sizes for the profile bias net
      seqlen: sequence length
      skip_type: skip connection type ('residual' or 'dense')

    Returns:
      bpnet.seqmodel.SeqModel
    """
    from keras.optimizers import Adam
    from bpnet.seqmodel import SeqModel
    from bpnet.layers import DilatedConv1D, DeConv1D, GlobalAvgPoolFCN, MovingAverages
    from bpnet.metrics import BPNetMetricSingleProfile, default_peak_pred_metric
    from bpnet.heads import ScalarHead, ProfileHead
    from bpnet.metrics import ClassificationMetrics, RegressionMetrics
    from bpnet.losses import multinomial_nll, CountsMultinomialNLL
    import bpnet.losses as bloss
    from bpnet.activations import clipped_exp
    from bpnet.functions import softmax

    assert p_loss_weight >= 0
    assert c_loss_weight >= 0
    assert b_loss_weight >= 0

    # TODO is it possible to re-instantiate the class to get rid of gin train?
    if profile_metric is None:
        print("Using the default profile prediction metric")
        profile_metric = default_peak_pred_metric

    if count_metric is None:
        print("Using the default regression prediction metrics")
        count_metric = RegressionMetrics()

    # Heads -------------------------------------------------
    heads = []
    # Profile prediction
    if p_loss_weight > 0:
        if not merge_profile_reg:
            heads.append(ProfileHead(target_name='{task}/profile',
                                     net=DeConv1D(n_tasks=tracks_per_task,
                                                  filters=filters,
                                                  tconv_kernel_size=tconv_kernel_size,
                                                  padding=padding,
                                                  n_hidden=0,
                                                  batchnorm=batchnorm),
                                     loss=multinomial_nll,
                                     loss_weight=p_loss_weight,
                                     postproc_fn=softmax,
                                     use_bias=use_bias,
                                     bias_input='bias/{task}/profile',
                                     bias_shape=(None, n_bias_tracks),
                                     bias_net=MovingAverages(window_sizes=profile_bias_window_sizes),
                                     metric=profile_metric))
        else:
            heads.append(ProfileHead(target_name='{task}/profile',
                                     net=DeConv1D(n_tasks=tracks_per_task,
                                                  filters=filters,
                                                  tconv_kernel_size=tconv_kernel_size,
                                                  padding=padding,
                                                  n_hidden=1,  # use 1 hidden layer in that case
                                                  batchnorm=batchnorm),
                                     activation=clipped_exp,
                                     loss=CountsMultinomialNLL(c_task_weight=c_loss_weight),
                                     loss_weight=p_loss_weight,
                                     bias_input='bias/{task}/profile',
                                     use_bias=use_bias,
                                     bias_shape=(None, n_bias_tracks),
                                     bias_net=MovingAverages(window_sizes=profile_bias_window_sizes),
                                     metric=BPNetMetricSingleProfile(count_metric=count_metric,
                                                                     profile_metric=profile_metric)))
            c_loss_weight = 0  # the count loss is already part of the profile head

    # Count regression
    if c_loss_weight > 0:
        heads.append(ScalarHead(target_name='{task}/counts',
                                net=GlobalAvgPoolFCN(n_tasks=tracks_per_task,
                                                     n_splines=c_splines,
                                                     batchnorm=batchnorm),
                                activation=None,
                                loss='mse',
                                loss_weight=c_loss_weight,
                                bias_input='bias/{task}/counts',
                                use_bias=use_bias,
                                bias_shape=(n_bias_tracks, ),
                                metric=count_metric,
                                ))

    # Binary classification
    if b_loss_weight > 0:
        heads.append(ScalarHead(target_name='{task}/class',
                                net=GlobalAvgPoolFCN(n_tasks=1,
                                                     n_splines=b_splines,
                                                     batchnorm=batchnorm),
                                activation='sigmoid',
                                loss='binary_crossentropy',
                                loss_weight=b_loss_weight,
                                metric=ClassificationMetrics(),
                                ))
    # -------------------------------------------------
    m = SeqModel(
        body=DilatedConv1D(filters=filters,
                           conv1_kernel_size=conv1_kernel_size,
                           n_dil_layers=n_dil_layers,
                           padding=padding,
                           batchnorm=batchnorm,
                           skip_type=skip_type),
        heads=heads,
        tasks=tasks,
        optimizer=Adam(lr=lr),
        seqlen=seqlen,
    )
    return m
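# Hedged usage sketch for `bpnet_model` (illustrative values, not from the
# original source):
def _example_bpnet_model():
    # Instantiate a profile + total-count model for three TF tasks and
    # disable the binary classification head.
    m = bpnet_model(tasks=['Oct4', 'Sox2', 'Nanog'],
                    filters=64,
                    n_dil_layers=9,
                    conv1_kernel_size=25,
                    tconv_kernel_size=25,
                    b_loss_weight=0,   # no classification head
                    c_loss_weight=10,  # upweight total-count regression
                    p_loss_weight=1,
                    use_bias=False,
                    seqlen=1000)
    m.model.summary()
    return m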