def test_exon_model_masking():
    model = MMSplice()
    preds = [
        model.exonM.predict(encodeDNA(['AAA']))[0][0],
        model.exonM.predict(encodeDNA(['AAA', 'CATACA']))[0][0],
        model.exonM.predict(encodeDNA(['AAA', 'CATACAGGAA']))[0][0]
    ]
    for i in preds:
        assert abs(preds[0] - i) < 1e-6
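# encodeDNA pads every sequence in a batch to the length of the longest one
# using all-zero rows, so the test above checks that this padding is masked
# out by the exon model. A minimal sketch of the padding behaviour:
from concise.preprocessing import encodeDNA

batch = encodeDNA(['AAA', 'CATACA'])
assert batch.shape == (2, 6, 4)  # both sequences padded to length 6
assert batch[0].sum() == 3       # 'AAA' has 3 one-hot rows; padding rows are all zeros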
def create_patterns(motif_seqs):
    patterns = [
        Pattern(seq=encodeDNA([s])[0],
                contrib=dict(a=encodeDNA([s])[0]),
                hyp_contrib=dict(a=encodeDNA([s])[0]),
                name=str(i))
        for i, s in enumerate(motif_seqs)
    ]
    # align every pattern to the first one, padding with the uniform base distribution
    aligned_patterns = [
        p.align(patterns[0], pad_value=np.array([0.25] * 4))
        for p in patterns
    ]
    return patterns, aligned_patterns
def sim_pred(self, central_motif, side_motif=None, side_distances=[],
             repeat=128, importance=[]):
    """Embed two motifs in random sequences and obtain their average predictions.

    Args:
        importance: list of importance scores
    """
    from basepair.exp.chipnexus.simulate import generate_seq, average_profiles, flatten
    batch_size = repeat
    seqlen = self.seqmodel.seqlen
    tasks = self.seqmodel.tasks

    # simulate sequence
    seqs = encodeDNA([generate_seq(central_motif,
                                   side_motif=side_motif,
                                   side_distances=side_distances,
                                   seqlen=seqlen)
                      for i in range(repeat)])

    # get predictions
    scaled_preds = self.predict(seqs, batch_size=batch_size)

    if importance:
        # get the importance scores (compute only the profile and counts importance)
        imp_scores_all = self.seqmodel.imp_score_all(
            seqs, intp_pattern=['*/profile/wn', '*/counts/pre-act'])
        imp_scores = {t: {self._get_old_imp_score_name(imp_score_name):
                          seqs * imp_scores_all[f'{t}/{imp_score_name}']
                          for imp_score_name in importance}
                      for t in tasks}
        # merge and aggregate the profiles
        out = {"imp": imp_scores, "profile": scaled_preds}
    else:
        out = {"profile": scaled_preds}
    return average_profiles(flatten(out, "/"))
def sim_pred(self, central_motif, side_motif=None, side_distances=[],
             repeat=128, importance=[]):
    """Embed two motifs in random sequences and obtain their average predictions.

    Args:
        importance: list of importance scores
    """
    # TODO - update?
    from basepair.exp.chipnexus.simulate import generate_seq, postproc, average_profiles, flatten
    batch_size = repeat
    seqlen = self.input_seqlen()
    tasks = self.tasks

    # simulate sequence
    seqs = encodeDNA([generate_seq(central_motif,
                                   side_motif=side_motif,
                                   side_distances=side_distances,
                                   seqlen=seqlen)
                      for i in range(repeat)])

    # get predictions
    preds = self.model.predict(seqs, batch_size=batch_size)
    # TODO - remove this and use model.predict instead
    scaled_preds = postproc(preds, tasks)

    if importance:
        # get the importance scores
        imp_scores = self.seq_importance(seqs, importance)
        # merge and aggregate the profiles
        out = {"imp": imp_scores, "profile": scaled_preds}
    else:
        out = scaled_preds
    return average_profiles(flatten(out, "/"))
def sim_pred(self, central_motif, side_motif=None, side_distances=[],
             repeat=128, contribution=[]):
    """Embed two motifs in random sequences and obtain their average predictions.

    Args:
        contribution: list of contribution scores
    """
    from bpnet.simulate import generate_seq, average_profiles, flatten
    batch_size = repeat
    seqlen = self.seqmodel.seqlen
    tasks = self.seqmodel.tasks

    # simulate sequence
    seqs = encodeDNA([generate_seq(central_motif,
                                   side_motif=side_motif,
                                   side_distances=side_distances,
                                   seqlen=seqlen)
                      for i in range(repeat)])

    # get predictions
    scaled_preds = self.predict(seqs, batch_size=batch_size)

    if contribution:
        # get the contribution scores (compute only the profile and counts contribution)
        contrib_scores_all = self.seqmodel.contrib_score_all(
            seqs, intp_pattern=['*/profile/wn', '*/counts/pre-act'])
        contrib_scores = {t: {self._get_old_contrib_score_name(contrib_score_name):
                              seqs * contrib_scores_all[f'{t}/{contrib_score_name}']
                              for contrib_score_name in contribution}
                          for t in tasks}
        # merge and aggregate the profiles
        out = {"contrib": contrib_scores, "profile": scaled_preds}
    else:
        out = {"profile": scaled_preds}
    return average_profiles(flatten(out, "/"))
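# A hedged usage sketch for sim_pred; `bpnet` stands for an already-loaded
# model wrapper exposing the method above, and the motif strings / distance
# below are purely illustrative:
avg = bpnet.sim_pred("TTTATCA",             # central motif
                     side_motif="GATAAG",   # optional second motif
                     side_distances=[30],   # where to place the side motif
                     repeat=64)
# `avg` maps flattened keys (joined with "/") to profiles averaged over the
# 64 simulated sequences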
def random_seq_onehot(l):
    """Generate a random one-hot-encoded DNA sequence.

    Args:
        l: sequence length
    """
    from concise.preprocessing import encodeDNA
    return encodeDNA([''.join(random.choices("ACGT", k=int(l)))])[0]
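# Minimal usage sketch for random_seq_onehot (assumes `random` is imported at
# module level, as the function body relies on it):
onehot = random_seq_onehot(10)
assert onehot.shape == (10, 4)            # one row per base, one column per letter
assert (onehot.sum(axis=-1) == 1).all()   # every position is exactly one of A/C/G/T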
def split(self, x, overhang):
    '''x: a sequence to split'''
    intronl_len, intronr_len = overhang
    # pad with N if the left sequence is not long enough
    lackl = self.acceptor_intron_len - intronl_len
    if lackl >= 0:
        x = "N" * (lackl + 1) + x
        intronl_len += lackl + 1
    lackr = self.donor_intron_len - intronr_len
    if lackr >= 0:
        x = x + "N" * (lackr + 1)
        intronr_len += lackr + 1

    acceptor_intron = x[:intronl_len - self.acceptor_intron_cut]
    acceptor = x[(intronl_len - self.acceptor_intron_len):(intronl_len + self.acceptor_exon_len)]
    exon = x[(intronl_len + self.exon_cut_l):(-intronr_len - self.exon_cut_r)]
    donor = x[(-intronr_len - self.donor_exon_len):(-intronr_len + self.donor_intron_len)]
    donor_intron = x[-intronr_len + self.donor_intron_cut:]

    if donor[self.donor_exon_len:self.donor_exon_len + 2] != "GT":
        warnings.warn("Non-GT donor", UserWarning)
    if acceptor[self.acceptor_intron_len - 2:self.acceptor_intron_len] != "AG":
        warnings.warn("Non-AG acceptor", UserWarning)

    if self.encode:
        return {
            "acceptor_intron": encodeDNA([acceptor_intron]),
            "acceptor": encodeDNA([acceptor]),
            "exon": encodeDNA([exon]),
            "donor": encodeDNA([donor], maxlen=18),
            "donor_intron": encodeDNA([donor_intron])
        }
    else:
        return {
            "acceptor_intron": acceptor_intron,
            "acceptor": acceptor,
            "exon": exon[:self.maxExonLength],
            "donor": donor,
            "donor_intron": donor_intron
        }
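# How split() carves an overhanged exon into the five module inputs
# (lowercase = intron, uppercase = exon). The exact boundaries depend on the
# *_len / *_cut attributes, so this is only a schematic:
#
#   iiiiiiiiiiii EEEEEEEEEEEE iiiiiiiiiiii
#   [acceptor_intron]
#          [acceptor]                       # intron end + exon start
#                [   exon   ]
#                      [donor]              # exon end + intron start
#                           [donor_intron]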
def prepare_data(dt, features, response, sequence, id_column=None,
                 seq_align="end", trim_seq_len=None):
    """
    Prepare data for Concise.train or ConciseCV.train.

    Args:
        dt: A pandas DataFrame containing all the required data.
        features (List of strings): Column names of `dt` used to produce the
            features design matrix. These columns should be numeric.
        response (str or list of strings): Name(s) of column(s) used as a
            response variable.
        sequence (str): Name of the column storing the DNA/RNA sequences.
        id_column (str): Name of the column used as the row identifier.
        seq_align (str): one of ``{"start", "end"}``. To which end should we
            align sequences?
        trim_seq_len (int): Consider only the first `trim_seq_len` bases of
            each sequence when generating the sequence design matrix. If
            :python:`None`, set :py:attr:`trim_seq_len` to the longest
            sequence length, hence whole sequences are considered.

    Returns:
        tuple: Tuple with elements :code:`(X_feat, X_seq, y, id_vec)`, where:

        - :py:attr:`X_feat`: features design matrix of shape :code:`(N, D)`,
          where N is :code:`len(dt)` and :code:`D = len(features)`
        - :py:attr:`X_seq`: sequence matrix of shape
          :code:`(N, trim_seq_len, 4)`. It represents the 1-hot encoding of
          the DNA/RNA sequence.
        - :py:attr:`y`: Response variable 1-column matrix of shape :code:`(N, 1)`
        - :py:attr:`id_vec`: 1D character array of shape :code:`(N)`. It
          represents the IDs of individual rows.

    Note:
        One-hot encoding of the DNA/RNA sequence is the following:

        .. code:: python

            {
                "A": np.array([1, 0, 0, 0]),
                "C": np.array([0, 1, 0, 0]),
                "G": np.array([0, 0, 1, 0]),
                "T": np.array([0, 0, 0, 1]),
                "U": np.array([0, 0, 0, 1]),
                "N": np.array([0, 0, 0, 0]),
            }
    """
    if type(response) is str:
        response = [response]

    X_feat = np.array(dt[features], dtype="float32")
    y = np.array(dt[response], dtype="float32")
    X_seq = encodeDNA(seq_vec=dt[sequence],
                      maxlen=trim_seq_len,
                      seq_align=seq_align)
    X_seq = np.array(X_seq, dtype="float32")
    if id_column is not None:
        id_vec = np.array(dt[id_column])
    else:
        # fall back to the row index (indexing dt with None would fail)
        id_vec = np.array(dt.index)
    return X_feat, X_seq, y, id_vec
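# A toy invocation of prepare_data; all column names below are made up for
# illustration:
import pandas as pd

toy = pd.DataFrame({"gc": [0.4, 0.6],
                    "y": [1.0, 0.0],
                    "seq": ["ACGT", "ACG"],
                    "id": ["r1", "r2"]})
X_feat, X_seq, y, id_vec = prepare_data(toy, features=["gc"], response="y",
                                        sequence="seq", id_column="id",
                                        seq_align="end", trim_seq_len=4)
assert X_feat.shape == (2, 1)
assert X_seq.shape == (2, 4, 4)
assert y.shape == (2, 1)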
def test_interpret_wo_bias():
    from bpnet.metrics import RegressionMetrics, ClassificationMetrics, PeakPredictionProfileMetric
    from concise.preprocessing import encodeDNA

    # test the model
    seqs = encodeDNA(['ACAGA'] * 100)
    inputs = {"seq": seqs,
              "bias/a/profile": np.random.randn(100, 5, 2)}
    # Let's use regression
    targets = {
        "a/class": np.random.randint(low=0, high=2, size=(100, 1)).astype(float),
        "a/counts": 1 + np.ceil(np.abs(np.random.randn(100))),
        "a/profile": 1 + np.ceil(np.abs(np.random.randn(100, 5, 2))),
    }
    import keras.backend as K
    # K.clear_session()

    # use bias
    m = SeqModel(
        body=BaseNet('relu'),
        heads=[
            BinaryClassificationHead('{task}/class',
                                     net=TopDense(pool_size=2),
                                     use_bias=False),
            ScalarHead('{task}/counts',
                       loss='mse',
                       metric=RegressionMetrics(),
                       net=TopDense(pool_size=2),
                       use_bias=False),
            ProfileHead('{task}/profile',
                        loss='mse',
                        metric=PeakPredictionProfileMetric(neg_max_threshold=0.05,
                                                           required_min_pos_counts=0),
                        net=TopConv(n_output=2),
                        use_bias=True,
                        # NOTE: the shape currently has to be hard-coded to the sequence length
                        bias_shape=(5, 2)),
        ],
        tasks=['a']
    )
    m.model.fit(inputs, targets)

    o = m.contrib_score_all(seqs)
    assert 'a/profile/wn' in o
    assert o['a/profile/wn'].shape == seqs.shape

    # evaluate the dataset -> set up an array dataset (NumpyDataset)
    from bpnet.data import NumpyDataset
    ds = NumpyDataset({"inputs": inputs, "targets": targets})
    o = m.evaluate(ds)
    assert 'avg/counts/mad' in o
def split(self, x, intronl_len=100, intronr_len=80):
    '''x: a sequence to split'''
    # pad with N if the left sequence is not long enough
    lackl = self.acceptor_intron_len - intronl_len
    if lackl >= 0:
        x = "N" * (lackl + 1) + x
        intronl_len += lackl + 1
    lackr = self.donor_intron_len - intronr_len
    if lackr >= 0:
        x = x + "N" * (lackr + 1)
        intronr_len += lackr + 1

    acceptor_intron = x[:intronl_len - self.acceptor_intron_cut]
    acceptor = x[(intronl_len - self.acceptor_intron_len):(intronl_len + self.acceptor_exon_len)]
    exon = x[(intronl_len + self.exon_cut_l):(-intronr_len - self.exon_cut_r)]
    donor = x[(-intronr_len - self.donor_exon_len):(-intronr_len + self.donor_intron_len)]
    donor_intron = x[-intronr_len + self.donor_intron_cut:]

    if self.pattern_warning:
        if donor[self.donor_exon_len:self.donor_exon_len + 2] != "GT":
            warnings.warn("Non-GT donor", UserWarning)
        if acceptor[self.acceptor_intron_len - 2:self.acceptor_intron_len] != "AG":
            warnings.warn("Non-AG acceptor", UserWarning)

    if len(exon) == 0:
        exon = 'N'

    return {
        "acceptor_intron": encodeDNA([acceptor_intron]),
        "acceptor": encodeDNA([acceptor]),
        "exon": encodeDNA([exon]),
        "donor": encodeDNA([donor]),
        "donor_intron": encodeDNA([donor_intron])
    }
def load(split="train"):
    dt = pd.read_csv(DATA_DIR + "/PUM2_{0}.csv".format(split))

    # DNA/RNA sequence
    xseq = encodeDNA(dt.seq, maxlen=seq_length, seq_align='center')

    # response variable
    y = dt.binding_site.values.reshape((-1, 1)).astype("float")

    if split == "train":
        from concise.data import attract
        # also return the pwm_list
        pwm_list = attract.get_pwm_list(["129"])
        return {"seq": xseq}, y, pwm_list
    else:
        return {"seq": xseq}, y
def predict_on_batch(self, x, **kwargs):
    '''Use when loading batches whose sequences are already split.
    Sequences of various lengths are padded to the same length here;
    x is a batch of un-encoded sequences, which need to be encoded
    for the collate function to work.
    '''
    fts = x['seq']
    acceptor_intron = encodeDNA(fts['acceptor_intron'].tolist(), seq_align="end")
    acceptor = encodeDNA(fts['acceptor'].tolist(), seq_align="end")
    exon = encodeDNA(fts['exon'].tolist(), seq_align="end")
    donor = encodeDNA(fts['donor'].tolist(), seq_align="end")
    donor_intron = encodeDNA(fts['donor_intron'].tolist(), seq_align="end")
    score = np.concatenate([
        self.acceptor_intronM.predict(acceptor_intron),
        logit(self.acceptorM.predict(acceptor)),
        self.exonM.predict(exon),
        logit(self.donorM.predict(donor)),
        self.donor_intronM.predict(donor_intron)
    ], axis=1)
    return score
def predict(self, seq, overhang=(100, 100)):
    """Perform prediction on an exon sequence string with intronic overhangs.

    Args:
        seq (str): exon sequence including the overhangs.
        overhang (Tuple[int, int]): overhang lengths of the sequence.

    Returns:
        np.array of modular predictions as
        [[acceptor_intron, acceptor, exon, donor, donor_intron]].
    """
    batch = self.spliter.split(seq, overhang)
    batch = {k: encodeDNA([v]) for k, v in batch.items()}
    return self.predict_on_batch(batch)[0]
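# Hedged usage sketch, assuming `model` is an MMSplice-like instance whose
# spliter was configured with matching overhangs; the toy sequence below will
# trigger the non-GT/non-AG pattern warnings, which is fine for a sketch:
exon_with_overhang = "N" * 100 + "ATGGCAGGTA" + "N" * 100
scores = model.predict(exon_with_overhang, overhang=(100, 100))
# scores holds one value per module, ordered
# [acceptor_intron, acceptor, exon, donor, donor_intron]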
def split(self, x, overhang):
    '''x: a sequence to split'''
    intronl_len, intronr_len = overhang
    # pad with N if the left sequence is not long enough
    lackl = self.acceptor_intron_len - intronl_len
    if lackl >= 0:
        x = "N" * (lackl + 1) + x
        intronl_len += lackl + 1
    lackr = self.donor_intron_len - intronr_len
    if lackr >= 0:
        x = x + "N" * (lackr + 1)
        intronr_len += lackr + 1

    acceptor_intron = x[:intronl_len - self.acceptor_intron_cut]
    acceptor_start = intronl_len - self.acceptor_intron_len
    acceptor_end = intronl_len + self.acceptor_exon_len
    acceptor = x[acceptor_start:acceptor_end]
    exon_start = intronl_len + self.exon_cut_l
    exon_end = -intronr_len - self.exon_cut_r
    exon = x[exon_start:exon_end]
    donor_start = -intronr_len - self.donor_exon_len
    donor_end = -intronr_len + self.donor_intron_len
    donor = x[donor_start:donor_end]
    donor_intron = x[-intronr_len + self.donor_intron_cut:]

    if donor[self.donor_exon_len:self.donor_exon_len + 2] != "GT":
        warnings.warn("Non-GT donor", UserWarning)
    if acceptor[self.acceptor_intron_len - 2:self.acceptor_intron_len] != "AG":
        warnings.warn("Non-AG acceptor", UserWarning)

    splits = {
        "acceptor_intron": acceptor_intron,
        "acceptor": acceptor,
        "exon": exon,
        "donor": donor,
        "donor_intron": donor_intron
    }
    if self.encode:
        return {k: encodeDNA([v]) for k, v in splits.items()}
    return splits
def extract_seq(interval, variant, fasta_file, one_hot=False):
    """
    Note: in case the variant is an indel, the anchor point at the beginning is used

    Args:
        interval: pybedtools.Interval from which to extract the sequence
        variant: Variant class with attributes: chr, pos, ref, alt
        fasta_file: file path or pysam.FastaFile instance
        one_hot: if True, one-hot-encode the output sequence

    Returns:
        sequence
    """
    if isinstance(fasta_file, str):
        from pysam import FastaFile
        fasta_file = FastaFile(fasta_file)

    if variant is not None and variant.pos - 1 >= interval.start and variant.pos <= interval.stop:
        inside = True
        lendiff = len(variant.alt) - len(variant.ref)
    else:
        inside = False
        lendiff = 0

    seq = fasta_file.fetch(str(interval.chrom), interval.start, interval.stop - lendiff)

    if not inside:
        out = seq
    else:
        # now, mutate the sequence
        pos = variant.pos - interval.start - 1
        expect_ref = seq[pos:(pos + len(variant.ref))]
        if expect_ref != variant.ref:
            raise ValueError(
                f"Expected reference: {expect_ref}, observed reference: {variant.ref}")
        # anchor at the beginning
        out = seq[:pos] + variant.alt + seq[(pos + len(variant.ref)):]
        # the sequence length has to be correct at the end
        assert len(out) == interval.stop - interval.start

    if one_hot:
        out = encodeDNA([out.upper()])[0]
    return out
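# A minimal end-to-end sketch for extract_seq using a tiny on-disk FASTA;
# `Variant` here is a hypothetical stand-in for any object with
# chr/pos/ref/alt attributes (pos is 1-based):
import collections
import pysam
import pybedtools

Variant = collections.namedtuple("Variant", ["chr", "pos", "ref", "alt"])

with open("toy.fa", "w") as fh:
    fh.write(">chr1\nACGTACGTACGT\n")
pysam.faidx("toy.fa")  # build the .fai index required by FastaFile

interval = pybedtools.Interval("chr1", 2, 8)            # 0-based half-open: GTACGT
variant = Variant(chr="chr1", pos=5, ref="A", alt="G")  # 1-based position
assert extract_seq(interval, variant, "toy.fa") == "GTGCGT"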
def test_output_files_model_w_bias(trained_model_w_bias):
    K.clear_session()
    output_files = os.listdir(str(trained_model_w_bias))
    expected_files = [
        'config.gin',
        'config.gin.json',
        'bpnet-train.kwargs.json',
        'dataspec.yml',
        'evaluate.ipynb',
        'evaluate.html',
        'evaluation.valid.json',
        'history.csv',
        'model.h5',
        'seq_model.pkl',
        'note_params.json',
    ]
    for f in expected_files:
        assert f in output_files

    m = SeqModel.load(trained_model_w_bias / 'seq_model.pkl')
    m.predict(encodeDNA(["A" * 200]))
def __getitem__(self, idx):
    if self.fasta_extractor is None:
        self.fasta_extractor = Fasta(self.fasta_file)

    interval = self.bt[idx]
    interval_fasta_id = self._interval_to_fasta_id(interval)

    if self.targets is not None:
        y = self.targets.iloc[idx].values
    else:
        y = {}

    # run the fasta extractor
    start, end = self._compute_relative_coords(interval)
    record = self.fasta_extractor[interval_fasta_id]
    seq = record[start:end].seq

    return {
        "inputs": encodeDNA([seq]).squeeze(),
        "targets": y,
        "metadata": {
            "ranges": GenomicRanges.from_interval(interval)
        }
    }
def test_tf_model():
    tf.reset_default_graph()
    input_nodes = "inputs"
    target_nodes = "preds"
    meta_graph = "model_files/model.tf.meta"
    # meta_graph = 'model_files/model.tf-modified.meta'
    checkpoint = "model_files/model.tf"
    index = "model_files/model.tf.index"
    pkl_file = "model_files/const_feed_dict.pkl"

    from kipoi.model import TensorFlowModel
    m = TensorFlowModel(input_nodes="inputs",
                        target_nodes="preds",
                        meta_graph=meta_graph,
                        checkpoint=checkpoint,
                        const_feed_dict_pkl=pkl_file)
    ops = tf.get_default_graph().get_operations()
    # TODO - modify the
    out = tf.train.export_meta_graph(filename='model_files/model.tf-modified.meta',
                                     as_text=True)
    ops[0].outputs[0].shape[0] = None
    pops = [op.outputs[0] for op in ops
            if op.type == "Placeholder" and op.name.startswith("Placeholder")]
    m.input_ops   # view shapes of the data
    m.target_ops

    from concise.preprocessing import encodeDNA
    x = encodeDNA(["T" * m.input_ops.shape[1].value] * 2).astype("float32")
    out = m.predict_on_batch(x)
def _encode_seq(self, seq):
    return {k: encodeDNA([v]) for k, v in seq.items()}
def data_extended(rbp_name, n_bases=10,
                  pos_class_weight=1.0,
                  scale="sign_log",  # or "nat"
                  pos_as_track=False,
                  valid_chr=[1, 3],
                  test_chr=[2, 4, 6, 8, 10]):
    """
    pos_class_weight: positive class weight
    """
    dt_train, dt_valid, dt_test = data_split(rbp_name + "_extended", valid_chr, test_chr)

    seq_train = encodeDNA(dt_train.seq.tolist())
    seq_valid = encodeDNA(dt_valid.seq.tolist())
    seq_test = encodeDNA(dt_test.seq.tolist())
    seq_length = seq_train.shape[1]

    # impute missing values (not part of the pipeline, as the Imputer lacks
    # an inverse_transform method)
    imp = Imputer(strategy="median")
    imp.fit(pd.concat([dt_train[POS_FEATURES], dt_valid[POS_FEATURES]]))
    dt_train[POS_FEATURES] = imp.transform(dt_train[POS_FEATURES])
    dt_valid[POS_FEATURES] = imp.transform(dt_valid[POS_FEATURES])
    dt_test[POS_FEATURES] = imp.transform(dt_test[POS_FEATURES])

    if scale == "sign_log":
        preproc_pipeline = make_pipeline(
            FunctionTransformer(func=sign_log_func,
                                inverse_func=sign_log_func_inverse),
            MinMaxScaler()
        )
    elif scale == "nat":
        preproc_pipeline = make_pipeline(
            MinMaxScaler()
        )
    else:
        raise ValueError("scale argument invalid")

    dtx_train = np.array(dt_train[POS_FEATURES])
    dtx_valid = np.array(dt_valid[POS_FEATURES])
    dtx_test = np.array(dt_test[POS_FEATURES])

    def melt_array(arr, seq_length):
        """3-dim -> 2-dim transform

        arr = np.arange(12).reshape((2, 2, 3))
        assert np.all(unmelt_array(melt_array(arr, 3), 3) == arr)
        """
        arr = np.transpose(arr, (0, 2, 1))
        assert arr.shape[2] == len(POS_FEATURES)
        assert arr.shape[1] == seq_length
        return arr.reshape((-1, len(POS_FEATURES)))

    def unmelt_array(arr, seq_length):
        arr = arr.reshape((-1, seq_length, len(POS_FEATURES)))
        return np.transpose(arr, (0, 2, 1))

    if pos_as_track:
        dtx_train = melt_array(expand_positions(dtx_train, seq_length), seq_length)
        dtx_valid = melt_array(expand_positions(dtx_valid, seq_length), seq_length)
        dtx_test = melt_array(expand_positions(dtx_test, seq_length), seq_length)

    # transform pos features
    preproc_pipeline.fit(np.concatenate([dtx_train, dtx_valid]))
    train_pos = preproc_pipeline.transform(dtx_train)
    valid_pos = preproc_pipeline.transform(dtx_valid)
    test_pos = preproc_pipeline.transform(dtx_test)

    def create_feature_dict(arr, seq_length, pos_as_track):
        if pos_as_track:
            arr = unmelt_array(arr, seq_length)
        else:
            arr = arr[..., np.newaxis]
        # (batch, seq_length / 1, 1)
        raw_dist = {"raw_dist_" + k: arr[:, i][..., np.newaxis]
                    for i, k in enumerate(POS_FEATURES)}
        # (batch, seq_length / 1, default number of splines)
        dist = {"dist_" + k: encodeSplines(arr[:, i], start=0, end=1)
                for i, k in enumerate(POS_FEATURES)}
        # add also the merged version - last dimension = features
        # (batch, seq_length / 1, n_features)
        raw_dist_all = np.concatenate([raw_dist["raw_dist_" + k] for k in POS_FEATURES],
                                      axis=-1)
        return {**raw_dist, **dist, **{"raw_dist_all": raw_dist_all}}

    train_dist = create_feature_dict(train_pos, seq_length, pos_as_track)
    valid_dist = create_feature_dict(valid_pos, seq_length, pos_as_track)
    test_dist = create_feature_dict(test_pos, seq_length, pos_as_track)

    x_train = {"seq": seq_train, **train_dist}
    x_valid = {"seq": seq_valid, **valid_dist}
    x_test = {"seq": seq_test, **test_dist}

    # y
    y_train = dt_train.binding_site.values.reshape((-1, 1)).astype("float")
    y_valid = dt_valid.binding_site.values.reshape((-1, 1)).astype("float")
    y_test = dt_test.binding_site.values.reshape((-1, 1)).astype("float")

    sample_weight = np.squeeze(np.where(y_train == 1, pos_class_weight, 1), -1)

    return (x_train, y_train, sample_weight, POS_FEATURES, preproc_pipeline), \
        (x_valid, y_valid), \
        (x_test, y_test)
import h5py
import pandas as pd
from concise.preprocessing import encodeDNA

df = pd.read_pickle("human_utrs_result.pkl")

top_n = 2000
inputs = encodeDNA(df.utr)[:top_n]
preds = df.retrained_pred.values.reshape((-1, 1))[:top_n]

fw = h5py.File("expect.human_utrs.h5", 'w')
fw.create_dataset('/inputs', data=inputs)
fw.create_dataset('/preds', data=preds)
fw.flush()
fw.close()
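# Sanity check: read the file back and confirm the shapes (assumes the table
# has at least top_n rows):
with h5py.File("expect.human_utrs.h5", "r") as fr:
    assert fr["/inputs"].shape[0] == top_n
    assert fr["/preds"].shape == (top_n, 1)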
def test_encodeDNA():
    seq = "ACGTTTATNT"
    assert len(seq) == 10

    # a bare string (rather than a list of strings) is rejected
    with pytest.raises(ValueError):
        encodeDNA(seq)

    assert encodeDNA([seq]).shape == (1, 10, 4)
    assert encodeDNA([seq], maxlen=20).shape == (1, 20, 4)
    assert encodeDNA([seq], maxlen=5).shape == (1, 5, 4)

    assert np.all(encodeDNA([seq])[0, 0] == np.array([1, 0, 0, 0]))   # A
    assert np.all(encodeDNA([seq])[0, 1] == np.array([0, 1, 0, 0]))   # C
    assert np.all(encodeDNA([seq])[0, 2] == np.array([0, 0, 1, 0]))   # G
    assert np.all(encodeDNA([seq])[0, 3] == np.array([0, 0, 0, 1]))   # T
    assert np.all(encodeDNA([seq])[0, 4] == np.array([0, 0, 0, 1]))   # T
    assert np.all(encodeDNA([seq])[0, -1] == np.array([0, 0, 0, 1]))  # T
    assert np.all(encodeDNA([seq])[0, -2] == np.array([0, 0, 0, 0]))  # N -> all zeros
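# The encoding exercised above (A/C/G/T one-hot, N all-zero) means that row
# sums separate real bases from Ns and padding:
import numpy as np
from concise.preprocessing import encodeDNA

enc = encodeDNA(["ACGTN"])[0]
assert np.array_equal(enc.sum(axis=-1), np.array([1, 1, 1, 1, 0]))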
    return pd.DataFrame({"pattern": p.name,
                         "strand": strands,
                         "center": positions,
                         "seq_idx": np.arange(len(seqs_one_hot))})


# 'TTTACAATTT'  # seq1
# 'TTTACAATT'   # seq2
# ' AACAAA '    # m1
# ' AAACAA '    # m1
# ' ACAAT '     # m2
seqs = ['TTTACAATTT', 'TTTACAATT']
seqs_one_hot = encodeDNA(seqs)
motif_seqs_1 = ['TTTGTT', 'AAACAA', 'TTGTTT', 'ACAATT', 'TATTGT']
motif_seqs_2 = ['AACAAA', 'AAACAA', 'TTGTTT', 'ACAATT', 'TATTGT']


def create_patterns(motif_seqs):
    patterns = [
        Pattern(seq=encodeDNA([s])[0],
                contrib=dict(a=encodeDNA([s])[0]),
                hyp_contrib=dict(a=encodeDNA([s])[0]),
                name=str(i))
        for i, s in enumerate(motif_seqs)
    ]
    aligned_patterns = [
        p.align(patterns[0], pad_value=np.array([0.25] * 4))
        for p in patterns
    ]
    return patterns, aligned_patterns
"""Create one-hot encoded sequences from the input""" import pandas as pd from basepair.exp.chipnexus.data import(pool_bottleneck, gen_padded_sequence, syn_padded_sequence, ) import numpy as np input_gen = '../tidied_GEN_RPMsExpression_plusSeqs' input_syn = '../tidied_SYN_RPMsExpression_plusSeqs' from concise.preprocessing import encodeDNA dfs_gen = pd.read_csv(input_gen) dfs_syn = pd.read_csv(input_syn) bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_gen.Sequence.str.upper()]) bpnet_seq_syn = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_syn.Sequence.str.upper()]) np.save("tidied_GEN_RPMsExpression_plusSeqs_one_hot",bpnet_seq_gen) np.save("tidied_SYN_RPMsExpression_plusSeqs_one_hot",bpnet_seq_syn)
def _encode_batch_seq(self, batch):
    return {k: encodeDNA(v.tolist()) for k, v in batch.items()}
def data(rbp_name, n_bases=10,
         pos_class_weight=1.0,
         tss_trunc=2000, polya_trunc=2000,
         pos_as_track=False, kernel_size=10,
         scale_raw=False,
         valid_chr=[1, 3],
         test_chr=[2, 4, 6, 8, 10]):
    """
    pos_class_weight: positive class weight
    """
    dt_train, dt_valid, dt_test = data_split(rbp_name, valid_chr, test_chr)

    # TODO - not working just with dt_train.seq ?!?!?
    seq_train = encodeDNA(dt_train.seq.tolist())
    seq_valid = encodeDNA(dt_valid.seq.tolist())
    seq_test = encodeDNA(dt_test.seq.tolist())

    tss_dist = {"train": dt_train.TSS_distance.values,
                "valid": dt_valid.TSS_distance.values,
                "test": dt_test.TSS_distance.values}
    polya_dist = {"train": dt_train.polya_distance.values,
                  "valid": dt_valid.polya_distance.values,
                  "test": dt_test.polya_distance.values}

    seq_length = seq_train.shape[1]
    pos_length = seq_length - kernel_size + 1

    def expand_positions(x, pos_length):
        """If pos_as_track, use it"""
        x = x.reshape((-1, 1))
        # 1. create a matrix with incrementing positions
        incr_array = np.arange(pos_length) - pos_length // 2
        # expand to have the same shape as x
        positions_offset = np.repeat(incr_array.reshape((1, -1)), x.shape[0], axis=0)
        return positions_offset + x

    if pos_as_track:
        tss_dist = {k: expand_positions(v, pos_length) for k, v in tss_dist.items()}
        polya_dist = {k: expand_positions(v, pos_length) for k, v in polya_dist.items()}
        shift = pos_length // 2 + 2
    else:
        tss_dist = {k: v[:, np.newaxis] for k, v in tss_dist.items()}
        polya_dist = {k: v[:, np.newaxis] for k, v in polya_dist.items()}
        shift = 1

    # transform polya_distance - change order
    tss_dist = {k: (v + shift) for k, v in tss_dist.items()}
    polya_dist = {k: -1 * (v - shift) for k, v in polya_dist.items()}

    tss_pos_ranges = get_pos_ranges(tss_dist)
    polya_pos_ranges = get_pos_ranges(polya_dist)

    def get_tss_nat_dist(x):
        return encodeSplines(x, n_bases=n_bases,
                             start=tss_pos_ranges["min"],
                             end=tss_trunc)

    def get_tss_log_dist(x):
        return encodeSplines(np.log10(x), n_bases=n_bases,
                             start=np.log10(tss_pos_ranges["min"]),
                             end=np.log10(tss_pos_ranges["max"]))

    def get_polya_nat_dist(x):
        return encodeSplines(x, n_bases=n_bases,
                             start=polya_pos_ranges["min"],
                             end=polya_trunc)

    def get_polya_log_dist(x):
        return encodeSplines(np.log10(x), n_bases=n_bases,
                             start=np.log10(polya_pos_ranges["min"]),
                             end=np.log10(polya_pos_ranges["max"]))

    # min-max scaler
    mms_tss = MinMaxScaler()
    mms_tss.fit(np.log10(tss_dist["train"]).reshape((-1, 1)))
    mms_polya = MinMaxScaler()
    mms_polya.fit(np.log10(polya_dist["train"]).reshape((-1, 1)))

    def get_raw_tss_log_dist(x):
        sh = x.shape
        if scale_raw:
            return mms_tss.transform(np.log10(x).reshape((-1, 1))).\
                reshape(sh)[:, :, np.newaxis]
        else:
            return np.log10(x)[:, :, np.newaxis]

    def get_raw_polya_log_dist(x):
        sh = x.shape
        if scale_raw:
            return mms_polya.transform(np.log10(x).reshape((-1, 1))).\
                reshape(sh)[:, :, np.newaxis]
        else:
            return np.log10(x)[:, :, np.newaxis]

    y_train = dt_train.binding_site.values.reshape((-1, 1)).astype("float")
    y_valid = dt_valid.binding_site.values.reshape((-1, 1)).astype("float")
    y_test = dt_test.binding_site.values.reshape((-1, 1)).astype("float")
    sample_weight = np.squeeze(np.where(y_train == 1, pos_class_weight, 1), -1)

    return ({"seq": seq_train,
             "dist_tss_nat": get_tss_nat_dist(tss_dist["train"]),
             "dist_tss_log": get_tss_log_dist(tss_dist["train"]),
             "dist_polya_nat": get_polya_nat_dist(polya_dist["train"]),
             "dist_polya_log": get_polya_log_dist(polya_dist["train"]),
             # "raw_dist_tss_nat": tss_dist["train"],  # not supported, not thresholding it
             "raw_dist_tss_log": get_raw_tss_log_dist(tss_dist["train"]),
             # "raw_dist_polya_nat": polya_dist["train"],
             "raw_dist_polya_log": get_raw_polya_log_dist(polya_dist["train"])},
            y_train, sample_weight,
            tss_pos_ranges, polya_pos_ranges,
            mms_tss, mms_polya), \
        ({"seq": seq_valid,
          "dist_tss_nat": get_tss_nat_dist(tss_dist["valid"]),
          "dist_tss_log": get_tss_log_dist(tss_dist["valid"]),
          "dist_polya_nat": get_polya_nat_dist(polya_dist["valid"]),
          "dist_polya_log": get_polya_log_dist(polya_dist["valid"]),
          # "raw_dist_tss_nat": tss_dist["valid"],
          "raw_dist_tss_log": get_raw_tss_log_dist(tss_dist["valid"]),
          # "raw_dist_polya_nat": polya_dist["valid"],
          "raw_dist_polya_log": get_raw_polya_log_dist(polya_dist["valid"])},
         y_valid), \
        ({"seq": seq_test,
          "dist_tss_nat": get_tss_nat_dist(tss_dist["test"]),
          "dist_tss_log": get_tss_log_dist(tss_dist["test"]),
          "dist_polya_nat": get_polya_nat_dist(polya_dist["test"]),
          "dist_polya_log": get_polya_log_dist(polya_dist["test"]),
          # "raw_dist_tss_nat": tss_dist["test"],
          "raw_dist_tss_log": get_raw_tss_log_dist(tss_dist["test"]),
          # "raw_dist_polya_nat": polya_dist["test"],
          "raw_dist_polya_log": get_raw_polya_log_dist(polya_dist["test"])},
         y_test)