def predict_sequences(self, sequences, const_intercept=False,
                      transform_scores=True, key_colname="",
                      sequence_colname="sequence", flank_colname="flank",
                      predict_flanks=False, flank_len=0, only_pred=False):
    """
    Predict every input sequence and collect the results in a dictionary.

    Deliberately not a generator: the complete dictionary is used elsewhere.

    Args:
        sequences: input understood by `bio.get_seqdict`. When it is a
            `pd.DataFrame`, the `<flank_colname>_left` and
            `<flank_colname>_right` columns are looked up as flanks.
        const_intercept: forwarded to `self.predict_sequence`.
        transform_scores: forwarded to `self.predict_sequence`.
        key_colname: key column passed to `bio.get_seqdict`.
        sequence_colname: sequence column passed to `bio.get_seqdict`.
        flank_colname: prefix of the left / right flank column names.
        predict_flanks: accepted for interface compatibility; not read by
            this implementation.
        flank_len: number of flank bases attached on each side of the
            sequence (data frame input only) and the amount by which the
            reported positions are shifted back.
        only_pred: return only the prediction list for each key; if False,
            wrap it in a `basepred.BasePrediction` together with the
            sequence that was predicted.

    Returns:
        dict mapping each key of the sequence dictionary to its prediction
        list or `BasePrediction`.
    """
    seqdict = bio.get_seqdict(sequences, sequence_col=sequence_colname,
                              keycol=key_colname)

    has_flank_columns = isinstance(sequences, pd.DataFrame)
    if has_flank_columns:
        flank_left = bio.get_seqdict(sequences, "%s_left" % flank_colname,
                                     keycol=key_colname,
                                     ignore_missing_col=True)
        flank_right = bio.get_seqdict(sequences, "%s_right" % flank_colname,
                                      keycol=key_colname,
                                      ignore_missing_col=True)

    predictions = {}
    for key in seqdict:
        core_seq = seqdict[key]
        # `s[-0:]` is the whole string, so a zero flank length must skip the
        # slicing entirely instead of prepending the full left flank.
        if has_flank_columns and flank_len > 0:
            sequence = (flank_left[key][-flank_len:] + core_seq +
                        flank_right[key][:flank_len])
        else:
            sequence = core_seq

        prediction = self.predict_sequence(sequence, const_intercept,
                                           transform_scores)

        # Shift the coordinates back by the flank length and keep only the
        # predictions whose core lies inside the unflanked sequence. A new
        # list is built because removing from a list while iterating over it
        # skips the element that follows each removal.
        # NOTE(review): the shift is applied even when the input is not a
        # data frame (no flanks attached) -- confirm that callers rely on it.
        kept = []
        for result in prediction:
            result['site_start'] = result['site_start'] - flank_len
            result['core_start'] = result['core_start'] - flank_len
            result['core_mid'] = result['core_mid'] - flank_len
            core_end = result['core_start'] + result['core_width']
            if result['core_start'] < 0 or core_end > len(core_seq) - 1:
                # the core falls in a flank: drop the prediction
                continue
            kept.append(result)
        prediction = kept

        if only_pred:
            predictions[key] = prediction
        else:
            predictions[key] = basepred.BasePrediction(sequence, prediction)
    return predictions
def predict_sequences(self, sequence_df, key_colname="",
                      sequence_colname="sequence", flank_colname="flank",
                      predict_flanks=False, flank_len=10):
    """
    Temporary helper that builds the predictions dictionary from a data frame.

    Args:
        sequence_df: input handed to `self.pred_input_todict`; when
            `predict_flanks` is True it is also passed to `bio.get_seqdict`
            to read the flank columns.
        key_colname: name of the key column.
        sequence_colname: name of the sequence column.
        flank_colname: prefix of the `<prefix>_left` / `<prefix>_right`
            flank columns.
        predict_flanks: when True, attach `flank_len` bases of each flank
            before predicting, then shift the coordinates back and drop the
            predictions whose core falls in a flank.
        flank_len: number of flank bases used on each side.

    Returns:
        dict mapping each key to a `basepred.BasePrediction`.

    Raises:
        ValueError: if `self.protein` has no core definition here.
    """
    seqdict = self.pred_input_todict(sequence_df,
                                     sequence_colname=sequence_colname,
                                     key_colname=key_colname)
    if predict_flanks:
        flank_left = bio.get_seqdict(sequence_df,
                                     "%s_left" % flank_colname,
                                     ignore_missing_colname=True,
                                     keycolname=key_colname)
        flank_right = bio.get_seqdict(sequence_df,
                                      "%s_right" % flank_colname,
                                      ignore_missing_colname=True,
                                      keycolname=key_colname)

    # (core, center position) handed to `self.predict_sequence` per protein.
    core_settings = {
        'ets1': ((11, 15), 12),
        'runx1': ((12, 17), 14),
    }
    if self.protein not in core_settings:
        # Previously this surfaced later as an UnboundLocalError on `core`.
        raise ValueError("unsupported protein: %s" % (self.protein,))
    core, center_pos = core_settings[self.protein]
    kmer_file = self.kmer_align_path

    predictions = {}
    # for each sequence we want to predict
    for key in seqdict:
        sequence = seqdict[key]
        # Slice with `flank_len` (not a fixed 10) so the attached flanks
        # match the coordinate shift below; `s[-0:]` would be the whole
        # flank, hence the explicit guard.
        if predict_flanks and flank_len > 0:
            sequence = (flank_left[key][-flank_len:] + seqdict[key] +
                        flank_right[key][:flank_len])

        prediction = self.predict_sequence(sequence, kmer_file, core,
                                           center_pos, self.threshold,
                                           self.protein)

        if predict_flanks:
            # Build a new list: removing from `prediction` while iterating
            # over it skips the element after each removal.
            kept = []
            for result in prediction:
                result['site_start'] = result['site_start'] - flank_len
                result['core_start'] = result['core_start'] - flank_len
                core_end = result['core_start'] + result['core_width']
                # if a prediction is in the flanks, drop it
                if (result['core_start'] < 0
                        or core_end > len(seqdict[key]) - 1):
                    continue
                kept.append(result)
            prediction = kept

        predictions[key] = basepred.BasePrediction(sequence, prediction)
    return predictions
def predict_sequences(self, sequence_df, key_colname="",
                      sequence_colname="sequence", flank_colname="flank",
                      predict_flanks=False, flank_len=10):
    """
    Temporary helper that builds the predictions dictionary from a data frame.

    Args:
        sequence_df: input passed to `bio.get_seqdict`.
        key_colname: name of the key column.
        sequence_colname: name of the sequence column.
        flank_colname: prefix of the `<prefix>_left` / `<prefix>_right`
            flank columns.
        predict_flanks: when True, attach `flank_len` bases of each flank
            before predicting, then shift the coordinates back and drop the
            predictions whose core falls in a flank.
        flank_len: number of flank bases used on each side.

    Returns:
        dict mapping each key to a `basepred.BasePrediction`.
    """
    seqdict = bio.get_seqdict(sequence_df, sequence_col=sequence_colname,
                              keycol=key_colname)
    if predict_flanks:
        # NOTE(review): these calls use `keycolname` /
        # `ignore_missing_colname` while the call above uses `keycol` --
        # confirm both spellings against the bio.get_seqdict signature.
        flank_left = bio.get_seqdict(sequence_df,
                                     "%s_left" % flank_colname,
                                     ignore_missing_colname=True,
                                     keycolname=key_colname)
        flank_right = bio.get_seqdict(sequence_df,
                                      "%s_right" % flank_colname,
                                      ignore_missing_colname=True,
                                      keycolname=key_colname)

    predictions = {}
    # for each sequence we want to predict
    for key in seqdict:
        sequence = seqdict[key]
        # Slice with `flank_len` (not a fixed 10) so the attached flanks
        # match the coordinate shift below; `s[-0:]` would be the whole
        # flank, hence the explicit guard.
        if predict_flanks and flank_len > 0:
            sequence = (flank_left[key][-flank_len:] + seqdict[key] +
                        flank_right[key][:flank_len])

        prediction = self.predict_sequence(sequence)

        if predict_flanks:
            # Build a new list: removing from `prediction` while iterating
            # over it skips the element after each removal.
            kept = []
            for result in prediction:
                result['site_start'] = result['site_start'] - flank_len
                result['core_start'] = result['core_start'] - flank_len
                core_end = result['core_start'] + result['core_width']
                # if a prediction is in the flanks, drop it
                if (result['core_start'] < 0
                        or core_end > len(seqdict[key]) - 1):
                    continue
                kept.append(result)
            prediction = kept

        predictions[key] = basepred.BasePrediction(sequence, prediction)
    return predictions
def predict_sequences(self, sequences, sequence_colname="sequence",
                      key_colname="", only_pred=False):
    """
    Run `self.predict_sequence` on every entry of the sequence dictionary.

    Args:
        sequences: input passed to `bio.get_seqdict`.
        sequence_colname: sequence column passed to `bio.get_seqdict`.
        key_colname: key column passed to `bio.get_seqdict`.
        only_pred: when True, store the raw prediction for each key;
            otherwise wrap it in a `basepred.BasePrediction` together with
            its sequence.

    Returns:
        dict mapping each key to its prediction or `BasePrediction`.
    """
    seq_by_key = bio.get_seqdict(sequences, sequence_col=sequence_colname,
                                 keycol=key_colname)
    results = {}
    for seq_key in seq_by_key:
        seq = seq_by_key[seq_key]
        pred = self.predict_sequence(seq)
        results[seq_key] = (
            pred if only_pred else basepred.BasePrediction(seq, pred)
        )
    return results
def pred_input_todict(self, sequence_input, sequence_colname="sequence",
                      key_colname="", predict_flanks=True):
    """
    Return the dictionary form of the input used for sequence predictions.

    Allowed `sequence_input` types: data frame, dictionary.

    Args:
        sequence_input: a `pd.DataFrame` (converted with
            `bio.get_seqdict`) or a dict (returned unchanged).
        sequence_colname: sequence column name, used for data frame input.
        key_colname: key column name, used for data frame input.
        predict_flanks: accepted but not read by this method.

    Raises:
        Exception: if the input is neither a data frame nor a dictionary.
    """
    # data frame: delegate the conversion to bio
    if isinstance(sequence_input, pd.DataFrame):
        return bio.get_seqdict(sequence_input,
                               sequence_colname=sequence_colname,
                               keycolname=key_colname)

    # dictionary: already in the expected form
    if isinstance(sequence_input, dict):
        return sequence_input

    # anything else is not supported
    raise Exception("input must be data frame or dictionary of sequences")
def predict_sequences(self, sequences, sequence_colname="sequence",
                      key_colname="", predict_flanks=False,
                      flank_colname="flank", flank_len=10, only_pred=False):
    """
    Get a dictionary of escore predictions for each sequence.

    Args:
        sequences: list / data frame / dictionary of sequences
            (see bio.get_seqdict)
        sequence_colname: when input is a data frame, this is the column
            name of the sequence (default: sequence)
        key_colname: when input is a data frame, this is the column with
            the key that denotes a distinct row (default: "")
        predict_flanks: default False, when True check the flank columns --
            input needs to be a data frame
        flank_colname: prefix of the flank columns; `<prefix>_left` and
            `<prefix>_right` are read
        flank_len: length of the flanking sequence taken on each side
        only_pred: by default the result is wrapped in a
            `basepred.BasePrediction` object for plotting

    Return:
        dictionary mapping each key to a `BasePrediction` of the predicted
        sequence if `only_pred` is False, else to the raw prediction.

    Raises:
        Exception: if a flank is shorter than `flank_len`.
    """
    seqdict = bio.get_seqdict(sequences, sequence_col=sequence_colname,
                              keycol=key_colname)

    # get the flanks if we are including flank predictions
    # (read from `sequences`; the name `sequence_df` used here before is not
    # defined in this method and raised NameError)
    if predict_flanks:
        flank_left = bio.get_seqdict(sequences, "%s_left" % flank_colname,
                                     keycol=key_colname,
                                     ignore_missing_colname=True)
        flank_right = bio.get_seqdict(sequences, "%s_right" % flank_colname,
                                      keycol=key_colname,
                                      ignore_missing_colname=True)

    # get prediction of each sequence
    predictions = {}
    for key, sequence in seqdict.items():
        # if we are including flanks in the prediction
        if predict_flanks:
            left = flank_left[key]
            right = flank_right[key]
            # make sure there are enough flanks to take
            if len(left) < flank_len or len(right) < flank_len:
                raise Exception(
                    "flank_len is greater than the length of flanks available"
                )
            # update the sequence to be predicted; `left[-0:]` would be the
            # whole left flank, so a zero length leaves the sequence as is
            if flank_len > 0:
                sequence = left[-flank_len:] + sequence + right[:flank_len]

        # get the prediction for this sequence
        prediction = self.predict_sequence(sequence)
        if only_pred:
            predictions[key] = prediction
        else:
            predictions[key] = basepred.BasePrediction(sequence, prediction)

    # return the dictionary of predictions for each sequence
    return predictions