def _input_checks(self):
    """
    Validate the attributes this object was configured with.

    Reads ``self.data_df``, ``self.model_df``, ``self.start``, ``self.end``,
    ``self.err`` and ``self.coarse_graining_level``.

    Raises
    ------
    ControlledError
        Raised directly when ``data_df`` or ``model_df`` is None. Every other
        condition is delegated to ``check`` (defined outside this block).
    """
    # --- dataset dataframe -------------------------------------------------
    if self.data_df is None:
        raise ControlledError(
            " The Predictive Info class requires pandas dataframe as input dataframe. Entered data_df was 'None'."
        )

    check(isinstance(self.data_df, pd.DataFrame),
          'type(data_df) = %s; must be a pandas dataframe ' % type(self.data_df))

    # the dataset passes only if quality control hands it back unchanged
    validated_data_df = qc.validate_dataset(self.data_df)
    check(
        pd.DataFrame.equals(self.data_df, validated_data_df),
        " Input dataframe fails quality control, please ensure input dataframe has the correct format of an mpathic dataframe "
    )

    # --- model dataframe ---------------------------------------------------
    if self.model_df is None:
        raise ControlledError(
            " The Predictive info class requires pandas dataframe as input model dataframe. Entered model_df was 'None'."
        )

    check(isinstance(self.model_df, pd.DataFrame),
          'type(model_df) = %s; must be a pandas dataframe ' % type(self.model_df))

    # same round-trip test, this time against the model validator
    validated_model_df = qc.validate_model(self.model_df)
    check(
        pd.DataFrame.equals(self.model_df, validated_model_df),
        " Model dataframe failed quality control, "
        "please ensure input model dataframe has the correct format of an mpathic dataframe "
    )

    # --- scalar parameters -------------------------------------------------
    # start: an int that is not negative
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))
    check(self.start >= 0,
          "start = %d must be a positive integer " % self.start)

    # end: optional, but an int when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))

    # err: boolean flag
    check(isinstance(self.err, bool),
          'type(err) = %s; must be of type bool ' % type(self.err))

    # coarse_graining_level: int
    check(isinstance(self.coarse_graining_level, int),
          'type(coarse_graining_level) = %s; must be of type int ' %
          type(self.coarse_graining_level))
def __init__(self, model_df):
    """
    Build the model object from a model dataframe.

    A copy of ``model_df`` is passed through ``qc.validate_model`` with
    ``fix=True``; the caller's dataframe is not the object stored on
    ``self.df``.

    Raises
    ------
    SortSeqError
        If ``qc.get_model_type`` reports a model type other than 'MAT'.
    """
    validated_df = qc.validate_model(model_df.copy(), fix=True)
    seqtype, modeltype = qc.get_model_type(validated_df)

    # this class only handles matrix models
    if modeltype != 'MAT':
        raise SortSeqError('Invalid modeltype: %s' % modeltype)

    char_to_index, index_to_char = utils.choose_dict(seqtype,
                                                     modeltype=modeltype)

    self.seqtype = seqtype
    self.seq_dict = char_to_index
    self.inv_dict = index_to_char
    self.df = validated_df
    self.length = validated_df.shape[0]

    # pull out the value columns and store them transposed
    value_columns = qc.get_cols_from_df(validated_df, 'vals')
    self.matrix = np.array(validated_df[value_columns]).T
def _input_check(self):
    """
    Private method that validates all parameters.

    Reads ``self.df``, ``self.mp``, ``self.noisetype``, ``self.npar``,
    ``self.nbins``, ``self.sequence_library``, ``self.start``, ``self.end``
    and ``self.chunksize``.

    Raises
    ------
    ControlledError
        Raised directly when ``df`` or ``mp`` is None. Other type/value
        conditions are delegated to ``check`` (defined outside this block).
    SortSeqError
        When the number of noise parameters does not match ``noisetype``.
    """
    # --- input dataset -----------------------------------------------------
    if self.df is None:
        raise ControlledError(
            " Simulate Sort Requires pandas dataframe as input dataframe. Entered df was 'None'."
        )

    check(isinstance(self.df, pd.DataFrame),
          'type(df) = %s; must be a pandas dataframe ' % type(self.df))

    # the dataset passes only if quality control returns it unchanged
    validated_df = qc.validate_dataset(self.df)
    check(
        pd.DataFrame.equals(self.df, validated_df),
        " Input dataframe failed quality control, "
        "please ensure input dataset has the correct format of an mpathic dataframe "
    )

    # --- model dataframe ---------------------------------------------------
    if self.mp is None:
        raise ControlledError(
            " Simulate Sort Requires pandas dataframe as model input. Entered model df was 'None'."
        )

    check(isinstance(self.mp, pd.DataFrame),
          'type(mp) = %s; must be a pandas dataframe ' % type(self.mp))

    # same round-trip test against the model validator
    validated_mp = qc.validate_model(self.mp)
    check(
        pd.DataFrame.equals(self.mp, validated_mp),
        " Model dataframe failed quality control, "
        "please ensure model has the correct format of an mpathic model dataframe "
    )

    # --- noise model -------------------------------------------------------
    check(isinstance(self.noisetype, str),
          'type(noisetype) = %s; must be a string ' % type(self.noisetype))

    valid_noisetype_values = ['LogNormal', 'Normal', 'None', 'Plasmid']
    check(self.noisetype in valid_noisetype_values,
          'noisetype = %s; must be in %s' %
          (self.noisetype, valid_noisetype_values))

    check(isinstance(self.npar, list),
          'type(npar) = %s; must be a list ' % type(self.npar))

    # the expected number of noise parameters depends on the noise type
    if self.noisetype == 'Normal' and len(self.npar) != 1:
        raise SortSeqError(
            'For a normal noise model, there must be one input parameter (width of normal distribution)'
        )
    if self.noisetype == 'LogNormal' and len(self.npar) != 2:
        raise SortSeqError(
            '''For a LogNormal noise model there must be 2 input parameters''')

    # --- remaining scalars -------------------------------------------------
    # nbins: int greater than 1
    check(isinstance(self.nbins, int),
          'type(nbins) = %s; must be of type int ' % type(self.nbins))
    check(self.nbins > 1,
          'number of bins must be greater than 1, entered bins = %d' %
          self.nbins)

    # sequence_library: boolean flag
    check(isinstance(self.sequence_library, bool),
          'type(sequence_library) = %s; must be of type bool ' %
          type(self.sequence_library))

    # start: int
    check(isinstance(self.start, int),
          'type(start) = %s; must be of type int ' % type(self.start))

    # end: optional, but an int when supplied
    if self.end is not None:
        check(isinstance(self.end, int),
              'type(end) = %s; must be of type int ' % type(self.end))

    # chunksize: optional, but an int when supplied
    if self.chunksize is not None:
        check(isinstance(self.chunksize, int),
              'type(chunksize) = %s; must be of type int ' %
              type(self.chunksize))
def __init__(self, model_df, contig_list, numsites=10, verbose=False):
    """
    Scan every contig with the model and keep the highest-valued sites.

    Parameters
    ----------
    model_df :
        Model dataframe; passed to ``qc.validate_model`` and
        ``qc.get_model_type``.
    contig_list :
        Iterable of ``(contig_str, contig_name, pos_offset)`` triples.
    numsites : int
        Maximum number of rows kept in the running site list.
    verbose : bool
        When true, write one '.' to stdout per scanned contig.

    Raises
    ------
    SortSeqError
        If a contig contains a character outside the alphabet for the
        model's sequence type, if the model type is neither 'MAT' nor
        'NBR', or if no contig is long enough to hold a full-length site.

    The result is stored on ``self.sitelist_df``.
    """
    self.sitelist_df = None

    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    search_string = r"[^%s]" % alphabet
    for contig_str, contig_name, pos_offset in contig_list:
        if re.search(search_string, contig_str):
            raise SortSeqError(
                'Invalid character for seqtype %s found in %s.' %
                (seqtype, contig_name))

    # Create model object to evaluate on seqs
    if modeltype == 'MAT':
        model_obj = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        model_obj = Models.NeighborModel(model_df)
    else:
        # Previously fell through and failed later with an unbound
        # model_obj; fail here with an explicit message instead.
        raise SortSeqError('Invalid modeltype: %s' % modeltype)

    # Running table of the best sites seen so far
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    site_columns = ['val', seq_col, 'left', 'right', 'ori', 'contig']
    sitelist_df = pd.DataFrame(columns=site_columns)

    for contig_str, contig_name, pos_offset in contig_list:
        # contigs shorter than the model contain no full-length site
        if len(contig_str) < L:
            continue

        this_df = pd.DataFrame(columns=site_columns)
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df['left'] = poss + pos_offset
        this_df['right'] = poss + pos_offset + L - 1
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
        this_df['ori'] = '+'
        this_df['contig'] = contig_name
        this_df['val'] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well; this_df is
        # reused, so 'left', 'right' and 'contig' carry over unchanged.
        if seqtype == 'dna':
            this_df[seq_col] = fast.seq2sitelist(contig_str, L,
                                                 rc=True)  # Cython
            this_df['ori'] = '-'
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df],
                                    ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by='val', ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True, inplace=True)

        # Crop list at numsites after each contig so the running table
        # never grows beyond numsites rows plus one contig's worth.
        if sitelist_df.shape[0] > numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            # Progress dot. The earlier form passed the result of flush()
            # as a second print argument and so printed ". None".
            sys.stdout.write('.')
            sys.stdout.flush()

    if verbose:
        print('')
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0] == 0:
        raise SortSeqError(
            'No full-length sites found within provided contigs.')

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    self.sitelist_df = sitelist_df
def __init__(self,
             df,
             lm='ER',
             modeltype='MAT',
             LS_means_std=None,
             db=None,
             iteration=30000,
             burnin=1000,
             thin=10,
             runnum=0,
             initialize='LS',
             start=0,
             end=None,
             foreground=1,
             background=0,
             alpha=0.0,
             pseudocounts=1,
             drop_library=False,
             verbose=False,
             tm=None):
    """
    Fit a model to a dataset and store it on ``self.output_df``.

    Parameters
    ----------
    df :
        Dataset dataframe. Must contain exactly one sequence column (as
        reported by ``qc.get_cols_from_df(df, 'seqs')``) and a 'ct' column.
    lm : str
        Learning method. 'ER' uses ``self.Markov`` (modeltype 'NBR') or
        ``self.Berg_von_Hippel``; 'PR' uses ``self.convex_opt``; 'LS' uses
        ``self.Compute_Least_Squares``; 'IM' uses
        ``self.MaximizeMI_memsaver``.
    modeltype : str
        'MAT' or 'NBR'.
    LS_means_std :
        Optional argument to ``io.load_meanstd`` (lm='LS' only).
    db, iteration, burnin, thin, runnum :
        Forwarded to ``self.MaximizeMI_memsaver`` (lm='IM' only).
    initialize : str
        Starting matrix for lm='IM': 'rand', 'LS' or 'PR'.
    start, end :
        Slice bounds applied to every sequence before fitting.
    foreground, background, pseudocounts :
        Forwarded to the lm='ER' routines.
    alpha :
        Forwarded to ``self.Compute_Least_Squares``.
    drop_library : bool
        lm='LS' only: drop the 'ct_0' column before fitting.
    verbose : bool
        Forwarded to the lm='IM' routines.
    tm :
        Forwarded to ``self.convex_opt``.

    Raises
    ------
    SortSeqError
        If the dataframe does not have exactly one sequence column, or if
        ``drop_library`` is set and 'ct_0' is missing or too few count
        columns remain.
    """
    # set attributes
    self.df = df
    self.lm = lm
    self.modeltype = modeltype
    self.LS_means_std = LS_means_std
    self.db = db
    self.iteration = iteration
    self.burnin = burnin
    self.thin = thin
    self.runnum = runnum
    self.initialize = initialize
    self.start = start
    self.end = end
    self.foreground = foreground
    self.background = background
    self.alpha = alpha
    self.pseudocounts = pseudocounts
    self.drop_library = drop_library
    self.verbose = verbose
    self.tm = tm

    # output df
    self.output_df = None

    # validate parameters
    self._input_checks()

    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    # Caveat carried over from the original notes: a DNA dataset checked
    # against a protein dictionary still passes, because A, C, G and T are
    # also valid amino-acid letters.

    # set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    # NOTE(review): these two names are not used below; the call is kept
    # because its side effects, if any, are not visible from this block.
    lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype, modeltype='MAT')

    # every entry of seq_dict except the one mapped to the last index
    last_index = len(seq_dict) - 1
    par_seq_dict = {
        char: idx
        for char, idx in seq_dict.items() if idx != last_index
    }

    # drop any rows with ct = 0
    df = df[df.loc[:, 'ct'] != 0]
    df.reset_index(drop=True, inplace=True)

    # If there are sequences of different lengths, then print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
        sys.stderr.write('Lengths of all sequences are not the same!')

    # select target sequence region
    df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    # make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    # create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True, drop=True)

    # Drop any sequences with incorrect length
    if not end:
        # No end supplied: take the first sequence as the reference length.
        # The sequences were already cropped by str.slice(start, end)
        # above, so `start` must not be subtracted a second time; doing so
        # filtered out every row whenever start > 0.
        seqL = len(df[seq_col_name][0])
    else:
        seqL = end - start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True, drop=True)

    # Do something different for each type of learning method (lm)
    if lm == 'ER':
        if modeltype == 'NBR':
            emat = self.Markov(df,
                               dicttype,
                               foreground=foreground,
                               background=background,
                               pseudocounts=pseudocounts)
        else:
            emat = self.Berg_von_Hippel(df,
                                        dicttype,
                                        foreground=foreground,
                                        background=background,
                                        pseudocounts=pseudocounts)

    if lm == 'PR':
        emat = self.convex_opt(df, seq_dict, inv_dict, col_headers, tm=tm,
                               dicttype=dicttype, modeltype=modeltype)

    if lm == 'LS':
        if LS_means_std:
            # User supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            # change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(self.add_label))
            std = means_std_df['std']
            std.index = labels
            # Change weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None

        # drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]
        df.reset_index(inplace=True, drop=True)

        # For sort-seq experiments, bin 0 is the library only and is not
        # the lowest-expression bin, although it would be treated as such.
        # When drop_library is set, remove that column from the analysis.
        if drop_library:
            # The earlier code called df.drop('ct_0') without an axis, which
            # targets row labels and therefore always failed; a bare except
            # then masked both that failure and the "not enough columns"
            # error with the same message.
            if 'ct_0' not in df.columns:
                raise SortSeqError(
                    '''drop_library option was passed, but no ct_0 column exists'''
                )
            df = df.drop('ct_0', axis=1)
            col_headers = utils.get_column_headers(df)
            if len(col_headers) < 2:
                raise SortSeqError(
                    '''After dropping library there are no longer enough columns to run the analysis'''
                )
            # NOTE(review): 'ct' was summed before ct_0 was removed and is
            # not recomputed here — confirm whether downstream code needs it.

        # parameterize sequences into vectors
        print('init learn model: \n')
        print(par_seq_dict)
        print('dict: ', dicttype)
        raveledmat, batch, sw = utils.genweightandmat(df,
                                                      par_seq_dict,
                                                      dicttype,
                                                      means=means,
                                                      modeltype=modeltype)
        # Use ridge regression to find matrix.
        emat = self.Compute_Least_Squares(raveledmat,
                                          batch,
                                          sw,
                                          alpha=alpha)

    if lm == 'IM':
        seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)
        # Pick the starting matrix for the MCMC routine.
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]),
                                        len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(
                    len(df[seq_col_name][0]) - 1, len(seq_dict))
        elif initialize == 'LS':
            # df is already cropped, hence start=0 / end=None
            emat_0_df = LearnModel(df.copy(),
                                   lm='LS',
                                   modeltype=modeltype,
                                   alpha=alpha,
                                   start=0,
                                   end=None,
                                   verbose=verbose).output_df
            emat_0 = np.transpose(np.array(emat_0_df[val_cols]))
            # pymc doesn't take sparse mat
        elif initialize == 'PR':
            emat_0_df = LearnModel(df.copy(),
                                   lm='PR',
                                   modeltype=modeltype,
                                   start=0,
                                   end=None).output_df
            emat_0 = np.transpose(np.array(emat_0_df[val_cols]))
        emat = self.MaximizeMI_memsaver(seq_mat,
                                        df.copy(),
                                        emat_0,
                                        wtrow,
                                        db=db,
                                        iteration=iteration,
                                        burnin=burnin,
                                        thin=thin,
                                        runnum=runnum,
                                        verbose=verbose)

    # The matrix has been inferred; now format it for output.
    if (lm == 'IM' or lm == 'memsaver'):
        if modeltype == 'NBR':
            try:
                emat_typical = gauge.fix_neighbor(np.transpose(emat))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat)
        elif modeltype == 'MAT':
            try:
                emat_typical = gauge.fix_matrix(np.transpose(emat))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat)

    elif lm == 'ER':
        # The emat for this method is transposed compared to the others and
        # is already a dataframe with columns [pos, val_...].
        emat_typical = emat[val_cols]
        # NOTE(review): the source indentation was lost; gauge fixing is
        # assumed to apply only to the non-NBR branch — confirm.
        if modeltype != 'NBR':
            try:
                emat_typical = gauge.fix_matrix(np.array(emat_typical))
            except Exception:
                # keep the un-fixed values selected above
                sys.stderr.write('Gauge Fixing Failed')

    elif (lm == 'MK'):
        # The model is a first order markov model and its gauge does not
        # need to be changed.
        # NOTE(review): nothing assigns emat_typical on this path, so the
        # DataFrame construction below would fail for lm='MK'.
        pass

    elif lm == 'PR':
        emat_typical = np.transpose(emat)

    else:  # must be Least squares
        emat_typical = utils.emat_typical_parameterization(
            emat, len(seq_dict))
        if modeltype == 'NBR':
            try:
                emat_typical = gauge.fix_neighbor(
                    np.transpose(emat_typical))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat_typical)
        elif modeltype == 'MAT':
            try:
                emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
            except Exception:
                sys.stderr.write('Gauge Fixing Failed')
                emat_typical = np.transpose(emat_typical)

    em = pd.DataFrame(emat_typical)
    em.columns = val_cols

    # add position column (NBR models get one fewer position)
    if modeltype == 'NBR':
        pos = pd.Series(range(start,
                              start - 1 + len(df[seq_col_name][0])),
                        name='pos')
    else:
        pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                        name='pos')
    output_df = pd.concat([pos, em], axis=1)

    # Validate model and store it
    output_df = qc.validate_model(output_df, fix=True)
    self.output_df = output_df