def test_profile_mut_bin_noerr(self): """ Test the ability of mpathic.profile_mut to compute mutation rates """ print '\nIn test_profile_mut_bin_noerr_...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False) print '(bin=%d)'%good_bin_num, # If bad or library, then profile_mut.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError,executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_mut.main should produce a valid df elif '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_bin_noerr_%s.txt'%description io.write(df,out_file) # Test writing io.load_profile_mut(out_file) # Test loading print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)'%bad_bin_num, try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise
def test_profile_mut_bin_noerr(self): """ Test the ability of mpathic.profile_mut to compute mutation rates """ print '\nIn test_profile_mut_bin_noerr_...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False) print '(bin=%d)' % good_bin_num, # If bad or library, then profile_mut.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError, executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_mut.main should produce a valid df elif '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_bin_noerr_%s.txt'%description io.write(df, out_file) # Test writing io.load_profile_mut(out_file) # Test loading print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)' % bad_bin_num, try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise
def test_profile_mut_total_err(self): """ Test the ability of mpathic.profile_mut to compute mutation rates based on total count values """ print '\nIn test_profile_mut_total_err...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_mut.main(io.load_dataset(file_name), err=True) # If good, then profile_mut.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_total_err_%s.txt'%description io.write(df, out_file) io.load_profile_mut(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_mut.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_mut_total_err(self): """ Test the ability of mpathic.profile_mut to compute mutation rates based on total count values """ print '\nIn test_profile_mut_total_err...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_mut.main(io.load_dataset(file_name),err=True) # If good, then profile_mut.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_total_err_%s.txt'%description io.write(df,out_file) io.load_profile_mut(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_mut.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_mut_seqslicing(self): """ Test the ability of mpathic.profile_mut to slice sequences properly, and to raise the correct errors """ print '\nIn test_profile_mut_seqslicing...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable_good1 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=2,end=10) executable_good2 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ end=10) executable_good3 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ end=2) executable_nopro =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=50,end=60) executable_bad1 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=-1) executable_bad2 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ end=100) executable_bad3 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=20,end=10) # If good, then sequences will be valid if '_good' in file_name: try: df = executable_good1() io.write(df,self.output_dir+\ 'profile_mut_splice2-10_%s.txt'%description) executable_good2() executable_good3() self.assertRaises(SortSeqError,executable_bad1) self.assertRaises(SortSeqError,executable_bad2) self.assertRaises(SortSeqError,executable_bad3) if '_pro' in file_name: self.assertRaises(SortSeqError,executable_nopro) else: df = executable_nopro() print 'ok.' except: print 'ok (ERROR).' raise # If bad, then profile_mut.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable_good1) self.assertRaises(SortSeqError,executable_good2) self.assertRaises(SortSeqError,executable_good3) self.assertRaises(SortSeqError,executable_nopro) self.assertRaises(SortSeqError,executable_bad1) self.assertRaises(SortSeqError,executable_bad2) self.assertRaises(SortSeqError,executable_bad3) print 'ok.' except: print 'not ok (ERROR).' 
raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def test_profile_mut_seqslicing(self): """ Test the ability of mpathic.profile_mut to slice sequences properly, and to raise the correct errors """ print '\nIn test_profile_mut_seqslicing...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable_good1 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=2,end=10) executable_good2 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ end=10) executable_good3 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ end=2) executable_nopro =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=50,end=60) executable_bad1 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=-1) executable_bad2 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ end=100) executable_bad3 =\ lambda: profile_mut.main(io.load_dataset(file_name),\ start=20,end=10) # If good, then sequences will be valid if '_good' in file_name: try: df = executable_good1() io.write(df,self.output_dir+\ 'profile_mut_splice2-10_%s.txt'%description) executable_good2() executable_good3() self.assertRaises(SortSeqError, executable_bad1) self.assertRaises(SortSeqError, executable_bad2) self.assertRaises(SortSeqError, executable_bad3) if '_pro' in file_name: self.assertRaises(SortSeqError, executable_nopro) else: df = executable_nopro() print 'ok.' except: print 'ok (ERROR).' raise # If bad, then profile_mut.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable_good1) self.assertRaises(SortSeqError, executable_good2) self.assertRaises(SortSeqError, executable_good3) self.assertRaises(SortSeqError, executable_nopro) self.assertRaises(SortSeqError, executable_bad1) self.assertRaises(SortSeqError, executable_bad2) self.assertRaises(SortSeqError, executable_bad3) print 'ok.' 
except: print 'not ok (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def _check_mut_rate(partialdf, buffer):
    """
    Report whether any mutation rate in the checked window is out of range.

    Arguments:
        partialdf: dataset passed unchanged to profile_mut.main.
        buffer (int): offset used to position the window; the window is the
            label slice [buffer + 20, 179 - buffer*2] of the 'mut' column.

    Returns:
        A truthy value if any rate in the window is above .14 or below .07,
        otherwise a falsy value.
    """
    profile = profile_mut.main(partialdf)

    # Check mutation rate of non-primer and non-wildtype bases
    first_label = buffer + 20
    last_label = 179 - buffer*2
    rates = profile.loc[first_label:last_label, 'mut']

    too_high = (rates > .14).any()
    if too_high:
        return too_high
    return (rates < .07).any()
def main(dataset_df, bin=None, start=0, end=None, bins_df=None,
         pseudocounts=1, return_profile=False):
    """
    Computes an activity profile relative to the wild-type sequence.

    For every bin, character counts at each position are computed with
    profile_ct.main and pseudocounts are added. The counts are weighted by
    the bin's activity and summed over bins; dividing by the summed counts
    gives the count-weighted mean activity of each character at each
    position. The value of the wild-type character at each position is then
    subtracted from that position's row.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): Bin number forwarded to profile_mut.main and
            profile_freq.main (used for the wild-type sequence, mutation
            rate and character frequencies). It does not restrict which
            bins enter the activity average.
        start (int): An integer specifying the sequence start position.
        end (int): An integer specifying the sequence end position.
        bins_df (pd.DataFrame): Optional. Must contain a 'bins' column
            (bin names ending in '_<integer>') and an 'activity' column.
            When None, the bins are taken from
            utils.get_column_headers(dataset_df) and each bin's activity is
            the number at the end of its name.
        pseudocounts (number): Value added to every count before averaging.
        return_profile (bool): Selects the output format, see Returns.

    Returns:
        output_df (pd.DataFrame):
            If return_profile is False: a 'pos' column followed by one
            column per character, renamed from 'ct_<char>' to
            'activity_<char>', holding activity minus wild-type activity.
            If return_profile is True: columns 'pos' and 'mut_activity',
            one value per position.

    Raises:
        SortSeqError: if dataset_df does not have exactly one sequence
            column, or if no bins are available.
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' % str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]
    seq_dict, _ = utils.choose_dict(dicttype)

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Each bin needs an activity. `bins_df is None` is used on purpose:
    # truth-testing a DataFrame (`not bins_df`) raises ValueError.
    if bins_df is None:
        bins = utils.get_column_headers(dataset_df)
        # No activity was specified, so assume activity equals bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    if len(bins) == 0:
        raise SortSeqError('No bins found; cannot compute activity profile.')

    # Running totals over bins: raw counts and activity-weighted counts
    output_ct_df = None
    output_activity_df = None
    ct_cols = None
    for b, bin_activity in zip(bins, activity):
        bin_num = int(b.split('_')[-1])

        # Compute counts for this bin over the requested window
        counts_df = profile_ct.main(dataset_df, bin=bin_num,
                                    start=start, end=end)
        ct_cols = utils.get_column_headers(counts_df)

        # Add pseudocounts
        bin_counts = counts_df[ct_cols] + pseudocounts

        if output_ct_df is None:
            output_ct_df = bin_counts
            output_activity_df = bin_counts * bin_activity
        else:
            output_ct_df = output_ct_df + bin_counts
            output_activity_df = output_activity_df + bin_counts * bin_activity

    # Normalize by the counts of each character at each position; this is
    # the activity profile
    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    # Use the same start/end window as the counts so that the wild-type
    # sequence has one entry per row of the activity profile.
    # NOTE(review): assumes profile_mut.main and profile_freq.main accept
    # start/end like profile_ct.main does -- confirm for profile_freq.
    mut_rate = profile_mut.main(dataset_df, bin=bin, start=start, end=end)

    # Pick out the activity of the wild-type character at each position
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)
    wt_mask = np.transpose(wtarr)
    wt_activity = (wt_mask * np.array(output_activity_df[ct_cols])).sum(axis=1)

    # Subtract the wild-type activity from every character at that position
    delta_activity = output_activity_df.subtract(pd.Series(wt_activity), axis=0)

    if return_profile:
        # Character frequencies are only needed for the per-position profile
        freq = profile_freq.main(dataset_df, bin=bin, start=start, end=end)
        freq_cols = [x for x in freq.columns if 'freq_' in x]

        # Mutation rate times the frequency-weighted activity change of the
        # non-wild-type characters (formula in SI text)
        profile_delta_activity = mut_rate['mut'] * np.sum(
            (1 - wt_mask) * np.array(freq[freq_cols]) *
            np.array(delta_activity), axis=1)

        # Format into dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = list(
            range(start, start + len(profile_delta_activity.index)))
        output_df['mut_activity'] = profile_delta_activity
        return output_df

    # Otherwise add a pos column and rename count columns to activity columns
    output_df = pd.DataFrame(delta_activity)
    output_df.insert(
        0, 'pos', list(range(start, start + len(delta_activity.index))))
    activity_col_dict = {x: 'activity_' + x.split('_')[-1]
                         for x in delta_activity.columns if 'ct_' in x}
    output_df = output_df.rename(columns=activity_col_dict)
    return output_df