def test_profile_mut_bin_noerr(self): """ Test the ability of mpathic.profile_mut to compute mutation rates """ print '\nIn test_profile_mut_bin_noerr_...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False) print '(bin=%d)'%good_bin_num, # If bad or library, then profile_mut.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError,executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_mut.main should produce a valid df elif '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_bin_noerr_%s.txt'%description io.write(df,out_file) # Test writing io.load_profile_mut(out_file) # Test loading print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)'%bad_bin_num, try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise
def test_profile_mut_bin_noerr(self): """ Test the ability of mpathic.profile_mut to compute mutation rates """ print '\nIn test_profile_mut_bin_noerr_...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') good_bin_num = 2 bad_bin_num = 5 for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False) print '(bin=%d)' % good_bin_num, # If bad or library, then profile_mut.main should raise SortSeqError if ('_bad' in file_name) or ('library' in file_name): try: self.assertRaises(SortSeqError, executable) print 'badtype,', except: print 'good (ERROR).' raise # If good, then profile_mut.main should produce a valid df elif '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_bin_noerr_%s.txt'%description io.write(df, out_file) # Test writing io.load_profile_mut(out_file) # Test loading print 'good,', except: print 'bad (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.') # Should always raise an error if bin num is too large executable = lambda:\ profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num) print '(bin=%d)' % bad_bin_num, try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise
def test_profile_mut_total_err(self): """ Test the ability of mpathic.profile_mut to compute mutation rates based on total count values """ print '\nIn test_profile_mut_total_err...' library_files = glob.glob(self.input_dir + 'library_*.txt') library_files += glob.glob(self.input_dir + 'dataset_*.txt') for file_name in library_files: print '\t%s =' % file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_mut.main(io.load_dataset(file_name), err=True) # If good, then profile_mut.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_total_err_%s.txt'%description io.write(df, out_file) io.load_profile_mut(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_mut.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError, executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')
def main(dataset_df, bin=None, start=0, end=None, err=False):
    """
    Compute the mutation rate (0.0 to 1.0) at each sequence position.

    The mutation rate at a position is 1.0 minus the largest count at
    that position divided by the sum of all counts at that position.
    When requested, a binomial uncertainty sqrt(mut * (1 - mut) / total)
    is reported alongside it.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset
            (checked with qc.validate_dataset).
        bin (int): A bin number specifying which counts to use
            (forwarded to profile_ct.main).
        start (int): An integer specifying the sequence start position
            (forwarded to profile_ct.main).
        end (int): An integer specifying the sequence end position
            (forwarded to profile_ct.main).
        err (bool): If true, also add a "mut_err" column.

    Returns:
        mut_df (pd.DataFrame): The result of
            qc.validate_profile_mut(..., fix=True) applied to a dataframe
            with columns "pos", "mut", optionally "mut_err", and a
            wild-type column whose name is looked up in
            qc.seqtype_to_wtcolname_dict.
    """
    # Reject invalid input up front
    qc.validate_dataset(dataset_df)

    # Per-position character counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Names of the count columns ("ct_<character>")
    ct_cols = [name for name in counts_df.columns
               if qc.is_col_type(name, "ct_")]

    # Start the result from the positions alone
    result = counts_df[["pos"]].copy()

    # Mutation rate: share of counts NOT carried by the top character
    ct_table = counts_df[ct_cols]
    top_ct = ct_table.max(axis=1)
    total_ct = ct_table.sum(axis=1)
    mut_rate = 1.0 - (top_ct / total_ct)
    result["mut"] = mut_rate

    # The uncertainty column is optional
    if err:
        result["mut_err"] = np.sqrt(mut_rate * (1.0 - mut_rate) / total_ct)

    # The characters after "ct_" spell out the alphabet, which in turn
    # determines the name of the wild-type column
    letters = [name.split("_")[1] for name in ct_cols]
    seqtype = qc.alphabet_to_seqtype_dict["".join(letters)]
    wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

    # Wild-type character = the one attaining the maximum count.
    # Columns are visited in order, so on ties the last one wins.
    result[wt_col] = "X"
    for name, letter in zip(ct_cols, letters):
        is_top = (counts_df[name] == top_ct).values
        result.loc[is_top, wt_col] = letter

    # Validate (and fix up) as a mutation-profile dataframe
    return qc.validate_profile_mut(result, fix=True)
def test_profile_mut_total_err(self): """ Test the ability of mpathic.profile_mut to compute mutation rates based on total count values """ print '\nIn test_profile_mut_total_err...' library_files = glob.glob(self.input_dir+'library_*.txt') library_files += glob.glob(self.input_dir+'dataset_*.txt') for file_name in library_files: print '\t%s ='%file_name, description = file_name.split('_')[-1].split('.')[0] executable = lambda: profile_mut.main(io.load_dataset(file_name),err=True) # If good, then profile_mut.main should produce a valid df if '_good' in file_name: try: df = executable() qc.validate_profile_mut(df) out_file = self.output_dir+\ 'profile_mut_total_err_%s.txt'%description io.write(df,out_file) io.load_profile_mut(out_file) print 'good.' except: print 'bad (ERROR).' raise # If bad, then profile_mut.main should raise SortSeqError elif '_bad' in file_name: try: self.assertRaises(SortSeqError,executable) print 'badtype.' except: print 'good (ERROR).' raise # There are no other options else: raise SortSeqError('Unrecognized class of file_name.')