Example #1
0
    def test_profile_mut_bin_noerr(self):
        """ Test the ability of mpathic.profile_mut to compute mutation rates
        """

        print '\nIn test_profile_mut_bin_noerr_...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        good_bin_num = 2
        bad_bin_num = 5
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda:\
                profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False)
            print '(bin=%d)'%good_bin_num,

            # If bad or library, then profile_mut.main should raise SortSeqError
            if ('_bad' in file_name) or ('library' in file_name):
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype,',
                except:
                    print 'good (ERROR).'
                    raise

            # If good, then profile_mut.main should produce a valid df
            elif '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_mut(df)
                    out_file = self.output_dir+\
                        'profile_mut_bin_noerr_%s.txt'%description
                    io.write(df,out_file)           # Test writing
                    io.load_profile_mut(out_file)   # Test loading
                    print 'good,',

                except:
                    print 'bad (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')

            # Should always raise an error if bin num is too large
            executable = lambda:\
                profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num)
            print '(bin=%d)'%bad_bin_num,
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise
Example #2
0
    def test_profile_mut_bin_noerr(self):
        """ Test the ability of mpathic.profile_mut to compute mutation rates
        """

        print '\nIn test_profile_mut_bin_noerr_...'
        library_files = glob.glob(self.input_dir + 'library_*.txt')
        library_files += glob.glob(self.input_dir + 'dataset_*.txt')
        good_bin_num = 2
        bad_bin_num = 5
        for file_name in library_files:
            print '\t%s =' % file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda:\
                profile_mut.main(io.load_dataset(file_name),bin=good_bin_num,err=False)
            print '(bin=%d)' % good_bin_num,

            # If bad or library, then profile_mut.main should raise SortSeqError
            if ('_bad' in file_name) or ('library' in file_name):
                try:
                    self.assertRaises(SortSeqError, executable)
                    print 'badtype,',
                except:
                    print 'good (ERROR).'
                    raise

            # If good, then profile_mut.main should produce a valid df
            elif '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_mut(df)
                    out_file = self.output_dir+\
                        'profile_mut_bin_noerr_%s.txt'%description
                    io.write(df, out_file)  # Test writing
                    io.load_profile_mut(out_file)  # Test loading
                    print 'good,',

                except:
                    print 'bad (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')

            # Should always raise an error if bin num is too large
            executable = lambda:\
                profile_mut.main(io.load_dataset(file_name),bin=bad_bin_num)
            print '(bin=%d)' % bad_bin_num,
            try:
                self.assertRaises(SortSeqError, executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise
Example #3
0
    def test_profile_mut_total_err(self):
        """ Test the ability of mpathic.profile_mut to compute mutation rates based on total count values
        """

        print '\nIn test_profile_mut_total_err...'
        library_files = glob.glob(self.input_dir + 'library_*.txt')
        library_files += glob.glob(self.input_dir + 'dataset_*.txt')
        for file_name in library_files:
            print '\t%s =' % file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda: profile_mut.main(io.load_dataset(file_name),
                                                  err=True)

            # If good, then profile_mut.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_mut(df)
                    out_file = self.output_dir+\
                        'profile_mut_total_err_%s.txt'%description
                    io.write(df, out_file)
                    io.load_profile_mut(out_file)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If bad, then profile_mut.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError, executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Example #4
0
    def test_profile_mut_total_err(self):
        """ Test the ability of mpathic.profile_mut to compute mutation rates based on total count values
        """

        print '\nIn test_profile_mut_total_err...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda: profile_mut.main(io.load_dataset(file_name),err=True)

            # If good, then profile_mut.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_mut(df)
                    out_file = self.output_dir+\
                        'profile_mut_total_err_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_mut(out_file)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If bad, then profile_mut.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Example #5
0
    def test_profile_mut_seqslicing(self):
        """ Test the ability of mpathic.profile_mut to slice sequences properly, and to raise the correct errors
        """

        print '\nIn test_profile_mut_seqslicing...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable_good1 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=2,end=10)
            executable_good2 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    end=10)
            executable_good3 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    end=2)
            executable_nopro =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=50,end=60)
            executable_bad1 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=-1)
            executable_bad2 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    end=100)
            executable_bad3 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=20,end=10)

            # If good, then sequences will be valid
            if '_good' in file_name:
                try:
                    df = executable_good1()
                    io.write(df,self.output_dir+\
                        'profile_mut_splice2-10_%s.txt'%description)
                    executable_good2()
                    executable_good3()
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    if '_pro' in file_name:
                        self.assertRaises(SortSeqError,executable_nopro)
                    else:
                        df = executable_nopro()
                    print 'ok.'
                except:
                    print 'ok (ERROR).'
                    raise

            # If bad, then profile_mut.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable_good1)
                    self.assertRaises(SortSeqError,executable_good2)
                    self.assertRaises(SortSeqError,executable_good3)
                    self.assertRaises(SortSeqError,executable_nopro)
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    print 'ok.'
                except:
                    print 'not ok (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
Example #6
0
    def test_profile_mut_seqslicing(self):
        """ Test the ability of mpathic.profile_mut to slice sequences properly, and to raise the correct errors
        """

        print '\nIn test_profile_mut_seqslicing...'
        library_files = glob.glob(self.input_dir + 'library_*.txt')
        library_files += glob.glob(self.input_dir + 'dataset_*.txt')
        for file_name in library_files:
            print '\t%s =' % file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable_good1 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=2,end=10)
            executable_good2 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    end=10)
            executable_good3 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    end=2)
            executable_nopro =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=50,end=60)
            executable_bad1 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=-1)
            executable_bad2 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    end=100)
            executable_bad3 =\
                lambda: profile_mut.main(io.load_dataset(file_name),\
                    start=20,end=10)

            # If good, then sequences will be valid
            if '_good' in file_name:
                try:
                    df = executable_good1()
                    io.write(df,self.output_dir+\
                        'profile_mut_splice2-10_%s.txt'%description)
                    executable_good2()
                    executable_good3()
                    self.assertRaises(SortSeqError, executable_bad1)
                    self.assertRaises(SortSeqError, executable_bad2)
                    self.assertRaises(SortSeqError, executable_bad3)
                    if '_pro' in file_name:
                        self.assertRaises(SortSeqError, executable_nopro)
                    else:
                        df = executable_nopro()
                    print 'ok.'
                except:
                    print 'ok (ERROR).'
                    raise

            # If bad, then profile_mut.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError, executable_good1)
                    self.assertRaises(SortSeqError, executable_good2)
                    self.assertRaises(SortSeqError, executable_good3)
                    self.assertRaises(SortSeqError, executable_nopro)
                    self.assertRaises(SortSeqError, executable_bad1)
                    self.assertRaises(SortSeqError, executable_bad2)
                    self.assertRaises(SortSeqError, executable_bad3)
                    print 'ok.'
                except:
                    print 'not ok (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
def _check_mut_rate(partialdf, buffer):
    """Report whether any mutation rate in the checked window is out of range.

    Computes a mutation profile for partialdf with profile_mut.main and
    inspects its 'mut' column over the rows labelled buffer + 20 through
    179 - buffer * 2 (intended to cover the non-primer, non-wildtype bases).

    Arguments:
        partialdf: Dataset passed unchanged to profile_mut.main.
        buffer (int): Offset used to compute both ends of the window.

    Returns:
        A truthy value if any rate in the window is above 0.14 or below
        0.07, otherwise a falsy value.
    """
    profile = profile_mut.main(partialdf)

    # Window of positions whose mutation rate is checked
    first_pos = buffer + 20
    last_pos = 179 - buffer * 2
    rates = profile.loc[first_pos:last_pos, 'mut']

    return (rates > .14).any() or (rates < .07).any()
Example #8
0
def main(dataset_df,
         bin=None,
         start=0,
         end=None,
         bins_df=None,
         pseudocounts=1,
         return_profile=False):
    """
    Computes the activity of each character at each position, relative to
    the wild-type character at that position.

    For every bin, per-position character counts are obtained from
    profile_ct.main and pseudocounts are added. The counts are summed over
    bins twice: once unweighted and once weighted by each bin's activity.
    The ratio of the two sums is the activity profile, from which the
    wild-type value at each position is subtracted.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): Bin number forwarded to profile_mut.main and
            profile_freq.main when computing the mutation rate and
            character frequencies.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        bins_df (pd.DataFrame): Optional dataframe with a 'bins' column
            (count column names ending in '_<bin number>') and an 'activity'
            column. If None, every count column of dataset_df is used and
            each bin's activity is taken to be its bin number.
        pseudocounts (number): Value added to every count before summing.
        return_profile (bool): If True, return one summary value per
            position instead of one value per character per position.

    Returns:
        output_df (pd.DataFrame): If return_profile is False, a 'pos' column
        followed by the wild-type-subtracted activities, with 'ct_*' columns
        renamed to 'activity_*'. If return_profile is True, a dataframe with
        columns 'pos' and 'mut_activity'.

    Raises:
        SortSeqError: If dataset_df does not have exactly one sequence
        column.
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = utils.choose_dict(dicttype)
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    #for each bin we need to find character frequency profile, then sum over all
    #bins to get activity.

    #first make sure we have activities of each bin.
    #Compare against None explicitly: the truth value of a DataFrame is
    #ambiguous, so `not bins_df` raises ValueError when bins_df is supplied.
    if bins_df is None:
        bins = utils.get_column_headers(dataset_df)
        #in this case no activity was specified so just assume the activity
        #equals bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    #initialize dataframe for total counts in all bins
    output_ct_df = pd.DataFrame()
    #initialize dataframe for running activity calculation
    output_activity_df = pd.DataFrame()

    for i, b in enumerate(bins):
        #the bin number is the text after the last underscore in the name
        bin_num = int(b.split('_')[-1])
        # Compute counts
        counts_df = profile_ct.main(dataset_df,
                                    bin=bin_num,
                                    start=start,
                                    end=end)

        # Create columns for profile_freqs table
        ct_cols = utils.get_column_headers(counts_df)
        #add_pseudocounts
        counts_df[ct_cols] = counts_df[ct_cols] + pseudocounts

        #add to all previous bin counts
        if i == 0:
            output_ct_df = counts_df[ct_cols]
            output_activity_df = counts_df[ct_cols] * activity[i]
        else:
            output_ct_df = output_ct_df + counts_df[ct_cols]
            output_activity_df = output_activity_df + counts_df[
                ct_cols] * activity[i]

    #now normalize by each character at each position, this is the activity
    #profile

    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    #NOTE(review): start and end are passed to profile_ct.main above but not
    #to the two calls below, so the shapes presumably only agree for the
    #default start=0, end=None -- confirm before relying on slicing here.
    mut_rate = profile_mut.main(dataset_df, bin=bin)
    freq = profile_freq.main(dataset_df, bin=bin)
    freq_cols = [x for x in freq.columns if 'freq_' in x]
    #now normalize by the wt activity
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)

    #presumably wtarr marks the wt character at each position, so this keeps
    #only the wt entry of each row -- TODO confirm against utils.seq2mat
    wt_activity = np.transpose(wtarr) * (np.array(output_activity_df[ct_cols]))

    #sum this to get total
    wt_activity2 = wt_activity.sum(axis=1)
    delta_activity = output_activity_df.subtract(pd.Series(wt_activity2),
                                                 axis=0)
    if return_profile:
        #first find mutation rate according to formula in SI text
        profile_delta_activity = mut_rate['mut']*np.sum(
            (1-np.transpose(wtarr))*np.array(\
            freq[freq_cols])*np.array(delta_activity),axis=1)
        #format into dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = range(start,
                                 start + len(profile_delta_activity.index))
        output_df['mut_activity'] = profile_delta_activity
        return output_df
    else:
        #just add pos column and rename counts columns to activity columns
        output_df = pd.DataFrame(delta_activity)
        output_df.insert(0, 'pos',
                         range(start, start + len(delta_activity.index)))
        #reorder columns

        activity_col_dict = {x:'activity_' + x.split('_')[-1] \
            for x in delta_activity.columns if 'ct_' in x}
        output_df = output_df.rename(columns=activity_col_dict)
        return output_df