Example #1
    def test_profile_ct_bincounts(self):
        """ Test the ability of mpathic.profile_ct to count frequencies
        """

        print '\nIn test_profile_ct_bincounts...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        good_bin_num = 2
        bad_bin_num = 5
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda:\
                profile_ct.main(io.load_dataset(file_name),bin=good_bin_num)
            print '(bin=%d)'%good_bin_num,

            # If bad or library, then profile_ct.main should raise SortSeqError
            if ('_bad' in file_name) or ('library' in file_name):
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype,',
                except:
                    print 'good (ERROR).'
                    raise

            # If good, then profile_ct.main should produce a valid df
            elif ('_good' in file_name) or ('dataset' in file_name):
                try:
                    df = executable()
                    qc.validate_profile_ct(df)
                    out_file = self.output_dir+\
                        'profile_ct_bin_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_ct(out_file)
                    print 'good,',

                except:
                    print 'bad (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')

            # Should always raise an error if bin num is too large
            executable = lambda:\
                profile_ct.main(io.load_dataset(file_name),bin=bad_bin_num)
            print '(bin=%d)'%bad_bin_num,
            try:
                self.assertRaises(SortSeqError,executable)
                print 'badtype.'
            except:
                print 'good (ERROR).'
                raise
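
For reference, a minimal sketch of the behavior this test exercises, using a hypothetical file name and assuming the same io, profile_ct, and SortSeqError imports used by the test module:

# Hypothetical usage sketch; 'dataset_good.txt' is a placeholder file name.
dataset_df = io.load_dataset('dataset_good.txt')
counts_df = profile_ct.main(dataset_df, bin=2)    # valid bin -> counts dataframe
try:
    profile_ct.main(dataset_df, bin=5)            # bin not in dataset -> SortSeqError
except SortSeqError:
    pass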
Example #2
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character frequencies (0.0 to 1.0) at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe containing the frequency of each nucleotide/amino acid character at each position.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Compute counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Create columns for profile_freqs table
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c,'ct_')]
    freq_cols = ['freq_'+c.split('_')[1] for c in ct_cols]

    # Compute frequencies from counts
    freq_df = counts_df[ct_cols].div(counts_df['ct'], axis=0)
    freq_df.columns = freq_cols
    freq_df['pos'] = counts_df['pos']

    # Validate as frequencies dataframe
    freq_df = qc.validate_profile_freq(freq_df,fix=True)
    return freq_df
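
A minimal usage sketch, assuming a dataset dataframe with a total-count column 'ct', per-bin columns such as 'ct_0'/'ct_1', and a 'seq' column (the layout implied by the qc.validate_dataset and counts_df['ct'] calls above), and assuming the function is reachable as profile_freq.main as in the test examples:

import pandas as pd

# Hypothetical toy dataset; column names follow the layout assumed above.
dataset_df = pd.DataFrame({
    'ct':   [2, 2, 2],
    'ct_0': [1, 2, 0],
    'ct_1': [1, 0, 2],
    'seq':  ['ACGT', 'ACGA', 'TCGT'],
})
freq_df = profile_freq.main(dataset_df, bin=1)
print(freq_df)    # one row per position, with columns like freq_A, freq_C, ...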
Example #3
def main(dataset_df, bin=None, start=0, end=None, err=False):
    """
    Computes the mutation rate (0.0 to 1.0) at each position. The mutation rate is defined as 1.0 minus the maximum character frequency at that position. Errors are estimated using binomial uncertainty.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        err (bool): Whether to include estimated errors in the output

    Returns:
        mut_df (pd.DataFrame): A dataframe containing the mutation rate (and optional error) at each position.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Compute counts
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Get count columns from the counts table
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c, "ct_")]

    # Record positions in new dataframe
    mut_df = counts_df[["pos"]].copy()

    # Compute mutation rate across counts
    max_ct = counts_df[ct_cols].max(axis=1)
    sum_ct = counts_df[ct_cols].sum(axis=1)
    mut = 1.0 - (max_ct / sum_ct)
    mut_df["mut"] = mut

    # Computation of error rate is optional
    if err:
        mut_err = np.sqrt(mut * (1.0 - mut) / sum_ct)
        mut_df["mut_err"] = mut_err

    # Figure out which alphabet the cts dataframe specifies
    alphabet = "".join([c.split("_")[1] for c in ct_cols])
    seqtype = qc.alphabet_to_seqtype_dict[alphabet]
    wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

    # Compute WT base at each position
    mut_df[wt_col] = "X"
    for col in ct_cols:
        indices = (counts_df[col] == max_ct).values
        mut_df.loc[indices, wt_col] = col.split("_")[1]

    # Validate as mutation profile dataframe
    mut_df = qc.validate_profile_mut(mut_df, fix=True)
    return mut_df
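
As a concrete illustration of the formulas used above (mut = 1 - max/sum and the binomial error estimate), here is a small standalone numpy sketch for a single position; the counts are made up:

import numpy as np

# Counts for one position (e.g. ct_A, ct_C, ct_G, ct_T); made-up numbers.
counts = np.array([90.0, 4.0, 3.0, 3.0])
mut = 1.0 - counts.max() / counts.sum()               # 1 - 90/100 = 0.10
mut_err = np.sqrt(mut * (1.0 - mut) / counts.sum())   # binomial uncertainty
print(mut, mut_err)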
Example #4
    def test_profile_ct_totalcounts(self):
        """ Test the ability of mpathic.profile_ct to count frequencies based on total count values
        """

        print '\nIn test_profile_ct_totalcounts...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable = lambda: profile_ct.main(io.load_dataset(file_name))

            # If good, then profile_ct.main should produce a valid df
            if '_good' in file_name:
                try:
                    df = executable()
                    qc.validate_profile_ct(df)
                    out_file = self.output_dir+\
                        'profile_ct_total_%s.txt'%description
                    io.write(df,out_file)
                    io.load_profile_ct(out_file)
                    print 'good.'
                except:
                    print 'bad (ERROR).'
                    raise

            # If bad, then profile_ct.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable)
                    print 'badtype.'
                except:
                    print 'good (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
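
The corresponding call pattern, sketched with a hypothetical file name: omitting the bin argument makes profile_ct.main count over the total 'ct' column rather than a single bin.

# Hypothetical usage; 'dataset_good.txt' is a placeholder file name.
df = profile_ct.main(io.load_dataset('dataset_good.txt'))   # no bin -> total counts
qc.validate_profile_ct(df)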
Example #5
def main(dataset_df, err=False, method="naive", pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits) between character identity and bin number at each position.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        method (str): Which method to use to estimate mutual information
        pseudocount (float): A pseudocount used when estimating mutual information
        err (bool): Whether to include estimated errors in the output

    Returns:
        info_df (pd.DataFrame): A dataframe containing the mutual information (and optional error) at each position.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get number of bins
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "ct_")]
    if len(bin_cols) < 2:
        raise SortSeqError("Information profile requires at least 2 bins.")
    bins = [int(c.split("_")[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get number of characters
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "seqs")]
    if len(seq_cols) != 1:
        raise SortSeqError("Must be only one seq column.")
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ["ct_" + a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers
    num_pos = len(dataset_df[seq_col][0])
    if not (0 <= start < num_pos):
        raise SortSeqError("Invalid start==%d, num_pos==%d" % (start, num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError("Invalid end==%d, num_pos==%d" % (end, num_pos))
    elif end <= start:
        raise SortSeqError("Invalid: start==%d >= end==%d" % (start, end))

    # Record positions in new dataframe
    counts_df = profile_ct.main(dataset_df)
    info_df = counts_df.loc[start : (end - 1), ["pos"]].copy()  # rows from start:end
    info_df["info"] = 0.0
    if err:
        info_df["info_err"] = 0.0

    # Fill in 3D array of counts
    ct_3d_array = np.zeros([end - start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:, :, i] = counts_df.loc[start : (end - 1), ct_cols].astype(float)

    # Compute mutual information for each position
    for i in range(end - start):  # i only from start:end

        # Get 2D counts
        nxy = ct_3d_array[i, :, :]
        assert len(nxy.shape) == 2

        # Compute mutual information
        if err:
            mi, mi_err = info.estimate_mutualinfo(nxy, err=True, method=method, pseudocount=pseudocount)
            info_df.loc[i + start, "info"] = mi
            info_df.loc[i + start, "info_err"] = mi_err
        else:
            mi = info.estimate_mutualinfo(nxy, err=False, method=method, pseudocount=pseudocount)
            info_df.loc[i + start, "info"] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df, fix=True)
    return info_df
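
For intuition, here is a standalone sketch of the naive plug-in estimate of mutual information between character and bin for a single position's count table; this is illustrative only and is not necessarily what info.estimate_mutualinfo implements:

import numpy as np

# nxy: rows are characters, columns are bins (made-up counts for one position).
nxy = np.array([[40.0, 10.0],
                [10.0, 40.0]])
pxy = nxy / nxy.sum()                      # joint distribution
px = pxy.sum(axis=1, keepdims=True)        # character marginal
py = pxy.sum(axis=0, keepdims=True)        # bin marginal
with np.errstate(divide='ignore', invalid='ignore'):
    terms = np.where(pxy > 0, pxy * np.log2(pxy / (px * py)), 0.0)
mi = terms.sum()                           # mutual information in bits
print(mi)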
Example #6
    def test_profile_ct_seqslicing(self):
        """ Test the ability of mpathic.profile_ct to slice sequences properly, and to raise the correct errors
        """

        print '\nIn test_profile_ct_seqslicing...'
        library_files = glob.glob(self.input_dir+'library_*.txt')
        library_files += glob.glob(self.input_dir+'dataset_*.txt')
        for file_name in library_files:
            print '\t%s ='%file_name,
            description = file_name.split('_')[-1].split('.')[0]
            executable_good1 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=2,end=10)
            executable_good2 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=2)
            executable_good3 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    end=2)
            executable_nopro =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=50,end=60)
            executable_bad1 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=-1)
            executable_bad2 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    end=100)
            executable_bad3 =\
                lambda: profile_ct.main(io.load_dataset(file_name),\
                    start=20,end=10)

            # If good, then sequences will be valid
            if 'good' in file_name:
                try:
                    df = executable_good1()
                    io.write(df,self.output_dir+\
                        'profile_ct_splice2-10_%s.txt'%description)
                    executable_good2()
                    executable_good3()
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    if '_pro' in file_name:
                        self.assertRaises(SortSeqError,executable_nopro)
                    else:
                        df = executable_nopro()
                    print 'ok.'
                except:
                    print 'not ok (ERROR).'
                    raise

            # If bad, then profile_ct.main should raise SortSeqError
            elif '_bad' in file_name:
                try:
                    self.assertRaises(SortSeqError,executable_good1)
                    self.assertRaises(SortSeqError,executable_good2)
                    self.assertRaises(SortSeqError,executable_good3)
                    self.assertRaises(SortSeqError,executable_nopro)
                    self.assertRaises(SortSeqError,executable_bad1)
                    self.assertRaises(SortSeqError,executable_bad2)
                    self.assertRaises(SortSeqError,executable_bad3)
                    print 'ok.'
                except:
                    print 'not ok (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')
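
A brief sketch of the slicing behavior exercised above, with a hypothetical file name: start and end select a sub-range of positions, and out-of-range or inverted ranges raise SortSeqError.

# Hypothetical usage; 'dataset_good.txt' is a placeholder file name.
dataset_df = io.load_dataset('dataset_good.txt')
profile_ct.main(dataset_df, start=2, end=10)        # counts for positions in [2, 10)
try:
    profile_ct.main(dataset_df, start=20, end=10)   # start >= end -> SortSeqError
except SortSeqError:
    pass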
Example #7
def main(dataset_df,
         bin=None,
         start=0,
         end=None,
         bins_df=None,
         pseudocounts=1,
         return_profile=False):
    """
    Computes character frequencies (0.0 to 1.0) at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe containing counts for each 
        nucleotide/amino acid character at each position. 
    """
    seq_cols = qc.get_cols_from_df(dataset_df, 'seqs')
    if len(seq_cols) != 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict, inv_dict = utils.choose_dict(dicttype)
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # For each bin we need to find a character frequency profile, then sum
    # over all bins to get the activity.

    # First make sure we have activities for each bin:
    if bins_df is None:
        bins = utils.get_column_headers(dataset_df)
        # In this case no activity was specified, so just assume the activity
        # equals the bin number
        activity = [float(b.split('_')[-1]) for b in bins]
    else:
        bins = list(bins_df['bins'])
        activity = list(bins_df['activity'])

    # Initialize dataframe for total counts in all bins
    output_ct_df = pd.DataFrame()
    # Initialize dataframe for running activity calculation
    output_activity_df = pd.DataFrame()

    for i, b in enumerate(bins):
        bin_num = int(b.split('_')[-1])
        # Compute counts
        counts_df = profile_ct.main(dataset_df,
                                    bin=bin_num,
                                    start=start,
                                    end=end)

        # Get the count columns from the counts table
        ct_cols = utils.get_column_headers(counts_df)

        # Add pseudocounts
        counts_df[ct_cols] = counts_df[ct_cols] + pseudocounts

        # Add to the counts from all previous bins
        if i == 0:
            output_ct_df = counts_df[ct_cols]
            output_activity_df = counts_df[ct_cols] * activity[i]
        else:
            output_ct_df = output_ct_df + counts_df[ct_cols]
            output_activity_df = output_activity_df + counts_df[
                ct_cols] * activity[i]

    # Now normalize by the counts of each character at each position;
    # this is the activity profile

    output_activity_df = output_activity_df[ct_cols].div(output_ct_df[ct_cols])

    mut_rate = profile_mut.main(dataset_df, bin=bin)
    freq = profile_freq.main(dataset_df, bin=bin)
    freq_cols = [x for x in freq.columns if 'freq_' in x]
    # Now compute the wild-type activity so that it can be subtracted
    wtseq = ''.join(mut_rate['wt'])
    wtarr = utils.seq2mat(wtseq, seq_dict)

    wt_activity = np.transpose(wtarr) * (np.array(output_activity_df[ct_cols]))

    # Sum over characters to get the total wild-type activity
    wt_activity2 = wt_activity.sum(axis=1)
    delta_activity = output_activity_df.subtract(pd.Series(wt_activity2),
                                                 axis=0)
    if return_profile:
        # First find the mutation rate according to the formula in the SI text
        profile_delta_activity = mut_rate['mut']*np.sum(
            (1-np.transpose(wtarr))*np.array(\
            freq[freq_cols])*np.array(delta_activity),axis=1)
        # Format into a dataframe
        output_df = pd.DataFrame()
        output_df['pos'] = range(start,
                                 start + len(profile_delta_activity.index))
        output_df['mut_activity'] = profile_delta_activity
        return output_df
    else:
        # Just add the pos column and rename count columns to activity columns
        output_df = pd.DataFrame(delta_activity)
        output_df.insert(0, 'pos',
                         range(start, start + len(delta_activity.index)))
        #reorder columns

        activity_col_dict = {x:'activity_' + x.split('_')[-1] \
            for x in delta_activity.columns if 'ct_' in x}
        output_df = output_df.rename(columns=activity_col_dict)
        return output_df
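
A minimal usage sketch under stated assumptions: the dataset uses the 'ct'/'ct_0'/'ct_1'/'seq' layout assumed in the earlier examples, and bins_df carries the 'bins' and 'activity' columns this function reads; the module defining this main() is not named here, so it is called directly.

import pandas as pd

# Hypothetical toy dataset and bin activities; names and values are illustrative.
dataset_df = pd.DataFrame({
    'ct':   [2, 2, 2],
    'ct_0': [1, 2, 0],
    'ct_1': [1, 0, 2],
    'seq':  ['ACGT', 'ACGA', 'TCGT'],
})
bins_df = pd.DataFrame({'bins': ['ct_0', 'ct_1'], 'activity': [0.0, 1.0]})

activity_df = main(dataset_df, bins_df=bins_df, pseudocounts=1)
profile_df = main(dataset_df, bins_df=bins_df, return_profile=True)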