Example #1
0
    def test_profile_info(self):
        """ Test the ability of mpathic.profile_info to compute mutation rates based on total count values
        """

        print '\nIn test_profile_info...'
        file_names = glob.glob(self.input_dir + 'dataset_*.txt')
        for err in [True, False]:
            for file_name in file_names:
                print '\t%s, err=%s =' % (file_name, str(err)),
                description = file_name.split('_')[-1].split('.')[0]
                executable = lambda: \
                    profile_info.main(io.load_dataset(file_name),err=err)

                # If good, then profile_info.main should produce a valid df
                if '_good' in file_name:
                    try:
                        df = executable()
                        qc.validate_profile_info(df)
                        out_file = self.output_dir+\
                            'profile_info_%s_err_%s.txt'%(description,str(err))
                        io.write(df, out_file)
                        io.load_profile_info(out_file)
                        print 'good.'
                    except:
                        print 'bad (ERROR).'
                        raise

                # If bad, then profile_info.main should raise SortSeqError
                elif '_bad' in file_name:
                    try:
                        self.assertRaises(SortSeqError, executable)
                        print 'badtype.'
                    except:
                        print 'good (ERROR).'
                        raise

                # There are no other options
                else:
                    raise SortSeqError('Unrecognized class of file_name.')
Example #2
0
    def test_profile_info(self):
        """ Test the ability of mpathic.profile_info to compute mutation rates based on total count values
        """

        print '\nIn test_profile_info...'
        file_names = glob.glob(self.input_dir+'dataset_*.txt')
        for err in [True,False]:
            for file_name in file_names:
                print '\t%s, err=%s ='%(file_name,str(err)),
                description = file_name.split('_')[-1].split('.')[0]
                executable = lambda: \
                    profile_info.main(io.load_dataset(file_name),err=err)

                # If good, then profile_info.main should produce a valid df
                if '_good' in file_name:
                    try:
                        df = executable()
                        qc.validate_profile_info(df)
                        out_file = self.output_dir+\
                            'profile_info_%s_err_%s.txt'%(description,str(err))
                        io.write(df,out_file)
                        io.load_profile_info(out_file)
                        print 'good.'
                    except:
                        print 'bad (ERROR).'
                        raise

                # If bad, then profile_info.main should raise SortSeqError
                elif '_bad' in file_name:
                    try:
                        self.assertRaises(SortSeqError,executable)
                        print 'badtype.'
                    except:
                        print 'good (ERROR).'
                        raise

                # There are no other options
                else:
                    raise SortSeqError('Unrecognized class of file_name.')
Example #3
0
def main(dataset_df, err=False, method="naive", pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits), at each position, between the character and the bin number.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
            Must have at least two "ct_<bin>" columns and exactly one
            sequence column.
        err (bool): If True, the result also gets an "info_err" column holding
            the second value returned by info.estimate_mutualinfo(err=True).
        method (str): Which method to use to estimate mutual information;
            passed through to info.estimate_mutualinfo.
        pseudocount (float): Passed through to info.estimate_mutualinfo.
        start (int): An integer specifying the sequence start position
            (inclusive). Must satisfy 0 <= start < sequence length.
        end (int): An integer specifying the sequence end position
            (exclusive). None means the full sequence length.

    Returns:
        info_df (pd.DataFrame): A dataframe containing results: one row per
            position in [start, end) with columns "pos", "info" and, when
            err is True, "info_err", as returned by
            qc.validate_profile_info(..., fix=True).

    Raises:
        SortSeqError: If there are fewer than 2 bins, if there is not exactly
            one sequence column, or if start/end are out of range.
            qc.validate_dataset may also raise on an invalid dataset.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get bin numbers from the "ct_<bin>" columns
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "ct_")]
    if len(bin_cols) < 2:
        raise SortSeqError("Information profile requires at least 2 bins.")
    bins = [int(c.split("_")[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get the alphabet implied by the (single) sequence column
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, "seqs")]
    if len(seq_cols) != 1:
        raise SortSeqError("Must be only one seq column.")
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ["ct_" + a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers.
    # Use positional access: the first row need not carry index label 0.
    num_pos = len(dataset_df[seq_col].iloc[0])
    if not (0 <= start < num_pos):
        raise SortSeqError("Invalid start==%d, num_pos==%d" % (start, num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError("Invalid end==%d, num_pos==%d" % (end, num_pos))
    elif end <= start:
        raise SortSeqError("Invalid: start==%d >= end==%d" % (start, end))

    # Record positions in new dataframe.
    # .loc slicing is label-based and inclusive, hence (end - 1).
    counts_df = profile_ct.main(dataset_df)
    info_df = counts_df.loc[start : (end - 1), ["pos"]].copy()  # rows from start:end
    info_df["info"] = 0.0
    if err:
        info_df["info_err"] = 0.0

    # Fill in 3D array of counts, indexed [position, character, bin]
    ct_3d_array = np.zeros([end - start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):

        # Compute counts for this bin only
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:, :, i] = counts_df.loc[start : (end - 1), ct_cols].astype(float)

    # Compute mutual information for each position
    for i in range(end - start):  # i only from start:end

        # Get 2D (character x bin) counts for this position
        nxy = ct_3d_array[i, :, :]

        # Row label in info_df corresponding to array row i
        row = i + start

        # Compute mutual information
        if err:
            mi, mi_err = info.estimate_mutualinfo(nxy, err=True, method=method, pseudocount=pseudocount)
            info_df.loc[row, "info"] = mi
            info_df.loc[row, "info_err"] = mi_err
        else:
            mi = info.estimate_mutualinfo(nxy, err=False, method=method, pseudocount=pseudocount)
            info_df.loc[row, "info"] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df, fix=True)
    return info_df