Example #1
0
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character frequencies at each position by dividing each
    per-character count by the 'ct' column of the counts profile.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position

    Returns:
        freq_df (pd.DataFrame): A dataframe with a 'pos' column and one 'freq_[char]' column per 'ct_[char]' column of the counts profile, as returned by qc.validate_profile_freq.
    """

    # Refuse to proceed on an invalid dataset
    qc.validate_dataset(dataset_df)

    # Per-position character counts for the requested bin and range
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Pair every per-character count column with its frequency column name
    ct_cols = []
    freq_cols = []
    for col in counts_df.columns:
        if qc.is_col_type(col, 'ct_'):
            ct_cols.append(col)
            freq_cols.append('freq_' + col.split('_')[1])

    # Divide each row of character counts by that row's 'ct' value
    totals = counts_df['ct']
    freq_df = counts_df[ct_cols].div(totals, axis=0)
    freq_df.columns = freq_cols
    freq_df['pos'] = counts_df['pos']

    # Validate (and fix up) as a frequency profile dataframe
    return qc.validate_profile_freq(freq_df, fix=True)
Example #2
0
def main(dataset_df, bin=None, start=0, end=None, err=False):
    """
    Computes the mutation rate (0.0 to 1.0) at each position. Mutation rate is defined as 1.0 minus the maximum character frequency at a position. Errors are estimated using binomial uncertainty, sqrt(mut*(1-mut)/total counts).

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        err (bool): If True, also report the error estimate in a 'mut_err' column

    Returns:
        mut_df (pd.DataFrame): A dataframe with 'pos', 'mut', optionally 'mut_err', and a wild-type character column, as returned by qc.validate_profile_mut.
    """

    # Refuse to proceed on an invalid dataset
    qc.validate_dataset(dataset_df)

    # Per-position character counts for the requested bin and range
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Per-character count columns, and the character each one stands for
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c, 'ct_')]
    chars = [c.split('_')[1] for c in ct_cols]

    # Result table starts out holding just the positions
    mut_df = counts_df[['pos']].copy()

    # Mutation rate = 1 - (largest character count / total character count)
    char_counts = counts_df[ct_cols]
    max_ct = char_counts.max(axis=1)
    sum_ct = char_counts.sum(axis=1)
    mut = 1.0 - (max_ct / sum_ct)
    mut_df['mut'] = mut

    # The error estimate is only reported on request
    if err:
        mut_df['mut_err'] = np.sqrt(mut * (1.0 - mut) / sum_ct)

    # The concatenated characters identify the alphabet, hence the sequence
    # type and the name of the wild-type column
    alphabet = ''.join(chars)
    seqtype = qc.alphabet_to_seqtype_dict[alphabet]
    wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

    # Wild-type character = the one whose count equals the row maximum.
    # On ties the character from the last matching column wins, because later
    # assignments overwrite earlier ones. 'X' remains where nothing matches.
    mut_df[wt_col] = 'X'
    for col, char in zip(ct_cols, chars):
        is_max = (counts_df[col] == max_ct).values
        mut_df.loc[is_max, wt_col] = char

    # Validate (and fix up) as a mutation profile dataframe
    return qc.validate_profile_mut(mut_df, fix=True)
Example #3
0
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character counts at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use. If None, the 'ct' column is used; otherwise the 'ct_[bin]' column is used.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position (exclusive). If None, the full sequence length is used.

    Returns:
        counts_df (pd.DataFrame): A dataframe containing counts for each nucleotide/amino acid character at each position, as returned by qc.validate_profile_ct.

    Raises:
        SortSeqError: if the dataset does not have exactly one sequence column, has no rows, start/end are out of range, or the requested counts column is missing.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Retrieve type of sequence
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c,'seqs')]
    if len(seq_cols) != 1:
        raise SortSeqError('Dataset dataframe must have only one seq colum.')
    colname = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[colname]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    num_chars = len(alphabet)

    # Retrieve sequence length. A single row is enough to define it.
    if dataset_df.shape[0] < 1:
        raise SortSeqError('Dataset dataframe must have at least one row.')
    seqs = dataset_df[colname]
    total_seq_length = len(seqs.iloc[0])

    # Validate start and end
    if start < 0:
        raise SortSeqError('start=%d is negative.'%start)
    elif start >= total_seq_length:
        raise SortSeqError('start=%d >= total_seq_length=%d'%\
            (start,total_seq_length))

    if end is None:
        end = total_seq_length
    elif end <= start:
        raise SortSeqError('end=%d <= start=%d.'%(end,start))
    elif end > total_seq_length:
        # Report the offending end value (not start)
        raise SortSeqError('end=%d > total_seq_length=%d'%\
            (end,total_seq_length))

    # Set positions
    poss = pd.Series(range(start,end),name='pos')
    num_poss = len(poss)

    # Retrieve counts
    if bin is None:
        ct_col = 'ct'
    else:
        ct_col = 'ct_%d'%bin
    if ct_col not in dataset_df.columns:
        raise SortSeqError('Column "%s" is not in columns=%s'%\
            (ct_col,str(dataset_df.columns)))
    counts = dataset_df[ct_col]

    # Compute counts profile: for every position, sum the counts of all
    # sequences carrying each alphabet character at that position
    counts_array = np.zeros([num_poss,num_chars])
    counts_cols = ['ct_'+a for a in alphabet]
    for i,pos in enumerate(range(start,end)):
        char_list = seqs.str.slice(pos,pos+1)
        counts_array[i,:] = [np.sum(counts[char_list==a]) for a in alphabet]
    temp_df = pd.DataFrame(counts_array,columns=counts_cols)
    counts_df = pd.concat([poss,temp_df],axis=1)

    # Validate as counts dataframe
    counts_df = qc.validate_profile_ct(counts_df,fix=True)
    return counts_df
Example #4
0
def merge_datasets(dataset_df_dict):
    """
    Merges multiple datasets into one. Data from disparate files is merged via values in 'tag', 'seq', 'seq_rna', or 'seq_pro' columns (in order of preference, chosen according to availability in the first dataset). Each value in the 'ct' column of each dataset is recorded in the 'ct_[bin]' column of the final dataset. A total 'ct' column is then computed, and rows in the final dataset are sorted in descending order according to this.

    The input dataframes are not modified.

    Arguments:
        dataset_df_dict (dict): Keys are bin numbers, values are dataset dataframes

    Returns:
        out_df (pd.DataFrame): A validated dataset dataframe

    Raises:
        SortSeqError: if no datasets are given, if no usable index column is found, or if any dataset lacks the index column or a 'ct' column.
    """
    # Make sure datasets were loaded
    if len(dataset_df_dict) < 1:
        raise SortSeqError('No datasets were loaded')

    # Determine index column from the first dataset. Must be same for all
    # files. next(iter(...)) works on both Python 2 lists and Python 3 views.
    first_df = next(iter(dataset_df_dict.values()))
    index_col = None
    for candidate in ('tag', 'seq', 'seq_rna', 'seq_pro'):
        if candidate in first_df.columns:
            index_col = candidate
            break
    if index_col is None:
        raise SortSeqError('Dataframe does not contain a '
            '"tag", "seq", "seq_rna", or "seq_pro" column')

    # Concatenate dataset dataframes
    out_df = pd.DataFrame()
    for b, df in dataset_df_dict.items():

        # Verify that dataframe has correct column
        if index_col not in df.columns:
            raise SortSeqError(
                'Dataframe does not contain index_col="%s"'%index_col)
        if 'ct' not in df.columns:
            raise SortSeqError(
                'Dataframe does not contain a "ct" column')

        # Drop pre-existing "ct_X" columns. drop() returns a new dataframe,
        # so the caller's dataframe is left untouched.
        stale_cols = [c for c in df.columns if qc.is_col_type(c,'ct_')]
        df = df.drop(stale_cols, axis=1)

        # Add bin number to name of counts column.
        df = df.rename(columns={'ct':'ct_%d'%b})

        # Index dataset by index_col, summing rows that share a value
        df = df.groupby(index_col).sum()

        # Concatenate side by side, aligned on the index
        out_df = pd.concat([out_df,df],axis=1)

    # Move the index back into a column; name it index_col if the index
    # name was lost during concatenation
    out_df.reset_index(inplace=True)
    out_df.rename(columns = {'index':index_col},inplace=True)

    # Fill undefined counts with zero
    out_df.fillna(value=0,inplace=True)

    # Add 'ct' column, with proper counts
    out_df['ct'] = 0
    for col in out_df.columns:
        if qc.is_col_type(col,'ct_'):
            out_df['ct'] += out_df[col]

    # Sort by 'ct' column. DataFrame.sort was removed from pandas in favor
    # of sort_values; fall back to it only on very old pandas.
    if hasattr(out_df, 'sort_values'):
        out_df = out_df.sort_values('ct',ascending=False)
    else:
        out_df = out_df.sort('ct',ascending=False)
    out_df.reset_index(drop=True,inplace=True)

    # Validate out_df as dataset and return it
    out_df = qc.validate_dataset(out_df,fix=True)
    return out_df
Example #5
0
def main(dataset_df, err=False, method='naive',
    pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits), at each position, between the character and the bin number.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        err (bool): If True, also record an error estimate in an 'info_err' column
        method (str): Which method to use to estimate mutual information; passed through to info.estimate_mutualinfo
        pseudocount (float): Passed through to info.estimate_mutualinfo
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position (exclusive). If None, the full sequence length is used.

    Returns:
        info_df (pd.DataFrame): A dataframe containing results, as returned by qc.validate_profile_info.

    Raises:
        SortSeqError: if there are fewer than 2 bin columns, not exactly one sequence column, or start/end are out of range.
    """

    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get number of bins
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c,'ct_')]
    if len(bin_cols) < 2:
        raise SortSeqError('Information profile requires at least 2 bins.')
    bins = [int(c.split('_')[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get number of characters
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c,'seqs')]
    if len(seq_cols) != 1:
        raise SortSeqError('Must be only one seq column.')
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ['ct_'+a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers. Use positional
    # access so this works even when the index has no label 0.
    num_pos = len(dataset_df[seq_col].iloc[0])
    if not (0 <= start < num_pos):
        raise SortSeqError('Invalid start==%d, num_pos==%d'%(start,num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError('Invalid end==%d, num_pos==%d'%(end,num_pos))
    elif end <= start:
        raise SortSeqError('Invalid: start==%d >= end==%d'%(start,end))

    # Record positions in new dataframe. .loc slicing is label-based and
    # end-inclusive, hence end-1.
    counts_df = profile_ct.main(dataset_df)
    info_df = counts_df.loc[start:(end-1),['pos']].copy() # rows from start:end
    info_df['info'] = 0.0
    if err:
        info_df['info_err'] = 0.0

    # Fill in 3D array of counts: (position, character, bin)
    ct_3d_array = np.zeros([end-start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):

        # Compute counts
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:,:,i] = counts_df.loc[start:(end-1),ct_cols].astype(float)

    # Compute mutual information for each position
    for i in range(end-start): # i only from start:end

        # Get 2D counts (character x bin) at this position
        nxy = ct_3d_array[i,:,:]
        assert len(nxy.shape) == 2

        # Compute mutual information; info_df rows are labelled by
        # absolute position, so offset i by start
        if err:
            mi, mi_err = info.estimate_mutualinfo(nxy,err=True,
                method=method,pseudocount=pseudocount)
            info_df.loc[i+start,'info'] = mi
            info_df.loc[i+start,'info_err'] = mi_err
        else:
            mi = info.estimate_mutualinfo(nxy,err=False,
                method=method,pseudocount=pseudocount)
            info_df.loc[i+start,'info'] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df,fix=True)
    return info_df