def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character frequencies (0.0 to 1.0) at each position.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use; forwarded
            unchanged to profile_ct.main.
        start (int): An integer specifying the sequence start position;
            forwarded unchanged to profile_ct.main.
        end (int): An integer specifying the sequence end position;
            forwarded unchanged to profile_ct.main.

    Returns:
        freq_df (pd.DataFrame): A dataframe holding a 'pos' column plus one
            'freq_<char>' column per character counts column, after being
            passed through qc.validate_profile_freq(..., fix=True).
    """
    # Reject malformed input before doing any work
    qc.validate_dataset(dataset_df)

    # Per-position character counts for the requested bin and range
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Pair every per-character counts column with its frequency column name
    ct_cols = []
    freq_cols = []
    for name in counts_df.columns:
        if qc.is_col_type(name, 'ct_'):
            ct_cols.append(name)
            freq_cols.append('freq_' + name.split('_')[1])

    # Divide each row of character counts by that row's 'ct' value.
    # NOTE(review): 'ct' is presumably the per-position total supplied by
    # the counts-profile validation step -- confirm against qc.
    totals = counts_df['ct']
    freq_df = counts_df[ct_cols].div(totals, axis=0)
    freq_df.columns = freq_cols
    freq_df['pos'] = counts_df['pos']

    # Validate (and fix up) as a frequency profile dataframe
    return qc.validate_profile_freq(freq_df, fix=True)
def main(dataset_df, bin=None, start=0, end=None, err=False):
    """
    Computes the mutation rate (0.0 to 1.0) at each position.

    Mutation rate is defined as 1.0 minus the maximum character frequency
    at a position. Errors, when requested, are estimated using binomial
    uncertainty: sqrt(mut * (1 - mut) / total counts).

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
        err (bool): If true, a 'mut_err' column is added to the result.

    Returns:
        mut_df (pd.DataFrame): A dataframe with columns 'pos', 'mut',
            optionally 'mut_err', and a wild-type character column, after
            being passed through qc.validate_profile_mut(..., fix=True).
    """
    # Reject malformed input before doing any work
    qc.validate_dataset(dataset_df)

    # Per-position character counts for the requested bin and range
    counts_df = profile_ct.main(dataset_df, bin=bin, start=start, end=end)

    # Names of the per-character counts columns
    ct_cols = [c for c in counts_df.columns if qc.is_col_type(c, 'ct_')]

    # Most frequent character count, and total count, at each position
    char_counts = counts_df[ct_cols]
    max_ct = char_counts.max(axis=1)
    sum_ct = char_counts.sum(axis=1)
    mut = 1.0 - (max_ct / sum_ct)

    # Start the output from the positions column
    mut_df = counts_df[['pos']].copy()
    mut_df['mut'] = mut

    # The error estimate is only computed on request
    if err:
        mut_df['mut_err'] = np.sqrt(mut * (1.0 - mut) / sum_ct)

    # The characters named by the counts columns, joined in column order,
    # identify the alphabet and therefore the wild-type column name
    chars = [c.split('_')[1] for c in ct_cols]
    alphabet = ''.join(chars)
    seqtype = qc.alphabet_to_seqtype_dict[alphabet]
    wt_col = qc.seqtype_to_wtcolname_dict[seqtype]

    # Mark the wild-type character at each position. 'X' is the placeholder
    # for rows no column matches; when several characters tie for the
    # maximum, the last matching column wins because it overwrites earlier
    # assignments.
    mut_df[wt_col] = 'X'
    for col, char in zip(ct_cols, chars):
        is_max = (counts_df[col] == max_ct).values
        mut_df.loc[is_max, wt_col] = char

    # Validate (and fix up) as a mutation profile dataframe
    return qc.validate_profile_mut(mut_df, fix=True)
def main(dataset_df, bin=None, start=0, end=None):
    """
    Computes character counts at each position

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        bin (int): A bin number specifying which counts to use. If None,
            the 'ct' column is used; otherwise the 'ct_<bin>' column.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
            (exclusive). If None, the length of the first sequence is used.

    Returns:
        counts_df (pd.DataFrame): A dataframe containing counts for each
            nucleotide/amino acid character at each position.

    Raises:
        SortSeqError: If the dataset does not have exactly one sequence
            column, has no rows, lacks the requested counts column, or if
            start/end do not describe a non-empty range inside the
            sequence.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Retrieve type of sequence
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, 'seqs')]
    if len(seq_cols) != 1:
        raise SortSeqError('Dataset dataframe must have only one seq colum.')
    colname = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[colname]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    num_chars = len(alphabet)

    # Retrieve sequence length. A single row is enough to build a profile,
    # so only a completely empty dataset is rejected (this matches the
    # error message).
    if dataset_df.shape[0] < 1:
        raise SortSeqError('Dataset dataframe must have at least one row.')
    seqs = dataset_df[colname]
    total_seq_length = len(seqs.iloc[0])

    # Validate start and end
    if start < 0:
        raise SortSeqError('start=%d is negative.' % start)
    elif start >= total_seq_length:
        raise SortSeqError('start=%d >= total_seq_length=%d' %
                           (start, total_seq_length))

    if end is None:
        end = total_seq_length
    elif end <= start:
        raise SortSeqError('end=%d <= start=%d.' % (end, start))
    elif end > total_seq_length:
        # Report the offending end value (not start)
        raise SortSeqError('end=%d > total_seq_length=%d' %
                           (end, total_seq_length))

    # Set positions
    poss = pd.Series(range(start, end), name='pos')
    num_poss = len(poss)

    # Retrieve counts
    ct_col = 'ct' if bin is None else 'ct_%d' % bin
    if ct_col not in dataset_df.columns:
        raise SortSeqError('Column "%s" is not in columns=%s' %
                           (ct_col, str(dataset_df.columns)))
    counts = dataset_df[ct_col]

    # Compute counts profile: for each position, sum the counts of all
    # sequences carrying each alphabet character at that position
    counts_array = np.zeros([num_poss, num_chars])
    counts_cols = ['ct_' + a for a in alphabet]
    for i, pos in enumerate(range(start, end)):
        char_list = seqs.str.slice(pos, pos + 1)
        counts_array[i, :] = [np.sum(counts[char_list == a])
                              for a in alphabet]
    temp_df = pd.DataFrame(counts_array, columns=counts_cols)
    counts_df = pd.concat([poss, temp_df], axis=1)

    # Validate as counts dataframe
    counts_df = qc.validate_profile_ct(counts_df, fix=True)
    return counts_df
def merge_datasets(dataset_df_dict):
    """
    Merges multiple datasets into one.

    Data from disparate files is merged via values in 'tag', 'seq',
    'seq_rna', or 'seq_pro' columns (in order of preference, chosen
    according to availability in the first dataset). Each value in the
    'ct' column of each dataset is recorded in the 'ct_[bin]' column of the
    final dataset. A total 'ct' column is then computed, and rows in the
    final dataset are sorted in descending order according to this.

    The input dataframes are not modified.

    Arguments:
        dataset_df_dict (dict): Keys are bin numbers, values are dataset
            dataframes

    Returns:
        out_df (pd.DataFrame): A validated dataset dataframe

    Raises:
        SortSeqError: If no datasets are given, if no usable index column
            is found, or if a dataset lacks the index column or a 'ct'
            column.
    """
    # Make sure datasets were loaded
    if len(dataset_df_dict) < 1:
        raise SortSeqError('No datasets were loaded')

    # Determine index column. Must be same for all files.
    # next(iter(...)) works for both a list and a dict view of values.
    first_df = next(iter(dataset_df_dict.values()))
    for candidate in ('tag', 'seq', 'seq_rna', 'seq_pro'):
        if candidate in first_df.columns:
            index_col = candidate
            break
    else:
        raise SortSeqError(
            'Dataframe does not contain any of the columns '
            '"tag", "seq", "seq_rna", "seq_pro"')

    # Build one per-bin frame, indexed by index_col, for every dataset
    frames = []
    for b, df in dataset_df_dict.items():

        # Verify that dataframe has correct columns
        if index_col not in df.columns:
            raise SortSeqError(
                'Dataframe does not contain index_col="%s"' % index_col)
        if 'ct' not in df.columns:
            raise SortSeqError('Dataframe does not contain a "ct" column')

        # Drop existing "ct_X" columns. drop() returns a new frame, so the
        # caller's dataframe is left untouched.
        stale_cols = [col for col in df.columns
                      if qc.is_col_type(col, 'ct_')]
        df = df.drop(stale_cols, axis=1)

        # Add bin number to name of counts column.
        df = df.rename(columns={'ct': 'ct_%d' % b})

        # Index dataset by index_col, summing rows that share a value
        frames.append(df.groupby(index_col).sum())

    # Align all bins on the union of index values
    out_df = pd.concat(frames, axis=1)

    # Turn the index back into the index_col column. The name is set
    # explicitly because concat does not guarantee it is preserved.
    out_df.index.name = index_col
    out_df.reset_index(inplace=True)

    # Fill undefined counts with zero
    out_df.fillna(value=0, inplace=True)

    # Add 'ct' column, with proper counts
    bin_cols = [col for col in out_df.columns if qc.is_col_type(col, 'ct_')]
    out_df['ct'] = out_df[bin_cols].sum(axis=1)

    # Sort by 'ct' column (DataFrame.sort was replaced by sort_values)
    out_df.sort_values('ct', ascending=False, inplace=True)
    out_df.reset_index(drop=True, inplace=True)

    # Validate out_df as dataset and return it
    out_df = qc.validate_dataset(out_df, fix=True)
    return out_df
def main(dataset_df, err=False, method='naive',
         pseudocount=1.0, start=0, end=None):
    """
    Computes the mutual information (in bits), at each position, between
    the character and the bin number.

    Arguments:
        dataset_df (pd.DataFrame): A dataframe containing a valid dataset.
        err (bool): If true, an 'info_err' column is added to the result,
            filled from the second value returned by
            info.estimate_mutualinfo(..., err=True).
        method (str): Which method to use to estimate mutual information;
            forwarded to info.estimate_mutualinfo.
        pseudocount (float): Forwarded to info.estimate_mutualinfo.
        start (int): An integer specifying the sequence start position
        end (int): An integer specifying the sequence end position
            (exclusive). If None, the length of the first sequence is used.

    Returns:
        info_df (pd.DataFrame): A dataframe containing results.

    Raises:
        SortSeqError: If fewer than 2 'ct_<bin>' columns are present, if
            there is not exactly one sequence column, if the dataset has
            no rows, or if start/end are invalid.
    """
    # Validate dataset_df
    qc.validate_dataset(dataset_df)

    # Get number of bins
    bin_cols = [c for c in dataset_df.columns if qc.is_col_type(c, 'ct_')]
    if len(bin_cols) < 2:
        raise SortSeqError('Information profile requires at least 2 bins.')
    bins = [int(c.split('_')[1]) for c in bin_cols]
    num_bins = len(bins)

    # Get number of characters
    seq_cols = [c for c in dataset_df.columns if qc.is_col_type(c, 'seqs')]
    if len(seq_cols) != 1:
        raise SortSeqError('Must be only one seq column.')
    seq_col = seq_cols[0]
    seqtype = qc.colname_to_seqtype_dict[seq_col]
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    ct_cols = ['ct_' + a for a in alphabet]
    num_chars = len(alphabet)

    # Get sequence length and check start, end numbers. The first sequence
    # is taken by position (.iloc) so that a dataset whose index does not
    # contain the label 0 still works.
    if dataset_df.shape[0] < 1:
        raise SortSeqError('Dataset dataframe must have at least one row.')
    num_pos = len(dataset_df[seq_col].iloc[0])
    if not (0 <= start < num_pos):
        raise SortSeqError('Invalid start==%d, num_pos==%d' %
                           (start, num_pos))
    if end is None:
        end = num_pos
    elif end > num_pos:
        raise SortSeqError('Invalid end==%d, num_pos==%d' % (end, num_pos))
    elif end <= start:
        raise SortSeqError('Invalid: start==%d >= end==%d' % (start, end))

    # Record positions in new dataframe. The counts profile covers the
    # whole sequence, and its rows are selected by label start..end-1.
    counts_df = profile_ct.main(dataset_df)
    info_df = counts_df.loc[start:(end - 1), ['pos']].copy()
    info_df['info'] = 0.0
    if err:
        info_df['info_err'] = 0.0

    # Fill in 3D array of counts: (position, character, bin)
    ct_3d_array = np.zeros([end - start, num_chars, num_bins])
    for i, bin_num in enumerate(bins):

        # Compute counts for this bin
        counts_df = profile_ct.main(dataset_df, bin=bin_num)

        # Fill in counts table
        ct_3d_array[:, :, i] = \
            counts_df.loc[start:(end - 1), ct_cols].astype(float).values

    # Compute mutual information for each position
    for i in range(end - start):

        # 2D (character x bin) counts at this position
        nxy = ct_3d_array[i, :, :]

        # info_df keeps the labels of the rows it was sliced from
        row = i + start

        # Compute mutual information
        if err:
            mi, mi_err = info.estimate_mutualinfo(
                nxy, err=True, method=method, pseudocount=pseudocount)
            info_df.loc[row, 'info'] = mi
            info_df.loc[row, 'info_err'] = mi_err
        else:
            mi = info.estimate_mutualinfo(
                nxy, err=False, method=method, pseudocount=pseudocount)
            info_df.loc[row, 'info'] = mi

    # Validate info dataframe
    info_df = qc.validate_profile_info(info_df, fix=True)
    return info_df