Exemple #1
0
def main(
        data_df,model_df,
        start=0,end=None,err=False,coarse_graining_level=0):
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)
    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    #Cut the sequences based on start and end, and then check if it makes sense
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = data_df.loc[:,seq_col_name].str.slice(start,end)
        if modeltype=='MAT':
            if len(data_df.loc[0,seq_col_name]) != len(model_df.loc[:,'pos']):
                raise SortSeqError('model length does not match dataset length')
        elif modeltype=='NBR':
            if len(data_df.loc[0,seq_col_name]) != len(model_df.loc[:,'pos'])+1:
                raise SortSeqError('model length does not match dataset length')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
                data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]        
    if not end:
        seqL = len(data_df[seq_col_name][0]) - start
    else:
        seqL = end-start
    data_df = data_df[data_df[seq_col_name].apply(len) == (seqL)] 
    #make a numpy array out of the model data frame
    model_df_headers = ['val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    value = np.transpose(np.array(model_df[model_df_headers]))  
    #now we evaluate the expression of each sequence according to the model.
    seq_mat,wtrow = numerics.dataset2mutarray(data_df.copy(),modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(np.array(model_df[model_df_headers]),seq_mat,wtrow) 
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True,drop=True)
    #we must divide by the total number of counts in each bin for the MI calculator
    #temp_sorted[col_headers] = temp_sorted[col_headers].div(temp_sorted['ct'],axis=0)     
    MI = EstimateMutualInfoforMImax.alt4(temp_sorted,coarse_graining_level=coarse_graining_level)
    if not err:
        Std = np.NaN
    else:
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(len(data_df_for_sub.index)/2))
            sub_df.reset_index(inplace=True,drop=True)
            sub_MI[i],sub_std = main(
                sub_df,model_df,err=False)
        Std = np.std(sub_MI)/np.sqrt(2)
    return MI,Std
Exemple #2
0
 def emat(p=pymcdf,value=emat_0):         
     p['val'] = numerics.eval_modelmatrix_on_mutarray(np.transpose(value),seq_mat,wtrow)                     
     MI = EstimateMutualInfoforMImax.alt4(p.copy())  # New and improved
     return n_seqs*MI
Exemple #3
0
 def evaluate_on_mutarray(self, mutarray, wtrow):
     return numerics.eval_modelmatrix_on_mutarray(\
         modelmatrix=self.matrix.T, mutarray=mutarray, wtrow=wtrow)
def main(data_df,
         model_df,
         start=0,
         end=None,
         err=False,
         coarse_graining_level=0,
         rsquared=False,
         return_freg=False):

    #determine whether you are working with RNA, DNA, or protein.
    #this also should determine modeltype (MAT, NBR, PAIR).
    dicttype, modeltype = qc.get_model_type(model_df)

    #get column header for the sequence column.
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))

    #create dictionary that goes from, for example, nucleotide to number and
    #visa versa.
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    #set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    if not end:
        seqL = len(data_df[seq_col_name][0]) - start
    else:
        seqL = end - start
    #throw out wrong length sequences.
    #Cut the sequences based on start and end, and then check if it makes sense
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = \
            data_df.loc[:,seq_col_name].str.slice(start,end)
        right_length = data_df.loc[:, seq_col_name].apply(len) == (seqL)
        if not right_length.all():
            sys.stderr.write('''Not all sequences are the same length! 
                       Throwing out incorrect sequences!''')
            data_df = data_df.loc[right_length, :]
        data_df = data_df.reset_index(drop=True)

        if modeltype == 'MAT':
            if seqL != len(model_df.loc[:, 'pos']):
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'NBR':
            if seqL != len(model_df.loc[:, 'pos']) + 1:
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'PAIR':
            if int(scipy.misc.comb(seqL, 2)) != len(model_df.loc[:, 'pos']):
                raise SortSeqError(
                    'model length does not match dataset length')

    #get column names of the counts columns (excluding total counts 'ct')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)

    #remove empty rows.
    data_df = data_df[data_df.ct != 0]

    #determine sequence length.

    #make a numpy array out of the model data frame
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))
    ]
    value = np.array(model_df[model_df_headers])

    #now we evaluate the expression of each sequence according to the model.
    #first convert to matrix representation of sequences
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()

    #evaluate energy of each sequence
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        value, seq_mat, wtrow)

    #sort based on value
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    #freg is a regularized plot which show how sequences are distributed
    #in energy space.
    if return_freg:
        fig, ax = plt.subplots()
        MI, freg = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)
        plt.imshow(freg, interpolation='nearest', aspect='auto')

        plt.savefig(return_freg)
    else:
        MI = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)

    #if we want to calculate error then use bootstrapping.
    if not err:
        Std = np.NaN
    else:
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(
                len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            sub_MI[i], sub_std = main(sub_df, model_df, err=False)
        Std = np.std(sub_MI) / np.sqrt(2)

    #we can return linfoot corrolation (rsquared) or return MI.
    if rsquared:
        return (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
    else:
        return MI, Std