Ejemplo n.º 1
0
def main(dataset_df,model_df,left=None,right=None):
    """
    Evaluate a model on a window of every sequence in a dataset.

    The window [start, end) is chosen in one of three ways:
      * `left` given:  the window starts at `left`;
      * `right` given: the window ends at `right`;
      * neither given: the window is taken from the first and last
        entries of the model's 'pos' column.

    Arguments:
        dataset_df: dataset dataframe; checked with qc.validate_dataset.
        model_df:   model dataframe; checked with qc.validate_model.
        left:       optional start of the window (exclusive with `right`).
        right:      optional end of the window (exclusive with `left`).

    Returns:
        The result of qc.validate_dataset(out_df, fix=True), where out_df
        is a copy of the dataset whose sequence column has been cropped
        to the window and which carries the model values in column 'val'.

    Raises:
        SortSeqError: if both `left` and `right` are given, if the window
            falls outside the sequences, or if the model type is not
            'MAT' or 'NBR'.
    """

    # Validate dataframes
    qc.validate_dataset(dataset_df)
    qc.validate_model(model_df)

    # Detect model type based on columns
    seqtype, modeltype = qc.get_model_type(model_df)
    seqcol = qc.seqtype_to_seqcolname_dict[seqtype]

    # An 'NBR' model covers one more sequence position than it has rows
    extra = 1 if modeltype=='NBR' else 0

    # Set start and end based on left or right
    if (left is not None) and (right is not None):
        raise SortSeqError('Cannot set both left and right at same time.')
    if left is not None:
        start = left
        end = start + model_df.shape[0] + extra
    elif right is not None:
        end = right
        start = end - model_df.shape[0] - extra
    else:
        start = model_df['pos'].values[0]
        end = model_df['pos'].values[-1] + 1 + extra
    assert start < end

    # Validate start and end positions against the first sequence.
    # .iloc is positional, so this does not depend on the index labels.
    seq_length = len(dataset_df[seqcol].iloc[0])
    if start < 0:
        raise SortSeqError('Invalid start=%d'%start)
    if end > seq_length:
        raise SortSeqError('Invalid end=%d for seq_length=%d'%(end,seq_length))

    # Select target sequence region. The column name depends on the
    # sequence type, so use seqcol rather than a hard-coded 'seq'.
    out_df = dataset_df.copy()
    out_df.loc[:,seqcol] = out_df.loc[:,seqcol].str.slice(start,end)

    # Create model object of correct type
    if modeltype == 'MAT':
        mymodel = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        mymodel = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s'%modeltype)

    # Compute values
    out_df['val'] = mymodel.evaluate(out_df)

    # Validate dataframe and return
    return qc.validate_dataset(out_df,fix=True)
Ejemplo n.º 2
0
    def __init__(self,model_df):
        """
        Build the model from a model dataframe.

        A validated (and fixed) copy of `model_df` is stored, so the
        caller's dataframe is not modified. Only models that
        qc.get_model_type reports as 'NBR' are accepted; any other type
        raises SortSeqError.
        """
        validated_df = qc.validate_model(model_df.copy(),fix=True)
        seqtype, modeltype = qc.get_model_type(validated_df)
        if modeltype != 'NBR':
            raise SortSeqError('Invalid modeltype: %s'%modeltype)

        # Character <-> index lookup tables for this sequence type
        dicts = utils.choose_dict(seqtype,modeltype=modeltype)
        seq_dict,inv_dict = dicts

        self.seqtype = seqtype
        self.seq_dict = seq_dict
        self.inv_dict = inv_dict
        self.df = validated_df
        # The model length is one more than its number of rows
        self.length = validated_df.shape[0]+1

        # Keep the value columns as an array with one row per value column
        val_cols = qc.get_cols_from_df(validated_df,'vals')
        self.matrix = np.array(validated_df[val_cols]).T
Ejemplo n.º 3
0
def _fix_gauge(mat,modeltype):
    """Gauge-fix `mat` with the gauge routine matching `modeltype`."""
    if modeltype == 'NBR':
        return gauge.fix_neighbor(mat)
    if modeltype == 'MAT':
        return gauge.fix_matrix(mat)
    raise SortSeqError('Unrecognized model type %s'%modeltype)


def main(df,lm='IM',modeltype='MAT',LS_means_std=None,\
    db=None,iteration=30000,burnin=1000,thin=10,\
    runnum=0,initialize='LS',start=0,end=None,foreground=1,\
    background=0,alpha=0,pseudocounts=1,test=False,drop_library=False,\
    verbose=False):
    """
    Fit a model to a dataset and return it as a model dataframe.

    Arguments:
        df: dataset dataframe with exactly one sequence column, a 'ct'
            column and the count columns reported by
            utils.get_column_headers.
        lm: learning method. 'ER' uses Berg_von_Hippel, 'LS' uses
            Compute_Least_Squares, 'IM' uses MaximizeMI_memsaver.
        modeltype: 'MAT' or 'NBR'.
        LS_means_std: (lm='LS') if given, passed to io.load_meanstd to
            obtain per-bin means and stds; counts are divided by the bin
            std and the means are forwarded to utils.genweightandmat.
        db, iteration, burnin, thin, runnum: (lm='IM') forwarded to
            MaximizeMI_memsaver.
        initialize: (lm='IM') how the starting matrix is built: 'rand'
            uses utils.RandEmat, 'LS' runs this function with lm='LS'.
        start, end: slice bounds applied to every sequence before fitting.
        foreground, background, pseudocounts: (lm='ER') forwarded to
            Berg_von_Hippel.
        alpha: (lm='LS') forwarded to Compute_Least_Squares.
        test: not used by this function.
        drop_library: (lm='LS') drop the 'ct_0' column before fitting.
        verbose: forwarded to MaximizeMI_memsaver.

    Returns:
        A dataframe with a 'pos' column followed by one 'val_*' column
        per character, passed through qc.validate_model(fix=True).

    Raises:
        SortSeqError: if the dataframe does not have exactly one sequence
            column, if `lm`, `initialize` or `modeltype` is not
            recognized, or if `drop_library` cannot be honoured.
    """
    # Fail early instead of hitting a NameError after all the preprocessing
    if lm not in ('ER','LS','IM'):
        raise SortSeqError('Unrecognized learning method %s'%lm)

    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)

    # NOTE: this function does not check the sequences against the chosen
    # dictionary. Such a check could not tell DNA from protein anyway,
    # because A, C, G, T are also valid amino acids.

    # Set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    # seq_dict without the entries whose value is len(seq_dict)-1
    par_seq_dict = {v:k for v,k in seq_dict.items() if k != (len(seq_dict)-1)}

    # Drop any rows with ct = 0
    df = df[df.loc[:,'ct'] != 0].reset_index(drop=True)

    # If there are sequences of different lengths, print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
         sys.stderr.write('Lengths of all sequences are not the same!')

    # Select target sequence region
    df.loc[:,seq_col_name] = df.loc[:,seq_col_name].str.slice(start,end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    # Make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    # Create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True,drop=True)

    # Drop any sequences with incorrect length. The sequences were already
    # sliced to [start:end] above, so when no end was supplied the length
    # of the first (sliced) sequence is the reference length as-is;
    # subtracting start again would discard every row whenever start > 0.
    if not end:
        seqL = len(df[seq_col_name][0])
    else:
        seqL = end-start
    df = df[df[seq_col_name].apply(len) == seqL].reset_index(drop=True)

    # Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df,dicttype,foreground=foreground,background=background,
            pseudocounts=pseudocounts)
        # Berg_von_Hippel returns a dataframe with columns [pos,val_...]
        # that is transposed compared to the other methods' output, so no
        # transpose is applied before gauge fixing.
        emat_typical = gauge.fix_matrix(np.array(emat[val_cols]))

    elif lm == 'LS':
        if LS_means_std: #If user supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            # Change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels
            # Change weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None

        # Drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0].reset_index(drop=True)

        # For sort-seq experiments, bin_0 is library only and isn't the
        # lowest expression even though it will be calculated as such if we
        # proceed. Therefore if drop_library is passed, drop this column
        # from the analysis.
        if drop_library:
            if 'ct_0' not in df.columns:
                raise SortSeqError('''drop_library option was passed, but no ct_0
                    column exists''')
            # axis=1 is required: without it drop() looks for a *row*
            # labelled 'ct_0' and always fails.
            df = df.drop('ct_0',axis=1)
            col_headers = utils.get_column_headers(df)
            if len(col_headers) < 2:
                raise SortSeqError(
                        '''After dropping library there are no longer enough 
                        columns to run the analysis''')

        # Parameterize sequences
        raveledmat,batch,sw = utils.genweightandmat(
            df,par_seq_dict,dicttype,means=means,modeltype=modeltype)
        # Use ridge regression to find matrix
        emat = Compute_Least_Squares(raveledmat,batch,sw,alpha=alpha)
        emat_typical = utils.emat_typical_parameterization(emat,len(seq_dict))
        emat_typical = _fix_gauge(np.transpose(emat_typical),modeltype)

    else: # lm == 'IM'
        seq_mat,wtrow = numerics.dataset2mutarray(df.copy(),modeltype)
        # Build the starting matrix for the MCMC routine
        if initialize == 'rand':
            site_len = len(df[seq_col_name][0])
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(site_len,len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(site_len-1,len(seq_dict))
            else:
                raise SortSeqError('Unrecognized model type %s'%modeltype)
        elif initialize == 'LS':
            # The sequences are already cropped, hence start=0, end=None
            emat_0_df = main(df.copy(),lm='LS',modeltype=modeltype,alpha=alpha,start=0,end=None,verbose=verbose)
            emat_0 = np.transpose(np.array(emat_0_df[val_cols]))
        else:
            raise SortSeqError('Unrecognized initialization %s'%initialize)
        # pymc doesn't take sparse mat
        emat = MaximizeMI_memsaver(
                seq_mat,df.copy(),emat_0,wtrow,db=db,iteration=iteration,burnin=burnin,
                thin=thin,runnum=runnum,verbose=verbose)
        emat_typical = _fix_gauge(np.transpose(emat),modeltype)

    # Now format the energy matrix to get it ready to output
    em = pd.DataFrame(emat_typical)
    em.columns = val_cols
    # Add position column; an 'NBR' model has one row fewer than the
    # sequence has characters
    site_len = len(df[seq_col_name][0])
    if modeltype == 'NBR':
        pos = pd.Series(range(start,start - 1 + site_len),name='pos')
    else:
        pos = pd.Series(range(start,start + site_len),name='pos')
    output_df = pd.concat([pos,em],axis=1)

    # Validate model and return
    output_df = qc.validate_model(output_df,fix=True)
    return output_df
Ejemplo n.º 4
0
def main(model_df, contig_list, numsites=10, verbose=False):
    """
    Scan contigs with a model and return the highest-valued sites.

    Every window of the model's length in every contig is evaluated; for
    DNA models the reverse-complement windows are evaluated as well
    (reported with ori '-'). The running list is sorted by 'val' in
    descending order and cropped to `numsites` rows after each contig.

    Arguments:
        model_df:    model dataframe; checked with qc.validate_model.
        contig_list: sequence of (contig_str, contig_name, pos_offset)
            tuples. It is iterated twice, so it must not be a one-shot
            iterator. Contigs shorter than the model are skipped.
        numsites:    maximum number of sites to keep.
        verbose:     if True, write a progress dot to stdout after each
            scanned contig.

    Returns:
        The site list passed through qc.validate_sitelist(fix=True), with
        columns 'val', the sequence column, 'left', 'right', 'ori' and
        'contig'. 'left' and 'right' are the first and last position of
        the site, shifted by the contig's pos_offset.

    Raises:
        SortSeqError: if a contig contains a character outside the
            model's alphabet, if the model type is not 'MAT' or 'NBR', or
            if no contig is long enough to hold a site.
    """

    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)

    # Check that all characters are from the correct alphabet. The
    # alphabet is escaped so that none of its characters can be read as
    # regex syntax inside the character class.
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    search_string = r"[^%s]"%re.escape(alphabet)
    for contig_str, contig_name, pos_offset in contig_list:
        if re.search(search_string,contig_str):
            raise SortSeqError(\
                'Invalid character for seqtype %s found in %s.'%\
                (seqtype,contig_name))

    # Create model object to evaluate on seqs
    if modeltype == 'MAT':
        model_obj = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        model_obj = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s'%modeltype)

    # Accumulate sites contig by contig
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    columns = ['val',seq_col,'left','right','ori','contig']
    L = model_obj.length
    sitelist_df = pd.DataFrame(columns=columns)
    for contig_str, contig_name, pos_offset in contig_list:
        if len(contig_str) < L:
            continue
        this_df = pd.DataFrame(columns=columns)
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df['left'] = poss + pos_offset
        this_df['right']  = poss + pos_offset + L - 1
        this_df[seq_col] = fast.seq2sitelist(contig_str,L)  #Cython
        this_df['ori'] = '+'
        this_df['contig'] = contig_name
        this_df['val'] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df,this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well
        if seqtype=='dna':
            this_df[seq_col] = fast.seq2sitelist(contig_str,L,rc=True)  #Cython
            this_df['ori'] = '-'
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df,this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by='val', ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True,inplace=True)

        # Crop list at numsites so the running list stays small
        if sitelist_df.shape[0]>numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        if verbose:
            # Same bytes as the Python 2 statement "print '.',"
            sys.stdout.write('. ')
            sys.stdout.flush()

    if verbose:
        sys.stdout.write('\n')
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0]==0:
        raise SortSeqError(\
            'No full-length sites found within provided contigs.')

    sitelist_df = qc.validate_sitelist(sitelist_df,fix=True)
    return sitelist_df