Esempio n. 1
0
def wrapper(args):
    
    try:
        npar = args.noiseparam.strip('[').strip(']').split(',')
    except:
        npar = []
    nbins = args.nbins
    # Run funciton
    if args.i:
        df = pd.io.parsers.read_csv(
            args.i,delim_whitespace=True,
            dtype={'seqs':str,'batch':int})
    else:
        df = pd.io.parsers.read_csv(
            sys.stdin,delim_whitespace=True,
            dtype={'seqs':str,'batch':int})
    if len(utils.get_column_headers(df)) > 0:
         raise SortSeqError('Library already sorted!')
    model_df = io.load_model(args.model)
    output_df = main(
        df,model_df,args.noisemodel,npar,
        nbins,start=args.start,end=args.end)
    
    if args.out:
        outloc = open(args.out,'w')
    else:
        outloc = sys.stdout
    pd.set_option('max_colwidth',int(1e8))

    # Validate dataframe for writting
    output_df = qc.validate_dataset(output_df,fix=True)
    io.write(output_df,outloc)
Esempio n. 2
0
def main(
        data_df,model_df,
        start=0,end=None,err=False):
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)
    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    #Cut the sequences based on start and end, and then check if it makes sense
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = data_df.loc[:,seq_col_name].str.slice(start,end)
        if modeltype=='MAT':
            if len(data_df.loc[0,seq_col_name]) != len(model_df.loc[:,'pos']):
                raise SortSeqError('model length does not match dataset length')
        elif modeltype=='NBR':
            if len(data_df.loc[0,seq_col_name]) != len(model_df.loc[:,'pos'])+1:
                raise SortSeqError('model length does not match dataset length')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
                data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]        
    if not end:
        seqL = len(data_df[seq_col_name][0]) - start
    else:
        seqL = end-start
    data_df = data_df[data_df[seq_col_name].apply(len) == (seqL)] 
    #make a numpy array out of the model data frame
    model_df_headers = ['val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    value = np.transpose(np.array(model_df[model_df_headers]))  
    #now we evaluate the expression of each sequence according to the model.
    seq_mat,wtrow = numerics.dataset2mutarray(data_df.copy(),modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(np.array(model_df[model_df_headers]),seq_mat,wtrow) 
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True,drop=True)
    #we must divide by the total number of counts in each bin for the MI calculator
    #temp_sorted[col_headers] = temp_sorted[col_headers].div(temp_sorted['ct'],axis=0)     
    MI = EstimateMutualInfoforMImax.alt4(temp_sorted,coarse_graining_level=0)
    if not err:
        Std = np.NaN
    else:
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(len(data_df_for_sub.index)/2))
            sub_df.reset_index(inplace=True,drop=True)
            sub_MI[i],sub_std = main(
                sub_df,model_df,modeltype=modeltype,err=False)
        Std = np.std(sub_MI)/np.sqrt(2)
    return MI,Std
Esempio n. 3
0
def Berg_von_Hippel(df,dicttype,foreground=1,background=0,pseudocounts=1):
    '''Learn models using berg von hippel model. The foreground sequences are
         usually bin_1 and background in bin_0, this can be changed via flags.''' 
    seq_dict,inv_dict = utils.choose_dict(dicttype)
    #check that the foreground and background chosen columns actually exist.
    columns_to_check = {'ct_' + str(foreground),'ct_' + str(background)}
    if not columns_to_check.issubset(set(df.columns)):
        raise SortSeqError('Foreground or Background column does not exist!')

    #get counts of each base at each position
    foreground_counts = utils.profile_counts(df,dicttype,bin_k=foreground)   
    background_counts = utils.profile_counts(df,dicttype,bin_k=background)
    binheaders = utils.get_column_headers(foreground_counts)
    #add pseudocounts to each position
    foreground_counts[binheaders] = foreground_counts[binheaders] + pseudocounts
    background_counts[binheaders] = background_counts[binheaders] + pseudocounts
    #make sure there are no zeros in counts after addition of pseudocounts
    ct_headers = utils.get_column_headers(foreground_counts)
    if foreground_counts[ct_headers].isin([0]).values.any():
        raise SortSeqError('''There are some bases without any representation in\
            the foreground data, you should use pseudocounts to avoid failure \
            of the learning method''')
    if background_counts[ct_headers].isin([0]).values.any():
        raise SortSeqError('''There are some bases without any representation in\
            the background data, you should use pseudocounts to avoid failure \
            of the learning method''')
    #normalize to compute frequencies
    foreground_freqs = foreground_counts.copy()
    background_freqs = background_counts.copy()
    foreground_freqs[binheaders] = foreground_freqs[binheaders].div(
        foreground_freqs[binheaders].sum(axis=1),axis=0)
    background_freqs[binheaders] = background_freqs[binheaders].div(
        background_freqs[binheaders].sum(axis=1),axis=0)
    
    output_df = -np.log(foreground_freqs/background_freqs)
    #change column names accordingly (instead of ct_ we want val_)
    rename_dict = {'ct_' + str(inv_dict[i]):'val_' + str(inv_dict[i]) for i in range(len(seq_dict))}
    output_df = output_df.rename(columns=rename_dict)
    return output_df
Esempio n. 4
0
def main(
    df,mp,noisetype,npar,nbins,sequence_library=True,
    start=0,end=None):
    #validate noise parameters
    if not isinstance(npar,list):
        raise SortSeqError('Noise parameters must be given as a list')
    if noisetype == 'Normal':
        if len(npar) != 1:
            raise SortSeqError('''For a normal noise model, there must be one 
                 input parameter (width of normal distribution)''')
    if noisetype == 'LogNormal':
        if len(npar) != 2:
             raise SortSeqError('''For a LogNormal noise model there must 
                 be 2 input parameters''')
    if nbins <= 1:
        raise SortSeqError('number of bins must be greater than 1')
    #generate predicted energy of each sequence.
    df = evaluate_model.main(df,mp,left=start,right=None)
    #Determine model type to use for noise
    if noisetype == 'LogNormal':
        NoiseModelSort = Models.LogNormalNoise(npar)
    elif noisetype == 'Normal':
        NoiseModelSort = Models.NormalNoise(npar)
    elif noisetype == 'None':
        NoiseModelSort = Models.NormalNoise([1e-16])
    else:
        NoiseModelSort = Models.CustomModel(noisetype,npar)
    #Apply noise to our calculated energies
    noisyexp,listnoisyexp = NoiseModelSort.genlist(df)
    #Determine Expression Cutoffs for bins
    noisyexp.sort()
    cutoffs = list(
        noisyexp[np.linspace(0,len(noisyexp),nbins,endpoint=False,dtype=int)])
    cutoffs.append(np.inf)
    seqs_arr = np.zeros([len(listnoisyexp),nbins],dtype=int)
    #split sequence into bins based on calculated cutoffs
    for i,entry in enumerate(listnoisyexp):
        seqs_arr[i,:] = np.histogram(entry,bins=cutoffs)[0]
    col_labels = ['ct_' + str(i+1) for i in range(nbins)]
    if sequence_library:
        df['ct_0'] = utils.sample(df['ct'],int(df['ct'].sum()/nbins))
    output_df = pd.concat([df,pd.DataFrame(seqs_arr,columns=col_labels)],axis=1)
    col_labels = utils.get_column_headers(output_df)
    output_df['ct'] = output_df[col_labels].sum(axis=1)      
    output_df = output_df.drop('val',axis=1)
    return output_df
Esempio n. 5
0
def main(df,lm='IM',modeltype='MAT',LS_means_std=None,\
    db=None,iteration=30000,burnin=1000,thin=10,\
    runnum=0,initialize='LS',start=0,end=None,foreground=1,\
    background=0,alpha=0,pseudocounts=1,test=False,drop_library=False,\
    verbose=False):
    
    # Determine dictionary
    seq_cols = qc.get_cols_from_df(df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)
    
    '''Check to make sure the chosen dictionary type correctly describes
         the sequences. An issue with this test is that if you have DNA sequence
         but choose a protein dictionary, you will still pass this test bc A,C,
         G,T are also valid amino acids'''
    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]
    lin_seq_dict,lin_inv_dict = utils.choose_dict(dicttype,modeltype='MAT')
    #wtseq = utils.profile_counts(df.copy(),dicttype,return_wtseq=True,start=start,end=end)
    #wt_seq_dict_list = [{inv_dict[np.mod(i+1+seq_dict[w],len(seq_dict))]:i for i in range(len(seq_dict)-1)} for w in wtseq]
    par_seq_dict = {v:k for v,k in seq_dict.items() if k != (len(seq_dict)-1)}
    #drop any rows with ct = 0
    df = df[df.loc[:,'ct'] != 0]
    df.reset_index(drop=True,inplace=True)
    
    #If there are sequences of different lengths, then print error but continue
    if len(set(df[seq_col_name].apply(len))) > 1:
         sys.stderr.write('Lengths of all sequences are not the same!')
    #select target sequence region
    df.loc[:,seq_col_name] = df.loc[:,seq_col_name].str.slice(start,end)
    df = utils.collapse_further(df)
    col_headers = utils.get_column_headers(df)
    #make sure all counts are ints
    df[col_headers] = df[col_headers].astype(int)
    #create vector of column names
    val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
    df.reset_index(inplace=True,drop=True)
    #Drop any sequences with incorrect length
    if not end:
        '''is no value for end of sequence was supplied, assume first seq is
            correct length'''
        seqL = len(df[seq_col_name][0]) - start
    else:
        seqL = end-start
    df = df[df[seq_col_name].apply(len) == (seqL)]
    df.reset_index(inplace=True,drop=True)
    #Do something different for each type of learning method (lm)
    if lm == 'ER':
        emat = Berg_von_Hippel(
            df,dicttype,foreground=foreground,background=background,
            pseudocounts=pseudocounts)
    if lm == 'LS':
        '''First check that is we don't have a penalty for ridge regression,
            that we at least have all possible base values so that the analysis
            will not fail'''
        if LS_means_std: #If user supplied preset means and std for each bin
            means_std_df = io.load_meanstd(LS_means_std)

            #change bin number to 'ct_number' and then use as index
            labels = list(means_std_df['bin'].apply(add_label))
            std = means_std_df['std']
            std.index = labels
            #Change Weighting of each sequence by dividing counts by bin std
            df[labels] = df[labels].div(std)
            means = means_std_df['mean']
            means.index = labels
        else:
            means = None
        #drop all rows without counts
        df['ct'] = df[col_headers].sum(axis=1)
        df = df[df.ct != 0]        
        df.reset_index(inplace=True,drop=True)
        ''' For sort-seq experiments, bin_0 is library only and isn't the lowest
            expression even though it is will be calculated as such if we proceed.
            Therefore is drop_library is passed, drop this column from analysis.'''
        if drop_library:
            try:     
                df.drop('ct_0',inplace=True)
                col_headers = utils.get_column_headers(df)
                if len(col_headers) < 2:
                    raise SortSeqError(
                        '''After dropping library there are no longer enough 
                        columns to run the analysis''')
            except:
                raise SortSeqError('''drop_library option was passed, but no ct_0
                    column exists''')
        #parameterize sequences into 3xL vectors
                               
        raveledmat,batch,sw = utils.genweightandmat(
                                  df,par_seq_dict,dicttype,means=means,modeltype=modeltype)
        #Use ridge regression to find matrix.       
        emat = Compute_Least_Squares(raveledmat,batch,sw,alpha=alpha)

    if lm == 'IM':
        seq_mat,wtrow = numerics.dataset2mutarray(df.copy(),modeltype)
        #this is also an MCMC routine, do the same as above.
        if initialize == 'rand':
            if modeltype == 'MAT':
                emat_0 = utils.RandEmat(len(df[seq_col_name][0]),len(seq_dict))
            elif modeltype == 'NBR':
                emat_0 = utils.RandEmat(len(df['seq'][0])-1,len(seq_dict))
        elif initialize == 'LS':
            emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
            emat_0_df = main(df.copy(),lm='LS',modeltype=modeltype,alpha=alpha,start=0,end=None,verbose=verbose)
            emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))   
            #pymc doesn't take sparse mat        
        emat = MaximizeMI_memsaver(
                seq_mat,df.copy(),emat_0,wtrow,db=db,iteration=iteration,burnin=burnin,
                thin=thin,runnum=runnum,verbose=verbose)
    #now format the energy matrices to get them ready to output
    if (lm == 'IM' or lm == 'memsaver'):       
        if modeltype == 'NBR':
             emat_typical = gauge.fix_neighbor(np.transpose(emat))
        elif modeltype == 'MAT':
             emat_typical = gauge.fix_matrix(np.transpose(emat))
    
    elif lm == 'ER': 
        '''the emat for this format is currently transposed compared to other formats
        it is also already a data frame with columns [pos,val_...]'''
        emat_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        emat_typical = emat[emat_cols]
        emat_typical = (gauge.fix_matrix((np.array(emat_typical))))
        
    else: #must be Least squares
        emat_typical = utils.emat_typical_parameterization(emat,len(seq_dict))        
        if modeltype == 'NBR':
             emat_typical = gauge.fix_neighbor(np.transpose(emat_typical))
        elif modeltype == 'MAT':
             emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
    
    em = pd.DataFrame(emat_typical)
    em.columns = val_cols
    #add position column
    if modeltype == 'NBR':
        pos = pd.Series(range(start,start - 1 + len(df[seq_col_name][0])),name='pos') 
    else:
        pos = pd.Series(range(start,start + len(df[seq_col_name][0])),name='pos')    
    output_df = pd.concat([pos,em],axis=1)

    # Validate model and return
    output_df = qc.validate_model(output_df,fix=True)
    return output_df