Ejemplo n.º 1
0
def main(df,dicttype,logo=False,title=None,x0=None):
    '''Autodetect what kind of dataframe df is and draw it.

    The drawing routine is selected from the column names of df, tested in
    this order: library ('ct' and 'seq'), matrix model (all 'val_*'
    columns; drawn as a logo when logo is true), frequency logo (all
    'freq_*' columns), counts (all 'ct_*' columns), information profile
    ('pos' and 'info') and mutation rate ('pos' and 'mut').

    Args:
        df: dataframe whose columns decide which drawing routine is used.
        dicttype: dictionary type handed to utils.choose_dict.
        logo: if the matrix columns are present, draw a logo instead of
            the matrix itself.
        title: accepted for interface compatibility; not used by this
            function.
        x0: forwarded to draw_logo_from_matrix.

    Returns:
        A tuple (myimage, output_via_write). output_via_write is True for
        the two logo routines, whose result must be output via a write
        method; the other results can be output through plt.savefig.

    Raises:
        ValueError: if the columns of df match none of the known layouts.
    '''
    seq_dict,inv_dict = utils.choose_dict(dicttype)
    chars = [inv_dict[i] for i in range(len(seq_dict))]
    columns = set(df.columns)

    def has_all(prefix):
        # True when every per-character column with this prefix exists.
        return all(prefix + c in columns for c in chars)

    output_via_write = False
    if {'ct','seq'}.issubset(columns):
        myimage = draw_library(df,seq_dict)
    elif has_all('val_'):
        if logo:
            myimage = draw_logo_from_matrix(df,seq_dict,inv_dict,dicttype,x0=x0)
            output_via_write = True
        else:
            myimage = draw_matrix(df,seq_dict,inv_dict)
    elif has_all('freq_'):
        myimage = draw_logo(df,seq_dict,inv_dict,dicttype)
        output_via_write = True
    elif has_all('ct_'):
        myimage = draw_counts(df,seq_dict,inv_dict)
    elif {'pos','info'}.issubset(columns):
        myimage = draw_info_profile(df)
    elif {'pos','mut'}.issubset(columns):
        myimage = draw_mutrate(df)
    else:
        # Previously this fell through to the return statement and failed
        # with an UnboundLocalError that said nothing about the input.
        raise ValueError(
            'Could not determine what to draw from columns: %s'
            % list(df.columns))
    return myimage,output_via_write
Ejemplo n.º 2
0
    def Berg_von_Hippel(self,
                        df,
                        dicttype,
                        foreground=1,
                        background=0,
                        pseudocounts=1):
        '''Learn a model with the Berg-von Hippel approach.

        Per-position counts are profiled for the foreground and background
        bins (usually bin 1 and bin 0), pseudocounts are added, each row is
        normalized to frequencies, and the result is the negative log of
        the foreground/background frequency ratio with the 'ct_<char>'
        columns renamed to 'val_<char>'.

        Raises SortSeqError if either bin column is missing from df or if
        any count is still zero after the pseudocounts were added.'''
        seq_dict, inv_dict = utils.choose_dict(dicttype)

        # Both requested bins must be present as 'ct_<bin>' columns.
        fg_col = 'ct_' + str(foreground)
        bg_col = 'ct_' + str(background)
        if not {fg_col, bg_col}.issubset(set(df.columns)):
            raise SortSeqError(
                'Foreground or Background column does not exist!')

        # Count every character at every position, once per bin.
        fg_counts = utils.profile_counts(df, dicttype, bin_k=foreground)
        bg_counts = utils.profile_counts(df, dicttype, bin_k=background)
        binheaders = utils.get_column_headers(fg_counts)

        # Pseudocounts go onto the count columns of both profiles.
        for counts in (fg_counts, bg_counts):
            counts[binheaders] = counts[binheaders] + pseudocounts

        # A remaining zero would break the log ratio below.
        checked_cols = utils.get_column_headers(fg_counts)
        fg_has_zero = fg_counts[checked_cols].isin([0]).values.any()
        if fg_has_zero:
            raise SortSeqError(
                '''There are some bases without any representation in\
                the foreground data, you should use pseudocounts to avoid failure \
                of the learning method''')
        bg_has_zero = bg_counts[checked_cols].isin([0]).values.any()
        if bg_has_zero:
            raise SortSeqError(
                '''There are some bases without any representation in\
                the background data, you should use pseudocounts to avoid failure \
                of the learning method''')

        # Turn counts into per-row frequencies on copies of the profiles.
        fg_freqs = fg_counts.copy()
        bg_freqs = bg_counts.copy()
        fg_row_totals = fg_freqs[binheaders].sum(axis=1)
        fg_freqs[binheaders] = fg_freqs[binheaders].div(fg_row_totals, axis=0)
        bg_row_totals = bg_freqs[binheaders].sum(axis=1)
        bg_freqs[binheaders] = bg_freqs[binheaders].div(bg_row_totals, axis=0)

        log_ratio = -np.log(fg_freqs / bg_freqs)

        # The model uses 'val_' column names where the profiles use 'ct_'.
        ct_to_val = {}
        for i in range(len(seq_dict)):
            char = str(inv_dict[i])
            ct_to_val['ct_' + char] = 'val_' + char
        return log_ratio.rename(columns=ct_to_val)
Ejemplo n.º 3
0
    def __init__(self,model_df):
        """
        Build the model from a model dataframe.

        A copy of model_df is validated (with fix=True) and must describe
        a 'MAT' model, otherwise SortSeqError is raised. The validated
        frame, the sequence type, its dictionaries, the number of rows and
        the transposed array of the value columns are stored on the
        instance.
        """
        validated_df = qc.validate_model(model_df.copy(),fix=True)
        seqtype, modeltype = qc.get_model_type(validated_df)
        if modeltype != 'MAT':
            raise SortSeqError('Invalid modeltype: %s'%modeltype)

        dictionaries = utils.choose_dict(seqtype,modeltype=modeltype)
        forward_dict, reverse_dict = dictionaries

        self.seqtype = seqtype
        self.seq_dict = forward_dict
        self.inv_dict = reverse_dict
        self.df = validated_df
        self.length = validated_df.shape[0]

        # Keep only the value columns, as an array transposed relative to
        # the dataframe layout.
        value_cols = qc.get_cols_from_df(validated_df,'vals')
        self.matrix = np.array(validated_df[value_cols]).T
Ejemplo n.º 4
0
    def __init__(self,
                 data_df,
                 model_df,
                 start=0,
                 end=None,
                 err=False,
                 coarse_graining_level=0,
                 rsquared=False,
                 return_freg=False):
        """Estimate the mutual information between a model and a dataset.

        The sequences are optionally cut to [start:end], the model is
        evaluated on every sequence, the rows are sorted by model value and
        the mutual information is estimated with
        EstimateMutualInfoforMImax.alt4. The results are stored in
        self.out_MI and self.out_std.

        Args:
            data_df: dataset dataframe with exactly one sequence column.
                NOTE: self.data_df initially refers to this same object,
                and the slicing and 'ct' column assignments below are
                applied to it, so the caller's frame may be modified.
            model_df: model dataframe (needs 'pos' and 'val_*' columns).
            start: first sequence position to keep.
            end: position at which to stop (None keeps everything after
                start).
            err: if true, estimate a standard deviation from 15 random
                half-size subsamples; otherwise out_std is NaN.
            coarse_graining_level: forwarded to the estimator.
            rsquared: if true, store 1 - 2**(-2 * x) of both results
                instead of the raw values.
            return_freg: forwarded to the estimator, which then also
                returns a second value that is not stored here.

        Raises:
            SortSeqError: if there is not exactly one sequence column, or
                if the cut sequences do not match the model length.
        """
        self.data_df = data_df
        self.model_df = model_df
        self.start = start
        self.end = end
        self.err = err
        self.coarse_graining_level = coarse_graining_level

        self.out_MI = None
        self.out_std = None

        self._input_checks()

        dicttype, modeltype = qc.get_model_type(self.model_df)
        seq_cols = qc.get_cols_from_df(self.data_df, 'seqs')
        if len(seq_cols) != 1:
            raise SortSeqError('Dataframe has multiple seq cols: %s' %
                               str(seq_cols))
        seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
        # set name of sequences column based on type of sequence
        type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
        seq_col_name = type_name_dict[dicttype]
        # Cut the sequences based on start and end, and then check that the
        # first sequence has the length the model expects.
        if (self.start != 0 or self.end):
            self.data_df.loc[:, seq_col_name] = \
                self.data_df.loc[:, seq_col_name].str.slice(self.start,
                                                            self.end)
            if modeltype == 'MAT':
                if len(self.data_df.loc[0, seq_col_name]) != len(
                        self.model_df.loc[:, 'pos']):
                    print('predictive info class: BP lengths: ',
                          len(self.data_df.loc[0, seq_col_name]), " ",
                          len(self.model_df.loc[:, 'pos']))
                    raise SortSeqError(
                        'model length does not match dataset length')
            elif modeltype == 'NBR':
                # a neighbor model has one row fewer than the sequence length
                if len(self.data_df.loc[0, seq_col_name]) != len(
                        self.model_df.loc[:, 'pos']) + 1:
                    raise SortSeqError(
                        'model length does not match dataset length')
        col_headers = utils.get_column_headers(self.data_df)
        if 'ct' not in self.data_df.columns:
            # total counts per sequence; sequences without counts are dropped
            self.data_df['ct'] = data_df[col_headers].sum(axis=1)
            self.data_df = self.data_df[self.data_df.ct != 0]
        if self.end:
            # Only an explicit end fixes the expected length. The former
            # no-end branch computed an unused length through a label
            # lookup of row 0, which could fail once row 0 was dropped.
            seqL = self.end - self.start
            self.data_df = self.data_df[self.data_df[seq_col_name].apply(len)
                                        == (seqL)]
        # columns holding the model values, in dictionary order
        model_df_headers = [
            'val_' + str(inv_dict[i]) for i in range(len(seq_dict))
        ]
        # now we evaluate the expression of each sequence according to the model.
        seq_mat, wtrow = numerics.dataset2mutarray(self.data_df.copy(),
                                                   modeltype)
        temp_df = self.data_df.copy()
        temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
            np.array(self.model_df[model_df_headers]), seq_mat, wtrow)
        # the estimator receives the rows ordered by model value
        temp_sorted = temp_df.sort_values(by='val')
        temp_sorted.reset_index(inplace=True, drop=True)
        if return_freg:
            MI, freg = EstimateMutualInfoforMImax.alt4(
                temp_sorted,
                coarse_graining_level=coarse_graining_level,
                return_freg=return_freg)
        else:
            MI = EstimateMutualInfoforMImax.alt4(
                temp_sorted,
                coarse_graining_level=coarse_graining_level,
                return_freg=return_freg)
        if not self.err:
            # np.nan works on every NumPy version; the np.NaN alias was
            # removed in NumPy 2.0.
            Std = np.nan
        else:
            data_df_for_sub = self.data_df.copy()
            sub_MI = np.zeros(15)
            for i in range(15):
                sub_df = data_df_for_sub.sample(
                    int(len(data_df_for_sub.index) / 2))
                sub_df.reset_index(inplace=True, drop=True)
                # The constructor returns an instance, not an (MI, Std)
                # pair, so the estimate is read from its out_MI attribute.
                sub_MI[i] = PredictiveInfo(sub_df, model_df,
                                           err=False).out_MI
            Std = np.std(sub_MI) / np.sqrt(2)
        if rsquared:
            self.out_MI, self.out_std = (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
        else:
            self.out_MI, self.out_std = MI, Std
Ejemplo n.º 5
0
    def __init__(self, model_df, contig_list, numsites=10, verbose=False):
        """Scan contigs with a model and keep the highest-valued sites.

        Every window of model length in every contig is evaluated with the
        model (and, for DNA, the reverse-complement sites as well). After
        each contig the running list is sorted by value in descending order
        and cropped to numsites rows. The validated result is stored in
        self.sitelist_df.

        Args:
            model_df: model dataframe of type 'MAT' or 'NBR'.
            contig_list: iterable of (contig_str, contig_name, pos_offset)
                tuples.
            numsites: maximum number of sites to keep.
            verbose: if true, write one progress dot per scanned contig.

        Raises:
            SortSeqError: if a contig contains a character outside the
                alphabet of the model's sequence type, if the model type
                is not 'MAT' or 'NBR', or if no contig is at least as long
                as the model.
        """
        self.sitelist_df = None
        # Determine type of string from model
        qc.validate_model(model_df)
        seqtype, modeltype = qc.get_model_type(model_df)
        # Result unused here; the call is kept from the original code
        # (presumably it rejects unsupported type combinations — confirm).
        utils.choose_dict(seqtype, modeltype=modeltype)

        # Check that all characters are from the correct alphabet
        alphabet = qc.seqtype_to_alphabet_dict[seqtype]
        invalid_char = re.compile(r"[^%s]" % alphabet)
        for contig_str, contig_name, pos_offset in contig_list:
            if invalid_char.search(contig_str):
                raise SortSeqError(
                    'Invalid character for seqtype %s found in %s.' %
                    (seqtype, contig_name))

        # Create model object to evaluate on seqs
        if modeltype == 'MAT':
            model_obj = Models.LinearModel(model_df)
        elif modeltype == 'NBR':
            model_obj = Models.NeighborModel(model_df)
        else:
            # Without this branch model_obj stayed unbound and the scan
            # failed later with an unrelated NameError.
            raise SortSeqError('Unrecognized model type: %s' % modeltype)

        # Accumulate sites contig by contig
        seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
        site_cols = ['val', seq_col, 'left', 'right', 'ori', 'contig']
        L = model_obj.length
        sitelist_df = pd.DataFrame(columns=site_cols)
        for contig_str, contig_name, pos_offset in contig_list:
            # contigs shorter than the model hold no full-length site
            if len(contig_str) < L:
                continue
            this_df = pd.DataFrame(columns=site_cols)
            num_sites = len(contig_str) - L + 1
            poss = np.arange(num_sites).astype(int)
            this_df['left'] = poss + pos_offset
            this_df['right'] = poss + pos_offset + L - 1
            this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
            this_df['ori'] = '+'
            this_df['contig'] = contig_name
            this_df['val'] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

            # If scanning DNA, scan reverse-complement as well
            if seqtype == 'dna':
                this_df[seq_col] = fast.seq2sitelist(contig_str, L,
                                                     rc=True)  # Cython
                this_df['ori'] = '-'
                this_df['val'] = model_obj.evaluate(this_df[seq_col])
                sitelist_df = pd.concat([sitelist_df, this_df],
                                        ignore_index=True)

            # Sort by value and reindex
            sitelist_df.sort_values(by='val', ascending=False, inplace=True)
            sitelist_df.reset_index(drop=True, inplace=True)

            # Crop list at numsites so the running list stays small
            if sitelist_df.shape[0] > numsites:
                sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

            if verbose:
                # One dot per contig; the old print('.', flush()) call
                # printed ". None" on a line of its own instead.
                sys.stdout.write('.')
                sys.stdout.flush()

        if verbose:
            print('')
            sys.stdout.flush()

        # If no sites were found, raise error
        if sitelist_df.shape[0] == 0:
            raise SortSeqError(
                'No full-length sites found within provided contigs.')

        sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
        self.sitelist_df = sitelist_df
Ejemplo n.º 6
0
    def Markov(self, df, dicttype, foreground=1, background=0, pseudocounts=1):
        '''Learn a neighbor model from foreground and background counts.

        Single-character and neighbor-pair counts are profiled per position
        for both bins (usually bin 1 as foreground and bin 0 as background),
        pseudocounts are added, and each row is normalized to frequencies.
        self.compute_etas_for_markov turns the frequencies of each bin into
        eta values; the model is the foreground etas minus the background
        etas, returned as a dataframe with 'val_<pair>' columns plus 'pos'.

        Raises SortSeqError if either bin column is missing from df or if a
        neighbor count is still zero after the pseudocounts were added.'''
        seq_dict, inv_dict = utils.choose_dict(dicttype)
        seq_dict_length = len(seq_dict)

        # Both requested bins must be present as 'ct_<bin>' columns.
        fg_col = 'ct_' + str(foreground)
        bg_col = 'ct_' + str(background)
        if not {fg_col, bg_col}.issubset(set(df.columns)):
            raise SortSeqError(
                'Foreground or Background column does not exist!')

        # Single-character counts at each position.
        fg_single = utils.profile_counts(df, dicttype, bin_k=foreground)
        bg_single = utils.profile_counts(df, dicttype, bin_k=background)
        single_cols = utils.get_column_headers(fg_single)

        # Neighbor-pair counts at each position.
        fg_pair = utils.profile_counts_neighbor(
            df, dicttype, bin_k=foreground)
        bg_pair = utils.profile_counts_neighbor(
            df, dicttype, bin_k=background)
        pair_cols = utils.get_column_headers(fg_pair)

        # Each pair gets one pseudocount ...
        for pair_counts in (fg_pair, bg_pair):
            pair_counts[pair_cols] = pair_counts[pair_cols] + pseudocounts
        # ... and each single character gets one per possible partner.
        single_pseudo = pseudocounts * seq_dict_length
        for single_counts in (fg_single, bg_single):
            single_counts[single_cols] = \
                single_counts[single_cols] + single_pseudo

        # No pair count may still be zero at this point.
        checked_cols = utils.get_column_headers(fg_pair)
        fg_has_zero = fg_pair[checked_cols].isin([0]).values.any()
        if fg_has_zero:
            raise SortSeqError(
                '''There are some bases without any representation in\
                the foreground data, you should use pseudocounts to avoid failure \
                of the learning method''')
        bg_has_zero = bg_pair[checked_cols].isin([0]).values.any()
        if bg_has_zero:
            raise SortSeqError(
                '''There are some bases without any representation in\
                the background data, you should use pseudocounts to avoid failure \
                of the learning method''')

        def as_frequencies(counts, cols):
            # Copy of counts in which every row of cols sums to one.
            freqs = counts.copy()
            freqs[cols] = freqs[cols].div(freqs[cols].sum(axis=1), axis=0)
            return freqs

        fg_pair_freqs = as_frequencies(fg_pair, pair_cols)
        bg_pair_freqs = as_frequencies(bg_pair, pair_cols)
        print(fg_pair_freqs)
        fg_single_freqs = as_frequencies(fg_single, single_cols)
        bg_single_freqs = as_frequencies(bg_single, single_cols)

        # NOTE(review): self is passed explicitly on top of the bound call;
        # confirm compute_etas_for_markov's signature expects that.
        eta_fg = self.compute_etas_for_markov(self, fg_pair_freqs,
                                              fg_single_freqs, seq_dict,
                                              inv_dict)
        eta_bg = self.compute_etas_for_markov(self, bg_pair_freqs,
                                              bg_single_freqs, seq_dict,
                                              inv_dict)

        # The model is the difference of the two sets of etas.
        model_df = pd.DataFrame(eta_fg - eta_bg)

        # One value column per ordered pair of characters.
        pair_labels = []
        for q in range(seq_dict_length):
            for m in range(seq_dict_length):
                pair_labels.append('val_' + inv_dict[q] + inv_dict[m])
        model_df.columns = pair_labels
        model_df['pos'] = fg_pair['pos']

        return model_df
Ejemplo n.º 7
0
    def convex_opt(self,
                   df,
                   seq_dict,
                   inv_dict,
                   columns,
                   tm=None,
                   modeltype='MAT',
                   dicttype='dna'):
        """Fit model parameters with self.convex_opt_agorithm.

        The dataset is converted to a mutation array, the parameter columns
        to fit are selected, the optimization is run on the 'ct_0' counts
        versus the remaining count columns, and the result is mapped back
        through self.reverse_parameterization.

        Args:
            df: dataset dataframe; must contain 'ct_0' and the columns
                named in `columns`.
            seq_dict: forwarded to self.reverse_parameterization.
            inv_dict: not used in this method.
            columns: count column names; 'ct_0' is removed before use.
            tm: optional values passed to the optimizer. When falsy, the
                1 x len(columns) matrix [1, 2, ...] is used (columns
                counted after removing 'ct_0').
            modeltype: 'MAT' or 'NBR'; 'NBR' triggers the extra column
                filtering below.
            dicttype: dictionary type used to build single mutants for
                'NBR' models.

        Returns:
            The output of self.reverse_parameterization.

        Side effects:
            Prints tm, the result and the result's shape to stdout.
        """
        # number of rows handed to the helpers for the wild type calculation
        rowsforwtcalc = 1000
        seq_mat, wtrow = numerics.dataset2mutarray(df.copy(),
                                                   modeltype,
                                                   rowsforwtcalc=rowsforwtcalc)
        # need to make sure there is at least one representative
        # of each possible entry, otherwise don't fit it.
        # no_reps holds, per column of seq_mat, the ct_0-weighted column sum;
        # only columns where it is nonzero are kept.
        no_reps = np.sum(np.matrix(df['ct_0']) * seq_mat, axis=0)
        cols_for_keep = [x for x in range( \
            seq_mat.shape[1]) if x in np.nonzero(no_reps)[1]]
        # if the model is a neighbor model we also need to
        # make sure we only give each mutation one parameter.
        if modeltype == 'NBR':
            # wild type sequence taken from the 'wt' column of the
            # mutation profile
            mut_df = ProfileMut(df.loc[:rowsforwtcalc, :]).mut_df
            wtseq = ''.join(list(mut_df['wt']))

            single_seq_dict, single_inv_dict = utils.choose_dict(
                dicttype, modeltype='MAT')
            seqs = []
            # now make each possible single mutation...
            # (the first and last positions are not mutated)
            # NOTE(review): the 3 mutations and the modulus 4 assume a
            # four-letter alphabet — confirm for non-DNA/RNA dicttypes.
            for i, let in enumerate(wtseq[1:-1]):
                for m in range(1, 4):
                    let_for_mutation = single_seq_dict[let]
                    let_for_mutation = single_inv_dict[np.mod(
                        let_for_mutation + m, 4)]
                    mut_seq = list(wtseq)
                    mut_seq[i + 1] = let_for_mutation
                    seqs.append(''.join(mut_seq))
            # now that we have each mutation, we should find
            # what their matrix representation is...
            seqs_df = pd.DataFrame()
            seqs_df['seq'] = seqs
            seq_mat_mutants, wtrow2 = \
                numerics.dataset2mutarray_withwtseq(seqs_df, modeltype, wtseq)
            # these mutants will have 2 entries which indicate
            # that a single mutation away from wt hits 2 parameters
            # which doesn't make sense, so we should fix the second one to zero...
            bad_cols = np.apply_along_axis(self.find_second_NBR_matrix_entry, \
                                           1, seq_mat_mutants.todense())
            cols_for_keep = [cols_for_keep[x] for x in range(len(cols_for_keep)) \
                             if cols_for_keep[x] not in bad_cols]
        # convert to CSC and keep only the selected columns
        seq_mat = seq_mat.tocsc()
        seq_mat2 = seq_mat[:, cols_for_keep]
        # N0: 'ct_0' counts as a column vector; Nsm: the other count columns
        columns = [x for x in columns if 'ct_0' != x]
        N0 = np.matrix(df['ct_0']).T
        Nsm = np.matrix(df[columns])
        # NOTE(review): a flat list for tm becomes a 1-D array here, and
        # tm.shape[1] below would then raise IndexError; `if tm:` is also
        # ambiguous for multi-element arrays — confirm what callers pass.
        if tm:
            tm = np.array(tm)
        else:
            tm = np.matrix([x for x in range(1, len(columns) + 1)])
        print(tm)
        output = self.convex_opt_agorithm(seq_mat2, N0, Nsm, tm)
        # map the fitted values for the kept columns back to the full
        # parameterization
        output_parameterized = self.reverse_parameterization(
            output,
            cols_for_keep,
            wtrow,
            seq_dict,
            bins=tm.shape[1],
            modeltype=modeltype)
        print(output_parameterized)
        print(output_parameterized.shape)
        return (output_parameterized)
Ejemplo n.º 8
0
    def __init__(self,
                 df,
                 lm='ER',
                 modeltype='MAT',
                 LS_means_std=None,
                 db=None,
                 iteration=30000,
                 burnin=1000,
                 thin=10,
                 runnum=0,
                 initialize='LS',
                 start=0,
                 end=None,
                 foreground=1,
                 background=0,
                 alpha=0.0,
                 pseudocounts=1,
                 drop_library=False,
                 verbose=False,
                 tm=None):

        # set attributes
        self.df = df
        self.lm = lm
        self.modeltype = modeltype
        self.LS_means_std = LS_means_std
        self.db = db
        self.iteration = iteration
        self.burnin = burnin
        self.thin = thin
        self.runnum = runnum
        self.initialize = initialize
        self.start = start
        self.end = end
        self.foreground = foreground
        self.background = background
        self.alpha = alpha
        self.pseudocounts = pseudocounts
        self.drop_library = drop_library
        self.verbose = verbose
        self.tm = tm

        # output df
        self.output_df = None

        # validate parameters
        self._input_checks()

        # Determine dictionary
        seq_cols = qc.get_cols_from_df(df, 'seqs')
        if not len(seq_cols) == 1:
            raise SortSeqError('Dataframe has multiple seq cols: %s' %
                               str(seq_cols))
        dicttype = qc.colname_to_seqtype_dict[seq_cols[0]]

        seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)
        '''Check to make sure the chosen dictionary type correctly describes
             the sequences. An issue with this test is that if you have DNA sequence
             but choose a protein dictionary, you will still pass this test bc A,C,
             G,T are also valid amino acids'''
        # set name of sequences column based on type of sequence
        type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
        seq_col_name = type_name_dict[dicttype]
        lin_seq_dict, lin_inv_dict = utils.choose_dict(dicttype,
                                                       modeltype='MAT')
        # wtseq = utils.profile_counts(df.copy(),dicttype,return_wtseq=True,start=start,end=end)
        # wt_seq_dict_list = [{inv_dict[np.mod(i+1+seq_dict[w],len(seq_dict))]:i for i in range(len(seq_dict)-1)} for w in wtseq]
        par_seq_dict = {
            v: k
            for v, k in seq_dict.items() if k != (len(seq_dict) - 1)
        }
        # drop any rows with ct = 0
        df = df[df.loc[:, 'ct'] != 0]
        df.reset_index(drop=True, inplace=True)

        # If there are sequences of different lengths, then print error but continue
        if len(set(df[seq_col_name].apply(len))) > 1:
            sys.stderr.write('Lengths of all sequences are not the same!')
        # select target sequence region
        df.loc[:, seq_col_name] = df.loc[:, seq_col_name].str.slice(start, end)
        df = utils.collapse_further(df)
        col_headers = utils.get_column_headers(df)
        # make sure all counts are ints
        df[col_headers] = df[col_headers].astype(int)
        # create vector of column names
        val_cols = ['val_' + inv_dict[i] for i in range(len(seq_dict))]
        df.reset_index(inplace=True, drop=True)
        # Drop any sequences with incorrect length
        if not end:
            '''is no value for end of sequence was supplied, assume first seq is
                correct length'''
            seqL = len(df[seq_col_name][0]) - start
        else:
            seqL = end - start
        df = df[df[seq_col_name].apply(len) == (seqL)]
        df.reset_index(inplace=True, drop=True)
        # Do something different for each type of learning method (lm)
        if lm == 'ER':
            if modeltype == 'NBR':
                emat = self.Markov(df,
                                   dicttype,
                                   foreground=foreground,
                                   background=background,
                                   pseudocounts=pseudocounts)
            else:
                emat = self.Berg_von_Hippel(df,
                                            dicttype,
                                            foreground=foreground,
                                            background=background,
                                            pseudocounts=pseudocounts)

        if lm == 'PR':
            emat = self.convex_opt(df, seq_dict, inv_dict, col_headers, tm=tm, \
                                   dicttype=dicttype, modeltype=modeltype)
        if lm == 'LS':
            '''First check that is we don't have a penalty for ridge regression,
                that we at least have all possible base values so that the analysis
                will not fail'''
            if LS_means_std:  # If user supplied preset means and std for each bin
                means_std_df = io.load_meanstd(LS_means_std)

                # change bin number to 'ct_number' and then use as index
                labels = list(means_std_df['bin'].apply(self.add_label))
                std = means_std_df['std']
                std.index = labels
                # Change Weighting of each sequence by dividing counts by bin std
                df[labels] = df[labels].div(std)
                means = means_std_df['mean']
                means.index = labels
            else:
                means = None
            # drop all rows without counts
            df['ct'] = df[col_headers].sum(axis=1)
            df = df[df.ct != 0]
            df.reset_index(inplace=True, drop=True)
            ''' For sort-seq experiments, bin_0 is library only and isn't the lowest
                expression even though it is will be calculated as such if we proceed.
                Therefore is drop_library is passed, drop this column from analysis.'''
            if drop_library:
                try:
                    df.drop('ct_0', inplace=True)
                    col_headers = utils.get_column_headers(df)
                    if len(col_headers) < 2:
                        raise SortSeqError(
                            '''After dropping library there are no longer enough 
                            columns to run the analysis''')
                except:
                    raise SortSeqError(
                        '''drop_library option was passed, but no ct_0
                        column exists''')
            # parameterize sequences into 3xL vectors
            print('init learn model: \n')
            print(par_seq_dict)
            print('dict: ', dicttype)
            raveledmat, batch, sw = utils.genweightandmat(df,
                                                          par_seq_dict,
                                                          dicttype,
                                                          means=means,
                                                          modeltype=modeltype)
            # Use ridge regression to find matrix.
            emat = self.Compute_Least_Squares(raveledmat,
                                              batch,
                                              sw,
                                              alpha=alpha)

        if lm == 'IM':
            seq_mat, wtrow = numerics.dataset2mutarray(df.copy(), modeltype)
            # this is also an MCMC routine, do the same as above.
            if initialize == 'rand':
                if modeltype == 'MAT':
                    emat_0 = utils.RandEmat(len(df[seq_col_name][0]),
                                            len(seq_dict))
                elif modeltype == 'NBR':
                    emat_0 = utils.RandEmat(
                        len(df[seq_col_name][0]) - 1, len(seq_dict))
            elif initialize == 'LS':

                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_0_df = LearnModel(df.copy(),
                                       lm='LS',
                                       modeltype=modeltype,
                                       alpha=alpha,
                                       start=0,
                                       end=None,
                                       verbose=verbose).output_df
                emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
                # pymc doesn't take sparse mat
            elif initialize == 'PR':
                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_0_df = LearnModel(df.copy(),
                                       lm='PR',
                                       modeltype=modeltype,
                                       start=0,
                                       end=None).output_df
                emat_0 = np.transpose(np.array(emat_0_df[emat_cols]))
            emat = self.MaximizeMI_memsaver(seq_mat,
                                            df.copy(),
                                            emat_0,
                                            wtrow,
                                            db=db,
                                            iteration=iteration,
                                            burnin=burnin,
                                            thin=thin,
                                            runnum=runnum,
                                            verbose=verbose)

        # We have infered out matrix.
        # now format the energy matrices to get them ready to output
        if (lm == 'IM' or lm == 'memsaver'):
            if modeltype == 'NBR':
                try:
                    emat_typical = gauge.fix_neighbor(np.transpose(emat))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat)
            elif modeltype == 'MAT':
                try:
                    emat_typical = gauge.fix_matrix(np.transpose(emat))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat)

        elif lm == 'ER':
            '''the emat for this format is currently transposed compared to other formats
            it is also already a data frame with columns [pos,val_...]'''
            if modeltype == 'NBR':
                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_typical = emat[emat_cols]
            else:
                emat_cols = [
                    'val_' + inv_dict[i] for i in range(len(seq_dict))
                ]
                emat_typical = emat[emat_cols]
                try:
                    emat_typical = (gauge.fix_matrix((np.array(emat_typical))))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = emat_typical

        elif (lm == 'MK'):
            '''The model is a first order markov model and its gauge does not need
                to be changed.'''

        elif lm == 'PR':
            emat_typical = np.transpose(emat)
        else:  # must be Least squares
            emat_typical = utils.emat_typical_parameterization(
                emat, len(seq_dict))
            if modeltype == 'NBR':
                try:
                    emat_typical = gauge.fix_neighbor(
                        np.transpose(emat_typical))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat_typical)
            elif modeltype == 'MAT':
                try:
                    emat_typical = gauge.fix_matrix(np.transpose(emat_typical))
                except:
                    sys.stderr.write('Gauge Fixing Failed')
                    emat_typical = np.transpose(emat_typical)
        em = pd.DataFrame(emat_typical)
        em.columns = val_cols
        # add position column
        if modeltype == 'NBR':
            pos = pd.Series(range(start, start - 1 + len(df[seq_col_name][0])),
                            name='pos')
        else:
            pos = pd.Series(range(start, start + len(df[seq_col_name][0])),
                            name='pos')
        output_df = pd.concat([pos, em], axis=1)
        # Validate model and return
        output_df = qc.validate_model(output_df, fix=True)
        self.output_df = output_df
Ejemplo n.º 9
0
    def __init__(self,
                 wtseq="ACGACGA",
                 mutrate=0.10,
                 numseq=10000,
                 dicttype='dna',
                 probarr=None,
                 tags=False,
                 tag_length=10):
        """Simulate a library of sequences and store it in ``self.output_df``.

        Parameters
        ----------
        wtseq : str
            Wild-type sequence. It is upper-cased and mutated at random to
            build the library. Only used when ``probarr`` is not an ndarray.
        mutrate : float
            Per-position probability of drawing a non-wild-type character;
            it is split evenly among the ``len(seq_dict) - 1`` alternatives.
        numseq : int
            Number of sequences to generate.
        dicttype : str
            Alphabet selector handed to ``utils.choose_dict``; also selects
            the sequence column name via ``qc.seqtype_to_seqcolname_dict``.
        probarr : np.ndarray or None
            If an ndarray, column ``z`` is used as the probability vector
            over character indices at position ``z`` and ``wtseq``/``mutrate``
            are ignored for sequence generation.
        tags : bool
            If True, every generated sequence gets ``ct = 1`` and a unique
            random DNA tag. Otherwise identical sequences are collapsed and
            ``ct`` holds their multiplicity.
        tag_length : int
            Length of each simulated tag.

        The resulting frame is passed through ``qc.validate_dataset`` before
        being stored in ``self.output_df``.
        """
        # Keep the raw parameters on the instance; _input_check reads them.
        self.wtseq = wtseq
        self.mutrate = mutrate
        self.numseq = numseq
        self.dicttype = dicttype
        self.probarr = probarr
        self.tags = tags
        self.tag_length = tag_length
        # Populated at the end of the constructor.
        self.output_df = None

        # Validate inputs before doing any work.
        self._input_check()

        # Character <-> index lookup tables for the chosen alphabet.
        seq_dict, inv_dict = utils.choose_dict(dicttype)
        num_chars = len(seq_dict)

        if isinstance(probarr, np.ndarray):
            # Draw every position independently from its own column of
            # the supplied probability matrix.
            L = probarr.shape[1]
            letarr = np.zeros([numseq, L])
            for z in range(L):
                letarr[:, z] = np.random.choice(range(num_chars),
                                                numseq,
                                                p=probarr[:, z])
        else:
            wtseq = wtseq.upper()

            # Index representation of the wild-type sequence.
            wtarr = self.seq2arr(wtseq, seq_dict)
            mrate = mutrate / (num_chars - 1)  # prob of each non-wildtype char
            # Probabilities of cyclic offsets away from wild type: offset 0
            # keeps the wild-type character, offset k moves k steps (mod
            # alphabet size) through the alphabet ordering.
            parr = np.array([1 - (num_chars - 1) * mrate] +
                            [mrate for _ in range(num_chars - 1)])
            # Random offsets for every position of every sequence ...
            offsets = np.random.choice(range(num_chars),
                                       [numseq, len(wtseq)],
                                       p=parr)
            # ... applied to the wild-type indices, wrapping around.
            letarr = np.mod(offsets + wtarr, num_chars)

        # Convert index rows back to letter sequences.
        seqs = [self.arr2seq(letarr[i, :], inv_dict) for i in range(numseq)]

        seq_col = qc.seqtype_to_seqcolname_dict[dicttype]
        seqs_df = pd.DataFrame(seqs, columns=[seq_col])

        # If simulating tags, each generated seq gets a unique tag
        if tags:
            tag_seq_dict, _ = utils.choose_dict('dna')
            # Materialise the keys: on Python 3 ``dict.keys()`` is a view,
            # which numpy's ``choice`` rejects as a population.
            tag_alphabet_list = list(tag_seq_dict)

            check(
                len(tag_alphabet_list)**tag_length > 2 * numseq,
                'tag_length=%d is too short for num_tags_needed=%d' %
                (tag_length, numseq))

            # Keep drawing until there are numseq distinct tags; collisions
            # are absorbed by the set and topped up on the next pass.
            tag_set = set()
            while len(tag_set) < numseq:
                num_tags_left = numseq - len(tag_set)
                tag_set.update(
                    ''.join(choice(tag_alphabet_list, size=tag_length))
                    for _ in range(num_tags_left))

            df = seqs_df.copy()
            df.loc[:, 'ct'] = 1
            df.loc[:, 'tag'] = list(tag_set)

        # If not simulating tags, list only unique seqs w/ corresponding counts
        else:
            seqs_counts = seqs_df[seq_col].value_counts()
            df = seqs_counts.reset_index()
            df.columns = [seq_col, 'ct']

        # Convert into valid dataset dataframe and store
        self.output_df = qc.validate_dataset(df, fix=True)
Ejemplo n.º 10
0
    def _input_check(self):
        """
        Check all parameter values stored on ``self`` for correctness.

        Every condition is handed to ``check`` together with an error
        message (``check`` is defined elsewhere; presumably it raises when
        the condition is False -- confirm against its definition). Two
        suspicious-but-legal situations only print a warning: more than four
        distinct characters in ``wtseq`` for a non-protein ``dicttype``, and
        a ``'U'`` in ``wtseq`` when ``dicttype`` is not ``'rna'``.

        ``dicttype`` is validated first because the ``wtseq`` alphabet check
        passes it to ``utils.choose_dict``.
        """

        ###########################
        #  dicttype input checks  #
        ###########################

        # check if dicttype is of type string
        check(isinstance(self.dicttype, str),
              'type(dicttype) = %s; must be a string ' % type(self.dicttype))

        # check if len(dicttype) > 0
        check(
            len(self.dicttype) > 0,
            " length of dicttype must be greater than 0, length(dicttype): %d"
            % len(self.dicttype))

        ########################
        #  wtseq input checks  #
        ########################

        # check if wtseq is of type string
        check(isinstance(self.wtseq, str),
              'type(wtseq) = %s; must be a string ' % type(self.wtseq))

        # check if empty wtseq is passed
        check(len(self.wtseq) > 0, "wtseq length cannot be 0")

        # Check to ensure the wtseq uses the correct bases according to dicttype

        # distinct characters present in wtseq
        unique_bases = set(self.wtseq)

        # if more than 4 unique bases detected and dicttype is not protein
        if len(unique_bases) > 4 and self.dicttype != 'protein':
            print(
                ' Warning, more than 4 unique bases detected for dicttype %s did you mean to enter protein for dicttype? '
                % self.dicttype)

        # if 'U' base detected and dicttype is not 'rna'
        if 'U' in unique_bases and self.dicttype != 'rna':
            print(
                ' Warning, U bases detected for dicttype %s did you mean to enter rna for dicttype? '
                % self.dicttype)

        # every character of wtseq must be a key of the chosen alphabet
        lin_seq_dict, lin_inv_dict = utils.choose_dict(self.dicttype,
                                                       modeltype='MAT')
        check(
            unique_bases.issubset(lin_seq_dict),
            'wtseq can only contain bases in ' + str(lin_seq_dict.keys()))

        ##########################
        #  mutrate input checks  #
        ##########################

        # check if mutrate is of type float
        check(isinstance(self.mutrate, float),
              'type(mutrate) = %s; must be a float ' % type(self.mutrate))

        # ensure mutrate is in the correct range. The message is formatted
        # with %s: %d would truncate the float (1.5 -> 1) and raise on
        # nan/inf before check() is even called.
        check(
            self.mutrate > 0 and self.mutrate <= 1,
            'mutrate = %s; must be 0 < mutrate <= 1.' % self.mutrate)

        #########################
        #  numseq input checks  #
        #########################

        # check if numseq is valid
        check(isinstance(self.numseq, int),
              'type(numseq) = %s; must be a int ' % type(self.numseq))

        # check if numseq is positive
        check(self.numseq > 0,
              'numseq = %d must be a positive int ' % self.numseq)

        ###########################
        #  probarr input checks   #
        ###########################

        # probarr is optional; when given it must be an ndarray
        if self.probarr is not None:
            check(
                isinstance(self.probarr, np.ndarray),
                'type(probarr) = %s; must be an np.ndarray ' %
                type(self.probarr))

        #######################
        #  tags input checks  #
        #######################

        # *** NOTE ***: an additional check is made on tags in the constructor if tags = True

        # check if tags is of type bool.
        check(isinstance(self.tags, bool),
              'type(tags) = %s; must be an boolean ' % type(self.tags))

        #############################
        #  tag_length input checks  #
        #############################

        # check if tag_length is of type int
        check(isinstance(self.tag_length, int),
              'type(tag_length) = %s; must be an int ' % type(self.tag_length))

        # check if tag_length is positive
        check(self.tag_length > 0,
              'tag_length = %d must be a positive int ' % self.tag_length)


# /usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/