Example #1
0
def _validate_pos_cols(df, fix=False):
    """
    Validates the pos column in a given dataframe (if it exists).

    Positions must be integer-valued, consecutive, ascending and start at a
    nonnegative value.

    Arguments:
        df (pd.DataFrame): Dataframe to check. Returned untouched if it has
            no 'pos' column.
        fix (bool): If True, integer-valued positions stored in a non-integer
            dtype are converted to int (the column of df is overwritten).
            If False, that situation raises an error instead.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if positions cannot be converted to numbers, are not
            integer-valued, need a dtype conversion while fix=False, are not
            consecutive, or start below zero.
    """
    col = 'pos'
    if col not in df.columns:
        return df

    raw_vals = df[col].values
    try:
        int_vals = raw_vals.astype(int)
        float_vals = raw_vals.astype(float)
    except Exception:
        raise SortSeqError(
            'Cannot convert values in column %s to numbers.'%col)

    # Accept every integer dtype (int32, int64, unsigned, ...). Comparing
    # against the builtin int only matches the platform default width and
    # would force fix=True on perfectly valid integer columns.
    if raw_vals.dtype.kind not in 'iu':
        if not all(int_vals == float_vals):
            raise SortSeqError(
                'Positions cannot be interpreted as integers.')
        if not fix:
            raise SortSeqError(
                'Positions are not integers; set fix=True to fix.')
        df[col] = df[col].astype(int)

    # Positions must run first, first+1, ..., last without gaps
    first = df[col].iloc[0]
    last = df[col].iloc[-1]
    if not np.array_equal(df[col].values, np.arange(first, last + 1)):
        raise SortSeqError('Positions are not consecutive integers.')

    if first < 0:
        raise SortSeqError('Positions are not all nonnegative.')

    return df
Example #2
0
def _validate_mut_cols(df, fix=False):
    """
    Validates contents of the columns selected by get_cols_from_df(df, 'mut').

    Each such column must hold floats in the closed interval [0.0, 1.0].

    Arguments:
        df (pd.DataFrame): Dataframe to check.
        fix (bool): If True, columns that are not of float dtype but can be
            converted are overwritten with their float version. If False,
            a non-float column raises an error.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if a column holds non-numbers, is not float while
            fix=False, or has values outside [0.0, 1.0].
    """
    for col in get_cols_from_df(df, 'mut'):

        # Verify that freqs are floats
        if df[col].values.dtype != float:

            # Check whether freqs can be interpreted as floats.
            # Exception (not a bare except) so Ctrl-C still propagates.
            try:
                float_vals = df[col].astype(float)
            except Exception:
                raise SortSeqError('Non-numbers found in freqs.')

            # Check whether we have permission to change these to floats
            if not fix:
                raise SortSeqError(
                    'Freqs are not floats; set fix=True to fix.')
            df[col] = float_vals

        # Make sure that all mut values are between 0 and 1
        in_range = (df[col] >= 0.0) & (df[col] <= 1.0)
        if not in_range.all():
            raise SortSeqError(
                'Freq values outside [0.0, 1.0] encountered.')

    return df
Example #3
0
def validate_file_for_reading(file_arg):
    """ Checks that a specified file exists and is readable. Returns a valid file handle given a file name or handle

    Arguments:
        file_arg: Either a file name (str) or an already-open file-like
            object (anything exposing a read attribute).

    Returns:
        A file handle: a newly opened handle (mode 'r') when a name was
        given, otherwise file_arg itself.

    Raises:
        SortSeqError: if the named file is missing or unreadable, if the
            handle is already closed, or if file_arg is neither a name nor
            a handle.
    """
    # If user passed file name
    if isinstance(file_arg, str):

        # Verify that file exists
        if not os.path.isfile(file_arg):
            raise SortSeqError('Cannot find file: %s'%file_arg)

        # Verify that file can be read
        if not os.access(file_arg,os.R_OK):
            raise SortSeqError('Can find but cannot read from file: %s'%file_arg)

        # Get handle to file
        return open(file_arg,'r')

    # If user passed file object. The `file` builtin only exists in Python 2,
    # so handles are recognised by duck typing instead of by exact type.
    if hasattr(file_arg, 'read'):

        # Verify that file isn't closed
        if getattr(file_arg, 'closed', False):
            raise SortSeqError('File object is already closed.')
        return file_arg

    # Otherwise, throw error
    raise SortSeqError('file_arg is neigher a name or handle.')
Example #4
0
def load(file_arg, file_type, **kwargs):
    """ Loads a file with load_text and validates it as the given file_type.

    Arguments:
        file_arg: Passed straight to load_text.
        file_type (str): One of 'model', 'filelist', 'tagkey', 'profile_ct',
            'profile_freq', 'profile_mut', 'profile_info', 'meanstd',
            'sitelist'. Any file_type containing 'dataset' is rejected.
        **kwargs: Forwarded to the validation function.

    Returns:
        Whatever the matching qc validation function returns when called
        with the loaded dataframe and fix=True.

    Raises:
        SortSeqError: if file_type contains 'dataset' or is not recognized.
    """
    # 'dataset' is deliberately missing: qc.validate_dataset won't work
    # right now.
    validators = dict(
        model=qc.validate_model,
        filelist=qc.validate_filelist,
        tagkey=qc.validate_tagkey,
        profile_ct=qc.validate_profile_ct,
        profile_freq=qc.validate_profile_freq,
        profile_mut=qc.validate_profile_mut,
        profile_info=qc.validate_profile_info,
        meanstd=qc.validate_meanstd,
        sitelist=qc.validate_sitelist,
    )

    loaded = load_text(file_arg)

    if 'dataset' in file_type:
        raise SortSeqError('file_type %s is not supported in load()'%file_type)

    if file_type not in validators:
        raise SortSeqError('Unrecognized file_type %s'%file_type)

    validate = validators[file_type]
    return validate(loaded, fix=True, **kwargs)
Example #5
0
def _validate_freq_cols(df, fix=False, tol=1E-2):
    """
    Validates contents of freq_* columns in a given dataframe

    Each column selected by get_cols_from_df(df, 'freq_') must hold floats
    in [0.0, 1.0], and the selected columns must sum to 1.0 +- tol along
    every row.

    Arguments:
        df (pd.DataFrame): Dataframe to check.
        fix (bool): If True, non-float columns that can be converted are
            overwritten with their float version. If False, a non-float
            column raises an error.
        tol (float): Allowed absolute deviation of each row sum from 1.0.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if values cannot be interpreted as floats, are not
            floats while fix=False, fall outside [0.0, 1.0], or rows do
            not sum to 1.0 within tol.
    """
    freq_cols = get_cols_from_df(df, 'freq_')
    for col in freq_cols:

        # Check if columns are floats
        if df[col].values.dtype != float:

            # Check whether values can be interpreted as floats. The result
            # is kept in a local; writing it to df.loc[col] would append a
            # new ROW labelled with the column name.
            try:
                float_vals = df[col].astype(float)
            except Exception:
                raise SortSeqError(
                    'Cannot interpret values in %s as floats.'%col)

            # Check whether we have permission to change these to floats
            if not fix:
                raise SortSeqError(
                    'Values in %s not floats; set fix=True to fix.'%col)
            df[col] = float_vals

        # Make sure that all freqs are between 0 and 1
        in_range = (df[col] >= 0.0) & (df[col] <= 1.0)
        if not in_range.all():
            raise SortSeqError('Freq values outside [0.0, 1.0] encountered.')

    # If there are freq cols, sum along each row has to be 1.0 +- tol
    if freq_cols:
        row_sums = df[freq_cols].sum(axis=1).values
        if not all((row_sums <= 1.0 + tol) & (row_sums >= 1.0 - tol)):
            raise SortSeqError('Not all rows sum to 1.0 +- %f' % tol)

    return df
Example #6
0
def _validate_std_cols(df, fix=False):
    """
    Validates contents of the 'std' column in a given dataframe (if it exists)

    Values must be finite, nonnegative floats.

    Arguments:
        df (pd.DataFrame): Dataframe to check. Returned untouched if it has
            no 'std' column.
        fix (bool): If True, a non-float column that can be converted is
            overwritten with its float version. If False, a non-float
            column raises an error.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if values are non-numbers, are not floats while
            fix=False, are not finite, or are negative.
    """
    col = 'std'
    if col not in df.columns:
        return df

    # Verify that values are floats
    if df[col].values.dtype != float:

        # Check whether values can be interpreted as floats.
        # Exception (not a bare except) so Ctrl-C still propagates.
        try:
            float_vals = df[col].astype(float)
        except Exception:
            raise SortSeqError('Non-numbers found in %s.' % col)

        # Check whether we have permission to change these to floats
        if not fix:
            raise SortSeqError(
                'std values are not floats; set fix=True to fix.')
        df[col] = float_vals

    # Make sure that all std values are finite
    if not all(np.isfinite(df[col])):
        raise SortSeqError('Nonfinite std values encountered.')

    # Make sure that all std values are nonnegative
    if any(df[col] < 0.0):
        raise SortSeqError('Negative std values encountered.')

    return df
Example #7
0
def validate_dataset(df, fix=False):
    """
    Validates the form of a dataset dataframe, which is expected to look something like this:

    ct      ct_0    ct_1    ct_2    val     tag     seq
    3       1       2       0       0.012   CTG     ACCAT
    2       2       0       0      -4.52    CTA     ACCAT
    1       0       0       1       0.000   CCA     TCAGG

    Checks performed here:
    0. The dataframe must have at least one row.
    1. Every column name must be of type 'seqs', 'cts', 'tag' or 'val' (as decided by is_col_type).
    2. Column contents are checked by _validate_cols.
    3. Columns must be ordered as: count columns, value columns, tag columns, sequence columns.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if possible. It is forwarded to _validate_cols and also permits reordering the columns.

    Returns:
        df (pd.DataFrame): the validated dataframe; its columns are reordered when fix=True and the order was wrong.

    Function:
        Raises a SortSeqError if the dataframe has no rows, contains a column of an unexpected type, or has columns in the wrong order while fix=False.
    """

    # A dataset needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError('Dataframe must contain at least one row')

    # Every column name has to belong to one of the accepted types
    accepted_types = ['seqs', 'cts', 'tag', 'val']
    for name in df.columns:
        if not is_col_type(name, accepted_types):
            raise SortSeqError('Invalid column in dataframe: %s' % name)

    # Check (and possibly repair) what is stored inside the columns
    df = _validate_cols(df, fix=fix)

    # Work out the canonical column order: counts, values, tags, sequences
    count_names = get_cols_from_df(df, 'cts')
    tag_names = get_cols_from_df(df, 'tag')
    seq_names = get_cols_from_df(df, 'seqs')
    value_names = get_cols_from_df(df, 'val')
    canonical = count_names + value_names + tag_names + seq_names

    if all(df.columns == canonical):
        return df

    if not fix:
        raise SortSeqError(
            'Dataframe columns are in the wrong order; set fix=True to fix.'
        )

    return df[canonical]
Example #8
0
def validate_model(df, fix=False):
    """
    Validates the form of a model dataframe, which is expected to look something like this:

    pos     val_A   val_C   val_G   val_T
    3       1.1     4.3     -6.19   5.2
    4       0.01    3.40    -10.5   5.3
    5       0       1.4     10.9    231.0

    Checks performed here:
    0. The dataframe must have at least one row.
    1. Every column name must be of type 'pos' or 'vals' (as decided by is_col_type), and a 'pos' column must be present.
    2. The sorted 'vals' column names must equal, entry by entry, one of the column sets stored in model_parameters_dict.
    3. Column contents are checked by _validate_cols.

    Arguments:
        df (pd.DataFrame): Model in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if possible; forwarded to _validate_cols.

    Returns:
        df (pd.DataFrame): the dataframe returned by _validate_cols.

    Function:
        Raises a SortSeqError if the dataframe has no rows, has an unexpected or missing column, or its parameter columns match no known model.
    """

    # A model needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError(
            'Dataframe must contain at least one row')

    # Only position and parameter columns are allowed
    for name in df.columns:
        if not is_col_type(name, ['pos', 'vals']):
            raise SortSeqError('Invalid column in dataframe: %s.' % name)

    # Mandatory columns
    for required in ['pos']:
        if required not in df.columns:
            raise SortSeqError('%s column missing' % required)

    # The sorted parameter columns must coincide with a known model layout
    val_cols = sorted(c for c in df.columns if is_col_type(c, 'vals'))
    matches_known_model = any(
        len(param_cols) == len(val_cols)
        and all(a == b for a, b in zip(param_cols, val_cols))
        for param_cols in model_parameters_dict.values())
    if not matches_known_model:
        raise SortSeqError(
            'Dataframe represents model with invalid columns: %s' %
            str(val_cols))

    # Check (and possibly repair) what is stored inside the columns
    return _validate_cols(df, fix=fix)
Example #9
0
def _validate_bin_cols(df, fix=False):
    """
    Validates the bin column in a given dataframe (if it exists).

    Bin numbers must be integer-valued, unique and nonnegative.

    Arguments:
        df (pd.DataFrame): Dataframe to check. Returned untouched if it has
            no 'bin' column.
        fix (bool): If True, integer-valued bin numbers stored in a
            non-integer dtype are converted to int (the column of df is
            overwritten). If False, that situation raises an error.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if bin numbers cannot be converted to numbers, are
            not integer-valued, need a dtype conversion while fix=False,
            are duplicated, or are negative.
    """
    col = 'bin'
    if col not in df.columns:
        return df

    raw_vals = df[col].values
    try:
        int_vals = raw_vals.astype(int)
        float_vals = raw_vals.astype(float)
    except Exception:
        raise SortSeqError(
            'Cannot convert values in column %s to numbers.'%col)

    # Accept every integer dtype (int32, int64, unsigned, ...). Comparing
    # against the builtin int only matches the platform default width.
    if raw_vals.dtype.kind not in 'iu':
        if not all(int_vals == float_vals):
            raise SortSeqError(
                'Bin numbers cannot be interpreted as integers.')
        if not fix:
            raise SortSeqError(
                'Bin numbers are not integers; set fix=True to fix.')
        df[col] = df[col].astype(int)

    if not len(int_vals) == len(set(int_vals)):
        raise SortSeqError('Bin numbers are not unique.')

    if not all(int_vals >= 0):
        raise SortSeqError('Bin numbers must be nonnegative numbers.')

    return df
Example #10
0
def _validate_mean_cols(df, fix=False):
    """
    Validates contents of mean column in a given dataframe (if it exists)

    Values must be finite floats.

    Arguments:
        df (pd.DataFrame): Dataframe to check. Returned untouched if it has
            no 'mean' column.
        fix (bool): If True, a non-float column that can be converted is
            overwritten with its float version. If False, a non-float
            column raises an error.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if values cannot be interpreted as floats, are not
            floats while fix=False, or are not finite.
    """
    col = 'mean'
    if col not in df.columns:
        return df

    # Check if columns are floats
    if df[col].values.dtype != float:

        # Check whether values can be interpreted as floats.
        # Exception (not a bare except) so Ctrl-C still propagates.
        try:
            float_vals = df[col].astype(float)
        except Exception:
            raise SortSeqError(
                'Cannot interpret values in %s as floats.'%col)

        # Check whether we have permission to change these to floats.
        # The error must actually be raised, not merely constructed.
        if not fix:
            raise SortSeqError(
                'Values in %s not floats; set fix=True to fix.'%col)
        df[col] = float_vals

    # Make sure that all parameters are finite
    if not all(np.isfinite(df[col])):
        raise SortSeqError('Nonfinite parameters encountered.')
    return df
Example #11
0
def validate_tagkey(df, fix=False):
    """
    Validates the form of a tagkeys dataframe. A tagkeys dataframe must look something like this:

    tag     seq
    AACT    ATTAGTCTAGATC
    AGCT    ATTAGTCTAGATC
    TCGA    ATTAGTCTGGGTC

    A 'tag' column reports the short tag associated with the sequences in the sequence column.

    Checks performed here:
    0. The dataframe must have at least one row.
    1. Exactly one tag column and exactly one sequence column must be present (as found by get_cols_from_df).
    2. Column contents are checked by _validate_cols.
    3. No other columns may be present, and the tag column must come before the sequence column.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if possible. It is forwarded to _validate_cols and also permits reordering the columns.

    Returns:
        df (pd.DataFrame): the validated dataframe; its columns are reordered when fix=True and the order was wrong.

    Function:
        Raises a SortSeqError if the dataframe violates the checks above (if fix=False) or if these violations cannot be fixed (fix=True).
    """

    # Verify dataframe has at least one row
    if not df.shape[0] >= 1:
        raise SortSeqError(
            'Dataframe must contain at least one row')

    # Check for exactly one tag column
    tag_cols = get_cols_from_df(df, 'tag')
    if len(tag_cols) != 1:
        raise SortSeqError('Must be exactly one tag column.')

    # Check for exactly one seqs column
    seq_cols = get_cols_from_df(df, 'seqs')
    if len(seq_cols) != 1:
        raise SortSeqError('Must be exactly one sequence column.')

    # Validate contents of columns
    df = _validate_cols(df, fix=fix)

    # Compare column names as plain lists. An elementwise comparison of
    # df.columns against new_cols fails with a raw length-mismatch error
    # (not a SortSeqError) as soon as the dataframe has an extra column.
    new_cols = tag_cols + seq_cols
    current_cols = list(df.columns)
    for col in current_cols:
        if col not in new_cols:
            raise SortSeqError('Invalid column in dataframe: %s.' % col)

    # Rearrange columns
    if current_cols != new_cols:
        if fix:
            df = df[new_cols]
        else:
            raise SortSeqError(
                'Dataframe columns are in the wrong order; set fix=True to fix.'
            )

    return df
Example #12
0
    def evaluate(self,seqs):
        """
        Evaluates the model on a collection of sequences.

        Arguments:
            seqs (list, pd.Series or pd.DataFrame): Sequences to evaluate.
                For a dataframe, the first column returned by
                qc.get_cols_from_df(seqs, 'seqs') is used.

        Returns:
            The result of self.evaluate_on_seqarray applied to the array
            built by fast.seqs2array_for_matmodel.

        Raises:
            SortSeqError: if seqs is of an unsupported container type, is
                empty, or its first sequence does not have length
                self.length.

        Side effects:
            self.seqtype is replaced by its utf-8 encoded bytes form if it
            is not bytes already.
        """
        # Check seqs container
        if isinstance(seqs,pd.DataFrame):
            seq_col = qc.get_cols_from_df(seqs,'seqs')[0]
            seqs_to_use = list(seqs[seq_col])
        elif isinstance(seqs,(list,pd.Series)):
            seqs_to_use = list(seqs)
        else:
            raise SortSeqError('Sequences must be input as a list, pd.Series, or pd.DataFrame')

        # Fail with a clear error instead of an IndexError further down
        if not seqs_to_use:
            raise SortSeqError('No sequences were provided.')

        # Check length (only the first sequence is inspected)
        if len(seqs_to_use[0]) != self.length:
            raise SortSeqError(
                'Energy Matrix Length does not equal Sequence Length')

        # fast.seqs2array_for_matmodel expects seqtype to be bytes
        if not isinstance(self.seqtype,bytes):
            self.seqtype = str(self.seqtype).encode('utf-8')

        # Change elements to bytes if they're not bytes; the first element
        # decides for the whole list.
        if not isinstance(seqs_to_use[0],bytes):
            seqs_to_use = [str(s).encode('utf-8') for s in seqs_to_use]

        # Compute seqmats, then compute and return values
        seqarray = fast.seqs2array_for_matmodel(seqs_to_use, self.seqtype)
        return self.evaluate_on_seqarray(seqarray)
Example #13
0
 def __init__(self,npar):
     """
     Stores the scale of the normal noise, read from npar[0].

     Arguments:
         npar: Indexable container whose first element is convertible to
             a float greater than zero.

     Raises:
         SortSeqError: if npar[0] is missing or cannot be converted to a
             float, or if the resulting scale is not greater than zero.
     """
     # TypeError (e.g. None, or npar not indexable) and IndexError (empty
     # npar) are reported the same way as an unparsable string.
     try:
         self.scale = float(npar[0])
     except (ValueError, TypeError, IndexError):
         raise SortSeqError('your input parameter must be a float')
     #Check that scale is in the correct range
     if self.scale <= 0:
         raise SortSeqError(
             'your input scale for normal noise must be greater than zero')
Example #14
0
def validate_meanstd(df, fix=False):
    """
    Validates the form of a meanstd dataframe, which is expected to look something like this:

    bin    mean    std
      0     5.1     .9
      1    -1.0    1.5
      2    -4.2      1
      3       8      3
      4       3      1

    Checks performed here:
    0. The dataframe must have at least one row.
    1. Every column name must be of type 'bin', 'mean' or 'std' (as decided by is_col_type).
    2. The 'bin', 'mean' and 'std' columns must all be present.
    3. Column contents are checked by _validate_cols.
    4. Columns must appear in the order bin, mean, std.

    Arguments:
        df (pd.DataFrame): Dataset in dataframe format
        fix (bool): A flag saying whether to fix the dataframe into shape if possible. It is forwarded to _validate_cols and also permits reordering the columns.

    Returns:
        df (pd.DataFrame): the validated dataframe; its columns are reordered when fix=True and the order was wrong.

    Function:
        Raises a SortSeqError if the dataframe violates the checks above (if fix=False) or if these violations cannot be fixed (fix=True).
    """
    expected_order = ['bin', 'mean', 'std']

    # A meanstd table needs at least one row
    if df.shape[0] < 1:
        raise SortSeqError(
            'Dataframe must contain at least one row')

    # Only bin, mean and std columns are allowed ...
    for name in df.columns:
        if not is_col_type(name, ['bin', 'mean', 'std']):
            raise SortSeqError('Invalid column in dataframe: %s.' % name)

    # ... and each of them has to be there
    for required in ['bin', 'mean', 'std']:
        if required not in df.columns:
            raise SortSeqError('%s column missing' % required)

    # Check (and possibly repair) what is stored inside the columns
    df = _validate_cols(df, fix=fix)

    # Enforce the canonical column order
    if all(df.columns == expected_order):
        return df

    if not fix:
        raise SortSeqError(
         'Dataframe columns are in the wrong order; set fix=True to fix.')

    return df[expected_order]
Example #15
0
    def Berg_von_Hippel(self,
                        df,
                        dicttype,
                        foreground=1,
                        background=0,
                        pseudocounts=1):
        '''Learn a model as minus the log ratio of foreground to background
             frequencies. The foreground counts default to column ct_1 and the
             background counts to ct_0; both can be changed via the arguments.

             Raises SortSeqError if either count column is missing from df, or
             if any count is still zero after pseudocounts were added.'''
        seq_dict, inv_dict = utils.choose_dict(dicttype)

        # Both requested count columns have to be present in df
        fg_col = 'ct_' + str(foreground)
        bg_col = 'ct_' + str(background)
        if not {fg_col, bg_col}.issubset(set(df.columns)):
            raise SortSeqError(
                'Foreground or Background column does not exist!')

        # Per-position counts for the two bins
        fg_counts = utils.profile_counts(df, dicttype, bin_k=foreground)
        bg_counts = utils.profile_counts(df, dicttype, bin_k=background)
        binheaders = utils.get_column_headers(fg_counts)

        # Pseudocounts are added to every count column
        fg_counts[binheaders] = fg_counts[binheaders] + pseudocounts
        bg_counts[binheaders] = bg_counts[binheaders] + pseudocounts

        # A count that is still zero would break the log ratio below
        ct_headers = utils.get_column_headers(fg_counts)
        if fg_counts[ct_headers].isin([0]).values.any():
            raise SortSeqError(
                '''There are some bases without any representation in\
                the foreground data, you should use pseudocounts to avoid failure \
                of the learning method''')
        if bg_counts[ct_headers].isin([0]).values.any():
            raise SortSeqError(
                '''There are some bases without any representation in\
                the background data, you should use pseudocounts to avoid failure \
                of the learning method''')

        # Turn counts into frequencies by dividing each row by its total
        fg_freqs = fg_counts.copy()
        bg_freqs = bg_counts.copy()
        fg_totals = fg_freqs[binheaders].sum(axis=1)
        fg_freqs[binheaders] = fg_freqs[binheaders].div(fg_totals, axis=0)
        bg_totals = bg_freqs[binheaders].sum(axis=1)
        bg_freqs[binheaders] = bg_freqs[binheaders].div(bg_totals, axis=0)

        model_df = -np.log(fg_freqs / bg_freqs)

        # Output columns are named val_X instead of ct_X
        rename_dict = {}
        for i in range(len(seq_dict)):
            symbol = str(inv_dict[i])
            rename_dict['ct_' + symbol] = 'val_' + symbol
        return model_df.rename(columns=rename_dict)
Example #16
0
def _validate_ori_cols(df, fix=False):
    """
    Validates 'ori' columns in a given dataframe. Column must contain only '+' and '-' characters.

    Arguments:
        df (pd.DataFrame): Dataframe to check. Returned untouched if it has
            no 'ori' column.
        fix (bool): Accepted for signature parity with the other column
            validators; nothing is fixed here.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if the column holds a non-string, or a string other
            than '+' or '-'.
    """
    col = 'ori'
    if col in df.columns:
        # isinstance (rather than an exact type match) also accepts
        # subclasses of str
        if not all(isinstance(s, str) for s in df[col]):
            raise SortSeqError('ori column must contain strings')

        if not all((df[col] == '+') | (df[col] == '-')):
            raise SortSeqError(
                'ori column contains more than just "+" and "-" characters.')
    return df
Example #17
0
def estimate_mutualinfo(raw_counts, pseudocount=1, err=False, method='naive'):
    """
    Mutual information estimator. raw_counts must be a 2d array

    Arguments:
        raw_counts: Table of counts; cleaned up with fix_counts_2d.
        pseudocount: Nonnegative pseudocount, used by the 'naive' and 'tpm'
            methods.
        err (bool): If true, an error estimate is returned as well.
        method (str): 'naive', 'tpm' (Treves, Panzeri, Miller: the naive
            estimate minus (n_cols-1)*(n_rows-1)*log2(e)/(2*N), with N the
            total of the cleaned counts), or 'nsb' (Nemenman, Shafee,
            Bialek).

    Returns:
        mi if err is false, otherwise the tuple (mi, mi_err).

    Raises:
        SortSeqError: if method is unknown or pseudocount is not
            nonnegative.
    """

    # These are the only options supported thus far. Checked with an
    # explicit raise: an assert is stripped under -O and reports the wrong
    # exception type.
    if method not in ('naive', 'tpm', 'nsb'):
        raise SortSeqError('Unknown method: %s.' % method)

    # Make sure pseudocount is sane
    if not pseudocount >= 0:
        raise SortSeqError('pseudocount is not nonnegative.')

    # Fix up counts table
    counts = fix_counts_2d(raw_counts)

    want_err = bool(err)
    if method == 'nsb':
        result = _estimate_mutualinfo_nsb(counts, err=want_err)
    else:
        # 'naive' and 'tpm' share the naive estimate; includes pseudocount
        result = _estimate_mutualinfo_naive(
            counts, pseudocount=pseudocount, err=want_err)

    if want_err:
        mi, mi_err = result
    else:
        mi = result

    # Compute tpm correction
    if method == 'tpm':
        n_rows = counts.shape[0]
        n_cols = counts.shape[1]
        N = counts.flatten().sum()
        mi -= (n_cols - 1.0) * (n_rows - 1.0) * np.log2(
            np.exp(1.0)) / (2.0 * N)

    return (mi, mi_err) if want_err else mi
Example #18
0
def _validate_contig_cols(df, fix=False):
    """
    Validates 'contig' columns in a given dataframe. Column must contain strings having no whitespace.

    Arguments:
        df (pd.DataFrame): Dataframe to check. Returned untouched if it has
            no 'contig' column.
        fix (bool): If True, every whitespace character in a contig name is
            replaced by '_' (the column of df is overwritten). If False,
            whitespace raises an error.

    Returns:
        df (pd.DataFrame): The dataframe that was passed in.

    Raises:
        SortSeqError: if the column holds a non-string, or holds whitespace
            while fix=False.
    """
    col = 'contig'
    if col in df.columns:
        if not all(isinstance(s, str) for s in df[col]):
            raise SortSeqError('contig col contains non-string.')
        # Raw string literals: '\s' in a plain literal is an invalid escape
        if any(re.search(r'\s', s) for s in df[col]):
            if fix:
                df.loc[:, col] = [re.sub(r'\s', '_', s) for s in df[col]]
            else:
                raise SortSeqError(
                    'Whitespace found in contig names; set fix=True to fix.')
    return df
Example #19
0
def is_col_type(col_name, col_types='all'):
    """
    Checks whether col_name is a valid column name, as specified by col_types. col_types can be either a string (for a single column type) or a list of strings (for multiple column types). Default col_types='all' causes function to check all available column types

    Arguments:
        col_name (str): Column name to test.
        col_types (str or list): Key(s) of col_patterns naming the column
            types to test against, or 'all' for every key of col_patterns.

    Returns:
        bool: True if col_name matches the pattern of at least one of the
        requested column types.

    Raises:
        SortSeqError: if col_types is neither a string nor a list.
        KeyError: if a requested column type is not a key of col_patterns.
    """
    # Make col_types_list
    if isinstance(col_types, list):
        col_types_list = col_types
    elif isinstance(col_types, str):
        if col_types == 'all':
            # Every known type. These must be the KEYS of col_patterns,
            # because each entry is used as a key in the lookup below.
            col_types_list = list(col_patterns)
        else:
            col_types_list = [col_types]
    else:
        raise SortSeqError('col_types is not a string or a list.')

    # Look up every pattern first so an unknown column type is always
    # reported, whether or not an earlier type already matched.
    patterns = [col_patterns[col_type] for col_type in col_types_list]

    # Return true if any match found
    return any(re.search(pattern, col_name) for pattern in patterns)
Example #20
0
def eval_modelmatrix_on_mutarray(modelmatrix, mutarray, wtrow):
    """
    Evaluates a matrix model on sequences given in sparse mutation form.

    Arguments:
        modelmatrix (np.ndarray): 2-dimensional array of model parameters.
        mutarray (csr_matrix): Sparse matrix multiplied against the
            flattened model; presumably one row per sequence marking the
            features that differ from wild type (confirm against caller).
        wtrow (np.ndarray): 1-dimensional array with as many elements as
            modelmatrix; its nonzero entries select the wild-type entry of
            each modelmatrix row.

    Returns:
        np.ndarray: 1-dimensional array of model values, one per row of
        mutarray.

    Raises:
        SortSeqError: if an argument has the wrong type, dimensionality or
            size.
    """
    # Do error checking before touching any attribute of the arguments
    if not isinstance(modelmatrix, np.ndarray):
        raise SortSeqError('modelmatrix is not a np.ndarray')
    if not isinstance(wtrow, np.ndarray):
        raise SortSeqError('wtrow is not an np.ndarray')
    if not isinstance(mutarray, csr.csr_matrix):
        raise SortSeqError('mutarray is not a sparse csr_matrix')
    if len(wtrow.shape) != 1:
        raise SortSeqError('wtrow is not 1-dimensional')
    if len(modelmatrix.shape) != 2:
        raise SortSeqError('modelmatrix is not 2-dimensional')
    if wtrow.size != modelmatrix.size:
        raise SortSeqError('wtrow does not match modelmatrix')

    # Compute constant contribution to model prediction: the value of the
    # wild-type sequence itself
    modelmatrix_vec = modelmatrix.ravel()
    const_val = np.dot(wtrow, modelmatrix_vec)

    # Prepare matrix for scanning mutarray: express every parameter
    # relative to the wild-type parameter of its row
    tmp_matrix = modelmatrix.copy()
    indices = wtrow.reshape(modelmatrix.shape).astype(bool)
    wt_matrix_vals = tmp_matrix[indices]
    tmp_matrix -= wt_matrix_vals[:, np.newaxis]

    # Column vector of the flattened matrix (plain ndarray reshape instead
    # of the discouraged np.matrix class)
    modelmatrix_for_mutarray = csr_matrix(tmp_matrix.reshape(-1, 1))

    # Compute values
    mutarray_vals = mutarray * modelmatrix_for_mutarray
    vals = const_val + mutarray_vals.toarray().ravel()
    return vals
Example #21
0
def rc(dna_str):
    """
    Returns the reverse complement of a DNA string.

    Raises a SortSeqError if dna_str contains any character other than
    'A', 'C', 'G' or 'T'.
    """
    if re.search(r"[^ACGT]", dna_str) is not None:
        raise SortSeqError('Invalid character found in DNA sequence.')

    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Walk the sequence backwards, complementing each base on the way
    return ''.join(complement[base] for base in reversed(dna_str))
Example #22
0
def dataset2mutarray_withwtseq(dataset_df, modeltype, wtseq, chunksize=1000):
    """
    Converts the sequences of a dataset into a sparse mutation array
    relative to a given wild-type sequence.

    Arguments:
        dataset_df (pd.DataFrame): Dataset; its first sequence column (as
            found by qc.get_cols_from_df) is used.
        modeltype (str): 'MAT' or 'NBR'; selects the fast.seqs2array_*
            function used to encode sequences.
        wtseq: Wild-type sequence, encoded with the same function.
        chunksize (int): Number of dataframe rows encoded per batch; must
            be at least 1.

    Returns:
        (mutarray_csr, wtrow): a csr matrix with one row per dataframe row
        in which the features set by the wild-type sequence are zeroed,
        and the boolean feature vector of the wild-type sequence.

    Raises:
        SortSeqError: if modeltype is unknown or chunksize is less than 1.
    """

    # Determine the type of model and set seq2array function appropriately
    if modeltype == 'MAT':
        seqs2array = fast.seqs2array_for_matmodel
    elif modeltype == 'NBR':
        seqs2array = fast.seqs2array_for_nbrmodel
    else:
        raise SortSeqError('Unknown model type: %s' % modeltype)

    # A non-positive chunksize would never advance through the rows
    if chunksize < 1:
        raise SortSeqError('chunksize must be a positive integer.')

    # Determine seqtype, etc.
    seqcol = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    seqtype = qc.colname_to_seqtype_dict[seqcol]

    # Compute the wt sequence
    wtrow = seqs2array([wtseq], seq_type=seqtype).ravel().astype(bool)
    numfeatures = len(wtrow)
    numrows = dataset_df.shape[0]

    # Fill in mutarray (a lil matrix) chunk by chunk
    mutarray_lil = lil_matrix((numrows, numfeatures), dtype=int)
    seq_series = dataset_df[seqcol]
    for startrow in range(0, numrows, chunksize):
        stoprow = min(startrow + chunksize, numrows)

        # Compute seqarray for this chunk of rows (positional slice)
        seqlist = list(seq_series.iloc[startrow:stoprow])
        seqarray = seqs2array(seqlist, seq_type=seqtype)

        # Remove wt entries
        tmp = seqarray.copy()
        tmp[:, wtrow] = 0

        # Store results from this chunk
        mutarray_lil[startrow:stoprow, :] = tmp

    # Convert to csr matrix
    mutarray_csr = mutarray_lil.tocsr()

    # Return mutarray as well as binary representation of wt seq
    return mutarray_csr, wtrow
Example #23
0
def fix_counts(raw_counts):
    """
    Flattens raw_counts into a 1d numpy array of floats and validates it.

    Arguments:
        raw_counts: anything np.array can turn into a numeric array
            (scalar, list, nested list, array, ...).

    Returns:
        A 1d numpy float array containing every element of raw_counts.

    Raises:
        SortSeqError: if the conversion to floats fails, or if the result is
            empty, contains non-finite values, contains negative values, or
            consists only of zeros.
    """
    try:
        counts = np.array(raw_counts).astype(float).flatten()
    except Exception:
        raise SortSeqError('could not covernt counts to array of flots')

    # flatten() always returns a 1d array, so emptiness must be tested via
    # the number of elements rather than the number of dimensions
    if counts.size == 0:
        raise SortSeqError('counts is empty or not array.')
    if not np.isfinite(counts).all():
        raise SortSeqError('counts are not all finite.')
    if not (counts >= 0.0).all():
        raise SortSeqError('counts are not nonnegative.')
    if (counts == 0.0).all():
        raise SortSeqError('counts are all equal to zero.')

    return counts
Example #24
0
    def test_profile_freq_bincounts(self):
        """ Test the ability of mpathic.profile_freq to count frequencies

        Runs profile_freq.main with a bin argument on every 'library_*.txt'
        and 'dataset_*.txt' file found in self.input_dir. Files whose path
        contains '_bad' or 'library' must raise SortSeqError; the remaining
        files must yield a dataframe that passes qc.validate_profile_freq and
        survives a write / reload round trip through self.output_dir.
        Every file must additionally raise SortSeqError for bin=bad_bin_num.

        Progress is reported with Python 2 print statements; a trailing comma
        keeps the next printed item on the same output line.
        """

        print '\nIn test_profile_freq_bincounts...'
        library_files = glob.glob(self.input_dir + 'library_*.txt')
        library_files += glob.glob(self.input_dir + 'dataset_*.txt')
        # NOTE(review): presumably the test datasets contain a bin 2 but no
        # bin 5 -- confirm against the input files
        good_bin_num = 2
        bad_bin_num = 5
        for file_name in library_files:
            print '\t%s =' % file_name,
            # Last underscore-separated piece of the file name, extension
            # stripped; used below to name the output file
            description = file_name.split('_')[-1].split('.')[0]
            # Deferred call, so that assertRaises can invoke it itself; it is
            # only ever called within this loop iteration
            executable = lambda:\
                profile_freq.main(io.load_dataset(file_name),bin=good_bin_num)
            print '(bin=%d)' % good_bin_num,

            # If bad or library, then profile_freq.main should raise SortSeqError
            # (the substring test is applied to the whole path, directory
            # included)
            if ('_bad' in file_name) or ('library' in file_name):
                try:
                    self.assertRaises(SortSeqError, executable)
                    print 'badtype,',
                except:
                    # Finish the progress line, then let the failure propagate
                    print 'good (ERROR).'
                    raise

            # If good, then profile_freq.main should produce a valid df
            elif ('_good' in file_name) or ('dataset' in file_name):
                try:
                    df = executable()
                    qc.validate_profile_freq(df)
                    # Round trip: write the result, then load it back
                    out_file = self.output_dir+\
                        'profile_freq_bin_%s.txt'%description
                    io.write(df, out_file)
                    io.load_profile_freq(out_file)
                    print 'good,',

                except:
                    # Finish the progress line, then let the failure propagate
                    print 'bad (ERROR).'
                    raise

            # There are no other options
            else:
                raise SortSeqError('Unrecognized class of file_name.')

            # Should always raise an error if bin num is too large
            executable = lambda:\
                profile_freq.main(io.load_dataset(file_name),bin=bad_bin_num)
            print '(bin=%d)' % bad_bin_num,
            try:
                self.assertRaises(SortSeqError, executable)
                print 'badtype.'
            except:
                # Finish the progress line, then let the failure propagate
                print 'good (ERROR).'
                raise
Example #25
0
def fix_counts_2d(raw_counts):
    """
    Converts raw_counts to a 2d numpy array of floats and validates it.

    Arguments:
        raw_counts: anything np.array can turn into a 2d numeric array
            (e.g. a list of equal-length lists).

    Returns:
        A 2d numpy float array with the same shape as the input.

    Raises:
        SortSeqError: if the conversion to floats fails, or if the result is
            not 2d, contains non-finite values, contains negative values, or
            consists only of zeros.
    """
    try:
        counts = np.array(raw_counts).astype(float)
    except Exception:
        raise SortSeqError('could not covernt counts to array of flots')

    if counts.ndim != 2:
        raise SortSeqError('counts array is not 2d.')
    # This branch fires when some element is NOT finite; say so in the message
    if not np.isfinite(counts).all():
        raise SortSeqError('counts are not all finite.')
    if not (counts >= 0.0).all():
        raise SortSeqError('counts are not nonnegative.')
    if (counts == 0.0).all():
        raise SortSeqError('counts are all equal to zero.')

    return counts
Example #26
0
def _validate_lr_cols(df, fix=False):
    """
    Validates left/right columns in a given dataframe. Checks the columns
    returned by get_cols_from_df(df, 'lr') (per the original documentation,
    those named 'left' or 'right').

    Arguments:
        df: dataframe to validate; modified in place when fix is True.
        fix: if True, columns holding integer-valued non-integer data are
            converted to integers; if False, such columns are an error.

    Returns:
        The (possibly fixed) dataframe.

    Raises:
        SortSeqError: if a column cannot be interpreted as numbers, holds
            non-integer values, has a non-integer dtype while fix is False,
            holds non-finite values, or holds negative values.
    """
    lr_cols = get_cols_from_df(df, 'lr')
    for col in lr_cols:

        # Verify that positions are integers
        if not df[col].values.dtype == int:

            # Try to convert column to numbers
            try:
                int_vals = df[col].astype(int)
                float_vals = df[col].astype(float)
            except Exception:
                raise SortSeqError(
                    'Cannot interptret left/right positions as numbers; column name = %s'
                    % col)

            # Conversion to integers is only acceptable if it is lossless
            if not all(int_vals == float_vals):
                raise SortSeqError(
                    'Noninteger numbers found in left/right positions.')

            # Lossless, but changing the dtype still requires permission
            if not fix:
                raise SortSeqError(
                    'left/right positions are not integers; set fix=True to fix.'
                )
            df[col] = int_vals

            # Make sure that all positions are finite
            # NOTE(review): non-finite values normally already fail the
            # integer conversion above; kept as a safeguard
            if not all(np.isfinite(df[col])):
                raise SortSeqError(
                    'Nonfinite left/right positions encountered.')

        # Verify that positions are nonnegative
        if not all(df[col] >= 0):
            raise SortSeqError(
                'left/right positions must be nonnegative numbers.')

    return df
Example #27
0
def load_text(file_arg):
    """
    General function used to load data from a text file.

    The file is parsed as whitespace-delimited text; '#' starts a comment and
    blank lines are skipped. Rows consisting only of NaNs are dropped.

    Arguments:
        file_arg: passed to validate_file_for_reading, whose return value is
            handed to the parser.

    Returns:
        A dataframe with the parsed contents.

    Raises:
        SortSeqError: if the parser cannot interpret the file.
    """
    file_handle = validate_file_for_reading(file_arg)
    try:
        df = pd.io.parsers.read_csv(file_handle, delim_whitespace=True,
                                    comment='#', skip_blank_lines=True,
                                    engine='c')
    except Exception:
        # Only ordinary errors are translated; KeyboardInterrupt and
        # SystemExit are allowed to propagate unchanged
        raise SortSeqError(
            'Could not interpret text file %s as dataframe.'
            % repr(file_handle))
    return df.dropna(axis=0, how='all')   # Drop rows with all NaNs
Example #28
0
def dataset2seqarray(dataset_df, modeltype):
    """
    Encodes every sequence of a dataset with the encoder for modeltype.

    Arguments:
        dataset_df: dataframe holding a sequence column, as located by
            qc.get_cols_from_df(dataset_df, 'seqs').
        modeltype: 'MAT' or 'NBR'.

    Returns:
        Whatever the selected seqs2array function returns for the full list
        of sequences.

    Raises:
        SortSeqError: if modeltype is neither 'MAT' nor 'NBR'.
    """
    # Reject unsupported model types before touching the dataframe
    if modeltype not in ('MAT', 'NBR'):
        raise SortSeqError('Unknown model type: %s'%modeltype)

    # Pick the encoder that belongs to this model type
    if modeltype == 'MAT':
        encode = mpathic.fast.seqs2array_for_matmodel
    else:
        encode = mpathic.fast.seqs2array_for_nbrmodel

    # The name of the sequence column determines the sequence type
    seq_colname = qc.get_cols_from_df(dataset_df, 'seqs')[0]
    seq_kind = qc.colname_to_seqtype_dict[seq_colname]
    all_seqs = list(dataset_df[seq_colname])
    return encode(all_seqs, seq_type=seq_kind)
Example #29
0
def _validate_seqs_cols(df, fix=False):
    """
    Validates sequence columns in a given dataframe. Checks the columns
    returned by get_cols_from_df(df, ['seqs', 'tag', 'wts']) (per the
    original documentation: seq, seq_rna, seq_pro, tag, wt, wt_rna, wt_pro).

    Arguments:
        df: dataframe to validate; modified in place when fix is True.
        fix: if True, sequences that are not uppercase are uppercased; if
            False, such sequences are an error.

    Returns:
        The (possibly fixed) dataframe.

    Raises:
        SortSeqError: if a column has no known sequence type, its sequence
            length cannot be determined, its sequences differ in length, are
            not uppercase while fix is False, or contain characters outside
            the alphabet of the column's sequence type.
    """
    seq_cols = get_cols_from_df(df, ['seqs', 'tag', 'wts'])
    for col in seq_cols:

        # Set alphabet
        try:
            seqtype = colname_to_seqtype_dict[col]
            alphabet = seqtype_to_alphabet_dict[seqtype]
        except Exception:
            raise SortSeqError('Sequence column is of unkown type: %s.' % col)

        # Check that all sequences have the same length. The first row is
        # taken by position, so a non-default index does not matter.
        try:
            L = len(df[col].iloc[0])
        except Exception:
            raise SortSeqError('Could not determine length of sequence.')

        if not all([len(seq) == L for seq in df[col]]):
            raise SortSeqError('Not all sequences are the same length.')

        # Make sure sequences are uppercase
        if not all([seq == seq.upper() for seq in df[col]]):
            if fix:
                df[col] = [seq.upper() for seq in df[col]]
            else:
                raise SortSeqError(
                    'Seqs are not all uppercase; set fix=True to fix.')

        # Check that all characters are from the correct alphabet: the
        # pattern matches any single character that is NOT in the alphabet
        invalid_char = re.compile(r"[^%s]" % alphabet)
        if any([invalid_char.search(seq) is not None for seq in df[col]]):
            raise SortSeqError('Invalid character found in sequences.')

    return df
Example #30
0
def get_model_type(model_df):
    """ Identifies the kind of model stored in a given model dataframe

    The value columns of model_df (get_cols_from_df(model_df, 'vals')) are
    compared, as a set, with each entry of model_parameters_dict. The key of
    a matching entry supplies modeltype (element 0) and seqtype (element 1);
    if several entries match, the last one visited wins.

    Returns the tuple (seqtype, modeltype). Raises SortSeqError if no entry
    matches.
    """
    header_set = set(get_cols_from_df(model_df, 'vals'))

    # Remember the key of the most recently visited matching entry
    matched_key = None
    for candidate, val_cols in model_parameters_dict.items():
        if set(val_cols) == header_set:
            matched_key = candidate

    if matched_key is None:
        seqtype, modeltype = None, None
    else:
        seqtype, modeltype = matched_key[1], matched_key[0]

    if (seqtype is None) or (modeltype is None):
        raise SortSeqError('Could not identify seqtype or modeltype')
    return (seqtype, modeltype)