Ejemplo n.º 1
0
def main(
        data_df,model_df,
        start=0,end=None,err=False,coarse_graining_level=0):
    """
    Estimate the mutual information between a model's predictions and the
    count columns of a dataset.

    Arguments:
        data_df: dataset dataframe containing exactly one sequence column
            plus count columns. A total-count column 'ct' is computed when
            it is absent. The caller's dataframe is left untouched.
        model_df: model dataframe with a 'pos' column and one 'val_<char>'
            column per character of the alphabet.
        start, end: slice applied to every sequence before evaluation.
            A falsy end (None or 0) means "up to the end of the sequence".
        err: when True, also estimate the spread of the mutual information
            from 15 random half-size subsamples of the dataset.
        coarse_graining_level: forwarded to
            EstimateMutualInfoforMImax.alt4 (also for the subsamples, so
            that the error bar describes the same estimator as MI).

    Returns:
        Tuple (MI, Std). Std is NaN when err is False.

    Raises:
        SortSeqError: if the dataset does not have exactly one sequence
            column, or if the sliced sequences do not match the model
            length.
    """
    dicttype, modeltype = qc.get_model_type(model_df)
    seq_cols = qc.get_cols_from_df(data_df,'seqs')
    if not len(seq_cols)==1:
        raise SortSeqError('Dataframe has multiple seq cols: %s'%str(seq_cols))
    seq_dict,inv_dict = utils.choose_dict(dicttype,modeltype=modeltype)
    #set name of sequences column based on type of sequence
    type_name_dict = {'dna':'seq','rna':'seq_rna','protein':'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    #work on a private copy: the slicing and the 'ct' column below must not
    #leak into the caller's dataframe (a second call would slice twice).
    data_df = data_df.copy()

    #expected sequence length after slicing. It has to be measured on the
    #raw sequences, *before* they are cut; measuring afterwards would
    #subtract start twice and the length filter would drop every row.
    if not end:
        seqL = len(data_df[seq_col_name].iloc[0]) - start
    else:
        seqL = end-start

    #Cut the sequences based on start and end, and then check if it makes sense
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = data_df.loc[:,seq_col_name].str.slice(start,end)
        first_len = len(data_df[seq_col_name].iloc[0])
        if modeltype=='MAT':
            if first_len != len(model_df.loc[:,'pos']):
                raise SortSeqError('model length does not match dataset length')
        elif modeltype=='NBR':
            #a neighbor model has one row per adjacent pair of positions
            if first_len != len(model_df.loc[:,'pos'])+1:
                raise SortSeqError('model length does not match dataset length')

    #total counts per sequence; rows without any counts carry no information
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)
    data_df = data_df[data_df.ct != 0]

    #throw out sequences that do not have the expected length
    data_df = data_df[data_df[seq_col_name].apply(len) == (seqL)]

    #make a numpy array out of the model data frame
    model_df_headers = ['val_' + str(inv_dict[i]) for i in range(len(seq_dict))]
    model_matrix = np.array(model_df[model_df_headers])

    #now we evaluate the expression of each sequence according to the model.
    seq_mat,wtrow = numerics.dataset2mutarray(data_df.copy(),modeltype)
    temp_df = data_df.copy()
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(model_matrix,seq_mat,wtrow)
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True,drop=True)

    #the estimator receives the rows ordered by predicted value
    MI = EstimateMutualInfoforMImax.alt4(temp_sorted,coarse_graining_level=coarse_graining_level)
    if not err:
        Std = np.nan
    else:
        #spread of the estimate over random half-size subsamples
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(len(data_df_for_sub.index)/2))
            sub_df.reset_index(inplace=True,drop=True)
            #sequences are already sliced, so start/end keep their defaults
            sub_MI[i],sub_std = main(
                sub_df,model_df,err=False,
                coarse_graining_level=coarse_graining_level)
        #NOTE(review): the 1/sqrt(2) presumably rescales the half-sample
        #spread to the full sample size - confirm.
        Std = np.std(sub_MI)/np.sqrt(2)
    return MI,Std
Ejemplo n.º 2
0
def main(dataset_df,model_df,left=None,right=None):
    """
    Evaluate a model on the sequences of a dataset.

    Arguments:
        dataset_df: dataset dataframe; must pass qc.validate_dataset.
        model_df: model dataframe; must pass qc.validate_model.
        left: position within each sequence at which the model starts.
        right: position within each sequence at which the model ends
            (exclusive). At most one of left/right may be given; when
            neither is given the region is taken from the model's 'pos'
            column.

    Returns:
        A validated copy of dataset_df whose sequence column is cropped to
        the modeled region and which carries the model prediction in a
        'val' column.

    Raises:
        SortSeqError: if both left and right are given, if the region does
            not fit inside the sequences, or if the model type is not
            'MAT' or 'NBR'.
    """

    # Validate dataframes
    qc.validate_dataset(dataset_df)
    qc.validate_model(model_df)

    # Detect model type based on columns
    seqtype, modeltype = qc.get_model_type(model_df)
    seqcol = qc.seqtype_to_seqcolname_dict[seqtype]

    # A neighbor model has one row per adjacent pair, so it spans one more
    # character than it has rows.
    extra = 1 if modeltype=='NBR' else 0

    # Set start and end  based on left or right
    if not ((left is None) or (right is None)):
        raise SortSeqError('Cannot set both left and right at same time.')
    if not (left is None):
        start = left
        end = start + model_df.shape[0] + extra
    elif not (right is None):
        end = right
        start = end - model_df.shape[0] - extra
    else:
        start = model_df['pos'].values[0]
        end = model_df['pos'].values[-1] + 1 + extra
    # Internal sanity check on the computed region
    assert start < end

    # Validate start and end positions
    seq_length = len(dataset_df[seqcol][0])
    if start < 0:
        raise SortSeqError('Invalid start=%d'%start)
    if end > seq_length:
        raise SortSeqError('Invalid end=%d for seq_length=%d'%(end,seq_length))

    # Select target sequence region. The column name depends on the
    # sequence type, so seqcol must be used here rather than a literal
    # 'seq' (which only exists for DNA datasets).
    out_df = dataset_df.copy()
    out_df.loc[:,seqcol] = out_df.loc[:,seqcol].str.slice(start,end)

    #Create model object of correct type
    if modeltype == 'MAT':
        mymodel = Models.LinearModel(model_df)
    elif modeltype == 'NBR':
        mymodel = Models.NeighborModel(model_df)
    else:
        raise SortSeqError('Unrecognized model type %s'%modeltype)

    # Compute values
    out_df['val'] = mymodel.evaluate(out_df)

    # Validate dataframe and return
    return qc.validate_dataset(out_df,fix=True)
Ejemplo n.º 3
0
    def __init__(self,model_df):
        """
        Build a neighbor ('NBR') model from a model dataframe.

        The dataframe is copied and passed through qc.validate_model with
        fix=True before anything is stored, so the caller's object is not
        modified here. Raises SortSeqError when the dataframe does not
        describe an 'NBR' model.
        """
        validated_df = qc.validate_model(model_df.copy(),fix=True)
        seqtype, modeltype = qc.get_model_type(validated_df)
        if modeltype != 'NBR':
            raise SortSeqError('Invalid modeltype: %s'%modeltype)

        # Character <-> index lookup tables for this sequence type
        forward_dict,reverse_dict = utils.choose_dict(seqtype,modeltype=modeltype)

        self.seqtype = seqtype
        self.seq_dict = forward_dict
        self.inv_dict = reverse_dict
        self.df = validated_df
        # One more character than there are rows in the model dataframe
        self.length = validated_df.shape[0] + 1

        # Parameter matrix: value columns of the dataframe, transposed so
        # that each row holds one value column
        value_columns = qc.get_cols_from_df(validated_df,'vals')
        self.matrix = np.array(validated_df[value_columns]).T
Ejemplo n.º 4
0
def wrapper(args):
    """ Command-line wrapper around scan_model.main().

    Loads the model named by args.model, validates the numeric options,
    assembles the list of (sequence, name, offset) contigs either from
    args.seq or from a FASTA file / stdin, runs main() and writes the
    resulting dataframe to args.out or stdout.
    """

    # Load the model and work out how many characters one site spans
    model_df = io.load_model(args.model)
    seqtype, modeltype = qc.get_model_type(model_df)
    site_len = model_df.shape[0] + (1 if modeltype == "NBR" else 0)

    # Validate options
    chunksize = args.chunksize
    if not chunksize > 0:
        raise SortSeqError("chunksize=%d must be positive" % chunksize)

    if args.numsites <= 0:
        raise SortSeqError("numsites=%d must be positive." % args.numsites)

    if args.i and args.seq:
        raise SortSeqError("Cannot use flags -i and -s simultaneously.")

    # Characters shared by consecutive chunks (and appended when circular)
    overlap = site_len - 1

    if args.seq:
        # Sequence given directly on the command line: a single contig
        manual_seq = args.seq
        if args.circular:
            manual_seq = manual_seq + manual_seq[:overlap]
        contig_list = [(manual_seq, "manual", 0)]

    else:
        # Sequences come from a FASTA file, or from stdin when no file given
        contig_list = []
        if args.i:
            source = io.validate_file_for_reading(args.i)
        else:
            source = sys.stdin

        for index, record in enumerate(SeqIO.parse(source, "fasta")):
            name = record.name or "contig_%d" % index
            sequence = str(record.seq)

            # Wrap around the origin so that sites spanning it are scanned
            if args.circular:
                sequence += sequence[:overlap]

            # Split the contig into chunks of chunksize sites each;
            # neighbouring chunks overlap by site_len - 1 characters.
            total = len(sequence)
            chunk_len = chunksize + overlap
            offset = 0
            while offset + chunk_len < total:
                contig_list.append(
                    (sequence[offset:offset + chunk_len], name, offset))
                offset += chunksize

            # Whatever remains forms the last chunk
            contig_list.append((sequence[offset:], name, offset))

        if not contig_list:
            raise SortSeqError("No input sequences to read.")

    # Resolve the output location before doing the expensive scan
    if args.out:
        outloc = io.validate_file_for_writing(args.out)
    else:
        outloc = sys.stdout
    output_df = main(
        model_df, contig_list, numsites=args.numsites, verbose=args.verbose)

    # Write df to stdout or to outfile
    io.write(output_df, outloc, fast=args.fast)
Ejemplo n.º 5
0
def main(model_df, contig_list, numsites=10, verbose=False):
    """
    Scan contigs with a model and return the highest-scoring sites.

    Arguments:
        model_df: model dataframe; must pass qc.validate_model.
        contig_list: iterable of (contig_str, contig_name, pos_offset)
            tuples. pos_offset is added to the coordinates reported for
            sites found in that contig.
        numsites: maximum number of sites to keep.
        verbose: when True, write a progress dot to stdout for every
            contig that is scanned.

    Returns:
        Dataframe with the columns 'val', the sequence column, 'left',
        'right', 'ori' and 'contig', sorted by decreasing 'val', cropped to
        numsites rows and passed through qc.validate_sitelist(fix=True).
        'left' and 'right' are the inclusive start and end of each site.

    Raises:
        SortSeqError: if a contig contains a character outside the
            alphabet of the model's sequence type, if the model type is
            neither 'MAT' nor 'NBR', or if no contig is long enough to
            hold a single site.
    """

    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    seq_dict, inv_dict = utils.choose_dict(seqtype, modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    search_string = r"[^%s]" % alphabet
    for contig_str, contig_name, pos_offset in contig_list:
        if re.search(search_string, contig_str):
            raise SortSeqError("Invalid character for seqtype %s found in %s." % (seqtype, contig_name))

    # Create model object to evaluate on seqs
    if modeltype == "MAT":
        model_obj = Models.LinearModel(model_df)
    elif modeltype == "NBR":
        model_obj = Models.NeighborModel(model_df)
    else:
        # Fail with a clear message instead of a NameError on model_obj
        raise SortSeqError("Unrecognized model type %s" % modeltype)

    # Running list of the best sites found so far
    columns = ["val", seq_col_name, "left", "right", "ori", "contig"] \
        if False else None
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    columns = ["val", seq_col, "left", "right", "ori", "contig"]
    L = model_obj.length
    sitelist_df = pd.DataFrame(columns=columns)
    for contig_str, contig_name, pos_offset in contig_list:
        # Contigs shorter than one site cannot contain any site
        if len(contig_str) < L:
            continue
        this_df = pd.DataFrame(columns=columns)
        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)
        this_df["left"] = poss + pos_offset
        this_df["right"] = poss + pos_offset + L - 1
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
        this_df["ori"] = "+"
        this_df["contig"] = contig_name
        this_df["val"] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well.
        # NOTE(review): the 'left'/'right' columns are reused unchanged, so
        # this assumes seq2sitelist(rc=True) returns the sites in the same
        # order as the forward call - confirm.
        if seqtype == "dna":
            this_df[seq_col] = fast.seq2sitelist(contig_str, L, rc=True)  # Cython
            this_df["ori"] = "-"
            this_df["val"] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # Sort by value and reindex
        sitelist_df.sort_values(by="val", ascending=False, inplace=True)
        sitelist_df.reset_index(drop=True, inplace=True)

        # Crop list at numsites so the running list stays small
        if sitelist_df.shape[0] > numsites:
            sitelist_df.drop(sitelist_df.index[numsites:], inplace=True)

        # Progress indicator; written with sys.stdout.write so the code
        # parses on both Python 2 and Python 3.
        if verbose:
            sys.stdout.write(". ")
            sys.stdout.flush()

    if verbose:
        sys.stdout.write("\n")
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0] == 0:
        raise SortSeqError("No full-length sites found within provided contigs.")

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    return sitelist_df
Ejemplo n.º 6
0
def main(data_df,
         model_df,
         start=0,
         end=None,
         err=False,
         coarse_graining_level=0,
         rsquared=False,
         return_freg=False):
    """
    Estimate the mutual information between a model's predictions and the
    count columns of a dataset.

    Arguments:
        data_df: dataset dataframe containing exactly one sequence column
            plus count columns. A total-count column 'ct' is computed when
            it is absent. The caller's dataframe is left untouched.
        model_df: model dataframe with a 'pos' column and one 'val_<char>'
            column per character of the alphabet.
        start, end: slice applied to every sequence before evaluation.
            A falsy end (None or 0) means "up to the end of the sequence".
        err: when True, also estimate the spread of the mutual information
            from 15 random half-size subsamples of the dataset.
        coarse_graining_level: forwarded to
            EstimateMutualInfoforMImax.alt4 (also for the subsamples, so
            that the error bar describes the same estimator as MI).
        rsquared: when True, return 1 - 2**(-2*x) of both results instead
            of the raw values.
        return_freg: falsy to skip plotting. Otherwise it is used as the
            path the 'freg' image returned by alt4 is saved to.

    Returns:
        Tuple (MI, Std), or their transformed values when rsquared is
        True. Std is NaN when err is False.

    Raises:
        SortSeqError: if the dataset does not have exactly one sequence
            column, or if the sliced sequence length does not match the
            model length.
    """

    #determine whether you are working with RNA, DNA, or protein.
    #this also should determine modeltype (MAT, NBR, PAIR).
    dicttype, modeltype = qc.get_model_type(model_df)

    #get column header for the sequence column.
    seq_cols = qc.get_cols_from_df(data_df, 'seqs')
    if not len(seq_cols) == 1:
        raise SortSeqError('Dataframe has multiple seq cols: %s' %
                           str(seq_cols))

    #create dictionary that goes from, for example, nucleotide to number and
    #vice versa.
    seq_dict, inv_dict = utils.choose_dict(dicttype, modeltype=modeltype)

    #set name of sequences column based on type of sequence
    type_name_dict = {'dna': 'seq', 'rna': 'seq_rna', 'protein': 'seq_pro'}
    seq_col_name = type_name_dict[dicttype]

    #work on a private copy: the slicing and the 'ct' column below must not
    #leak into the caller's dataframe (a second call would slice twice).
    data_df = data_df.copy()

    #expected sequence length after slicing, measured on the raw sequences.
    if not end:
        seqL = len(data_df[seq_col_name].iloc[0]) - start
    else:
        seqL = end - start

    #Cut the sequences based on start and end, throw out wrong length
    #sequences, and then check that the result matches the model.
    if (start != 0 or end):
        data_df.loc[:,seq_col_name] = \
            data_df.loc[:,seq_col_name].str.slice(start,end)
        right_length = data_df.loc[:, seq_col_name].apply(len) == (seqL)
        if not right_length.all():
            sys.stderr.write('''Not all sequences are the same length! 
                       Throwing out incorrect sequences!''')
            data_df = data_df.loc[right_length, :]
        data_df = data_df.reset_index(drop=True)

        if modeltype == 'MAT':
            if seqL != len(model_df.loc[:, 'pos']):
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'NBR':
            #one row per adjacent pair of positions
            if seqL != len(model_df.loc[:, 'pos']) + 1:
                raise SortSeqError(
                    'model length does not match dataset length')
        elif modeltype == 'PAIR':
            #one row per unordered pair of positions: C(seqL, 2). Computed
            #in closed form because scipy.misc.comb no longer exists in
            #current SciPy releases.
            if seqL * (seqL - 1) // 2 != len(model_df.loc[:, 'pos']):
                raise SortSeqError(
                    'model length does not match dataset length')

    #get column names of the counts columns (excluding total counts 'ct')
    col_headers = utils.get_column_headers(data_df)
    if 'ct' not in data_df.columns:
        data_df['ct'] = data_df[col_headers].sum(axis=1)

    #remove empty rows.
    data_df = data_df[data_df.ct != 0]

    #make a numpy array out of the model data frame
    model_df_headers = [
        'val_' + str(inv_dict[i]) for i in range(len(seq_dict))
    ]
    value = np.array(model_df[model_df_headers])

    #now we evaluate the expression of each sequence according to the model.
    #first convert to matrix representation of sequences
    seq_mat, wtrow = numerics.dataset2mutarray(data_df.copy(), modeltype)
    temp_df = data_df.copy()

    #evaluate energy of each sequence
    temp_df['val'] = numerics.eval_modelmatrix_on_mutarray(
        value, seq_mat, wtrow)

    #sort based on value
    temp_sorted = temp_df.sort_values(by='val')
    temp_sorted.reset_index(inplace=True, drop=True)

    #freg is a regularized plot which shows how sequences are distributed
    #in energy space.
    if return_freg:
        fig, ax = plt.subplots()
        try:
            MI, freg = EstimateMutualInfoforMImax.alt4(
                temp_sorted,
                coarse_graining_level=coarse_graining_level,
                return_freg=return_freg)
            ax.imshow(freg, interpolation='nearest', aspect='auto')
            fig.savefig(return_freg)
        finally:
            #release the figure; pyplot otherwise keeps every figure alive
            plt.close(fig)
    else:
        MI = EstimateMutualInfoforMImax.alt4(
            temp_sorted,
            coarse_graining_level=coarse_graining_level,
            return_freg=return_freg)

    #if we want to calculate error then use bootstrapping.
    if not err:
        Std = np.nan
    else:
        data_df_for_sub = data_df.copy()
        sub_MI = np.zeros(15)
        for i in range(15):
            sub_df = data_df_for_sub.sample(int(
                len(data_df_for_sub.index) / 2))
            sub_df.reset_index(inplace=True, drop=True)
            #sequences are already sliced, so start/end keep their defaults
            sub_MI[i], sub_std = main(
                sub_df,
                model_df,
                err=False,
                coarse_graining_level=coarse_graining_level)
        #NOTE(review): the 1/sqrt(2) presumably rescales the half-sample
        #spread to the full sample size - confirm.
        Std = np.std(sub_MI) / np.sqrt(2)

    #we can return linfoot correlation (rsquared) or return MI.
    if rsquared:
        return (1 - 2**(-2 * MI)), (1 - 2**(-2 * Std))
    else:
        return MI, Std