Beispiel #1
0
def main(model_df, contig_list, numsites=10, verbose=False):
    """Scan contigs with a model and return the highest-scoring sites.

    Every window of the model's length is scored on each contig. When the
    model's sequence type is "dna", the reverse-complement strand is scored
    as well. Only the ``numsites`` best-scoring sites are retained.

    Args:
        model_df: Model dataframe. It is checked with ``qc.validate_model``
            and must be of type "MAT" or "NBR".
        contig_list: Sequence of ``(contig_str, contig_name, pos_offset)``
            tuples. It is iterated twice, so it must not be a one-shot
            iterator. ``pos_offset`` is added to every window start to
            produce the reported "left"/"right" coordinates.
        numsites: Maximum number of sites to keep.
        verbose: If true, write one "." to stdout per scanned contig and a
            final newline.

    Returns:
        The site dataframe (columns "val", the sequence column, "left",
        "right", "ori", "contig"), sorted by descending "val" and passed
        through ``qc.validate_sitelist(..., fix=True)``.

    Raises:
        SortSeqError: If a contig contains a character outside the model's
            alphabet, if the model type is not supported, or if no contig
            is at least as long as the model.
    """
    # Determine type of string from model
    qc.validate_model(model_df)
    seqtype, modeltype = qc.get_model_type(model_df)
    # The returned dictionaries are not needed here; the call is kept in
    # case it rejects unsupported seqtype/modeltype combinations.
    utils.choose_dict(seqtype, modeltype=modeltype)

    # Check that all characters are from the correct alphabet
    alphabet = qc.seqtype_to_alphabet_dict[seqtype]
    invalid_char = re.compile(r"[^%s]" % alphabet)
    for contig_str, contig_name, pos_offset in contig_list:
        if invalid_char.search(contig_str):
            raise SortSeqError("Invalid character for seqtype %s found in %s." % (seqtype, contig_name))

    # Create model object to evaluate on seqs
    if modeltype == "MAT":
        model_obj = Models.LinearModel(model_df)
    elif modeltype == "NBR":
        model_obj = Models.NeighborModel(model_df)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise SortSeqError("Unrecognized model type %s." % modeltype)

    # Accumulate the best sites seen so far across all contigs
    seq_col = qc.seqtype_to_seqcolname_dict[seqtype]
    L = model_obj.length
    columns = ["val", seq_col, "left", "right", "ori", "contig"]
    sitelist_df = pd.DataFrame(columns=columns)
    for contig_str, contig_name, pos_offset in contig_list:
        # Contigs shorter than the model contain no full-length site
        if len(contig_str) < L:
            continue

        num_sites = len(contig_str) - L + 1
        poss = np.arange(num_sites).astype(int)

        # Forward-strand sites
        this_df = pd.DataFrame(columns=columns)
        this_df["left"] = poss + pos_offset
        this_df["right"] = poss + pos_offset + L - 1
        this_df[seq_col] = fast.seq2sitelist(contig_str, L)  # Cython
        this_df["ori"] = "+"
        this_df["contig"] = contig_name
        this_df["val"] = model_obj.evaluate(this_df[seq_col])
        sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # If scanning DNA, scan reverse-complement as well. The coordinates
        # are unchanged; only sequence, orientation and score are replaced.
        if seqtype == "dna":
            this_df[seq_col] = fast.seq2sitelist(contig_str, L, rc=True)  # Cython
            this_df["ori"] = "-"
            this_df["val"] = model_obj.evaluate(this_df[seq_col])
            sitelist_df = pd.concat([sitelist_df, this_df], ignore_index=True)

        # Sort by value, crop at numsites and reindex. Cropping after every
        # contig keeps the accumulated frame from growing with input size.
        sitelist_df = (
            sitelist_df.sort_values(by="val", ascending=False)
            .head(numsites)
            .reset_index(drop=True)
        )

        if verbose:
            # sys.stdout.write works on both Python 2 and Python 3
            sys.stdout.write(".")
            sys.stdout.flush()

    if verbose:
        sys.stdout.write("\n")
        sys.stdout.flush()

    # If no sites were found, raise error
    if sitelist_df.shape[0] == 0:
        raise SortSeqError("No full-length sites found within provided contigs.")

    sitelist_df = qc.validate_sitelist(sitelist_df, fix=True)
    return sitelist_df
Beispiel #2
0
# Report the timing of the reverse-complement benchmark. c_time, p_time and
# seq are assigned earlier in the script, outside this excerpt.
print 'cython rc: %f sec to rc one dna seq of length %d' % (c_time, len(seq))

# Ratio of the pure-Python time to the Cython time
print '%.1f-fold speedup.' % (p_time / c_time)

print '-----------------------------'
# Test seq2sitelist: time chopping seq into every window of site_length
# characters, first in pure Python and then with fast.seq2sitelist.

site_length = 20
# Pure-Python baseline: slice out each of the len(seq) - site_length + 1 windows
t = time.time()
x = [seq[i:(i + site_length)] for i in range(len(seq) - site_length + 1)]
p_time = time.time() - t
print 'python seq2sitelist: %f sec to chop a seq into %d sites'%\
    (p_time,len(x))

# Same task via fast.seq2sitelist.
# NOTE(review): safe=False presumably skips input validation -- confirm in fast.
t = time.time()
x = fast.seq2sitelist(seq, site_length, safe=False)
c_time = time.time() - t
print 'cython seq2sitelist: %f sec to chop a seq into %d sites'%\
    (c_time,len(x))

print '%.1f-fold speedup.' % (p_time / c_time)

print '-----------------------------'
# Test seq2sitelist rc feature

site_length = 20
# Baseline: chop seq with fast.seq2sitelist, then reverse-complement each
# site one at a time in a Python loop. p_time covers both steps.
t = time.time()
x = fast.seq2sitelist(seq, site_length, safe=False)
y = [fast.reverse_complement(s, safe=False) for s in x]
p_time = time.time() - t
print 'python, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\
Beispiel #3
0
# Report the timing of the reverse-complement benchmark. c_time, p_time and
# seq are assigned earlier in the script, outside this excerpt.
print 'cython rc: %f sec to rc one dna seq of length %d'%(c_time,len(seq))

# Ratio of the pure-Python time to the Cython time
print '%.1f-fold speedup.'%(p_time/c_time)

print '-----------------------------'
# Test seq2sitelist: time chopping seq into every window of site_length
# characters, first in pure Python and then with fast.seq2sitelist.

site_length = 20
# Pure-Python baseline: slice out each of the len(seq)-site_length+1 windows
t = time.time()
x = [seq[i:(i+site_length)] for i in range(len(seq)-site_length+1)]
p_time = time.time()-t
print 'python seq2sitelist: %f sec to chop a seq into %d sites'%\
    (p_time,len(x))

# Same task via fast.seq2sitelist.
# NOTE(review): safe=False presumably skips input validation -- confirm in fast.
t = time.time()
x = fast.seq2sitelist(seq,site_length, safe=False)
c_time = time.time()-t
print 'cython seq2sitelist: %f sec to chop a seq into %d sites'%\
    (c_time,len(x))

print '%.1f-fold speedup.'%(p_time/c_time)

print '-----------------------------'
# Test seq2sitelist rc feature

site_length = 20
# Baseline: chop seq with fast.seq2sitelist, then reverse-complement each
# site one at a time in a Python loop. p_time covers both steps.
t = time.time()
x = fast.seq2sitelist(seq,site_length, safe=False)
y = [fast.reverse_complement(s, safe=False) for s in x] 
p_time = time.time()-t
print 'python, seq2sitelist w/ rc: %f sec to chop seq into %d rc sites'%\