Ejemplo n.º 1
0
def main(input_fofn, input_fasta, ref_fasta, num_cpus):
    """
    Align input_fasta against ref_fasta with BLASR and pickle the resulting hits.

    input_fofn --- passed to iCEC.ProbFromQV together with input_fasta;
                   if None, iCEC.ProbFromModel(.01, .07, .06) is used instead
    input_fasta --- query fasta; also the prefix of every file written here
    ref_fasta --- reference fasta; BLASR is given {ref_fasta}.sa as suffix array
    num_cpus --- value passed to BLASR's -nproc

    Files written:
      {input_fasta}_test.sh --- shell script that runs BLASR, then touches the done file
      {input_fasta}.BLASR.DONE --- marker; if it already exists BLASR is not re-run
      {input_fasta}.blasr --- BLASR output, removed once it has been parsed
      {input_fasta}.partial_uc.pickle --- pickled dict with keys
          'partial_uc' (cID --> list of qIDs) and 'nohit' (set of fasta ids with no hit)

    Raises subprocess.CalledProcessError if the shell that launches the script
    exits with a non-zero status.
    """
    blasr_filename = input_fasta + '.blasr'
    cmd = "blasr {i} {r} -bestn 5 -nproc {c} -m 5 -sa {r}.sa -maxScore -1000 -minPctIdentity 85 -out {i}.blasr".format(
        i=input_fasta, r=ref_fasta, c=num_cpus)
    done_filename = input_fasta + '.BLASR.DONE'
    if not os.path.exists(done_filename):
        script_filename = input_fasta + '_test.sh'
        with open(script_filename, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write(cmd + '\n')
            f.write("touch " + done_filename + '\n')
        # The trailing '&' starts the script in the background, so the QV model
        # below is built while BLASR is running.
        # check_call raises on a non-zero exit status instead of returning it,
        # so the failure has to be reported from an except clause.
        try:
            subprocess.check_call("bash {0}_test.sh &".format(input_fasta), shell=True)
        except subprocess.CalledProcessError:
            sys.stderr.write("failed to launch {0}. abort!\n".format(script_filename))
            raise

    if input_fofn is None:
        probqv = iCEC.ProbFromModel(.01, .07, .06)
    else:
        probqv = iCEC.ProbFromQV(input_fofn, input_fasta)

    # Poll for the done marker with a doubling back-off capped at 60 seconds.
    # (The cap uses min(); with max() the interval grew without bound, so a
    # finished run could go unnoticed for hours.)
    sleep_time = 1
    while not os.path.exists(done_filename):
        sleep_time = min(60, sleep_time * 2)
        print("sleep for another {0}".format(sleep_time))
        time.sleep(sleep_time)

    hits = iCEC.blasr_against_ref(blasr_filename, False, True, probqv.get_smoothed, ece_penalty=1, ece_min_len=10, same_strand_only=False)

    partial_uc = {}
    seen = set()
    for r in hits:
        # r[0] is qID, r[1] is cID; only records whose last field is set are kept
        if r[-1] is not None:
            partial_uc.setdefault(r[1], []).append(r[0])
            seen.add(r[0])

    # every fasta id that never appeared as a kept query
    with open(input_fasta) as handle:
        nohit = set(rec.id for rec in SeqIO.parse(handle, 'fasta')).difference(seen)

    # binary mode so the pickle is written byte-for-byte on every platform
    with open(input_fasta + '.partial_uc.pickle', 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)

    os.remove(blasr_filename)
Ejemplo n.º 2
0
def worker_run_blasr(query_filename, ref_filename, output_filename, maxScore, probQV, qv_prob_threshold, ece_penalty, ece_min_len):
    """
    Run BLASR of query_filename against ref_filename and pickle the hit probabilities.

    The command executed is:
        blasr {query_filename} {ref_filename} -m 5 -maxLCPLength 15 -maxScore {maxScore} -out {output_filename}

    query_filename, ref_filename --- BLASR query and reference
    output_filename --- BLASR output; removed after it has been parsed
    maxScore --- value passed to BLASR's -maxScore
    probQV --- object providing get_smoothed (handed to iCEC.blasr_against_ref)
               and calc_prob_from_aln (used to score each kept alignment)
    qv_prob_threshold, ece_penalty, ece_min_len --- forwarded to iCEC.blasr_against_ref

    Writes {output_filename}.pickle containing a dict of
    qID --> {cID --> probQV.calc_prob_from_aln(...)}. A qID that was reported but
    had no alignment with a fakecigar maps to an empty dict.

    Raises subprocess.CalledProcessError if BLASR exits with a non-zero status.
    """
    d = {}
    blasr_cmd = "blasr {q} {r} -m 5 -maxLCPLength 15 -maxScore {s} -out {o}".format(q=query_filename, r=ref_filename, s=maxScore, o=output_filename)
    # check_call raises on a non-zero exit status instead of returning it, so
    # the error message has to be emitted from an except clause. The exception
    # is re-raised so callers keep seeing CalledProcessError.
    try:
        subprocess.check_call(blasr_cmd, shell=True)
    except subprocess.CalledProcessError:
        sys.stderr.write("Error running command {0}\n".format(blasr_cmd))
        raise
    sys.stderr.write("parsing blasr output {0}\n".format(output_filename))
    for qID, cID, qStart, qEnd, missed_q, missed_t, fakecigar, ece_arr in iCEC.blasr_against_ref(output_filename, is_FL=True, sID_starts_with_c=True, qver_get_func=probQV.get_smoothed, qv_prob_threshold=qv_prob_threshold, ece_penalty=ece_penalty, ece_min_len=ece_min_len):
        # register the query even when this record ends up being skipped
        hits_for_query = d.setdefault(qID, {})
        if fakecigar is not None:
            hits_for_query[cID] = probQV.calc_prob_from_aln(qID, qStart, qEnd, fakecigar)

    # binary mode so the pickle is written byte-for-byte on every platform
    with open(output_filename + '.pickle', 'wb') as f:
        dump(d, f)

    os.remove(output_filename)
Ejemplo n.º 3
0
def init_cluster_by_clique(fasta_filename, qver_get_func, bestn=100, ece_penalty=1, ece_min_len=20, nproc=8, maxScore=-1000):
    """
    fasta_filename --- initial fasta filename, probably called aloha.fa_split00.fa
    qver_get_func --- function that returns QVs on reads
    bestn --- parameter in BLASR, higher helps in finding perfect cliques but bigger output
              (used for both -bestn and -nCandidates)
    nproc, maxScore --- parameter in BLASR, set maxScore appropriate to input transcript length
    ece_penalty, ece_min_len --- parameter in isoform hit calling

    Self-blasr input then iteratively find all mutually exclusive cliques
    (seeded from nodes in decreasing order of degree).
    BLASR output goes to {fasta_filename}.self.blasr; if that file already
    exists it is reused and BLASR is not run again.

    Returns dict of cluster_index --> list of seqids
    which is the 'uc' dict that can be used by ICE.
    Sequences that end up in no clique each get their own single-member cluster.

    Raises subprocess.CalledProcessError if BLASR exits with a non-zero status.
    """
    out_filename = fasta_filename + '.self.blasr'

    if os.path.exists(out_filename):
        sys.stderr.write("{0} already exists. No need to run BLASR.\n".format(out_filename))
    else:
        cmd = "blasr {i} {i} -m 5 -maxLCPLength 15 -nproc {cpu} -maxScore {score} -bestn {n} -nCandidates {n} -out {o}".format(
            i=fasta_filename, n=bestn, o=out_filename, cpu=nproc, score=maxScore)
        sys.stderr.write(cmd + "\n")
        subprocess.check_call(cmd, shell=True)

    # Build an undirected graph with one edge per kept (query, subject) pair.
    G = nx.Graph()
    for r in iCEC.blasr_against_ref(out_filename, is_FL=True, sID_starts_with_c=False, qver_get_func=qver_get_func, ece_penalty=ece_penalty, ece_min_len=ece_min_len):
        if r[0] == r[1]: continue # self hit, ignore
        if r[-1] is not None:
            sys.stderr.write("adding edge {0},{1}\n".format(r[0], r[1]))
            G.add_edge(r[0], r[1])

    uc = {}
    # set, not list: it is probed once per fasta record in the final loop
    used = set()
    ind = 0

    # Visit seeds from highest to lowest degree. Degrees are a snapshot taken
    # before any node is removed.
    deg = sorted(G.degree().items(), key=lambda x: x[1], reverse=True)
    for d in deg:
        # already consumed by a clique found earlier
        if d[0] not in G: continue
        # just get the immediate neighbors since we're looking for perfect cliques
        G_prime = G.subgraph([d[0]] + G.neighbors(d[0]))
        G_prime_nodes = G_prime.nodes()
        S,H = pClique.convert_graph_connectivity_to_sparse(G_prime, G_prime_nodes)
        seed_i = G_prime_nodes.index(d[0])
        tQ = pClique.grasp(S, H, 1., 5, seed_i)
        if len(tQ) > 0:
            # tQ holds indices into G_prime_nodes
            c = [G_prime_nodes[i] for i in tQ]
            uc[ind] = c
            ind += 1
            used.update(c)
            # keeps later cliques mutually exclusive with this one
            G.remove_nodes_from(c)

    # NOTE: an earlier implementation enumerated nx.find_cliques(G) and took the
    # cliques in decreasing size; it was dropped as inefficient on large graphs.

    # Every sequence not placed in a clique becomes its own cluster.
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            if r.id not in used:
                uc[ind] = [r.id]
                ind += 1

    return uc