def main(input_fofn, input_fasta, ref_fasta, num_cpus):
    """
    Align input_fasta against ref_fasta with BLASR and pickle the resulting hits.

    input_fofn --- passed to iCEC.ProbFromQV together with input_fasta;
                   if None, iCEC.ProbFromModel(.01, .07, .06) is used instead
    input_fasta --- query fasta; every output file is named after it
    ref_fasta --- reference fasta; BLASR is told to use <ref_fasta>.sa via -sa
    num_cpus --- value given to BLASR -nproc

    Unless <input_fasta>.BLASR.DONE already exists, writes <input_fasta>_test.sh
    (the BLASR command followed by a `touch` of the DONE marker), launches it
    in the background, then polls until the marker appears.

    Writes <input_fasta>.partial_uc.pickle holding
      'partial_uc' --- dict of cID --> list of qIDs, for records whose last field is not None
      'nohit' --- set of record ids in input_fasta that never appeared in such a record
    and finally deletes <input_fasta>.blasr.

    Raises subprocess.CalledProcessError if the shell reports failure when launching the script.
    """
    cmd = ("blasr {i} {r} -bestn 5 -nproc {c} -m 5 -sa {r}.sa -maxScore -1000 "
           "-minPctIdentity 85 -out {i}.blasr").format(i=input_fasta, r=ref_fasta, c=num_cpus)
    done_filename = input_fasta + '.BLASR.DONE'
    blasr_filename = input_fasta + '.blasr'

    if not os.path.exists(done_filename):
        script_filename = input_fasta + '_test.sh'
        # NOTE(review): the script has no `set -e`, so the DONE marker is touched
        # even when blasr itself exits with an error.
        with open(script_filename, 'w') as f:
            f.write("#!/bin/bash\n")
            f.write(cmd + '\n')
            f.write("touch " + done_filename + '\n')
        # check_call raises on a non-zero exit status instead of returning it,
        # so failures have to be handled with except rather than a `!= 0` test.
        try:
            subprocess.check_call("bash {0}_test.sh &".format(input_fasta), shell=True)
        except subprocess.CalledProcessError:
            sys.stderr.write("Failed to launch {0}. abort!\n".format(script_filename))
            raise

    if input_fofn is None:
        probqv = iCEC.ProbFromModel(.01, .07, .06)
    else:
        probqv = iCEC.ProbFromQV(input_fofn, input_fasta)

    # Poll for the DONE marker with exponential backoff (2, 4, 8, ... seconds),
    # capped at max_sleep so the wait between checks never grows without bound.
    max_sleep = 60
    sleep_time = 1
    while not os.path.exists(done_filename):
        sleep_time = min(max_sleep, sleep_time * 2)
        print("sleep for another {0}".format(sleep_time))
        time.sleep(sleep_time)

    hits = iCEC.blasr_against_ref(blasr_filename, False, True, probqv.get_smoothed,
                                  ece_penalty=1, ece_min_len=10, same_strand_only=False)
    partial_uc = {}
    seen = set()
    for r in hits:
        # r[0] is qID, r[1] is cID
        if r[-1] is not None:
            partial_uc.setdefault(r[1], []).append(r[0])
            seen.add(r[0])

    with open(input_fasta) as handle:
        nohit = set(rec.id for rec in SeqIO.parse(handle, 'fasta')).difference(seen)

    # binary mode: pickle streams are bytes
    with open(input_fasta + '.partial_uc.pickle', 'wb') as f:
        dump({'partial_uc': partial_uc, 'nohit': nohit}, f)
    os.remove(blasr_filename)
def worker_run_blasr(query_filename, ref_filename, output_filename, maxScore, probQV, qv_prob_threshold, ece_penalty, ece_min_len):
    """
    Run BLASR for one query/reference pair and pickle per-alignment probabilities.

    Command executed:
        blasr {query} {ref} -m 5 -maxLCPLength 15 -maxScore {maxScore} -out {output}

    query_filename, ref_filename --- inputs given to BLASR
    output_filename --- BLASR output; parsed, then deleted once the pickle is written
    maxScore --- value given to BLASR -maxScore
    probQV --- object providing get_smoothed and calc_prob_from_aln
    qv_prob_threshold, ece_penalty, ece_min_len --- forwarded to iCEC.blasr_against_ref

    Writes <output_filename>.pickle containing dict of qID --> dict of cID --> value of
    probQV.calc_prob_from_aln. A qID that only has records with fakecigar None
    still gets an (empty) entry.

    Raises subprocess.CalledProcessError if BLASR exits with a non-zero status.
    """
    blasr_cmd = "blasr {q} {r} -m 5 -maxLCPLength 15 -maxScore {s} -out {o}".format(
        q=query_filename, r=ref_filename, s=maxScore, o=output_filename)
    # check_call raises on failure rather than returning the exit status, so
    # report the failing command from the except clause and let the error propagate.
    try:
        subprocess.check_call(blasr_cmd, shell=True)
    except subprocess.CalledProcessError:
        sys.stderr.write("Error running command {0}\n".format(blasr_cmd))
        raise

    sys.stderr.write("parsing blasr output {0}\n".format(output_filename))
    d = {}
    for qID, cID, qStart, qEnd, missed_q, missed_t, fakecigar, ece_arr in iCEC.blasr_against_ref(
            output_filename, is_FL=True, sID_starts_with_c=True,
            qver_get_func=probQV.get_smoothed, qv_prob_threshold=qv_prob_threshold,
            ece_penalty=ece_penalty, ece_min_len=ece_min_len):
        hits_for_query = d.setdefault(qID, {})
        if fakecigar is not None:
            hits_for_query[cID] = probQV.calc_prob_from_aln(qID, qStart, qEnd, fakecigar)

    # binary mode: pickle streams are bytes
    with open(output_filename + '.pickle', 'wb') as f:
        dump(d, f)
    os.remove(output_filename)
def init_cluster_by_clique(fasta_filename, qver_get_func, bestn=100, ece_penalty=1, ece_min_len=20, nproc=8, maxScore=-1000):
    """
    fasta_filename --- initial fasta filename, probably called aloha.fa_split00.fa
    qver_get_func --- function that returns QVs on reads
    bestn --- parameter in BLASR, higher helps in finding perfect cliques but bigger output
    nproc, maxScore --- parameter in BLASR, set maxScore appropriate to input transcript length
    ece_penalty, ece_min_len --- parameter in isoform hit calling

    Self-blasr input (skipped if <fasta_filename>.self.blasr already exists),
    then iteratively find all mutually exclusive cliques, seeding from nodes
    in decreasing order of degree. Every read that ends up in no clique
    becomes its own single-member cluster.

    Returns dict of cluster_index --> list of seqids
    which is the 'uc' dict that can be used by ICE

    Raises subprocess.CalledProcessError if BLASR exits with a non-zero status.
    """
    out_filename = fasta_filename + '.self.blasr'
    if os.path.exists(out_filename):
        sys.stderr.write("{0} already exists. No need to run BLASR.\n".format(out_filename))
    else:
        cmd = ("blasr {i} {i} -m 5 -maxLCPLength 15 -nproc {cpu} -maxScore {score} "
               "-bestn {n} -nCandidates {n} -out {o}").format(
                   i=fasta_filename, n=bestn, o=out_filename, cpu=nproc, score=maxScore)
        sys.stderr.write(cmd + "\n")
        subprocess.check_call(cmd, shell=True)

    G = nx.Graph()
    for r in iCEC.blasr_against_ref(out_filename, is_FL=True, sID_starts_with_c=False,
                                    qver_get_func=qver_get_func, ece_penalty=ece_penalty,
                                    ece_min_len=ece_min_len):
        if r[0] == r[1]:
            continue  # self hit, ignore
        if r[-1] is not None:
            sys.stderr.write("adding edge {0},{1}\n".format(r[0], r[1]))
            G.add_edge(r[0], r[1])

    uc = {}
    used = set()  # set, so the membership test in the final pass is O(1) per read
    ind = 0
    # dict()/list()/sorted() keep this working whether networkx returns plain
    # dicts and lists or view/iterator objects from degree()/neighbors()/nodes().
    deg = sorted(dict(G.degree()).items(), key=lambda x: x[1], reverse=True)
    for node, _ in deg:
        if node not in G:
            continue  # already removed as part of an earlier clique
        # just get the immediate neighbors since we're looking for perfect cliques
        G_prime = G.subgraph([node] + list(G.neighbors(node)))
        G_prime_nodes = list(G_prime.nodes())
        S, H = pClique.convert_graph_connectivity_to_sparse(G_prime, G_prime_nodes)
        seed_i = G_prime_nodes.index(node)
        tQ = pClique.grasp(S, H, 1., 5, seed_i)
        if len(tQ) > 0:
            c = [G_prime_nodes[i] for i in tQ]
            uc[ind] = c
            ind += 1
            used.update(c)
            G.remove_nodes_from(c)

    # The earlier approach enumerated nx.find_cliques(G) and took cliques in
    # decreasing size; it was dropped as inefficient on large graphs.

    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            if r.id not in used:
                uc[ind] = [r.id]
                ind += 1

    return uc