import os
import sys

# Project-local dependencies (import paths depend on the repo layout):
# CreateLog, LogPriority, ContigCollection, Aligner, DirDistributor,
# extract_transfers, SimpleGraph, ContigStorage, Contig, basic, params,
# and SeqIO are assumed to be provided by the surrounding project.


def main(dir, contigs_file1, contigs_file2, unique_contigs_file):
    # Compare how two assemblies order the same set of unique contigs.
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    # Only unique contigs longer than 5kb are kept as anchors.
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    contigs1 = ContigCollection().loadFromFasta(open(contigs_file1, "r"), False)
    cals1 = list(aligner.overlapAlign(unique.unique(), contigs1))
    transfers1, term1, all1 = extract_transfers(contigs1, cals1)
    contigs2 = ContigCollection().loadFromFasta(open(contigs_file2, "r"), False)
    cals2 = list(aligner.overlapAlign(unique.unique(), contigs2))
    transfers2, term2, all2 = extract_transfers(contigs2, cals2)
    missing1 = []
    missing2 = []
    different = dict()
    unresolved1 = []
    unresolved2 = []
    same = []
    # Classify every unique contig (and its reverse complement) by comparing
    # the contig that follows it in each of the two assemblies.
    for ucontig in list(unique) + [contig.rc for contig in unique]:
        uid = ucontig.id
        in1 = uid in all1
        in2 = uid in all2
        if not in1 and not in2:
            continue
        if not in1:
            missing1.append(uid)
        elif not in2:
            missing2.append(uid)
        else:
            if all1[uid][0] == all2[uid][0]:
                same.append(uid)
            elif uid in transfers1 and uid in transfers2:
                different[uid] = (all1[uid][0], all2[uid][0])
            elif uid in transfers1:
                unresolved2.append(uid)
            elif uid in transfers2:
                unresolved1.append(uid)
    out = open(os.path.join(dir, "contigs.txt"), "w")
    out.write("Different: " + str(different) + "\n")
    out.write("Unresolved1: " + str(unresolved1) + "\n")
    out.write("Unresolved2: " + str(unresolved2) + "\n")
    out.write("Same: " + str(same) + "\n")
    out.write("Missing1: " + str(missing1) + "\n")
    out.write("Missing2: " + str(missing2) + "\n")
    out.write("Contig1 transfers: " + str(transfers1) + "\n")
    out.write("Contig1 term: " + str(term1) + "\n")
    out.write("Contig2 transfers: " + str(transfers2) + "\n")
    out.write("Contig2 term: " + str(term2) + "\n")
    out.close()
    print contigs_file1, contigs_file2
    print len(different), len(unresolved1), len(unresolved2), len(missing1), len(missing2), len(same)
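
# A minimal sketch of a command-line entry point for the comparison above.
# The argv layout (output dir, two assemblies, unique contigs file) is an
# assumption for illustration, not part of the original script.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
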
def main(dir, contigs_files, reference_file, unique_contigs_file):
    # Benchmark several assemblies against a reference: for each unique
    # contig, check whether the assembly places the same neighbor after it
    # as the reference does.
    CreateLog(dir)
    sys.stdout.level = LogPriority.warning
    ref = ContigCollection().loadFromFasta(open(reference_file, "r"), False)
    unique = ContigCollection().loadFromFasta(open(unique_contigs_file, "r"), False).filter(lambda contig: len(contig) > 5000)
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    ref_als = list(aligner.overlapAlign(unique.unique(), ref))
    ref_transfers, ref_term, all_ref_als = extract_transfers(ref, ref_als)
    # Treat terminal placements in the reference as transfers as well.
    for uid in ref_term:
        ref_transfers[uid] = ref_term[uid]
    print "#", "file", "wrong", "unresolved", "correct", "missing"
    for i, contigs_file in enumerate(contigs_files):
        contigs = ContigCollection().loadFromFasta(open(contigs_file, "r"), False)
        contig_als = list(aligner.overlapAlign(unique.unique(), contigs))
        contig_transfers, contig_term, all_contig_als = extract_transfers(contigs, contig_als)
        missing = []
        wrong = dict()
        unresolved = []
        correct = []
        for uid in ref_transfers:
            if uid not in contig_transfers and uid not in contig_term:
                # print uid, "missing"
                missing.append(uid)
            elif uid in contig_transfers:
                if ref_transfers[uid][0] == contig_transfers[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "wrong", ref_transfers[uid][0].id, contig_transfers[uid][0].id
                    wrong[uid] = (ref_transfers[uid][0], contig_transfers[uid][0])
            else:
                if ref_transfers[uid][0] == contig_term[uid][0]:
                    # print uid, "correct"
                    correct.append(uid)
                else:
                    # print uid, "unresolved"
                    unresolved.append(uid)
        out = open(os.path.join(dir, "contigs_" + str(i) + ".txt"), "w")
        out.write("Wrong: " + str(wrong) + "\n")
        out.write("Unresolved: " + str(unresolved) + "\n")
        out.write("Correct: " + str(correct) + "\n")
        out.write("Missing: " + str(missing) + "\n")
        out.write("Contig transfers: " + str(contig_transfers) + "\n")
        out.write("Contig term: " + str(contig_term) + "\n")
        out.write("Ref transfers: " + str(ref_transfers) + "\n")
        out.write("Ref als:\n")
        for c in all_ref_als:
            out.write(str(c) + "\n")
        out.write("Contig als:\n")
        for c in all_contig_als:
            out.write(str(c) + "\n")
        out.close()
        print "result", i, contigs_file, len(wrong), len(unresolved), len(correct), len(missing)
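
# Hypothetical invocation for the benchmark above: output dir, reference,
# unique contigs, then any number of assemblies to evaluate. This argv layout
# is an assumption for illustration only.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[4:], sys.argv[2], sys.argv[3])
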
def main(ref_file, contig_size, rlen, cov, dir):
    # Simulate perfect reads: sample windows of length rlen from every
    # sufficiently long reference contig at a uniform step.
    basic.ensure_dir_existance(dir)
    all_contigs = ContigCollection().loadFromFasta(open(ref_file, "r"), False)
    contig_file_name = os.path.join(dir, "contigs.fasta")
    contig_file = open(contig_file_name, "w")
    reads_file_name = os.path.join(dir, "reads.fasta")
    reads_file = open(reads_file_name, "w")
    for ref in all_contigs.unique():
        if len(ref) < contig_size:
            continue
        SeqIO.write(ref, contig_file, "fasta")
        # Reads start every rlen / cov bases, yielding roughly cov-fold
        # coverage of the contig interior.
        for i in range(0, len(ref), max(1, rlen / cov)):
            read = ref.segment(i, min(i + rlen, len(ref))).asNamedSequence()
            SeqIO.write(read, reads_file, "fasta")
    reads_file.close()
    contig_file.close()
    print "Done"
    print contig_file_name
    print reads_file_name
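
# Standard-library sanity check for the sampling scheme above: reads of
# length rlen starting every rlen / cov bases cover interior positions
# roughly cov times. expected_depth is an illustrative helper, not part of
# the original script.
def expected_depth(ref_len, rlen, cov):
    depth = [0] * ref_len
    step = max(1, rlen / cov)
    for i in range(0, ref_len, step):
        for j in range(i, min(i + rlen, ref_len)):
            depth[j] += 1
    return depth

# e.g. expected_depth(10000, 1000, 20)[5000] == 20; depth tapers off toward
# both contig ends, where fewer reads overlap a position.
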
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        # Estimate average unique-edge coverage as the length-weighted mean
        # over the longest edges that together span half of the total length.
        tmp_cov = []
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        # Mark short "unique" edges as nonunique if they align almost
        # perfectly to an alternative edge leaving the same vertex.
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
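
# Standalone illustration of the coverage estimator above: walk edges from
# longest to shortest until half of the total edge length is consumed, then
# return the length-weighted mean coverage of that prefix (roughly a
# length-weighted median). Pure-Python sketch; estimate_avg_cov is an
# illustrative name, not part of the original code.
def estimate_avg_cov(covs):
    # covs is a list of (length, coverage) pairs, one per graph edge.
    total = sum(l for l, c in covs) / 2
    tmp = []
    for l, c in sorted(covs)[::-1]:
        if total < 0:
            break
        tmp.append((l, c))
        total -= l
    return float(sum(l * c for l, c in tmp)) / sum(l for l, c in tmp)

# estimate_avg_cov([(100000, 30.1), (50000, 29.5), (2000, 61.0)]) == 30.1:
# the longest edge alone spans half the total length, so short high-coverage
# repeats do not skew the estimate.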