import itertools
import random
import sys

# Project-local dependencies, assumed to be imported from the surrounding
# repository: SeqIO, basic, params, SimpleGraph, Contig, ContigCollection,
# ContigStorage, ReadCollection, Aligner, DirDistributor, Scorer, AlignmentPiece.
# sys.stdout is assumed to be replaced by the project logger providing
# info() and trace().


def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference\n")
    # Use the longest record in the reference file as the reference.
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))), key=len)[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    # Sample roughly 5% of the non-overlapping 500bp windows of the reference
    # and introduce error_rate% random base substitutions into each sample.
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 // 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(Contig("".join(tmp), ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))
    sys.stderr.write("Aligning reads\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    # Keep only near-full-length alignments and group them by target contig.
    alignments = [al for al in alignments if len(al) > 450]
    alignments = sorted(alignments, key=lambda al: (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments, key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print(scorer.accurateScore(al.matchingSequence(), params.alignment_correction_radius))
                cnt += 1
                if cnt >= 5000:
                    break
        # Rescore at most 5000 alignments in total.
        if cnt >= 5000:
            break
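
# Minimal sketch of a command-line entry point, assuming this module is run as
# a script. The positional argument order (reads FASTA, reference FASTA, work
# directory, error rate in percent) and the integer parsing are assumptions
# for illustration, not part of the original module.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
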
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # Estimate the average coverage from the longest edges that together
        # span half of the total edge length.
        total = sum(l for l, c in covs) // 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum(l * c for l, c in tmp_cov)) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        # Mark short branching edges as nonunique if they align with high
        # identity to an alternative outgoing edge of the same vertex.
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and \
                                (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                # Keep unique, non-isolated edges with coverage close to the
                # estimated average; also rescue long edges whose coverage is
                # consistent with being unique.
                if edge.unique and (edge.len > params.min_isolated_length or
                                    len(graph.v[edge.end].out) > 0 or
                                    len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or \
                        (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)
    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
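
# A self-contained sketch of the coverage estimate used in
# CreateContigCollection above: sort edges by length, take the longest edges
# that together span half of the total edge length, and return their
# length-weighted average coverage, so that short (often repetitive) edges do
# not skew the estimate. The helper name and the toy numbers below are
# illustrative only.
def _length_weighted_avg_cov(covs):
    # covs: list of (length, coverage) pairs, one per graph edge
    total = sum(l for l, c in covs) // 2
    tmp_cov = []
    for l, c in sorted(covs)[::-1]:
        if total < 0:
            break
        tmp_cov.append((l, c))
        total -= l
    return float(sum(l * c for l, c in tmp_cov)) / sum(l for l, c in tmp_cov)

# Example: the short high-coverage edge (a likely repeat) is never reached, so
# _length_weighted_avg_cov([(100000, 30.0), (90000, 28.0), (10000, 95.0)])
# returns (100000 * 30 + 90000 * 28) / 190000 ~= 29.05.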