import itertools
import random
import sys

# Project-specific names used below (SeqIO, Contig, ContigCollection,
# ReadCollection, Aligner, DirDistributor, Scorer, basic, params) are assumed
# to come from the surrounding repository; their import paths are not shown here.


def main(reads_file, ref_file, dir, error_rate):
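    # Sample error-containing 500 bp segments from the longest reference
    # record, align the reads to them and print accurate alignment scores
    # for up to 5000 alignments.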
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))),
                 key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
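    # Sample roughly 5% of the non-overlapping 500 bp reference windows and
    # substitute error_rate% of their bases (with the complementary base) to
    # obtain error-containing reference segments.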
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
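    # Keep only near-full-length alignments (>450 bp of the 500 bp segments)
    # and order them by target and query contig so they can be grouped by
    # target below.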
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
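    # Score alignments per target segment, skipping segments with 150 or more
    # hits, and stop once 5000 alignments have been scored in total.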
    for contig, group in itertools.groupby(alignments,
                                           key=lambda al: al.seg_to.contig):
        group = list(group)
        sys.stderr.write(str(contig) + " " + str(len(group)) + "\n")
        if len(group) < 150:
            for al in group:
                print scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
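
# A minimal invocation sketch (not part of the original example); the argument
# order and types are assumptions: reads FASTA, reference FASTA, working
# directory and an integer error rate in percent.
if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2], sys.argv[3], int(sys.argv[4]))
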
Example #2
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
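    # Build the initial collection of presumably unique contigs from the
    # assembly graph. Three modes: automatic selection based on coverage,
    # length and alignment to alternative edges (default), a user-supplied
    # set of unique edges (force_unique), or treating all contigs as unique
    # (all_unique).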
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
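        # Estimate the average coverage over the longest edges that together
        # make up half of the total edge length.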
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
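        # A candidate unique edge shorter than 20 kb that shares its start
        # vertex with other edges is aligned to those alternatives; a
        # near-identical hit (>98% identity, starting near both left ends)
        # marks the edge and its reverse complement as non-unique.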
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):  # type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
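        # Select canonical edges as initial contigs using coverage and length
        # thresholds; edges flagged as non-unique above are skipped, while long
        # edges with near-average coverage are added even if not marked unique.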
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs