Example #1
 def analyseSegments(self, segs):
     # type: (List[Segment]) -> None
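     # Align all reads to contigs built from segments longer than 5 kb and
     # accumulate coverage histograms over window sizes k = 500 + 100 * i.
     # Contigs whose 1 kb-shrunk interior has positions of coverage below 3
     # are skipped.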
     contigs = ContigStorage()
     contigs.addAll([seg.asContig() for seg in segs if len(seg) > 5000])
     res = [] # type: List[Segment]
     for al in self.aligner.overlapAlign(self.reads, contigs):
         if basic.isCanonocal(al.seg_to.contig.id):
             res.append(al.seg_to)
         else:
             res.append(al.seg_to.RC())
     res = sorted(res, key=lambda seg: (seg.contig.id, seg.left))
     covs = [[0] * params.maxCoverageThreshold for i in range(100)]
     for contig, it in itertools.groupby(res, key = lambda seg: seg.contig):
         segs = list(it)
         shrink = contig.asSegment().shrink(1000)
         bad_seg = False
         for cov, slen in self.covSegments(shrink, segs, 1):
             if cov < 3:
                 bad_seg = True
         if bad_seg:
             continue
         for i in range(len(covs)):
             k = 500 + i * 100
             for cov, slen in self.covSegments(shrink, segs, k):
                 covs[i][min(cov, len(covs[i]) - 1)] += slen
     # The original filtered on 'covs[i] > 1000', comparing a list to an int;
     # summing the accumulated window lengths is the presumed intent.
     self.recs = [CoverageAnalyser.CoverageRecord(500 + i * 100, covs[i]) for i in range(len(covs)) if sum(covs[i]) > 1000]
Example #2
 def __init__(self, disjointigs, aligner):
     # type: (DisjointigCollection, Aligner) -> None
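     # Storage of NewLine objects keyed by id: cnt supplies numeric ids for
     # newly created lines and listeners receive line-modification events.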
     ContigStorage.__init__(self, [], True)
     self.disjointigs = disjointigs
     self.aligner = aligner
     self.items = dict()  # type: Dict[str, NewLine]
     self.cnt = 1
     self.listeners = []  # type: List[LineStorageListener]
     self.name_printer = None
Example #3
def alsToReads(als):
    # type: (List[AlignmentPiece]) -> ContigStorage
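    # Collect each alignment's query contig (the read) exactly once, keyed by id.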
    readIds = set()
    res = ContigStorage()
    for al in als:
        if al.seg_from.contig.id in readIds:
            continue
        readIds.add(al.seg_from.contig.id)
        res.add(al.seg_from.contig)
    return res
Example #4
def printAlignments(sam_handler, reference_handler, reads_handler):
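    # Load reference and query sequences, rebuild alignments from a SAM file,
    # and print each alignment with its split-read decomposition; '[' and ']'
    # in the annotation line mark where split sub-alignments begin and end,
    # and the totals of gap characters in both matching strings are printed last.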
    print "Loading reference"
    cc = ContigStorage(add_rc=False).loadFromFasta(reference_handler, False)
    print "Loading query"
    reads = ContigStorage().loadFromFasta(reads_handler, False)
    print "Loading result"
    res = []
    for rec in sam_parser.Samfile(sam_handler):
        if rec.query_name in reads.items and cc[rec.tname] is not None:
            al = AlignmentPiece.FromSamRecord(reads[rec.query_name], cc[rec.tname], rec)
            if al is None:
                print rec.query_name, rec.tname
                continue
            if al.seg_to.contig not in cc:
                al = al.rc
            res.append(al)
    print "Printing result", len(res)
    res = sorted(res, key = lambda al: al.seg_to.left)
#    res = sorted(res, key = lambda al: len(al))[::-1]
    up = 0
    down = 0
    for al in res:
        print al
        print list(al.splitRead())
        s1, s2 = al.asMatchingStrings()
        up += s1.count("-")
        down += s2.count("-")
        s = []
        if len(list(al.splitRead())) > 1:
            nums = []
            for al1 in al.splitRead():
                nums.append(al1.seg_from.left)
                nums.append(al1.seg_from.right - 1)
            cur_num = 0
            cur = al.seg_from.left

            for c in s1:
                if cur == nums[cur_num] and c != "-":
                    if cur_num % 2 == 0:
                        s.append("[")
                    else:
                        s.append("]")
                    cur_num += 1
                else:
                    if cur_num % 2 == 0:
                        s.append("-")
                    else:
                        s.append("+")
                if c != "-":
                    cur += 1
            print "".join(s)
        print s1
        print s2
    print up, down
Example #5
def main(args):
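    # args: [script, repeats.fasta, reads.fasta, k, output_dir].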
    dir = args[4]
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    sys.stdout.info("Starting graph-free recruitment")
    print " ".join(args)
    sys.stdout.info("Loading repeat sequences")
    seqs = ContigStorage().loadFromFasta(open(args[1], "r"), False)
    sys.stdout.info("Loading reads")
    reads = ContigStorage().loadFromFasta(open(args[2], "r"), False)
    k = int(args[3])
    recruit(seqs, reads, k, dir)
    sys.stdout.info("Finished graph-free recruitment")
Example #6
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads, polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
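    # Rebuild lines from an init file: each record gives a contig id plus the
    # ids of reads assigned to that line; short lines get their right end
    # polished and extended toward params.k + 100.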
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    f = TokenReader(open(lc_file, "r"))
    n = f.readInt()
    for i in range(n):
        id = f.readToken()
        contig = contigs[id]
        assert contig.id == id
        line = lines.addNew(contig.seq, contig.id)
        read_ids = f.readTokens()
        for al in aligner.overlapAlign([reads[rid] for rid in read_ids], ContigStorage([line])):
            if len(al.seg_to) >= min(params.k, len(line) - 100):
                tmp_line = al.seg_to.contig # type: NewLine
                tmp_line.addReadAlignment(al)
        if len(line) < params.k + 200:
            new_contig, new_als = polisher.polishEnd(list(line.read_alignments), max_extension=params.k + 100 - len(line))
            line.extendRight(new_contig.suffix(pos=len(line)).Seq(), new_als)
        line.correct_segments.add(line.asSegment().shrink(100))
        line.completely_resolved.add(line.asSegment().shrink(100))
        line.initial.add(AlignmentPiece.Identical(line.asSegment().asContig().asSegment(), line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    lines.writeToFasta(open(os.path.join(dir, "initial_lines.fasta"), "w"))
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
Example #7
 def alignAndFilter(self, reads, ref_storage, mode):
     # type: (Iterable[Contig], ContigStorage, str) -> Generator[AlignmentPiece]
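     # Stream SAM records grouped by query name; when the query changes, run
     # the mode-specific filter over the accumulated alignments and yield the
     # survivors. Alignments are split per reference (dotplot) or per read
     # (local, ava-pb) before filtering.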
     filter_func = self.filters[mode]  # renamed to avoid shadowing the builtin filter
     read_storage = ContigStorage(reads, False)
     als = []
     for rec in self.align(read_storage, list(ref_storage.unique()), mode):
         if rec.is_unmapped:
             continue
         if len(als) > 0 and rec.query_name != als[0].seg_from.contig.id:
             res = list(filter_func(als))
             for al in res:
                 yield al
             als = []
         if len(als) > 0:
             seq_from = als[0].seg_from.contig
         else:
             seq_from = read_storage[rec.query_name]
         seq_to = ref_storage[rec.tname]
         tmp = AlignmentPiece.FromSamRecord(seq_from, seq_to, rec)
         if tmp is not None:
             if mode == "dotplot":
                 als.extend(tmp.splitRef())
             elif mode in ("local", "ava-pb"):
                 als.extend(tmp.splitRead())
             else:
                 als.append(tmp)
     if len(als) > 0:
         res = list(filter_func(als))
         for al in res:
             yield al
Example #8
 def getRelevantAlignments(self, seg, min_overlap):
     # type: (Segment, int) -> Generator[AlignmentPiece]
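     # Collect reads whose stored alignments intersect seg, realign them to
     # the line, and yield alignments that overlap seg by more than
     # min_overlap and are longer than params.k.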
     sys.stdout.trace("Requesting read alignments for", seg, "using precomputed alignments")
     line = seg.contig #type: NewLine
     reads = ContigStorage()
     relevant_reads = line.read_alignments.allInter(seg, min_overlap)
     sys.stdout.trace("Using reads ", relevant_reads)
     for base_read_al in relevant_reads:
         for read in self.als.getAlignments(base_read_al.seg_from.contig.id, params.k):
             reads.add(read)
     cnt = 0
     for al in self.aligner.localAlign(reads, ContigStorage([seg.contig])):
         if al.seg_to.interSize(seg) > min_overlap and len(al) > params.k:
             yield al
             cnt += 1
     sys.stdout.trace("Request for read alignments for", seg, "yielded", cnt, "alignments")
Example #9
def printSegs(f, segs):
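    # Print the chosen contig segments from fasta file f as separate fasta
    # records; a right coordinate of 0 stands for the end of the contig.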
    c = ContigStorage().loadFromFasta(open(f, "r"), False)
    for seg in segs:
        if seg[2] == 0:
            seg[2] = len(c[seg[0]])
        SeqIO.write(c[seg[0]].segment(seg[1], seg[2]).asContig(), sys.stdout,
                    "fasta")
Example #10
 def checkAlignments(self, seg, als):
     # type: (Segment,List[AlignmentPiece]) -> None
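     # Sanity check: report reads that align to seg with overlap above
     # params.k but are absent from the provided alignment list.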
     rids = set([al.seg_from.contig.id for al in als])
     for al in self.aligner.localAlign(self.reads,
                                       ContigStorage([seg.contig])):
         if al.seg_to.interSize(
                 seg) > params.k and al.seg_from.contig.id not in rids:
             print "Missing alignment", al
Example #11
def draw(contigs_file, output_dir, k):
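    # Draw repeat components of a contig set as ASCII figures: extract the
    # components, merge their segments into blocks, lay the blocks out along
    # the X and Y axes, and print each nontrivial component.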
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")),
                 key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    components = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in components:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(components):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
Example #12
def splitSegKmeans(aligner, seg, mult, all_reads_list):
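    # Split a segment of multiplicity mult into copies: cluster read vectors
    # with KMeans, polish one consensus per cluster, and accept the split only
    # if the consensi are less than 98.5% identical to each other.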
    polisher = Polisher(aligner, aligner.dir_distributor)
    base = seg.asContig()
    rtv = readsToVectors(aligner, all_reads_list, base)
    kmeans = KMeans(n_clusters=mult, precompute_distances=True)
    recs = list(rtv.values())
    result = kmeans.fit_predict(X=[rec.v for rec in recs])
    print result
    clusters = dict()
    for i, c in enumerate(result):
        if c not in clusters:
            clusters[c] = []
        clusters[c].append(recs[i].al)
    for c in clusters.values():
        print str(c), ":", len(c)
    split_contigs = []
    split_reads = []
    for c in clusters.values():
        split_contigs.append(
            Contig(
                polisher.polishSmallSegment(base.asSegment(),
                                            c).seg_from.Seq(),
                str(len(split_contigs))))
        split_reads.append([al.seg_from.contig for al in c])
    # Start from 0 so the maximum reflects the observed identities; the
    # original started at 1, which made the 0.985 acceptance test always fail.
    maxpi = 0
    for i in range(mult):
        for j in range(mult):
            if i == j:
                sys.stdout.write("1.0 ")
                continue
            al = aligner.overlapAlign([split_contigs[i]],
                                      ContigStorage([split_contigs[j]
                                                     ])).next()
            sys.stdout.write(str(al.percentIdentity()) + " ")
            maxpi = max(maxpi, al.percentIdentity())
        print ""
    print "Maxpi:", maxpi
    if maxpi < 0.985:
        return zip(split_contigs, split_reads)
    else:
        return None
Example #13
 def polishSmallSegment(self, seg, als):
     # type: (Segment, List[AlignmentPiece]) -> AlignmentPiece
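     # Polish seg using the provided alignments; 200 bp random flanks are
     # attached on both sides and stripped again through the returned mapping,
     # which aligns the polished sequence back to seg.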
     ok = False
     for al in als:
         if al.seg_to.contains(seg):
             ok = True
     if not ok:
         sys.stdout.log(common.log_params.LogPriority.warning, "Warning",
                        seg, "has no covering reads")
         return AlignmentPiece.Identical(seg.asContig().asSegment(), seg)
     reads = []
     start = basic.randomSequence(200)
     end = basic.randomSequence(200)
     for al in als:
         new_seq = ""
         al = al.reduce(target=seg)
         if al.seg_to.left < seg.left + 20:
             new_seq += start
         new_seq += al.seg_from.Seq()
         if al.seg_to.right > seg.right - 20:
             new_seq += end
         reads.append(NamedSequence(new_seq, al.seg_from.contig.id))
     base = Contig(start + seg.Seq() + end, "base")
     polished = None
     try:
         polished = Contig(self.polish(reads, base), "polished")
     except PolishException:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "has a sequence very different from reads. Using reads to correct."
         )
         for al, read in zip(als, reads):
             if al.seg_to.contains(seg):
                 try:
                     polished = Contig(
                         self.polish(reads, Contig(read.seq, read.id)),
                         "polished")
                     break
                 except PolishException:
                     pass
     if polished is None:
         sys.stdout.log(
             common.log_params.LogPriority.warning, "Warning", seg,
             "could not be corrected even though some reads cover it.")
         polished = seg.asContig()
     als = list(self.aligner.overlapAlign([polished],
                                          ContigStorage([base])))
     for al in als:
         if al.seg_from.left < 10 and al.rc.seg_from.left < 10:
             mapping = AlignmentPiece.Identical(
                 base.segment(len(start),
                              len(base) - len(end)), seg)
             return al.compose(mapping)
     assert False, "No alignment from polished to base: " + str(als)
Example #14
def readsToVectors(aligner, reads_list, base):
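    # Embed reads as vectors: align them to the base and to up to 10 extra
    # consensi, each polished from a triple of read alignments; every
    # candidate base that all reads align to contributes one block to each
    # read's vector.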
    als = []
    rtv = dict()
    polisher = Polisher(aligner, aligner.dir_distributor)
    for al in fixAlDir(aligner.overlapAlign(reads_list, ContigStorage([base])),
                       base):
        if len(al.seg_to) < len(base) - 100:
            continue
        else:
            als.append(al)
            rtv[al.seg_from.contig.id] = ReadRecord(al).extend(toVector(al))
    reads_list = [al.seg_from.contig for al in als]
    bases = [base]
    for base_al1, base_al2, base_al3 in zip(als[0::3], als[1::3], als[2::3]):
        base_candidate = Contig(
            polisher.polishSmallSegment(
                base.asSegment(),
                [base_al1, base_al2, base_al3]).seg_from.Seq(),
            str(len(bases)))
        rtr_als = []
        read_ids = set()
        #        base_candidate = base_al.seg_from.asContig()
        for al in fixAlDir(
                aligner.overlapAlign(reads_list,
                                     ContigStorage([base_candidate])),
                base_candidate):
            if len(al.seg_to) < len(base_candidate) - 100:
                continue
            else:
                rtr_als.append(al)
                read_ids.add(al.seg_from.contig.id)
        if len(read_ids) == len(als):
            bases.append(base_candidate)
            for al in rtr_als:
                rtv[al.seg_from.contig.id].extend(toVector(al))
            if len(bases) > 10:
                break
    for rec in rtv.values():
        print rec.read.id, len(rec.v), rec.v
    return rtv
Example #15
def main(k, dir, contigs_file, reads_file):
    # type: (int, str, str, str) -> None
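    # Collapse contig pairs (a sequence and its reverse complement) into
    # single records, align reads, and dump canonical reads that align to
    # more than one contig.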
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    params.k = k
    print "Loading contigs"
    tmp = sorted(ContigStorage().loadFromFasta(open(contigs_file, "r"),
                                               False).unique(),
                 key=lambda contig: len(contig))
    cnt = 1
    contigs = ContigStorage()
    for c1, c2 in zip(tmp[::2], tmp[1::2]):
        # if c1.seq == c2.rc.seq:
        contigs.add(Contig(c1.seq, str(cnt)))
        print cnt, c1.id, c2.id
        cnt += 1
        # else:
        #     contigs.add(Contig(c1.seq, str(cnt)))
        #     print cnt, c1.id
        #     cnt += 1
        #     contigs.add(Contig(c2.seq, str(cnt)))
        #     print cnt, c2.id
        #     cnt += 1
    print "Loading reads"
    reads = ReadCollection().loadFromFasta(open(reads_file, "r"))
    print "Aligning reads"
    for al in aligner.localAlign(reads, contigs):
        if len(al) > k:
            read = al.seg_from.contig  # type:AlignedRead
            read.addAlignment(al)
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in reads:
        if not basic.isCanonocal(read.id):
            continue
        if len(read.alignments) > 1:
            SeqIO.write(read, res, "fasta")
    res.close()
Example #16
def LoadLineCollection(dir, lc_file, aligner, contigs, disjointigs, reads,
                       polisher):
    # type: (str, str, Aligner, ContigStorage, DisjointigCollection, ReadCollection, Polisher) -> NewLineStorage
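    # Variant of line loading that realigns all reads when a line lists none
    # and keeps only the best alignment per read (by identity, then length).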
    sys.stdout.info("Initializing lines from init file", lc_file)
    lines = NewLineStorage(disjointigs, aligner)
    f = TokenReader(open(lc_file, "r"))
    n = f.readInt()
    for i in range(n):
        id = f.readToken()
        contig = contigs[id]
        assert contig.id == id
        line = lines.addNew(contig.seq, contig.id)
        read_ids = f.readTokens()
        als = []
        line_reads = [reads[rid] for rid in read_ids]
        if len(line_reads) == 0:
            sys.stdout.warn("No read alignments in initialization for line",
                            line.id, "Realigning all reads")
            line_reads = reads
        for al in aligner.overlapAlign(line_reads, ContigStorage([line])):
            if len(al.seg_to) >= min(1500, len(line) - 100):
                als.append(al)
        als = sorted(als,
                     key=lambda al: (al.seg_from.contig.id, -int(
                         al.percentIdentity() * 100), -len(al)))
        for key, read_als in itertools.groupby(
                als, key=lambda al: al.seg_from.contig.id):
            al = list(read_als)[0]
            tmp_line = al.seg_to.contig  # type: NewLine
            tmp_line.addReadAlignment(al)
        correct_seg = line.asSegment().shrink(100)
        if len(line) < params.k + 200:
            new_contig, new_als = polisher.polishEnd(
                list(line.read_alignments),
                max_extension=params.k + 100 - len(line))
            line.extendRight(new_contig.suffix(pos=len(line)).Seq(), new_als)
            if len(correct_seg) < params.k:
                correct_seg = correct_seg.expandRight(params.k -
                                                      len(correct_seg))
        line.correct_segments.add(correct_seg)
        line.completely_resolved.add(correct_seg)
        line.initial.add(
            AlignmentPiece.Identical(line.asSegment().asContig().asSegment(),
                                     line.asSegment()))
    sys.stdout.trace("Final list of lines:")
    for line in lines.unique():
        sys.stdout.trace(line, line.completely_resolved)
    lines.writeToFasta(open(os.path.join(dir, "initial_lines.fasta"), "w"))
    lines.alignDisjointigs()
    sys.stdout.info("Constructing line dot plot")
    return lines
Example #17
 def __init__(self,
              genome="",
              letter_size=550,
              error_rate=0.05,
              mutation_rate=0.005,
              seed=0):
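     # Build a random alphabet: each lowercase letter gets a random sequence
     # of letter_size bp and its uppercase twin gets a mutated copy; the
     # genome string is translated letter by letter into actual sequence.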
     random.seed(seed)
     self.reads = []  # type: List[NamedSequence]
     self.disjointigs = []  # type: List[NamedSequence]
     self.contigs = []  # type: List[NamedSequence]
     self.letter_size = letter_size
     self.error_rate = error_rate
     self.mutation_rate = mutation_rate
     self.alphabet = ContigStorage()
     self.matches = dict()
     for c1, c2 in zip(ascii_lowercase, ascii_uppercase):
         seq = self.generate(self.letter_size)
         self.alphabet.add(Contig(seq, c1))
         seq, matches = self.mutate(seq, self.mutation_rate)
         self.alphabet.add(Contig(seq, c2))
         self.matches[c1] = matches
         self.matches[c2] = [(b, a) for a, b in matches]
     self.genome = Contig(self.translate(genome), genome)
Example #18
 def collectRecords(self, corrected):
     # type: (List[Segment]) -> List[LineExtender.Record]
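     # For each corrected segment, visit all lines aligned to it and create a
     # record per completely resolved segment: choose a focus region, realign
     # the reads relevant to it, and keep alignments covering the focus by at
     # least params.k - 100.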
     sys.stdout.trace("Collecting records", corrected)
     read_bounds = dict()
     records = dict()  # type: Dict[Segment, LineExtender.Record]
     good_reads = set()
     for seg in corrected:
         sys.stdout.trace("Oppa initial:", seg)
         seg = seg.expandLeft(params.k)
         sys.stdout.trace("Alignments relevant for", seg,
                          list(self.dot_plot.allInter(seg)))
         for al in self.dot_plot.allInter(seg):
             seg1 = al.matchingSequence().mapSegUp(al.seg_from.contig, seg)
             line = al.seg_from.contig  # type:NewLine
             for seg_correct in line.correct_segments.allInter(al.seg_from):
                 for seg_resolved in line.completely_resolved.allInter(
                         seg_correct):
                     if seg_resolved in records:
                         continue
                     if seg_resolved.right == len(line):
                         next_start = len(line)
                     else:
                         next = line.completely_resolved.find(
                             line.asSegment().suffix(
                                 pos=seg_resolved.right), 1)
                         if next is None:
                             next_start = len(line)
                         else:
                             next_start = next.left
                     next_start = min(next_start, len(line) - 200)
                     focus = line.segment(
                         max(seg_resolved.left,
                             min(seg_resolved.right - params.k, seg1.left)),
                         min(seg_correct.right, next_start + params.k))
                     als = list(line.getRelevantAlignmentsFor(focus))
                     reads = ContigStorage()
                     for al in als:
                         reads.add(al.seg_from.contig)
                     als = list(
                         self.aligner.localAlign(reads.unique(),
                                                 ContigStorage([line])))
                     final_als = []
                     sys.stdout.trace("Focus:", focus, seg_resolved)
                     sys.stdout.trace(als)
                     for al in als:
                         if al.seg_to.contig == line.rc:
                             al = al.rc
                         if al.seg_to.interSize(focus) >= params.k - 100:
                             final_als.append(al)
                     sys.stdout.trace(final_als)
                     sys.stdout.trace("Finished realignment of reads")
                     records[seg_resolved] = self.createRecord(
                         seg_resolved, next_start, seg_correct, final_als,
                         good_reads, read_bounds)
     records = list(records.values())  # type: List[LineExtender.Record]
     return records
Example #19
def ExtendShortContigs(contigs, reads, aligner, polisher, read_dump):
    # type: (ContigStorage, ReadCollection, Aligner, Polisher, str) -> None
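    # Extend contigs shorter than params.k + 500 using read alignments from
    # either a flye read dump or a full realignment; contigs that cannot be
    # extended past params.k + 500 are removed.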
    sys.stdout.info("Extending short lines")
    short_contigs = ContigStorage()
    als = dict() # type: Dict[str, List[AlignmentPiece]]
    for contig in contigs.unique():
        if len(contig) < params.k + 500:
            short_contigs.add(contig)
            als[contig.id] = []
            als[contig.rc.id] = []

    if read_dump is not None:
        sys.stdout.trace("Using flye read dump file to extend short contigs")
        relevant_reads = RelevantReadsFromDump(read_dump, short_contigs, reads)
        for contig in short_contigs:
            for al in aligner.overlapAlign(relevant_reads[contig.id], ContigStorage([contig])):
                als[al.seg_to.contig.id].append(al)
                als[al.seg_to.contig.rc.id].append(al.rc)
    else:
        sys.stdout.trace("Realigning all reads to extend short contigs")
        for al in aligner.overlapAlign(reads, short_contigs):
            if al.seg_to.left <= 20 and al.rc.seg_to.left <= 20:
                added = False
                for i, al1 in enumerate(als[al.seg_to.contig.id]):
                    if al1.seg_from.contig.id == al.seg_from.contig.id:
                        added = True
                        if al.percentIdentity() > al1.percentIdentity():
                            als[al.seg_to.contig.id][i] = al
                            als[al.seg_to.contig.rc.id][i] = al.rc
                        break
                if not added:
                    als[al.seg_to.contig.id].append(al)
                    als[al.seg_to.contig.rc.id].append(al.rc)
    for contig in short_contigs.unique():
        if len(als[contig.id]) > 0:
            tmp_contig, new_als = polisher.polishEnd(als[contig.id], params.reliable_coverage, max_extension=params.l - len(contig))
            r = len(tmp_contig) - len(contig)
            tmp_contig, new_als = polisher.polishEnd([al.rc for al in new_als], params.reliable_coverage, max_extension=params.l - len(contig))
            l = len(tmp_contig) - len(contig) - r
        else:
            tmp_contig, new_als = contig, als[contig.id]
            l = 0
            r = 0
#        if l > params.k / 2 and r > params.k / 2:
#            tmp_contig.seq = tmp_contig.seq[l - params.k / 2:-r + params.k / 2]
#        else:
#            tmp_contig.seq = tmp_contig.seq[max(0, l - params.k):-max(1, r - params.k)]
        if len(tmp_contig) > params.k + 500:
            sys.stdout.info("Prolonged contig", contig.id, "for", l, "and", r, "nucleotides from left and right")
            contigs.add(Contig(tmp_contig.rc.seq, contig.id))
        else:
            sys.stdout.warn("Could not prolong contig", contig.id, "enough. Removing it.")
            contigs.remove(contig)
Example #20
def evaluatePI(dir, contigs_file, initial_file, ref_file):
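    # Estimate consensus quality: align initial contigs to the final contigs
    # to select segments of interest, then score alignments of both contig
    # sets against the reference inside those segments.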
    basic.ensure_dir_existance(dir)
    CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
    initial = ContigStorage().loadFromFasta(open(initial_file, "r"), False)
    ref = ContigStorage().loadFromFasta(open(ref_file, "r"), False)
    segs = []
    for al in aligner.overlapAlign(initial.unique(), contigs):
        if basic.isCanonocal(al.seg_to.contig.id):
            segs.append(al.seg_to)
        else:
            segs.append(al.rc.seg_to)
    segs = sorted(segs, key=lambda seg: basic.Normalize(seg.contig.id))
    interesting = dict()
    print "Interesting segments:"
    for contig in contigs:
        interesting[contig.id] = [contig.asSegment()]
    for contig, segit in itertools.groupby(segs, lambda seg: seg.contig):
        csegs = SegmentStorage().addAll(segit)
        csegs.mergeSegments()
        csegs = csegs.reverse(contig)
        interesting[contig.id] = list(csegs)
        print list(csegs)
    print "Analysis of contigs"
    scorer = Scorer()
    for al in aligner.localAlign(contigs.unique(), ref):
        print al
        for seg in interesting[al.seg_from.contig.id]:
            if al.seg_from.expand(500).contains(
                    seg) or al.seg_from.interSize(seg) > 40000:
                tmp_al = al.reduce(query=al.seg_from.cap(seg))
                # Capture the returned match events; the original discarded
                # them and printed an undefined name 'events' (assumed fix).
                events = scorer.polyshMatching(tmp_al.matchingSequence(),
                                               params.score_counting_radius)
                print tmp_al.seg_from, tmp_al.seg_to, str(events)
    print ""
    print "Analysis of initial"
    for al in aligner.overlapAlign(initial, ref):
        # Same assumed fix as above: keep the returned events for printing.
        events = scorer.polyshMatching(al.matchingSequence(),
                                       params.score_counting_radius)
        print al.seg_from, al.seg_to, str(events)
Example #21
def iter_align(aligner, contig1, contig2):
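    # Recursively align contig1 to contig2: take a chain of local alignments,
    # and for every gap longer than 400 bp between consecutive alignments,
    # align the gap sequences against each other and splice the results in.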
    als = sorted(aligner.localAlign([contig1], ContigStorage([contig2])),
                 key=lambda al: al.seg_from.left)
    split = [al.splitRef() for al in als]
    als = [
        al for al in itertools.chain(*split) if len(al) > 400
        and al.seg_from.contig == contig1 and al.seg_to.contig == contig2
    ]
    als = largestSubseq(als)
    res = []
    if len(als) > 0:
        for al1, al2 in zip(als[:-1], als[1:]):
            res.append(al1)
            # The second test repeated seg_from in the original; checking the
            # target side as well is the presumed intent.
            if al1.seg_from.dist(al2.seg_from) > 400 and al1.seg_to.dist(
                    al2.seg_to) > 400:
                seg1 = contig1.segment(al1.seg_from.right, al2.seg_from.left)
                seg2 = contig2.segment(al1.seg_to.right, al2.seg_to.left)
                tmp = iter_align(aligner, seg1.asContig(), seg2.asContig())
                for al in tmp:
                    res.append(al.queryAsSegment(seg1).targetAsSegment(seg2))
        res.append(als[-1])
    return res
Example #22
 def polishEnd(self, als, min_cov=4, min_cov_frac=0.7, max_extension=None):
     # type: (List[AlignmentPiece], int, float, Optional[int]) -> Tuple[Contig, List[AlignmentPiece]]
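     # Iteratively grow the right end of the contig: while enough reads stick
     # out past the end, pick a protruding read as a base, polish the
     # candidate extension with all protruding reads, cut it at the last
     # well-supported position and remap the read alignments onto the result.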
     if max_extension is None:
         max_extension = 10000000000
     scorer = Scorer()
     contig = als[0].seg_to.contig
     max_len = max_extension + len(contig)
     sys.stdout.trace("Polishing end of", als[0].seg_to.contig)
     new_contig = contig.asSegment().asContig()
     relevant_als = [
         al.changeTargetContig(new_contig) for al in als
         if al.rc.seg_to.left < 100
     ]
     finished_als = []
     while True:
         tmp = []
         for al in relevant_als:
             if al.seg_to.inter(new_contig.asSegment().suffix(
                     length=100)) and al.rc.seg_from.left > 100:
                 tmp.append(al)
             else:
                 finished_als.append(al)
         relevant_als = tmp
         if len(relevant_als) < min_cov:
             break
         start = "ACGTTCGA" + basic.randomSequence(
             params.flanking_size) + new_contig.asSegment().suffix(
                 length=min(params.flanking_size, len(new_contig))).Seq()
         reduced_read_list = [
             AlignedRead.new(
                 start + al.seg_from.contig.asSegment().suffix(
                     pos=al.seg_from.right).Seq(),
                 str(i) + "_" + al.seg_from.contig.id)
             for i, al in enumerate(relevant_als)
         ]
         reduced_reads = ReadCollection(reduced_read_list)
         found = False
         for base_al in relevant_als:
             if base_al.rc.seg_from.left < params.flanking_size:
                 continue
             # The base is a fixed marker plus params.flanking_size random nucleotides, the last flanking_size nucleotides of the current contig, and a read segment of length at most max(params.window_size, params.k).
             base_segment = base_al.seg_from.contig.segment(
                 base_al.seg_from.right,
                 min(
                     len(base_al.seg_from.contig), base_al.seg_from.right +
                     max(params.window_size, params.k)))
             base = Contig(start + base_segment.Seq(), "base")
             for read in reduced_read_list:
                 read.clean()
             polished_base = Contig(self.polish(reduced_reads, base),
                                    "polished_base")
             for al in self.aligner.localAlign(
                     reduced_reads,
                     ContigStorage().addAll([polished_base])):
                 reduced_reads.reads[al.seg_from.contig.id].addAlignment(al)
             candidate_alignments = []
             for read in reduced_read_list:
                 candidate_alignments.append(None)
                 for al in read.alignmentsTo(polished_base.asSegment()):
                     if al.seg_to.left == 0 and (
                         (candidate_alignments[-1] is None
                          or candidate_alignments[-1].seg_to.right <
                          al.seg_to.right)):
                         candidate_alignments[-1] = al
             trimmedAlignments = []
             for i, al in enumerate(candidate_alignments):
                 assert al is not None, reduced_read_list[i]
                 trimmedAlignments.append(al.trimByQuality(0.4, 100))
             contra_index = 0
             contra = []
             support = len(trimmedAlignments)
             cutoff_pos = len(start)
             for al in sorted(trimmedAlignments,
                              key=lambda al: al.seg_to.right):
                 while contra_index < len(contra) and contra[
                         contra_index].seg_to.right < al.seg_to.right - 50:
                     contra_index += 1
                 if support >= min_cov and len(contra) - contra_index <= (
                         1 - min_cov_frac) * support:
                     cutoff_pos = al.seg_to.right
                     support -= 1
                     if al.contradictingRTCRight():
                         contra.append(al)
                 else:
                     sys.stdout.trace("Stopped at:", support, contra_index,
                                      (1 - min_cov_frac) * support)
                     break
             sys.stdout.trace("Positions:",
                              [al.seg_to.right for al in trimmedAlignments])
             sys.stdout.trace("Contra:", contra)
             if cutoff_pos > len(start) + 100:
                 sys.stdout.trace("Chose to use read", base_al.__repr__(),
                                  "Extended for", cutoff_pos - len(start),
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
                 found = True
                 new_contig_candidate = Contig(
                     new_contig.seq + polished_base[len(start):cutoff_pos],
                     "candidate")
                 embedding = AlignmentPiece.Identical(
                     polished_base.segment(len(start), cutoff_pos),
                     new_contig_candidate.asSegment().suffix(
                         pos=len(new_contig)))
                 read_mappings = []
                 for al1, al2 in zip(candidate_alignments, relevant_als):
                     seg_from = al2.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     seg_to = al1.seg_from.contig.asSegment().suffix(
                         length=len(al1.seg_from.contig) - len(start))
                     read_mappings.append(
                         AlignmentPiece.Identical(seg_from, seg_to))
                 embedded_alignments = []
                 for al1, al2 in zip(candidate_alignments, read_mappings):
                     if al1.seg_to.right <= len(start) + 10:
                         embedded_alignments.append(None)
                     else:
                         tmp = al2.compose(al1)
                         if tmp.seg_to.left > embedding.seg_from.right - 10:
                             embedded_alignments.append(None)
                         else:
                             embedded_alignments.append(
                                 tmp.compose(embedding))
                 corrected_relevant_alignments = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in relevant_als
                 ]
                 relevant_als = []
                 for al1, al2 in zip(corrected_relevant_alignments,
                                     embedded_alignments):
                     if al2 is None:
                         al = al1
                     else:
                         al = al1.mergeDistant(al2)
                         if al is None:
                             al = al1
                         elif al1.seg_from.dist(
                                 al2.seg_from) >= 10 or al1.seg_to.dist(
                                     al2.seg_to) >= 10:
                             al = scorer.polyshAlignment(
                                 al, params.alignment_correction_radius)
                     relevant_als.append(al)
                 finished_als = [
                     al.targetAsSegment(
                         new_contig_candidate.asSegment().prefix(
                             len(new_contig))) for al in finished_als
                 ]
                 new_contig = new_contig_candidate
                 break
             else:
                 sys.stdout.trace("Could not prolong with read", base_al,
                                  "Alignments:")
                 sys.stdout.trace(map(str, reduced_read_list))
         if len(new_contig) >= max_len:
             break
         if not found:
             break
     return new_contig, relevant_als + finished_als
Example #23
    def __init__(self):
        self.num_iters = params.num_iters
        self.platform = params.technology
        self.threads = params.threads


if __name__ == "__main__":
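    # argv layout: [script, work_dir, reads_file, consensus_file, extra params];
    # passing "accurate" polishes each reference contig from its own alignments.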
    reads_file = sys.argv[2]
    consensus_file = sys.argv[3]
    dir = sys.argv[1]
    extra_params = sys.argv[4:]
    CreateLog(dir)
    dd = DirDistributor(dir)
    aligner = Aligner(dd)
    polisher = Polisher(aligner, dd)
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"),
                                          num_names=False)
    ref = ContigStorage().loadFromFasta(open(consensus_file, "r"),
                                        num_names=False)
    if "accurate" in extra_params:
        res = []
        als = sorted(aligner.overlapAlign(reads, ref),
                     key=lambda al: al.seg_to.contig.id)
        for rid, rals in itertools.groupby(als,
                                           key=lambda al: al.seg_to.contig.id):
            if basic.isCanonocal(rid):
                contig = ref[rid]
                corrected_seq = polisher.polishSegment(
                    contig.asSegment(), list(rals)).seg_from.Seq()
                res.append(Contig(corrected_seq, rid))
    else:
        res = polisher.polishMany(reads, list(ref.unique()))
Example #24
class TestDataset:
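    # Synthetic dataset generator: a genome is described as a string over a
    # letter alphabet, where each letter expands to a random sequence and the
    # uppercase twin of a letter is a mutated copy of the lowercase one.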
    def __init__(self,
                 genome="",
                 letter_size=550,
                 error_rate=0.05,
                 mutation_rate=0.005,
                 seed=0):
        random.seed(seed)
        self.reads = []  # type: List[NamedSequence]
        self.disjointigs = []  # type: List[NamedSequence]
        self.contigs = []  # type: List[NamedSequence]
        self.letter_size = letter_size
        self.error_rate = error_rate
        self.mutation_rate = mutation_rate
        self.alphabet = ContigStorage()
        self.matches = dict()
        for c1, c2 in zip(ascii_lowercase, ascii_uppercase):
            seq = self.generate(self.letter_size)
            self.alphabet.add(Contig(seq, c1))
            seq, matches = self.mutate(seq, self.mutation_rate)
            self.alphabet.add(Contig(seq, c2))
            self.matches[c1] = matches
            self.matches[c2] = [(b, a) for a, b in matches]
        self.genome = Contig(self.translate(genome), genome)

    def translate(self, seq):
        return "".join(map(lambda c: self.alphabet[c].seq, seq))

    def addRead(self, read_seq):
        name = "R" + str(len(self.reads)) + "_" + read_seq
        self.reads.append(
            NamedSequence(
                self.mutate(self.translate(read_seq), self.error_rate)[0],
                name))
        return name

    def addDisjointig(self, disjointig_seq):
        # type: (str) -> str
        self.disjointigs.append(
            NamedSequence(
                self.mutate(self.translate(disjointig_seq),
                            self.mutation_rate)[0],
                "D" + str(len(self.disjointigs)) + "_" + disjointig_seq))
        return self.disjointigs[-1].id

    def addContig(self, contig_seq):
        # type: (str) -> str
        name = "C" + str(len(self.contigs)) + "_" + contig_seq
        self.contigs.append(NamedSequence(self.translate(contig_seq), name))
        return name

    def generateReads(self, length=5, cov=15, circular=False):
        genome = self.genome.id
        if circular:
            genome = genome + genome[0:length - 1]
        for i in range(0, len(genome) - length + 1):
            for j in range((cov + length - 1) / length):
                self.addRead(genome[i:i + length])

    def generate(self, letter_size):
        # type: (int) -> str
        return "".join(
            [random.choice(["A", "C", "G", "T"]) for i in range(letter_size)])

    def genAll(self, aligner):
        # type: (Aligner) -> Tuple[NewLineStorage, LineDotPlot, ReadCollection]
        disjointigs = DisjointigCollection()
        for dis in self.disjointigs:
            disjointigs.addNew(dis.seq, dis.id)
        from disjointig_resolve.line_storage import NewLineStorage
        lines = NewLineStorage(disjointigs, aligner)
        lines.name_printer = lambda line: line.id + "_" + self.translateBack(
            line, aligner)
        for line in self.contigs:
            new_line = lines.addNew(line.seq, line.id)
            new_line.initial.add(
                AlignmentPiece.Identical(
                    new_line.asSegment().asContig().asSegment(),
                    new_line.asSegment()))
        dp = LineDotPlot(lines, aligner)
        dp.construct(aligner)
        lines.alignDisjointigs()
        reads = ReadCollection()
        for read in self.reads:
            reads.addNewRead(read)
        disjointigs.addAlignments(aligner.localAlign(reads, disjointigs))
        return lines, dp, reads

    def mutate(self, seq, rate):
        # type: (str, float) -> Tuple[str, List[Tuple[int, int]]]
        res = [seq[0]]
        matches = []
        matches.append((0, 0))
        cur = 1
        for i, c in enumerate(seq):
            if i == 0 or i == len(seq) - 1:
                continue
            if random.random() < rate:
                vars = ["A", "C", "G", "T"]
                vars.remove(c)
                res.append(random.choice([random.choice(vars), "", c + c]))
                cur += len(res[-1])
            else:
                res.append(c)
                matches.append((cur, i))
                cur += 1
        res.append(seq[-1])
        # Keep the (mutated_pos, original_pos) order used by the appends above;
        # the original swapped the final pair.
        matches.append((cur, len(seq) - 1))
        return "".join(res), matches

    def saveStructure(self, handler):
        # type: (TokenWriter) -> None
        handler.writeToken(self.genome.id)
        handler.writeInt(len(self.reads))
        for read in self.reads:
            handler.writeToken(read.id.split("_")[-1])
        handler.writeInt(len(self.disjointigs))
        for disjointig in self.disjointigs:
            handler.writeToken(disjointig.id.split("_")[-1])
        handler.writeInt(len(self.contigs))
        for contig in self.contigs:
            handler.writeToken(contig.id.split("_")[-1])

    @staticmethod
    def loadStructure(handler):
        # type: (TokenReader) -> TestDataset
        random.seed(0)
        res = TestDataset(handler.readToken())
        for i in range(handler.readInt()):
            res.addRead(handler.readToken())
        for i in range(handler.readInt()):
            res.addDisjointig(handler.readToken())
        for i in range(handler.readInt()):
            res.addContig(handler.readToken())
        return res

    def translateBack(self, contig, aligner):
        # type: (Contig, Aligner) -> str
        res = []
        for al in sorted(aligner.overlapAlign([contig], self.alphabet),
                         key=lambda al: al.seg_from.left):
            if len(res) > 0 and al.seg_from.interSize(
                    res[-1].seg_from) > self.letter_size / 2:
                if al.percentIdentity() > res[-1].percentIdentity():
                    res[-1] = al
            else:
                res.append(al)
        return "".join([al.seg_to.contig.id for al in res])
Example #25
 def __init__(self):
     ContigStorage.__init__(self, [], True)
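     # Re-stating items only narrows its declared type for the checker.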
     self.items = self.items  # type: Dict[str, Disjointig]
     self.cnt = 1
Example #26
def assemble(args, bin_path):
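    # Top-level assembly pipeline: prepare reads, disjointigs and unique
    # contigs (running Flye when needed), build lines and their dot plot,
    # then repeatedly extend and merge lines until no progress is made.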
    params.bin_path = bin_path
    start = time.time()
    cl_params = Params().parse(args)
    ref = ContigStorage()
    if cl_params.test:
        cl_params.reads_file = os.path.dirname(__file__) + "/../../test_dataset/reads.fasta"
        cl_params.genome_size = 30000
        cl_params.dir = os.path.dirname(__file__) + "/../../test_results"
        ref.loadFromFile(os.path.dirname(__file__) + "/../../test_dataset/axbctbdy.fasta", False)
    if cl_params.debug:
        params.save_alignments = True
    cl_params.check()
    CreateLog(cl_params.dir)
    sys.stdout.info("Command line:", " ".join(cl_params.args))
    sys.stdout.info("Started")
    if cl_params.debug:
        sys.stdout.info("Version:", subprocess.check_output(["git", "rev-parse", "HEAD"]))
        sys.stdout.info("Modifications:")
        print subprocess.check_output(["git", "diff"])
    sys.stdout.info("Preparing initial state")
    if cl_params.debug:
        save_handler = SaveHandler(os.path.join(cl_params.dir, "saves"))
    else:
        save_handler = None
    if cl_params.load_from is not None:
        # tmp = cl_params.focus
        sys.stdout.info("Loading initial state from saves")
        cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot = loadAll(TokenReader(open(cl_params.load_from, "r")))
        cl_params.parse(args)
        # cl_params.focus = tmp
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        dot_plot.printAll(sys.stdout)
        printState(lines)
    else:
        aligner = Aligner(DirDistributor(cl_params.alignmentDir()))
        polisher = Polisher(aligner, aligner.dir_distributor)

        reads = CreateReadCollection(cl_params.reads_file, cl_params.cut_reads, cl_params.downsample)


        if cl_params.contigs_file is None:
            sys.stdout.info("Running Flye")
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            subprocess.check_call([os.path.join(params.bin_path, "flye"), "--meta", "-o", assembly_dir, "-t", str(cl_params.threads), "--" + params.technology + "-raw", reads_file, "--genome-size", str(cl_params.genome_size), "--min-overlap", str(params.k)])
            cl_params.set_flye_dir(assembly_dir, cl_params.mode)
        elif len(cl_params.disjointigs_file_list) == 0:
            assembly_dir = os.path.join(cl_params.dir, "assembly_initial")
            reads_file = os.path.join(cl_params.dir, "actual_reads.fasta")
            reads.print_fasta(open(reads_file, "w"))
            disjointigs_file = constructDisjointigs(reads, params.expected_size, assembly_dir)
            # graph_file, contigs_file, disjointigs_file, rep_dir, graph_file_after, contigs_file_after = parseFlyeDir(assembly_dir)
            cl_params.disjointigs_file_list.append(disjointigs_file)
            params.min_contra_for_break = 8

        disjointigs = CreateDisjointigCollection(cl_params.disjointigs_file_list, cl_params.dir, aligner, reads)

        all_unique = cl_params.init_file is not None
        contigs = CreateContigCollection(cl_params.graph_file, cl_params.contigs_file, cl_params.min_cov, aligner, polisher, reads, cl_params.force_unique, all_unique)

        if cl_params.autoKL:
            adjustKL(aligner, reads, contigs)

        if cl_params.init_file is None:
            ExtendShortContigs(contigs, reads, aligner, polisher, cl_params.read_dump)
            lines = CreateLineCollection(cl_params.dir, aligner, contigs, disjointigs, reads, cl_params.split)
        else:
            lines = LoadLineCollection(cl_params.dir, cl_params.init_file, aligner, contigs, disjointigs, reads, polisher)

        sys.stdout.info("Constructing dot plot")
        dot_plot = LineDotPlot(lines, aligner)
        dot_plot.construct(aligner)
        # dot_plot.printAll(sys.stdout)

        sys.stdout.info("Updating sequences and resolved segments.")
        knotter = LineMerger(lines, Polisher(aligner, aligner.dir_distributor), dot_plot)
        extender = LineExtender(aligner, knotter, disjointigs, dot_plot)
        extender.updateAllStructures(itertools.chain.from_iterable(line.completely_resolved for line in lines))
        for line in list(lines.unique()): # type: NewLine
            line.completely_resolved.mergeSegments()
            if len(line.completely_resolved) == 0:
                lines.removeLine(line)
        if cl_params.debug:
            sys.stdout.info("Saving initial state")
            try:
                writer = save_handler.getWriter()
                sys.stdout.info("Save details:", writer.info)
                saveAll(writer, cl_params, aligner, contigs, reads, disjointigs, lines, dot_plot)
            except Exception as e:
                _, _, tb = sys.exc_info()
                sys.stdout.warn("Could not write save")
                traceback.print_tb(tb)
                sys.stdout.info("Message:", e.message)

    sys.stdout.trace("Disjointig alignments")
    for line in lines:
        sys.stdout.trace(line.disjointig_alignments)
    sys.stdout.info("Starting expanding alignment-consensus loop")

    EACL(aligner, cl_params, contigs, disjointigs, dot_plot, extender, lines, reads, save_handler)

    dot_plot.printAll(sys.stdout)

    sys.stdout.trace("Final result:")
    lines.printToFasta(open(os.path.join(cl_params.dir, "lines.fasta"), "w"))
    lines.printKnottedToFasta(open(os.path.join(cl_params.dir, "assembly.fasta"), "w"))
    printState(lines)
    sys.stdout.info("Finished")
    secs = int(time.time() - start)
    days = secs / 60 / 60 / 24
    hours = secs / 60 / 60 % 24
    mins = secs / 60 % 60
    sys.stdout.info("Finished in %d days, %d hours, %d minutes" % (days, hours, mins))
    if cl_params.test:
        passed = False
        for al in aligner.dotplotAlign(lines, ref):
            if len(al) > len(al.seg_to.contig) - 3000:
                passed = True
                break
        if passed:
            sys.stdout.info("Test passed")
        else:
            sys.stdout.info("Test failed")
Example #27
def CreateContigCollection(graph_file, contigs_file, min_cov, aligner, polisher, reads, force_unique, all_unique):
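    # Select unique graph edges as initial contigs: estimate average coverage
    # from the longest edges covering half the total length, drop edges whose
    # coverage or alignments to alternatives suggest they are repeats, and
    # polish the survivors.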
    sys.stdout.info("Creating contig collection")
    if force_unique is None and not all_unique:
        graph = SimpleGraph().ReadDot(graph_file)
        graph.FillSeq(contigs_file)
        covs = []
        for e in graph.e.values():
            covs.append((e.len, e.cov))
        tmp_cov = []
        # covs holds (length, coverage) pairs; take half of the total length
        # (the original unpacked in the wrong order and summed coverages).
        total = sum(l for l, c in covs) / 2
        for l, c in sorted(covs)[::-1]:
            if total < 0:
                break
            tmp_cov.append((l, c))
            total -= l
        avg_cov = float(sum([l * c for l, c in tmp_cov])) / sum(l for l, c in tmp_cov)
        sys.stdout.info("Average coverage determined:", avg_cov)
        nonunique = set()
        for edge in graph.e.values():
            if edge.unique and edge.len < 20000 and len(graph.v[edge.start].out) > 1:
                if edge.cov >= min_cov and (edge.cov < 0.8 * avg_cov or edge.len > 40000):
                    alter = ContigStorage()
                    for e in graph.v[edge.start].out:
                        if e != edge:
                            alter.add(Contig(e.seq, e.id))
                    for al in aligner.localAlign([Contig(edge.seq, edge.id)], alter):#type: AlignmentPiece
                        if al.percentIdentity() > 0.98 and (al.seg_from.left < 100 and al.seg_to.left < 100 and len(al) > min(500, edge.len)):
                            nonunique.add(edge.id)
                            nonunique.add(basic.Reverse(edge.id))
        contigs = ContigCollection()
        for edge in graph.e.values():
            if basic.isCanonocal(edge.id):
                if edge.unique and (edge.len > params.min_isolated_length or len(graph.v[edge.end].out) > 0 or len(graph.v[edge.start].inc) > 0):
                    if edge.cov >= min_cov and (edge.cov < 1.5 * avg_cov or edge.len > 40000):
                        if edge.id in nonunique:
                            sys.stdout.info("Edge removed based on alignment to alternative:", edge.id, edge.cov, edge.len)
                        else:
                            contigs.add(Contig(edge.seq, edge.id))
                    else:
                        sys.stdout.info("Edge removed based on coverage:", edge.id, edge.cov, edge.len)
                elif (edge.len > 100000 and edge.cov < 1.5 * avg_cov) or (edge.len > 40000 and 1.3 * avg_cov > edge.cov > 0.7 * avg_cov):
                    contigs.add(Contig(edge.seq, edge.id))
                    sys.stdout.info("Edge added based on length and coverage:", edge.id, edge.cov, edge.len)

    elif force_unique is not None:
        sys.stdout.info("Using forced unique edge set")
        sys.stdout.trace(force_unique)
        contigs = ContigCollection().loadFromFile(contigs_file).filter(lambda contig: contig.id in force_unique)
    else:
        sys.stdout.info("Considering all contigs unique")
        contigs = ContigCollection().loadFromFile(contigs_file)
    # contigs.loadFromFasta(open(contigs_file, "r"), num_names=True)
    # contigs = contigs.filter(lambda contig: contig.id not in nonunique and len(contig) > params.k + 20)
    sys.stdout.info("Created", len(contigs), "initial contigs")
    if not all_unique or force_unique is not None:
        sys.stdout.info("Polishing contigs")
        polished_contigs = polisher.polishMany(reads, list(contigs.unique()))
        contigs = ContigCollection().addAll(polished_contigs)
    else:
        sys.stdout.info("Skipping contig polishing step since manual unique contig initialization was used")
    return contigs
Example #28
import sys

sys.path.append("py")

from common import basic, params
from common.basic import CreateLog

from alignment.align_tools import Aligner, DirDistributor
from common.line_align import Scorer
from common.sequences import ContigStorage

if __name__ == "__main__":
    basic.ensure_dir_existance(sys.argv[1])
    CreateLog(sys.argv[1])
    reads = ContigStorage().loadFromFile(sys.argv[2])
    contigs = ContigStorage().loadFromFile(sys.argv[3])
    scorer = Scorer()
    dd = DirDistributor(sys.argv[1])
    aligner = Aligner(dd)
    for read in reads.unique():
        print "Processing read", read
        als = [
            scorer.polyshAlignment(al, params.alignment_correction_radius)
            for al in aligner.localAlign([read], contigs)
        ]
        for al1 in als:
            for al2 in als:
                if al1.seg_to.contig == al2.seg_to.contig:
                    continue
                print al1, "vs", al2
                # The source is truncated mid-call here; comparing al1 with
                # al2 is the apparent intent (assumed completion).
                scorer.scoreInCorrectSegments(al1, al2)
Example #29
def prolong(aligner, polisher, contig, reads):
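    # Extend both ends of the contig with reads: polish the right end, then
    # the right end of the reverse complement.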
    als = list(aligner.overlapAlign(reads.unique(), ContigStorage([contig])))
    contig, als = polisher.polishEnd(fixAlDir(als, contig), min_cov=5)
    contig, als = polisher.polishEnd([al.rc for al in als], min_cov=5)
    return contig
Example #30
def main(contigs_file, contig_name, reads_file, dir, k, initial_reads1, initial_reads2):
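    # Separate reads of a two-copy repeat: starting from two seed read sets,
    # iteratively polish one consensus per copy, score every read against
    # both, and reassign reads using the quartiles of the score differences.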
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    contigs = ContigStorage().loadFromFasta(open(contigs_file, "r"), False)
#    contig = contigs[contig_name].asSegment().prefix(length=2000).asContig()
    contig = contigs[contig_name]
    reads = ContigStorage().loadFromFasta(open(reads_file, "r"), False)
    reads1 = ContigStorage()
    reads2 = ContigStorage()
    cnt = 0
    for read in reads.unique():
        cnt += 1
#        if cnt % 2 == 0:
        if read.id in initial_reads1:
            reads1.add(read)
        elif read.id in initial_reads2:
            reads2.add(read)
    polisher = Polisher(aligner, dd)
    contig1 = contig
    contig2 = contig
    scorer = Scorer()
    for i in range(3):
        diff = 0
        print "Iteration", i
        als1 = fixAlDir(aligner.overlapAlign(reads1.unique(), ContigStorage([contig])), contig)
        als2 = fixAlDir(aligner.overlapAlign(reads2.unique(), ContigStorage([contig])), contig)
        contig1 = Contig(polisher.polishSmallSegment(contig.asSegment(), als1).seg_from.Seq(), "1")
        contig2 = Contig(polisher.polishSmallSegment(contig.asSegment(), als2).seg_from.Seq(), "2")
        al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
        als1 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig1])), contig1)
        als1 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als1)
        als2 = fixAlDir(aligner.overlapAlign(reads.unique(), ContigStorage([contig2])), contig2)
        als2 = filter(lambda al: len(al.seg_to) > len(al.seg_to.contig) - 100, als2)
        als1 = sorted(als1, key = lambda al: al.seg_from.contig.id)
        als2 = sorted(als2, key = lambda al: al.seg_from.contig.id)
        reads1 = ContigStorage()
        reads2 = ContigStorage()
        dp = scorer.accurateScore(al.matchingSequence(), 10) #1 - al.percentIdentity()
        als_map = dict()
        for al in als1:
            als_map[al.seg_from.contig.id] = [al]
        for al in als2:
            if al.seg_from.contig.id in als_map:
                als_map[al.seg_from.contig.id].append(al)
        com_res = []
        diffs = []
        for tmp_als in als_map.values():
            if len(tmp_als) != 2:
                continue
            al1 = tmp_als[0]
            al2 = tmp_als[1]
            print al1, al2
            assert al1.seg_from.contig == al2.seg_from.contig
            pi1 = scorer.accurateScore(al1.matchingSequence(), 10) # al1.percentIdentity()
            pi2 = scorer.accurateScore(al2.matchingSequence(), 10) # al2.percentIdentity()
            com_res.append((al1, al2, pi1 - pi2))
            diffs.append(pi1 - pi2)
        diffs = sorted(diffs)
        th1 = diffs[len(diffs) / 4]
        th2 = diffs[len(diffs) * 3 / 4]
        print "Thresholds:", th1, th2
        for al1, al2, diff in com_res:
            if diff < th1:
                reads1.add(al1.seg_from.contig)
            elif diff > th2:
                reads2.add(al2.seg_from.contig)
#           if pi1 > pi2 + dp / 4:
#               reads1.add(al1.seg_from.contig)
#           elif pi2 > pi1 + dp / 4:
#               reads2.add(al2.seg_from.contig)
#           diff += abs(pi1 - pi2)
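        # Note: the accumulation above is commented out, so 'diff' below holds
        # the value left by the last iteration of the loop over com_res.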
        print float(diff) / len(als1), len(reads1) / 2, len(reads2) / 2
    al = aligner.overlapAlign([contig1], ContigStorage([contig2])).next()
    print al
    print "\n".join(al.asMatchingStrings2())
    for read in reads1:
        if read.id in initial_reads1:
            sys.stdout.write(read.id + " ")
    print ""
    for read in reads2:
        if read.id in initial_reads2:
            sys.stdout.write(read.id + " ")
    print ""
    contig1 = prolong(aligner, polisher, contig1, reads1)
    contig2 = prolong(aligner, polisher, contig2, reads2)
    contig1.id = "1"
    contig2.id = "2"
    out = open(os.path.join(dir, "copies.fasta"), "w")
    SeqIO.write(contig1, out, "fasta")
    SeqIO.write(contig2, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads1.fasta"), "w")
    for read in reads1.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    out = open(os.path.join(dir, "reads2.fasta"), "w")
    for read in reads2.unique():
        SeqIO.write(read, out, "fasta")
    out.close()
    print "Finished"