Example 1
def main(contig_file, reads_file, sam_file, dir, contig_id):
    # type: (str, str, str, str, str) -> None
    basic.ensure_dir_existance(dir)
    contigs = ContigCollection()
    contigs.loadFromFasta(open(contig_file, "r"))
    print "Contigs loaded"
    contig = contigs[contig_id]
    read_names = set()
    for rec in Samfile(open(sam_file, "r")):
        read_names.add(rec.query_name)
    reads = ReadCollection()
    cnt = 0
    for rec in SeqIO.parse_fasta(open(reads_file, "r")):
        if rec.id in read_names:
            rec.id = "Read" + str(cnt)
            reads.add(AlignedRead(rec))
            cnt += 1
    reads.print_fasta(open(os.path.join(dir, "reads.fasta"), "w"))
    print "Reads loaded", len(reads)
    reads.addAllRC()
    print "RC added", len(reads)

    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    aligner.alignReadCollection(reads, contigs)
    print "Reads aligned", len(reads)
    reads = reads.inter(contig.asSegment())
    print "Reads filtered", len(reads)
    sorted_reads = sorted(list(reads.reads.values()), key = lambda read: read.alignmentsTo(contig.asSegment()).next().seg_to.left)
    for read in sorted_reads:
        print read
        for al in read.alignmentsTo(contig.asSegment()):
            print "\n".join(al.asMatchingStrings())
Example 2
 def loadFromFasta(self,
                   handler,
                   save_names=False,
                   int_ids=False,
                   filter=lambda rec: True):
     # type: (BinaryIO, bool, bool, Callable[[NamedSequence], bool]) -> DisjointigCollection
     recs = list(SeqIO.parse_fasta(handler))
     if save_names:
         for rec in recs:
             assert rec.id not in self.items.keys() and basic.Reverse(
                 rec.id) not in self.items.keys()
     for rec in recs:
         if not filter(rec):
             continue
         if save_names:
             number = basic.parseNegativeNumber(rec.id)
             if number is not None:
                 self.cnt = max(self.cnt, int(abs(number)) + 1)
             if int_ids:
                 self.addNew(rec.seq, str(number))
             else:
                 self.addNew(rec.seq)
         else:
             self.addNew(rec.seq)
     return self
Example 3
def readsN50(dir):
    for fn in os.listdir(dir):
        tmp = []
        f = os.path.join(dir, fn, fn + ".fasta")
        for rec in SeqIO.parse_fasta(open(f, "r")):
            if len(tmp) >= 1000:
                break
            tmp.append(len(rec))
        print fn, sorted(tmp)[len(tmp) / 2]
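Note that, despite its name, readsN50 prints the median of the first 1,000 read lengths in each file rather than a true N50. A minimal sketch of an actual N50 computation over the same list of lengths (assuming tmp holds read lengths, as above) could look like this:

def n50(lengths):
    # N50: the largest length L such that reads of length >= L
    # together cover at least half of the total bases.
    lengths = sorted(lengths, reverse=True)
    half = sum(lengths) / 2.0
    acc = 0
    for l in lengths:
        acc += l
        if acc >= half:
            return l
    return 0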
Example 4
def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    recs = []
    for fn in disjointigs_file_list:
        for rec in SeqIO.parse_fasta(open(fn, "r")):
            recs.append(rec)
    h = open(disjointigs_file, "w")
    for rec in recs:
        SeqIO.write(rec, h, "fasta")
    h.close()
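Since every record is written out unchanged, a streaming variant (a sketch, assuming the same SeqIO.parse_fasta and SeqIO.write interfaces used elsewhere in these examples) avoids collecting all records in memory first:

def prepare_disjointigs_file(disjointigs_file, disjointigs_file_list):
    h = open(disjointigs_file, "w")
    for fn in disjointigs_file_list:
        # Write each record straight to the output file as it is parsed.
        for rec in SeqIO.parse_fasta(open(fn, "r")):
            SeqIO.write(rec, h, "fasta")
    h.close()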
Example 5
 def loadFromFasta(self, handler, num_names=True):
     # type: (BinaryIO, bool) -> ContigCollection
     for rec in SeqIO.parse_fasta(handler):
         if num_names:
             self.add(
                 Contig(rec.seq,
                        str(basic.parseNegativeNumberAndMod(rec.id))))
         else:
             self.add(Contig(rec.seq, rec.id))
     return self
Example 6
 def FillSeq(self, f, numeric=True):
     for s in SeqIO.parse_fasta(open(f, "r")):
         if numeric:
             s.id = str(basic.parseNumber(s.id))
         if s.id in self.e:
             self.e[s.id].seq = s.seq
             self.e[s.id].len = len(s.seq)
         if "-" + s.id in self.e:
             self.e["-" + s.id].seq = basic.RC(s.seq)
             self.e["-" + s.id].len = len(s.seq)
     return self
Example 7
 def CheckSequences(self, reads, reads_file):
     # type: (Iterable[NamedSequence], str) -> bool
     if not os.path.exists(reads_file):
         return False
     try:
         for rec, read in itertools.izip_longest(SeqIO.parse_fasta(open(reads_file, "r")), reads):
             if str(rec.id) != str(read.id) or rec.seq != read.seq:
                 return False
         return True
     except:
         return False
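The bare except above relies on izip_longest yielding None for the shorter input when the two sequences differ in length. A more explicit sketch of the same check (assuming the same SeqIO.parse_fasta interface and records with id and seq attributes) compares the lengths directly:

import os
from common import SeqIO

def check_sequences(reads, reads_file):
    # Return True only if reads_file exists and its records match reads
    # one-to-one by id and sequence.
    if not os.path.exists(reads_file):
        return False
    recs = list(SeqIO.parse_fasta(open(reads_file, "r")))
    reads = list(reads)
    if len(recs) != len(reads):
        return False
    for rec, read in zip(recs, reads):
        if str(rec.id) != str(read.id) or rec.seq != read.seq:
            return False
    return True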
Example 8
 def FillSeq(self, f, numeric=True):
     for s in SeqIO.parse_fasta(open(f, "r")):
         if numeric:
             s.id = str(basic.parseNumber(s.id))
         if s.id in self.e:
             self.e[s.id].seq = s.seq
             self.e[s.id].len = len(s.seq)
         if basic.Reverse(s.id) in self.e:
             self.e[basic.Reverse(s.id)].seq = basic.RC(s.seq)
             self.e[basic.Reverse(s.id)].len = len(s.seq)
     for edge in self.e.values():
         assert (edge.seq is not None)
     return self
Example 9
def align(dir, contigs_file):
    CreateLog(dir)
    contigs = list(SeqIO.parse_fasta(open(contigs_file, "r")))
    assert len(contigs) == 2
    contigs = [
        Contig(contigs[0].seq, contigs[0].id),
        Contig(contigs[1].seq, contigs[1].id)
    ]
    aligner = Aligner(DirDistributor(os.path.join(dir, "alignments")))
    als = iter_align(aligner, contigs[0], contigs[1])
    printVar(os.path.join(dir, "diff.txt"), als)
    for al in als:
        print al
Example 10
def main(reads_file, ref_file, dir, error_rate):
    sys.stderr.write("Reading reference" + "\n")
    ref = sorted(list(SeqIO.parse_fasta(open(ref_file, "r"))),
                 key=lambda rec: len(rec))[-1]
    ref = Contig(ref.seq, ref.id)
    refs = ContigCollection()
    for i in range(0, len(ref) - 500, 500):
        if random.random() > 0.95:
            tmp = list(ref.segment(i, i + 500).Seq())
            for j in range(error_rate * 500 / 100):
                pos = random.randint(0, 499)
                tmp[pos] = basic.rc[tmp[pos]]
            refs.add(
                Contig("".join(tmp),
                       ref.id + "(" + str(i) + "," + str(i + 500) + ")"))
    refs.print_names(sys.stderr)
    sys.stderr.write("Reading reads" + "\n")
    reads = ReadCollection()
    reads.loadFromFasta(open(reads_file, "r"))

    sys.stderr.write("Aligning reads" + "\n")
    basic.ensure_dir_existance(dir)
    aligner = Aligner(DirDistributor(dir))
    aligner.alignReadCollection(reads, refs)
    sys.stderr.write("Analysing alignments" + "\n")
    alignments = []
    for read in reads:
        alignments.extend(read.alignments)
    alignments = filter(lambda al: len(al) > 450, alignments)
    alignments = sorted(alignments,
                        key=lambda al:
                        (al.seg_to.contig.id, al.seg_from.contig.id))
    scorer = Scorer()
    scorer.scores.homo_score = 3
    scorer.scores.ins_score = 5
    scorer.scores.del_score = 5
    cnt = 0
    for contig, iter in itertools.groupby(alignments,
                                          key=lambda al: al.seg_to.contig):
        iter = list(iter)
        sys.stderr.write(str(contig) + " " + str(len(iter)) + "\n")
        if len(iter) < 150:
            for al in iter:
                print scorer.accurateScore(al.matchingSequence(),
                                           params.alignment_correction_radius)
                cnt += 1
                if cnt >= 5000:
                    break
        if cnt >= 5000:
            break
Example 11
 def polish(self, reads, consensus):
     # type: (Iterable[NamedSequence], Contig) -> str
     dir, new_files, same = self.dir_distributor.fillNextDir([([consensus], "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(dir, "work"))
     job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     return list(SeqIO.parse_fasta(open(polished_file, "r")))[0].seq
Example 12
 def polishMany(self, reads, sequences):
     # type: (Iterable[AlignedRead], List[Contig]) -> List[Contig]
     dir, new_files, same = self.dir_distributor.fillNextDir([(list(sequences), "ref.fasta"), (reads, "reads.fasta")])
     consensus_file_name = new_files[0]
     reads_file_name = new_files[1]
     args = FakePolishingArgs()
     basic.ensure_dir_existance(os.path.join(dir, "work"))
     job = JobPolishing(args, os.path.join(dir, "work"), os.path.join(dir, "log.info"), [reads_file_name], consensus_file_name, "polish")
     polished_file = job.out_files["contigs"]
     if same and not params.clean and os.path.exists(polished_file):
         sys.stdout.trace("Polishing reused:", polished_file)
     else:
         sys.stdout.trace("Running polishing:", polished_file)
         job.run()
     return map(lambda rec: Contig(rec.seq, rec.id), SeqIO.parse_fasta(open(polished_file, "r")))
Example 13
def draw(contigs_file, output_dir, k):
    aligner = Aligner(DirDistributor(os.path.join(output_dir, "alignments")))
    CreateLog(output_dir)
    print "Reading contigs"
    tmp = sorted(SeqIO.parse_fasta(open(contigs_file, "r")),
                 key=lambda contig: len(contig))
    lens = map(len, tmp)[::-1]
    print lens
    contigs = ContigStorage()
    if lens[1::2] == lens[0::2]:
        tmp = tmp[0::2]
        print "Removed extra contigs"
    for i, contig in enumerate(tmp):
        print i, contig
        contigs.add(Contig(contig.seq, str(i)))
    print "Constructing components"
    componenets = ExtractRepeatComponents(contigs, aligner, k)
    print "Components:"
    for comp in componenets:
        print comp.segments
        print comp.alignments
    for cnt, comp in enumerate(componenets):
        print "Processing component", cnt
        print comp.segments
        # print comp.alignments
        print "Forming blocks"
        Block.id_cnt = 0
        blocks = CreateBlocks(comp)
        if len(blocks) == 1:
            print "Skipping trivial repeat"
            continue
        for block in blocks:
            print "Block", block.id, ":", block.segs
        for block in blocks:
            for other in block.out:
                print block.id, "->", other.id
        print "Placing blocks on X axis"
        code = placeX(blocks)
        if code == 1:
            print "WARNING: component", cnt, "contains cycle. Aborting visualization."
            continue
        print "Placing blocks on Y axis"
        placeY(blocks, comp.segments)
        print "Printing figure"
        SimplePrinter().printBlocks(blocks, sys.stdout)
        print "Finished printing figure"
Example 14
def main(flye_dir, rf, dir, edge_id, k):
    params.technology = "nano"
    params.k = k
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    print "Reading graph"
    graph = SimpleGraph().ReadGFA(os.path.join(flye_dir, "assembly_graph.gfa"))
    print "Parsing edge mapping"
    id_map = parseUPaths(flye_dir)
    edge_ids = edge_id.split(",")
    print "Extracting relevant graph component"
    res = open(os.path.join(dir, "contigs.fasta"), "w")
    unique = dict()
    for eid in edge_ids:
        for e in graph.v[graph.e[eid].start].inc:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "l"] = NamedSequence(
                        basic.RC(e.seq[:5000]), e.id[1:] + "l")
                else:
                    unique[e.id + "r"] = NamedSequence(e.seq[-5000:],
                                                       e.id + "r")
        for e in graph.v[graph.e[eid].end].out:
            if basic.Normalize(e.id) in edge_ids:
                continue
            if len(e.seq) < 10000:
                if e.id.startswith("-"):
                    unique[e.id[1:]] = NamedSequence(basic.RC(e.seq), e.id[1:])
                else:
                    unique[e.id] = NamedSequence(e.seq, e.id)
            else:
                if e.id.startswith("-"):
                    unique[e.id[1:] + "r"] = NamedSequence(
                        basic.RC(e.seq[-5000:]), e.id[1:] + "r")
                else:
                    unique[e.id + "l"] = NamedSequence(e.seq[:5000],
                                                       e.id + "l")

    for c in unique.values():
        print c.id
        SeqIO.write(c, res, "fasta")
    res.close()
    old_ids = []
    for eid in edge_ids:
        for olde in id_map[eid[len("edge_"):]]:
            old_ids.append(basic.Normalize(olde))
    print "Finding reads that align to", edge_ids
    print "Old ids:", old_ids
    relevant_read_ids = set()
    for s in open(os.path.join(flye_dir, "20-repeat", "read_alignment_dump"),
                  "r").readlines():
        s = s.split()
        if s[0] != "Aln":
            continue
        if s[6].split("_")[1] in old_ids:
            relevant_read_ids.add(s[2][1:])
            print s[2][1:], s[6].split("_")[1]
    print "Reading reads"
    res = open(os.path.join(dir, "reads.fasta"), "w")
    for read in SeqIO.parse_fasta(open(rf, "r")):
        if read.id in relevant_read_ids and len(read) > k * 1.2:
            SeqIO.write(read, res, "fasta")
    res.close()
Example 15
def extract(fname):
    res = dict()
    for read in SeqIO.parse_fasta(open(fname, "r")):
        res[read.id] = read.seq
    return res
Example 16
import sys
sys.path.append("py")

from common import SeqIO

rf = sys.argv[1]
outf = sys.argv[2]
total = int(sys.argv[3])
print "Reading reads"
reads = list(SeqIO.parse_fasta(open(rf, "r")))
print "Sorting reads"
reads = sorted(reads, key=lambda read: -len(read))
print "Printing reads"
out = open(outf, "w")
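# Emit the longest reads until the total base budget (argv[3]) is used up;
# the last read written may overshoot the budget.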
for read in reads:
    if total <= 0:
        break
    total -= len(read)
    SeqIO.write(read, out, "fasta")
out.close()
Example 17
def ReadReference(file):
    result = dict()
    for rec in SeqIO.parse_fasta(open(file, "r")):
        result[rec.id] = rec.seq
    return result
Example 18
import sys
import os

import common.seq_records

sys.path.append("py")
import common.SeqIO as SeqIO

read_len = int(sys.argv[2])
cov = float(sys.argv[3])
for seq in SeqIO.parse_fasta(open(sys.argv[1], "r")):
    sys.stderr.write(seq.id + " " + str(len(seq)) + " " + str(int(len(seq) * cov / read_len)) + "\n")
    # if len(seq) > 100000000 or len(seq) < 10000000:
    #      continue
    cur = 100000
    for i in range(0, len(seq), int(read_len / cov)):
        if i > cur:
            sys.stderr.write(str(cur) + "\n")
            cur = cur * 3 / 2
        SeqIO.write(common.seq_records.SeqRecord(seq.seq[i:min(len(seq), i + read_len)], seq.id + "_" + str(i)), sys.stdout, "fasta")
    break
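This script cuts overlapping reads of length argv[2] at coverage argv[3] from the reference FASTA in argv[1], writing them to stdout and progress to stderr; note that the trailing break means only the first reference sequence is processed. A hypothetical invocation (the script name is an assumption) might be:

python simulate_reads.py ref.fasta 2000 30 > reads.fasta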

Example 19
        vert = dict()
        cnt = 0
        for v in self.v:
            inc = list(self.inc(v))
            out = list(self.out(v))
            if (self.incdeg(v) != 1 or self.outdeg(v) != 1
                ) and self.incdeg(v) + self.outdeg(v) != 0:
                vert[v] = cnt
                cnt += 1
        for v in vert:
            for next in self.out(v):
                e = self.unique_forward(v[0], next)
                handler.write(
                    self.ntostr(vert[v]) + " -> " +
                    self.ntostr(vert[e[-self.k:]]) + "[label = " + "\"" +
                    str(len(e) - self.k) + ":" + str(self.cov(e)) + "\"" +
                    ", color = \"black\"] ;\n")
        handler.write("}\n")


sys.stderr.write("starting\n")
k = int(sys.argv[2])
edges = ConstructEdges(SeqIO.parse_fasta(open(sys.argv[1], "r")), k,
                       int(sys.argv[3]))
sys.stderr.write("constructing dbg\n")
g = DBG(edges, k)
sys.stderr.write("cleaning")
g.clean(20)
sys.stderr.write("printing dbg\n")
g.print_dot(sys.stdout)
Example 20
dir = os.path.join(sys.argv[4])
if not os.path.exists(dir):
    os.makedirs(dir)

names = sys.argv[1].split(";")
tmp = dict()
for s in names:
    if s.endswith("RC"):
        tmp[s[:-2]] = True
    else:
        tmp[s] = False
names = tmp

contigs_file = os.path.join(sys.argv[4], "contigs.fasta")
contigs_handler = open(contigs_file, "w")
for rec in SeqIO.parse_fasta(open(sys.argv[2], "r")):
    if rec.id in names:
        if names[rec.id]:
            rec.seq = RC(rec.seq)
            rec.id += "RC"
        SeqIO.write(rec, contigs_handler, "fasta")
contigs_handler.close()

alignment_dir = os.path.join(sys.argv[4], "alignment")
if not os.path.exists(alignment_dir):
    os.makedirs(alignment_dir)

alignment = os.path.join(sys.argv[4], "alignment.sam")
make_alignment(contigs_file, [sys.argv[3]], 8, alignment_dir, "pacbio",
               alignment)
Example 21
cur = None
dump = open(sys.argv[2]).readlines()
d = dict()
for s in dump:
    s = s.strip()
    if s == "":
        continue
    if s.startswith("#"):
        s = s[1:].split()
        if s[0] == "Repeat":
            repeat = s[1]
        if s[0] in ["All", "Input", "Output"]:
            cur = s[1]
    else:
        if repeat not in interest:
            continue
        sign = s[0]
        s = s[1:]
        if s not in d:
            d[s] = []
        d[s].append((repeat, cur, sign))
for rec in SeqIO.parse_fasta(open(sys.argv[1])):
    id = rec.id.split()[0]
    if id in d:
        tmp = d[id]
        if ("reads", "-") in [(a[1], a[2]) for a in tmp]:
            rec.seq = RC(rec.seq)
        SeqIO.write(
            common.seq_records.SeqRecord(rec.seq, id + "_" + str(d[id])),
            sys.stdout, "fasta")
        sys.stderr.write(id + "_" + str(d[id]) + "\n")
Example 22
import sys
sys.path.append("py")
from common import SeqIO

if __name__ == "__main__":
    contigs = list(SeqIO.parse_fasta(open(sys.argv[1], "r")))
    print "Total:", sum(map(
        len, contigs)), "nucleotides in", len(contigs), "contigs."
    for contig in contigs:
        print contig.id, len(contig)
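A hypothetical invocation (the script name is an assumption) prints the total assembly size followed by the id and length of every contig:

python contig_stats.py contigs.fasta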