예제 #1
0
 def loadFromFile(self, fname, num_names=True):
     # type: (str, bool) -> ContigCollection
     """Add every record found in ``fname`` to this collection.

     Each record yielded by ``SeqIO.parse_by_name(fname)`` is wrapped in a
     ``Contig`` and passed to ``self.add``.  When ``num_names`` is true the
     record id is first run through ``basic.parseNegativeNumberAndMod`` and
     the result converted to a string; otherwise the id is used unchanged.

     Returns ``self`` so that calls can be chained.
     """
     for record in SeqIO.parse_by_name(fname):
         contig_id = record.id
         if num_names:
             contig_id = str(basic.parseNegativeNumberAndMod(contig_id))
         self.add(Contig(record.seq, contig_id))
     return self
예제 #2
0
def main(args):
    """Extract repeat-flanking contigs and the reads that touch the repeats.

    Positional inputs are taken from ``sys.argv`` (``args`` is only echoed to
    the log):

    * ``sys.argv[1]`` -- Flye output directory, resolved through
      ``cl_params.parseFlyeDir``;
    * ``sys.argv[2]`` -- file handed to ``parse``, which yields
      ``(repeats, starts, ends)``;
    * ``sys.argv[3]`` -- reads file;
    * ``sys.argv[4]`` -- output directory.

    Outputs written to the output directory:

    * ``contigs`` -- for every edge-id list in ``starts`` the concatenated
      edge sequences trimmed to their last 15000 characters; for every list
      in ``ends`` the first 15000 characters, reverse-complemented with
      ``basic.RC``;
    * ``reads.fasta`` -- every read whose alignment chain in
      ``read_alignment_dump`` contains at least one edge id from ``repeats``.
    """
    # Read id for which extra diagnostics are printed while scanning the dump.
    traced_read = "06dfd536-e258-446f-a019-8d8340ef831e"
    flank_length = 15000

    flye_dir = sys.argv[1]
    repeats, starts, ends = parse(sys.argv[2])
    graph_file, unique_file, disjointigs_file, rep_dir, tmp, their_file = \
        cl_params.parseFlyeDir(flye_dir)
    dump = os.path.join(rep_dir, "read_alignment_dump")
    reads_file = sys.argv[3]
    out_dir = sys.argv[4]
    CreateLog(out_dir)
    print(" ".join(args))

    print("Printing contigs")
    edges_file = os.path.join(rep_dir, "graph_before_rr.fasta")
    # The edges handle stays open for as long as ``edges`` is used, so this
    # is safe whether or not loadFromFasta reads the file eagerly.
    with open(edges_file, "r") as edges_handle:
        edges = ContigStorage().loadFromFasta(edges_handle)
        with open(os.path.join(out_dir, "contigs"), "w") as unique:
            for edge_list in starts:
                seq = "".join(edges[eid].seq for eid in edge_list)
                if len(seq) > flank_length:
                    seq = seq[-flank_length:]
                SeqIO.write(
                    NamedSequence(seq, "(" + "_".join(edge_list) + ")"),
                    unique, "fasta")
            for edge_list in ends:
                seq = "".join(edges[eid].seq for eid in edge_list)
                if len(seq) > flank_length:
                    seq = seq[:flank_length]
                SeqIO.write(
                    NamedSequence(basic.RC(seq),
                                  "(" + "_".join(edge_list) + ")"),
                    unique, "fasta")

    print("Selecting reads")
    reads = set()

    def flush_chain(read_id, edge_hits):
        # Record ``read_id`` if any edge of its chain is a repeat edge.
        if read_id == traced_read:
            print(edge_hits)
        if any(al in repeats for al in edge_hits):
            if read_id == traced_read:
                print("oppa")
            reads.add(read_id)

    cur_read = None
    als = []
    with open(dump) as dump_handle:
        # Stream the dump line by line instead of loading it whole.
        for line in dump_handle:
            if line.startswith("Chain"):
                # A "Chain" line closes whatever was collected so far.
                flush_chain(cur_read, als)
                als = []
            else:
                fields = line.split()
                # Field 2 is the read id with a one-character prefix; field 6
                # is "<sign>..._<edge id>".
                cur_read = fields[2][1:]
                eid = fields[6].split("_")[1]
                if fields[6][0] == "-":
                    eid = "-" + eid
                if cur_read == traced_read:
                    print(eid)
                als.append(eid)
    # Alignments collected after the last "Chain" line were previously never
    # examined, silently dropping the final read of the dump.
    if als:
        flush_chain(cur_read, als)

    print("Selected %d reads" % len(reads))
    print("\n".join(reads))

    print("Reading and printing reads")
    with open(os.path.join(out_dir, "reads.fasta"), "w") as freads:
        for cnt, read in enumerate(SeqIO.parse_by_name(reads_file), 1):
            if cnt % 10000 == 0:
                print(cnt)  # progress indicator
            if read.id in reads:
                SeqIO.write(read, freads, "fasta")
예제 #3
0
def main(flye_dir, rf, dir, edge_id, to_resolve, min_contig_length):
    """Split selected repeat edges of a Flye graph into their copies.

    Parameters:
        flye_dir: Flye output directory; the graph and the read alignment
            dump are read from its ``20-repeat`` subdirectory.
        rf: reads file, or the string ``"none"`` to stop after the graph
            component has been extracted.
        dir: output directory (created if missing).
        edge_id: comma-separated ids of the edges of interest.
        to_resolve: comma-separated flat list ``id1,mult1,id2,mult2,...`` of
            edges to split and the number of copies expected for each.
        min_contig_length: forwarded to ``uniqueNeighbours`` and
            ``splitRepeat``.

    Files written into ``dir``:
        ``reads.fasta``        reads aligned to any edge in ``edge_id``;
        ``random_down.fasta``  every fifth of those reads;
        ``contigs.fasta``      split copies followed by the unique edges;
        ``contigs.lc``         for each contig, its id on one line and the
                               ids of its reads on the next.

    Returns ``None``.  If ``splitRepeat`` fails for an edge the function
    reports it and stops; whatever was written up to that point is flushed
    and closed.
    """
    params.technology = "nano"
    basic.ensure_dir_existance(dir)
    basic.CreateLog(dir)
    dd = DirDistributor(os.path.join(dir, "alignments"))
    aligner = Aligner(dd)
    print(" ".join(sys.argv))

    print("Reading graph")
    repeat_dir = os.path.join(flye_dir, "20-repeat")
    graph = SimpleGraph().ReadDot(
        os.path.join(repeat_dir, "graph_before_rr.gv"))
    graph.FillSeq(os.path.join(repeat_dir, "graph_before_rr.fasta"), True)

    print("Extracting relevant graph component")
    edge_ids = edge_id.split(",")
    resolve_fields = to_resolve.split(",")
    # Pair up the flat "id,mult,id,mult,..." list.
    to_resolve = [(eid, int(mult))
                  for eid, mult in zip(resolve_fields[0::2],
                                       resolve_fields[1::2])]
    unique = uniqueNeighbours(edge_ids, graph, min_contig_length)

    if rf == "none":
        return

    print("Finding reads that align to %s" % (edge_ids,))
    reads_to_resolve = dict()  # type: Dict[str, List[str]]
    for eid, mult in to_resolve:
        reads_to_resolve[eid] = []
    for unique_edge, initial in unique:
        reads_to_resolve[initial] = []

    dump_path = os.path.join(repeat_dir, "read_alignment_dump")
    edge_id_set = set(edge_ids)  # constant-time membership in the scan below
    relevant_read_ids = set()
    # First pass: every read touching one of the requested edges.
    for rid, eid in parseReadDump(dump_path):
        if eid in edge_id_set:
            relevant_read_ids.add(rid)
            print("%s %s" % (rid, eid))
    # Second pass: distribute those reads over the edges being tracked.  It
    # needs the complete id set, hence the separate scan.
    for rid, eid in parseReadDump(dump_path):
        if rid in relevant_read_ids and eid in reads_to_resolve:
            reads_to_resolve[eid].append(rid)
    for eid in reads_to_resolve:
        reads_to_resolve[eid] = list(set(reads_to_resolve[eid]))

    print("Reading reads")
    res_reads = ContigStorage()
    with open(os.path.join(dir, "reads.fasta"), "w") as reads_out:
        for read in SeqIO.parse_by_name(rf):
            if read.id in relevant_read_ids:
                res_reads.add(Contig(read.seq, read.id))
                SeqIO.write(read, reads_out, "fasta")

    # Despite the file name this is a deterministic 1-in-5 subsample.
    with open(os.path.join(dir, "random_down.fasta"), "w") as random_down:
        for cnt, read in enumerate(res_reads):
            if cnt % 5 == 0:
                SeqIO.write(read, random_down, "fasta")

    # One handle per output file for the whole section: re-opening
    # contigs.fasta with "w" half-way through would truncate the split
    # contigs that were just written.
    with open(os.path.join(dir, "contigs.fasta"), "w") as res, \
            open(os.path.join(dir, "contigs.lc"), "w") as lcf:
        for eid, mult in to_resolve:
            repeat_reads = [res_reads[rid] for rid in reads_to_resolve[eid]]
            print(reads_to_resolve[eid])
            print([str(read) for read in repeat_reads])
            split_contigs = splitRepeat(aligner, graph.e[eid].seq, mult,
                                        repeat_reads, min_contig_length)
            if split_contigs is None:
                # Actually stop here; iterating over None would raise.
                print("Failed to resove edge %s Aborting" % (eid,))
                return
            print("Edge %s was split into %s copies" % (eid, mult))
            for contig, contig_reads in split_contigs:
                print(contig.id)
                SeqIO.write(contig, res, "fasta")
                lcf.write(contig.id + "\n")
                lcf.write(" ".join([r.id for r in contig_reads]) + "\n")
        for unique_edge, initial in unique:
            print(unique_edge.id)
            SeqIO.write(unique_edge, res, "fasta")
            lcf.write(unique_edge.id + "\n")
            lcf.write(" ".join(reads_to_resolve[initial]) + "\n")