Beispiel #1
0
import sys

import os

sys.path.append("py")
from common.SimpleGraph import SimpleGraph
from common import basic

# Draw the components of a dot graph into numbered .dot files.
#
# Command line, as read from sys.argv below:
#   argv[1]  - input graph in dot format
#   argv[2]  - output directory (created if missing)
#   argv[3:] - optional flags; "merge" replaces the graph with g.Merge()
g = SimpleGraph()
g.ReadDot(sys.argv[1])
basic.ensure_dir_existance(sys.argv[2])
args = sys.argv[3:]
if "merge" in args:
    g = g.Merge()
cnt = 0  # index of the next output file
oppa = []  # entries of small components, batched so they are drawn together
for comp in g.Split(1000000000):
    if len(comp) < 3:
        first = g.v[comp[0]]
        last = g.v[comp[-1]]
        degree = (len(first.inc) + len(first.out) +
                  len(last.inc) + len(last.out))
        # Small components whose end entries have at most two incident
        # edges in total are dropped; the rest are collected into the batch.
        if degree > 2:
            oppa.extend(comp)
        if len(oppa) <= 30:
            continue
        # The batch is large enough: draw it as one picture and start anew.
        comp = list(oppa)
        oppa = []
    print("%s %s" % (cnt, len(comp)))
    # The context manager closes the file even if Draw raises.
    with open(os.path.join(sys.argv[2], str(cnt) + ".dot"), "w") as f:
        g.Draw(comp, f)
    # Advance the index so each drawing gets its own file instead of
    # overwriting "0.dot".
    cnt += 1
# NOTE(review): entries still left in `oppa` here (a final batch of at most
# 30) are never drawn - confirm whether that is intended.
def _assign_reads_to_components(dump_file, component_records, edgecomp):
    """Feed every read of the alignment dump to the components it touches.

    For each (read id, edge ids) pair produced by AlignmentDumpParser the edge
    ids are normalized with basic.Normalize, the component ids of those edges
    are looked up in ``edgecomp`` and ``addRead`` is called once per such
    component. Progress is printed every 100000 reads.
    """
    rcnt = 0
    for rid, eids in AlignmentDumpParser(dump_file).parse():
        # A real list (not a lazy map) because it is traversed several times.
        eids = [basic.Normalize(eid) for eid in eids]
        compids = set()
        for eid in eids:
            for compid in edgecomp[eid]:
                compids.add(compid)
        for compid in compids:
            record = component_records[compid]
            component_edges = record.component.e
            if not any(eid in component_edges for eid in eids):
                # edgecomp pointed at this component, yet none of the read's
                # edges is in it: report the inconsistency and carry on.
                print("GOPA %s %s %s %s" % (compid, compids, rid, eids))
            # NOTE(review): the full edge list of the read is passed, not only
            # the edges belonging to this component - confirm this is intended.
            record.addRead(rid, eids)
        rcnt += 1
        if rcnt % 100000 == 0:
            print("Processed %s reads" % rcnt)


def _count_component_connections(component_records, flye_next):
    """Update per-component border and connection counters.

    For every unique edge of a component (both the edge and its
    basic.Reverse) this updates ``inc``/``out`` for edges ending in
    ``half()`` of the component, and classifies the continuation stored in
    ``flye_next`` for edges whose end is not a border vertex as unresolved
    (``None``), resolved, or resolved to an edge outside the component.
    """
    for comp_rec in component_records:
        half = comp_rec.half()
        component = comp_rec.component
        for norm_eid in comp_rec.unique:
            for eid in [norm_eid, basic.Reverse(norm_eid)]:
                if eid not in component.e:
                    # Only the non-canonical copy may be missing, and then
                    # its reverse has to be present.
                    assert not basic.isCanonocal(eid)
                    assert basic.Reverse(eid) in component.e
                    continue
                edge = component.e[eid]
                if edge.end in half:
                    if component.isBorder(edge.end):
                        comp_rec.out += 1
                    if component.isBorder(edge.start):
                        comp_rec.inc += 1
                if not component.isBorder(edge.end):
                    next_eid = flye_next[eid]
                    if next_eid is None:
                        comp_rec.unresolved_connections += 1
                    else:
                        comp_rec.resolved_connections.append((eid, next_eid))
                        if next_eid not in component.e:
                            comp_rec.outside_connections += 1


def _write_component_table(table_file, component_records):
    """Write one space-separated statistics row per component.

    The header names exactly the fields written in each row, in the same
    order, so the columns line up.
    """
    with open(table_file, "w") as f:
        f.write(
            "Id v e unique inc out repeats unresolved resolved outside "
            "zero hub badborder overcovered score\n"
        )
        for i, comp_rec in enumerate(component_records):
            comp = comp_rec.component
            row = [
                i,
                len(comp.v),
                len(comp.e),
                len(comp_rec.unique) * 2,
                comp_rec.inc,
                comp_rec.out,
                comp_rec.repeat_edges,
                comp_rec.unresolved_connections,
                len(comp_rec.resolved_connections),
                comp_rec.outside_connections,
                comp_rec.zero,
                comp_rec.red,
                comp_rec.bad_border,
                comp_rec.overcovered_edges,
                comp_rec.score(),
            ]
            f.write(" ".join(str(value) for value in row) + "\n")


def main(flye_dir, output_dir, diploid):
    """Split a Flye repeat graph into components and dump them to disk.

    Args:
        flye_dir: Flye output directory; "20-repeat/graph_before_rr.gv",
            "20-repeat/graph_before_rr.fasta", "20-repeat/read_alignment_dump"
            and "flye.log" are read from it.
        output_dir: directory that receives the results (created if missing).
        diploid: when true a DipolidCalculator is used, otherwise a
            HaploidCalculator; both are constructed with 150000.

    Writes into ``output_dir``:
        subdatasets/<i>/   - component.dump() output and "graph.dot"
        pics/<i>.dot       - a second drawing for components of length <= 100
        table.txt          - per-component statistics, one row per component
        list.txt           - the component indices, one per line

    Components are numbered in ascending order of ``score()``.
    """
    basic.ensure_dir_existance(output_dir)
    CreateLog(output_dir)
    # Single-argument print calls emit the same text under Python 2 and 3;
    # a multi-argument print(...) would print a tuple repr under Python 2.
    print("Version: %s" % subprocess.check_output(["git", "rev-parse", "HEAD"]))
    print("Modifications:")
    print(subprocess.check_output(["git", "diff"]))
    graph_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.gv")
    edge_file = os.path.join(flye_dir, "20-repeat", "graph_before_rr.fasta")
    dump_file = os.path.join(flye_dir, "20-repeat", "read_alignment_dump")
    if diploid:
        calculator = DipolidCalculator(150000)
    else:
        calculator = HaploidCalculator(150000)
    print("Reading graph from %s" % graph_file)
    graph = SimpleGraph()
    graph.ReadDot(graph_file)
    print("Reading sequences from %s" % edge_file)
    graph.FillSeq(edge_file, True)
    print("Splitting graph %s" % edge_file)
    componentRecords, edgecomp = constructComponentRecords(graph, calculator)
    print("Reading alignment dump from %s" % dump_file)
    _assign_reads_to_components(dump_file, componentRecords, edgecomp)
    print("Filling flye repeat resolution results")
    flye_next = FillFlyeNext(componentRecords,
                             os.path.join(flye_dir, "flye.log"))
    _count_component_connections(componentRecords, flye_next)

    basic.ensure_dir_existance(output_dir)
    print("Printing components to disk")
    subdataset_dir = os.path.join(output_dir, "subdatasets")
    basic.ensure_dir_existance(subdataset_dir)
    # sorted() is stable, so records with equal scores keep their order.
    componentRecords = sorted(componentRecords, key=lambda rec: rec.score())
    pics_dir = os.path.join(output_dir, "pics")
    basic.ensure_dir_existance(pics_dir)
    for i, component in enumerate(componentRecords):
        comp_dir = os.path.join(subdataset_dir, str(i))
        component.dump(comp_dir)
        component.draw(os.path.join(comp_dir, "graph.dot"), calculator)
        if len(component.component) <= 100:
            component.draw(os.path.join(pics_dir, str(i) + ".dot"),
                           calculator)

    table_file = os.path.join(output_dir, "table.txt")
    print("Printing table to file %s" % table_file)
    _write_component_table(table_file, componentRecords)
    with open(os.path.join(output_dir, "list.txt"), "w") as f:
        for i in range(len(componentRecords)):
            f.write(str(i) + "\n")