Esempio n. 1
0
def _read_id_list(id_path):
    """Return the content of ``id_path`` split on newlines.

    Element ``i`` of the result is the text of line ``i`` of the file.  A
    file that ends with a newline yields a trailing empty string, exactly as
    ``read().split("\n")`` does.  The file handle is closed before returning.
    """
    with open(id_path) as id_f:
        return id_f.read().split("\n")


def _las_file_index(las_path):
    """Return the integer N of a path whose last component is ``<name>.N.las``."""
    return int(las_path.split("/")[-1].split(".")[1])


def _pread_name_to_rid(pread_name):
    """Return the second "/"-separated field of ``pread_name`` divided by ten.

    NOTE(review): presumably pread names carry ten times the raw-read index
    in that field -- confirm against the tool that names the preads.
    ``//`` matches the original ``int(int(x) / 10)`` for non-negative values.
    """
    return int(pread_name.split("/")[1]) // 10


def _write_ranked_overlaps(out_f, ovlp_data):
    """Write one line per contig entry of ``ovlp_data`` in ascending order.

    Each value of ``ovlp_data`` is a list
    ``[score, count, q_id, o_id, ctg, in_ctg]``.  Entries are sorted by plain
    list comparison (score first) and the position in that order is written
    as the rank column.  ``write`` is used instead of ``print >> f`` so the
    output does not depend on whether ``print`` is a statement or a function.
    """
    for rank, (score, count, q_id, o_id, ctg,
               in_ctg) in enumerate(sorted(ovlp_data.values())):
        out_f.write("%09d %s %s %d %d %d %d\n" %
                    (q_id, o_id, ctg, count, rank, score, in_ctg))


def generate_read_to_contig_map(rawread_dir=rawread_dir,
                                pread_dir=pread_dir,
                                asm_dir=asm_dir):
    """Write read-to-contig mapping files under ``<asm_dir>/read_maps``.

    The work is scheduled on a ``PypeMPWorkflow`` in two phases, each ended
    by ``refreshTargets()`` (blocking, per the original author's note):

    1. Dump the read names of ``raw_reads.db`` and ``preads.db`` with
       ``DBshow -n`` into ``raw_read_ids`` and ``pread_ids``.
    2. Write ``read_maps/read_to_contig_map`` from ``sg_edges_list``,
       ``utg_data`` and ``ctg_paths``; then, for every ``raw_reads.N.las`` /
       ``preads.N.las`` found by ``glob`` when this function starts, write
       ``read_maps/rawread_to_contigs.N`` / ``read_maps/pread_to_contigs.N``.

    Args:
        rawread_dir: directory holding ``raw_reads.db`` and its ``.las`` files.
        pread_dir: directory holding ``preads.db`` and its ``.las`` files.
        asm_dir: directory holding the assembly graph files; the ``read_maps``
            output directory is created inside it.
    """

    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(
        os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        """Dump the first field of each ``DBshow -n`` line of the raw-read DB."""
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" %
                  (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        """Dump the first field of each ``DBshow -n`` line of the pread DB."""
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" %
                  (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    # The .las files are collected before the first refreshTargets() call.
    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        all_raw_las_files["r_las_%s" % _las_file_index(las_fn)] = \
            makePypeLocalFile(las_fn)

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        all_pread_las_files["p_las_%s" % _las_file_index(las_fn)] = \
            makePypeLocalFile(las_fn)

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths
    }

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        """Write ``pid rid oid ctg`` lines for every node of every contig graph."""
        pread_did_to_rid = _read_id_list(fn(self.pread_id_file))
        rid_to_oid = _read_id_list(fn(self.rawread_id_file))
        read_to_contig_map = fn(self.read_to_contig_map)

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data),
                         fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                # Contig names ending in "R" are skipped.
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    # Node names look like "<pid>:<suffix>"; only pid is used.
                    pid = int(n.split(":")[0])
                    rid = _pread_name_to_rid(pread_did_to_rid[pid])
                    oid = rid_to_oid[rid]
                    pread_to_contigs.setdefault((pid, rid, oid),
                                                set()).add(ctg)

            for (pid, rid, oid), ctgs in pread_to_contigs.items():
                for ctg in ctgs:
                    f.write("%09d %09d %s %s\n" % (pid, rid, oid, ctg))

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        """Rank, per raw read of one .las file, the contigs its overlaps hit.

        Output columns: read id, original id, contig, count, rank, score,
        in_ctg flag.  A read with no contig hit gets a single ``NA`` line.
        """
        rawread_db = fn(self.rawread_db)
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        rid_to_oid = _read_id_list(fn(self.rawread_id_file))

        # raw-read id -> (original id, set of contigs containing the read)
        rid_to_contigs = {}
        with open(fn(self.read_to_contig_map)) as f:
            for row in f:
                pid, rid, oid, ctg = row.strip().split()
                rid_to_contigs.setdefault(int(rid), (oid, set()))[1].add(ctg)

        with open(rawread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(
                    shlex.split("LA4Falcon -m %s %s " %
                                (rawread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    # A new query starts: flush what was gathered for the
                    # previous one (nothing to flush on the very first row).
                    if cur_read_id is not None:
                        if not ovlp_data:
                            f.write("%09d %s %s %d %d %d %d\n" %
                                    (cur_read_id, rid_to_oid[cur_read_id],
                                     "NA", 0, 0, 0, 0))
                        else:
                            _write_ranked_overlaps(f, ovlp_data)
                        ovlp_data = {}
                    cur_read_id = q_id

                if q_id in rid_to_contigs and not ovlp_data:
                    # The query itself is in some contig: seed its contigs
                    # with the negated column 7 and the in_ctg flag set.
                    # NOTE(review): column 7 is presumably the query length
                    # -- confirm against the LA4Falcon output format.
                    ctgs = rid_to_contigs[q_id][1]
                    o_id = rid_to_oid[q_id]
                    for ctg in ctgs:
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1

                if t_id not in rid_to_contigs:
                    continue

                ctgs = rid_to_contigs[t_id][1]
                o_id = rid_to_oid[q_id]

                # Accumulate column 2 (presumably the overlap score -- TODO
                # confirm) and the hit count for each contig of the target.
                for ctg in ctgs:
                    entry = ovlp_data.setdefault(
                        ctg, [0, 0, q_id, o_id, ctg, 0])
                    entry[0] += int(row[2])
                    entry[1] += 1

            # NOTE(review): unlike the in-loop flush, a final read without
            # any contig hit gets no "NA" line here; kept as in the original.
            if ovlp_data:
                _write_ranked_overlaps(f, ovlp_data)

    def dump_pread_to_ctg(self):
        """Rank, per pread of one .las file, the contigs its overlaps hit.

        Same output layout as ``dump_rawread_to_ctg``.  Once a pread is found
        to be part of a contig, its remaining overlap rows are ignored.
        """
        pread_db = fn(self.pread_db)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)

        pid_to_rid = _read_id_list(fn(self.pread_id_file))
        rid_to_oid = _read_id_list(fn(self.rawread_id_file))

        # pread id -> (original id, set of contigs containing the pread)
        pid_to_contigs = {}
        with open(fn(self.read_to_contig_map)) as f:
            for row in f:
                pid, rid, oid, ctg = row.strip().split()
                pid_to_contigs.setdefault(int(pid), (oid, set()))[1].add(ctg)

        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = False
            for row in sp.check_output(
                    shlex.split("LA4Falcon -mo %s %s " %
                                (pread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    # A new query starts: flush the previous one.
                    if cur_read_id is not None:
                        if not ovlp_data:
                            rid = _pread_name_to_rid(pid_to_rid[cur_read_id])
                            f.write("%09d %s %s %d %d %d %d\n" %
                                    (cur_read_id, rid_to_oid[rid], "NA", 0, 0,
                                     0, 0))
                        else:
                            _write_ranked_overlaps(f, ovlp_data)
                        ovlp_data = {}
                        skip_rest = False
                    cur_read_id = q_id

                if q_id in pid_to_contigs and not ovlp_data:
                    # The query itself is in some contig: record only that.
                    ctgs = pid_to_contigs[q_id][1]
                    o_id = rid_to_oid[_pread_name_to_rid(pid_to_rid[q_id])]
                    for ctg in ctgs:
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    skip_rest = True

                if skip_rest:
                    continue

                if t_id not in pid_to_contigs:
                    continue

                ctgs = pid_to_contigs[t_id][1]
                o_id = rid_to_oid[_pread_name_to_rid(pid_to_rid[q_id])]

                for ctg in ctgs:
                    entry = ovlp_data.setdefault(
                        ctg, [0, 0, q_id, o_id, ctg, 0])
                    entry[0] += int(row[2])
                    entry[1] += 1

            # NOTE(review): a final pread without any contig hit gets no
            # "NA" line here; kept as in the original.
            if ovlp_data:
                _write_ranked_overlaps(f, ovlp_data)

    # One mapping task per raw-read .las file.
    for las_file in all_raw_las_files.values():
        idx = _las_file_index(fn(las_file))
        rawread_to_contig_file = makePypeLocalFile(
            os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "rawread_db": rawread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file
            },
            outputs={"rawread_to_contig_file": rawread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/r_read_to_contigs.%s" % idx)
        wf.addTask(make_dump_rawread_to_ctg(dump_rawread_to_ctg))

    # One mapping task per pread .las file.
    for las_file in all_pread_las_files.values():
        idx = _las_file_index(fn(las_file))
        pread_to_contig_file = makePypeLocalFile(
            os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "pread_db": pread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file
            },
            outputs={"pread_to_contig_file": pread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/pread_to_contigs.%s" % idx)
        wf.addTask(make_dump_pread_to_ctg(dump_pread_to_ctg))

    wf.refreshTargets()  # block
Esempio n. 2
0
def testDistributed(runmode, cleanup):
    """Build and run a five-layer workflow of randomly wired test tasks.

    For each layer a random number (3-7) of input/output file objects is
    created under ``./testdata``; every task reads five randomly chosen
    inputs of its layer and produces one output of the next layer.  The
    random generator is seeded, so the wiring is reproducible.

    Args:
        runmode: how tasks are executed -- ``"internal"`` (Python function),
            ``"localshell"``, ``"sge"`` or ``"mixed"`` (shell script; in
            mixed mode every third task is flagged ``distributed``).
        cleanup: when equal to the string ``"1"``, every workflow object
            without file prerequisites gets ``start`` written to its file
            before the workflow is run.

    Raises:
        ValueError: if ``runmode`` is not one of the four supported values
            (the original fell through to a ``NameError`` on ``task``).

    Side effects: writes ``test.dot``, ``test_short_name.dot``, ``test.rdf``
    and, unless ``runmode`` is ``"internal"``, ``test.mk`` in the current
    directory, plus one ``task_l<layer>_w<w>.sh`` per task in ``./testdata``.
    """
    logger.info("test start")
    baseDir = "."
    import random
    random.seed(1984)
    PypeMPWorkflow.setNumThreadAllowed(20,20)
    wf = PypeMPWorkflow()
    allTasks = []
    for layer in range(5):
        fN = random.randint(3,7)
        fin = []
        fout = []
        fmut = []
        for w in range(fN):
            fin.append(makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer, w) ))
            fout.append(makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer+1, w) ))
            fmut.append(makePypeLocalFile(baseDir + "/testdata/m_testfile_l%d_w%d.dat" % (layer+1, w) ))

        for w in range(fN):
            inputDataObjs = {}
            outputDataObjs = {}
            mutableDataObjs = {}
            for i in range(5):
                inputDataObjs["infile%d" % i] = random.choice(fin)

            # The sampled mutable files are not registered, but the draw is
            # kept so the seeded random stream (and the resulting output key
            # "outfile2") stay exactly as before.
            sampled = random.sample(fmut,2)
            outputDataObjs["outfile%d" % len(sampled)] = fout[w]

            # One command per line.  The original concatenation produced the
            # invalid line "sleep 10sleep 1" between the two sections.
            scriptLines = ["sleep 1"]
            scriptLines += [ "echo %d %d ...  >> %s" % (layer, w, of.localFileName) for of in outputDataObjs.values() ]
            scriptLines += ["sleep 10", "sleep 1"]
            scriptLines += [ "echo %d %d ...  >> %s" % (layer, w, of.localFileName) for of in mutableDataObjs.values() ]
            scriptLines.append("sleep 10")
            shellCmd = "\n".join(scriptLines)
            shellFileName = baseDir + "/testdata/task_l%d_w%d.sh" % (layer, w)
            with open(shellFileName, 'w') as shfile:
                print(shellCmd, file=shfile)

            if runmode == "internal":
                def t1(self):
                    runShellCmd(["sleep", "%d" % random.randint(0,20) ])

                    for of in self.outputDataObjs.values():
                        runShellCmd(["touch", of.localFileName])

                task = PypeTask(inputDataObjs = inputDataObjs,
                                outputDataObjs = outputDataObjs,
                                mutableDataObjs = mutableDataObjs,
                                URL="task://internal/task_l%d_w%d" % (layer, w),
                                TaskType=PypeThreadTaskBase) ( t1 )

            elif runmode == "localshell":
                task = PypeShellTask(inputDataObjs = inputDataObjs,
                                     outputDataObjs = outputDataObjs,
                                     mutableDataObjs = mutableDataObjs,
                                     URL="task://localshell/task_l%d_w%d" % (layer, w),
                                     TaskType=PypeThreadTaskBase) ( "%s" % shellFileName )

            elif runmode == "sge":
                task = PypeSGETask(inputDataObjs = inputDataObjs,
                                   outputDataObjs = outputDataObjs,
                                   mutableDataObjs = mutableDataObjs,
                                   URL="task://sge/task_l%d_w%d" % (layer, w),
                                   TaskType=PypeThreadTaskBase) ( "%s" % shellFileName )

            elif runmode == "mixed":
                # Every third task of a layer is marked as distributed.
                distributed = (w % 3 == 0)
                task = PypeDistributibleTask(inputDataObjs = inputDataObjs,
                                   outputDataObjs = outputDataObjs,
                                   mutableDataObjs = mutableDataObjs,
                                   URL="task://sge/task_l%d_w%d" % (layer, w),
                                   distributed=distributed,
                                   TaskType=PypeThreadTaskBase) ( "%s" % shellFileName )

            else:
                raise ValueError("unknown runmode: %r" % (runmode,))

            wf.addTasks([task])
            allTasks.append(task)

    for URL in wf._pypeObjects:
        # File objects (other than URL itself) reachable through "prereq".
        prereqJobURLs = [str(u) for u in wf._RDFGraph.transitive_objects(URIRef(URL), pypeNS["prereq"])
                                        if isinstance(wf._pypeObjects[str(u)], PypeLocalFile) and str(u) != URL ]
        if not prereqJobURLs and cleanup == "1":
            os.system("echo start > %s" % wf._pypeObjects[URL].localFileName)
    wf.refreshTargets(allTasks)
    with open("test.dot","w") as dotFile:
        print(wf.graphvizDot, file=dotFile)
    with open("test_short_name.dot","w") as dotFile:
        print(wf.graphvizShortNameDot, file=dotFile)
    with open("test.rdf","w") as rdfFile:
        print(wf.RDFXML, file=rdfFile)
    if runmode != "internal":
        with open("test.mk","w") as mkFile:
            print(wf.makeFileStr, file=mkFile)
Esempio n. 3
0
# Register the raw-read ID dump task with the workflow.
wf.addTask(dump_rawread_ids)

# Workflow file objects for the pread database and the ID list dumped from it.
pread_db, pread_id_file = (
    makePypeLocalFile(os.path.join(pread_dir, "preads.db")),
    makePypeLocalFile(os.path.join(pread_dir, "pread_ids")),
)

@PypeTask(
    inputs={"pread_db": pread_db},
    outputs={"pread_id_file": pread_id_file},
    TaskType=PypeThreadTaskBase,
    URL="task://localhost/dump_pread_ids",
)
def dump_pread_ids(self):
    """Dump the read names of the pread DB to the pread ID file.

    Runs ``DBshow -n`` on the database, strips every ``>`` character and
    keeps the first whitespace-separated field of each line.
    """
    db_path = fn(self.pread_db)
    ids_path = fn(self.pread_id_file)
    command = "DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (db_path, ids_path)
    os.system(command)

wf.addTask(dump_pread_ids)
wf.refreshTargets()  # block

# Index every raw-read .las file found one directory level below
# rawread_dir (directories matching "m*") by the number in its file name.
all_raw_las_files = {}
for las_fn in glob.glob(os.path.join(rawread_dir, "m*/raw_reads.*.las")):
    # File name is "<name>.<number>.las"; the middle field is the index.
    idx = int(las_fn.split("/")[-1].split(".")[1])
    las_file = makePypeLocalFile(las_fn)
    all_raw_las_files["r_las_%s" % idx] = las_file

# Same indexing for the pread .las files below pread_dir.
all_pread_las_files = {}
for las_fn in glob.glob(os.path.join(pread_dir, "m*/preads.*.las")):
    idx = int(las_fn.split("/")[-1].split(".")[1])
    las_file = makePypeLocalFile(las_fn)
    all_pread_las_files["p_las_%s" % idx] = las_file
Esempio n. 4
0
def _read_id_list(id_path):
    """Return the content of ``id_path`` split on newlines.

    Element ``i`` of the result is the text of line ``i`` of the file.  A
    file that ends with a newline yields a trailing empty string, exactly as
    ``read().split("\n")`` does.  The file handle is closed before returning.
    """
    with open(id_path) as id_f:
        return id_f.read().split("\n")


def _las_file_index(las_path):
    """Return the integer N of a path whose last component is ``<name>.N.las``."""
    return int(las_path.split("/")[-1].split(".")[1])


def _pread_name_to_rid(pread_name):
    """Return the second "/"-separated field of ``pread_name`` divided by ten.

    NOTE(review): presumably pread names carry ten times the raw-read index
    in that field -- confirm against the tool that names the preads.
    ``//`` matches the original ``int(int(x) / 10)`` for non-negative values.
    """
    return int(pread_name.split("/")[1]) // 10


def _write_ranked_overlaps(out_f, ovlp_data):
    """Write one line per contig entry of ``ovlp_data`` in ascending order.

    Each value of ``ovlp_data`` is a list
    ``[score, count, q_id, o_id, ctg, in_ctg]``.  Entries are sorted by plain
    list comparison (score first) and the position in that order is written
    as the rank column.  ``write`` is used instead of ``print >> f`` so the
    output does not depend on whether ``print`` is a statement or a function.
    """
    for rank, (score, count, q_id, o_id, ctg, in_ctg) in enumerate(sorted(ovlp_data.values())):
        out_f.write("%09d %s %s %d %d %d %d\n" % (q_id, o_id, ctg, count, rank, score, in_ctg))


def generate_read_to_contig_map(rawread_dir=rawread_dir, pread_dir=pread_dir, asm_dir=asm_dir):
    """Write read-to-contig mapping files under ``<asm_dir>/read_maps``.

    The work is scheduled on a ``PypeMPWorkflow`` in two phases, each ended
    by ``refreshTargets()`` (blocking, per the original author's note):

    1. Dump the read names of ``raw_reads.db`` and ``preads.db`` with
       ``DBshow -n`` into ``raw_read_ids`` and ``pread_ids``.
    2. Write ``read_maps/read_to_contig_map`` from ``sg_edges_list``,
       ``utg_data`` and ``ctg_paths``; then, for every ``raw_reads.N.las`` /
       ``preads.N.las`` found by ``glob`` when this function starts, write
       ``read_maps/rawread_to_contigs.N`` / ``read_maps/pread_to_contigs.N``.

    Args:
        rawread_dir: directory holding ``raw_reads.db`` and its ``.las`` files.
        pread_dir: directory holding ``preads.db`` and its ``.las`` files.
        asm_dir: directory holding the assembly graph files; the ``read_maps``
            output directory is created inside it.
    """

    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(
        inputs={"rawread_db": rawread_db},
        outputs={"rawread_id_file": rawread_id_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/dump_rawread_ids",
    )
    def dump_rawread_ids(self):
        """Dump the first field of each ``DBshow -n`` line of the raw-read DB."""
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(
        inputs={"pread_db": pread_db},
        outputs={"pread_id_file": pread_id_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/dump_pread_ids",
    )
    def dump_pread_ids(self):
        """Dump the first field of each ``DBshow -n`` line of the pread DB."""
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    # The .las files are collected before the first refreshTargets() call.
    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        all_raw_las_files["r_las_%s" % _las_file_index(las_fn)] = makePypeLocalFile(las_fn)

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        all_pread_las_files["p_las_%s" % _las_file_index(las_fn)] = makePypeLocalFile(las_fn)

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths,
    }

    read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(
        inputs=inputs,
        outputs={"read_to_contig_map": read_to_contig_map},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_ctg_read_map",
    )
    def generate_read_to_ctg_map(self):
        """Write ``pid rid oid ctg`` lines for every node of every contig graph."""
        pread_did_to_rid = _read_id_list(fn(self.pread_id_file))
        rid_to_oid = _read_id_list(fn(self.rawread_id_file))
        read_to_contig_map = fn(self.read_to_contig_map)

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data), fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                # Contig names ending in "R" are skipped.
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    # Node names look like "<pid>:<suffix>"; only pid is used.
                    pid = int(n.split(":")[0])
                    rid = _pread_name_to_rid(pread_did_to_rid[pid])
                    oid = rid_to_oid[rid]
                    pread_to_contigs.setdefault((pid, rid, oid), set()).add(ctg)

            for (pid, rid, oid), ctgs in pread_to_contigs.items():
                for ctg in ctgs:
                    f.write("%09d %09d %s %s\n" % (pid, rid, oid, ctg))

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        """Rank, per raw read of one .las file, the contigs its overlaps hit.

        Output columns: read id, original id, contig, count, rank, score,
        in_ctg flag.  A read with no contig hit gets a single ``NA`` line.
        """
        rawread_db = fn(self.rawread_db)
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        rid_to_oid = _read_id_list(fn(self.rawread_id_file))

        # raw-read id -> (original id, set of contigs containing the read)
        rid_to_contigs = {}
        with open(fn(self.read_to_contig_map)) as f:
            for row in f:
                pid, rid, oid, ctg = row.strip().split()
                rid_to_contigs.setdefault(int(rid), (oid, set()))[1].add(ctg)

        with open(rawread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    # A new query starts: flush what was gathered for the
                    # previous one (nothing to flush on the very first row).
                    if cur_read_id is not None:
                        if not ovlp_data:
                            f.write(
                                "%09d %s %s %d %d %d %d\n" % (cur_read_id, rid_to_oid[cur_read_id], "NA", 0, 0, 0, 0)
                            )
                        else:
                            _write_ranked_overlaps(f, ovlp_data)
                        ovlp_data = {}
                    cur_read_id = q_id

                if q_id in rid_to_contigs and not ovlp_data:
                    # The query itself is in some contig: seed its contigs
                    # with the negated column 7 and the in_ctg flag set.
                    # NOTE(review): column 7 is presumably the query length
                    # -- confirm against the LA4Falcon output format.
                    ctgs = rid_to_contigs[q_id][1]
                    o_id = rid_to_oid[q_id]
                    for ctg in ctgs:
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1

                if t_id not in rid_to_contigs:
                    continue

                ctgs = rid_to_contigs[t_id][1]
                o_id = rid_to_oid[q_id]

                # Accumulate column 2 (presumably the overlap score -- TODO
                # confirm) and the hit count for each contig of the target.
                for ctg in ctgs:
                    entry = ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    entry[0] += int(row[2])
                    entry[1] += 1

            # NOTE(review): unlike the in-loop flush, a final read without
            # any contig hit gets no "NA" line here; kept as in the original.
            if ovlp_data:
                _write_ranked_overlaps(f, ovlp_data)

    def dump_pread_to_ctg(self):
        """Rank, per pread of one .las file, the contigs its overlaps hit.

        Same output layout as ``dump_rawread_to_ctg``.  Once a pread is found
        to be part of a contig, its remaining overlap rows are ignored.
        """
        pread_db = fn(self.pread_db)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)

        pid_to_rid = _read_id_list(fn(self.pread_id_file))
        rid_to_oid = _read_id_list(fn(self.rawread_id_file))

        # pread id -> (original id, set of contigs containing the pread)
        pid_to_contigs = {}
        with open(fn(self.read_to_contig_map)) as f:
            for row in f:
                pid, rid, oid, ctg = row.strip().split()
                pid_to_contigs.setdefault(int(pid), (oid, set()))[1].add(ctg)

        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = False
            for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    # A new query starts: flush the previous one.
                    if cur_read_id is not None:
                        if not ovlp_data:
                            rid = _pread_name_to_rid(pid_to_rid[cur_read_id])
                            f.write("%09d %s %s %d %d %d %d\n" % (cur_read_id, rid_to_oid[rid], "NA", 0, 0, 0, 0))
                        else:
                            _write_ranked_overlaps(f, ovlp_data)
                        ovlp_data = {}
                        skip_rest = False
                    cur_read_id = q_id

                if q_id in pid_to_contigs and not ovlp_data:
                    # The query itself is in some contig: record only that.
                    ctgs = pid_to_contigs[q_id][1]
                    o_id = rid_to_oid[_pread_name_to_rid(pid_to_rid[q_id])]
                    for ctg in ctgs:
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    skip_rest = True

                if skip_rest:
                    continue

                if t_id not in pid_to_contigs:
                    continue

                ctgs = pid_to_contigs[t_id][1]
                o_id = rid_to_oid[_pread_name_to_rid(pid_to_rid[q_id])]

                for ctg in ctgs:
                    entry = ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    entry[0] += int(row[2])
                    entry[1] += 1

            # NOTE(review): a final pread without any contig hit gets no
            # "NA" line here; kept as in the original.
            if ovlp_data:
                _write_ranked_overlaps(f, ovlp_data)

    # One mapping task per raw-read .las file.
    for las_file in all_raw_las_files.values():
        idx = _las_file_index(fn(las_file))
        rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "rawread_db": rawread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file,
            },
            outputs={"rawread_to_contig_file": rawread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/r_read_to_contigs.%s" % idx,
        )
        wf.addTask(make_dump_rawread_to_ctg(dump_rawread_to_ctg))

    # One mapping task per pread .las file.
    for las_file in all_pread_las_files.values():
        idx = _las_file_index(fn(las_file))
        pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "pread_db": pread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file,
            },
            outputs={"pread_to_contig_file": pread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/pread_to_contigs.%s" % idx,
        )
        wf.addTask(make_dump_pread_to_ctg(dump_pread_to_ctg))

    wf.refreshTargets()  # block

@PypeTask(
    inputs={"pread_db": pread_db},
    outputs={"pread_id_file": pread_id_file},
    TaskType=PypeThreadTaskBase,
    URL="task://localhost/dump_pread_ids",
)
def dump_pread_ids(self):
    """Dump the pread ids of the task's ``pread_db`` into ``pread_id_file``.

    Runs a shell pipeline: ``DBshow -n`` on the database, ``tr`` removes every
    '>' character, and ``awk`` keeps the first whitespace-separated field of
    each line. The result is redirected to the output file.
    """
    # NOTE(review): the exit status returned by os.system is not inspected.
    shell_cmd = "DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (
        fn(self.pread_db),
        fn(self.pread_id_file),
    )
    os.system(shell_cmd)


# Register the pread-id dump task, then let the workflow run.
wf.addTask(dump_pread_ids)
wf.refreshTargets()  # block

# "r_las_<N>" -> pype file, one entry per raw_reads.<N>.las in rawread_dir.
all_raw_las_files = {}
for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
    # <N> is the second dot-separated field of the file's base name.
    idx = int(las_fn.rsplit("/", 1)[-1].split(".")[1])
    las_file = makePypeLocalFile(las_fn)
    all_raw_las_files["r_las_%s" % idx] = las_file

# "p_las_<N>" -> pype file, one entry per preads.<N>.las in pread_dir.
all_pread_las_files = {}
for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
    # Same naming scheme as above: preads.<N>.las
    idx = int(las_fn.rsplit("/", 1)[-1].split(".")[1])
    las_file = makePypeLocalFile(las_fn)
    all_pread_las_files["p_las_%s" % idx] = las_file
Esempio n. 6
0
def get_read_hctg_map(rawread_dir, pread_dir, asm_dir, hasm_dir):
    """Write ``<hasm_dir>/read_maps/read_to_contig_map`` from contig edge lists.

    A single workflow task is registered and run. It reads

    * ``<asm_dir>/read_maps/raw_read_ids`` and ``<asm_dir>/read_maps/pread_ids``
      (one name per line, addressed by line index),
    * ``<hasm_dir>/all_p_ctg_edges`` and ``<hasm_dir>/all_h_ctg_edges``
      (whitespace-separated rows: contig name, then two ``<pread id>:...``
      node labels),
    * ``<hasm_dir>/all_h_ctg_ids`` (one contig name per line),

    and writes one ``"%09d %09d %s %s"`` line (pread id, raw-read id, raw-read
    name, contig name) per distinct (read, contig) pair.

    Args:
        rawread_dir: Not used by this function; kept for interface
            compatibility.
        pread_dir: Not used by this function; kept for interface
            compatibility.
        asm_dir: Directory whose ``read_maps`` sub-directory holds the id
            dumps.
        hasm_dir: Directory holding the edge/id lists; ``read_maps`` is
            created inside it for the output.
    """
    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, "read_maps/raw_read_ids"))
    pread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, "read_maps/pread_ids"))
    h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, "all_h_ctg_edges"))
    p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, "all_p_ctg_edges"))
    h_ctg_ids = makePypeLocalFile(os.path.join(hasm_dir, "all_h_ctg_ids"))
    read_map_dir = os.path.join(hasm_dir, "read_maps")
    make_dirs(read_map_dir)

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, "read_to_contig_map"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "h_ctg_edges": h_ctg_edges,
        "p_ctg_edges": p_ctg_edges,
        "h_ctg_ids": h_ctg_ids,
    }

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_hctg_map(self):
        """Task body: collect the contigs each pread appears in and dump them."""
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        # Both lookup tables are indexed by line number. Reading inside
        # ``with`` guarantees the handles are closed again.
        with open(pread_id_file) as f:
            pread_did_to_rid = f.read().split("\n")
        with open(rawread_id_file) as f:
            rid_to_oid = f.read().split("\n")

        h_ctg_edges = fn(self.h_ctg_edges)
        p_ctg_edges = fn(self.p_ctg_edges)

        with open(fn(self.h_ctg_ids)) as f:
            known_h_ctgs = set(row.strip() for row in f)

        # (pid, rid, oid) -> set of contig names
        pread_to_contigs = {}

        for edge_fn in (p_ctg_edges, h_ctg_edges):
            with open(edge_fn) as f:
                for row in f:
                    row = row.strip().split()
                    ctg = row[0]
                    # Contig names containing "_" are kept only when listed
                    # in all_h_ctg_ids.
                    if len(ctg.split("_")) > 1 and ctg not in known_h_ctgs:
                        continue
                    # Both edge endpoints are handled the same way.
                    for node in (row[1], row[2]):
                        pid = int(node.split(":")[0])
                        # The second "/"-field of the pread name, floor-divided
                        # by 10, indexes the raw-read table (presumably the
                        # pread name encodes raw-read id * 10 + suffix; confirm).
                        rid = int(pread_did_to_rid[pid].split("/")[1]) // 10
                        oid = rid_to_oid[rid]
                        pread_to_contigs.setdefault(
                            (pid, rid, oid), set()).add(ctg)

        with open(read_to_contig_map, "w") as f:
            for key in pread_to_contigs:
                pid, rid, oid = key
                for ctg in pread_to_contigs[key]:
                    # f.write emits the same bytes as ``print >> f`` and also
                    # works where print is a function.
                    f.write("%09d %09d %s %s\n" % (pid, rid, oid, ctg))

    wf.addTask(generate_read_to_hctg_map)
    wf.refreshTargets()  # block
Esempio n. 7
0
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    """Write ``<asm_dir>/read_maps/read_to_contig_map`` from the assembly graph.

    Two workflow rounds are run:

    1. ``DBshow -n`` dumps the read names of ``<rawread_dir>/raw_reads.db`` and
       ``<pread_dir>/preads.db`` into ``raw_read_ids`` / ``pread_ids`` inside
       ``<asm_dir>/read_maps``.
    2. An ``AsmGraph`` is built from ``sg_edges_list``, ``utg_data`` and
       ``ctg_paths`` in ``asm_dir``. For every contig whose name does not end
       in "R", each node of its string graph is resolved to a
       (pread id, raw-read id, raw-read name) triple, and one
       ``"%09d %09d %s %s"`` line per distinct (read, contig) pair is written.

    Args:
        rawread_dir: Directory containing ``raw_reads.db``.
        pread_dir: Directory containing ``preads.db``.
        asm_dir: Assembly directory holding the graph files; ``read_maps`` is
            created inside it for all outputs.
    """
    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        """Dump the first header field of every raw read into the id file."""
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        # NOTE(review): the exit status of the pipeline is not checked.
        os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        """Dump the first header field of every pread into the id file."""
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        # NOTE(review): the exit status of the pipeline is not checked.
        os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths,
    }

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        """Task body: map every pread in a contig's string graph to that contig."""
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        # Both lookup tables are indexed by line number. Reading inside
        # ``with`` guarantees the handles are closed again.
        with open(pread_id_file) as f:
            pread_did_to_rid = f.read().split("\n")
        with open(rawread_id_file) as f:
            rid_to_oid = f.read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list),
                         fn(self.utg_data),
                         fn(self.ctg_paths))

        # (pid, rid, oid) -> set of contig names
        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                # Contigs whose name ends in "R" are skipped (presumably the
                # reverse-complement copies; confirm against AsmGraph).
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])
                    # The second "/"-field of the pread name, floor-divided by
                    # 10, indexes the raw-read table.
                    rid = int(pread_did_to_rid[pid].split("/")[1]) // 10
                    oid = rid_to_oid[rid]
                    pread_to_contigs.setdefault(
                        (pid, rid, oid), set()).add(ctg)

            for key in pread_to_contigs:
                pid, rid, oid = key
                for ctg in pread_to_contigs[key]:
                    # f.write emits the same bytes as ``print >>f`` and also
                    # works where print is a function.
                    f.write("%09d %09d %s %s\n" % (pid, rid, oid, ctg))

    wf.addTask(generate_read_to_ctg_map)

    wf.refreshTargets()  # block