Example #1
0
def blasr_align(self):
    """Map one query read block against one target block with blasr.

    Writes q%05d_t%05d.sh under mapping_data_dir (env activation, the
    blasr command writing -m 4 output to q%05d_t%05d.m4, and a sentinel
    touch), submits it via run_script(), then blocks in wait_for_file()
    until the sentinel appears.
    """
    params = self.parameters
    mapping_dir = params["mapping_data_dir"]
    q_sn, t_sn = params["q_sn"], params["t_sn"]
    config = params["config"]

    pair_tag = "q%05d_t%05d" % (q_sn, t_sn)
    out_fn = os.path.join(mapping_dir, pair_tag + ".m4")
    script_fn = os.path.join(mapping_dir, pair_tag + ".sh")

    blasr_cmd = "blasr {query} {target} -sa {target_sa} {blasr_opt} -noSplitSubreads -m 4 -out {out_fn}".format(
        query=fn(self.query_fofn),
        target=fn(self.target_fa),
        target_sa=fn(self.target_sa),
        blasr_opt=config["blasr_opt"],
        out_fn=out_fn)

    # The generated script activates the install env, runs blasr, then
    # drops the sentinel file that wait_for_file() polls for.
    lines = [
        "source {0}/bin/activate\n".format(config["install_prefix"]),
        blasr_cmd + "\n",
        "touch %s" % fn(self.job_done),
    ]
    with open(script_fn, "w") as script_file:
        script_file.writelines(lines)

    job_name = self.URL.split("/")[-1] + str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": config["sge_option_dm"],
                "script_fn": script_fn}
    run_script(job_data, job_type=config["job_type"])

    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
Example #2
0
    def run_falcon_asm_task(self):
        """Write run_falcon_asm.sh (DB export, overlap filtering, string
        graph, contig output), submit it through the scheduler, and wait
        for the falcon_asm_done sentinel."""
        params = self.parameters
        wd = params["wd"]
        pread_dir = params["pread_dir"]
        config = params["config"]
        script_fn = os.path.join(wd, "run_falcon_asm.sh")

        cmds = [
            "source %s/bin/activate" % config["install_prefix"],
            "cd %s" % pread_dir,
            # Export preads from the Dazzler DB for the assembler.
            "DB2Falcon preads",
            "cd %s" % wd,
            """find %s/las_files -name "*.las" > las.fofn """ % pread_dir,
            """fc_ovlp_filter.py --fofn las.fofn %s --min_len %d > preads.ovl""" % (
                config["overlap_filtering_setting"], config["length_cutoff_pr"]),
            "ln -sf %s/preads4falcon.fasta ." % pread_dir,
            """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % config["length_cutoff_pr"],
            """fc_graph_to_contig.py""",
            # Sentinel; wait_for_file() below polls for this.
            """touch %s\n""" % fn(self.falcon_asm_done),
        ]
        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(cmds))

        job_name = "%s-%s" % (self.URL.split("/")[-1], str(uuid.uuid4())[:8])
        job_data = {"job_name": job_name,
                    "cwd": wd,
                    "sge_option": config["sge_option_fc"],
                    "script_fn": script_fn}
        run_script(job_data, job_type=config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_name)
Example #3
0
def query_filter(self):
    """Write and run qf%05d.sh: collect this query block's .m4 mapping
    output into a fofn and run query_m4_filtering.py over it, then wait
    for the job-done sentinel.

    Fix: `config` was read without ever being assigned in this scope
    (NameError unless a module-level global happened to exist); it is now
    taken from self.parameters like the sibling tasks do.
    """
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]  # query block serial number
    script_fn = os.path.join( output_dir, "qf%05d.sh" % q_sn)
    qf_fofn = os.path.join( output_dir, "qf%05d_input.fofn" % (q_sn, ) )
    config = self.parameters["config"]  # was missing: config was undefined here
    install_prefix = config["install_prefix"]
    sge_option_qf = config["sge_option_qf"]
    length_cutoff_pr = config["length_cutoff_pr"]
    bestn = config["bestn"]

    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("""find %s -name "q[0-9]*_t[0-9]*.m4" > %s\n""" % (output_dir, qf_fofn))
        script_file.write("""query_m4_filtering.py %s 1 0 %d %d %s\n""" % (qf_fofn, bestn, length_cutoff_pr, fn(self.qf_out) ))
        script_file.write("""touch %s\n""" % fn(self.job_done) )

    job_name = self.URL.split("/")[-1]
    job_name += str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_qf,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])

    wait_for_file( fn(self.job_done), task=self, job_name=job_name )
Example #4
0
def blasr_align(self):
    """Map one query read block against one target block with blasr.

    Writes q%05d_t%05d.sh under mapping_data_dir (env activation, the
    blasr command with -m 4 output to q%05d_t%05d.m4, and a sentinel
    touch), submits it via run_script(), then blocks in wait_for_file().
    """

    q_fofn = self.query_fofn    # file-of-filenames listing the query reads
    target_fa = self.target_fa  # target FASTA file
    target_sa = self.target_sa  # precomputed blasr suffix array for the target
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]  # query block serial number
    t_sn = self.parameters["t_sn"]  # target block serial number
    out_fn = os.path.join( output_dir, "q%05d_t%05d.m4" % (q_sn, t_sn))
    script_fn = os.path.join( output_dir, "q%05d_t%05d.sh" % (q_sn, t_sn))
    config = self.parameters["config"]
    blasr_opt = config["blasr_opt"]  # extra blasr flags from the run config
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]

    #blasr_cmd = """blasr %s %s -sa %s -noSplitSubreads -bestn 16 -nCandidates 32 -maxScore -1000 -minMatch 12 -maxLCPLength 15 -nproc 16 -m 4 -out %s""" % (fn(q_fofn), fn(target_fa), fn(target_sa), out_fn)

    blasr_cmd = """blasr {query} {target} -sa {target_sa} {blasr_opt} -noSplitSubreads -m 4 -out {out_fn}"""
    blasr_cmd = blasr_cmd.format( query=fn(q_fofn), target=fn(target_fa), target_sa=fn(target_sa), blasr_opt = blasr_opt, out_fn=out_fn )


    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write(blasr_cmd+"\n")
        # Sentinel file: wait_for_file() below polls for this.
        script_file.write("touch %s" % fn(self.job_done))

    # NOTE(review): unlike the uuid-suffixed variant of this task, job_name
    # here is just the URL tail, so names can collide across runs —
    # presumably acceptable for this scheduler; verify.
    job_name = self.URL.split("/")[-1]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_dm,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])

    wait_for_file( fn(self.job_done), task=self, job_name=job_name )
Example #5
0
def run_daligner(self):
    """Write rj_<uid>.sh to run one daligner command, symlink the produced
    .las files into the per-block m_%05d merge dirs, submit the script,
    and wait for the job-done sentinel.

    Fix: the `touch job_done` line used to be emitted *before* the symlink
    loop, so wait_for_file() could release downstream merge tasks before
    the .las links existed; the sentinel is now the last script line.
    Also drops the unused `install_prefix` local.
    """
    daligner_cmd = self.parameters["daligner_cmd"]
    job_uid = self.parameters["job_uid"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    db_prefix = self.parameters["db_prefix"]
    nblock = self.parameters["nblock"]

    script_dir = os.path.join( cwd )
    script_fn =  os.path.join( script_dir , "rj_%s.sh" % (job_uid))
    log_path = os.path.join( script_dir, "rj_%s.log" % (job_uid))

    script = []
    script.append( "cd %s" % cwd )
    script.append( "hostname >> %s" % log_path )
    script.append( "date >> %s" % log_path )
    # Jason's time path does not work on Centos (where time has no path!?!)
    # so run 'time' in a subshell and capture its output in the log too.
    script.append( "(time "+ daligner_cmd + ") >> %s 2>&1 " % log_path  )

    # Link every .las this job produced into the merge dir of its block.
    for p_id in xrange( 1, nblock+1 ):
        script.append( """ for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """  % (db_prefix, p_id, db_prefix, p_id) )

    # Sentinel last: downstream tasks wait on this file, so it must not
    # appear before the symlinks above are in place.
    script.append( "touch %s" % fn( self.job_done ) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))

    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn( self.job_done ), task=self, job_name=job_name )
Example #6
0
def task_report_pre_assembly(self):
    """Compute and write the pre-assembly stats report as JSON.

    Resolves the (possibly auto-calculated) length cutoff, asks
    stats_preassembly.calc_dict() for the metrics, and writes them
    pretty-printed to pre_assembly_report.

    Fix: the output file is now written via a context manager instead of a
    dangling open().write(), so the handle is closed deterministically.
    """
    # TODO(CD): Bashify this, in case it is slow.
    i_raw_reads_db_fn = fn(self.raw_reads_db)
    i_preads_fofn_fn = fn(self.preads_fofn)
    i_length_cutoff_fn = fn(self.length_cutoff_fn)
    o_json_fn = fn(self.pre_assembly_report)
    cfg = self.parameters
    genome_length = int(cfg.get('genome_size', 0))  # different name in falcon
    length_cutoff = int(cfg['length_cutoff'])
    # A negative length_cutoff means auto-calc; the resolved value lives in
    # i_length_cutoff_fn.
    length_cutoff = support.get_length_cutoff(length_cutoff,
                                              i_length_cutoff_fn)
    kwds = {
        'i_raw_reads_db_fn': i_raw_reads_db_fn,
        'i_preads_fofn_fn': i_preads_fofn_fn,
        'genome_length': genome_length,
        'length_cutoff': length_cutoff,
    }
    fc_run_logger.info('Report inputs: {}'.format(repr(kwds)))
    report_dict = stats_preassembly.calc_dict(**kwds)
    content = json.dumps(report_dict,
                         sort_keys=True,
                         indent=4,
                         separators=(',', ': '))
    fc_run_logger.info('Report stats:\n{}'.format(content))
    with open(o_json_fn, 'w') as out:
        out.write(content)
Example #7
0
def query_filter(self):
    """Write and run qf%05d.sh: gather this query block's .m4 mapping
    results into a fofn and run query_m4_filtering.py over them, then
    wait for the job-done sentinel.

    Fix: `config` was used without being defined in this scope (NameError
    unless a module global happened to exist); it is now taken from
    self.parameters, matching the sibling tasks.
    """
    output_dir = self.parameters["mapping_data_dir"]
    q_sn = self.parameters["q_sn"]  # query block serial number
    script_fn = os.path.join(output_dir, "qf%05d.sh" % q_sn)
    qf_fofn = os.path.join(output_dir, "qf%05d_input.fofn" % (q_sn, ))
    config = self.parameters["config"]  # was missing: config was undefined here
    install_prefix = config["install_prefix"]
    sge_option_qf = config["sge_option_qf"]
    length_cutoff_pr = config["length_cutoff_pr"]
    bestn = config["bestn"]

    with open(script_fn, "w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(
            install_prefix=install_prefix))
        script_file.write("""find %s -name "q[0-9]*_t[0-9]*.m4" > %s\n""" %
                          (output_dir, qf_fofn))
        script_file.write("""query_m4_filtering.py %s 1 0 %d %d %s\n""" %
                          (qf_fofn, bestn, length_cutoff_pr, fn(self.qf_out)))
        script_file.write("""touch %s\n""" % fn(self.job_done))

    job_name = self.URL.split("/")[-1]
    job_name += str(uuid.uuid1())[:8]
    job_data = {
        "job_name": job_name,
        "cwd": os.getcwd(),
        "sge_option": sge_option_qf,
        "script_fn": script_fn
    }
    run_script(job_data, job_type=config["job_type"])

    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
Example #8
0
def quiver_reseq(self):
    """Run the Quiver consensus/resequencing step over the CA assembly.

    Writes scripts/quiver_reseq.sh, submits it synchronously through qsub,
    then marks completion via scripts/quiver_done.sh.  Returns 0 (skipping
    the step) when SEYMOUR_HOME is not configured.

    Fixes: the template args referenced a misspelled `SYMOURE_HOME`
    (NameError at runtime); `== None` -> `is None`; the bare `except:`
    around makedirs is narrowed to OSError; the Python-2-only print forms
    are replaced by equivalent writes.
    """
    config = self.config
    sge_option_ck = config["sge_option_ck"]  # read for parity; not used below
    sge_option_qv = config["sge_option_qv"]
    big_tmpdir = config["big_tmpdir"]

    try:
        os.makedirs("quiver_reseq")
    except OSError:  # directory already exists
        pass

    SEYMOUR_HOME = config["SEYMOUR_HOME"]
    if SEYMOUR_HOME is None:
        print("SEYMOUR_HOME not set, bypass quiver consensus step")
        return 0

    job_name = "QuiverReq_"+str(uuid.uuid4())
    quiver_script = """#!/bin/bash
export SEYMOUR_HOME=%s
. $SEYMOUR_HOME/etc/setup.sh 
cd %s/quiver_reseq
cp ../CA/9-terminator/asm.ctg.fasta .
referenceUploader -c -p $PWD -n assembly -f asm.ctg.fasta --skipIndexUpdate
compareSequences.py --info --useGuidedAlign --algorithm=blasr --nproc=24 --noXML --h5mode=w --h5fn=out.cmp.h5 --minAccuracy=0.70 --minLength=200 -x -nCandidates 50 -x -minMatch 12 -x -bestn 1 -x -minPctIdentity 70.0 %s assembly/
loadPulses %s out.cmp.h5 -metrics DeletionQV,IPD,InsertionQV,PulseWidth,QualityValue,MergeQV,SubstitutionQV,DeletionTag -byread
cmph5tools.py sort out.cmp.h5 --tmp %s
variantCaller.py --algorithm quiver -j 16 --referenceFilename assembly/sequence/assembly.fasta  --parameters best -o output.gff  -o output.fasta -o output.fastq -q 0  -X 80 -x 5 --mapQvThreshold 0 out.cmp.h5
""" % (SEYMOUR_HOME, os.getcwd(), fn(self.input_fofn), fn(self.input_fofn), big_tmpdir)
    with open("scripts/quiver_reseq.sh", "w") as f:
        f.write(quiver_script + "\n")
    os.system( """qsub -sync y  {sge_option_qv} -N {jn} -o {cwd}/sge_log -j y -S /bin/bash scripts/quiver_reseq.sh """.format(jn=job_name, cwd=os.getcwd(), sge_option_qv = sge_option_qv) )
    with open("scripts/quiver_done.sh","w") as f:
        f.write("echo done > %s\n" % fn(self.Quiver_done))
    os.system("bash scripts/quiver_done.sh")
Example #9
0
    def run_falcon_asm_task(self):
        """Write and run run_falcon_asm.sh: export preads from the Dazzler
        DB, filter overlaps, build the string graph, and emit contigs.

        NOTE(review): `db_file` below is not defined in this function —
        presumably captured from an enclosing scope; verify at the
        definition site.
        """
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join( wd )
        script_fn =  os.path.join( script_dir ,"run_falcon_asm.sh" )

        script = []
        script.append( "set -vex" )
        # The EXIT trap leaves a '<done>.exit' marker even on failure.
        script.append( "trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done) )
        script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
        script.append( "cd %s" % pread_dir )
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append( "DB2Falcon -U preads")
        script.append( "cd %s" % wd )
        script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
                (fn(db_file), overlap_filtering_setting, length_cutoff_pr) )
        script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr) # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append( """fc_graph_to_contig.py""" )
        # Sentinel; wait_for_file() below polls for this.
        script.append( """touch %s""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))

        job_data = make_job_data(self.URL, script_fn)
        job_data["sge_option"] = config["sge_option_fc"]
        run_script(job_data, job_type = config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_data['job_name'])
Example #10
0
 def gather_qm4(self):
     """Collect the upstream output paths ending in 'm4' into qm4_fofn,
     sorted, one path per line."""
     m4_paths = sorted(fn(o) for o in self.inputs.values())
     with open(fn(self.qm4_fofn), "w") as out:
         for path in m4_paths:
             if path.endswith("m4"):
                 out.write(path + "\n")
Example #11
0
def run_merge_task(self):
    """Write rp_%05d.sh that wraps the generated merge script with
    host/date logging, submit it, and wait for the job-done sentinel."""
    params = self.parameters
    p_script_fn = params["merge_script"]
    job_id = params["job_id"]
    cwd = params["cwd"]
    config = params["config"]
    install_prefix = config["install_prefix"]  # read but unused (kept for parity)

    script_fn = os.path.join(os.path.join(cwd), "rp_%05d.sh" % (job_id))
    log_path = os.path.join(os.path.join(cwd), "rp_%05d.log" % (job_id))

    # Jason's time path does not work on Centos (where time has no path!?!)
    # so 'time' runs in a subshell and its output is captured in the log.
    cmds = [
        "cd %s" % cwd,
        "hostname >> %s" % log_path,
        "date >> %s" % log_path,
        "(time bash %s) >> %s 2>&1 " % (p_script_fn, log_path),
        "touch %s" % fn(self.job_done),
    ]
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(cmds))

    job_name = "%s-%s" % (self.URL.split("/")[-1], str(uuid.uuid4())[:8])
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "script_fn": script_fn}
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
Example #12
0
 def check_r_cns_task(self):
     """Gather the consensus outputs (preads/out*.fa) into pread_fofn and
     touch the cns_done sentinel.  NOTE(review): relies on `rawread_dir`
     from an enclosing scope — confirm at the definition site."""
     fasta_paths = sorted(glob.glob("%s/preads/out*.fa" % rawread_dir))
     with open(fn(self.pread_fofn), "w") as out:
         for fasta_path in fasta_paths:
             out.write(fasta_path + "\n")
     os.system("touch %s" % fn(self.cns_done))
Example #13
0
def task_build_pdb(self):
    """Build the preads Dazzler DB.

    Essentially the same as build_rdb(), but the subtle differences are
    tricky to consolidate into one function.  Clears stale outputs, emits
    prepare_pdb.sh via support.build_pdb(), runs it, and waits.
    """
    job_done = fn(self.pdb_build_done)
    db = fn(self.preads_db)
    run_jobs = fn(self.run_jobs)
    # Start clean so a re-run regenerates everything.
    remove(job_done, db, run_jobs)
    config = self.parameters["config"]
    script_fn = os.path.join(self.parameters["work_dir"], "prepare_pdb.sh")
    support.build_pdb(
        input_fofn_fn=fn(self.pread_fofn),
        config=config,
        job_done=job_done,
        script_fn=script_fn,
        run_jobs_fn=run_jobs,
    )
    run_script_and_wait_and_rm_exit(self.URL,
                                    script_fn,
                                    job_done,
                                    self,
                                    job_type=config['job_type'],
                                    sge_option=config['sge_option_pda'])
Example #14
0
def task_run_falcon_asm(self):
    """Prepare and launch the final falcon assembly step: gather the
    merged .las files into las.fofn, then generate and run
    run_falcon_asm.sh via support.run_falcon_asm()."""
    wd = self.parameters["wd"]
    pread_dir = self.parameters["pread_dir"]
    config = self.parameters["config"]
    #self.db2falcon_done
    db_file = fn(self.db_file)
    job_done = fn(self.falcon_asm_done)
    script_fn = os.path.join(os.path.join(wd), "run_falcon_asm.sh")

    # Generate las.fofn in the run-dir ('>|' overwrites even under noclobber).
    system('cd {}; find {}/m_*/ -name "*.las" >| las.fofn'.format(wd, pread_dir))

    support.run_falcon_asm(
        las_fofn_fn='las.fofn',
        preads4falcon_fasta_fn=os.path.join(pread_dir, 'preads4falcon.fasta'),
        db_file_fn=db_file,
        config=config,
        job_done=job_done,
        script_fn=script_fn,
    )
    run_script_and_wait_and_rm_exit(self.URL,
                                    script_fn,
                                    job_done,
                                    self,
                                    job_type=config['job_type'],
                                    sge_option=config['sge_option_fc'])
Example #15
0
def build_rdb(self):
    """Write prepare_db.sh, which imports the raw-read FASTAs into a
    Dazzler DB, splits it into blocks, and emits the daligner job list
    (run_jobs.sh); then submit the script and wait for rdb_build_done."""
    config = self.parameters["config"]
    work_dir = self.parameters["work_dir"]
    input_fofn_fn = fn(self.input_fofn)
    done_fn = fn(self.rdb_build_done)

    script_fn = os.path.join(work_dir, "prepare_db.sh")
    lines = [
        "source %s/bin/activate\n" % config["install_prefix"],
        "cd %s\n" % work_dir,
        # One fasta2DB call per input FASTA listed in the fofn.
        "for f in `cat %s`; do fasta2DB raw_reads $f; done\n" % input_fofn_fn,
        "DBsplit %s raw_reads\n" % config["pa_DBsplit_option"],
        # -H applies the seed-read length cutoff.
        "HPCdaligner %s -H%d raw_reads > run_jobs.sh\n" % (
            config["pa_HPCdaligner_option"], config["length_cutoff"]),
        "touch %s\n" % done_fn,
    ]
    with open(script_fn, "w") as script_file:
        script_file.writelines(lines)

    job_name = "%s-%s" % (self.URL.split("/")[-1], str(uuid.uuid1())[:8])
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": config["sge_option_da"],
                "script_fn": script_fn}
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_name)
Example #16
0
def run_merge_task(self):
    """Write rp_%05d.sh wrapping the generated merge script with env
    activation and logging, submit it, and wait for job_done."""
    params = self.parameters
    p_script_fn = params["merge_script"]
    job_id = params["job_id"]
    cwd = params["cwd"]
    config = params["config"]

    script_fn = os.path.join(os.path.join(cwd), "rp_%05d.sh" % (job_id))
    log_path = os.path.join(os.path.join(cwd), "rp_%05d.log" % (job_id))

    # NOTE: the activate line keeps its own trailing newline, so the joined
    # script intentionally contains a blank line after it (as before).
    cmds = []
    cmds.append("source {install_prefix}/bin/activate\n".format(
        install_prefix=config["install_prefix"]))
    cmds.append("cd %s" % cwd)
    cmds.append("hostname >> %s" % log_path)
    cmds.append("date >> %s" % log_path)
    cmds.append("/usr/bin/time bash %s  >> %s 2>&1 && touch %s" % (
        p_script_fn, log_path, fn(self.job_done)))

    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(cmds))

    job_name = "%s-%s" % (self.URL.split("/")[-1], str(uuid.uuid1())[:8])
    job_data = {
        "job_name": job_name,
        "cwd": cwd,
        "sge_option": config["sge_option_la"],
        "script_fn": script_fn,
    }
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
Example #17
0
def task_report_pre_assembly(self):
    """Prepare the pre-assembly stats report job: resolve the length
    cutoff, build the kwargs for support.run_report_pre_assembly(), and
    record the generated script for the runner."""
    cfg = self.parameters
    cwd = cfg['cwd']
    mkdir(cwd)

    # 'genome_size' is the falcon-side name for genome_length.
    genome_length = int(cfg.get('genome_size', 0))
    # A negative length_cutoff means auto-calc; the resolved value was
    # written to the length-cutoff file long ago, so no filesystem races.
    length_cutoff = support.get_length_cutoff(
        int(cfg['length_cutoff']), fn(self.length_cutoff_fn))

    script_fn = os.path.join(cwd, 'run_report_pre_assembly.sh')
    job_done = os.path.join(cwd, 'report_pa_done')
    kwds = {
        'i_raw_reads_db_fn': fn(self.raw_reads_db),
        'i_preads_fofn_fn': fn(self.preads_fofn),
        'genome_length': genome_length,
        'length_cutoff': length_cutoff,
        'o_json_fn': fn(self.pre_assembly_report),
        'job_done': job_done,
        'script_fn': script_fn,
    }
    fc_run_logger.info('Report inputs: {}'.format(repr(kwds)))
    support.run_report_pre_assembly(**kwds)
    self.generated_script_fn = script_fn
Example #18
0
def task_daligner_gather(self):
    """After all daligner jobs finish: create the per-block m_%05d merge
    dirs, symlink every produced .las into its block dir, and touch
    da_done."""
    da_done = fn(self.da_done)
    main_dir = os.path.dirname(da_done)
    out_dict = self.inputDataObjs
    nblock = self.parameters['nblock']
    fc_run_logger.debug('nblock=%d, out_dir:\n%s' % (nblock, out_dict))

    # m_%05d naming is by convention; pbsmrtpipe lays this out differently.
    for block in xrange(1, nblock + 1):
        mkdir(os.path.join(main_dir, 'm_%05d' % block))
        # TODO: Remove existing symlinks?

    job_rundirs = [os.path.dirname(fn(done)) for done in out_dict.values()]

    # Group the .las paths by their target merge dir, then create the links.
    links = collections.defaultdict(list)
    for block, las_path in support.daligner_gather_las(job_rundirs):
        target_dir = os.path.join(main_dir, 'm_%05d' % block)
        links[target_dir].append(las_path)
    only_these_symlinks(links)
    system("touch %s" % da_done)
Example #19
0
def task_build_rdb(self):
    """Build the raw-reads Dazzler DB: clear any stale outputs, emit
    prepare_rdb.sh via support.build_rdb(), run it, and wait."""
    job_done = fn(self.rdb_build_done)
    db = fn(self.raw_reads_db)
    run_jobs = fn(self.run_jobs)
    remove(job_done, db, run_jobs)  # start clean on re-run
    config = self.parameters["config"]
    script_fn = os.path.join(self.parameters["work_dir"], "prepare_rdb.sh")
    support.build_rdb(
        input_fofn_fn=fn(self.input_fofn),
        config=config,
        job_done=job_done,
        script_fn=script_fn,
        run_jobs_fn=run_jobs,
    )
    run_script_and_wait_and_rm_exit(self.URL,
                                    script_fn,
                                    job_done,
                                    self,
                                    job_type=config['job_type'],
                                    sge_option=config["sge_option_da"])
Example #20
0
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    """Create one PypeTask per 'daligner' line in <wd>/run_jobs.sh.

    Parses the Dazzler .db file to learn the block count, creates the
    per-block m_%05d merge dirs and per-job job_<uid> work dirs (with
    symlinks to the DB files), and returns (tasks, tasks_out) where
    tasks_out maps 'ajob_<uid>' -> job_done PypeLocalFile.

    When pread_aln is True the command is rewritten to 'daligner_p'.
    NOTE(review): hashlib.md5(l) takes a str — Python-2 semantics; under
    Python 3 this would need l.encode().
    """
    job_id = 0
    tasks = []
    tasks_out = {}

    # Default: a brand-new, unsplit DB counts as a single block.
    nblock = 1
    new_db = True
    if os.path.exists(fn(db_file)):
        with open(fn(db_file)) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    new_db = False
                    break

    for pid in xrange(1, nblock + 1):
        support.make_dirs("%s/m_%05d" % (wd, pid))

    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip()
            # Stable 8-char job uid derived from the command line itself.
            job_uid = hashlib.md5(l).hexdigest()
            job_uid = job_uid[:8]
            l = l.split()
            if l[0] == "daligner":
                support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
                # Symlink the DB pieces (.bps, .idx, .db) into the job dir so
                # daligner can run with relative paths.
                call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
                    wd,
                    job_uid,
                    db_prefix,
                    db_prefix,
                    db_prefix,
                )
                rc = os.system(call)
                if rc:
                    raise Exception("Failure in system call: %r -> %d" % (call, rc))
                job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters = {
                    "daligner_cmd": " ".join(l),
                    "cwd": os.path.join(wd, "job_%s" % job_uid),
                    "job_uid": job_uid,
                    "config": config,
                    "nblock": nblock,
                    "db_prefix": db_prefix,
                }
                make_daligner_task = PypeTask(
                    inputs={"rdb_build_done": rdb_build_done},
                    outputs={"job_done": job_done},
                    parameters=parameters,
                    TaskType=PypeThreadTaskBase,
                    URL="task://localhost/d_%s_%s" % (job_uid, db_prefix),
                )
                daligner_task = make_daligner_task(run_daligner)
                tasks.append(daligner_task)
                tasks_out["ajob_%s" % job_uid] = job_done
                job_id += 1
    return tasks, tasks_out
Example #21
0
def run_daligner(self):
    """Write rj_%05d.sh to run one daligner command (output redirected to
    a log, job_done touched on success) and submit it to SGE."""
    daligner_cmd = self.parameters["daligner_cmd"]
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    script_fn = os.path.join(os.path.join(cwd), "rj_%05d.sh" % (job_id))
    log_path = os.path.join(os.path.join(cwd), "rj_%05d.log" % (job_id))

    cmds = [
        "export PATH=~/task2014/dazzler/DALIGNER/:$PATH",
        "cd %s" % cwd,
        "/usr/bin/time %s >& %s  && touch %s" % (
            daligner_cmd, log_path, fn(self.job_done)),
    ]
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(cmds))

    job_name = "%s-%s" % (self.URL.split("/")[-1], str(uuid.uuid1())[:8])
    job_data = {
        "job_name": job_name,
        "cwd": cwd,
        "sge_option": " -pe smp 6 -q huasm ",  # hard-coded queue settings
        "script_fn": script_fn
    }
    run_script(job_data, job_type="SGE")
    wait_for_file(fn(self.job_done), task=self, job_name=job_name)
Example #22
0
def check_p_merge_check_task(self):
    """Mark the pread merge stage complete: touch p_merge_done and emit a
    no-op script for the runner.

    Fix: the script file handle was opened without being closed; use a
    context manager so the write is flushed deterministically.
    """
    wdir = os.path.dirname(fn(self.p_merge_done))
    mkdir(wdir)
    system("touch %s" % fn(self.p_merge_done))
    script_fn = os.path.join(wdir, 'noop.sh')
    with open(script_fn, 'w') as script_file:
        script_file.write('echo NOOP raw')
    self.generated_script_fn = script_fn
Example #23
0
File: run1.py Project: pbjd/FALCON
def task_run_consensus(self):
    """Prepare the consensus job for one merged block: locate the DB and
    .las by naming convention, then emit c_%05d.sh via
    support.run_consensus()."""
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    prefix = self.parameters["prefix"]

    merge_job_done = fn(self.job_done)
    script_fn = os.path.join(os.path.join(cwd), "c_%05d.sh" % (job_id))
    # ASSUMING the run dir sits 2 levels below the DB location.
    db_fn = os.path.abspath('{}/../../{}'.format(cwd, prefix))
    # by convention, we assume the name of the .las file
    merge_job_dir = os.path.dirname(merge_job_done)
    las_fn = os.path.abspath('{}/{}.{}.las'.format(merge_job_dir, prefix, job_id))
    support.run_consensus(
        db_fn=db_fn,
        las_fn=las_fn,
        out_file_fn=fn(self.out_file),
        config=config,
        job_done=fn(self.out_done),
        script_fn=script_fn,
    )
    self.generated_script_fn = script_fn
Example #24
0
def run_merge_task(self):
    """Write rp_%05d.sh (trap-protected wrapper around the generated merge
    script), submit it via make_job_data/run_script, and wait on
    job_done."""
    params = self.parameters
    p_script_fn = params["merge_script"]
    job_id = params["job_id"]
    cwd = params["cwd"]
    config = params["config"]
    done_fn = fn(self.job_done)

    script_fn = os.path.join(os.path.join(cwd), "rp_%05d.sh" % (job_id))

    # The EXIT trap drops a '<done>.exit' marker even on failure so the
    # watcher can distinguish "exited" from "still running".
    cmds = [
        "set -vex",
        "trap 'touch {job_done}.exit' EXIT".format(job_done=done_fn),
        "source {install_prefix}/bin/activate".format(
            install_prefix=config["install_prefix"]),
        "cd %s" % cwd,
        "hostname",
        "date",
        "time bash %s" % p_script_fn,
        "touch {job_done}".format(job_done=done_fn),
    ]
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(cmds))

    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = config["sge_option_la"]
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_data['job_name'])
Example #25
0
def task_run_las_merge(self):
    """Set up one LAmerge block: symlink the block's gathered .las files
    into cwd, then emit rp_%05d.sh via support.run_las_merge()."""
    job_id = self.parameters["job_id"]  # aka "block"
    cwd = self.parameters["cwd"]
    merge_script = self.parameters["merge_script"]
    mkdir(cwd)

    # Symlink every .las recorded for this block into the run dir.
    gathered_dict = read_gathered_las(fn(self.gathered_las))
    for las_path in gathered_dict[job_id]:
        src = os.path.relpath(las_path, cwd)
        tgt = os.path.join(cwd, os.path.basename(las_path))
        fc_run_logger.debug('symlink {!r} -> {!r}'.format(src, tgt))
        os.symlink(src, tgt)

    script_fn = os.path.join(os.path.join(cwd), "rp_%05d.sh" % (job_id))
    support.run_las_merge(
        script=merge_script,
        config=self.parameters["config"],
        job_done=fn(self.job_done),
        script_fn=script_fn,
    )
    self.generated_script_fn = script_fn
Example #26
0
    def run_falcon_asm_task(self):
        """Write run_falcon_asm.sh (Dazzler-DB export, 24-core overlap
        filtering, string-graph construction, contig output), submit it to
        SGE, and wait for the falcon_asm_done sentinel."""
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join( wd )
        script_fn =  os.path.join( script_dir ,"run_falcon_asm.sh" )

        script = []
        script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
        script.append( "cd %s" % pread_dir )
        # Export preads from the Dazzler DB for the assembler.
        script.append( "DB2Falcon preads")
        script.append( "cd %s" % wd )
        script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --fofn las.fofn %s \
                                 --n_core 24 --min_len %d > preads.ovl""" % (overlap_filtering_setting, length_cutoff_pr) )

        script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append( """fc_ovlp_to_graph.py preads.ovl > fc.log""" )
        script.append( """fc_graph_to_contig.py""" )
        # Sentinel; wait_for_file() below polls for this.
        script.append( """touch %s\n""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))

        job_name = self.URL.split("/")[-1]
        job_name += "-"+str(uuid.uuid1())[:8]
        job_data = {"job_name": job_name,
                    "cwd": wd,
                    "sge_option": config["sge_option_fc"],
                    "script_fn": script_fn }
        run_script(job_data, job_type = "SGE")
        wait_for_file( fn(self.falcon_asm_done), task=self, job_name=job_name )
Example #27
0
 def check_r_cns_task(self):
     """Collect all consensus FASTA outputs (preads/out*.fasta) into
     pread_fofn, sorted, then touch the cns_done sentinel.

     NOTE(review): relies on `rawread_dir` from an enclosing scope —
     confirm at the definition site.  Uses Python-2 print-to-file syntax.
     """
     with open(fn(self.pread_fofn), "w") as f:
         fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
         fn_list.sort()
         for fa_fn in fn_list:
             print >> f, fa_fn
     system("touch %s" % fn(self.cns_done))
Example #28
0
 def gather_qm4(self):
     """Write qm4_fofn: the sorted list of upstream output paths ending in
     'm4', one path per line (Python-2 print-to-file syntax)."""
     all_qf = [fn(o) for o in self.inputs.values()]
     all_qf.sort()
     with open(fn(self.qm4_fofn), "w") as f:
         for m4f in all_qf:
             if m4f.endswith("m4"):
                 print >> f, m4f
Example #29
0
def task_daligner_gather(self):
    """Gather *.las outputs from all daligner jobs into per-block merge dirs.

    Creates m_00001..m_<nblock> beside the da_done sentinel, symlinks each
    daligner .las into the merge dir of its first (left) block number, then
    touches da_done.
    """
    da_done = fn(self.da_done)
    main_dir = os.path.dirname(da_done)
    out_dict = self.inputDataObjs
    nblock = self.parameters['nblock']
    fc_run_logger.debug('nblock=%d, out_dir:\n%s' % (nblock, out_dict))

    # Create m_* dirs.
    for block in xrange(1, nblock + 1):
        mdir = os.path.join(
            main_dir,
            'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
        mkdir(mdir)
        # TODO: Remove existing symlinks?

    # Symlink all daligner *.las.
    # Could be L1.* or preads.*
    # Group 1 captures the first block number; group 2 (optional) the second.
    re_las = re.compile(r'\.(\d*)(\.\d*)?\.las$')
    for dal_done in out_dict.values():
        job_rundir = os.path.dirname(fn(dal_done))
        for las_fn in os.listdir(job_rundir):
            mo = re_las.search(las_fn)
            if not mo:
                continue  # not a .las file
            block = int(
                mo.group(1))  # We will merge in the m_* dir of the left block.
            mdir = os.path.join(
                main_dir, 'm_%05d' %
                block)  # By convention. pbsmrtpipe works differently.
            # Relative link: merge dirs are siblings of the job run dirs.
            las_path = os.path.join('..', os.path.basename(job_rundir), las_fn)
            cmd = 'ln -sf {} {}'.format(las_path, mdir)
            system(cmd)
    system("touch %s" % da_done)  # sentinel: gather complete
Example #30
0
def run_daligner(self):
    """Write the shell script for one daligner job, submit it, and block
    until its done-sentinel appears.

    The generated script runs the prepared daligner command and symlinks
    each produced .las into the matching ../m_<block> merge directory.
    """
    params = self.parameters
    config = params["config"]
    cwd = params["cwd"]
    db_prefix = params["db_prefix"]
    done_fn = fn(self.job_done)
    script_fn = os.path.join(os.path.join(cwd), "rj_%s.sh" % params["job_uid"])

    shell_lines = [
        "set -vex",
        # Leave a *.exit marker on any exit so failures are detectable.
        "trap 'touch %s.exit' EXIT" % done_fn,
        "source %s/bin/activate" % config["install_prefix"],
        "cd %s" % cwd,
        "hostname",
        "date",
        "time " + params["daligner_cmd"],
    ]
    for block_id in xrange(1, params["nblock"] + 1):
        shell_lines.append(""" for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """ % (db_prefix, block_id, db_prefix, block_id))
    shell_lines.append("touch %s" % done_fn)

    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(shell_lines))

    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = config["sge_option_da"]
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_data['job_name'])
Example #31
0
def task_daligner_gather(self):
    """Gather daligner *.las outputs into per-block m_* merge dirs using
    support.daligner_gather_las, then touch the da_done sentinel."""
    da_done = fn(self.da_done)
    main_dir = os.path.dirname(da_done)
    out_dict = self.inputDataObjs
    nblock = self.parameters['nblock']
    fc_run_logger.debug('nblock=%d, out_dir:\n%s' % (nblock, out_dict))

    # Create m_* dirs.
    for block in xrange(1, nblock + 1):
        mdir = os.path.join(
            main_dir,
            'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
        mkdir(mdir)
        # TODO: Remove existing symlinks?
    # One run dir per daligner job (the dir holding each done-sentinel).
    job_rundirs = [
        os.path.dirname(fn(dal_done)) for dal_done in out_dict.values()
    ]

    # Symlink all daligner *.las.
    for block, las_path in support.daligner_gather_las(job_rundirs):
        #fc_run_logger.warning('block: %s, las_path: %s' %(block, las_path))
        mdir = os.path.join(
            main_dir,
            'm_%05d' % block)  # By convention. pbsmrtpipe works differently.
        # Link relative to the merge dir so the tree stays relocatable.
        las_path = os.path.relpath(las_path, mdir)
        cmd = 'ln -sf {} {}'.format(las_path, mdir)
        system(cmd)
    system("touch %s" % da_done)  # sentinel: gather complete
Example #32
0
def build_pdb(self):
    """Write and submit the script that builds the pread DAZZ_DB, splits it,
    and generates the overlap job list (run_jobs.sh); wait for the done-file.
    """
    config = self.parameters["config"]
    work_dir = self.parameters["work_dir"]
    input_fofn_fn = fn(self.pread_fofn)
    done_fn = fn(self.pdb_build_done)
    length_cutoff = config["length_cutoff_pr"]

    script_fn = os.path.join(work_dir, "prepare_pdb.sh")
    shell_lines = [
        "set -vex",
        # Leave a *.exit marker on any exit so failures are detectable.
        "trap 'touch %s.exit' EXIT" % done_fn,
        "source %s/bin/activate" % config["install_prefix"],
        "cd %s" % work_dir,
        "hostname",
        "date",
        "fasta2DB -v preads -f%s" % input_fofn_fn,
        "DBsplit -x%d %s preads" % (length_cutoff, config["ovlp_DBsplit_option"]),
        "HPCdaligner %s -H%d preads > run_jobs.sh" % (config["ovlp_HPCdaligner_option"], length_cutoff),
        "touch %s" % done_fn,
    ]
    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(shell_lines) + "\n")

    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = config["sge_option_pda"]
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_data['job_name'])
Example #33
0
def dist_map(self):
    """Hash-partition seed reads into chunk files, qsub one blasr alignment
    job per chunk, then submit a final held job (-hold_jid) that writes the
    m4_data_done marker when all mapping jobs finish."""

    config = self.config
    dist_map_num_chunk = config["dist_map_num_chunk"]
    directory_for_dist_map = config["directory_for_dist_map"]
    sge_option_ck = config["sge_option_ck"]
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]
    blasr_opt = config["blasr_opt"]

    #set_up_script = "fastasplit %s %s/ -c %d" % (fn(self.seed_fasta), directory_for_dist_map, dist_map_num_chunk)
    #os.system(set_up_script)
    fasta_file = pbcore.io.FastaReader(fn(self.seed_fasta))
    out_files = []
    for i in range(dist_map_num_chunk):
        out_files.append(
            open(
                "%s/%s_chunk_%07d" %
                (directory_for_dist_map, os.path.basename(fn(
                    self.seed_fasta)), i), "w"))
    # Assign each read to a chunk by hashing its name (deterministic split).
    for s in fasta_file:
        g = hash(s.name) % dist_map_num_chunk
        out_file = out_files[g]
        out_file.write(">%s\n" % s.name)
        out_file.write("%s\n" % s.sequence)
    for i in range(dist_map_num_chunk):
        out_files[i].close()
    fasta_file.file.close()

    # Two-stage template: {install_prefix}/{blasr_opt} filled now via .format,
    # the %s/%01d slots filled per-chunk below via the % operator.
    align_script_template = """\
. {install_prefix}/bin/activate 
cd %s/%s
blasr {blasr_opt} -m 4 -out m4_%s.dat %s %s 
""".format(install_prefix=install_prefix, blasr_opt=blasr_opt)

    job_name = "dist_map_" + str(uuid.uuid4())
    i = 0
    for chunk_name in glob.glob(
            "%s/%s_chunk_*" %
        (directory_for_dist_map, os.path.basename(fn(self.seed_fasta)))):
        script = align_script_template % (
            os.getcwd(), directory_for_dist_map, os.path.basename(chunk_name),
            fn(self.normalized_fasta), os.path.basename(chunk_name))
        with open("scripts/dist_map_%02d.sh" % i, "w") as f:
            print >> f, script
        os.system("qsub -N {jn} {sge_option_dm} -o {cwd}/sge_log -j y\
                -S /bin/bash scripts/dist_map_{jid:02d}.sh".format(
            jn=job_name + "_%02d" % i,
            cwd=os.getcwd(),
            sge_option_dm=sge_option_dm,
            jid=i))
        i += 1

    # Held sentinel job: runs only after every dist_map_* job completes.
    with open("scripts/mapping_done.sh", "w") as f:
        print >> f, "echo done > %s" % fn(self.m4_data_done)
    os.system(
        """qsub -sync y {sge_option_ck} -hold_jid "{jn}*" -o {cwd}/sge_log -j y\
               -S /bin/bash scripts/mapping_done.sh""".format(
            jn=job_name, cwd=os.getcwd(), sge_option_ck=sge_option_ck))
Example #34
0
def get_phased_reads(self):
    """Assign each read to a phase block/phase by majority vote over its
    phased variants and write the phased-read table.

    Inputs (paths via fn()):
      q_id_map_file      -- "<int_read_id> <read_name>" per line
      vmap_file          -- variant map: cols 0-2 join to the variant key,
                            col 3 is the integer read id
      phased_variant_file-- "V <block_id> <pos> <variant0> <variant1> ..." lines
    Output: one line per (read, block) with a vote margin > 1:
      "<read_id> <ctg_id> <block> <phase> <votes0> <votes1> <read_name>"
    """
    q_id_map_fn = fn(self.q_id_map_file)
    vmap_fn = fn(self.vmap_file)
    p_variant_fn = fn(self.phased_variant_file)
    parameters = self.parameters

    ctg_id = parameters["ctg_id"]

    phased_read_fn = fn(self.phased_read_file)

    # integer read id -> read name
    rid_map = {}
    with open(q_id_map_fn) as f:
        for l in f:
            l = l.strip().split()
            rid_map[int(l[0])] = l[1]

    read_to_variants = {}
    variant_to_reads = {}
    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            variant = "_".join(l[:3])
            read_id = int(l[3])
            read_to_variants.setdefault(read_id, set())
            read_to_variants[read_id].add(variant)
            variant_to_reads.setdefault(variant, set())
            variant_to_reads[variant].add(read_id)

    # variant key -> (phase_block_id, phase 0|1)
    variant_to_phase = {}
    with open(p_variant_fn) as f:
        for l in f:
            """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
            l = l.strip().split()
            if l[0] != "V":
                continue
            pb_id = int(l[1])
            variant_to_phase[l[3]] = (pb_id, 0)
            variant_to_phase[l[4]] = (pb_id, 1)

    with open(phased_read_fn, "w") as out_f:
        for r in read_to_variants:
            vl = {}  # (phase_block, phase) -> vote count
            pl = set()  # phase blocks touched by this read
            for v in list(read_to_variants[r]):
                if v in variant_to_phase:
                    p = variant_to_phase[v]
                    vl[p] = vl.get(p, 0) + 1
                    pl.add(p[0])
            pl = list(pl)
            pl.sort()
            for p in pl:
                # Require a margin of more than one vote to call the phase.
                if vl.get((p, 0), 0) - vl.get((p, 1), 0) > 1:
                    print >> out_f, r, ctg_id, p, 0, vl.get((p, 0), 0), vl.get(
                        (p, 1), 0), rid_map[r]
                elif vl.get((p, 1), 0) - vl.get((p, 0), 0) > 1:
                    print >> out_f, r, ctg_id, p, 1, vl.get((p, 0), 0), vl.get(
                        (p, 1), 0), rid_map[r]
Example #35
0
 def h5fofn_to_fasta(self):
     """Dump FASTA files from the .h5 FOFN, then build sorted target/query
     FOFNs and touch the fasta_dump_done sentinel."""
     # NOTE(review): by argument order, --min_seed_length receives
     # parameters["min_length"] while --min_length is hard-coded to 500 --
     # confirm that is intended.
     os.system("h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\
                (fn(self.input_fofn), 
                 self.parameters["fasta_dir"], 
                 self.parameters["min_length"],
                 self.parameters["min_read_score"]))
     # *_t.fa -> target FOFN, *_q.fa -> query FOFN (sorted for determinism).
     os.system("""find %s -name "*_t.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.target_fa_fofn)))
     os.system("""find %s -name "*_q.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.query_fa_fofn)))
     os.system("touch %s" % fn(self.fasta_dump_done))
Example #36
0
def get_phased_reads(self):
    """Assign each read to a phase block/phase by majority vote over its
    phased variants and write the phased-read table.

    Inputs (paths via fn()):
      q_id_map_file      -- "<int_read_id> <read_name>" per line
      vmap_file          -- variant map: cols 0-2 join to the variant key,
                            col 3 is the integer read id
      phased_variant_file-- "V <block_id> <pos> <variant0> <variant1> ..." lines
    Output: one line per (read, block) with a vote margin > 1:
      "<read_id> <ctg_id> <block> <phase> <votes0> <votes1> <read_name>"
    """
    q_id_map_fn = fn(self.q_id_map_file)
    vmap_fn = fn(self.vmap_file)
    p_variant_fn = fn(self.phased_variant_file)

    # BUG FIX: `parameters` was read below without ever being bound,
    # raising NameError at runtime; bind it from self.parameters as the
    # sibling implementation of this task does.
    parameters = self.parameters
    ctg_id = parameters["ctg_id"]

    phased_read_fn = fn(self.phased_read_file)

    # integer read id -> read name
    rid_map = {}
    with open(q_id_map_fn) as f:
        for l in f:
            l = l.strip().split()
            rid_map[int(l[0])] = l[1]

    read_to_variants = {}
    variant_to_reads = {}
    with open(vmap_fn) as f: 
        for l in f:
            l = l.strip().split()
            variant = "_".join(l[:3])
            read_id = int(l[3])
            read_to_variants.setdefault(read_id, set())
            read_to_variants[read_id].add(variant)
            variant_to_reads.setdefault(variant, set())
            variant_to_reads[variant].add(read_id)

    # variant key -> (phase_block_id, phase 0|1)
    variant_to_phase = {}
    with open(p_variant_fn) as f:
        for l in f:
            """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
            l = l.strip().split()
            if l[0] != "V":
                continue
            pb_id = int(l[1])
            variant_to_phase[ l[3] ] = (pb_id, 0)
            variant_to_phase[ l[4] ] = (pb_id, 1)

    with open(phased_read_fn, "w") as out_f:
        for r in read_to_variants:
            vl = {}  # (phase_block, phase) -> vote count
            pl = set()  # phase blocks touched by this read
            for v in list( read_to_variants[r] ):
                if v in variant_to_phase:
                    p = variant_to_phase[v]
                    vl[ p ] = vl.get(p, 0) + 1
                    pl.add(p[0])
            pl = list(pl)
            pl.sort()
            for p in pl:
                # Require a margin of more than one vote to call the phase.
                if vl.get( (p,0), 0) - vl.get( (p,1), 0) > 1:
                    print >> out_f, r, ctg_id, p, 0, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
                elif vl.get( (p,1), 0) - vl.get( (p,0), 0) > 1:
                    print >> out_f, r, ctg_id, p, 1, vl.get( (p,0), 0), vl.get( (p,1), 0), rid_map[r]
Example #37
0
    def split_fofn_task(self):
        """Split the query and target FOFNs into mapping chunks under
        dist_map_dir, then touch the split_fofn_done sentinel."""
        query_chunk_size = self.parameters["config"]["q_chunk_size"]
        target_chunk_size = self.parameters["config"]["t_chunk_size"]
        # NOTE(review): incremental/allow_fraction semantics are defined by
        # split_fofn (elsewhere in this project) -- confirm before relying on them.
        split_fofn( fn(self.query_fa_fofn), self.parameters["dist_map_dir"], "query", query_chunk_size, 
                    incremental = True, allow_fraction = True)
        split_fofn( fn(self.target_fa_fofn), self.parameters["dist_map_dir"], "target", target_chunk_size, 
                    incremental = True, allow_fraction = True)

        os.system("touch %s" % fn(self.split_fofn_done))
Example #38
0
 def h5fofn_to_fasta(self):
     """Dump FASTA files from the .h5 FOFN, then build sorted target/query
     FOFNs and touch the fasta_dump_done sentinel."""
     # NOTE(review): by argument order, --min_seed_length receives
     # parameters["min_length"] while --min_length is hard-coded to 500 --
     # confirm that is intended.
     os.system("h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\
                (fn(self.input_fofn), 
                 self.parameters["fasta_dir"], 
                 self.parameters["min_length"],
                 self.parameters["min_read_score"]))
     # *_t.fa -> target FOFN, *_q.fa -> query FOFN (sorted for determinism).
     os.system("""find %s -name "*_t.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.target_fa_fofn)))
     os.system("""find %s -name "*_q.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.query_fa_fofn)))
     os.system("touch %s" % fn(self.fasta_dump_done))
Example #39
0
    def split_fofn_task(self):
        """Split the query and target FOFNs into mapping chunks under
        dist_map_dir, then touch the split_fofn_done sentinel."""
        query_chunk_size = self.parameters["config"]["q_chunk_size"]
        target_chunk_size = self.parameters["config"]["t_chunk_size"]
        # NOTE(review): incremental/allow_fraction semantics are defined by
        # split_fofn (elsewhere in this project) -- confirm before relying on them.
        split_fofn( fn(self.query_fa_fofn), self.parameters["dist_map_dir"], "query", query_chunk_size, 
                    incremental = True, allow_fraction = True)
        split_fofn( fn(self.target_fa_fofn), self.parameters["dist_map_dir"], "target", target_chunk_size, 
                    incremental = True, allow_fraction = True)

        os.system("touch %s" % fn(self.split_fofn_done))
def build_rdb(self):
    """Write and submit prepare_db.sh: build/extend the raw_reads DAZZ_DB
    and generate the daligner job list (run_jobs.sh); wait for the done-file.

    Incremental: if raw_reads.db already exists, its previous "blocks ="
    count is read so HPCdaligner only covers blocks from last_block onward,
    and DBsplit is skipped.
    """
    input_fofn_fn = fn(self.input_fofn)
    rdb_build_done = self.rdb_build_done
    work_dir = self.parameters["work_dir"]
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    length_cutoff = config["length_cutoff"]
    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
    pa_DBsplit_option = config["pa_DBsplit_option"]
    openending = config["openending"]

    script_fn = os.path.join( work_dir, "prepare_db.sh" )

    # Detect an existing DB so the (re)build can be incremental.
    last_block = 1
    new_db = True
    db_path = os.path.join(work_dir, "raw_reads.db")
    if os.path.exists(db_path):
        with open(db_path) as f:
            for l in f:
                l = l.strip().split()
                # BUG FIX: guard the field count -- a blank or short line in
                # raw_reads.db used to raise IndexError on l[0]/l[1]/l[2].
                if len(l) >= 3 and l[0] == "blocks" and l[1] == "=":
                    last_block = int(l[2])
                    new_db = False
                    break

    with open(script_fn,"w") as script_file:
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
        script_file.write("hostname >> db_build.log\n")
        script_file.write("date >> db_build.log\n")
        script_file.write("for f in `cat {input_fofn_fn}`; do fasta2DB raw_reads $f; done >> db_build.log \n".format(input_fofn_fn = input_fofn_fn))
        if new_db  == True:
            script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
        # LB: the "blocks" count from raw_reads.db; open-ended mode uses one less.
        if openending == True:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3-1}')\n""")
        else:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3}')\n""")
        script_file.write("HPCdaligner %s -H%d raw_reads %d-$LB > run_jobs.sh\n" % (pa_HPCdaligner_option, length_cutoff, last_block))

        script_file.write("touch {rdb_build_done}\n".format(rdb_build_done = fn(rdb_build_done)))

    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": os.getcwd(),
                "sge_option": sge_option_da,
                "script_fn": script_fn }
    run_script(job_data, job_type = config["job_type"])
    wait_for_file( fn(rdb_build_done), task=self, job_name=job_name )
Example #41
0
def check_r_cns_task(self):
    """Write the sorted pread FASTA paths into preads_fofn, touch the cns
    done-sentinel, and record a no-op script (consensus already finished)."""
    with open(fn(self.preads_fofn), "w") as f:
        for fa_fn in sorted(fn(plf) for plf in self.inputs.values()):
            print >> f, fa_fn
    wdir = os.path.dirname(fn(self.cns_done))
    #mkdir(wdir) We SHOULD need this! TODO
    system("touch %s" % fn(self.cns_done))
    script_fn = os.path.join(wdir, 'noop.sh')
    # BUG FIX: open().write() left the handle unclosed (write could be
    # delayed/lost on non-refcounting interpreters); use a context manager.
    with open(script_fn, 'w') as script_file:
        script_file.write('echo NOOP raw')
    self.generated_script_fn = script_fn
Example #42
0
    def get_rid_to_phase_all(self):
        """Concatenate all input rid-to-phase files (in sorted path order)
        into the single rid_to_phase_all output file."""
        rid_to_phase_all_fn = fn(self.rid_to_phase_all)
        inputs_fn = sorted(fn(f) for f in self.inputs.values())
        chunks = []
        for fname in inputs_fn:
            # BUG FIX: the old code extend()ed the file text character by
            # character and never closed either the input handles or the
            # output; append whole texts under context managers instead.
            # The joined output is byte-identical.
            with open(fname) as in_f:
                chunks.append(in_f.read())

        with open(rid_to_phase_all_fn, "w") as out_f:
            out_f.write("".join(chunks))
Example #43
0
def prepare_seed_reads(self):
    """Copy reads longer than length_cutoff from the normalized FASTA into
    the seed FASTA, uppercasing their sequences."""
    length_cutoff = self.config["length_cutoff"]

    # NOTE(review): the FastaReader handle is not closed here (matches the
    # original behavior) -- it is left to garbage collection.
    reader = pbcore.io.FastaReader(fn(self.normalized_fasta))
    with open(fn(self.seed_fasta), 'w') as seed_out:
        for record in reader:
            if len(record.sequence) <= length_cutoff:
                continue
            seed_out.write(">%s\n" % record.name)
            seed_out.write("%s\n" % record.sequence.upper())
Example #44
0
def get_preassembled_reads(self):
    """Submit one generate_preassemble_reads.py job per chunk to SGE, then a
    final held job (-hold_jid, -sync y) that writes the preassembly_done
    marker once every chunk job has finished."""

    config = self.config
    directory_for_dist_map = config["directory_for_dist_map"]
    sge_option_ck = config["sge_option_ck"]
    sge_option_pa = config["sge_option_pa"]
    bestn = config["bestn"]
    tmpdir = config["tmpdir"]
    install_prefix = config["install_prefix"]
    num_chunk = config["preassembly_num_chunk"]
    min_cov = config["min_cov"]
    max_cov = config["max_cov"]
    trim_align = config["trim_align"]
    trim_plr = config["trim_plr"]
    q_nproc = config["q_nproc"]

    #set_up_script = "cp generate_preassemble_reads.py %s/" % directory_for_dist_map
    #os.system(set_up_script)
    # %-template: activate env, cd to the chunk dir, and log start/end
    # around the per-chunk command (8 positional slots filled below).
    SGE_script_template = """. %s/bin/activate
cd %s/%s
echo start: `date` > %01d"_job.log"
hostname >> %01d"_job.log"
ls -l m4*.dat >> %01d"_job.log"
%s >> %01d"_job.log"
echo end: `date` >> %01d"_job.log"
"""

    job_name = "preassembly_" + str(uuid.uuid4())
    for j_id in range(0, num_chunk):
        #TODO: use real template lib

        g_plr_str = "generate_preassemble_reads.py %01d %s %s %d %s %d %d %d %d %d %d" % (
            j_id, fn(self.normalized_fasta), fn(self.seed_fasta), bestn,
            tmpdir, num_chunk, min_cov, max_cov, trim_align, trim_plr, q_nproc)

        script = SGE_script_template % (install_prefix, os.getcwd(),
                                        directory_for_dist_map, j_id, j_id,
                                        j_id, g_plr_str, j_id, j_id)

        with open("scripts/preassembly_%02d.sh" % j_id, "w") as f:
            print >> f, script

        os.system("qsub -N {jn} {sge_option_pa} -o {cwd}/sge_log -j y\
                -S /bin/bash scripts/preassembly_{jid:02d}.sh".format(
            jn=job_name + "_%02d" % j_id,
            cwd=os.getcwd(),
            sge_option_pa=sge_option_pa,
            jid=j_id))

    # Held sentinel job: -sync y blocks this call until it completes.
    with open("scripts/preassembly_done.sh", "w") as f:
        print >> f, "echo done > %s" % fn(self.preassembly_done)
    os.system(
        """qsub -sync y  {sge_option_ck} -hold_jid "{jn}*" -o {cwd}/sge_log -j y -S /bin/bash scripts/preassembly_done.sh"""
        .format(jn=job_name, cwd=os.getcwd(), sge_option_ck=sge_option_ck))
Example #45
0
def prepare_seed_reads(self):
    """Copy reads longer than length_cutoff from the normalized FASTA into
    the seed FASTA, uppercasing their sequences."""

    config = self.config
    length_cutoff = config["length_cutoff"]

    # NOTE(review): the FastaReader handle is never closed -- left to GC.
    f = pbcore.io.FastaReader(fn(self.normalized_fasta))
    with open(fn(self.seed_fasta),'w') as sfasta:
        for r in f:
            if len(r.sequence) > length_cutoff:
                sfasta.write( ">%s\n" % r.name )
                sfasta.write( "%s\n" % r.sequence.upper() )
Example #46
0
def task_run_quiver(self):
    """Write and submit cns_<ctg>.sh: align reads to one contig with pbalign
    and polish it with variantCaller (quiver), then wait for job_done.

    Produces aln-<ctg>.bam plus the consensus cns_fasta/cns_fastq outputs.
    """
    ref_fasta = fn(self.ref_fasta)
    read_sam = fn(self.read_sam)

    cns_fasta = fn(self.cns_fasta)
    cns_fastq = fn(self.cns_fastq)
    job_done = fn(self.job_done)

    wd = self.parameters["wd"]
    config = self.parameters["config"]
    ctg_id = self.parameters["ctg_id"]

    smrt_bin = config["smrt_bin"]
    sge_quiver = config["sge_quiver"]
    job_type = config["job_type"]
    samtools = os.path.join( smrt_bin, "samtools")
    pbalign = os.path.join( smrt_bin, "pbalign")
    makePbi = os.path.join( smrt_bin, "makePbi")
    variantCaller = os.path.join( smrt_bin, "variantCaller")

    script_dir = os.path.join( wd )
    script_fn =  os.path.join( script_dir , "cns_%s.sh" % (ctg_id))

    script = []
    script.append( "set -vex" )
    # Leave a *.exit marker on any exit so failures are detectable.
    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
    script.append( "cd %s" % wd )
    script.append( "hostname" )
    script.append( "date" )
    script.append( "cd {wd}".format(wd = wd) )

    script.append( "{samtools} faidx {ref_fasta}".format( samtools=samtools, ref_fasta=ref_fasta ) )
    script.append( "{samtools} view -b -S {read_sam} > {ctg_id}.bam".format( samtools=samtools, read_sam = read_sam, ctg_id = ctg_id ) )
    script.append( "{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
            --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
            --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
            {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam".format( pbalign=pbalign , ctg_id = ctg_id, ref_fasta = ref_fasta)) 
    script.append( "#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam".format(makePbi = makePbi, ref_fasta = ref_fasta, ctg_id = ctg_id) ) 
    # "|| echo" keeps the script going (and the trap benign) if polishing fails.
    # BUG FIX: the failure message was misspelled "quvier failed".
    script.append( "({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
            -o {cns_fasta} -o {cns_fastq}) || echo quiver failed".format( variantCaller = variantCaller, ctg_id = ctg_id, ref_fasta = ref_fasta, 
                                                   cns_fasta=cns_fasta, cns_fastq=cns_fastq ))

    script.append( "date" )
    script.append( "touch {job_done}".format(job_done = job_done) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script) + '\n')

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_quiver
    run_script(job_data, job_type = job_type)
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
Example #47
0
def dist_map(self):
    """Hash-partition seed reads into chunk files, qsub one blasr alignment
    job per chunk, then submit a final held job (-hold_jid) that writes the
    m4_data_done marker when all mapping jobs finish."""

    config = self.config
    dist_map_num_chunk = config["dist_map_num_chunk"]
    directory_for_dist_map = config["directory_for_dist_map"]
    sge_option_ck = config["sge_option_ck"]
    sge_option_dm = config["sge_option_dm"]
    install_prefix = config["install_prefix"]
    blasr_opt = config["blasr_opt"]

    #set_up_script = "fastasplit %s %s/ -c %d" % (fn(self.seed_fasta), directory_for_dist_map, dist_map_num_chunk)
    #os.system(set_up_script)
    fasta_file = pbcore.io.FastaReader(fn(self.seed_fasta))
    out_files = []
    for i in range(dist_map_num_chunk):
        out_files.append( open( "%s/%s_chunk_%07d" % (directory_for_dist_map, os.path.basename(fn(self.seed_fasta)), i), "w"))
    # Assign each read to a chunk by hashing its name (deterministic split).
    for s in fasta_file:
        g = hash(s.name) % dist_map_num_chunk
        out_file = out_files[g]
        out_file.write(">%s\n" % s.name)
        out_file.write("%s\n" % s.sequence)
    for i in range(dist_map_num_chunk):
        out_files[i].close()
    fasta_file.file.close()


    # Two-stage template: {install_prefix}/{blasr_opt} filled now via .format,
    # the %s slots filled per-chunk below via the % operator.
    align_script_template = """\
. {install_prefix}/bin/activate 
cd %s/%s
blasr {blasr_opt} -m 4 -out m4_%s.dat %s %s 
""".format(install_prefix = install_prefix, blasr_opt=blasr_opt)

    job_name = "dist_map_"+str(uuid.uuid4())
    i = 0
    for chunk_name in glob.glob("%s/%s_chunk_*" % ( directory_for_dist_map, os.path.basename(fn(self.seed_fasta))) ):
        script = align_script_template % (os.getcwd(), directory_for_dist_map, 
                                          os.path.basename(chunk_name), 
                                          fn(self.normalized_fasta), 
                                          os.path.basename(chunk_name))
        with open("scripts/dist_map_%02d.sh" % i,"w") as f:
            print >>f, script
        os.system("qsub -N {jn} {sge_option_dm} -o {cwd}/sge_log -j y\
                -S /bin/bash scripts/dist_map_{jid:02d}.sh".format(jn=job_name+"_%02d" % i, 
                                                                   cwd=os.getcwd(), 
                                                                   sge_option_dm = sge_option_dm, 
                                                                   jid=i))
        i += 1

    # Held sentinel job: runs only after every dist_map_* job completes.
    with open("scripts/mapping_done.sh","w") as f:
        print >>f, "echo done > %s" % fn(self.m4_data_done)
    os.system("""qsub -sync y {sge_option_ck} -hold_jid "{jn}*" -o {cwd}/sge_log -j y\
               -S /bin/bash scripts/mapping_done.sh""".format(jn=job_name, cwd=os.getcwd(), sge_option_ck=sge_option_ck))
Example #48
0
def build_rdb(self):  #essential the same as build_rdb() but the subtle differences are tricky to consolidate to one function
    """Write and submit prepare_rdb.sh: build/extend the raw_reads DAZZ_DB
    (fasta2DB -f), split it, and generate the daligner job list; wait for
    the done-file.

    Incremental: if raw_reads.db already exists, its previous "blocks ="
    count is read so HPCdaligner only covers blocks from last_block onward,
    and DBsplit is skipped.
    """
    input_fofn_fn = fn(self.input_fofn)
    rdb_build_done = self.rdb_build_done
    work_dir = self.parameters["work_dir"]
    config = self.parameters["config"]
    sge_option_da = config["sge_option_da"]
    install_prefix = config["install_prefix"]
    length_cutoff = config["length_cutoff"]
    pa_HPCdaligner_option = config["pa_HPCdaligner_option"]
    pa_DBsplit_option = config["pa_DBsplit_option"]
    openending = config["openending"]

    script_fn = os.path.join( work_dir, "prepare_rdb.sh" )

    # Detect an existing DB so the (re)build can be incremental.
    last_block = 1
    new_db = True
    db_path = os.path.join(work_dir, "raw_reads.db")
    if os.path.exists(db_path):
        with open(db_path) as f:
            for l in f:
                l = l.strip().split()
                # BUG FIX: guard the field count -- a blank or short line in
                # raw_reads.db used to raise IndexError on l[0]/l[1]/l[2].
                if len(l) >= 3 and l[0] == "blocks" and l[1] == "=":
                    last_block = int(l[2])
                    new_db = False
                    break

    with open(script_fn,"w") as script_file:
        script_file.write("set -vex\n")
        # Leave a *.exit marker on any exit so failures are detectable.
        script_file.write("trap 'touch {rdb_build_done}.exit' EXIT\n".format(rdb_build_done = fn(rdb_build_done)))
        script_file.write("source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix))
        script_file.write("cd {work_dir}\n".format(work_dir = work_dir))
        script_file.write("hostname\n")
        script_file.write("date\n")
        script_file.write("fasta2DB -v raw_reads -f{input_fofn_fn}\n".format(input_fofn_fn = input_fofn_fn))
        if new_db  == True:
            script_file.write("DBsplit %s raw_reads\n" % pa_DBsplit_option)
        # LB: the "blocks" count from raw_reads.db; open-ended mode uses one less.
        if openending == True:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3-1}')\n""")
        else:
            script_file.write("""LB=$(cat raw_reads.db | awk '$1 == "blocks" {print $3}')\n""")
        script_file.write("HPCdaligner %s -H%d raw_reads %d-$LB > run_jobs.sh\n" % (pa_HPCdaligner_option, length_cutoff, last_block))
        script_file.write("touch {rdb_build_done}\n".format(rdb_build_done = fn(rdb_build_done)))

    job_data = make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_option_da
    run_script(job_data, job_type = config["job_type"])
    wait_for_file(fn(rdb_build_done), task=self, job_name=job_data['job_name'])
Example #49
0
def task_run_blasr(self):
    """Write and submit aln_<ctg>.sh: align reads to one contig with blasr,
    sort/index the resulting BAM, then wait for the job_done sentinel."""

    job_done = fn(self.job_done)
    ref_fasta = fn(self.ref_fasta)
    read_fasta = fn(self.read_fasta)

    # NOTE(review): job_uid is fetched but unused in this task -- confirm.
    job_uid = self.parameters["job_uid"]
    wd = self.parameters["wd"]
    ctg_id = self.parameters["ctg_id"]

    config = self.parameters["config"]
    smrt_bin = config["smrt_bin"]
    sge_blasr_aln = config["sge_blasr_aln"]
    job_type = config["job_type"]
    blasr = os.path.join(smrt_bin, "blasr")
    samtools = os.path.join(smrt_bin, "samtools")

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir,
                             "aln_{ctg_id}.sh".format(ctg_id=ctg_id))

    script = []
    script.append("set -vex")
    # Leave a *.exit marker on any exit so failures are detectable.
    script.append(
        "trap 'touch {job_done}.exit' EXIT".format(job_done=job_done))
    script.append("cd %s" % wd)
    script.append("hostname")
    script.append("date")
    script.append("cd {wd}".format(wd=wd))
    script.append(
        "time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
 -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
 -minMatch 12  -nproc 24 -sam -out tmp_aln.sam".format(blasr=blasr,
                                                       read_fasta=read_fasta,
                                                       ref_fasta=ref_fasta))

    # SAM -> sorted, indexed BAM; the intermediate SAM is removed afterwards.
    script.append(
        "{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted".
        format(samtools=samtools, ctg_id=ctg_id))
    script.append("{samtools} index {ctg_id}_sorted.bam".format(
        samtools=samtools, ctg_id=ctg_id))
    script.append("rm tmp_aln.sam")
    script.append("date")
    script.append("touch {job_done}".format(job_done=job_done))

    with open(script_fn, "w") as script_file:
        script_file.write("\n".join(script) + '\n')

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_blasr_aln
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
Example #50
0
def make_fofn_abs(self):
    """Copy i_fofn to o_fofn, expanding each (possibly relative) filename
    to an absolute path under the current working directory.
    """
    src_fn = fn(self.i_fofn)
    dst_fn = fn(self.o_fofn)
    #cwd = self.parameters["cwd"]

    # Refuse to clobber the input in place.
    assert os.path.abspath(dst_fn) != os.path.abspath(src_fn)
    with open(src_fn) as src, open(dst_fn, 'w') as dst:
        for raw_line in src:
            path = raw_line.strip()
            if path:  # skip blank lines
                dst.write('%s\n' % os.path.abspath(path))
Example #51
0
def make_fofn_abs(self):
    """Copy i_fofn to o_fofn, but with relative filenames expanded for CWD.
    """
    i_fofn_fn = fn(self.i_fofn)
    o_fofn_fn = fn(self.o_fofn)
    #cwd = self.parameters["cwd"]

    # Refuse to clobber the input in place.
    assert os.path.abspath(o_fofn_fn) != os.path.abspath(i_fofn_fn)
    with open(i_fofn_fn) as ifs, open(o_fofn_fn, 'w') as ofs:
        for line in ifs:
            ifn = line.strip()
            if not ifn: continue  # skip blank lines
            abs_ifn = os.path.abspath(ifn)
            ofs.write('%s\n' % abs_ifn)
Example #52
0
def task_phasing(self):
    """Write and submit p_<ctg>.sh: run fc_phasing.py on one contig's BAM and
    map the phases back onto reads, then wait for the job_done sentinel."""

    ref_fasta = fn(self.ref_fasta)
    aln_bam = fn(self.aln_bam)

    job_done = fn(self.job_done)

    # NOTE(review): job_uid is fetched but unused in this task -- confirm.
    job_uid = self.parameters["job_uid"]
    wd = self.parameters["wd"]
    ctg_id = self.parameters["ctg_id"]

    config = self.parameters["config"]
    sge_phasing = config["sge_phasing"]
    job_type = config["job_type"]

    script_dir = os.path.join( wd )
    script_fn =  os.path.join( script_dir , "p_%s.sh" % (ctg_id))

    script = []

    script.append( "set -vex" )
    # Leave a *.exit marker on any exit so failures are detectable.
    script.append( "trap 'touch {job_done}.exit' EXIT".format(job_done = job_done) )
    script.append( "cd %s" % wd )
    script.append( "hostname" )
    script.append( "date" )
    script.append( "cd {wd}".format(wd = wd) )
    script.append( "fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir ../".format( aln_bam = aln_bam,
                                                                                                                ref_fasta = ref_fasta,
                                                                                                                ctg_id = ctg_id ))
    script.append( "fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads".format(ctg_id = ctg_id) )
    #script.append( "fc_ovlp_filter_with_phase.py --fofn ../../2-asm-falcon/las.fofn\
    #        --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500\
    #        --db ../../1-preads_ovl/preads.db  --rid_phase_map ./rid_to_phase > preads.p_ovl") #TODO: make it configurable
    #script.append( "fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log" )
    #script.append( "fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_phase_path ./ --ctg_id {ctg_id}\
    #        --rid_phase_map ./rid_to_phase --fasta ../../1-preads_ovl/preads4falcon.fasta".format(ctg_id = ctg_id))

    #script.append( "fc_dedup_h_tigs.py" )
    script.append( "date" )
    script.append( "touch {job_done}".format(job_done = job_done) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script) + '\n')

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = sge_phasing
    run_script(job_data, job_type = job_type)
    wait_for_file(job_done, task=self, job_name=job_data['job_name'])
Example #53
0
def run_p_task(self):
    """Write a shell wrapper around the pre-assembly script (`p_file`) for one
    job, submit it to SGE, and block until the job-done sentinel file appears.
    """
    params = self.parameters
    work_dir = params["cwd"]
    task_id = params["job_id"]
    shell_fn = os.path.join(work_dir, "rp_%05d.sh" % task_id)
    log_fn = os.path.join(work_dir, "rp_%05d.log" % task_id)
    done_fn = fn(self.job_done)

    # NOTE(review): the DALIGNER PATH and the SGE options are hard-coded here,
    # unlike sibling tasks that read them from the config dict -- this task's
    # parameters carry no "config" entry, so they cannot be looked up.
    lines = [
        "export PATH=~/task2014/dazzler/DALIGNER/:$PATH",
        "cd %s" % work_dir,
        ("/usr/bin/time bash %s " % params["p_file"]) + (" >& %s " % log_fn) + (" && touch %s" % done_fn),
    ]
    with open(shell_fn, "w") as out:
        out.write("\n".join(lines))

    # Unique-ish job name: last URL component plus a short uuid1 suffix.
    job_name = self.URL.split("/")[-1] + "-" + str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": work_dir,
                "sge_option": " -pe smp 2 -q huasm ",
                "script_fn": shell_fn}
    run_script(job_data, job_type="SGE")
    wait_for_file(done_fn, task=self, job_name=job_name)
Example #54
0
def run_consensus_task(self):
    """Generate and submit the consensus (error-correction) job for one block.

    Writes two scripts into `cwd`:
      * cp_%05d.sh -- pipes LA4Falcon output for this block into
        fc_consensus.py, producing the corrected reads in self.out_file;
      * c_%05d.sh  -- the submitted wrapper that runs the inner script under
        /usr/bin/time and touches the c_%05d_done sentinel on success.
    Blocks until the sentinel file appears.
    """
    job_id = self.parameters["job_id"]
    cwd = self.parameters["cwd"]
    config = self.parameters["config"]
    sge_option_cns = config["sge_option_cns"]
    install_prefix = config["install_prefix"]
    script_dir = os.path.join( cwd )
    script_fn =  os.path.join( script_dir , "c_%05d.sh" % (job_id))
    log_path = os.path.join( script_dir, "c_%05d.log" % (job_id))
    prefix = self.parameters["prefix"]
    falcon_sense_option = config["falcon_sense_option"]
    length_cutoff = config["length_cutoff"]

    # Inner script.  The trailing comma on the LA4Falcon print keeps the
    # pipe and the fc_consensus.py command on one shell line.
    with open( os.path.join(cwd, "cp_%05d.sh" % job_id), "w") as c_script:
        print >> c_script, "source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix)
        print >> c_script, "cd .."
        print >> c_script, """LA4Falcon -H%d -o -f:%s las_files/%s.%d.las | """ % (length_cutoff, prefix, prefix, job_id),
        print >> c_script, """fc_consensus.py %s > %s""" % (falcon_sense_option, fn(self.out_file))

    # Outer wrapper: run the inner script, capture its output, and drop the
    # done-sentinel only if it succeeded.
    script = []
    script.append( "source {install_prefix}/bin/activate\n".format(install_prefix = install_prefix) )
    script.append( "cd %s" % cwd )
    script.append( ("/usr/bin/time bash cp_%05d.sh " % job_id )  + ( " >& %s " % log_path ) + ( " && touch c_%05d_done" % job_id  ) )

    with open(script_fn,"w") as script_file:
        script_file.write("\n".join(script))

    job_name = self.URL.split("/")[-1]
    job_name += "-"+str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "sge_option": sge_option_cns,
                "script_fn": script_fn }
    # Fix: honor config["job_type"] like run_merge_task / run_daligner do,
    # instead of always forcing SGE; default to "SGE" so existing configs
    # without the key behave exactly as before.
    run_script(job_data, job_type = config.get("job_type", "SGE"))
    wait_for_file( os.path.join(cwd,"c_%05d_done" % job_id) , task=self, job_name=job_name )
Example #55
0
    def build_p_rdb_task(self):
        """Build the Dazzler database of normalized, length-filtered preads.

        For each FASTA file listed in self.pread_fofn: drop reads shorter
        than length_cutoff_pr or containing non-ACGT bases, rewrite the
        survivors with canonical "prolog" headers wrapped at 80 columns, and
        import each normalized file into the `preads` DB with fasta2DB.
        Finally runs DBsplit / HPCdaligner and touches rdb_build_done.
        """
        config = self.parameters["config"]
        pread_dir = self.parameters["pread_dir"]
        fa_serial = 0
        for fa_fn in open(fn(self.pread_fofn)).readlines():
            fa_fn = fa_fn.strip()
            c = 0
            fa_serial += 1
            with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
                f = FastaReader(fa_fn)
                for r in f:
                    if len(r.sequence) < config["length_cutoff_pr"]:
                        continue
                    # Drop any read containing a non-ACGT base.
                    ignore_read = False
                    for cc in r.sequence:
                        if cc not in ["A","C","G","T"]:
                            ignore_read = True
                            break
                    if ignore_read:
                        continue
                    print >> p_norm, ">prolog_%05d/%d/%d_%d" % (fa_serial, c, 0, len(r.sequence) )
                    # Wrap the sequence at 80 columns.  The tail offset is
                    # computed from len() rather than the loop index: the old
                    # code sliced with the last loop index `i`, which was
                    # undefined (or stale from the previous read) whenever a
                    # read was shorter than 80 bases.
                    n_full = len(r.sequence) // 80
                    for i in range(0, n_full):
                        print >> p_norm, r.sequence[ i *80 : (i + 1) * 80]
                    print >> p_norm, r.sequence[n_full * 80:]
                    c += 1
            os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial) )

        os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
        os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
        os.system("cd %s; touch rdb_build_done" % pread_dir)
Example #56
0
def run_merge_task(self):
    """Submit the LAS-merge wrapper script for one merge job and block until
    its job-done sentinel file appears.
    """
    params = self.parameters
    config = params["config"]
    work_dir = params["cwd"]
    merge_id = params["job_id"]

    shell_fn = os.path.join(work_dir, "rp_%05d.sh" % merge_id)
    log_fn = os.path.join(work_dir, "rp_%05d.log" % merge_id)
    done_fn = fn(self.job_done)

    # Wrapper script: activate the install env, log host/date, then run the
    # pre-generated merge script and touch the sentinel on success.
    lines = [
        "source {install_prefix}/bin/activate\n".format(install_prefix=config["install_prefix"]),
        "cd %s" % work_dir,
        "hostname >> %s" % log_fn,
        "date >> %s" % log_fn,
        ("/usr/bin/time bash %s " % params["merge_script"]) + (" >> %s 2>&1" % log_fn) + (" && touch %s" % done_fn),
    ]
    with open(shell_fn, "w") as out:
        out.write("\n".join(lines))

    job_name = self.URL.split("/")[-1] + "-" + str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": work_dir,
                "sge_option": config["sge_option_la"],
                "script_fn": shell_fn}
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_name)
Example #57
0
def run_daligner(self):
    """Submit the wrapper script for one daligner job and wait for it.

    After daligner finishes, the script symlinks each produced .las file
    into the merge directory (../m_%05d) of its target block.
    """
    params = self.parameters
    config = params["config"]
    work_dir = params["cwd"]
    job_uid = params["job_uid"]
    db_prefix = params["db_prefix"]
    nblock = params["nblock"]

    shell_fn = os.path.join(work_dir, "rj_%s.sh" % job_uid)
    log_fn = os.path.join(work_dir, "rj_%s.log" % job_uid)
    done_fn = fn(self.job_done)

    lines = [
        "source {install_prefix}/bin/activate\n".format(install_prefix=config["install_prefix"]),
        "cd %s" % work_dir,
        "hostname >> %s" % log_fn,
        "date >> %s" % log_fn,
        "/usr/bin/time " + params["daligner_cmd"] + (" >> %s 2>&1 " % log_fn) + (" && touch %s" % done_fn),
    ]
    # One symlink pass per target block 1..nblock.
    for p_id in xrange(1, nblock + 1):
        lines.append(""" for f in `find $PWD -wholename "*%s.%d.%s.*.*.las"`; do ln -sf $f ../m_%05d; done """ % (db_prefix, p_id, db_prefix, p_id))

    with open(shell_fn, "w") as out:
        out.write("\n".join(lines))

    job_name = self.URL.split("/")[-1] + "-" + str(uuid.uuid4())[:8]
    job_data = {"job_name": job_name,
                "cwd": work_dir,
                "sge_option": config["sge_option_da"],
                "script_fn": shell_fn}
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_name)
Example #58
0
def task_run_daligner(self):
    """Delegate script generation for one daligner job to support.run_daligner,
    then submit the generated script and wait for the job-done sentinel.
    """
    params = self.parameters
    config = params["config"]
    done_fn = fn(self.job_done)
    script_fn = os.path.join(params["cwd"], "rj_%s.sh" % params["job_uid"])

    # support.run_daligner writes the wrapper script to script_fn.
    support.run_daligner(
        daligner_cmd=params["daligner_cmd"],
        db_prefix=params["db_prefix"],
        nblock=params["nblock"],
        config=config,
        job_done=done_fn,
        script_fn=script_fn,
    )

    job_data = support.make_job_data(self.URL, script_fn)
    job_data["sge_option"] = config["sge_option_da"]
    run_script(job_data, job_type=config["job_type"])
    wait_for_file(done_fn, task=self, job_name=job_data["job_name"])