def bwa_mem(fastq_list, genome, name, number, data_type, thread, job_type,
            concurrent, refresh, work_dir, out_dir):

    genome, fastq_list = check_paths([genome, fastq_list])
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)

    # stage 1: split the input fastq files into chunks
    dag = DAG("split_ngs")
    split_work = mkdir(os.path.join(work_dir, "00_data"))
    split_out = mkdir(os.path.join(out_dir, "00_data"))
    splitfp_task, fq_path, r1_name, r2_name = split_ngs_task(
        fastq_list=fastq_list,
        name=name,
        number=number,
        data_type=data_type,
        job_type=job_type,
        work_dir=split_work,
        out_dir=split_out)
    dag.add_task(splitfp_task)
    do_dag(dag, concurrent, refresh)

    # stage 2: index the genome, map each chunk, then merge the sorted bams
    dag = DAG("bwa_mem")
    index_task, bwa_tasks, merge_task, sorted_bam, genome = run_bwa_mem(
        fq_path=fq_path,
        r1_name=r1_name,
        r2_name=r2_name,
        genome=genome,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(index_task)
    dag.add_task(*bwa_tasks)
    dag.add_task(merge_task)
    index_task.set_downstream(*bwa_tasks)
    merge_task.set_upstream(*bwa_tasks)
    do_dag(dag, concurrent, refresh)

    return sorted_bam, genome
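# Usage sketch for bwa_mem (file names and parameter values below are
# hypothetical, not taken from this module). The split and mapping stages
# run as two separate DAGs because the split outputs (fq_path, r1_name,
# r2_name) must exist on disk before the per-chunk bwa tasks can be built:
#
#   sorted_bam, genome = bwa_mem(
#       fastq_list="ngs.fofn", genome="genome.fasta", name="sample",
#       number=10000000, data_type="ngs", thread=4, job_type="local",
#       concurrent=10, refresh=30, work_dir="work", out_dir="out")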
def run_minimap(reads, genome, platform, name, split, thread, job_type,
                concurrent, refresh, work_dir, out_dir):

    option = OrderedDict()
    option["minimap2"] = {
        "version": get_version(SOFTWARE_VERSION["minimap2"]),
        "option": "%s" % SEQUENCER[platform]["minimap2"]
    }

    work_dict = {
        "minimap": "01_minimap",
        "merge": "02_merge"
    }
    for k, v in work_dict.items():
        mkdir(os.path.join(work_dir, v))

    dag = DAG("minimap")
    minimap_tasks, bams = create_minimap_tasks(
        reads=reads,
        genome=genome,
        platform=platform,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["minimap"]),
        out_dir=out_dir,
        split=split)
    merge_task, bam = merge_bam_task(
        bams=bams,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["merge"]),
        out_dir=out_dir)
    dag.add_task(*minimap_tasks)
    dag.add_task(merge_task)
    merge_task.set_upstream(*minimap_tasks)
    do_dag(dag, concurrent, refresh)

    return bam, option
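# SEQUENCER is assumed to map each platform name to per-tool options,
# given how it is indexed above (SEQUENCER[platform]["minimap2"]). A
# minimal sketch of the expected shape; the platform keys and minimap2
# presets here are illustrative, check the package's SEQUENCER constant
# for the real values:
#
#   SEQUENCER = {
#       "nanopore": {"minimap2": "-x map-ont"},
#       "pacbio": {"minimap2": "-x map-pb"},
#   }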
def minimap(r1, r2, genome, name, split, platform, number, thread, job_type,
            concurrent, refresh, work_dir, out_dir):

    genome = check_path(genome)
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    if r2 != "":
        r2 = check_paths(r2)

    options = {
        "software": OrderedDict(),
        "database": OrderedDict()
    }

    data_work = mkdir(os.path.join(work_dir, "00_data"))
    reads = split_data(
        r1=r1,
        r2=r2,
        name=name,
        number=number,
        job_type=job_type,
        work_dir=data_work,
        out_dir=out_dir)

    bam, option = run_minimap(
        reads=reads,
        genome=genome,
        platform=platform,
        name=name,
        split=split,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)
    options["software"] = option

    with open(os.path.join(out_dir, "minimap2.json"), "w") as fh:
        json.dump(options, fh, indent=2)

    return bam
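# minimap() records the software versions and options it used. Based on
# the option dict built in run_minimap, the minimap2.json written above
# should look roughly like this (version string illustrative):
#
#   {
#     "software": {
#       "minimap2": {
#         "version": "2.17-r941",
#         "option": "-x map-ont"
#       }
#     },
#     "database": {}
#   }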
def run_bwa_mem(fq_path, r1_name, r2_name, genome, name, thread, job_type,
                work_dir, out_dir):

    work_dict = {
        "index": "01_index",
        "bwa": "02_bwa",
        "merge": "03_merge"
    }

    index_work = mkdir(os.path.join(work_dir, work_dict["index"]))
    index_out = mkdir(os.path.join(out_dir, work_dict["index"]))
    index_task, genome = bwa_index_task(
        genome=genome,
        name=name,
        job_type=job_type,
        work_dir=index_work,
        out_dir=index_out)

    bwa_work = mkdir(os.path.join(work_dir, work_dict["bwa"]))
    bwa_out = mkdir(os.path.join(out_dir, work_dict["bwa"]))
    bwa_tasks, sort_bams = bwa_mem_tasks(
        fq_path=fq_path,
        r1_name=r1_name,
        r2_name=r2_name,
        genome=genome,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=bwa_work,
        out_dir=bwa_out)

    merge_work = mkdir(os.path.join(work_dir, work_dict["merge"]))
    merge_out = mkdir(os.path.join(out_dir, work_dict["merge"]))
    merge_task, sorted_bam = bwa_merge_bam_task(
        sort_bams=sort_bams,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=merge_work,
        out_dir=merge_out)

    return index_task, bwa_tasks, merge_task, sorted_bam, genome
def run_survey(r1, r2, name, trim, kingdom, kmer_length, sample_depth,
               thread, asm, window, job_type, queue, concurrent, refresh,
               work_dir, out_dir):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    dag = DAG("survey_qc")
    merge_task, qc_task, cont_task, result_task, clean1, clean2, quality, \
        content, gc, stat_qc, poll_png, poll_tsv = ngs_qc_tasks(
            name=name,
            r1=r1,
            r2=r2,
            trim=trim,
            thread=thread,
            job_type=job_type,
            work_dir=work_dir,
            out_dir=out_dir)

    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)

    dag.add_task(merge_task)
    dag.add_task(qc_task)
    qc_task.set_upstream(merge_task)
    dag.add_task(cont_task)
    dag.add_task(result_task)
    dag.add_task(freq_task1)
    freq_task1.set_upstream(qc_task)
    cont_task.set_upstream(qc_task)
    result_task.set_upstream(qc_task)
    do_dag(dag, concurrent, refresh)

    # the kmer_stat table is assumed to contain a "kmer_depth" row;
    # kmer_depth stays undefined if it does not
    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            kmer_depth = int(line[1])
    if sample_depth > kmer_depth:
        LOG.debug(
            "Sequencing data may be insufficient: depth is only %s X" % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task, \
        denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png, \
        gse_png, stat_genome, genome, ngs_list = kmer_denovo_tasks(
            r1=clean1,
            r2=clean2,
            name=name,
            kmer_length=kmer_length,
            proportion=proportion,
            kingdom=kingdom,
            thread=thread,
            job_type=job_type,
            queue=queue,
            work_dir=work_dir,
            out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"
    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genome was not assembled; skipping GC-depth analysis")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(
            genome=genome,
            fastq_list=ngs_list,
            name=name,
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=depth_work,
            out_dir=depth_out)

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc,
               poll_tsv, poll_png, stat_heter, heter_png, scope_txt,
               gse_txt, scope_png, gse_png, stat_genome, gc_depth_png,
               out_dir)

    return (stat_qc, quality, content, gc, poll_png, poll_tsv, stat_heter,
            heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome)
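# Worked example of the subsampling arithmetic above: if the requested
# sample_depth is 50 X but kmerfreq estimates only kmer_depth = 40 X of
# data, sample_depth is clamped to 40 and proportion = 40 / 40 = 1.0
# (use all reads); with kmer_depth = 80 X, proportion = 50 / 80 = 0.625,
# so roughly 62.5% of the reads are sampled downstream.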
def run_survey(r1, r2, name, trim, kingdom, mode, cratio, kmer_length,
               kmer_depth, thread, asm, window, job_type, queue, concurrent,
               refresh, work_dir, out_dir, split=""):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    clean1, clean2, taxid, stat_qc, quality, content, gc, cont_tsv, \
        cont_png = run_ngs_qc(
            r1=r1,
            r2=r2,
            name=name,
            trim=trim,
            kingdom=kingdom,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=os.path.join(work_dir, "01_data"),
            out_dir=os.path.join(out_dir, "01_data"))

    stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png, \
        stat_genome, gc_depth = run_kmer_denovo(
            r1=[clean1],
            r2=[clean2],
            taxid=taxid,
            name=name,
            mode=mode,
            cratio=cratio,
            kmer_length=kmer_length,
            kmer_depth=kmer_depth,
            kingdom=kingdom,
            asm=asm,
            window=window,
            thread=thread,
            job_type=job_type,
            queue=queue,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=work_dir,
            out_dir=out_dir,
            split=split,
            platform="illumina")

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc,
               cont_tsv, cont_png, stat_heter, heter_png, scope_txt,
               gse_txt, scope_png, gse_png, stat_genome, gc_depth, out_dir)
def run_filter_contamination(r1, r2, name, kmer_length, kmer_depth, taxid,
                             kingdom, thread, job_type, concurrent, refresh,
                             work_dir, out_dir, split, mode="fast",
                             cratio=10):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)
    taxid = check_path(taxid)

    options = {
        "software": OrderedDict(),
        "database": OrderedDict()
    }

    option, r1, r2 = choose_data(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)
    options["software"].update(option)

    if mode != "fast":
        work_dict = {
            "data": "00_data",
            "ref": "01_ref",
            "ump": "02_ump"
        }
        for k, v in work_dict.items():
            mkdir(os.path.join(work_dir, v))

        reads = split_data(
            r1=r1,
            r2=r2,
            name=name,
            number=2000000,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["data"]),
            concurrent=concurrent,
            refresh=refresh,
            out_dir=out_dir,
            platform="illumina")

        dag = DAG("unmap_data")
        ref_task, ref = obtain_contamination_task(
            taxid=taxid,
            name=name,
            kingdom=kingdom,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["ref"]),
            out_dir=out_dir,
            mode=mode,
            cratio=cratio)
        dag.add_task(ref_task)
        unmap_tasks, reads, option = create_unmap_tasks(
            name=name,
            reference=ref,
            reads=reads,
            thread=thread,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["ump"]),
            out_dir=out_dir,
            split=split)
        dag.add_task(*unmap_tasks)
        ref_task.set_downstream(*unmap_tasks)
        do_dag(dag, concurrent, refresh)
        options["software"].update(option)
        reads = [reads]
    else:
        reads = [r1, r2]

    return reads, options
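# run_filter_contamination branches on mode (behaviour read off the code
# above, not a spec): with mode="fast" it only depth-subsamples via
# choose_data and returns [r1, r2] directly; with any other mode it also
# fetches contaminant reference sequences for `taxid` (sampled at
# `cratio`), maps the split reads against them, and returns only the
# unmapped reads.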
def run_kmer_denovo(r1, r2, name, kingdom, kmer_length, sample_depth,
                    thread, asm, window, job_type, queue, concurrent,
                    refresh, work_dir, out_dir):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    # choose cat or zcat depending on whether the inputs are gzipped
    if r1[0].endswith(".gz") or r2[0].endswith(".gz"):
        tools = "zcat"
    else:
        tools = "cat"

    dag_data = DAG("survey_data")
    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    cat_data_task, clean1, clean2 = merge_raw_data_task(
        name=name,
        r1=" ".join(r1),
        r2=" ".join(r2),
        tools=tools,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=17,
        thread=thread,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)
    dag_data.add_task(cat_data_task)
    dag_data.add_task(freq_task1)
    freq_task1.set_upstream(cat_data_task)
    do_dag(dag_data, concurrent, refresh)

    # the kmer_stat table is assumed to contain a "kmer_depth" row
    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            kmer_depth = int(line[1])
    if sample_depth > kmer_depth:
        LOG.debug(
            "Sequencing data may be insufficient: depth is only %s X" % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task, \
        denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png, \
        gse_png, stat_genome, genome, ngs_list = kmer_denovo_tasks(
            r1=clean1,
            r2=clean2,
            name=name,
            kmer_length=kmer_length,
            proportion=proportion,
            kingdom=kingdom,
            thread=thread,
            job_type=job_type,
            queue=queue,
            work_dir=work_dir,
            out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"
    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genome was not assembled; skipping GC-depth analysis")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(
            genome=genome,
            fastq_list=ngs_list,
            name=name,
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=depth_work,
            out_dir=depth_out)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome)
def kmer_denovo_tasks(r1, r2, name, kmer_length, proportion, kingdom,
                      thread, job_type, queue, work_dir, out_dir):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    clean1 = check_paths(r1)
    clean2 = check_paths(r2)

    work_dict = {
        "choose": "choose_data",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }

    choose_work = mkdir(os.path.join(work_dir, work_dict["choose"]))
    choose_task, choose_r1, choose_r2, choose_r = sample_fastq_task(
        r1=clean1,
        r2=clean2,
        proportion=proportion,
        name=name,
        job_type=job_type,
        work_dir=choose_work)

    heter_work = mkdir(os.path.join(work_dir, work_dict["kmerfreq"]))
    heter_out = mkdir(os.path.join(out_dir, work_dict["kmerfreq"]))
    freq_task, histo, kmer_depth, estimate = kmerfreq_task(
        r1=choose_r1,
        r2=choose_r2,
        name=name,
        kmer_length=17,
        thread=thread,
        job_type=job_type,
        work_dir=heter_work,
        out_dir=heter_out)
    heter_task, stat_heter, heter_png = get_heterozygosity_task(
        histo=histo,
        estimate=estimate,
        kingdom=kingdom,
        name=name,
        job_type=job_type,
        work_dir=heter_work,
        out_dir=heter_out)

    scope_work = mkdir(os.path.join(work_dir, work_dict["gse_scope"]))
    scope_out = mkdir(os.path.join(out_dir, work_dict["gse_scope"]))
    jellyfish_task, histogram = get_jellyfish_task(
        fastq=choose_r,
        name=name,
        depth=40 * 100,
        thread=thread,
        job_type=job_type,
        work_dir=scope_work,
        out_dir=scope_out)
    gse_scope_task, scope_txt, gse_txt, scope_png, gse_png = get_gse_scope_task(
        histogram=histogram,
        name=name,
        kmer_length=kmer_length,
        job_type=job_type,
        work_dir=scope_work,
        out_dir=scope_out)

    denovo_work = mkdir(os.path.join(work_dir, work_dict["denovo"]))
    denovo_out = mkdir(os.path.join(out_dir, work_dict["denovo"]))
    denovo_task, genome, stat_genome, ngs_list = soapdenovo_task(
        r1=clean1,
        r2=clean2,
        name=name,
        thread=thread * 2,
        queue=queue,
        job_type=job_type,
        work_dir=denovo_work,
        out_dir=denovo_out)

    return (choose_task, freq_task, heter_task, jellyfish_task,
            gse_scope_task, denovo_task, stat_heter, heter_png, scope_txt,
            gse_txt, scope_png, gse_png, stat_genome, genome, ngs_list)
def run_kmer_denovo(r1, r2, taxid, name, mode, cratio, kmer_length,
                    kmer_depth, kingdom, asm, window, thread, job_type,
                    queue, concurrent, refresh, work_dir, out_dir, split,
                    platform="illumina"):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    work_dict = {
        "contamination": "01_contamination",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }
    for k, v in work_dict.items():
        mkdir(os.path.join(work_dir, v))
        if k == "contamination":
            continue
        mkdir(os.path.join(out_dir, v))

    reads, options = run_filter_contamination(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        taxid=taxid,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=os.path.join(work_dir, work_dict["contamination"]),
        out_dir=out_dir,
        mode=mode,
        cratio=cratio,
        split=split)

    dag = DAG("kmer_denovo")
    jellyfish_task, gse_scope_task, scope_txt, gse_txt, scope_png, \
        gse_png, option = gse_scope(
            reads=" ".join(reads),
            name=name,
            kmer_length=kmer_length,
            thread=thread,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["gse_scope"]),
            out_dir=os.path.join(out_dir, work_dict["gse_scope"]),
            mode=mode)
    options["software"].update(option)
    dag.add_task(jellyfish_task)
    dag.add_task(gse_scope_task)

    kmerfreq_task, heter_task, stat_heter, heter_png, option = kmerfreq(
        reads=" ".join(reads),
        name=name,
        kingdom=kingdom,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["kmerfreq"]),
        out_dir=os.path.join(out_dir, work_dict["kmerfreq"]))
    options["software"].update(option)
    dag.add_task(kmerfreq_task)
    dag.add_task(heter_task)

    denovo_task, genome, stat_genome, option = create_soapdenovo_task(
        r1=" ".join(r1),
        r2=" ".join(r2),
        name=name,
        thread=thread,
        queue=queue,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["denovo"]),
        out_dir=os.path.join(out_dir, work_dict["denovo"]))
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
    do_dag(dag, concurrent, refresh)

    if asm == "true":
        gc_depth = run_gc_depth(
            genome=genome,
            r1=" ".join(r1),
            r2=" ".join(r2),
            name=name,
            platform=platform,
            split="no_split",
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=os.path.join(work_dir, work_dict["gc_depth"]),
            out_dir=os.path.join(out_dir, work_dict["gc_depth"]))
    else:
        gc_depth = heter_png

    with open(os.path.join(out_dir, "kmer_denovo.json"), "w") as fh:
        json.dump(options, fh, indent=2)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome, gc_depth)
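# Usage sketch for run_kmer_denovo (hypothetical values; `taxid` is the
# contamination-screen output of run_ngs_qc, as wired up in run_survey):
#
#   results = run_kmer_denovo(
#       r1=["clean.r1.fastq"], r2=["clean.r2.fastq"], taxid="taxid.txt",
#       name="sample", mode="fast", cratio=10, kmer_length=17,
#       kmer_depth=50, kingdom="fungi", asm="true", window=5000,
#       thread=4, job_type="local", queue="", concurrent=10,
#       refresh=30, work_dir="work", out_dir="out", split="no_split")
#
# Note that `asm` and the genome / stat_genome placeholders are the
# strings "true" / "false" rather than booleans, matching the CLI-style
# flags used throughout this module.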