def run_gc_depth(genome, fastq_list, name, window, thread, job_type,
                 concurrent, refresh, work_dir, out_dir):

    genome, fastq_list = check_paths([genome, fastq_list])

    sort_bam, genome = bwa_mem(
        fastq_list=fastq_list,
        genome=genome,
        name=name,
        number=5000000,
        data_type='',
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=work_dir)
    sort_bam = check_paths(sort_bam)

    dag = DAG("gc_depth")
    gc_depth_task, gc_depth_png = stat_gc_depth_task(
        genome=genome,
        bam=sort_bam,
        name=name,
        window=window,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(gc_depth_task)
    do_dag(dag, concurrent, refresh)

    return gc_depth_png


def run_ncovann(genomes, refgff, concurrent, refresh, job_type, work_dir, out_dir):

    genomes = check_paths(genomes)
    refgff = check_path(refgff)
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)

    dag = DAG("ncovann")
    for genome in genomes:
        name = os.path.basename(genome)
        if '--' in name:
            name = name.split('--')[0].split('.')[0]
        else:
            name = name.split('.')[0]
        ann_task = create_ncovann_task(
            genome=genome,
            name=name,
            refgff=refgff,
            job_type=job_type,
            work_dir=work_dir,
            out_dir=out_dir)
        dag.add_task(ann_task)
    do_dag(dag, concurrent, refresh)

    return 0


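# Sample names are derived from the genome basename as implemented above
# (hypothetical filenames for illustration):
#   "sampleA--barcode01.consensus.fasta" -> "sampleA"
#   "sampleB.fasta"                      -> "sampleB"

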
def choose_data(r1, r2, name, kmer_length, kmer_depth, thread, job_type,
                concurrent, refresh, work_dir, out_dir):

    # First estimate the actual k-mer depth of the input reads, then
    # select a read subset matching the requested kmer_depth.
    r1, r2, kmer_stat, option = stat_kmer_depth(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)

    dag = DAG("choose_data")
    data_task, r1, r2 = choose_data_task(
        r1=r1,
        r2=r2,
        name=name,
        kmer_stat=kmer_stat,
        kmer_depth=kmer_depth,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(data_task)
    do_dag(dag, concurrent, refresh)

    return option, r1, r2


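# Usage sketch (hypothetical paths and depth; scheduler settings follow the
# conventions used elsewhere in this module):
#
#     option, r1, r2 = choose_data(
#         r1=["sample.r1.fastq"], r2=["sample.r2.fastq"], name="sample",
#         kmer_length=17, kmer_depth=100, thread=4, job_type="local",
#         concurrent=10, refresh=30, work_dir="work", out_dir="out")

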
def run_ncovsnp(reads, reffa, refgb, thread, concurrent, refresh, job_type,
                work_dir, out_dir, clean=""):

    reads = check_paths(reads)
    reffa = check_path(reffa)
    refgb = check_path(refgb)
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    options = {"software": OrderedDict(), "database": OrderedDict()}

    dag = DAG("ncovsnp")
    option = OrderedDict()
    depths = os.path.join(work_dir, "*/*.depth.xls")
    snps = os.path.join(work_dir, "*/*.snps.gff")
    stat_map_task = stat_mapcover_snp(
        reads=' '.join(reads),
        clean=clean,
        depths=depths,
        snps=snps,
        job_type="local",
        work_dir=work_dir,
        out_dir=out_dir)

    genomes = []
    for read in reads:
        name = os.path.basename(read)
        if '--' in name:
            name = name.split('--')[0].split('.')[0]
        else:
            name = name.split('.')[0]
        name_work = mkdir(os.path.join(work_dir, name))
        snp_task, snippy_task, consensus, option = create_ncovsnp_tasks(
            read=read,
            name=name,
            reffa=reffa,
            refgb=refgb,
            thread=thread,
            job_type=job_type,
            work_dir=name_work,
            out_dir=out_dir)
        genomes.append(consensus)
        dag.add_task(snp_task)
        dag.add_task(snippy_task)
        stat_map_task.set_upstream(snp_task)
        stat_map_task.set_upstream(snippy_task)
    options["software"] = option
    dag.add_task(stat_map_task)
    do_dag(dag, concurrent, refresh)

    return genomes, options


def split_data(r1, r2, name, number, job_type, concurrent, refresh,
               work_dir, out_dir, platform="illumina"):

    if platform in ["PromethION", "GridION", "RSII", "Sequel"]:
        read = "%s.part_*.fast*" % name
        r2 = ""
    elif platform in ["illumina", "mgi"]:
        read = "%s.r1.part_*.fastq" % name
    else:
        raise Exception("Unsupported sequencing platform: %r." % platform)

    dag = DAG("split_data")
    task = Task(
        id="split_data",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{script}/splitfp.py -r1 {r1} -r2 {r2} -o {name} -n {number}
#cp {name}.* {out_dir}
""".format(
            script=SCRIPTS,
            r1=r1,
            r2=r2,
            name=name,
            number=number,
            out_dir=out_dir
        )
    )
    dag.add_task(task)
    do_dag(dag, concurrent, refresh)

    temp = read_files(work_dir, read)
    reads = []
    if platform in ["illumina", "mgi"]:
        # Pair each r1 chunk with its matching r2 chunk.
        for i in temp:
            j = i.replace(".r1.part_", ".r2.part_")
            reads.append("%s %s" % (i, j))
    else:
        reads = temp

    return reads


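# Usage sketch (hypothetical paths; chunk naming comes from the globs above,
# i.e. paired chunks "<name>.r1.part_*.fastq" / "<name>.r2.part_*.fastq"):
#
#     reads = split_data(r1="sample.r1.fastq", r2="sample.r2.fastq",
#                        name="sample", number=2000000, job_type="local",
#                        concurrent=10, refresh=30,
#                        work_dir="work/00_data", out_dir="out/00_data")
#     # For illumina/mgi each element is "chunk_r1 chunk_r2"; for the
#     # long-read platforms it is a single chunk path.

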
def bwa_mem(fastq_list, genome, name, number, data_type, thread, job_type,
            concurrent, refresh, work_dir, out_dir):

    genome, fastq_list = check_paths([genome, fastq_list])
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)

    dag = DAG("split_ngs")
    split_work = mkdir(os.path.join(work_dir, "00_data"))
    split_out = mkdir(os.path.join(out_dir, "00_data"))
    splitfp_task, fq_path, r1_name, r2_name = split_ngs_task(
        fastq_list=fastq_list,
        name=name,
        number=number,
        data_type=data_type,
        job_type=job_type,
        work_dir=split_work,
        out_dir=split_out)
    dag.add_task(splitfp_task)
    do_dag(dag, concurrent, refresh)

    dag = DAG("bwa_mem")
    index_task, bwa_tasks, merge_task, sorted_bam, genome = run_bwa_mem(
        fq_path=fq_path,
        r1_name=r1_name,
        r2_name=r2_name,
        genome=genome,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(index_task)
    dag.add_task(*bwa_tasks)
    dag.add_task(merge_task)
    index_task.set_downstream(*bwa_tasks)
    merge_task.set_upstream(*bwa_tasks)
    do_dag(dag, concurrent, refresh)

    return sorted_bam, genome


def stat_kmer_depth(r1, r2, name, kmer_length, thread, job_type,
                    concurrent, refresh, work_dir, out_dir):

    dag = DAG("survey_data")
    data_task, r1, r2 = merge_data_task(
        name=name,
        r1=r1,
        r2=r2,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    kmerfreq_task, kmer_stat, option = create_kmerfreq_task(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(data_task)
    dag.add_task(kmerfreq_task)
    kmerfreq_task.set_upstream(data_task)
    do_dag(dag, concurrent, refresh)

    return r1, r2, kmer_stat, option


def split_data(r1, r2, name, number, job_type, work_dir, out_dir):

    if len(r1) != len(r2) and len(r2) <= 1:
        # Single-end input: only r1 chunks are produced.
        read = "%s.part_*.fast*" % name
        r2 = ""
    elif len(r1) == len(r2):
        read = "%s.r1.part_*.fastq" % name
    else:
        raise Exception("The numbers of r1 and r2 files do not match.")

    dag = DAG("split_data")
    task = Task(
        id="split_data",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{script}/splitfp.py -r1 {r1} -r2 {r2} -o {name} -n {number}
#cp {name}.* {out_dir}
""".format(
            script=SCRIPTS,
            r1=r1,
            r2=r2,
            name=name,
            number=number,
            out_dir=out_dir
        )
    )
    dag.add_task(task)
    do_dag(dag, 8, 10)

    temp = read_files(work_dir, read)
    reads = []
    if len(r1) == len(r2):
        for i in temp:
            j = i.replace(".r1.part_", ".r2.part_")
            reads.append("%s %s" % (i, j))
    else:
        reads = temp

    return reads


def run_gc_depth(genome, r1, r2, name, platform, split, window, thread,
                 job_type, concurrent, refresh, work_dir, out_dir):

    genome = check_path(genome)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    sort_bam = minimap(
        r1=r1,
        r2=r2,
        genome=genome,
        name=name,
        split=split,
        platform=platform,
        number=5000000,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)
    sort_bam = check_paths(sort_bam)

    dag = DAG("gc_depth")
    gc_depth_task, gc_depth_png = stat_gc_depth_task(
        genome=genome,
        bam=sort_bam,
        name=name,
        window=window,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(gc_depth_task)
    do_dag(dag, concurrent, refresh)

    return gc_depth_png


def run_ncovqc(reads, reference, thread, job_type, concurrent, refresh,
               work_dir, out_dir):

    reference = check_path(reference)
    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    reads = check_paths(reads)

    names = []
    for i in reads:
        name = os.path.basename(i)
        if '--' in name:
            name = name.split('--')[1].split('.bam')[0]
        else:
            name = name.split('.')[0]
        names.append(name)

    options = {"software": OrderedDict(), "database": OrderedDict()}

    dag = DAG("ncovqc")
    raw_task, raw_stat = stat_reads_task(
        reads=" ".join(reads),
        name="raw",
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    map_tasks, clean_reads, option = map_ref_tasks(
        reads=reads,
        names=names,
        reference=reference,
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    options["software"] = option
    clean_task, clean_stat = stat_reads_task(
        reads=clean_reads,
        name="clean",
        thread=thread,
        job_type=job_type,
        work_dir=work_dir,
        out_dir=out_dir)
    dag.add_task(raw_task)
    dag.add_task(*map_tasks)
    dag.add_task(clean_task)
    clean_task.set_upstream(*map_tasks)
    do_dag(dag, concurrent, refresh)

    return clean_reads, clean_stat, options


def run_minimap(reads, genome, platform, name, split, thread, job_type,
                concurrent, refresh, work_dir, out_dir):

    option = OrderedDict()
    option["minimap2"] = {
        "version": get_version(SOFTWARE_VERSION["minimap2"]),
        "option": "%s" % SEQUENCER[platform]["minimap2"]
    }

    work_dict = {
        "minimap": "01_minimap",
        "merge": "02_merge"
    }
    for k, v in work_dict.items():
        mkdir(os.path.join(work_dir, v))

    dag = DAG("minimap")
    minimap_tasks, bams = create_minimap_tasks(
        reads=reads,
        genome=genome,
        platform=platform,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["minimap"]),
        out_dir=out_dir,
        split=split)
    merge_task, bam = merge_bam_task(
        bams=bams,
        name=name,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["merge"]),
        out_dir=out_dir)
    dag.add_task(*minimap_tasks)
    dag.add_task(merge_task)
    merge_task.set_upstream(*minimap_tasks)
    do_dag(dag, concurrent, refresh)

    return bam, option


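# Usage sketch (hypothetical inputs; `platform` must be a key of SEQUENCER
# with a "minimap2" preset, e.g. one of the long-read platforms handled by
# split_data above, and "no_split" mirrors the value used by run_kmer_denovo):
#
#     bam, option = run_minimap(reads=reads, genome="assembly.fasta",
#                               platform="PromethION", name="sample",
#                               split="no_split", thread=8, job_type="sge",
#                               concurrent=10, refresh=30,
#                               work_dir="work", out_dir="out")

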
def run_survey(r1, r2, name, trim, kingdom, kmer_length, sample_depth, thread,
               asm, window, job_type, queue, concurrent, refresh, work_dir,
               out_dir):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    dag = DAG("survey_qc")
    merge_task, qc_task, cont_task, result_task, clean1, clean2, quality, \
        content, gc, stat_qc, poll_png, poll_tsv = ngs_qc_tasks(
            name=name,
            r1=r1,
            r2=r2,
            trim=trim,
            thread=thread,
            job_type=job_type,
            work_dir=work_dir,
            out_dir=out_dir)
    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)
    dag.add_task(merge_task)
    dag.add_task(qc_task)
    qc_task.set_upstream(merge_task)
    dag.add_task(cont_task)
    dag.add_task(result_task)
    dag.add_task(freq_task1)
    freq_task1.set_upstream(qc_task)
    cont_task.set_upstream(qc_task)
    result_task.set_upstream(qc_task)
    do_dag(dag, concurrent, refresh)

    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            kmer_depth = int(line[1])
    # Clamp the requested sampling depth to the depth actually available.
    if sample_depth > kmer_depth:
        LOG.debug(
            "Sequencing data may be insufficient: k-mer depth is only %s X." % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    (choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task,
     denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png,
     gse_png, stat_genome, genome, ngs_list) = kmer_denovo_tasks(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        proportion=proportion,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        queue=queue,
        work_dir=work_dir,
        out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"
    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genome was not assembled; skipping GC-depth analysis.")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(
            genome=genome,
            fastq_list=ngs_list,
            name=name,
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=depth_work,
            out_dir=depth_out)

    run_report(name, asm, kmer_length, stat_qc, quality, content, gc,
               poll_tsv, poll_png, stat_heter, heter_png, scope_txt, gse_txt,
               scope_png, gse_png, stat_genome, gc_depth_png, out_dir)

    return (stat_qc, quality, content, gc, poll_png, poll_tsv, stat_heter,
            heter_png, scope_txt, gse_txt, scope_png, gse_png, stat_genome)


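# Sampling-proportion example (illustrative values): if kmer_stat reports
# kmer_depth = 80 and the requested sample_depth is 50, then
# proportion = 50 / 80 = 0.625, so downstream tasks use ~62.5 % of the data;
# a requested depth above 80 is clamped to 80 (proportion = 1.0).

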
def run_filter_contamination(r1, r2, name, kmer_length, kmer_depth, taxid,
                             kingdom, thread, job_type, concurrent, refresh,
                             work_dir, out_dir, split, mode="fast", cratio=10):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)
    taxid = check_path(taxid)
    options = {
        "software": OrderedDict(),
        "database": OrderedDict()
    }

    option, r1, r2 = choose_data(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=work_dir,
        out_dir=out_dir)
    options["software"].update(option)

    if mode != "fast":
        work_dict = {
            "data": "00_data",
            "ref": "01_ref",
            "ump": "02_ump"
        }
        for k, v in work_dict.items():
            mkdir(os.path.join(work_dir, v))
        reads = split_data(
            r1=r1,
            r2=r2,
            name=name,
            number=2000000,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["data"]),
            concurrent=concurrent,
            refresh=refresh,
            out_dir=out_dir,
            platform="illumina")

        dag = DAG("unmap_data")
        ref_task, ref = obtain_contamination_task(
            taxid=taxid,
            name=name,
            kingdom=kingdom,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["ref"]),
            out_dir=out_dir,
            mode=mode,
            cratio=cratio)
        dag.add_task(ref_task)
        unmap_tasks, reads, option = create_unmap_tasks(
            name=name,
            reference=ref,
            reads=reads,
            thread=thread,
            job_type=job_type,
            work_dir=os.path.join(work_dir, work_dict["ump"]),
            out_dir=out_dir,
            split=split)
        dag.add_task(*unmap_tasks)
        ref_task.set_downstream(*unmap_tasks)
        do_dag(dag, concurrent, refresh)
        options["software"].update(option)
        reads = [reads]
    else:
        reads = [r1, r2]

    return reads, options


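# Mode behaviour, as implemented above: with mode="fast" the depth-based read
# selection is the only filter and [r1, r2] is returned unchanged; any other
# mode additionally splits reads into 2,000,000-read chunks, builds a
# contaminant reference from `taxid`, and collects the reads that do not map
# to it (the create_unmap_tasks step).

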
def run_kmer_denovo(r1, r2, name, kingdom, kmer_length, sample_depth, thread,
                    asm, window, job_type, queue, concurrent, refresh,
                    work_dir, out_dir):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    if r1[0].endswith(".gz") or r2[0].endswith(".gz"):
        tools = "zcat"
    else:
        tools = "cat"

    dag_data = DAG("survey_data")
    data_work = mkdir(os.path.join(work_dir, "choose_data"))
    cat_data_task, clean1, clean2 = merge_raw_data_task(
        name=name,
        r1=" ".join(r1),
        r2=" ".join(r2),
        tools=tools,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)
    freq_task1, histo1, kmer_stat, estimate1 = kmerfreq_task(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=17,
        thread=thread,
        job_type=job_type,
        work_dir=data_work,
        out_dir=data_work)
    dag_data.add_task(cat_data_task)
    dag_data.add_task(freq_task1)
    freq_task1.set_upstream(cat_data_task)
    do_dag(dag_data, concurrent, refresh)

    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            kmer_depth = int(line[1])
    if sample_depth > kmer_depth:
        LOG.debug(
            "Sequencing data may be insufficient: k-mer depth is only %s X." % kmer_depth)
        sample_depth = kmer_depth
    proportion = sample_depth * 1.0 / kmer_depth

    dag = DAG("survey")
    (choose_task, freq_task, heter_task, jellyfish_task, gse_scope_task,
     denovo_task, stat_heter, heter_png, scope_txt, gse_txt, scope_png,
     gse_png, stat_genome, genome, ngs_list) = kmer_denovo_tasks(
        r1=clean1,
        r2=clean2,
        name=name,
        kmer_length=kmer_length,
        proportion=proportion,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        queue=queue,
        work_dir=work_dir,
        out_dir=out_dir)
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
        ngs_list = "false"
    dag.add_task(choose_task)
    dag.add_task(freq_task)
    dag.add_task(heter_task)
    freq_task.set_upstream(choose_task)
    dag.add_task(jellyfish_task)
    jellyfish_task.set_upstream(choose_task)
    dag.add_task(gse_scope_task)
    heter_task.set_upstream(freq_task)
    gse_scope_task.set_upstream(jellyfish_task)
    do_dag(dag, concurrent, refresh)

    if ngs_list == "false":
        print("Genome was not assembled; skipping GC-depth analysis.")
        gc_depth_png = heter_png
    else:
        depth_work = mkdir(os.path.join(work_dir, "05_GC-depth"))
        depth_out = mkdir(os.path.join(out_dir, "05_GC-depth"))
        gc_depth_png = run_gc_depth(
            genome=genome,
            fastq_list=ngs_list,
            name=name,
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=depth_work,
            out_dir=depth_out)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome)


def run_kmer_denovo(r1, r2, taxid, name, mode, cratio, kmer_length,
                    kmer_depth, kingdom, asm, window, thread, job_type,
                    queue, concurrent, refresh, work_dir, out_dir, split,
                    platform="illumina"):

    work_dir = mkdir(work_dir)
    out_dir = mkdir(out_dir)
    r1 = check_paths(r1)
    r2 = check_paths(r2)

    work_dict = {
        "contamination": "01_contamination",
        "gse_scope": "02_gse_scope",
        "kmerfreq": "03_Kmerfreq",
        "denovo": "04_Soapdenovo",
        "gc_depth": "05_GC-depth"
    }
    for k, v in work_dict.items():
        mkdir(os.path.join(work_dir, v))
        if k == "contamination":
            continue
        mkdir(os.path.join(out_dir, v))

    reads, options = run_filter_contamination(
        r1=r1,
        r2=r2,
        name=name,
        kmer_length=kmer_length,
        kmer_depth=kmer_depth,
        taxid=taxid,
        kingdom=kingdom,
        thread=thread,
        job_type=job_type,
        concurrent=concurrent,
        refresh=refresh,
        work_dir=os.path.join(work_dir, work_dict["contamination"]),
        out_dir=out_dir,
        mode=mode,
        cratio=cratio,
        split=split)

    dag = DAG("kmer_denovo")
    jellyfish_task, gse_scope_task, scope_txt, gse_txt, scope_png, gse_png, option = gse_scope(
        reads=" ".join(reads),
        name=name,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["gse_scope"]),
        out_dir=os.path.join(out_dir, work_dict["gse_scope"]),
        mode=mode)
    options["software"].update(option)
    dag.add_task(jellyfish_task)
    dag.add_task(gse_scope_task)

    kmerfreq_task, heter_task, stat_heter, heter_png, option = kmerfreq(
        reads=" ".join(reads),
        name=name,
        kingdom=kingdom,
        kmer_length=kmer_length,
        thread=thread,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["kmerfreq"]),
        out_dir=os.path.join(out_dir, work_dict["kmerfreq"]))
    options["software"].update(option)
    dag.add_task(kmerfreq_task)
    dag.add_task(heter_task)

    denovo_task, genome, stat_genome, option = create_soapdenovo_task(
        r1=" ".join(r1),
        r2=" ".join(r2),
        name=name,
        thread=thread,
        queue=queue,
        job_type=job_type,
        work_dir=os.path.join(work_dir, work_dict["denovo"]),
        out_dir=os.path.join(out_dir, work_dict["denovo"]))
    if asm == "true":
        dag.add_task(denovo_task)
    else:
        genome = "false"
        stat_genome = "false"
    do_dag(dag, concurrent, refresh)

    if asm == "true":
        gc_depth = run_gc_depth(
            genome=genome,
            r1=" ".join(r1),
            r2=" ".join(r2),
            name=name,
            platform=platform,
            split="no_split",
            window=window,
            thread=thread,
            job_type=job_type,
            concurrent=concurrent,
            refresh=refresh,
            work_dir=os.path.join(work_dir, work_dict["gc_depth"]),
            out_dir=os.path.join(out_dir, work_dict["gc_depth"]))
    else:
        gc_depth = heter_png

    with open(os.path.join(out_dir, "kmer_denovo.json"), "w") as fh:
        json.dump(options, fh, indent=2)

    return (stat_heter, heter_png, scope_txt, gse_txt, scope_png, gse_png,
            stat_genome, gc_depth)
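

# The collected software/database options are written to kmer_denovo.json
# (see json.dump above) and can be re-read later for reporting, e.g.:
#
#     with open(os.path.join(out_dir, "kmer_denovo.json")) as fh:
#         options = json.load(fh)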