def get_gse_scope_task(histogram, name, kmer_length, job_type, work_dir, out_dir):
    """Run GenomeScope and findGSE on a k-mer histogram to estimate genome size.

    Returns the task and the four result paths under out_dir:
    genomescope txt, findgse txt, genomescope png, findgse png.
    """
    shell = """
{rscript} {script}/genomescope.R {histogram} {kmer_length} 150 {work_dir} -1 1
cp {work_dir}/plot.png {out_dir}/{name}.genomescope.png
cp {work_dir}/summary.txt {out_dir}/{name}.genomescope.txt
{rscript} {script}/findGSE.R {histogram} {kmer_length} {work_dir}
mv {work_dir}/v1.94.est.{name}.histogram.txt.sizek{kmer_length}.curvefitted.pdf {work_dir}/{name}.findgse.pdf
cp {work_dir}/v1.94.est.{name}.histogram.txt.genome.size.estimated.k{kmer_length}to{kmer_length}.fitted.txt {out_dir}/{name}.findgse.txt
#convert -density 300 -quality 300 {work_dir}/{name}.findgse.pdf {work_dir}/{name}.findgse.png
{gs} -dQUIET -dNOSAFER -r300 -dBATCH -sDEVICE=pngalpha -dNOPAUSE -dNOPROMPT -sOutputFile={out_dir}/{name}.findgse.png {work_dir}/{name}.findgse.pdf
#cp {work_dir}/{name}.findgse-0.png {out_dir}/{name}.findgse.png
rm -rf {work_dir}/round*
""".format(rscript=RSCRIPT, script=SCRIPTS, gs=GHOSTSCRIPT, name=name,
           kmer_length=kmer_length, histogram=histogram,
           work_dir=work_dir, out_dir=out_dir)
    task = Task(
        id="scope_and_gse",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 2",
        script=shell,
    )
    # Result files are copied into out_dir by the script above.
    results = [os.path.join(out_dir, "%s.%s" % (name, ext))
               for ext in ("genomescope.txt", "findgse.txt",
                           "genomescope.png", "findgse.png")]
    return task, results[0], results[1], results[2], results[3]
def choose_data_task(r1, r2, name, kmer_stat, kmer_depth, job_type, work_dir, out_dir):
    """Subsample paired reads down to roughly `kmer_depth` coverage.

    Reads the sequencing depth from `kmer_stat` (tsv with a "kmer_depth" row),
    then either symlinks the input files (already at or below the target depth)
    or subsamples them with seqkit.

    Fix: the original emitted `if [ {proportion} -ge 1] ...` in shell, which is
    broken twice over — the missing space before `]` is a syntax error and
    `-ge` only compares integers while `proportion` is a float.  The branch is
    now decided in Python and only the needed commands are emitted.
    """
    sdepth = None
    for line in read_tsv(kmer_stat):
        if line[0] == "kmer_depth":
            sdepth = int(line[1])
    if sdepth is None:
        # Previously this fell through to a NameError; fail with a clear message.
        raise ValueError("no 'kmer_depth' row found in %s" % kmer_stat)

    proportion = kmer_depth * 1.0 / sdepth
    if kmer_depth > sdepth:
        LOG.debug('The amount of sequencing data may be insufficient. Sequencing depth is only %s X' % sdepth)
        proportion = 1

    if proportion >= 1:
        # Enough (or too little) data: just link the originals.
        run = """
ln -s {r1} {name}_choose.r1.fastq
ln -s {r2} {name}_choose.r2.fastq
""".format(r1=r1, r2=r2, name=name)
    else:
        run = """
{seqkit} sample -p {proportion} -2 -o {name}_choose.r1.fastq {r1}
{seqkit} sample -p {proportion} -2 -o {name}_choose.r2.fastq {r2}
""".format(seqkit=SEQKIT, proportion=proportion, r1=r1, r2=r2, name=name)

    task = Task(
        id="choose_fastq",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 2",
        script=run,
    )
    return (task,
            os.path.join(work_dir, "%s_choose.r1.fastq" % name),
            os.path.join(work_dir, "%s_choose.r2.fastq" % name))
def kmerfreq_task(r1, r2, name, kmer_length, thread, job_type, work_dir, out_dir):
    """Run kmerfreq to compute the k-mer frequency distribution of the reads."""
    # kmerfreq supports k up to 17; clamp larger requests.
    kmer_length = min(kmer_length, 17)
    shell = """
export PATH={python}:$PATH
ls {r1} >{work_dir}/{name}.data
ls {r2} >>{work_dir}/{name}.data
{kmerfreq} -k {kmer_length} -t {thread} -p {work_dir}/{name} -q 33 -m 0 {work_dir}/{name}.data > {work_dir}/{name}.kmer.count
cp {work_dir}/{name}.freq.stat {out_dir}/{name}.kmerfreq.stat
python {script}/kmerfreq_stat.py {work_dir}/{name}.freq.stat >{work_dir}/{name}.kmer.stat
""".format(script=SCRIPTS,
           python=PYTHON_BIN,
           kmerfreq=KMERFREQ,
           kmer_length=kmer_length,
           name=name,
           r1=r1,  # file list for read 1
           r2=r2,
           thread=thread,
           work_dir=work_dir,
           out_dir=out_dir)
    task = Task(
        id="kmerfreq",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script=shell,
    )
    # NOTE(review): %s.genome_estimate is returned but not visibly produced by
    # the script above — presumably a kmerfreq side output; confirm.
    return (task,
            os.path.join(work_dir, "%s.freq.stat" % name),
            os.path.join(work_dir, "%s.kmer.stat" % name),
            os.path.join(work_dir, "%s.genome_estimate" % name))
def stat_mapcover_snp(reads, clean, depths, snps, job_type, work_dir, out_dir):
    """Build a task that summarises read cleaning, map coverage and SNP stats.

    Reuses an existing clean-reads table when `clean` points at a file,
    otherwise regenerates it from `reads`.
    """
    shell = """
export PATH={python}:$PATH
if [ -f "{clean}" ];then
cp {clean} clean.stat_reads.tsv
else
python {scripts}/stat_barcode.py --input {reads} --out clean.stat_reads.tsv >data.js
fi
python {scripts}/stat_map_coverage.py --input {depths} --clean clean.stat_reads.tsv --out stat_map_coverage.tsv
python {scripts}/stat_snp_gff.py --input {snps} >stat.snp.tsv
cp stat_map_coverage.tsv stat.snp.tsv {out_dir}
""".format(python=PYTHON_BIN, scripts=SCRIPTS, clean=clean, reads=reads,
           depths=depths, snps=snps, out_dir=out_dir)
    return Task(
        id="stat_cover_snp",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script=shell,
    )
def create_soapdenovo_task(r1, r2, name, thread, queue, job_type, work_dir, out_dir):
    """Assemble paired NGS reads with SOAPdenovo-127mer and stat the assembly.

    Returns the task, the assembled fasta path, the stats tsv path, and an
    option record describing the software version and parameters used.
    """
    option = {
        "soapdenovo": {
            "version": get_version(SOFTWARE_VERSION["soapdenovo"]),
            "option": "max_rd_len=151 avg_ins=400 reverse_seq=0 asm_flags=3 rank=1 pair_num_cutoff=3 map_len=64"
        }
    }
    shell = """
export PATH={soapdenovo}:{script}:$PATH
echo -e "max_rd_len=151\n[LIB]\navg_ins=400\nreverse_seq=0\nasm_flags=3\nrank=1\npair_num_cutoff=3\nmap_len=64\nq1={r1}\nq2={r2}\n" > {name}.config
echo -e "{r1} {r2}" >{name}_ngs.list
SOAPdenovo-127mer all -s {name}.config -K 47 -p {thread} -d 2 -R 2 -o {name} >ass.log
mv {name}.scafSeq {name}.asm.fasta
stat_genome.py -s {name}.asm.fasta -r {name}.asm.tsv
cp {name}.asm.tsv {name}.asm.fasta {out_dir}
""".format(script=SCRIPTS, soapdenovo=SOAPDENOVO_BIN, r1=r1, r2=r2,
           name=name, thread=thread, out_dir=out_dir)
    asm_task = Task(
        id="soapdenovo_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s %s" % (thread, queue),
        script=shell,
    )
    fasta = os.path.join(work_dir, "%s.asm.fasta" % name)
    stats = os.path.join(work_dir, "%s.asm.tsv" % name)
    return asm_task, fasta, stats, option
def create_kmerfreq_task(reads, name, kmer_length, thread, job_type, work_dir, out_dir):
    """Run kmerfreq on a read list and record the options actually used.

    Fix: the k-mer length is clamped to kmerfreq's maximum of 17 *before* the
    option record is built.  Previously the clamp happened after, so the
    recorded "-k" value could disagree with the command that was executed
    (and with kmerfreq_task, which clamps first).
    """
    kmer_length = min(kmer_length, 17)
    option = {
        "kmerfreq": {
            "version": get_version(SOFTWARE_VERSION["kmerfreq"]),
            "option": "-q 33 -m 0 -k %s" % kmer_length
        }
    }
    task = Task(
        id="kmerfreq_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script="""
export PATH={python}:$PATH
ls {reads} >{name}.data
{kmerfreq} -k {kmer_length} -t {thread} -p {name} -q 33 -m 0 {name}.data > {name}.kmer.count
python {script}/kmerfreq_stat.py {name}.freq.stat >{name}.kmer.stat
cp {name}.freq.stat {out_dir}/{name}.kmerfreq.stat
""".format(script=SCRIPTS,
           python=PYTHON_BIN,
           kmerfreq=KMERFREQ,
           kmer_length=kmer_length,
           name=name,
           reads=reads,
           thread=thread,
           out_dir=out_dir))
    # NOTE(review): %s.genome_estimate is not visibly produced by the script;
    # presumably a kmerfreq side output — confirm.
    return (task,
            os.path.join(work_dir, "%s.freq.stat" % name),
            os.path.join(work_dir, "%s.genome_estimate" % name),
            option)
def create_unmap_task(name, reference, r1, r2, thread, job_type, work_dir, out_dir):
    """Map reads to `reference` with minimap2 and extract the unmapped reads.

    Fix: option dict keys were "option:" (stray colon), unlike every other
    task factory in this file which uses "option" — normalised so downstream
    consumers of the option record see a consistent key.
    """
    option = OrderedDict()
    option["minimap2"] = {
        "version": get_version(SOFTWARE_VERSION["minimap2"]),
        "option": "-ax sr"
    }
    option["samblaster"] = {
        "version": get_version(SOFTWARE_VERSION["samblaster"]),
        "option": "default"
    }
    task = Task(
        id="unmap__%s" % name,  # NOTE(review): double underscore kept — task ids may be referenced elsewhere
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script="""
export PATH={minimap2}:{samblaster}:$PATH
minimap2 -t {thread} -ax sr {reference} {r1} {r2} |samblaster -u {name}.unmap.fq
#cp {name}.unmap.fq {out_dir}
""".format(minimap2=MINIMAP_BIN,
           samblaster=SAMBLASTER_BIN,
           reference=reference,
           r1=r1,
           r2=r2,
           name=name,
           thread=thread,
           out_dir=out_dir))
    return task, os.path.join(work_dir, "%s.unmap.fq" % name), option
def create_gse_scope_task(histogram, name, kmer_length, job_type, work_dir, out_dir):
    """Estimate genome size with GenomeScope and findGSE; copy results to out_dir.

    Returns the task and the work_dir paths of the genomescope txt,
    findgse txt, genomescope png and findgse png.
    """
    shell = """
{rscript} {script}/genomescope.R {histogram} {kmer_length} 150 ./ -1 1
mv plot.png {name}.genomescope.png
mv summary.txt {name}.genomescope.txt
{rscript} {script}/findGSE.R {histogram} {kmer_length} ./
mv v1.94.est.{name}.histogram.txt.sizek{kmer_length}.curvefitted.pdf {name}.findgse.pdf
mv v1.94.est.{name}.histogram.txt.genome.size.estimated.k{kmer_length}to{kmer_length}.fitted.txt {name}.findgse.txt
{gs} -dQUIET -dNOSAFER -r300 -dBATCH -sDEVICE=pngalpha -dNOPAUSE -dNOPROMPT -sOutputFile={name}.findgse.png {name}.findgse.pdf
rm -rf round*
cp {name}.genomescope.png {name}.genomescope.txt {out_dir}
cp {name}.findgse.txt {name}.findgse.png {name}.findgse.pdf {out_dir}
""".format(rscript=RSCRIPT, script=SCRIPTS, gs=GHOSTSCRIPT, name=name,
           kmer_length=kmer_length, histogram=histogram, out_dir=out_dir)
    task = Task(
        id="scope_gse_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 2",
        script=shell,
    )
    outputs = [os.path.join(work_dir, "%s.%s" % (name, ext))
               for ext in ("genomescope.txt", "findgse.txt",
                           "genomescope.png", "findgse.png")]
    return task, outputs[0], outputs[1], outputs[2], outputs[3]
def merge_data_task(name, r1, r2, job_type, work_dir, out_dir):
    """Merge lists of paired FASTQ files into single clean r1/r2 files.

    A single uncompressed pair is just symlinked (and run locally); anything
    else is concatenated with cat/zcat as appropriate.
    """
    gzipped = r1[0].endswith(".gz") or r2[0].endswith(".gz")
    reader = "zcat" if gzipped else "cat"
    if not gzipped and len(r1) <= 1:
        job_type = "local"  # trivial linking does not need the cluster
        run = """
ln -s {r1} {name}.clean.r1.fastq
ln -s {r2} {name}.clean.r2.fastq
""".format(r1=" ".join(r1), r2=" ".join(r2), name=name)
    else:
        run = """
{tools} {r1} >{name}.clean.r1.fastq
{tools} {r2} >{name}.clean.r2.fastq
""".format(tools=reader, r1=" ".join(r1), r2=" ".join(r2), name=name)
    task = Task(
        id="data_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{run}
""".format(run=run),
    )
    return (task,
            os.path.join(work_dir, "%s.clean.r1.fastq" % name),
            os.path.join(work_dir, "%s.clean.r2.fastq" % name))
def stat_gc_depth_task(genome, bam, name, window, job_type, work_dir, out_dir):
    """Compute per-base depth, coverage and GC-depth statistics and plots.

    window: sliding-window size passed to stat_gc_depth.py as -w.

    Fix: `window` was accepted and fed to .format() but never referenced in
    the script — "-w 5000" was hard-coded.  The parameter is now honoured.
    """
    bam = check_paths(bam)
    task = Task(
        id="stat_coverage",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
export PATH={samtools}:{python}:$PATH
samtools depth -aa {bam} > {work_dir}/{name}.depth
python {script}/stat_coverage.py -i {work_dir}/{name}.depth -d 1,5,10,20 -o {out_dir}/{name}.coverage.xlsx
python {script}/stat_length_gc.py -d {work_dir}/{name}.depth -g {genome} -n {out_dir}/{name}
python {script}/stat_gc_depth.py -d {work_dir}/{name}.depth -g {genome} -b 1000 -w {window} -e 100 -n {work_dir}/{name}
python {script}/draw_depth_gc.py -gcd {work_dir}/{name}.stat_gc_depth.tsv -n {out_dir}/{name}
#python {script}/plot_gc_depth.py -gcd {work_dir}/{name}.stat_gc_depth.tsv -n {out_dir}/{name}
""".format(samtools=SAMTOOLS_BIN,
           script=SCRIPTS,
           python=PYTHON_BIN,
           genome=genome,
           bam=bam,
           name=name,
           window=window,
           work_dir=work_dir,
           out_dir=out_dir))
    return task, os.path.join(out_dir, "%s.gc_depth.png" % name)
def soapdenovo_task(r1, r2, name, thread, queue, job_type, work_dir, out_dir):
    """Assemble paired reads with SOAPdenovo-127mer, writing results to out_dir."""
    shell = """
export PATH={soapdenovo}:{script}:$PATH
echo -e "max_rd_len=151\n[LIB]\navg_ins=400\nreverse_seq=0\nasm_flags=3\nrank=1\npair_num_cutoff=3\nmap_len=64\nq1={r1}\nq2={r2}\n" > {name}.config
echo -e "{r1} {r2}" >{name}_ngs.list
SOAPdenovo-127mer all -s {name}.config -K 47 -p {thread} -d 2 -R 2 -o {name} >ass.log
cp {name}.scafSeq {out_dir}/{name}.asm.fasta
stat_genome.py -s {out_dir}/{name}.asm.fasta -r {out_dir}/{name}.asm.tsv
""".format(script=SCRIPTS, soapdenovo=SOAPDENOVO_BIN, r1=r1, r2=r2,
           name=name, thread=thread, out_dir=out_dir)
    asm_task = Task(
        id="soapdenovo",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s %s" % (thread, queue),
        script=shell,
    )
    return (asm_task,
            os.path.join(out_dir, "%s.asm.fasta" % name),
            os.path.join(out_dir, "%s.asm.tsv" % name),
            os.path.join(work_dir, "%s_ngs.list" % name))
def get_heterozygosity_task(histo, estimate, kingdom, name, job_type, work_dir, out_dir):
    """Fit heterozygosity from a k-mer histogram and a genome-size estimate."""
    shell = """
export PATH={python}:$PATH
python {script}/fit_heterozygosity.py {histo} \
-e {estimate} --kingdom {kingdom} \
--name {name} --database {script} > {name}.heterozygosity.xls
cp {name}.heterozygosity.xls {out_dir}
cp {name}.kmer.p* {name}.heterozygosity.p* {out_dir}
""".format(script=SCRIPTS, python=PYTHON_BIN, histo=histo,
           estimate=estimate, name=name, kingdom=kingdom, out_dir=out_dir)
    task = Task(
        id="heterozygosity",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 2",
        script=shell,
    )
    return (task,
            os.path.join(work_dir, "%s.heterozygosity.xls" % name),
            os.path.join(work_dir, "%s.heterozygosity.png" % name))
def bwa_index_task(genome, name, job_type, work_dir, out_dir):
    """Link the genome into out_dir and build its bwa index there."""
    shell = """
export PATH={bwa}:$PATH
ln -sf {genome} {out_dir}/{name}.fasta
bwa index {out_dir}/{name}.fasta
""".format(bwa=BWA_BIN, genome=genome, name=name, out_dir=out_dir)
    index_task = Task(
        id="bwa_index",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script=shell,
    )
    return index_task, os.path.join(out_dir, "%s.fasta" % name)
def merge_raw_data_task(name, r1, r2, tools, job_type, work_dir, out_dir):
    """Concatenate raw read lists with `tools` (cat/zcat) into out_dir."""
    task = Task(
        id="merge_data",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{tools} {r1} >{out_dir}/{name}.raw.r1.fastq
{tools} {r2} >{out_dir}/{name}.raw.r2.fastq
""".format(name=name, tools=tools, r1=r1, r2=r2, out_dir=out_dir),
    )
    merged = [os.path.join(out_dir, "%s.raw.r%s.fastq" % (name, i))
              for i in (1, 2)]
    return task, merged[0], merged[1]
def stat_reads_task(reads, name, thread, job_type, work_dir, out_dir):
    """Produce read statistics with stat_barcode.py and copy them to out_dir."""
    # `thread` only sizes the scheduler slot; the script itself is single-run.
    shell = """
export PATH={python}:$PATH
python {scripts}/stat_barcode.py --input {reads} --out {name}.stat_reads.tsv >data.js
cp {name}.stat_reads.tsv {out_dir}
""".format(scripts=SCRIPTS, python=PYTHON_BIN, reads=reads,
           name=name, out_dir=out_dir)
    stat_task = Task(
        id="stat_reads_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script=shell,
    )
    return stat_task, os.path.join(work_dir, "%s.stat_reads.tsv" % name)
def obtain_contamination_task(taxid, name, kingdom, job_type, work_dir, out_dir,
                              mode="general", cratio=10):
    """Build the contamination reference for `name` from a taxid summary file.

    Reads the first '#' header line of `taxid` for the prokaryotic ratio and
    top-10 count; switches to strict mode (pulling the prokaryotic sequences
    out of the blast db) when contamination is severe.

    Fixes: the taxid file is now closed (with-block), and prok_ratio/top10
    default to 0 instead of raising NameError when no '#' line exists.
    """
    prok_ratio = 0.0
    top10 = 0
    with open(taxid) as fh:
        for line in fh:
            if line.startswith("#"):
                fields = line.split("\t")
                prok_ratio = float(fields[0].split(':')[1])
                top10 = int(fields[1].split(':')[1])
                break

    if prok_ratio >= cratio or top10 > 0:
        LOG.info("There is serious contamination of the sample, strict mode is mandatory")
        mode = "strict"

    if mode == "strict":
        # NOTE(review): dbase is hard-coded to NT_TAXON["fungi"] even though a
        # `kingdom` argument exists — confirm whether this is intentional.
        run = 'blastdbcmd -db {dbase} -dbtype "nucl" -taxidlist {name}.prokaryotic.taxid -out {name}.prokaryotic.fa'.format(
            dbase=NT_TAXON["fungi"], name=name)
        pfa = "%s.prokaryotic.fa" % name
    else:
        run = ""
        pfa = ""

    task = Task(
        id="blastdbcmd_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 2",
        script="""
export PATH={blast}:$PATH
{script}/print.py {taxid} >{name}.prokaryotic.taxid
{run}
cat {pfa} {mbase} >{name}.ref.fa
#cp {name}.ref.fa {out_dir}
""".format(blast=BLAST_BIN,
           script=SCRIPTS,
           taxid=taxid,
           run=run,
           pfa=pfa,
           name=name,
           mbase=MC_TAXON[kingdom],
           out_dir=out_dir))
    return task, os.path.join(work_dir, "%s.ref.fa" % name)
def bwa_merge_bam_task(sort_bams, name, thread, job_type, work_dir, out_dir):
    """Merge sorted bams (via a bam.list file) into one indexed bam in out_dir."""
    shell = """
export PATH={samtools}:$PATH
ls {sort_bams} >{out_dir}/bam.list
samtools merge -f -c --threads {thread} -b {out_dir}/bam.list {out_dir}/{name}.sorted.bam
samtools index {out_dir}/{name}.sorted.bam
#rm -rf {sort_bams}
""".format(samtools=SAMTOOLS_BIN, sort_bams=sort_bams, name=name,
           thread=thread, out_dir=out_dir)
    merge_task = Task(
        id="merge_bwa_bam",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script=shell,
    )
    return merge_task, os.path.join(out_dir, "%s.sorted.bam" % name)
def merge_bam_task(bams, name, thread, job_type, work_dir, out_dir):
    """Merge the given bam files directly into one indexed bam in work_dir."""
    shell = """
export PATH={samtools}:$PATH
samtools merge -f -c --threads {thread} {name}.sorted.bam {bams}
samtools index {name}.sorted.bam
#rm {bams}
#cp {name}.sorted.bam {out_dir}
""".format(samtools=SAMTOOLS_BIN, bams=bams, name=name,
           thread=thread, out_dir=out_dir)
    merge_task = Task(
        id="merge_bam",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script=shell,
    )
    return merge_task, os.path.join(work_dir, "%s.sorted.bam" % name)
def create_jellyfish_task(reads, name, thread, job_type, work_dir, out_dir, mode="general"):
    """Count 21-mers with jellyfish and emit a histogram.

    In non-"general" mode the histogram is truncated to its first 5000 rows.
    Returns the task, the histogram path in work_dir, and an option record.
    """
    option = {
        "jellyfish": {
            "version": get_version(SOFTWARE_VERSION["jellyfish"]),
            "option": "-m 21 -s 1G"
        }
    }
    general = (mode == "general")
    # In general mode, histo output IS the final histogram; otherwise an extra
    # truncation step produces it.
    histout = "%s.histogram.txt" % name if general else "%s.histogram_old.txt" % name
    runh = "" if general else "head -n 5000 %s.histogram_old.txt >%s.histogram.txt" % (name, name)
    jf_task = Task(
        id="jellyfish",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script="""
export PATH={jellyfish}:$PATH
jellyfish count -m 21 -s 1G -t {thread} -C {reads} -o {name}.jellyfish.jf
jellyfish histo -f {name}.jellyfish.jf -t {thread} > {histout}
{runh}
jellyfish stats -v -o {name}.stats.kmer.txt {name}.jellyfish.jf
cp {name}.histogram.txt {out_dir}
rm -rf {name}.jellyfish.jf
""".format(jellyfish=JELLYFISH_BIN, reads=reads, name=name, histout=histout,
           runh=runh, thread=thread, out_dir=out_dir))
    return jf_task, os.path.join(work_dir, "%s.histogram.txt" % name), option
def split_ngs_task(fastq_list, name, number, data_type, job_type, work_dir, out_dir):
    """Split a fastq list into `number` parts with splitfp.py.

    Returns the task, the output directory, and the glob patterns matching the
    generated r1/r2 part files.
    """
    split_task = Task(
        id="split_ngs",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{script}/splitfp.py -i {fastq_list} -w {out_dir} -o {name} -n {number} {type}
""".format(script=SCRIPTS, fastq_list=fastq_list, name=name,
           number=number, type=data_type, out_dir=out_dir),
    )
    patterns = ['%s.r%s.part_*.fastq' % (name, i) for i in (1, 2)]
    return split_task, out_dir, patterns[0], patterns[1]
def split_data(r1, r2, name, number, job_type, concurrent, refresh, work_dir, out_dir, platform="illumina"):
    """Split sequencing data into `number` parts and return the part paths.

    Long-read platforms yield single files; illumina/mgi yield paired
    "r1 r2" strings.  NOTE(review): a second `split_data` later in this file
    shadows this definition at import time — confirm which one callers expect.
    """
    long_read = platform in ("PromethION", "GridION", "RSII", "Sequel")
    paired = platform in ("illumina", "mgi")
    if long_read:
        pattern = "%s.part_*.fast*" % name
        r2 = ""
    elif paired:
        pattern = "%s.r1.part_*.fastq" % name
    else:
        raise Exception("The input sequencing platform is abnormal.")

    dag = DAG("split_data")
    split_task = Task(
        id="split_data",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{script}/splitfp.py -r1 {r1} -r2 {r2} -o {name} -n {number}
#cp {name}.* {out_dir}
""".format(script=SCRIPTS, r1=r1, r2=r2, name=name,
           number=number, out_dir=out_dir),
    )
    dag.add_task(split_task)
    do_dag(dag, concurrent, refresh)

    parts = read_files(work_dir, pattern)
    if paired:
        # Pair each r1 part with its r2 sibling.
        return ["%s %s" % (p, p.replace(".r1.part_", ".r2.part_")) for p in parts]
    return parts
def split_data(r1, r2, name, number, job_type, work_dir, out_dir):
    """Split read data into `number` parts, inferring single/paired from lengths.

    NOTE(review): this redefines the earlier `split_data` in this file;
    confirm the shadowing is intended.
    """
    single = len(r1) != len(r2) and len(r2) <= 1
    paired = len(r1) == len(r2)
    if single:
        pattern = "%s.part_*.fast*" % name
        r2 = ""
    elif paired:
        pattern = "%s.r1.part_*.fastq" % name
    else:
        raise Exception("The input sequencing platform is abnormal.")

    dag = DAG("split_data")
    split_task = Task(
        id="split_data",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script="""
{script}/splitfp.py -r1 {r1} -r2 {r2} -o {name} -n {number}
#cp {name}.* {out_dir}
""".format(script=SCRIPTS, r1=r1, r2=r2, name=name,
           number=number, out_dir=out_dir),
    )
    dag.add_task(split_task)
    do_dag(dag, 8, 10)  # fixed concurrency/refresh for this variant

    parts = read_files(work_dir, pattern)
    if paired:
        return ["%s %s" % (p, p.replace(".r1.part_", ".r2.part_")) for p in parts]
    return parts
def create_ncovann_task(genome, name, refgff, job_type, work_dir, out_dir):
    """Annotate an nCoV assembly by transferring the reference gff and running tbl2asn."""
    # NOTE(review): "--organism 'Unknow'" is reproduced as-is (likely a typo
    # for 'Unknown') — confirm before changing, the value may be matched downstream.
    shell = """
export PATH={tbl2asn}:$PATH
{script}/process_assembly.py {genome} --topology linear --moltype ss-RNA --completeness complete --gcode 1 --organism 'Unknow' --strain {name} > {name}.genomic.fasta
{script}/sed.py {refgff} --old MN908947.3 --new {name} > {name}.genomic.gff
{script}/gff2tbl.py {name}.genomic.gff >{name}.genomic.tbl
tbl2asn -i {name}.genomic.fasta -V b -s T
mv {name}.genomic.gbf {name}.genomic.gb
{script}/gb2protein.py {name}.genomic.gb >{name}.protein.fasta
cp {name}.genomic.fasta {name}.genomic.gff {name}.genomic.sqn {name}.genomic.gb {name}.protein.fasta {out_dir}
""".format(tbl2asn=TBL2ASN_BIN, script=SCRIPTS, genome=genome,
           name=name, refgff=refgff, out_dir=out_dir)
    return Task(
        id="annotate_%s" % name,
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 1",
        script=shell,
    )
def get_jellyfish_task(fastq, name, depth, thread, job_type, work_dir, out_dir):
    """Run jellyfish 21-mer counting and keep the first `depth` histogram rows."""
    shell = """
export PATH={jellyfish}:$PATH
jellyfish count -m 21 -s 1G -t {thread} -C {fastq} -o {name}.jellyfish.jf
jellyfish histo -f {name}.jellyfish.jf -t {thread} > {name}.histogram_old.txt
head -n {depth} {name}.histogram_old.txt >{name}.histogram.txt
jellyfish stats -v -o {name}.stats.kmer.txt {name}.jellyfish.jf
cp {name}.histogram.txt {out_dir}
rm -rf {name}.jellyfish.jf
""".format(jellyfish=JELLYFISH_BIN, fastq=fastq, name=name,
           depth=depth, thread=thread, out_dir=out_dir)
    jf_task = Task(
        id="jellyfish",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp %s" % thread,
        script=shell,
    )
    return jf_task, os.path.join(work_dir, "%s.histogram.txt" % name)
def sample_fastq_task(r1, r2, proportion, name, job_type, work_dir):
    """Subsample a pair of fastq files with seqkit at the given proportion."""
    shell = """
{seqkit} sample -p {proportion} -2 -o {work_dir}/{name}_choose.r1.fastq {r1}
{seqkit} sample -p {proportion} -2 -o {work_dir}/{name}_choose.r2.fastq {r2}
""".format(seqkit=SEQKIT,
           r1=r1,
           r2=r2,
           name=name,
           proportion=proportion,  # sampling proportion (0-1)
           work_dir=work_dir)
    sample_task = Task(
        id="sample_fastq",
        work_dir=work_dir,
        type=job_type,
        option="-pe smp 2",
        script=shell,
    )
    return (sample_task,
            os.path.join(work_dir, "%s_choose.r1.fastq" % name),
            os.path.join(work_dir, "%s_choose.r2.fastq" % name),
            os.path.join(work_dir, "%s_choose.r*.fastq" % name))
def create_ncovsnp_tasks(read, name, reffa, refgb, thread, job_type, work_dir, out_dir):
    """Build the two-stage nCoV SNP-calling pipeline tasks.

    Stage 1 (snp_task): map ONT reads to `reffa` with minimap2, call variants
    with medaka_variant, and build a raw consensus fasta.
    Stage 2 (snippy_task, runs downstream of stage 1): polish/verify the raw
    consensus against `refgb` with snippy and emit the final consensus plus
    depth/SNP reports in out_dir.

    Returns (snp_task, snippy_task, final consensus path, option record).
    """
    option = OrderedDict()
    option["samtools"] = {
        "version": get_version(SOFTWARE_VERSION["samtools"]),
        "option": "default"
    }
    option["medaka"] = {
        "version": get_version(SOFTWARE_VERSION["medaka"]),
        "option": "medaka_variant (parameters: default)"
    }
    option["snippy"] = {
        "version": get_version(SOFTWARE_VERSION["snippy"]),
        "option": "--minfrac 0.9"
    }
    # Stage 1: mapping + medaka variant calling.  Produces {name}.depth.xls and
    # {name}.raw_concencus.fasta in work_dir, both consumed by stage 2.
    snp_task = Task(id="medaka_%s" % name,
                    work_dir=work_dir,
                    type=job_type,
                    option="-pe smp %s" % thread,
                    script="""
export PATH={samtools}:{minimap}:{medaka}:{python}:$PATH
source {activate} medaka
minimap2 -ax map-ont -t {thread} {reffa} {read} |samtools sort -@ 5 - >{name}.sort.bam
samtools index {name}.sort.bam
samtools depth -a {name}.sort.bam -d 0 --reference {reffa} >{name}.depth.xls
medaka_variant -f {reffa} -i {name}.sort.bam -t {thread} -b 1000 -p {name}
cp medaka_variant/round_1.vcf {name}.raw.vcf
python {script}/vcf2fasta.py --vcf {name}.raw.vcf --depth {name}.depth.xls --refer {reffa} >{name}.raw_concencus.fasta
rm -rf medaka_variant {name}.sort.bam
#cp {name}.concencus.fasta {out_dir}
""".format(minimap=MINIMAP_BIN,
           samtools=SAMTOOLS_BIN,
           medaka=MEDAKA_BIN,
           activate=MEDAKA_ENV,
           script=SCRIPTS,
           python=PYTHON_BIN,
           read=read,
           name=name,
           reffa=reffa,
           thread=thread,
           out_dir=out_dir))
    # Stage 2: snippy on the raw consensus; filters SNPs by depth and writes
    # the final consensus and reports to out_dir.
    # NOTE(review): snippy's long option is usually spelled --ctgs; confirm
    # the single-dash "-ctgs" form is accepted by the installed version.
    snippy_task = Task(id="snippy_%s" % name,
                       work_dir=work_dir,
                       type=job_type,
                       option="-pe smp %s" % thread,
                       script="""
export PATH={snippy}:{python}:$PATH
python {script}/plot_depth_stat.py -i {name}.depth.xls -w 1 -o {name}
snippy --cpus {thread} --outdir {name} --minfrac 0.9 --ref {refgb} -ctgs {name}.raw_concencus.fasta
python {script}/filter_snp_gff.py -i {name}/snps.gff -d {name}.depth.xls >{name}.snps.gff
python {script}/snp_gff2fasta.py --gff {name}.snps.gff --refer {reffa} >{name}.concencus.fasta
cp {name}/snps.vcf {name}.snps.vcf
rm -rf {name}
cp {name}.concencus.fasta {out_dir}
cp {name}.depth.png {name}.depth.pdf {name}.snps.gff {out_dir}
""".format(snippy=SNIPPY_BIN,
           python=PYTHON_BIN,
           script=SCRIPTS,
           refgb=refgb,
           reffa=reffa,
           name=name,
           thread=thread,
           out_dir=out_dir))
    # snippy must wait for medaka's outputs.
    snippy_task.set_upstream(snp_task)
    return snp_task, snippy_task, os.path.join(out_dir, "%s.concencus.fasta" % name), option