def annotate_gff(self, sampleID):

    cmd = '''
        set -eo pipefail
        echo annotate gff for {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/SV/{sampleID}/{sv_soft}

        sh {annovar} \\
            -t SVType \\
            {sampleID}.{sv_soft}.gff \\
            {sampleID}.{sv_soft}

        python {moduledir}/Varition/SV/sv_cnv_stat.py \\
            -i {sampleID}.{sv_soft}.hg19_multianno.xls \\
            -s {sampleID} \\
            -soft {sv_soft}

        echo annotate gff for {sampleID} done: `date "+%F %T"`
    '''.format(sampleID=sampleID, **self.__dict__)

    shell_path = '{analydir}/SV/{sampleID}/{sv_soft}/annotate_gff_{sampleID}.sh'.format(
        sampleID=sampleID, **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'annotate_gff'
    job_name = 'annotate_gff_{}'.format(sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    after_jobs = ['data_release', 'primary_report']
    utils.add_order(self.orders, job_name, after_jobs=after_jobs)
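# --- Assumed utils contract (sketch) ----------------------------------
# Every method in this module follows the same write-shell / add-job /
# add-order pattern. The real helpers live in this repo's `utils`
# module; the sketch below only illustrates the contract implied by the
# call sites. Bodies, record fields, and the 'order ... after ...'
# syntax (SJM-style) are assumptions, not the actual implementation.
import os
import textwrap

def write_shell_sketch(shell_path, cmd):
    # Create the script's directory and write out the dedented command.
    dirname = os.path.dirname(shell_path)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)
    with open(shell_path, 'w') as out:
        out.write(textwrap.dedent(cmd).strip() + '\n')

def add_job_sketch(jobs, now_point, startpoint, analysis_points,
                   job_name, shell_path, queues, threads=1):
    # Register one scheduler job; the pipeline presumably compares
    # `now_point` with `startpoint` (via `analysis_points`) to decide
    # whether the job still needs to run.
    jobs.append({
        'name': job_name,
        'point': now_point,
        'shell': shell_path,
        'queues': queues,
        'threads': threads,
    })

def add_order_sketch(orders, job_name, before_jobs=None, after_jobs=None):
    # Record dependency edges: `job_name` runs after each of its
    # `before_jobs` and before each of its `after_jobs`.
    for job in before_jobs or []:
        orders.append('order {0} after {1}'.format(job_name, job))
    for job in after_jobs or []:
        orders.append('order {0} after {1}'.format(job, job_name))
# ----------------------------------------------------------------------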
def breakdancer_config(self, sampleID):

    cmd = '''
        set -eo pipefail
        echo breakdancer config for {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/SV/{sampleID}/breakdancer

        perl {soft_dir}/breakdancer/current/bam2cfg.pl \\
            -g -h -n 100000 \\
            {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
            > {sampleID}.breakdancer.cfg

        echo breakdancer config for {sampleID} done: `date "+%F %T"`
    '''.format(sampleID=sampleID, **self.__dict__)

    shell_path = '{analydir}/SV/{sampleID}/breakdancer/breakdancer_config_{sampleID}.sh'.format(
        sampleID=sampleID, **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'breakdancer_config'
    job_name = 'breakdancer_config_{}'.format(sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
    utils.add_order(self.orders, job_name, before_jobs=before_jobs)
def phenolyzer(self):

    print '> phenolyzer ...'

    # write shell
    if not self.args['disease_name']:
        print '[error] phenolyzer needs disease name in your sample_info'
        exit(1)

    cmd = '''
        set -eo pipefail
        echo phenolyzer start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/Network

        # Phenolyzer
        python {moduledir}/Phenolyzer/phenolyzer-0.1.5/phenolyzer_pipe4.7.py \\
            --dir {analydir} \\
            --disease "{disease_name}" \\
            --genelist {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls \\
            --job {newjob}

        # DisGeNet
        python {moduledir}/DisGeNet/disgenet.py \\
            --id '{disease_ids}' \\
            --glist {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls \\
            --out_dir .

        # Brief Result
        echo generate brief results
        python {ROOT_DIR}/modules/brief/text2excel.py \\
            {BriefResults}/Network/phenolyzer.xlsx \\
            {ROOT_DIR}/modules/brief/readme/phenolyzer.readme.xls \\
            AllGene_list.xls \\
            CandidateGene_list.xls \\
            CandidateGene_score.xls

        python {ROOT_DIR}/modules/brief/text2excel.py \\
            {BriefResults}/Network/disgenet.xlsx \\
            {ROOT_DIR}/modules/brief/readme/disgenet.readme.xls \\
            DisGeNet_shared_gene.xls

        echo phenolyzer done: `date "+%F %T"`
    '''.format(**self.__dict__)

    shell_path = '{analydir}/Advance/{newjob}/Network/phenolyzer.sh'.format(
        **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'phenolyzer'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['integrate_result']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def stat_uncover(self, patientID, sampleID):
    # based on sort.bam
    # print '  stat uncover...'

    # write shell
    if self.args['seqstrag'] != 'WGS':
        cmd = '''
            set -eo pipefail
            echo stat uncover for {sampleID} start: `date "+%F %T"`

            cd {analydir}/Alnstat/{sampleID}

            samtools-1.6 depth \\
                -aa -q 0 -Q 0 \\
                -b {TR} \\
                {analydir}/Mapping/{patientID}.{sampleID}/{sampleID}.sort.bam |
            awk -F'\\t' '$3==0' |
            grep -vwf target_region.00.depth > target_region.0.depth

            python {moduledir}/Alnstat/uncover_pos_chr_pipe4.6.py \\
                target_region.0.depth \\
                {sampleID} \\
                {sampleID}.uncovered_region.annovar.result.xls

            rm -f target_region.0.depth

            echo stat uncover for {sampleID} done: `date "+%F %T"`
        '''
    else:
        cmd = '''
            set -eo pipefail
            echo stat uncover for {sampleID} start: `date "+%F %T"`

            # clean intermediate files in the sample's Alnstat directory
            cd {analydir}/Alnstat/{sampleID}

            rm -f *.depth *.bed *.pdf* *.png

            echo stat uncover for {sampleID} done: `date "+%F %T"`
        '''

    cmd = cmd.format(patientID=patientID, sampleID=sampleID, **self.args)

    shell_path = '{analydir}/Alnstat/{sampleID}/stat_uncover_{sampleID}.sh'.format(
        analydir=self.analydir, sampleID=sampleID)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'stat_uncover'
    job_name = 'stat_uncover_{sampleID}'.format(sampleID=sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['stat_depth_{sampleID}'.format(sampleID=sampleID)]
    after_jobs = []
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def ibd(self):

    if self.args['seqstrag'] == 'WGS':
        region = '-r 1'
    else:
        region = '-R {TR}'.format(**self.__dict__)

    print '> IBD'

    cmd = '''
        set -eo pipefail
        echo IBD start: `date "+%F %T"`

        cd ${Merged_vcf}/IBD

        # extract region
        bcftools-1.6 view \\
            ${region} \\
            ../VCF/snp.merged.vcf.gz |
        awk '$5!~/*/' > snp.merged.bed.vcf

        # extract sample_info
        awk -F '\\t' -v OFS='\\t' '$1!~/^#/{print $2, $2, $1, $2}' \\
            ${samp_info} \\
            > sample.ped

        # plink
        plink --vcf snp.merged.bed.vcf --double-id --update-ids sample.ped --make-bed -out plink
        plink --bfile plink --genome -out all
        plink --bfile plink --genome --rel-check -out family

        # result
        awk -v OFS='\\t' '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' all.genome > all.IBD.xls
        awk -v OFS='\\t' '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' family.genome > family.IBD.xls

        ln -sf ${moduledir}/IBD/readme.txt IBD.readme.txt

        rm -f plink.* *.{log,nosex,genome} *.vcf *.ped

        echo IBD done: `date "+%F %T"`
    '''
    cmd = Template(cmd).safe_substitute(**dict(self.__dict__, **locals()))

    shell_path = '{Merged_vcf}/IBD/IBD.sh'.format(**self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'ibd'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['annotate_merged_snp']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
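# Note on the templating choice above: ibd() is the one method that
# builds its command with string.Template instead of str.format,
# because the awk bodies and the brace glob ('{log,nosex,genome}')
# contain literal braces that str.format would misread as placeholders.
# With $-style placeholders, `${samp_info}` is substituted while awk's
# `$1`/`$2` survive untouched (a digit cannot start a Template
# identifier, and safe_substitute leaves non-matching text alone).
# Minimal demo:
from string import Template

tpl = Template("awk '{print $2, $2, $1, $2}' ${samp_info} > sample.ped")
print tpl.safe_substitute(samp_info='sample_info.txt')
# -> awk '{print $2, $2, $1, $2}' sample_info.txt > sample.ped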
def hla_bwa_mem(self, sampleid, lane):
    # print '> hla bwa mem ...'

    cmd = '''
        set -eo pipefail
        echo hla bwa mem and samtools sort for {sampleid} start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

        fq1={analydir}/QC/{sampleid}/{sampleid}_{novoid}_{flowcell}_L{lane}_1.clean.fq
        fq2={analydir}/QC/{sampleid}/{sampleid}_{novoid}_{flowcell}_L{lane}_2.clean.fq

        if [ ! -f $fq1 ];then
            fq1=$fq1.gz
            fq2=$fq2.gz
        fi

        bwa mem \\
            -t 6 -M \\
            -R "@RG\\tID:{sampleid}_{novoid}_{flowcell}_L{lane}\\tSM:{sampleid}\\tLB:{sampleid}\\tPU:{novoid}_{flowcell}_L{lane}\\tPL:illumina\\tCN:novogene" \\
            {athlates_db_dir}/ref/hla_nclean.fasta \\
            $fq1 $fq2 |
        samtools-1.6 view \\
            -@ 5 -b -S -F 4 -t \\
            {athlates_db_dir}/ref/hla_nclean.fasta.fai |
        samtools-1.6 sort \\
            -@ 3 -m 2G \\
            -T {sampleid}_{novoid}_{flowcell}_L{lane}.tmp \\
            -o {sampleid}_{novoid}_{flowcell}_L{lane}.sort.bam

        echo hla bwa mem and samtools sort for {sampleid} done: `date "+%F %T"`
    '''.format(sampleid=sampleid, **dict(lane, **self.__dict__))

    shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_bwa_mem_{sampleid}_{novoid}_{flowcell}_L{lane}.sh'.format(
        sampleid=sampleid, **dict(lane, **self.__dict__))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'hla_bwa_mem'
    job_name = 'hla_bwa_mem_{sampleid}_{novoid}_{flowcell}_L{lane}'.format(
        sampleid=sampleid, **lane)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = []
    if self.qc_status == 'waiting':
        before_jobs = [
            'qc_{sampleid}_{novoid}_{flowcell}_L{lane}'.format(
                sampleid=sampleid, **lane)
        ]
    after_jobs = [
        'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid, **lane)
    ]
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def sentieon_markdup(self, patientID, sampleID):
    # print '  sentieon markdup...'

    # write shell
    markdup_threads = self.threads['markdup']

    cmd = '''
        set -eo pipefail
        echo sentieon markdup for {sampleID} start: `date "+%F %T"`

        cd {analydir}/Mapping/{patientID}.{sampleID}

        sentieon driver \\
            -t {markdup_threads} \\
            -i {sampleID}.sort.bam \\
            --algo LocusCollector \\
            --fun score_info \\
            {sampleID}.score.txt

        sentieon driver \\
            -t {markdup_threads} \\
            -i {sampleID}.sort.bam \\
            --algo Dedup \\
            --score_info {sampleID}.score.txt \\
            --metrics {sampleID}.dedup.metrics.txt \\
            {sampleID}.nodup.bam

        echo sentieon markdup for {sampleID} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/sentieon_markdup_{sampleID}.sh'.format(
        **dict(self.__dict__, **locals()))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'sentieon_markdup'
    job_name = 'sentieon_markdup_{sampleID}'.format(sampleID=sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path,
                  self.sentieon_queues, threads=markdup_threads)

    # add order
    before_jobs = [
        '{merge_soft}_merge_{sampleID}'.format(sampleID=sampleID, **self.__dict__)
    ]
    after_jobs = 'mapping_check_{sampleID} stat_flag_{sampleID}'.format(
        sampleID=sampleID).split()
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def filter_cnv(self):

    print '> filter_cnv'

    for sampleid in self.sample_infos:
        cmd = '''
            set -eo pipefail
            echo filter cnv for {sampleid} start: `date "+%F %T"`

            cd {FilterCNV}/{sampleid}

            python {moduledir}/Varition/Filter/filter_sv_cnv.py \\
                --proj {analydir} \\
                --overlap 0.7 \\
                --sampleID {sampleid} \\
                --outdir {FilterCNV} \\
                --soft {cnv_soft} \\
                --lib StringentLib,InclusiveLib,DGV.GoldStandard.July2015,DGV,CNVD

            # Brief Result
            echo generate brief results
            python {ROOT_DIR}/modules/brief/brief_anno.py \\
                -i {sampleid}.LikelyDeleterious.CNV.xls \\
                -O {BriefResults}/FilterCNV \\
                -t sv_cnv

            python {ROOT_DIR}/modules/brief/text2excel.py \\
                {BriefResults}/FilterCNV/{sampleid}.LikelyDeleterious.CNV.xlsx \\
                {ROOT_DIR}/modules/brief/readme/filter_sv_cnv.readme.xls \\
                {BriefResults}/FilterCNV/{sampleid}.LikelyDeleterious.CNV.brief.xls

            echo filter cnv for {sampleid} done: `date "+%F %T"`
        '''.format(sampleid=sampleid, **self.__dict__)

        shell_path = '{FilterCNV}/{sampleid}/filter_cnv_{sampleid}.sh'.format(
            sampleid=sampleid, **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'filter_cnv'
        job_name = 'filter_cnv_{}'.format(sampleid)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        if self.cnv_soft == 'freec':
            cnv_last = 'freec_call_{}'.format(sampleid)
        elif self.cnv_soft == 'conifer':
            cnv_last = 'conifer_call'
        before_jobs = [cnv_last]
        after_jobs = ['data_release']
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def pathway(self):

    print '> pathway ...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo pathway start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/Pathway

        python {moduledir}/Enrich_R/bin/stat_phyper_table.py \\
            -i {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls \\
            -o Pathway \\
            -f 1

        # extract gene
        awk -F '\\t' 'NR>1 && $5<0.05 {print8}' Pathway/Pathway_kegg.xls | tr '/' '\\n' | sort -u > KEGG.xls

        # extract path
        awk -F '\\t' 'NR>1 && $5<0.05 {print1}' Pathway/Pathway_kegg.xls | sort -u >> KEGG.xls

        python {moduledir}/KEGG/kegg_svg.py KEGG.xls

        # Brief Result
        echo generate brief results
        python {ROOT_DIR}/modules/brief/text2excel.py \\
            {BriefResults}/Pathway/pathway.xlsx \\
            {ROOT_DIR}/modules/brief/readme/pathway.readme.xls \\
            Pathway/Pathway_go_MF.xls \\
            Pathway/Pathway_go_BP.xls \\
            Pathway/Pathway_go_CC.xls \\
            Pathway/Pathway_kegg.xls

        echo pathway done: `date "+%F %T"`
    '''.format(
        print1='{print $1}',
        print8='{print $8}',
        **self.args)

    shell_path = '{analydir}/Advance/{newjob}/Pathway/pathway_enrichment.sh'.format(
        **self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'pathway'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['integrate_result']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
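# The print1/print8 keywords above exist only to smuggle literal awk
# braces past str.format: '{print8}' is itself a format placeholder
# whose value is the text '{print $8}'. Doubling the braces is the more
# common alternative; both spellings below produce the same command:
indirect = "awk 'NR>1 {print8}' in.xls".format(print8='{print $8}')
doubled = "awk 'NR>1 {{print $8}}' in.xls".format()
assert indirect == doubled == "awk 'NR>1 {print $8}' in.xls"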
def hla_sort_by_name(self, sampleid, gene):
    # print '  sort by name ...'

    cmd = '''
        set -eo pipefail
        echo hla sort by name for {sampleid} {gene} start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}

        mkdir -p TMP

        for gene in {gene} non-{gene};do
            samtools-1.6 view \\
                -b -L {athlates_db_dir}/bed/hla.$gene.bed \\
                -o {sampleid}.$gene.bam \\
                -@ 4 \\
                ../../{sampleid}.nodup.bam

            (
                samtools-1.6 view -H {sampleid}.$gene.bam
                samtools-1.6 view {sampleid}.$gene.bam | sort -k1,1 -k3,3 -T TMP
            ) | samtools-1.6 view -bS -o {sampleid}.$gene.sort.bam -@ 4 -

            rm -f {sampleid}.$gene.bam
        done

        rm -rf TMP

        # rm -f ../../{sampleid}.sort.bam
        # rm -f ../../{sampleid}.nodup.bam

        echo hla sort by name for {sampleid} {gene} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}/hla_sort_by_name_{sampleid}_{gene}.sh'.format(
        **dict(self.__dict__, **locals()))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'hla_sort_by_name'
    job_name = 'hla_sort_by_name_{sampleid}_{gene}'.format(**locals())
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['hla_picard_markdup_{sampleid}'.format(**locals())]
    after_jobs = []
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def freec_call(self, sampleID):

    seqtype = self.args['seqstrag'].split('_')[0]

    target = ''
    if seqtype != 'WGS':
        target = '\\\n{:16}--target {} '.format(' ', self.args['TR'])

    sex = 'XX' if self.sample_infos[sampleID]['sex'] == 'F' else 'XY'

    REF = 'hg19' if self.__dict__['ref'] == 'b37' else self.__dict__['ref']

    cmd = '''
        set -eo pipefail
        echo cnv call with freec for {sampleID} start: `date "+%F %T"`

        cd {analydir}/SV/{sampleID}/freec

        python {moduledir}/Varition/CNV/freec/freec_calling.py \\
            --type {seqtype} {target}\\
            --format BAM \\
            --loh 0 \\
            --contamination 0 \\
            --samName {sampleID} \\
            --sample {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
            --sex {sex} \\
            --ref {ref} \\
            --o .

        python {moduledir}/Varition/CNV/freec/Chr_CNV_freec_pipe4.5.py \\
            --inf ./{sampleID}.freec.{REF}_multianno.xls \\
            --ref {ref} \\
            --sample_info {samp_info}

        rm -f *cpn *txt

        echo cnv call with freec for {sampleID} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/SV/{sampleID}/freec/freec_call_{sampleID}.sh'.format(
        sampleID=sampleID, **self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'freec_call'
    job_name = 'freec_call_{}'.format(sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
    after_jobs = ['primary_report']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
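# What the conditional `target` fragment above renders to: a literal
# backslash-newline plus a 16-space indent, so the extra option lands on
# its own, correctly aligned continuation line inside the freec command
# ('{:16}'.format(' ') pads a single space to width 16). The .bed path
# here is only a placeholder:
target = '\\\n{:16}--target {} '.format(' ', '/path/to/target_region.bed')
# target == '\\\n                --target /path/to/target_region.bed '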
def sambamba_markdup(self, patientID, sampleID):
    # print '  sambamba markdup...'

    # write shell
    markdup_threads = self.threads['markdup']

    cmd = '''
        set -eo pipefail
        echo sambamba markdup for {sampleID} start: `date "+%F %T"`

        cd {analydir}/Mapping/{patientID}.{sampleID}

        sambamba markdup \\
            -t {markdup_threads} \\
            --overflow-list-size=10000000 \\
            --tmpdir=tmp \\
            {sampleID}.sort.bam \\
            {sampleID}.nodup.bam

        rm -rf tmp

        echo sambamba markdup for {sampleID} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/sambamba_markdup_{sampleID}.sh'.format(
        **dict(self.__dict__, **locals()))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'sambamba_markdup'
    job_name = 'sambamba_markdup_{sampleID}'.format(sampleID=sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues,
                  threads=markdup_threads)

    # add order
    before_jobs = [
        '{merge_soft}_merge_{sampleID}'.format(sampleID=sampleID, **self.__dict__)
    ]
    after_jobs = 'mapping_check_{sampleID} stat_flag_{sampleID}'.format(
        sampleID=sampleID).split()
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def gene_association(self):

    print '> gene association ...'

    self.gene_as_filter()

    # write shell
    cmd = '''
        set -eo pipefail
        echo gene association start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/GeneAS

        for mtype in snp snp.indel;do
            python {moduledir}/Association/Burden/GetBurdenFre.py \\
                -case $mtype.filter.noXY.V6Pad100.xls.gz \\
                -control {moduledir}/{pad_100_stat} \\
                -cr 0.95 \\
                -nr 0.6 \\
                -cc N \\
                -Num 2827 \\
                -out $mtype.burden.stat.xls

            rows=`wc -l $mtype.burden.stat.xls | cut -d' ' -f1`
            if [ $rows -eq 1 ];then
                echo "[error] no data in $mtype.burden.stat.xls"
                exit 1
            fi

            Rscript {moduledir}/Association/Burden/GeneFisherPlot.R \\
                --infile $mtype.burden.stat.xls \\
                --outpre $mtype.burden

            paste $mtype.burden.fisher.xls $mtype.burden.stat.samstat.xls > $mtype.burden.result.xls
        done

        echo gene association done: `date "+%F %T"`
    '''.format(**self.__dict__)

    shell_path = '{analydir}/Advance/{newjob}/GeneAS/gene_association.sh'.format(
        **self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'gene_association'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['gene_as_filter']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def ppi(self):

    print '> ppi ...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo ppi start: `date "+%F %T"`

        cd {PPI}

        echo 9606 > PPI_genes.xls
        cat {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls | tr ',' '\\n' | tr '\\n' '\\t' >> PPI_genes.xls
        echo -e "\\nall\\n20" >> PPI_genes.xls

        java -Xmx6G -jar {genemania_jar} QueryRunner \\
            --data {genemania_data} \\
            --out flat \\
            --results . \\
            PPI_genes.xls

        python {moduledir}/PPI/SplitPPI_Result.py .

        # Brief Result
        echo generate brief results
        python {ROOT_DIR}/modules/brief/text2excel.py \\
            {BriefResults}/PPI/PPI.xlsx \\
            {ROOT_DIR}/modules/brief/readme/ppi.readme.xls \\
            Gene_interactions.xls \\
            Gene_description.xls \\
            Networks.description.xls

        echo ppi done: `date "+%F %T"`
    '''.format(**self.__dict__)

    shell_path = '{PPI}/ppi.sh'.format(**self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'ppi'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['integrate_result']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def mapping_check(self, patientID, sampleID):
    # print '  mapping check...'

    # write shell
    sex = self.sample_infos[sampleID]['sex']

    cmd = '''
        set -eo pipefail
        echo mapping check for sample {sampleID} start: `date "+%F %T"`

        python2 {moduledir}/QC/auto_check.py \\
            --qc_list {qc_list} \\
            --sampid {sampleID} \\
            --pwd {analydir} \\
            --check map \\
            --jobname {newjob} \\
            --seqstrag {seqstrag} \\
            --email {email} \\
            --PE {PE} \\
            --gender {sex} \\
            --dup {dup} \\
            --depth {depth}

        # remove clean data if mapping check passed
        if {rm_clean};then
            rm -f {analydir}/QC/{sampleID}/{sampleID}_*.clean.fq*
        fi

        echo mapping check for sample {sampleID} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/mapping_check_{sampleID}.sh'.format(
        **dict(self.__dict__, **locals()))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'mapping_check'
    job_name = 'mapping_check_{sampleID}'.format(sampleID=sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = []
    after_jobs = [
        'final_bam_{sampleID}'.format(sampleID=sampleID),
        'data_release'
    ]
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def crest_call(self, sampleID, chrom_list):

    for chrom in chrom_list:
        # 1 extract soft clip
        cmd = '''
            set -eo pipefail
            echo sv call with crest for {sampleID} start: `date "+%F %T"`\n
            # 1 Extract Softclip
            echo extract softclip...
            perl {soft_dir}/CREST/CREST/extractSClip.pl \\
                -o {analydir}/SV/{sampleID}/crest/bychr \\
                -i {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
                -ref_genome {reffasta} \\
                -r {chrom} \\
                -p {sampleID}

            # 2 CREST Calling
            echo crest calling...
            perl {soft_dir}/CREST/crest_sv_calling_pipe4.6.pl \\
                -outDir {analydir}/SV/{sampleID}/crest/bychr \\
                -tumorBam {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
                --min_one_side_reads 4 \\
                -sampleID {sampleID}.{chrom} \\
                -regionList {refbed} \\
                -cover {analydir}/SV/{sampleID}/crest/bychr/{sampleID}.{chrom}.cover \\
                -ref {reffasta} \\
                -bit {reffasta2bit}

            echo sv call with crest for {sampleID} done: `date "+%F %T"`
        '''.format(sampleID=sampleID, chrom=chrom, **self.__dict__)

        shell_path = '{analydir}/SV/{sampleID}/crest/crest_call_chr_{chrom}_{sampleID}.sh'.format(
            sampleID=sampleID, chrom=chrom.strip('chr'), **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'crest_call'
        job_name = 'crest_call_{}_{}'.format(chrom, sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
        after_jobs = ['crest_txt2gff_{sampleID}'.format(sampleID=sampleID)]
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def breakdancer_call(self, sampleID):

    cmd = '''
        set -eo pipefail
        echo breakdancer call for {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/SV/{sampleID}/breakdancer

        {soft_dir}/breakdancer/current/breakdancer-max \\
            -h \\
            -d {sampleID}.breakdancer.SV-supporting \\
            -g {sampleID}.breakdancer.bed \\
            {sampleID}.breakdancer.cfg |
        grep -vwE 'hs37d5|GL000220' \\
            > {sampleID}.breakdancer.txt

        perl {moduledir}/Varition/SV/breakdancer/breakdancer_filter.pl \\
            -g {sex} \\
            -n 6 \\
            -a {sampleID}.breakdancer.txt \\
            > {sampleID}.breakdancer.flt.txt

        perl {moduledir}/Varition/SV/breakdancer/breakdancer_txt2gff.pl \\
            {sampleID}.breakdancer.flt.txt \\
            > {sampleID}.breakdancer.gff

        echo breakdancer call for {sampleID} done: `date "+%F %T"`
    '''.format(sampleID=sampleID, sex=self.sample_infos[sampleID]['sex'], **self.__dict__)

    shell_path = '{analydir}/SV/{sampleID}/breakdancer/breakdancer_call_{sampleID}.sh'.format(
        sampleID=sampleID, **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'breakdancer_call'
    job_name = 'breakdancer_call_{}'.format(sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = [
        'breakdancer_config_{sampleID}'.format(sampleID=sampleID)
    ]
    after_jobs = ['annotate_gff_{sampleID}'.format(sampleID=sampleID)]
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def crest_txt2gff(self, sampleID, chrom_list):

    auto_chrom_last = filter(lambda x: x.strip('chr').isdigit(), chrom_list)[-1]
    other_chrom = filter(lambda x: not x.strip('chr').isdigit(), chrom_list)
    # print auto_chrom_last
    # print other_chrom

    crest_results = '{{1..%s},%s}' % (
        auto_chrom_last.strip('chr'),
        ','.join(map(lambda x: x.strip('chr'), other_chrom)))

    if 'chr' in chrom_list[-1]:
        crest_results = 'chr' + crest_results
    # print crest_results

    cmd = '''
        set -eo pipefail
        echo convert sv results to gff for {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/SV/{sampleID}/crest

        cat bychr/{sampleID}.{crest_results}.predSV.txt | grep -vw hs37d5 > {sampleID}.predSV.txt

        perl {moduledir}/Varition/SV/crest/crest_txt2gff.pl \\
            {sampleID}.predSV.txt > {sampleID}.crest.gff

        echo convert sv results to gff for {sampleID} done: `date "+%F %T"`
    '''.format(crest_results=crest_results, sampleID=sampleID, **self.__dict__)

    shell_path = '{analydir}/SV/{sampleID}/crest/crest_txt2gff_{sampleID}.sh'.format(
        sampleID=sampleID, **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'crest_txt2gff'
    job_name = 'crest_txt2gff_{}'.format(sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    after_jobs = ['annotate_gff_{sampleID}'.format(sampleID=sampleID)]
    utils.add_order(self.orders, job_name, after_jobs=after_jobs)
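# Worked example of the crest_results construction for a b37-style
# chromosome list -- the result is a single bash brace expansion, so one
# `cat` picks up every per-chromosome predSV file:
chrom_list = [str(i) for i in range(1, 23)] + ['X', 'Y']
auto_chrom_last = filter(lambda x: x.strip('chr').isdigit(), chrom_list)[-1]  # '22'
other_chrom = filter(lambda x: not x.strip('chr').isdigit(), chrom_list)     # ['X', 'Y']
crest_results = '{{1..%s},%s}' % (auto_chrom_last, ','.join(other_chrom))
print crest_results
# -> {{1..22},X,Y}
# so `cat bychr/S1.{{1..22},X,Y}.predSV.txt` expands to
# S1.1 ... S1.22, S1.X and S1.Y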
def samtools_call_hapmap(self, familyid, samples_with_data):

    vcf_list = '{analydir}/Advance/{newjob}/Linkage/{familyid}/vcf_{familyid}.list'.format(
        **dict(self.__dict__, **locals()))

    with utils.safe_open(vcf_list, 'w') as out:
        for sampleid in samples_with_data:
            out.write('{}.vcf\n'.format(sampleid))

    for sampleid in samples_with_data:
        print '> samtools call hapmap for', sampleid
        cmd = '''
            set -eo pipefail
            echo samtools call hapmap for {sampleid} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/Linkage/{familyid}

            samtoolsv0.1.19 mpileup \\
                -d 10000 -C 50 -D -S -m 2 -F 0.02 -q 13 -Q 13 \\
                -gf {reffasta} \\
                -l {moduledir}/Linkage/annotHapMap2L.txt \\
                {analydir}/Mapping/{sampleid}.{sampleid}/{sampleid}.final.bam |
            bcftools_lh view \\
                -cg -t 0.5 \\
                - > {sampleid}.vcf

            echo samtools call hapmap for {sampleid} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Advance/{newjob}/Linkage/{familyid}/samtools_call_hapmap_{sampleid}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'samtools_call_hapmap'
        job_name = 'samtools_call_hapmap_{sampleid}'.format(**locals())
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['final_bam_{sampleid}'.format(**locals())]
        after_jobs = ['linkdatagen_{familyid}'.format(**locals())]
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def gzip_md5_clean(self, sampleID, lane):
    # print '  gzip and md5sum clean data...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo Compress and md5sum clean for sample {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/QC/{sampleID}

        fq1={sampleID}_{novoid}_{flowcell}_L{lane}_1.clean.fq
        fq2={sampleID}_{novoid}_{flowcell}_L{lane}_2.clean.fq

        for fq in $fq1 $fq2;do
            if [ -s $fq.gz -a ! -s $fq ];then
                echo $fq has been compressed.
            else
                pigz -p4 -f $fq
            fi
            md5sum $fq.gz | unix2dos > $fq.gz.MD5.txt
        done

        echo Compress and md5sum clean for sample {sampleID} done: `date "+%F %T"`
    '''.format(analydir=self.analydir, sampleID=sampleID, **lane)

    shell_path = '{analydir}/QC/{sampleID}/gzip_md5_clean_{sampleID}_{novoid}_{flowcell}_L{lane}.sh'.format(
        analydir=self.analydir, sampleID=sampleID, **lane)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'gzip_md5_clean'
    job_name = 'gzip_md5_clean_{sampleID}_{novoid}_{flowcell}_L{lane}'.format(
        sampleID=sampleID, **lane)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = [
        'qc_{sampleID}_{novoid}_{flowcell}_L{lane}'.format(
            sampleID=sampleID, **lane)
    ]
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def hla_athlates_typing(self, sampleid, gene):
    # print '  athlates typing ...'

    cmd = '''
        set -eo pipefail
        echo hla athlates typing for {sampleid} {gene} start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}

        typing \\
            -hd 2 \\
            -msa {athlates_db_dir}/msa/{gene}_nuc.txt \\
            -bam {sampleid}.{gene}.sort.bam \\
            -exlbam {sampleid}.non-{gene}.sort.bam \\
            -o {sampleid}.{gene}

        rm -f *bam

        # link result
        mkdir -p {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/result
        cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/result
        ln -sf ../{sampleid}/gene/{gene}/{sampleid}.{gene}.typing.txt .

        echo hla athlates typing for {sampleid} {gene} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}/hla_athlates_typing_{sampleid}_{gene}.sh'.format(
        **dict(self.__dict__, **locals()))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'hla_athlates_typing'
    job_name = 'hla_athlates_typing_{sampleid}_{gene}'.format(**locals())
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['hla_sort_by_name_{sampleid}_{gene}'.format(**locals())]
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def hla_sambamba_markdup(self, sampleid):
    # print '  sambamba markdup...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo sambamba markdup for {sampleid} start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

        sambamba markdup \\
            -t 5 \\
            --overflow-list-size=10000000 \\
            --tmpdir=tmp \\
            {sampleid}.sort.bam \\
            {sampleid}.nodup.bam

        rm -rf tmp

        echo sambamba markdup for {sampleid} done: `date "+%F %T"`
    '''.format(sampleid=sampleid, **self.__dict__)

    shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_sambamba_markdup_{sampleid}.sh'.format(
        sampleid=sampleid, **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'hla_sambamba_markdup'
    job_name = 'hla_sambamba_markdup_{sampleid}'.format(sampleid=sampleid)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = [
        'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid)
    ]
    after_jobs = []
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def hla_picard_markdup(self, sampleid):
    # print '  picard markdup ...'

    cmd = '''
        set -eo pipefail
        echo picard markdup for {sampleid} start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

        java1.8.0 -Xmx5g -jar {picard_jar} \\
            MarkDuplicates \\
            TMP_DIR=TMP \\
            INPUT={sampleid}.sort.bam \\
            OUTPUT={sampleid}.nodup.bam \\
            METRICS_FILE={sampleid}.nodup.metrics.txt \\
            CREATE_INDEX=true \\
            ASSUME_SORTED=true

        echo picard markdup for {sampleid} done: `date "+%F %T"`
    '''.format(sampleid=sampleid, **self.__dict__)

    shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_picard_markdup_{sampleid}.sh'.format(
        sampleid=sampleid, **self.__dict__)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'hla_picard_markdup'
    job_name = 'hla_picard_markdup_{sampleid}'.format(sampleid=sampleid)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = [
        'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid)
    ]
    after_jobs = []
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def disease(self, disease_id):

    print '> disease analysis ...'

    # write shell
    array = ','.join(map(str, self.args['analy_array']))

    cmd = '''
        set -eo pipefail
        echo disease analysis start: `date "+%F %T"`

        cd {Disease}

        python {moduledir}/Disease/disease.py \\
            -i {IntegrateResult}/Integrate.candidate.xls \\
            -o Integrate.disease.xls \\
            -id {disease_id} \\
            -enc utf8

        python {ROOT_DIR}/modules/brief/text2excel.py \\
            {BriefResults}/Disease/Integrate.disease.xlsx \\
            Integrate.disease.xls

        echo disease analysis done: `date "+%F %T"`
    '''.format(**dict(self.args, **locals()))

    shell_path = '{Disease}/disease_analysis.sh'.format(**self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'disease_analysis'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['integrate_result']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def combine_stat(self, patientID, sampleID):
    # print '  combine stat result...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo combine stat result for {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/Alnstat/{sampleID}\n
        python {moduledir}/Alnstat/combine_pipe4.6.py \\
            information.xlsx \\
            {sampleID}.flagstat \\
            {sampleID} \\
            {seqstrag} \\
            > {sampleID}_mapping_coverage.txt\n
        echo combine stat result for {sampleID} done: `date "+%F %T"`
    '''.format(patientID=patientID, sampleID=sampleID, **self.args)

    shell_path = '{analydir}/Alnstat/{sampleID}/combine_stat_{sampleID}.sh'.format(
        analydir=self.analydir, sampleID=sampleID)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'combine_stat'
    job_name = 'combine_stat_{sampleID}'.format(sampleID=sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = 'stat_depth_{sampleID} stat_flag_{sampleID}'.format(
        sampleID=sampleID).split()
    after_jobs = [
        'mapping_check_{sampleID}'.format(sampleID=sampleID),
        'mapping_report'
    ]
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def lumpy_call(self, sampleID):

    sex = self.sample_infos[sampleID]['sex']

    cmd = '''
        set -eo pipefail
        echo lumpy call for {sampleID} start: `date "+%F %T"`\n
        cd {analydir}/SV/{sampleID}/lumpy

        python {moduledir}/Varition/SV/lumpy/lumpy.py \\
            -b {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
            -r {ref} \\
            -o {sampleID}

        rm -f *bam*

        echo lumpy call for {sampleID} done: `date "+%F %T"`
    '''.format(**dict(self.__dict__, **locals()))

    shell_path = '{analydir}/SV/{sampleID}/lumpy/lumpy_call_{sampleID}.sh'.format(
        **dict(self.__dict__, **locals()))

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'lumpy_call'
    job_name = 'lumpy_call_{}'.format(sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
    after_jobs = ['annotate_gff_{sampleID}'.format(sampleID=sampleID)]
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def qc_check(self, sampleID):
    # print '  qc check...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo qc check for sample {sampleID} start: `date "+%F %T"`

        python2 {moduledir}/QC/auto_check.py \\
            --qc_list {qc_list} \\
            --sampid {sampleID} \\
            --pwd {analydir} \\
            --check qc \\
            --jobname {newjob} \\
            --seqstrag {seqstrag} \\
            --email {email} \\
            --PE {PE} \\
            --q20 {Q20} \\
            --q30 {Q30} \\
            --error {error} \\
            --raw {rawdata}

        echo qc check for sample {sampleID} done: `date "+%F %T"`
    '''.format(sampleID=sampleID, **self.args)

    shell_path = '{analydir}/QC/{sampleID}/qc_check_{sampleID}.sh'.format(
        analydir=self.analydir, sampleID=sampleID)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = 'qc_check'
    job_name = 'qc_check_{sampleID}'.format(sampleID=sampleID)
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    after_jobs = ['qc_report']
    utils.add_order(self.orders, job_name, after_jobs=after_jobs)
def site_association(self):

    print '> site association ...'

    # write shell
    cmd = '''
        set -eo pipefail
        echo site association start: `date "+%F %T"`

        cd {analydir}/Advance/{newjob}/SiteAS

        python {moduledir}/Association/site_AS/v3/SiteAS.py \\
            --pwd . \\
            --infile ../Merged_vcf/VCF/snp.merged.annovar.hg19_multianno.xls.gz \\
            --type allele \\
            --db NOVO \\
            --out AS

        sh AS_siteAS_Allele_NOVO.sh

        echo site association done: `date "+%F %T"`
    '''.format(**self.args)

    shell_path = '{analydir}/Advance/{newjob}/SiteAS/site_association.sh'.format(
        **self.args)

    utils.write_shell(shell_path, cmd)

    # add job
    now_point = job_name = 'site_association'
    utils.add_job(self.jobs, now_point, self.args['startpoint'],
                  self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

    # add order
    before_jobs = ['annotate_merged_snp', 'annotate_merged_indel']
    after_jobs = ['data_release']
    utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
def json_api():
    '''
    JSON API for law lookups: returns the search result as JSON.

    Structure of the returned data:
        {task: {order: [law_number, sketch of law, complete law],
                order: [law_number, sketch of law, complete law],
                ...}}
    '''
    try:
        word = request.args['info']
        law_lst = MySearch().search_word(word)  # items are (number, content) tuples
        # number the tuples and convert to the API data structure
        o_law_list = add_order(law_lst)
        return jsonify({'task': o_law_list}), 200
    except Exception, e:
        print e
        abort(400)
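# json_api() is shown without its imports or route registration; a
# minimal sketch of the Flask wiring it appears to assume (the URL path
# is hypothetical, and MySearch/add_order come from elsewhere in this
# project):
from flask import Flask, request, jsonify, abort

app = Flask(__name__)
app.add_url_rule('/api/laws', 'json_api', json_api)  # hypothetical path

# A client would then call, e.g.:
#   GET /api/laws?info=<query word>
# and get back {"task": {...}} with HTTP 200, or HTTP 400 on any error.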
def submit_order_handler(update, context):
    chat = utils.get_chat(context, update)
    chat_id = chat.effective_chat.id
    logger.info(f'submit_order_handler -> {context.user_data}')

    order_id = utils.add_order(context.user_data, chat_id)
    context.user_data.update(order_id=order_id)

    # 1. Send Order Info to admins chat
    utils.send_message_to_admin(
        context.bot,
        f'{utils.generate_full_order_info(context.user_data, chat_id)} \n\n'
        f'`User_id: {chat_id}` \n'
        f'`Order_id: {order_id}` \n',
        True,
        chat_id)

    # 2. Send notification to user
    update.message.reply_text(
        utils.generate_order_confirmation(context.user_data),
        reply_markup=utils.get_start_kb())

    done(update, context)
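# submit_order_handler uses the (update, context) callback signature of
# python-telegram-bot v12/v13. A minimal sketch of how such a handler
# could be registered (token, command name, and polling setup are
# assumptions; the real project may wire it into a ConversationHandler
# instead):
from telegram.ext import Updater, CommandHandler

updater = Updater('BOT_TOKEN', use_context=True)
updater.dispatcher.add_handler(CommandHandler('submit', submit_order_handler))
updater.start_polling()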