Example #1
    def md5_raw(self, sampleID, lane):

        # print '  md5sum rawdata...'
        # write shell
        cmd = '''
            set -eo pipefail
            echo MD5 rawdata for sample {sampleID} start: `date "+%F %T"`

            cd {analydir}/RawData/{sampleID}

            md5sum {sampleID}_{novoid}_{flowcell}_L{lane}_1.fq.gz |unix2dos > {sampleID}_{novoid}_{flowcell}_L{lane}_1.fq.gz.MD5.txt
            md5sum {sampleID}_{novoid}_{flowcell}_L{lane}_2.fq.gz |unix2dos > {sampleID}_{novoid}_{flowcell}_L{lane}_2.fq.gz.MD5.txt

            echo MD5 rawdata for sample {sampleID} done: `date "+%F %T"`
        '''.format(analydir=self.analydir, sampleID=sampleID, **lane)

        shell_path = '{analydir}/RawData/{sampleID}/md5_raw_{sampleID}_{novoid}_{flowcell}_L{lane}.sh'.format(
            analydir=self.analydir, sampleID=sampleID, **lane)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'md5_raw'
        job_name = 'md5_raw_{sampleID}_{novoid}_{flowcell}_L{lane}'.format(
            sampleID=sampleID, **lane)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)
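
Every example that follows repeats the same skeleton: build a shell command with string formatting, write it to a script via utils.write_shell, register the script as a job with utils.add_job, and, where there are dependencies, wire it into the graph with utils.add_order. The utils module itself is not part of these excerpts, so the sketch below is only a guess at what the two core helpers might look like; the field names are invented.

# Hypothetical sketch of the shared helpers used throughout these examples.
# The real utils module is not shown, so signatures and fields are assumptions.
import os
import textwrap

def write_shell(shell_path, cmd):
    # Create the target directory and write the dedented command as a script.
    dirname = os.path.dirname(shell_path)
    if dirname and not os.path.isdir(dirname):
        os.makedirs(dirname)
    with open(shell_path, 'w') as out:
        out.write(textwrap.dedent(cmd).strip() + '\n')

def add_job(jobs, now_point, startpoint, analysis_points, job_name,
            shell_path, queues, threads=1):
    # Record one job; startpoint and analysis_points would normally decide
    # whether the job is scheduled or skipped, but that logic is omitted here.
    jobs.append({
        'name': job_name,
        'point': now_point,
        'sh': shell_path,
        'queues': queues,
        'threads': threads,
    })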
Example #2
    def stat_flag(self, patientID, sampleID):

        # based on nodup.bam
        # print '  stat flag...'
        # write shell
        cmd = '''
            set -eo pipefail
            echo stat flag for {sampleID} start: `date "+%F %T"`

            cd {analydir}/Alnstat/{sampleID}

            python {moduledir}/Alnstat/sam_flagstat_pipe4.6.py \\
                --bam {analydir}/Mapping/{patientID}.{sampleID}/{sampleID}.nodup.bam \\
                > {sampleID}.flagstat

            echo stat flag for {sampleID} done: `date "+%F %T"`
        '''.format(patientID=patientID, sampleID=sampleID, **self.args)

        shell_path = '{analydir}/Alnstat/{sampleID}/stat_flag_{sampleID}.sh'.format(
            analydir=self.analydir, sampleID=sampleID)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'stat_flag'
        job_name = 'stat_flag_{sampleID}'.format(sampleID=sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)
Example #3
    def annotate_gff(self, sampleID):

        cmd = '''
            set -eo pipefail
            echo annotate gff for {sampleID} start: `date "+%F %T"`\n

            cd {analydir}/SV/{sampleID}/{sv_soft}

            sh {annovar} \\
                -t SVType \\
                {sampleID}.{sv_soft}.gff \\
                {sampleID}.{sv_soft}

            python {moduledir}/Varition/SV/sv_cnv_stat.py \\
                -i {sampleID}.{sv_soft}.hg19_multianno.xls \\
                -s {sampleID} \\
                -soft {sv_soft}

            echo annotate gff for {sampleID} done: `date "+%F %T"`
            '''.format(sampleID=sampleID, **self.__dict__)

        shell_path = '{analydir}/SV/{sampleID}/{sv_soft}/annotate_gff_{sampleID}.sh'.format(
            sampleID=sampleID, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'annotate_gff'
        job_name = 'annotate_gff_{}'.format(sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        after_jobs = ['data_release', 'primary_report']
        utils.add_order(self.orders, job_name, after_jobs=after_jobs)
Example #4
    def breakdancer_config(self, sampleID):

        cmd = '''
            set -eo pipefail
            echo breakdancer config for {sampleID} start: `date "+%F %T"`\n

            cd {analydir}/SV/{sampleID}/breakdancer

            perl {soft_dir}/breakdancer/current/bam2cfg.pl \\
                -g -h -n 100000 \\
                {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
                > {sampleID}.breakdancer.cfg

            echo breakdancer config for {sampleID} done: `date "+%F %T"`
        '''.format(sampleID=sampleID, **self.__dict__)

        shell_path = '{analydir}/SV/{sampleID}/breakdancer/breakdancer_config_{sampleID}.sh'.format(
            sampleID=sampleID, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'breakdancer_config'
        job_name = 'breakdancer_config_{}'.format(sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
        utils.add_order(self.orders, job_name, before_jobs=before_jobs)
Example #5
    def final_bam(self, patientID, sampleID):

        # print '  final bam...'
        # write shell

        cmd = '''
            set -eo pipefail
            echo final bam for {sampleID} start: `date "+%F %T"`
            
            cd {analydir}/Mapping/{patientID}.{sampleID}
            
            ln -sf {sampleID}.nodup.bam {sampleID}.final.bam
            ln -sf {sampleID}.nodup.bam.bai {sampleID}.final.bam.bai
            
            echo final bam for {sampleID} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/final_bam_{sampleID}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'final_bam'
        job_name = 'final_bam_{sampleID}'.format(sampleID=sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)
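
Many of these methods format their shell templates with dict(self.__dict__, **locals()). That expression merges the pipeline object's attributes (analydir, moduledir, ...) with the method's local variables (patientID, sampleID, ...), with the locals overriding on any name clash, so a single .format(**...) call can resolve every placeholder. A minimal illustration, with a made-up attribute standing in for the real pipeline object:

# Illustration only: how dict(self.__dict__, **locals()) feeds str.format.
class Demo(object):
    def __init__(self):
        self.analydir = '/path/to/project'   # assumed attribute

    def final_bam(self, patientID, sampleID):
        context = dict(self.__dict__, **locals())  # locals() wins on key clashes
        return '{analydir}/Mapping/{patientID}.{sampleID}'.format(**context)

# Demo().final_bam('P1', 'S1') -> '/path/to/project/Mapping/P1.S1'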
Example #6
    def phenolyzer(self):

        print '>  phenolyzer ...'
        # write shell
        if not self.args['disease_name']:
            print '[error] phenolyzer needs disease name in your sample_info'
            exit(1)

        cmd = '''
            set -eo pipefail
            echo phenolyzer start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/Network

            # Phenolyzer
            python {moduledir}/Phenolyzer/phenolyzer-0.1.5/phenolyzer_pipe4.7.py \\
                --dir {analydir} \\
                --disease "{disease_name}" \\
                --genelist {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls \\
                --job {newjob}

            # DisGeNet
            python {moduledir}/DisGeNet/disgenet.py \\
                --id '{disease_ids}' \\
                --glist {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls \\
                --out_dir .

            # Brief Result
            echo generate brief results

            python {ROOT_DIR}/modules/brief/text2excel.py \\
                {BriefResults}/Network/phenolyzer.xlsx \\
                {ROOT_DIR}/modules/brief/readme/phenolyzer.readme.xls \\
                AllGene_list.xls \\
                CandidateGene_list.xls \\
                CandidateGene_score.xls

            python {ROOT_DIR}/modules/brief/text2excel.py \\
                {BriefResults}/Network/disgenet.xlsx \\
                {ROOT_DIR}/modules/brief/readme/disgenet.readme.xls \\
                DisGeNet_shared_gene.xls

            echo phenolyzer done: `date "+%F %T"`
        '''.format(**self.__dict__)

        shell_path = '{analydir}/Advance/{newjob}/Network/phenolyzer.sh'.format(
            **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'phenolyzer'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['integrate_result']
        after_jobs = ['data_release']
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
Example #7
    def ibd(self):

        if self.args['seqstrag'] == 'WGS':
            region = '-r 1'
        else:
            region = '-R {TR}'.format(**self.__dict__)

        print '>   IBD'
        cmd = '''
            set -eo pipefail

            echo IBD start: `date "+%F %T"`

            cd ${Merged_vcf}/IBD

            # extract region
            bcftools-1.6 view \\
                ${region} \\
                ../VCF/snp.merged.vcf.gz |
            awk '$5!~/*/' > snp.merged.bed.vcf

            # extract sample_info
            awk -F '\\t' -v OFS='\\t' '$1!~/^#/{print $2, $2, $1, $2}' \\
                ${samp_info} \\
                > sample.ped

            # plink 
            plink --vcf snp.merged.bed.vcf --double-id --update-ids sample.ped --make-bed -out plink

            plink --bfile plink --genome -out all
            plink --bfile plink --genome  --rel-check -out family

            # result
            awk -v OFS='\\t' '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' all.genome > all.IBD.xls
            awk -v OFS='\\t' '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}' family.genome > family.IBD.xls

            ln -sf ${moduledir}/IBD/readme.txt IBD.readme.txt

            rm -f plink.* *.{log,nosex,genome} *.vcf *.ped

            echo IBD done: `date "+%F %T"`
        '''
        
        cmd = Template(cmd).safe_substitute(**dict(self.__dict__, **locals()))

        shell_path = '{Merged_vcf}/IBD/IBD.sh'.format(**self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'ibd'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['annotate_merged_snp']
        after_jobs = ['data_release']
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
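
Note that ibd() switches from str.format to string.Template: the embedded awk programs contain literal {...} blocks that str.format would try to parse as replacement fields, whereas $-style placeholders avoid them, and safe_substitute leaves anything it does not recognise (awk's $1/$2, shell variables) untouched. A small demonstration with an assumed samp_info path:

# Why Template.safe_substitute is used here instead of str.format.
from string import Template

snippet = "awk -F '\\t' '{print $2, $2, $1, $2}' ${samp_info} > sample.ped"
print Template(snippet).safe_substitute(samp_info='/path/to/sample_info.txt')
# -> awk -F '\t' '{print $2, $2, $1, $2}' /path/to/sample_info.txt > sample.ped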
Example #8
    def filter_cnv(self):

        print '>   filter_cnv'
        for sampleid in self.sample_infos:
            cmd = '''
                set -eo pipefail
                echo filter cnv for {sampleid} start: `date "+%F %T"`

                cd {FilterCNV}/{sampleid}

                python {moduledir}/Varition/Filter/filter_sv_cnv.py \\
                    --proj {analydir} \\
                    --overlap 0.7 \\
                    --sampleID {sampleid} \\
                    --outdir {FilterCNV} \\
                    --soft {cnv_soft} \\
                    --lib StringentLib,InclusiveLib,DGV.GoldStandard.July2015,DGV,CNVD

                # Brief Result
                echo generate brief results

                python {ROOT_DIR}/modules/brief/brief_anno.py \\
                    -i {sampleid}.LikelyDeleterious.CNV.xls \\
                    -O {BriefResults}/FilterCNV \\
                    -t sv_cnv

                python {ROOT_DIR}/modules/brief/text2excel.py \\
                    {BriefResults}/FilterCNV/{sampleid}.LikelyDeleterious.CNV.xlsx \\
                    {ROOT_DIR}/modules/brief/readme/filter_sv_cnv.readme.xls \\
                    {BriefResults}/FilterCNV/{sampleid}.LikelyDeleterious.CNV.brief.xls

                echo filter cnv for {sampleid} done: `date "+%F %T"`
            '''.format(sampleid=sampleid, **self.__dict__)

            shell_path = '{FilterCNV}/{sampleid}/filter_cnv_{sampleid}.sh'.format(
                sampleid=sampleid, **self.args)

            utils.write_shell(shell_path, cmd)

            # add job
            now_point = 'filter_cnv'
            job_name = 'filter_cnv_{}'.format(sampleid)
            utils.add_job(self.jobs, now_point, self.args['startpoint'],
                          self.ANALYSIS_POINTS, job_name, shell_path,
                          self.queues)

            # add order
            if self.cnv_soft == 'freec':
                cnv_last = 'freec_call_{}'.format(sampleid)
            elif self.cnv_soft == 'conifer':
                cnv_last = 'conifer_call'

            before_jobs = [cnv_last]
            after_jobs = ['data_release']
            utils.add_order(self.orders,
                            job_name,
                            before_jobs=before_jobs,
                            after_jobs=after_jobs)
Example #9
    def stat_uncover(self, patientID, sampleID):

        # based on sort.bam
        # print '  stat uncover...'
        # write shell
        if self.args['seqstrag'] != 'WGS':
            cmd = '''
                set -eo pipefail
                echo stat uncover for {sampleID} start: `date "+%F %T"`

                cd {analydir}/Alnstat/{sampleID}

                samtools-1.6 depth \\
                    -aa -q 0 -Q 0 \\
                    -b {TR} \\
                    {analydir}/Mapping/{patientID}.{sampleID}/{sampleID}.sort.bam |
                awk -F'\\t' '$3==0' |
                grep -vwf target_region.00.depth > target_region.0.depth

                python {moduledir}/Alnstat/uncover_pos_chr_pipe4.6.py \\
                    target_region.0.depth \\
                    {sampleID} \\
                    {sampleID}.uncovered_region.annovar.result.xls

                rm -f target_region.0.depth

                echo stat uncover for {sampleID} done: `date "+%F %T"`
            '''
        else:
            cmd = '''
                set -eo pipefail
                echo stat uncover for {sampleID} start: `date "+%F %T"`
            
                rm -f *.depth *.bed *.pdf* *.png

                echo stat uncover for {sampleID} done: `date "+%F %T"`
            '''

        cmd = cmd.format(patientID=patientID, sampleID=sampleID, **self.args)

        shell_path = '{analydir}/Alnstat/{sampleID}/stat_uncover_{sampleID}.sh'.format(
            analydir=self.analydir, sampleID=sampleID)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'stat_uncover'
        job_name = 'stat_uncover_{sampleID}'.format(sampleID=sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['stat_depth_{sampleID}'.format(sampleID=sampleID)]
        after_jobs = []
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #10
    def hla_bwa_mem(self, sampleid, lane):

        # print '>  hla bwa mem ...'
        cmd = '''
            set -eo pipefail
            echo hla bwa mem and samtools sort for {sampleid} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

            fq1={analydir}/QC/{sampleid}/{sampleid}_{novoid}_{flowcell}_L{lane}_1.clean.fq
            fq2={analydir}/QC/{sampleid}/{sampleid}_{novoid}_{flowcell}_L{lane}_2.clean.fq
            if [ ! -f $fq1 ];then
                fq1=$fq1.gz
                fq2=$fq2.gz
            fi

            bwa mem \\
                -t 6 -M \\
                -R "@RG\\tID:{sampleid}_{novoid}_{flowcell}_L{lane}\\tSM:{sampleid}\\tLB:{sampleid}\\tPU:{novoid}_{flowcell}_L{lane}\\tPL:illumina\\tCN:novogene" \\
                {athlates_db_dir}/ref/hla_nclean.fasta \\
                $fq1 $fq2 |
            samtools-1.6 view \\
                -@ 5 -b -S -F 4 -t \\
                {athlates_db_dir}/ref/hla_nclean.fasta.fai |
            samtools-1.6 sort \\
                -@ 3 -m 2G \\
                -T {sampleid}_{novoid}_{flowcell}_L{lane}.tmp \\
                -o {sampleid}_{novoid}_{flowcell}_L{lane}.sort.bam

            echo hla bwa mem and samtools sort for {sampleid} done: `date "+%F %T"`
        '''.format(sampleid=sampleid, **dict(lane, **self.__dict__))

        shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_bwa_mem_{sampleid}_{novoid}_{flowcell}_L{lane}.sh'.format(
            sampleid=sampleid, **dict(lane, **self.__dict__))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'hla_bwa_mem'
        job_name = 'hla_bwa_mem_{sampleid}_{novoid}_{flowcell}_L{lane}'.format(
            sampleid=sampleid, **lane)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)
        # add order
        before_jobs = []
        if self.qc_status == 'waiting':
            before_jobs = [
                'qc_{sampleid}_{novoid}_{flowcell}_L{lane}'.format(
                    sampleid=sampleid, **lane)
            ]

        after_jobs = [
            'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid, **lane)
        ]
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #11
    def sentieon_markdup(self, patientID, sampleID):

        # print '  sentieon markdup...'
        # write shell
        markdup_threads = self.threads['markdup']

        cmd = '''
            set -eo pipefail            
            echo sentieon markdup for {sampleID} start: `date "+%F %T"`

            cd {analydir}/Mapping/{patientID}.{sampleID}

            sentieon driver \\
                -t {markdup_threads} \\
                -i {sampleID}.sort.bam \\
                --algo LocusCollector \\
                --fun score_info \\
                {sampleID}.score.txt

            sentieon driver \\
                -t {markdup_threads} \\
                -i {sampleID}.sort.bam \\
                --algo Dedup \\
                --score_info {sampleID}.score.txt \\
                --metrics {sampleID}.dedup.metrics.txt \\
                {sampleID}.nodup.bam

            echo sentieon markdup for {sampleID} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/sentieon_markdup_{sampleID}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'sentieon_markdup'
        job_name = 'sentieon_markdup_{sampleID}'.format(sampleID=sampleID)
        utils.add_job(self.jobs,
                      now_point,
                      self.args['startpoint'],
                      self.ANALYSIS_POINTS,
                      job_name,
                      shell_path,
                      self.sentieon_queues,
                      threads=markdup_threads)

        # add order
        before_jobs = [
            '{merge_soft}_merge_{sampleID}'.format(sampleID=sampleID,
                                                   **self.__dict__)
        ]
        after_jobs = 'mapping_check_{sampleID} stat_flag_{sampleID}'.format(
            sampleID=sampleID).split()
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #12
    def release(self):

        # print '  data release ...'
        array = ','.join(map(str, self.args['analy_array']))

        sample_info_done = ''
        if self.args['samp_info_done']:
            sample_info_done = '\\\n{:16}--samp_info_done {samp_info_done}'.format(
                '', **self.args)

        cmd = '''
            set -eo pipefail
            echo data release start: `date "+%F %T"`

            python2 {ROOT_DIR}/modules/release/data_release.py \\
               --analydir {analydir} \\
               --qc_list {qc_list} \\
               --samp_info {samp_info} {sample_info_done} \\
               --pn {pn} \\
               --odir {analydir}/Result/{newjob} \\
               --newjob {newjob} \\
               --analy_array {array}

            echo data release done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Result/{newjob}/release.{newjob}.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'data_release'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # tar release
        cmd = '''
            set -eo pipefail
            echo tar release start: `date "+%F %T"`

            python2 {ROOT_DIR}/modules/release/tar_release.old.py \\
                --projdir {analydir} \\
                --analy_array {array} \\
                --odir {analydir}/Result/{newjob} \\
                --date {newjob} \\
                --pre {qcsuffix} \\
                --pn {pn} \\
                --mail {email} \\
                --yymail {yymail}

            echo tar release done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Result/{newjob}/tar_release.{newjob}.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)
Example #13
    def pathway(self):

        print '>  pathway ...'
        # write shell
        cmd = '''
            set -eo pipefail
            echo pathway start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/Pathway

            python {moduledir}/Enrich_R/bin/stat_phyper_table.py \\
                -i {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls \\
                -o Pathway \\
                -f 1

            # extract gene
            awk -F '\\t' 'NR>1 && $5<0.05 {print8}' Pathway/Pathway_kegg.xls |
                tr '/' '\\n' |
                sort -u > KEGG.xls

            # extract path
            awk -F '\\t' 'NR>1 && $5<0.05 {print1}' Pathway/Pathway_kegg.xls |
                sort -u >> KEGG.xls

            python {moduledir}/KEGG/kegg_svg.py KEGG.xls

            # Brief Result
            echo generate brief results

            python {ROOT_DIR}/modules/brief/text2excel.py \\
                {BriefResults}/Pathway/pathway.xlsx \\
                {ROOT_DIR}/modules/brief/readme/pathway.readme.xls \\
                Pathway/Pathway_go_MF.xls \\
                Pathway/Pathway_go_BP.xls \\
                Pathway/Pathway_go_CC.xls \\
                Pathway/Pathway_kegg.xls

            echo pathway done: `date "+%F %T"`
        '''.format(
            print1='{print $1}',
            print8='{print $8}',
            **self.args)

        shell_path = '{analydir}/Advance/{newjob}/Pathway/pathway_enrichment.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'pathway'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['integrate_result']
        after_jobs = ['data_release']
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
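
pathway() stays with str.format and instead passes the awk actions in as the print1/print8 keyword arguments: {print8} is an ordinary replacement field whose value happens to be the literal awk text {print $8}, and format() does not re-scan substituted values for braces. A one-line demonstration:

# How the print8 placeholder expands without format() touching the awk braces.
template = "awk -F '\\t' 'NR>1 && $5<0.05 {print8}' Pathway/Pathway_kegg.xls"
print template.format(print8='{print $8}')
# -> awk -F '\t' 'NR>1 && $5<0.05 {print $8}' Pathway/Pathway_kegg.xls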
Example #14
    def hla_sort_by_name(self, sampleid, gene):

        # print '  sort by name ...'
        cmd = '''
            set -eo pipefail
            echo hla sort by name for {sampleid} {gene} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}

            mkdir -p TMP

            for gene in {gene} non-{gene};do

                samtools-1.6 view \\
                    -b -L {athlates_db_dir}/bed/hla.$gene.bed \\
                    -o {sampleid}.$gene.bam \\
                    -@ 4 \\
                    ../../{sampleid}.nodup.bam

                (
                    samtools-1.6 view -H {sampleid}.$gene.bam
                    samtools-1.6 view {sampleid}.$gene.bam | sort -k1,1 -k3,3 -T TMP
                ) | samtools-1.6 view -bS -o {sampleid}.$gene.sort.bam -@ 4 -

                rm -f {sampleid}.$gene.bam

            done

            rm -rf TMP

            # rm -f ../../{sampleid}.sort.bam
            # rm -f ../../{sampleid}.nodup.bam

            echo hla sort by name for {sampleid} {gene} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}/hla_sort_by_name_{sampleid}_{gene}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'hla_sort_by_name'
        job_name = 'hla_sort_by_name_{sampleid}_{gene}'.format(**locals())

        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['hla_picard_markdup_{sampleid}'.format(**locals())]
        after_jobs = []
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #15
    def freec_call(self, sampleID):

        seqtype = self.args['seqstrag'].split('_')[0]
        target = ''
        if seqtype != 'WGS':
            target = '\\\n{:16}--target {} '.format(' ', self.args['TR'])
        sex = 'XX' if self.sample_infos[sampleID]['sex'] == 'F' else 'XY'

        REF = 'hg19' if self.__dict__['ref'] == 'b37' else self.__dict__['ref']

        cmd = '''
            set -eo pipefail
            echo cnv call with freec for {sampleID} start: `date "+%F %T"`

            cd {analydir}/SV/{sampleID}/freec

            python {moduledir}/Varition/CNV/freec/freec_calling.py \\
                --type {seqtype} {target}\\
                --format BAM \\
                --loh 0 \\
                --contamination 0 \\
                --samName {sampleID} \\
                --sample {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
                --sex {sex} \\
                --ref {ref} \\
                --o .

            python {moduledir}/Varition/CNV/freec/Chr_CNV_freec_pipe4.5.py \\
                --inf ./{sampleID}.freec.{REF}_multianno.xls \\
                --ref {ref} \\
                --sample_info {samp_info}

            rm -f *cpn *txt

            echo cnv call with freec for {sampleID} done: `date "+%F %T"`
            '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/SV/{sampleID}/freec/freec_call_{sampleID}.sh'.format(
            sampleID=sampleID, **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'freec_call'
        job_name = 'freec_call_{}'.format(sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
        after_jobs = ['primary_report']
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #16
    def gene_association(self):

        print '>  gene association ...'
        self.gene_as_filter()
        # write shell
        cmd = '''
            set -eo pipefail
            echo gene association start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/GeneAS

            for mtype in snp snp.indel;do
                python {moduledir}/Association/Burden/GetBurdenFre.py \\
                    -case $mtype.filter.noXY.V6Pad100.xls.gz \\
                    -control {moduledir}/{pad_100_stat} \\
                    -cr 0.95 \\
                    -nr 0.6 \\
                    -cc N \\
                    -Num 2827 \\
                    -out $mtype.burden.stat.xls
                
                rows=`wc -l $mtype.burden.stat.xls | cut -d' ' -f1`
                if [ $rows -eq 1 ];then
                    echo "[error] no data in $mtype.burden.stat.xls"
                    exit 1
                fi

                Rscript {moduledir}/Association/Burden/GeneFisherPlot.R \\
                    --infile $mtype.burden.stat.xls \\
                    --outpre $mtype.burden

                paste $mtype.burden.fisher.xls $mtype.burden.stat.samstat.xls > $mtype.burden.result.xls
            done

            echo gene association done: `date "+%F %T"`
        '''.format(**self.__dict__)



        shell_path = '{analydir}/Advance/{newjob}/GeneAS/gene_association.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'gene_association'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['gene_as_filter']
        after_jobs = ['data_release']
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
Example #17
    def ppi(self):

        print '>  ppi ...'
        # write shell
        cmd = '''
            set -eo pipefail
            echo ppi start: `date "+%F %T"`

            cd {PPI}

            echo 9606 > PPI_genes.xls

            cat {analydir}/Advance/{newjob}/IntegrateResult/total.candidate.gene.xls |
                tr ',' '\\n' |
                tr '\\n' '\\t' >> PPI_genes.xls

            echo -e "\\nall\\n20" >> PPI_genes.xls

            java -Xmx6G -jar {genemania_jar} QueryRunner \\
                --data {genemania_data} \\
                --out flat \\
                --results . \\
                PPI_genes.xls

            python {moduledir}/PPI/SplitPPI_Result.py .

            # Brief Result
            echo generate brief results

            python {ROOT_DIR}/modules/brief/text2excel.py \\
                {BriefResults}/PPI/PPI.xlsx \\
                {ROOT_DIR}/modules/brief/readme/ppi.readme.xls \\
                Gene_interactions.xls \\
                Gene_description.xls \\
                Networks.description.xls

            echo ppi done: `date "+%F %T"`
        '''.format(**self.__dict__)

        shell_path = '{PPI}/ppi.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'ppi'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['integrate_result']
        after_jobs = ['data_release']
        utils.add_order(self.orders, job_name, before_jobs=before_jobs, after_jobs=after_jobs)
Example #18
    def sambamba_markdup(self, patientID, sampleID):

        # print '  sambamba markdup...'
        # write shell
        markdup_threads = self.threads['markdup']

        cmd = '''
            set -eo pipefail
            echo sambamba markdup for {sampleID} start: `date "+%F %T"`

            cd {analydir}/Mapping/{patientID}.{sampleID}

            sambamba markdup \\
                -t {markdup_threads} \\
                --overflow-list-size=10000000 \\
                --tmpdir=tmp \\
                {sampleID}.sort.bam \\
                {sampleID}.nodup.bam

            rm -rf tmp

            echo sambamba markdup for {sampleID} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/sambamba_markdup_{sampleID}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'sambamba_markdup'
        job_name = 'sambamba_markdup_{sampleID}'.format(sampleID=sampleID)

        utils.add_job(self.jobs,
                      now_point,
                      self.args['startpoint'],
                      self.ANALYSIS_POINTS,
                      job_name,
                      shell_path,
                      self.queues,
                      threads=markdup_threads)

        # add order
        before_jobs = [
            '{merge_soft}_merge_{sampleID}'.format(sampleID=sampleID,
                                                   **self.__dict__)
        ]
        after_jobs = 'mapping_check_{sampleID} stat_flag_{sampleID}'.format(
            sampleID=sampleID).split()
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #19
    def crest_call(self, sampleID, chrom_list):

        for chrom in chrom_list:
            # 1 extract soft clip
            cmd = '''
                set -eo pipefail
                echo sv call with crest for {sampleID} start: `date "+%F %T"`\n

                # 1 Extract Softclip
                echo extract softclip...
                perl {soft_dir}/CREST/CREST/extractSClip.pl \\
                    -o {analydir}/SV/{sampleID}/crest/bychr \\
                    -i {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
                    -ref_genome {reffasta} \\
                    -r {chrom} \\
                    -p {sampleID}

                # 2 CREST Calling
                echo crest calling...
                perl {soft_dir}/CREST/crest_sv_calling_pipe4.6.pl \\
                    -outDir {analydir}/SV/{sampleID}/crest/bychr \\
                    -tumorBam {analydir}/Mapping/{sampleID}.{sampleID}/{sampleID}.final.bam \\
                    --min_one_side_reads 4 \\
                    -sampleID {sampleID}.{chrom} \\
                    -regionList {refbed} \\
                    -cover {analydir}/SV/{sampleID}/crest/bychr/{sampleID}.{chrom}.cover \\
                    -ref {reffasta} \\
                    -bit {reffasta2bit}

                echo sv call with crest for {sampleID} done: `date "+%F %T"`
            '''.format(sampleID=sampleID, chrom=chrom, **self.__dict__)

            shell_path = '{analydir}/SV/{sampleID}/crest/crest_call_chr_{chrom}_{sampleID}.sh'.format(
                sampleID=sampleID, chrom=chrom.strip('chr'), **self.__dict__)

            utils.write_shell(shell_path, cmd)

            # add job
            now_point = 'crest_call'
            job_name = 'crest_call_{}_{}'.format(chrom, sampleID)
            utils.add_job(self.jobs, now_point, self.args['startpoint'],
                          self.ANALYSIS_POINTS, job_name, shell_path,
                          self.queues)

            # add order
            before_jobs = ['final_bam_{sampleID}'.format(sampleID=sampleID)]
            after_jobs = ['crest_txt2gff_{sampleID}'.format(sampleID=sampleID)]
            utils.add_order(self.orders,
                            job_name,
                            before_jobs=before_jobs,
                            after_jobs=after_jobs)
Example #20
    def breakdancer_call(self, sampleID):

        cmd = '''
            set -eo pipefail
            echo breakdancer call for {sampleID} start: `date "+%F %T"`\n

            cd {analydir}/SV/{sampleID}/breakdancer

            {soft_dir}/breakdancer/current/breakdancer-max \\
                -h \\
                -d {sampleID}.breakdancer.SV-supporting \\
                -g {sampleID}.breakdancer.bed \\
                {sampleID}.breakdancer.cfg |
                grep -vwE 'hs37d5|GL000220' \\
                > {sampleID}.breakdancer.txt

            perl {moduledir}/Varition/SV/breakdancer/breakdancer_filter.pl \\
                -g {sex} \\
                -n 6 \\
                -a {sampleID}.breakdancer.txt \\
                > {sampleID}.breakdancer.flt.txt

            perl {moduledir}/Varition/SV/breakdancer/breakdancer_txt2gff.pl \\
                {sampleID}.breakdancer.flt.txt \\
                > {sampleID}.breakdancer.gff

            echo breakdancer call for {sampleID} done: `date "+%F %T"`
        '''.format(sampleID=sampleID,
                   sex=self.sample_infos[sampleID]['sex'],
                   **self.__dict__)

        shell_path = '{analydir}/SV/{sampleID}/breakdancer/breakdancer_call_{sampleID}.sh'.format(
            sampleID=sampleID, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'breakdancer_call'
        job_name = 'breakdancer_call_{}'.format(sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = [
            'breakdancer_config_{sampleID}'.format(sampleID=sampleID)
        ]
        after_jobs = ['annotate_gff_{sampleID}'.format(sampleID=sampleID)]
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #21
    def mapping_check(self, patientID, sampleID):

        # print '  mapping check...'
        # write shell
        sex = self.sample_infos[sampleID]['sex']

        cmd = '''
            set -eo pipefail
            echo mapping check for sample {sampleID} start: `date "+%F %T"`

            python2 {moduledir}/QC/auto_check.py \\
                --qc_list {qc_list} \\
                --sampid {sampleID} \\
                --pwd {analydir} \\
                --check map \\
                --jobname {newjob} \\
                --seqstrag {seqstrag} \\
                --email {email} \\
                --PE {PE} \\
                --gender {sex} \\
                --dup {dup} \\
                --depth {depth}
            
            # remove clean data if mapping check passed
            if {rm_clean};then
                rm -f {analydir}/QC/{sampleID}/{sampleID}_*.clean.fq*
            fi

            echo mapping check for sample {sampleID} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Mapping/{patientID}.{sampleID}/mapping_check_{sampleID}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'mapping_check'
        job_name = 'mapping_check_{sampleID}'.format(sampleID=sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = []
        after_jobs = [
            'final_bam_{sampleID}'.format(sampleID=sampleID), 'data_release'
        ]
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #22
    def crest_txt2gff(self, sampleID, chrom_list):

        auto_chrom_last = filter(lambda x: x.strip('chr').isdigit(),
                                 chrom_list)[-1]

        other_chrom = filter(lambda x: not x.strip('chr').isdigit(),
                             chrom_list)

        # print auto_chrom_last
        # print other_chrom

        crest_results = '{{1..%s},%s}' % (auto_chrom_last.strip(
            'chr'), ','.join(map(lambda x: x.strip('chr'), other_chrom)))

        if 'chr' in chrom_list[-1]:
            crest_results = 'chr' + crest_results

        # print crest_results

        cmd = '''
            set -eo pipefail
            echo convert sv results to gff for {sampleID} start: `date "+%F %T"`\n

            cd {analydir}/SV/{sampleID}/crest

            cat bychr/{sampleID}.{crest_results}.predSV.txt |
                grep -vw hs37d5 > {sampleID}.predSV.txt

            perl {moduledir}/Varition/SV/crest/crest_txt2gff.pl \\
                {sampleID}.predSV.txt > {sampleID}.crest.gff

            echo convert sv results to gff for {sampleID} done: `date "+%F %T"`
        '''.format(crest_results=crest_results,
                   sampleID=sampleID,
                   **self.__dict__)

        shell_path = '{analydir}/SV/{sampleID}/crest/crest_txt2gff_{sampleID}.sh'.format(
            sampleID=sampleID, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'crest_txt2gff'
        job_name = 'crest_txt2gff_{}'.format(sampleID)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        after_jobs = ['annotate_gff_{sampleID}'.format(sampleID=sampleID)]
        utils.add_order(self.orders, job_name, after_jobs=after_jobs)
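
The crest_results string assembled above is a bash brace-expansion pattern, which lets the single cat line pick up every per-chromosome predSV file at once. For an assumed whole-genome chromosome list it expands like this:

# Illustration of crest_results for an assumed chrom_list (Python 2 filter/map).
chrom_list = ['chr%d' % i for i in range(1, 23)] + ['chrX', 'chrY']
auto_chrom_last = filter(lambda x: x.strip('chr').isdigit(), chrom_list)[-1]
other_chrom = filter(lambda x: not x.strip('chr').isdigit(), chrom_list)
crest_results = '{{1..%s},%s}' % (auto_chrom_last.strip('chr'),
                                  ','.join(map(lambda x: x.strip('chr'), other_chrom)))
if 'chr' in chrom_list[-1]:
    crest_results = 'chr' + crest_results
print crest_results
# -> chr{{1..22},X,Y}
# In the shell, bychr/S1.chr{{1..22},X,Y}.predSV.txt expands to
# bychr/S1.chr1.predSV.txt ... bychr/S1.chrX.predSV.txt bychr/S1.chrY.predSV.txt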
Example #23
    def samtools_call_hapmap(self, familyid, samples_with_data):

        vcf_list = '{analydir}/Advance/{newjob}/Linkage/{familyid}/vcf_{familyid}.list'.format(
            **dict(self.__dict__, **locals()))
        with utils.safe_open(vcf_list, 'w') as out:
            for sampleid in samples_with_data:
                out.write('{}.vcf\n'.format(sampleid))

        for sampleid in samples_with_data:
            print '>    samtools call hapmap for', sampleid

            cmd = '''
                set -eo pipefail
                echo samtools call hapmap for {sampleid} start: `date "+%F %T"`

                cd {analydir}/Advance/{newjob}/Linkage/{familyid}

                samtoolsv0.1.19 mpileup \\
                    -d 10000 -C 50 -D -S -m 2 -F 0.02 -q 13 -Q 13 \\
                    -gf {reffasta} \\
                    -l {moduledir}/Linkage/annotHapMap2L.txt \\
                    {analydir}/Mapping/{sampleid}.{sampleid}/{sampleid}.final.bam |
                bcftools_lh view \\
                    -cg -t 0.5 \\
                    - > {sampleid}.vcf

                echo samtools call hapmap for {sampleid} done: `date "+%F %T"`
            '''.format(**dict(self.__dict__, **locals()))

            shell_path = '{analydir}/Advance/{newjob}/Linkage/{familyid}/samtools_call_hapmap_{sampleid}.sh'.format(
                **dict(self.__dict__, **locals()))

            utils.write_shell(shell_path, cmd)

            # add job
            now_point = 'samtools_call_hapmap'
            job_name = 'samtools_call_hapmap_{sampleid}'.format(**locals())
            utils.add_job(self.jobs, now_point, self.args['startpoint'],
                          self.ANALYSIS_POINTS, job_name, shell_path,
                          self.queues)

            # add order
            before_jobs = ['final_bam_{sampleid}'.format(**locals())]
            after_jobs = ['linkdatagen_{familyid}'.format(**locals())]
            utils.add_order(self.orders,
                            job_name,
                            before_jobs=before_jobs,
                            after_jobs=after_jobs)
Example #24
    def gzip_md5_clean(self, sampleID, lane):

        # print '  gzip and md5sum clean data...'
        # write shell
        cmd = '''
            set -eo pipefail
            echo Compress and md5sum clean for sample {sampleID} start: `date "+%F %T"`\n

            cd {analydir}/QC/{sampleID}

            fq1={sampleID}_{novoid}_{flowcell}_L{lane}_1.clean.fq
            fq2={sampleID}_{novoid}_{flowcell}_L{lane}_2.clean.fq

            for fq in $fq1 $fq2;do
                if [ -s $fq.gz -a ! -s $fq ];then
                    echo $fq has been compressed.
                else
                    pigz -p4 -f $fq
                fi
                md5sum $fq.gz | unix2dos > $fq.gz.MD5.txt
            done

            echo Compress and md5sum clean for sample {sampleID} done: `date "+%F %T"`
        '''.format(analydir=self.analydir, sampleID=sampleID, **lane)

        shell_path = '{analydir}/QC/{sampleID}/gzip_md5_clean_{sampleID}_{novoid}_{flowcell}_L{lane}.sh'.format(
            analydir=self.analydir, sampleID=sampleID, **lane)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'gzip_md5_clean'
        job_name = 'gzip_md5_clean_{sampleID}_{novoid}_{flowcell}_L{lane}'.format(
            sampleID=sampleID, **lane)

        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = [
            'qc_{sampleID}_{novoid}_{flowcell}_L{lane}'.format(
                sampleID=sampleID, **lane)
        ]
        after_jobs = ['data_release']
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #25
    def hla_athlates_typing(self, sampleid, gene):

        # print '  athlates typing ...'
        cmd = '''
            set -eo pipefail
            echo hla athlates typing for {sampleid} {gene} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}

            typing \\
                -hd 2 \\
                -msa {athlates_db_dir}/msa/{gene}_nuc.txt \\
                -bam {sampleid}.{gene}.sort.bam \\
                -exlbam {sampleid}.non-{gene}.sort.bam \\
                -o {sampleid}.{gene}

            rm -f *bam

            # link result
            mkdir -p {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/result

            cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/result

            ln -sf ../{sampleid}/gene/{gene}/{sampleid}.{gene}.typing.txt .

            echo hla athlates typing for {sampleid} {gene} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/gene/{gene}/hla_athlates_typing_{sampleid}_{gene}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'hla_athlates_typing'
        job_name = 'hla_athlates_typing_{sampleid}_{gene}'.format(**locals())

        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = ['hla_sort_by_name_{sampleid}_{gene}'.format(**locals())]
        after_jobs = ['data_release']
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #26
    def integrate(self):

        print '>  integrate result ...'
        # write shell
        cmd = Template('''
            set -eo pipefail
            echo integrate result start: `date "+%F %T"`

            cd ${IntegrateResult}

            python ${ROOT_DIR}/modules/integrate/integrate.py \\
                --analydir ${analydir} \\
                --samp-info ${samp_info} \\
                --newjob ${newjob} \\
                --moduledir ${moduledir} \\
                --analy-array ${array} \\
                --reffasta ${reffasta} \\
                --confidence ${confidence} \\
                --out .

            grep -v '\\.' total.candidate.gene.xls | tr  ',' '\\n' | sort -u > temp
            mv temp total.candidate.gene.xls

            awk -v OFS='\\t' 'NR==1 || $1!="GeneName" {print $1, $2, $3}' Integrate.xls  | sort -u > CandidateGene.xls
            python ${ROOT_DIR}/modules/brief/text2excel.py \\
                ${BriefResults}/IntegrateResult/candidate_gene.xlsx \\
                ${ROOT_DIR}/modules/brief/readme/candidate_gene.readme.xls \\
                CandidateGene.xls

            rm -f CandidateGene.xls

            echo integrate result done: `date "+%F %T"`
        ''')

        cmd = cmd.safe_substitute(array=','.join(
            map(str, self.args['analy_array'])),
                                  **self.args)

        shell_path = '{IntegrateResult}/integrate_result.sh'.format(
            **self.args)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = job_name = 'integrate_result'
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)
Example #27
    def hla_sambamba_merge(self, sampleid, sort_bams):

        # print '  sambamba merge...'
        # write shell
        if len(sort_bams) == 1:
            cmd = '''
                set -eo pipefail
                echo rename sortbam for {sampleid} start: `date "+%F %T"`

                cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

                mv {sort_bam} {sampleid}.sort.bam

                samtools-1.6 index -@ 4 {sampleid}.sort.bam

                echo rename sortbam for {sampleid} done: `date "+%F %T"`
            '''.format(sampleid=sampleid,
                       sort_bam=sort_bams[0],
                       **self.__dict__)
        else:
            cmd = '''
                set -eo pipefail
                echo hla sambamba merge for {sampleid} start: `date "+%F %T"`

                cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

                sambamba merge \\
                    -t 4 \\
                    {sampleid}.sort.bam \\
                    {sort_bams}

                echo hla sambamba merge for {sampleid} done: `date "+%F %T"`
            '''.format(sampleid=sampleid,
                       sort_bams=' '.join(sort_bams),
                       **self.__dict__)

        shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_sambamba_merge_{sampleid}.sh'.format(
            sampleid=sampleid, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'hla_sambamba_merge'
        job_name = 'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid)
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)
Example #28
    def hla_sambamba_markdup(self, sampleid):

        # print '  sambamba markdup...'
        # write shell
        cmd = '''
            set -eo pipefail
            echo sambamba markdup for {sampleid} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

            sambamba markdup \\
                -t 5 \\
                --overflow-list-size=10000000 \\
                --tmpdir=tmp \\
                {sampleid}.sort.bam \\
                {sampleid}.nodup.bam

            rm -rf tmp

            echo sambamba markdup for {sampleid} done: `date "+%F %T"`
        '''.format(sampleid=sampleid, **self.__dict__)

        shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_sambamba_markdup_{sampleid}.sh'.format(
            sampleid=sampleid, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'hla_sambamba_markdup'
        job_name = 'hla_sambamba_markdup_{sampleid}'.format(sampleid=sampleid)

        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = [
            'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid)
        ]
        after_jobs = []
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #29
    def hla_picard_markdup(self, sampleid):

        # print '  picard markdup ...'
        cmd = '''
            set -eo pipefail
            echo picard markdup for {sampleid} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}

            java1.8.0 -Xmx5g -jar {picard_jar} \\
                MarkDuplicates \\
                TMP_DIR=TMP \\
                INPUT={sampleid}.sort.bam \\
                OUTPUT={sampleid}.nodup.bam \\
                METRICS_FILE={sampleid}.nodup.metrics.txt \\
                CREATE_INDEX=true \\
                ASSUME_SORTED=true

            echo picard markdup for {sampleid} done: `date "+%F %T"`
        '''.format(sampleid=sampleid, **self.__dict__)

        shell_path = '{analydir}/Advance/{newjob}/HLA/ATHLATES_typing/{sampleid}/hla_picard_markdup_{sampleid}.sh'.format(
            sampleid=sampleid, **self.__dict__)

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'hla_picard_markdup'
        job_name = 'hla_picard_markdup_{sampleid}'.format(sampleid=sampleid)

        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)

        # add order
        before_jobs = [
            'hla_sambamba_merge_{sampleid}'.format(sampleid=sampleid)
        ]
        after_jobs = []
        utils.add_order(self.orders,
                        job_name,
                        before_jobs=before_jobs,
                        after_jobs=after_jobs)
Example #30
    def linkdatagen(self, familyid):

        print '>   linkdatagen for', familyid

        cmd = '''
            set -eo pipefail
            echo linkdatagen for {familyid} start: `date "+%F %T"`

            cd {analydir}/Advance/{newjob}/Linkage/{familyid}

            # vcf2linkdatagen: VCF  => BRLMM genotype file
            perl {moduledir}/Linkage/vcf2linkdatagen.pl \\
                -annotfile {moduledir}/Linkage/annotHapMap2.txt \\
                -pop CHB -mindepth 10 -missingness 0 \\
                -idlist vcf_{familyid}.list \\
                > {familyid}.brlmm
            
            # linkdatagen: generate datasets for linkage analysis by MERLIN
            perl {moduledir}/Linkage/linkdatagen.pl \\
                -data m \\
                -pedfile {familyid}.ped \\
                -whichSamplesFile {familyid}.ws \\
                -callFile {familyid}.brlmm \\
                -annotFile {moduledir}/Linkage/annotHapMap2.txt \\
                -pop CHB -binsize 0.3 -prog me \\
                -outputDir {familyid}_HapMap2 \\
                > {familyid}_HapMap2.out

            echo linkdatagen for {familyid} done: `date "+%F %T"`
        '''.format(**dict(self.__dict__, **locals()))

        shell_path = '{analydir}/Advance/{newjob}/Linkage/{familyid}/linkdatagen_{familyid}.sh'.format(
            **dict(self.__dict__, **locals()))

        utils.write_shell(shell_path, cmd)

        # add job
        now_point = 'linkdatagen'
        job_name = 'linkdatagen_{familyid}'.format(**locals())
        utils.add_job(self.jobs, now_point, self.args['startpoint'],
                      self.ANALYSIS_POINTS, job_name, shell_path, self.queues)