def fastq2bam(self, inputs, bam_out, sample):
    '''
      Convert fastq to a prealigned bam by running src/util/fastq2bam.py.
      stages:
      1 infer lanes and indexes
      2 split into lanes
      3 fastq2bam
      4 merge

      @inputs: pair of fastq paths (read 1, read 2)
      @bam_out: path of the bam to generate; logs are written next to it
      @sample: not used by this stage; kept so the signature is unchanged
    '''
    fastq_read1_in, fastq_read2_in = inputs

    # a bare filename has an empty dirname, which os.makedirs rejects
    output_dir = os.path.dirname(bam_out) or '.'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # join only the basename: joining the whole of bam_out repeats the
    # directory when bam_out is a relative path (out/out/x.bam.log.out)
    bam_name = os.path.basename(bam_out)
    log_out = os.path.join(output_dir, '{}.log.out'.format(bam_name))
    log_err = os.path.join(output_dir, '{}.log.err'.format(bam_name))

    command = "python {}/src/util/fastq2bam.py --r1 {} --r2 {} --output_dir {} --bam {} 1>{} 2>{}".format(
        config.ROOT, fastq_read1_in, fastq_read2_in, output_dir, bam_out,
        log_out, log_err)
    run_stage(self.state, 'fastq2bam', command)
Example #2
0
    def analyse_wgs_prepare(self, input, output):
        '''
            creates working directory and scripts to run for wgs pipeline
            @input: path to the sample's .mapped.bam
            @output: marker file; touched by the stage command, or written
                     directly with a note when there is nothing to do
        '''
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        if normal_id is None:  # nothing to do
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        tmp_id = 'wgs-{}'.format(tumour_id)
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir(tmp_dir)
        safe_make_dir(os.path.dirname(output))

        # copy the analysis script and wrapper into the working directory
        command = 'cp {root}/src/util/analysisWGS.serial.sh {tmp_dir}/analysisWGS.sh && cp {root}/src/util/ds-wrapper-wgs-1.0.8.pl {tmp_dir}/ds-wrapper.pl && touch {output}'.format(
            root=config.ROOT, output=output, tmp_dir=tmp_dir)
        run_stage(self.state, 'analyse_wgs_prepare', command)
    def contest(self, input, output):
        '''
            run contest
            @input: path to the sample's .mapped.bam
            @output: marker file touched when the contest script succeeds

            The script is generated from src/util/contest.sh.template. When
            the sample has no matched normal it is used as its own normal.
        '''
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        metadata_path = "{root}/cfg/sample-metadata.csv".format(
            root=config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        # tumour_id is actually normal
        if normal_id is None:
            normal_id = tumour_id

        # the uuid is read from column 9 of the second line of the normal's
        # validation file
        validation_path = "{root}/out/{sample}.validation".format(
            root=config.ROOT, sample=normal_id)
        with open(validation_path, 'r') as validation_fh:
            validation_data = validation_fh.readlines()
        normal_uuid = validation_data[1].split('\t')[8]

        # applied in this order; plain str.replace is used so backslashes in
        # the values are never interpreted as regex replacement escapes
        substitutions = [
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('UUID', normal_uuid),
            ('ROOT', config.ROOT),
        ]
        template_path = '{root}/src/util/contest.sh.template'.format(
            root=config.ROOT)
        script_path = '{tmp_dir}/{tumour_id}.contest.sh'.format(
            tmp_dir=config.TMP, tumour_id=tumour_id)
        with open(script_path, 'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        command = 'bash {tmp_dir}/{tumour_id}.contest.sh 2>{prefix}.contest.log.err 1>{prefix}.contest.log.out && touch "{output}"'.format(
            tmp_dir=config.TMP,
            tumour_id=tumour_id,
            output=output,
            prefix=prefix)

        run_stage(self.state, 'contest', command)
Example #4
0
 def align_stats_picard(self, inputs, stats_out):
     '''
       Collect coverage metrics for a bam with picard CollectRawWgsMetrics.
       @inputs: the mapped bam
       @stats_out: metrics file; stdout/stderr go to <stats_out>.log.out/.log.err
     '''
     template = (
         'java -jar {root}/tools/picard-2.8.2.jar CollectRawWgsMetrics'
         ' INPUT={input} OUTPUT={output}'
         ' REFERENCE_SEQUENCE={root}/reference/core_ref_GRCh37d5/genome.fa'
         ' INCLUDE_BQ_HISTOGRAM=true'
         ' 1>{output}.log.out 2>{output}.log.err')
     values = {'root': config.ROOT, 'input': inputs, 'output': stats_out}
     run_stage(self.state, 'align_stats_picard', template.format(**values))
Example #5
0
 def align_stats_bedtools(self, inputs, stats_out):
     '''
       Compute genome coverage with bedtools and pipe it through
       src/util/coverage_histogram.py.
       @inputs: the mapped bam
       @stats_out: receives the script's stdout; the html histogram and
                   stderr are written to <stats_out>.histogram.html and
                   <stats_out>.err
     '''
     template = (
         'bedtools genomecov -ibam {mapped_bam}'
         ' | python {root}/src/util/coverage_histogram.py {stats_out}.histogram.html'
         ' 1>{stats_out} 2>{stats_out}.err')
     values = {
         'root': config.ROOT,
         'mapped_bam': inputs,
         'stats_out': stats_out,
     }
     run_stage(self.state, 'align_stats_bedtools', template.format(**values))
    def _analyse_wgs_with_command(self, input, output, subcommand, cpu=4):
        '''
          take mapped bams and generate variant calls by running the sanger pipeline cgpwgs
          @input: sequence whose first element is the sample's .mapped.bam
          @output: marker file touched when the container command succeeds
          @subcommand: value substituted for COMMAND in the analysis template
          @cpu: value substituted for CPULIMIT in the analysis template
        '''
        input = input[0]
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        if normal_id is None:  # nothing to do
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        tmp_id = 'wgs-{}-{}'.format(config.WGS_VERSION, tumour_id)
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir('{}/home'.format(tmp_dir))

        # make subcommand analysis script
        # substitutions are applied in this order; plain str.replace is used
        # so backslashes in the values are never treated as regex escapes
        substitutions = [
            ('TMP_ID', tmp_id),
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('COMMAND', subcommand),
            ('WGS_VERSION', config.WGS_VERSION),
            ('CPULIMIT', str(cpu)),
        ]
        # e.g. analyse-1.1.2.sh.template
        template_path = '{root}/src/util/analyse-{wgs_version}.sh.template'.format(
            wgs_version=config.WGS_VERSION, root=config.ROOT)
        script_path = '{tmp_dir}/analyse-{subcommand}.sh'.format(
            tmp_dir=tmp_dir, subcommand=subcommand)
        with open(script_path, 'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        command = 'singularity exec -i --bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp}:/mnt/tmp --workdir {tmp_dir} --home {tmp_dir}/home:/home/z --contain {root}/img/cgpwgs-{wgs_version}.img bash /mnt/tmp/{tmp_id}/analyse-{subcommand}.sh 1>{prefix}.wgs.{subcommand}.{wgs_version}.log.out 2>{prefix}.wgs.{subcommand}.{wgs_version}.log.err && touch {output}'.format(
            root=config.ROOT,
            in_dir=config.IN,
            out=config.OUT,
            reference=config.REFERENCE,
            tmp=config.TMP,
            tmp_dir=tmp_dir,
            tmp_id=tmp_id,
            prefix=prefix,
            output=output,
            subcommand=subcommand,
            wgs_version=config.WGS_VERSION)
        run_stage(self.state, 'analyse_wgs_{}'.format(subcommand), command)
Example #7
0
    def align(self, inputs, bam_out):
        '''
          run the alignment singularity image (cgpmap.img)
          @inputs: (validation file, pre-aligned bam)
          @bam_out: aligned bam (not referenced here; logs are written to
                    <bam prefix>.mapped.log.out/.err)
          raises Exception if the bam does not end in .bam or the validation
          file contains no data rows
        '''
        validation, bam = inputs
        prefix = re.sub(r'\.bam$', '', bam)  # full path without .bam
        sample_filename = prefix.split('/')[-1]  # e.g. CMHS1
        dockstore_out = re.sub(r'\.bam$', '.dockstore', bam)

        # determine sample from validation file (last non-comment row wins)
        sample = None
        with open(validation, 'r') as validation_fh:
            for line in validation_fh:
                if line.startswith('#'):
                    continue
                fields = line.strip('\n').split('\t')
                sample = fields[8]

        # the original compared the builtin `input` here, so this guard could
        # never fire; the substitution is a no-op when bam lacks the suffix
        if bam == dockstore_out:
            raise Exception("Unexpected input file {}".format(bam))

        if sample is None:
            raise Exception(
                "No sample found in validation file {}".format(validation))

        # make our own align script
        tmp_id = 'align-{}-{}'.format(sample, str(uuid.uuid4()))
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir(tmp_dir)

        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [
            ('TMP_ID', tmp_id),
            ('SAMPLE_FILENAME', sample_filename),
            ('SAMPLE_ID', sample),
        ]
        template_path = '{root}/src/util/align.sh.template'.format(
            root=config.ROOT)
        with open('{tmp_dir}/align.sh'.format(tmp_dir=tmp_dir),
                  'w') as align_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    align_fh.write(line)

        command = 'singularity exec -i --bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp}:/mnt/tmp --workdir {tmp_dir} --contain {root}/img/cgpmap.img bash /mnt/tmp/{tmp_id}/align.sh 1>{prefix}.mapped.log.out 2>{prefix}.mapped.log.err && rm -rf "{tmp_dir}"'.format(
            root=config.ROOT,
            in_dir=config.IN,
            out=config.OUT,
            reference=config.REFERENCE,
            tmp=config.TMP,
            tmp_dir=tmp_dir,
            tmp_id=tmp_id,
            prefix=prefix)
        run_stage(self.state, 'align', command)
    def callable_bases(self, input, output):
        '''
            run callable bases
            @input: path to the sample's .mapped.bam
            @output: marker file; touched when the generated script succeeds,
                     or written directly with a note for a normal sample
        '''

        MINIMUM_COVERAGE_TUMOR = '17'
        MINIMUM_COVERAGE_NORMAL = '10'

        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{root}/cfg/sample-metadata.csv".format(
            root=config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        # nothing to do for normal sample
        if normal_id is None:
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        # it's a tumour
        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('ROOT', config.ROOT),
            ('TMP_DIR', config.TMP),
            ('MIN_TUM', MINIMUM_COVERAGE_TUMOR),
            ('MIN_NORM', MINIMUM_COVERAGE_NORMAL),
        ]
        template_path = '{root}/src/util/callable_bases.sh.template'.format(
            root=config.ROOT)
        script_path = '{tmp_dir}/{tumour_id}.callable_bases.sh'.format(
            tmp_dir=config.TMP, tumour_id=tumour_id)
        with open(script_path, 'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        command = 'bash {tmp_dir}/{tumour_id}.callable_bases.sh 2>{prefix}.callable_bases.log.err 1>{prefix}.callable_bases.log.out && touch "{output}"'.format(
            tmp_dir=config.TMP,
            tumour_id=tumour_id,
            output=output,
            prefix=prefix)

        run_stage(self.state, 'callable_bases', command)
    def hmmcopy(self, input, output):
        '''
            run hmmcopy
            @input: path to the sample's .mapped.bam
            @output: marker file touched when the generated script succeeds

            A sample with no matched normal is processed with
            hmmcopy-normal.sh.template, any other sample with
            hmmcopy.sh.template.
        '''
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        if normal_id is None:
            # tumour_id is actually a normal
            template_name = 'hmmcopy-normal.sh.template'
            substitutions = [('NORMAL', tumour_id)]
        else:
            # it's a tumour
            template_name = 'hmmcopy.sh.template'
            substitutions = [('TUMOUR', tumour_id), ('NORMAL', normal_id)]
        # applied in this order; str.replace avoids regex replacement escapes
        target_dir = '{}.hmmcopy'.format(prefix)
        substitutions += [('ROOT', config.ROOT), ('TARGET_DIR', target_dir)]

        safe_make_dir(target_dir)
        template_path = '{root}/src/util/{name}'.format(root=config.ROOT,
                                                        name=template_name)
        with open('{target_dir}/hmmcopy.sh'.format(target_dir=target_dir),
                  'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        command = 'bash {target_dir}/hmmcopy.sh 2>{prefix}.hmmcopy.log.err 1>{prefix}.hmmcopy.log.out && touch "{output}"'.format(
            target_dir=target_dir, output=output, prefix=prefix)

        run_stage(self.state, 'hmmcopy', command)
    def varscan_germline_indel(self, input, output):
        '''
          Call germline indels: samtools mpileup piped into VarScan
          mpileup2indel.
          @input: path to the sample's .mapped.bam
          @output: marker file touched once the vcf has been written
        '''
        prefix = re.sub('.mapped.bam$', '', input)  # path minus mapped.bam
        sample_id = prefix.rsplit('/', 1)[-1]  # e.g. CMHS1

        # each step only runs if the previous one succeeded
        steps = [
            # VarScan reads the vcf sample name from a one-line file
            'echo "{sample_id}" > {prefix}.varscan_indel.tmp',
            'samtools mpileup -B -f {reference}/core_ref_GRCh37d5/genome.fa {input}'
            ' | java -jar {root}/tools/VarScan.v2.4.2.jar mpileup2indel'
            ' --output-vcf 1 --vcf-sample-list {prefix}.varscan_indel.tmp'
            ' 1>{prefix}.varscan_indel.vcf 2>{prefix}.varscan_indel.log.err',
            'touch {output}',
            'rm {prefix}.varscan_indel.tmp',
        ]
        values = {
            'root': config.ROOT,
            'reference': config.REFERENCE,
            'input': input,
            'output': output,
            'prefix': prefix,
            'sample_id': sample_id,
        }
        command = ' && '.join(steps).format(**values)
        run_stage(self.state, 'varscan_germline_indel', command)
Example #11
0
    def delly(self, input, output, cpu=6):
        '''
          run the delly singularity container
          @input: path to the sample's .mapped.bam
          @output: marker file; touched when the container command succeeds,
                   or written directly with a note for a normal sample
          @cpu: value substituted for CORES in the delly template
        '''
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        # nothing to do for normal sample
        if normal_id is None:
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        # it's a tumour
        tmp_id = 'delly-{}-{}'.format(tumour_id, str(uuid.uuid4()))
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir(tmp_dir)

        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('CORES', str(cpu)),
        ]
        template_path = '{root}/src/util/delly.sh.template'.format(
            root=config.ROOT)
        with open('{tmp_dir}/delly.sh'.format(tmp_dir=tmp_dir),
                  'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        # tmp_dir itself is bound at /mnt/tmp, so the script is /mnt/tmp/delly.sh
        command = 'singularity exec -i --bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp_dir}:/mnt/tmp --workdir {tmp_dir} --contain {root}/img/delly-2.0.0.img bash /mnt/tmp/delly.sh 1>{prefix}.delly.log.out 2>{prefix}.delly.log.err && mv {tmp_dir}/workdir {prefix}.delly.results && touch "{output}" && rm -r "{tmp_dir}"'.format(
            root=config.ROOT,
            in_dir=config.IN,
            out=config.OUT,
            reference=config.REFERENCE_DELLY,
            tmp_dir=tmp_dir,
            prefix=prefix,
            output=output)

        run_stage(self.state, 'delly', command)
    def validate_prealigned_bam(self, input, validation_out):
        '''
            run validation script
            @input: the pre-aligned bam
            @validation_out: tsv file with validation details
            raises Exception if the sample is not listed in the metadata file
        '''
        # dot escaped so only a literal .bam suffix is stripped
        prefix = re.sub(r'\.bam$', '', input)
        sample = re.sub(r'\.bam$', '', os.path.basename(input))

        validation_in = '{}.validation_src'.format(prefix)
        # read in additional metadata
        found = False
        # the with block closes the file even though the loop exits early
        with open("/mnt/vicnode_nfs/code/sample-metadata.csv",
                  'r') as metadata_fh:
            for line in metadata_fh:
                # Sample UUID,Patient UUID,Lab ID,tissue_id,is_normal
                fields = line.strip('\n').split(',')
                if fields[0] == sample:
                    donor_id = fields[1]
                    tissue_id = fields[3]
                    is_normal = fields[4]
                    found = True
                    break

        if not found:
            raise Exception(
                "Sample '{}' not found in metadata file".format(sample))

        # generate input to the validation script
        with open(validation_in, 'w') as validation_src:
            validation_src.write(
                '#Donor_ID\tTissue_ID\tis_normal (Yes/No,Y/N)\tSample_ID\trelative_file_path\n'
            )
            validation_src.write(
                '{donor_id}\t{tissue_id}\t{is_normal}\t{sample_id}\t{sample}.bam\n'
                .format(donor_id=donor_id,
                        tissue_id=tissue_id,
                        is_normal=is_normal,
                        sample_id=sample,
                        sample=sample))

        # run the validation script and generate output
        command = ". /mnt/vicnode_nfs/code/profile; validate_sample_meta.pl -in {validation_in} -out {validation_out} -f tsv 1>>{prefix}.validation.out 2>>{prefix}.validation.err".format(
            validation_in=validation_in,
            validation_out=validation_out,
            prefix=prefix)
        run_stage(self.state, 'validate_prealigned_bam', command)
    def somatic_sniper(self, input, output):
        '''
            run somatic sniper
            @input: path to the sample's .mapped.bam
            @output: marker file; touched when the generated script succeeds,
                     or written directly with a note for a normal sample
        '''
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{root}/cfg/sample-metadata.csv".format(
            root=config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        # nothing to do for normal sample
        if normal_id is None:
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        # it's a tumour
        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [
            ('TUMOUR_ID', tumour_id),
            ('NORMAL_ID', normal_id),
            ('ROOT_PATH', config.ROOT),
        ]
        template_path = '{root}/src/util/somatic_sniper.sh.template'.format(
            root=config.ROOT)
        script_path = '{tmp_dir}/{tumour_id}.somatic_sniper.sh'.format(
            tmp_dir=config.TMP, tumour_id=tumour_id)
        with open(script_path, 'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        command = 'bash {tmp_dir}/{tumour_id}.somatic_sniper.sh 2>{prefix}.somatic_sniper.log.err 1>{prefix}.somatic_sniper.log.out && touch "{output}"'.format(
            tmp_dir=config.TMP,
            tumour_id=tumour_id,
            output=output,
            prefix=prefix)

        run_stage(self.state, 'somatic_sniper', command)
Example #14
0
    def gridss(self, input, output):
        '''
            run gridss
            @input: path to the sample's .mapped.bam
            @output: marker file; touched when the generated script succeeds,
                     or written directly with a note for a normal sample
        '''
        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        # nothing to do for normal sample
        if normal_id is None:
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        # it's a tumour
        tmp_id = 'gridss-{}-{}'.format(tumour_id, str(uuid.uuid4()))
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir(tmp_dir)

        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('ROOT', config.ROOT),
            ('ACCOUNT', config.ACCOUNT),
        ]
        template_path = '{root}/src/util/gridss.sh.template'.format(
            root=config.ROOT)
        with open('{tmp_dir}/gridss.sh'.format(tmp_dir=tmp_dir),
                  'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        # the working directory is removed only if the script succeeds
        command = 'bash {tmp_dir}/gridss.sh 2>{prefix}.gridss.log.err 1>{prefix}.gridss.log.out && touch "{output}" && rm -r {tmp_dir}'.format(
            tmp_dir=tmp_dir, output=output, prefix=prefix)

        run_stage(self.state, 'gridss', command)
    def align(self, inputs, bam_out):
        '''
          run the alignment dockstore image
          @inputs: (validation file, pre-aligned bam)
          @bam_out: aligned bam; logs are appended to <bam_out>.log.out/.err
          raises Exception if the bam does not end in .bam or the validation
          file contains no data rows
        '''
        # generate dockstore file as sample.dockstore
        validation, bam = inputs
        prefix = re.sub(r'\.bam$', '', bam)
        dockstore_out = re.sub(r'\.bam$', '.dockstore', bam)

        # the original compared the builtin `input` here, so this guard could
        # never fire; without it a bam lacking the .bam suffix would be
        # overwritten by the dockstore file written below
        if bam == dockstore_out:
            raise Exception("Unexpected input file {}".format(bam))

        # determine sample from validation file (last non-comment row wins)
        sample = None
        with open(validation, 'r') as validation_fh:
            for line in validation_fh:
                if line.startswith('#'):
                    continue
                fields = line.strip('\n').split('\t')
                sample = fields[8]

        if sample is None:
            raise Exception(
                "No sample found in validation file {}".format(validation))

        log_out = '{}.log.out'.format(bam_out)
        log_err = '{}.log.err'.format(bam_out)

        # replace sample with our sample
        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [('PREFIX', prefix), ('SAMPLE', sample)]
        with open(dockstore_out, 'w') as dockstore_fh:
            with open('/mnt/vicnode_nfs/code/dockstore.template',
                      'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    dockstore_fh.write(line)

        tmp_dir = '/mnt/vicnode_nfs/dockstore-tmp/{}-{}'.format(
            sample, str(uuid.uuid4()))
        # NOTE(review): TMPDIR and PARAM_FILE are assigned without `export`;
        # confirm the dockstore process actually sees them
        command = 'mkdir -p "{}" && TMPDIR="{}" && PARAM_FILE=/mnt/vicnode_nfs/code/dockstore.params && /mnt/vicnode_nfs/dockstore/dockstore tool launch --entry quay.io/wtsicgp/dockstore-cgpmap:2.0.0 --json {} 1>>{} 2>>{} && rm -r "{}"'.format(
            tmp_dir, tmp_dir, dockstore_out, log_out, log_err, tmp_dir)
        run_stage(self.state, 'align', command)
    def fastq2bam(self, inputs, bam_out, sample):
        '''
          Convert fastq to a prealigned bam by running
          /mnt/vicnode_nfs/code/fastq2bam.py.
          stages:
          1 infer lanes and indexes
          2 split into lanes
          3 fastq2bam
          4 merge

          @inputs: pair of fastq paths (read 1, read 2)
          @bam_out: path of the bam to generate; logs are appended next to it
          @sample: not used by this stage; kept so the signature is unchanged
        '''

        # input filenames
        fastq_read1_in, fastq_read2_in = inputs

        # a bare filename has an empty dirname, which os.makedirs rejects
        output_dir = os.path.dirname(bam_out) or '.'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # join only the basename: joining the whole of bam_out repeats the
        # directory when bam_out is a relative path (out/out/x.bam.log.out)
        bam_name = os.path.basename(bam_out)
        log_out = os.path.join(output_dir, '{}.log.out'.format(bam_name))
        log_err = os.path.join(output_dir, '{}.log.err'.format(bam_name))

        command = "python /mnt/vicnode_nfs/code/fastq2bam.py --r1 {} --r2 {} --output_dir {} --bam {} 1>>{} 2>>{}".format(
            fastq_read1_in, fastq_read2_in, output_dir, bam_out, log_out,
            log_err)
        run_stage(self.state, 'fastq2bam', command)
Example #17
0
    def validate_prealigned_bam(self, input, validation_out):
        '''
            run validation script inside the cgpqc singularity image
            @input: the pre-aligned bam
            @validation_out: tsv file with validation details (not referenced
                             in this method; presumably produced by
                             validate.sh - TODO confirm against the template)
            raises Exception if the sample is not listed in the metadata file
        '''
        # dot escaped so only a literal .bam suffix is stripped
        prefix = re.sub(r'\.bam$', '', input)
        sample = re.sub(r'\.bam$', '', os.path.basename(input))

        validation_in = '{}.validation_src'.format(prefix)
        # read in additional metadata
        found = False
        # the with block closes the file even though the loop exits early
        with open("{}/cfg/sample-metadata.csv".format(config.ROOT),
                  'r') as metadata_fh:
            for line in metadata_fh:
                # Sample UUID,Patient UUID,Lab ID,tissue_id,is_normal
                fields = line.strip('\n').split(',')
                if fields[0] == sample:
                    donor_id = fields[1]
                    tissue_id = fields[3]
                    is_normal = fields[4]
                    found = True
                    break

        if not found:
            raise Exception(
                "Sample '{}' not found in metadata file".format(sample))

        # generate input to the validation script
        with open(validation_in, 'w') as validation_src:
            validation_src.write(
                '#Donor_ID\tTissue_ID\tis_normal (Yes/No,Y/N)\tSample_ID\trelative_file_path\n'
            )
            validation_src.write(
                '{donor_id}\t{tissue_id}\t{is_normal}\t{sample_id}\t{sample}.bam\n'
                .format(donor_id=donor_id,
                        tissue_id=tissue_id,
                        is_normal=is_normal,
                        sample_id=sample,
                        sample=sample))

        # make our own validate script
        tmp_id = '{}-{}'.format(sample, str(uuid.uuid4()))
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir(tmp_dir)

        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [('TMP_ID', tmp_id), ('SAMPLE', sample)]
        template_path = '{root}/src/util/validate.sh.template'.format(
            root=config.ROOT)
        with open('{tmp_dir}/validate.sh'.format(tmp_dir=tmp_dir),
                  'w') as validate_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    validate_fh.write(line)

        # run the validation script and generate output
        command = 'singularity exec -i --bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp}:/mnt/tmp --workdir {tmp_dir} --contain {root}/img/cgpqc.img bash /mnt/tmp/{tmp_id}/validate.sh'.format(
            root=config.ROOT,
            in_dir=config.IN,
            out=config.OUT,
            reference=config.REFERENCE,
            tmp=config.TMP,
            tmp_dir=tmp_dir,
            tmp_id=tmp_id)
        run_stage(self.state, 'validate_prealigned_bam', command)
Example #18
0
 def bismark_genome_prepare(self, bisulfite_genome):
     '''
       Run bismark_genome_preparation over the reference/ directory.
       @bisulfite_genome: not referenced; every path in the command is fixed
     '''
     arguments = [
         "/data/projects/punim0095/methylation/bismark_v0.18.1/bismark_genome_preparation",
         "--path_to_bowtie",
         "/usr/local/easybuild/software/Bowtie2/2.2.9-intel-2016.u3/bin/",
         "--verbose",
         "reference/",
     ]
     run_stage(self.state, 'bismark_genome_prepare', " ".join(arguments))
Example #19
0
    def muse(self, input, output):
        '''
          run muse
          @input: path to the sample's .mapped.bam
          @output: marker file; touched when the generated script succeeds,
                   or written directly with a note for a normal sample
          raises subprocess.CalledProcessError if the bam header cannot be
          read with samtools
        '''
        interval = 50000000  # chunk size to break chromosomes into for muse

        # full path without .mapped.bam (dots escaped so they match literally)
        prefix = re.sub(r'\.mapped\.bam$', '', input)
        tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

        # close the metadata file as soon as the lookup is done
        metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
        with open(metadata_path, 'r') as metadata_fh:
            normal_id = util.find_normal(tumour_id, metadata_fh)

        # nothing to do for normal sample
        if normal_id is None:
            safe_make_dir(os.path.dirname(output))
            with open(output, 'w') as output_fh:
                output_fh.write(
                    'Normal sample does not require analysis. See the relevant tumour file.\n'
                )
            return

        # it's a tumour
        tmp_id = 'muse-{}-{}'.format(tumour_id, str(uuid.uuid4()))
        tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
        safe_make_dir(tmp_dir)

        # read the bam header as text (so the string comparisons below work
        # on Python 3 too) and fail loudly if samtools exits non-zero rather
        # than silently generating a script with no muse calls
        header = subprocess.check_output(['samtools', 'view', '-H', input],
                                         universal_newlines=True)

        # build one muse call per interval-sized chunk of every @SQ sequence
        muse_commands = []
        for line in header.splitlines():
            if not line.startswith('@SQ\t'):
                continue
            # look tags up by name; split on the first ':' only so sequence
            # names that contain ':' are kept whole
            tags = dict(
                field.split(':', 1) for field in line.split('\t')[1:]
                if ':' in field)
            chromosome = tags['SN']
            size = int(tags['LN'])
            # now write regions as zero based
            current = 0
            while current < size:
                final = min(size, current + interval)
                muse_commands.append(
                    '$MUSE call -O {tmp_dir}/tmp{chromosome}_{current}_{final} -f $REFERENCE -r "{chromosome}:{current}-{final}" $TMR_ABS $NRML_ABS'
                    .format(tmp_dir=tmp_dir,
                            chromosome=chromosome,
                            current=current,
                            final=final))
                current = final

        # applied in this order; str.replace avoids regex replacement escapes
        substitutions = [
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('TMP_DIR', tmp_dir),
            ('ROOT', config.ROOT),
            ('CALL_VARIANTS', '\n'.join(muse_commands)),
        ]
        template_path = '{root}/src/util/muse.sh.template'.format(
            root=config.ROOT)
        with open('{tmp_dir}/muse.sh'.format(tmp_dir=tmp_dir),
                  'w') as analyse_fh:
            with open(template_path, 'r') as template_fh:
                for line in template_fh:
                    for placeholder, value in substitutions:
                        line = line.replace(placeholder, value)
                    analyse_fh.write(line)

        # the working directory is removed only if the script succeeds
        command = 'bash {tmp_dir}/muse.sh 2>{prefix}.muse.log.err 1>{prefix}.muse.log.out && touch "{output}" && rm -r {tmp_dir}'.format(
            tmp_dir=tmp_dir, output=output, prefix=prefix)

        run_stage(self.state, 'muse', command)
Example #20
0
 def fastqc(self, fastq_in, dir_out):
     '''
       Run fastqc on a fastq file, extracting the report into dir_out.
       @fastq_in: fastq file to check
       @dir_out: report directory; created before fastqc is run
     '''
     safe_make_dir(dir_out)
     fastqc_command = 'fastqc --extract -o {} {}'.format(dir_out, fastq_in)
     run_stage(self.state, 'fastqc', fastqc_command)