def _analyse_wgs_with_command(self, input, output, subcommand, cpu=4):
    '''
    take mapped bams and generate variant calls by running one subcommand
    of the sanger pipeline cgpwgs inside its singularity image

    @input: sequence whose first element is the path of the .mapped.bam
    @output: file touched when the subcommand succeeds; for a sample with
             no matched normal it is written with an explanatory note
    @subcommand: substituted for COMMAND in the analyse script template
    @cpu: substituted for CPULIMIT in the analyse script template
    '''
    input = input[0]
    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    if normal_id is None:  # nothing to do
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    tmp_id = 'wgs-{}-{}'.format(config.WGS_VERSION, tumour_id)
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir('{}/home'.format(tmp_dir))

    # make subcommand analysis script
    script_path = '{tmp_dir}/analyse-{subcommand}.sh'.format(
        tmp_dir=tmp_dir, subcommand=subcommand)
    template_path = '{root}/src/util/analyse-{wgs_version}.sh.template'.format(
        wgs_version=config.WGS_VERSION, root=config.ROOT)  # e.g. analyse-1.1.2.sh.template

    # applied in this order; str.replace keeps the values literal (no
    # backslash/group expansion as with an re.sub replacement string)
    replacements = (
        ('TMP_ID', tmp_id),
        ('TUMOUR', tumour_id),
        ('NORMAL', normal_id),
        ('COMMAND', subcommand),
        ('WGS_VERSION', config.WGS_VERSION),
        ('CPULIMIT', str(cpu)),
    )
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    command = (
        'singularity exec -i '
        '--bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp}:/mnt/tmp '
        '--workdir {tmp_dir} --home {tmp_dir}/home:/home/z --contain '
        '{root}/img/cgpwgs-{wgs_version}.img '
        'bash /mnt/tmp/{tmp_id}/analyse-{subcommand}.sh '
        '1>{prefix}.wgs.{subcommand}.{wgs_version}.log.out '
        '2>{prefix}.wgs.{subcommand}.{wgs_version}.log.err '
        '&& touch {output}'
    ).format(
        root=config.ROOT,
        in_dir=config.IN,
        out=config.OUT,
        reference=config.REFERENCE,
        tmp=config.TMP,
        tmp_dir=tmp_dir,
        tmp_id=tmp_id,
        prefix=prefix,
        output=output,
        subcommand=subcommand,
        wgs_version=config.WGS_VERSION)
    run_stage(self.state, 'analyse_wgs_{}'.format(subcommand), command)
def align(self, inputs, bam_out):
    '''
    run the alignment dockstore image

    @inputs: pair of (validation file, pre-aligned bam)
    @bam_out: aligned bam (not referenced in this method)

    raises Exception if the bam path does not end in ".bam" or if the
    validation file contains no data line to take the sample id from
    '''
    validation, bam = inputs

    prefix = re.sub(r'\.bam$', '', bam)  # full path without .bam
    # the original guard compared the builtin `input` and so never fired;
    # an unchanged path means the expected ".bam" suffix was missing
    if prefix == bam:
        raise Exception("Unexpected input file {}".format(bam))
    sample_filename = prefix.split('/')[-1]  # e.g. CMHS1

    # determine sample from validation file (last data line wins)
    sample = None
    with open(validation, 'r') as validation_fh:
        for line in validation_fh:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.strip('\n').split('\t')
            sample = fields[8]
    if sample is None:
        raise Exception(
            "No sample found in validation file {}".format(validation))

    # make our own align script
    tmp_id = 'align-{}-{}'.format(sample, str(uuid.uuid4()))
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir(tmp_dir)

    script_path = '{tmp_dir}/align.sh'.format(tmp_dir=tmp_dir)
    template_path = '{root}/src/util/align.sh.template'.format(root=config.ROOT)
    # applied in this order; values are inserted literally
    replacements = (
        ('TMP_ID', tmp_id),
        ('SAMPLE_FILENAME', sample_filename),
        ('SAMPLE_ID', sample),
    )
    with open(script_path, 'w') as align_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            align_fh.write(line)

    command = (
        'singularity exec -i '
        '--bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp}:/mnt/tmp '
        '--workdir {tmp_dir} --contain {root}/img/cgpmap.img '
        'bash /mnt/tmp/{tmp_id}/align.sh '
        '1>{prefix}.mapped.log.out 2>{prefix}.mapped.log.err '
        '&& rm -rf "{tmp_dir}"'
    ).format(
        root=config.ROOT,
        in_dir=config.IN,
        out=config.OUT,
        reference=config.REFERENCE,
        tmp=config.TMP,
        tmp_dir=tmp_dir,
        tmp_id=tmp_id,
        prefix=prefix)
    run_stage(self.state, 'align', command)
def callable_bases(self, input, output):
    '''
    run callable bases

    @input: path of the tumour's .mapped.bam
    @output: file touched when the script succeeds; for a sample with no
             matched normal it is written with an explanatory note
    '''
    MINIMUM_COVERAGE_TUMOR = '17'   # substituted for MIN_TUM in the template
    MINIMUM_COVERAGE_NORMAL = '10'  # substituted for MIN_NORM in the template

    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{root}/cfg/sample-metadata.csv".format(root=config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    # nothing to do for normal sample
    if normal_id is None:
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    # it's a tumour
    script_path = '{tmp_dir}/{tumour_id}.callable_bases.sh'.format(
        tmp_dir=config.TMP, tumour_id=tumour_id)
    template_path = '{root}/src/util/callable_bases.sh.template'.format(
        root=config.ROOT)
    # applied in this order; values are inserted literally
    replacements = (
        ('TUMOUR', tumour_id),
        ('NORMAL', normal_id),
        ('ROOT', config.ROOT),
        ('TMP_DIR', config.TMP),
        ('MIN_TUM', MINIMUM_COVERAGE_TUMOR),
        ('MIN_NORM', MINIMUM_COVERAGE_NORMAL),
    )
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    command = (
        'bash {tmp_dir}/{tumour_id}.callable_bases.sh '
        '2>{prefix}.callable_bases.log.err 1>{prefix}.callable_bases.log.out '
        '&& touch "{output}"'
    ).format(
        tmp_dir=config.TMP,
        tumour_id=tumour_id,
        output=output,
        prefix=prefix)
    run_stage(self.state, 'callable_bases', command)
def hmmcopy(self, input, output):
    '''
    run hmmcopy

    @input: path of the sample's .mapped.bam
    @output: file touched when the generated script succeeds

    A sample without a matched normal is itself treated as a normal and
    is processed with the hmmcopy-normal template; otherwise the
    tumour/normal template is used.
    '''
    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    target_dir = '{}.hmmcopy'.format(prefix)
    safe_make_dir(target_dir)

    # the two cases differ only in the template and its sample placeholders
    if normal_id is None:
        # tumour_id is actually a normal
        template_name = 'hmmcopy-normal.sh.template'
        replacements = (
            ('NORMAL', tumour_id),
            ('ROOT', config.ROOT),
            ('TARGET_DIR', target_dir),
        )
    else:
        # it's a tumour
        template_name = 'hmmcopy.sh.template'
        replacements = (
            ('TUMOUR', tumour_id),
            ('NORMAL', normal_id),
            ('ROOT', config.ROOT),
            ('TARGET_DIR', target_dir),
        )

    script_path = '{target_dir}/hmmcopy.sh'.format(target_dir=target_dir)
    template_path = '{root}/src/util/{name}'.format(
        root=config.ROOT, name=template_name)
    # replacements are applied in order; values are inserted literally
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    command = (
        'bash {target_dir}/hmmcopy.sh '
        '2>{prefix}.hmmcopy.log.err 1>{prefix}.hmmcopy.log.out '
        '&& touch "{output}"'
    ).format(target_dir=target_dir, output=output, prefix=prefix)
    run_stage(self.state, 'hmmcopy', command)
def delly(self, input, output, cpu=6):
    '''
    run the delly singularity container

    @input: path of the tumour's .mapped.bam
    @output: file touched when delly succeeds; for a sample with no
             matched normal it is written with an explanatory note
    @cpu: substituted for CORES in the delly script template
    '''
    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    # nothing to do for normal sample
    if normal_id is None:
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    # it's a tumour
    tmp_id = 'delly-{}-{}'.format(tumour_id, str(uuid.uuid4()))
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir(tmp_dir)

    script_path = '{tmp_dir}/delly.sh'.format(tmp_dir=tmp_dir)
    template_path = '{root}/src/util/delly.sh.template'.format(root=config.ROOT)
    # applied in this order; values are inserted literally
    replacements = (
        ('TUMOUR', tumour_id),
        ('NORMAL', normal_id),
        ('CORES', str(cpu)),
    )
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    # tmp_dir itself is bound as /mnt/tmp, so the script is /mnt/tmp/delly.sh;
    # on success the work directory is moved next to the bam and tmp_dir removed
    command = (
        'singularity exec -i '
        '--bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp_dir}:/mnt/tmp '
        '--workdir {tmp_dir} --contain {root}/img/delly-2.0.0.img '
        'bash /mnt/tmp/delly.sh '
        '1>{prefix}.delly.log.out 2>{prefix}.delly.log.err '
        '&& mv {tmp_dir}/workdir {prefix}.delly.results '
        '&& touch "{output}" && rm -r "{tmp_dir}"'
    ).format(
        root=config.ROOT,
        in_dir=config.IN,
        out=config.OUT,
        reference=config.REFERENCE_DELLY,
        tmp_dir=tmp_dir,
        prefix=prefix,
        output=output)
    run_stage(self.state, 'delly', command)
def somatic_sniper(self, input, output):
    '''
    run somatic sniper

    @input: path of the tumour's .mapped.bam
    @output: file touched when the script succeeds; for a sample with no
             matched normal it is written with an explanatory note
    '''
    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{root}/cfg/sample-metadata.csv".format(root=config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    # nothing to do for normal sample
    if normal_id is None:
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    # it's a tumour
    script_path = '{tmp_dir}/{tumour_id}.somatic_sniper.sh'.format(
        tmp_dir=config.TMP, tumour_id=tumour_id)
    template_path = '{root}/src/util/somatic_sniper.sh.template'.format(
        root=config.ROOT)
    # applied in this order; values are inserted literally
    replacements = (
        ('TUMOUR_ID', tumour_id),
        ('NORMAL_ID', normal_id),
        ('ROOT_PATH', config.ROOT),
    )
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    command = (
        'bash {tmp_dir}/{tumour_id}.somatic_sniper.sh '
        '2>{prefix}.somatic_sniper.log.err 1>{prefix}.somatic_sniper.log.out '
        '&& touch "{output}"'
    ).format(
        tmp_dir=config.TMP,
        tumour_id=tumour_id,
        output=output,
        prefix=prefix)
    run_stage(self.state, 'somatic_sniper', command)
def gridss(self, input, output):
    '''
    run gridss

    @input: path of the tumour's .mapped.bam
    @output: file touched when the script succeeds; for a sample with no
             matched normal it is written with an explanatory note
    '''
    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    # nothing to do for normal sample
    if normal_id is None:
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    # it's a tumour
    tmp_id = 'gridss-{}-{}'.format(tumour_id, str(uuid.uuid4()))
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir(tmp_dir)

    script_path = '{tmp_dir}/gridss.sh'.format(tmp_dir=tmp_dir)
    template_path = '{root}/src/util/gridss.sh.template'.format(root=config.ROOT)
    # applied in this order; values are inserted literally
    replacements = (
        ('TUMOUR', tumour_id),
        ('NORMAL', normal_id),
        ('ROOT', config.ROOT),
        ('ACCOUNT', config.ACCOUNT),
    )
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    # the temporary directory is only removed when the script succeeds
    command = (
        'bash {tmp_dir}/gridss.sh '
        '2>{prefix}.gridss.log.err 1>{prefix}.gridss.log.out '
        '&& touch "{output}" && rm -r {tmp_dir}'
    ).format(tmp_dir=tmp_dir, output=output, prefix=prefix)
    run_stage(self.state, 'gridss', command)
def analyse_wgs_prepare(self, input, output):
    '''
    creates working directory and scripts to run for wgs pipeline

    @input: path of the tumour's .mapped.bam
    @output: file touched once the scripts have been copied; for a sample
             with no matched normal it is written with an explanatory note
    '''
    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    if normal_id is None:  # nothing to do
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    tmp_id = 'wgs-{}'.format(tumour_id)
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir(tmp_dir)
    safe_make_dir(os.path.dirname(output))

    # copy the analysis script and the wrapper into the working directory
    command = (
        'cp {root}/src/util/analysisWGS.serial.sh {tmp_dir}/analysisWGS.sh '
        '&& cp {root}/src/util/ds-wrapper-wgs-1.0.8.pl {tmp_dir}/ds-wrapper.pl '
        '&& touch {output}'
    ).format(root=config.ROOT, output=output, tmp_dir=tmp_dir)
    run_stage(self.state, 'analyse_wgs_prepare', command)
def validate_prealigned_bam(self, input, validation_out):
    '''
    run validation script

    @input: the pre-aligned bam
    @validation_out: tsv file with validation details (not referenced in
                     this method)

    raises Exception if the sample is not listed in the metadata file
    '''
    # dots are escaped so that only a literal ".bam" suffix is removed
    prefix = re.sub(r'\.bam$', '', input)
    sample = os.path.basename(prefix)
    validation_in = '{}.validation_src'.format(prefix)

    # read in additional metadata; the handle is closed when the scan ends
    found = False
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        for line in metadata_fh:
            # Sample UUID,Patient UUID,Lab ID,tissue_id,is_normal
            fields = line.strip('\n').split(',')
            if fields[0] == sample:
                donor_id = fields[1]
                tissue_id = fields[3]
                is_normal = fields[4]
                found = True
                break

    if not found:
        raise Exception(
            "Sample '{}' not found in metadata file".format(sample))

    # generate input to the validation script
    with open(validation_in, 'w') as validation_src:
        validation_src.write(
            '#Donor_ID\tTissue_ID\tis_normal (Yes/No,Y/N)\tSample_ID\trelative_file_path\n'
        )
        validation_src.write(
            '{donor_id}\t{tissue_id}\t{is_normal}\t{sample_id}\t{sample}.bam\n'
            .format(
                donor_id=donor_id,
                tissue_id=tissue_id,
                is_normal=is_normal,
                sample_id=sample,
                sample=sample))

    # make our own validate script
    tmp_id = '{}-{}'.format(sample, str(uuid.uuid4()))
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir(tmp_dir)

    script_path = '{tmp_dir}/validate.sh'.format(tmp_dir=tmp_dir)
    template_path = '{root}/src/util/validate.sh.template'.format(
        root=config.ROOT)
    # applied in this order; values are inserted literally
    replacements = (
        ('TMP_ID', tmp_id),
        ('SAMPLE', sample),
    )
    with open(script_path, 'w') as validate_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            validate_fh.write(line)

    # run the validation script inside the cgpqc image
    command = (
        'singularity exec -i '
        '--bind {in_dir}:/mnt/in,{out}:/mnt/out,{reference}:/mnt/reference,{tmp}:/mnt/tmp '
        '--workdir {tmp_dir} --contain {root}/img/cgpqc.img '
        'bash /mnt/tmp/{tmp_id}/validate.sh'
    ).format(
        root=config.ROOT,
        in_dir=config.IN,
        out=config.OUT,
        reference=config.REFERENCE,
        tmp=config.TMP,
        tmp_dir=tmp_dir,
        tmp_id=tmp_id)
    run_stage(self.state, 'validate_prealigned_bam', command)
def muse(self, input, output):
    '''
    run muse

    @input: path of the tumour's .mapped.bam
    @output: file touched when the script succeeds; for a sample with no
             matched normal it is written with an explanatory note

    One "$MUSE call" command is generated per chunk of each reference
    sequence listed in the bam header and substituted for CALL_VARIANTS
    in the muse script template.

    raises Exception if the bam header cannot be read with samtools
    '''
    interval = 50000000  # chunk size to break chromosomes into for muse

    # dots are escaped so that only a literal ".mapped.bam" suffix is removed
    prefix = re.sub(r'\.mapped\.bam$', '', input)  # full path without mapped.bam
    tumour_id = prefix.split('/')[-1]  # e.g. CMHS1

    # the metadata handle is closed as soon as the lookup has finished
    metadata_path = "{}/cfg/sample-metadata.csv".format(config.ROOT)
    with open(metadata_path, 'r') as metadata_fh:
        normal_id = util.find_normal(tumour_id, metadata_fh)

    # nothing to do for normal sample
    if normal_id is None:
        safe_make_dir(os.path.dirname(output))
        with open(output, 'w') as output_fh:
            output_fh.write(
                'Normal sample does not require analysis. See the relevant tumour file.\n'
            )
        return

    # it's a tumour
    tmp_id = 'muse-{}-{}'.format(tumour_id, str(uuid.uuid4()))
    tmp_dir = '{tmp}/{tmp_id}'.format(tmp=config.TMP, tmp_id=tmp_id)
    safe_make_dir(tmp_dir)

    # read the bam header; universal_newlines gives text on python 2 and 3,
    # and communicate() waits for samtools so no child process is left behind
    proc = subprocess.Popen(
        ['samtools', 'view', '-H', input],
        stdout=subprocess.PIPE,
        universal_newlines=True)
    header, _ = proc.communicate()
    if proc.returncode != 0:
        raise Exception(
            "samtools view -H failed for {} (exit code {})".format(
                input, proc.returncode))

    # build the muse call commands, one per region
    muse_commands = []
    for line in header.splitlines():
        if not line.startswith('@SQ\t'):
            continue
        # look tags up by name; split on the first ':' only so that
        # sequence names containing ':' are kept intact
        tags = dict(
            field.split(':', 1)
            for field in line.strip().split('\t')[1:]
            if ':' in field)
        chromosome = tags['SN']
        size = int(tags['LN'])
        # now write regions as zero based
        for current in range(0, size, interval):
            final = min(size, current + interval)
            muse_commands.append(
                '$MUSE call -O {tmp_dir}/tmp{chromosome}_{current}_{final} -f $REFERENCE -r "{chromosome}:{current}-{final}" $TMR_ABS $NRML_ABS'
                .format(
                    tmp_dir=tmp_dir,
                    chromosome=chromosome,
                    current=current,
                    final=final))

    script_path = '{tmp_dir}/muse.sh'.format(tmp_dir=tmp_dir)
    template_path = '{root}/src/util/muse.sh.template'.format(root=config.ROOT)
    # applied in this order; CALL_VARIANTS goes last so the generated
    # commands are not themselves subject to the earlier substitutions
    replacements = (
        ('TUMOUR', tumour_id),
        ('NORMAL', normal_id),
        ('TMP_DIR', tmp_dir),
        ('ROOT', config.ROOT),
        ('CALL_VARIANTS', '\n'.join(muse_commands)),
    )
    with open(script_path, 'w') as analyse_fh, open(template_path, 'r') as template_fh:
        for line in template_fh:
            for placeholder, value in replacements:
                line = line.replace(placeholder, value)
            analyse_fh.write(line)

    # the temporary directory is only removed when the script succeeds
    command = (
        'bash {tmp_dir}/muse.sh '
        '2>{prefix}.muse.log.err 1>{prefix}.muse.log.out '
        '&& touch "{output}" && rm -r {tmp_dir}'
    ).format(tmp_dir=tmp_dir, output=output, prefix=prefix)
    run_stage(self.state, 'muse', command)
def fastqc(self, fastq_in, dir_out):
    '''
    Quality check a fastq file using fastqc

    @fastq_in: fastq file passed to fastqc
    @dir_out: directory given to fastqc via -o; created first if needed
    '''
    safe_make_dir(dir_out)
    fastqc_template = 'fastqc --extract -o {dir} {fastq}'
    fastqc_command = fastqc_template.format(fastq=fastq_in, dir=dir_out)
    run_stage(self.state, 'fastqc', fastqc_command)