def run_lumpy_preprocess(bamfile, disc_reads, split_reads, tempdir, config,
                         samtools_docker_image=None, lumpy_docker_image=None):
    """
    Produce the two sorted bam files (discordant and split reads) from a bam.

    Each file is first written unsorted into ``tempdir``, then sorted into its
    final location, after which the unsorted intermediate is deleted.

    :param bamfile: path to the input bam
    :param disc_reads: output path for the sorted discordant reads bam
    :param split_reads: output path for the sorted split reads bam
    :param tempdir: scratch directory, created if needed
    :param config: passed through to ``run_lumpy_extract_split_reads_bwamem``
    :param samtools_docker_image: docker image forwarded to the samtools steps
    :param lumpy_docker_image: docker image forwarded to the split-read step
    """
    helpers.makedirs(tempdir)

    raw_discordants = os.path.join(tempdir, 'discordants.unsorted.bam')
    raw_splitters = os.path.join(tempdir, 'splitters.unsorted.bam')

    # discordant reads: extract, sort, drop the intermediate
    run_samtools_view(
        bamfile, raw_discordants, docker_image=samtools_docker_image)
    run_samtools_sort(
        raw_discordants, disc_reads, docker_image=samtools_docker_image)
    os.remove(raw_discordants)

    # split reads: extract, sort, drop the intermediate
    run_lumpy_extract_split_reads_bwamem(
        bamfile, raw_splitters, config, docker_image=lumpy_docker_image)
    run_samtools_sort(
        raw_splitters, split_reads, docker_image=samtools_docker_image)
    os.remove(raw_splitters)
def circos(titan_calls, sample_id, sv_calls, circos_plot_remixt,
           circos_plot_titan, tempdir, remixt_calls="NULL", docker_image=None):
    """
    Prepare titan (and optionally remixt) calls and run ``circos.R`` on them.

    :param titan_calls: titan calls, converted with ``read_titan.make_for_circos``
    :param sample_id: sample identifier, passed to the R script
    :param sv_calls: structural variant calls, passed to the R script as-is
    :param circos_plot_remixt: remixt plot path handed to the R script
    :param circos_plot_titan: titan plot path handed to the R script
    :param tempdir: scratch directory for the prepared csv files
    :param remixt_calls: remixt calls; the literal string "NULL" (default)
        skips the remixt preparation and is forwarded to the script unchanged
    :param docker_image: docker image forwarded to the command execution
    """
    helpers.makedirs(tempdir)

    titan_for_circos = os.path.join(tempdir, 'prepped_titan_calls.csv')
    read_titan.make_for_circos(titan_calls, titan_for_circos)

    # Without remixt input the "NULL" placeholder goes straight to the script.
    remixt_for_circos = remixt_calls
    if remixt_calls != "NULL":
        remixt_for_circos = os.path.join(tempdir, 'prepped_remixt_calls.csv')
        read_remixt.make_for_circos(
            remixt_calls, sample_id, remixt_for_circos)

    pypeliner.commandline.execute(
        "circos.R",
        titan_for_circos,
        remixt_for_circos,
        sv_calls,
        circos_plot_remixt,
        circos_plot_titan,
        sample_id,
        docker_image=docker_image
    )
def run_mutect_one_job(tempdir, vcf, reference, intervals, normal_bam, tumour_bam):
    """
    Run mutect calling and filtering for every interval, then merge the results.

    The calling commands and the filtering commands are submitted as two
    separate parallel batches: each filter command reads the output of the
    calling command for the same interval, so it must not start before that
    call has finished. The unfiltered and the filtered vcf also use distinct
    paths so the filter never reads and writes the same file.

    :param tempdir: scratch directory; one numbered subdirectory per interval
    :param vcf: path of the final merged vcf
    :param reference: reference passed to the mutect commands
    :param intervals: iterable of intervals, one calling job per interval
    :param normal_bam: path to the normal bam
    :param tumour_bam: path to the tumour bam
    """
    call_commands = []
    filter_commands = []
    vcf_files = []

    for i, interval in enumerate(intervals):
        ival_temp_dir = os.path.join(tempdir, str(i))
        helpers.makedirs(ival_temp_dir)

        unfiltered_output = os.path.join(
            ival_temp_dir, 'mutect.unfiltered.vcf.gz')
        filtered_output = os.path.join(ival_temp_dir, 'mutect.vcf.gz')

        call_commands.append(
            mutect_run_command(
                reference, interval, normal_bam, tumour_bam, unfiltered_output)
        )
        filter_commands.append(
            mutect_filter_command(reference, unfiltered_output, filtered_output)
        )
        vcf_files.append(filtered_output)

    # Phase 1: variant calling for all intervals.
    call_parallel_dir = os.path.join(tempdir, 'gnu_parallel_temp')
    helpers.run_in_gnu_parallel(call_commands, call_parallel_dir)

    # Phase 2: filtering, only once every unfiltered vcf exists.
    filter_parallel_dir = os.path.join(tempdir, 'gnu_parallel_filter_temp')
    helpers.run_in_gnu_parallel(filter_commands, filter_parallel_dir)

    merge_tempdir = os.path.join(tempdir, 'mutect_merge')
    helpers.makedirs(merge_tempdir)
    merge_vcfs(vcf_files, vcf, merge_tempdir)
def concatenate_vcf(in_files, out_file, tempdir, allow_overlap=False):
    """
    Fast concatenation of VCF file using `bcftools`.

    The inputs are concatenated into a temporary file in ``tempdir``, which
    is then sorted into ``out_file`` and indexed.

    :param in_files: dict with values being files to be concatenated, or any
        other iterable of file paths. For a dict, files are concatenated
        based on sorted order of keys; other iterables keep their own order.
    :param out_file: path where the sorted, compressed (``-O z``) output file
        will be written.
    :param tempdir: scratch directory for the intermediate merged file,
        created if needed.
    :param allow_overlap: when true, pass ``-a`` to ``bcftools concat``.
    """
    if isinstance(in_files, dict):
        # dict iteration order is not guaranteed to be meaningful; honour the
        # documented contract and order the files by key.
        in_files = [in_files[key] for key in sorted(in_files)]

    helpers.makedirs(tempdir)

    # NOTE: written compressed (-O z) despite the plain '.vcf' name.
    merged_file = os.path.join(tempdir, 'merged.vcf')

    cmd = ['bcftools', 'concat']
    if allow_overlap:
        cmd.append('-a')
    cmd.extend(['-O', 'z', '-o', merged_file])
    cmd.extend(in_files)
    pypeliner.commandline.execute(*cmd)

    # sort merged vcf file
    cmd = ['bcftools', 'sort', '-O', 'z', '-o', out_file, merged_file]
    pypeliner.commandline.execute(*cmd)

    index_vcf(out_file)
    index_bcf(out_file)
def bam_collect_gc_metrics(bam_filename, ref_genome, metrics_filename,
                           summary_filename, chart_filename, tempdir, mem="2G"):
    """
    Run picard ``CollectGcBiasMetrics`` on a bam file.

    :param bam_filename: input bam
    :param ref_genome: reference sequence passed as ``REFERENCE_SEQUENCE``
    :param metrics_filename: path given to picard as ``OUTPUT``
    :param summary_filename: path given to picard as ``S``
    :param chart_filename: path given to picard as ``CHART_OUTPUT``
    :param tempdir: directory used as picard ``TMP_DIR``, created if needed
    :param mem: value used for both the ``-Xmx`` and ``-Xms`` JVM options
    """
    helpers.makedirs(tempdir)

    jvm_options = ['-Xmx' + mem, '-Xms' + mem, '-XX:ParallelGCThreads=1']
    tool_options = [
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename,
        'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        'QUIET=true',
    ]

    command = ['picard'] + jvm_options + ['CollectGcBiasMetrics'] + tool_options
    pypeliner.commandline.execute(*command)
def tar_all_data(params, segs, igv_segs, markers, parsed, plots, tar_output,
                 tempdir, chunks):
    """
    Gather the per-chunk result files into one directory tree and tar it.

    For every ``(num_cluster, ploidy)`` chunk the six files are copied to
    ``tempdir/numcluster_<num_cluster>/ploidy_<ploidy>/`` under fixed names,
    then ``tempdir`` is archived into ``tar_output``.

    :param params: mapping chunk -> params file
    :param segs: mapping chunk -> segs file
    :param igv_segs: mapping chunk -> igv segs file
    :param markers: mapping chunk -> markers file
    :param parsed: mapping chunk -> parsed file
    :param plots: mapping chunk -> plots file
    :param tar_output: path of the tar file to create
    :param tempdir: staging directory, created if needed
    :param chunks: iterable of ``(num_cluster, ploidy)`` pairs
    """
    helpers.makedirs(tempdir)

    # (source mapping, file name inside the chunk directory), in copy order
    layout = (
        (params, 'params.csv'),
        (segs, 'segs.csv'),
        (igv_segs, 'igv_segs.csv'),
        (markers, 'titan_markers.csv'),
        (parsed, 'parsed.csv'),
        (plots, 'plots.pdf'),
    )

    for chunk in chunks:
        num_cluster, ploidy = chunk
        chunk_dir = os.path.join(
            tempdir,
            'numcluster_' + str(num_cluster),
            'ploidy_' + str(ploidy)
        )
        helpers.makedirs(chunk_dir)

        for source_files, target_name in layout:
            target = os.path.join(chunk_dir, target_name)
            shutil.copyfile(source_files[chunk], target)

    helpers.make_tarfile(tar_output, tempdir)
def split_by_rg(infile, read1_output, read2_output, tempdir):
    """
    Convert a bam to per-readgroup fastq pairs and move them into place.

    ``wgs_bamtofastq`` is run with ``tempdir`` as its output directory; every
    entry found in ``tempdir`` afterwards is treated as a readgroup directory
    holding ``R1.fastq.gz`` and ``R2.fastq.gz``.

    :param infile: input bam
    :param read1_output: mapping readgroup -> destination of the R1 fastq
    :param read2_output: mapping readgroup -> destination of the R2 fastq
    :param tempdir: working directory, created if needed
    """
    helpers.makedirs(tempdir)

    cmd = ['wgs_bamtofastq', infile, tempdir]
    pypeliner.commandline.execute(*cmd)

    # The leftover debug prints listed tempdir outside this guard, so an
    # OSError was raised before the retry could ever kick in; the listing is
    # now only done here.
    try:
        readgroups = os.listdir(tempdir)
    except OSError:
        # presumably works around a slow/laggy filesystem - wait and retry once
        time.sleep(60)
        readgroups = os.listdir(tempdir)

    for readgroup in readgroups:
        os.rename(
            os.path.join(tempdir, readgroup, 'R1.fastq.gz'),
            read1_output[readgroup]
        )
        os.rename(
            os.path.join(tempdir, readgroup, 'R2.fastq.gz'),
            read2_output[readgroup]
        )
def bam_collect_wgs_metrics(bam_filename, ref_genome, metrics_filename, config,
                            tempdir, mem="2G", docker_image=None):
    """
    Run picard ``CollectWgsMetrics`` on a bam file.

    :param bam_filename: input bam
    :param ref_genome: reference sequence passed as ``REFERENCE_SEQUENCE``
    :param metrics_filename: path given to picard as ``OUTPUT``
    :param config: mapping providing ``min_bqual``, ``min_mqual`` and
        ``count_unpaired``
    :param tempdir: directory used as picard ``TMP_DIR``, created if needed
    :param mem: value used for both the ``-Xmx`` and ``-Xms`` JVM options
    :param docker_image: docker image forwarded to the command execution
    """
    helpers.makedirs(tempdir)

    if config['count_unpaired']:
        count_unpaired = 'True'
    else:
        count_unpaired = 'False'

    command = [
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' + str(config['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' + str(config['min_mqual']),
        'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT',
        'COUNT_UNPAIRED=' + count_unpaired,
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
    ]
    pypeliner.commandline.execute(*command, docker_image=docker_image)
def run_samtools_germline_one_job(tempdir, vcf, reference, intervals, bam_file,
                                  samtools_docker_image=None,
                                  vcftools_docker_image=None):
    """
    Call germline variants per interval in parallel and merge the vcfs.

    :param tempdir: scratch directory; one numbered subdirectory per interval
    :param vcf: path of the merged output vcf
    :param reference: reference passed to ``samtools_germline_command``
    :param intervals: sequence of intervals, one command per interval
    :param bam_file: input bam
    :param samtools_docker_image: docker image passed to the parallel runner
    :param vcftools_docker_image: docker image passed to the vcf merge
    """
    file_name = 'germline.vcf.gz'

    commands = []
    for idx, interval in enumerate(intervals):
        interval_dir = os.path.join(tempdir, str(idx))
        helpers.makedirs(interval_dir)
        interval_vcf = os.path.join(interval_dir, file_name)
        commands.append(
            samtools_germline_command(
                interval_vcf, reference, interval, bam_file)
        )

    helpers.run_in_gnu_parallel(
        commands,
        os.path.join(tempdir, 'gnu_parallel_temp'),
        samtools_docker_image
    )

    # same per-interval paths as above, in interval order
    vcf_files = []
    for idx in range(len(intervals)):
        vcf_files.append(os.path.join(tempdir, str(idx), file_name))

    merge_tempdir = os.path.join(tempdir, 'germline_merge')
    helpers.makedirs(merge_tempdir)
    merge_vcfs(vcf_files, vcf, merge_tempdir,
               docker_image=vcftools_docker_image)
def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir,
                          **kwargs):
    """
    Run fastqc on a fastq file and move its html and zip outputs into place.

    :param fastq_filename: input fastq; must end in ``.fastq.gz``, ``.fq.gz``,
        ``.fq`` or ``.fastq``
    :param output_html: destination of the ``_fastqc.html`` report
    :param output_plots: destination of the ``_fastqc.zip`` archive
    :param temp_dir: scratch directory; ``out`` and ``tmp`` are created inside
    :param kwargs: forwarded to ``pypeliner.commandline.execute``
    :raises Exception: if the file name has none of the known suffixes (the
        check happens after fastqc has run)
    """
    out_dir = os.path.join(temp_dir, 'out')
    scratch_dir = os.path.join(temp_dir, 'tmp')
    for directory in (out_dir, scratch_dir):
        helpers.makedirs(directory)

    pypeliner.commandline.execute(
        'fastqc',
        '--outdir=' + out_dir,
        '--dir=' + scratch_dir,
        fastq_filename,
        **kwargs)

    # fastqc names its outputs after the input with the fastq suffix removed
    fastq_basename = os.path.basename(fastq_filename)
    for suffix in (".fastq.gz", ".fq.gz", ".fq", ".fastq"):
        if fastq_basename.endswith(suffix):
            fastq_basename = fastq_basename[:-len(suffix)]
            break
    else:
        raise Exception("Unknown file type")

    report_prefix = os.path.join(out_dir, fastq_basename)
    shutil.move(report_prefix + '_fastqc.zip', output_plots)
    shutil.move(report_prefix + '_fastqc.html', output_html)
def generate_pipeline_config(args):
    """
    Generate the pipeline config yaml and record its path in ``args``.

    The target is ``args['pipeline_config']`` (made absolute) when running
    the ``generate_config`` subcommand, otherwise ``config.yaml``. It is
    placed under ``pipelinedir`` if given, else ``tmpdir``, else the current
    working directory (with a warning).

    :param args: dict of parsed command line arguments; must contain
        ``which``; ``pipeline_config``, ``tmpdir``, ``pipelinedir`` and
        ``config_override`` are read when present
    :returns: ``args`` with ``config_file`` set to the written yaml path
    """
    if args['which'] == 'generate_config':
        config_yaml = os.path.abspath(args['pipeline_config'])
    else:
        config_yaml = "config.yaml"

    tmpdir = args.get("tmpdir", None)
    pipelinedir = args.get("pipelinedir", None)

    # use pypeliner tmpdir to store yaml
    if pipelinedir:
        config_yaml = os.path.join(pipelinedir, config_yaml)
    elif tmpdir:
        config_yaml = os.path.join(tmpdir, config_yaml)
    else:
        warnings.warn("no tmpdir specified, generating configs in working dir")
        config_yaml = os.path.join(os.getcwd(), config_yaml)

    config_yaml = helpers.get_incrementing_filename(config_yaml)
    # print() with a single argument behaves the same on Python 2 and 3
    print(config_yaml)

    params_override = {'cluster': 'azure', 'reference': 'grch37'}
    # .get: a missing key means "no overrides" instead of a KeyError
    if args.get('config_override'):
        params_override.update(args["config_override"])

    helpers.makedirs(config_yaml, isfile=True)

    config = get_config(params_override)
    write_config(config, config_yaml)

    args["config_file"] = config_yaml
    print(config_yaml)

    return args
def run_samtools_germline_one_job(tempdir, vcf, reference, intervals, bam_file):
    """
    Call germline variants per interval in parallel, merge, and fix the header.

    The per-interval vcfs are merged into a temporary vcf, whose header sample
    ids are then updated with the sample id read from ``bam_file`` while
    writing the final ``vcf``.

    :param tempdir: scratch directory; one numbered subdirectory per interval
    :param vcf: path of the final output vcf
    :param reference: reference passed to ``samtools_germline_command``
    :param intervals: sequence of intervals, one command per interval
    :param bam_file: input bam
    """
    file_name = 'germline.vcf.gz'

    commands = []
    for idx, interval in enumerate(intervals):
        interval_dir = os.path.join(tempdir, str(idx))
        helpers.makedirs(interval_dir)
        interval_vcf = os.path.join(interval_dir, file_name)
        commands.append(
            samtools_germline_command(
                interval_vcf, reference, interval, bam_file)
        )

    helpers.run_in_gnu_parallel(
        commands, os.path.join(tempdir, 'gnu_parallel_temp'))

    # same per-interval paths as above, in interval order
    vcf_files = []
    for idx in range(len(intervals)):
        vcf_files.append(os.path.join(tempdir, str(idx), file_name))

    merge_tempdir = os.path.join(tempdir, 'germline_merge')
    helpers.makedirs(merge_tempdir)

    merged_vcf = os.path.join(merge_tempdir, 'merged_rtg.vcf')
    merge_vcfs(vcf_files, merged_vcf, merge_tempdir)

    sample_id = bamutils.get_sample_id(bam_file)
    vcfutils.update_germline_header_sample_ids(merged_vcf, vcf, sample_id)
def bam_sort(bam_filename, sorted_bam_filename, tempdir, threads=1, mem="2G"):
    """
    Sort a bam file with ``samtools sort``.

    :param bam_filename: input bam
    :param sorted_bam_filename: output path (``-o``)
    :param tempdir: directory holding the ``-T`` temporary file prefix,
        created if needed
    :param threads: value passed to ``-@``
    :param mem: value passed to ``-m``
    """
    helpers.makedirs(tempdir)

    command = [
        'samtools', 'sort',
        '-@', threads,
        '-m', mem,
        bam_filename,
        '-o', sorted_bam_filename,
        '-T', os.path.join(tempdir, 'samtools_sort'),
    ]
    pypeliner.commandline.execute(*command)
def run_mutect(vcf, reference, interval, normal_bam, tumour_bam, tempdir):
    """
    Run the mutect calling command followed by the mutect filter command.

    :param vcf: path of the filtered output vcf
    :param reference: reference passed to both commands
    :param interval: interval passed to the calling command
    :param normal_bam: path to the normal bam
    :param tumour_bam: path to the tumour bam
    :param tempdir: scratch directory for the unfiltered vcf, created if needed
    """
    helpers.makedirs(tempdir)

    raw_calls = os.path.join(tempdir, 'temp.vcf')

    call_cmd = mutect_run_command(
        reference, interval, normal_bam, tumour_bam, raw_calls)
    pypeliner.commandline.execute(*call_cmd)

    filter_cmd = mutect_filter_command(reference, raw_calls, vcf)
    pypeliner.commandline.execute(*filter_cmd)
def run_samtools_germline(vcf, reference, interval, bam_file, tempdir):
    """
    Run the samtools germline command for one interval and fix the vcf header.

    :param vcf: path of the final vcf, written with updated header sample ids
    :param reference: reference passed to ``samtools_germline_command``
    :param interval: interval to call
    :param bam_file: input bam; its sample id is written into the header
    :param tempdir: scratch directory for the raw vcf, created if needed
    """
    helpers.makedirs(tempdir)

    raw_vcf = os.path.join(tempdir, 'samtools_snps.vcf.gz')
    pypeliner.commandline.execute(
        *samtools_germline_command(raw_vcf, reference, interval, bam_file)
    )

    sample_id = bamutils.get_sample_id(bam_file)
    vcfutils.update_germline_header_sample_ids(raw_vcf, vcf, sample_id)
def run_freebayes_germline(vcf, reference, interval, bam_file, tempdir):
    """
    Run the freebayes germline command for one interval and fix the vcf header.

    :param vcf: path of the final vcf, written with updated header sample ids
    :param reference: reference passed to ``freebayes_germline_command``
    :param interval: interval to call
    :param bam_file: input bam; its sample id is written into the header
    :param tempdir: scratch directory for the raw vcf, created if needed
    """
    helpers.makedirs(tempdir)

    raw_vcf = os.path.join(tempdir, 'temp.vcf')
    pypeliner.commandline.execute(
        *freebayes_germline_command(raw_vcf, reference, interval, bam_file)
    )

    sample_id = bamutils.get_sample_id(bam_file)
    vcfutils.update_germline_header_sample_ids(raw_vcf, vcf, sample_id)
def get_outfiles(outdir, readgroups):
    """
    Create one directory per readgroup and return the fastq paths inside it.

    :param outdir: parent directory for the readgroup directories
    :param readgroups: iterable of readgroup names
    :returns: dict mapping readgroup -> ``(R1 path, R2 path)``
    """
    fastq_names = ('R1.fastq.gz', 'R2.fastq.gz')

    outfiles = {}
    for readgroup in readgroups:
        helpers.makedirs(os.path.join(outdir, readgroup))
        outfiles[readgroup] = tuple(
            os.path.join(outdir, readgroup, name) for name in fastq_names
        )
    return outfiles
def run_museq_one_job(tempdir, museq_vcf, reference, intervals, museq_params,
                      tumour_bam=None, normal_bam=None, titan_mode=False):
    '''
    Run museq for all intervals in parallel, merge the VCF files and
    update the sample ids in the merged header.

    :param tempdir: scratch directory; one numbered subdirectory per interval
    :param museq_vcf: path to the final output VCF file
    :param reference: reference passed to ``run_museq``
    :param intervals: iterable of intervals, one museq command per interval
    :param museq_params: museq parameters passed to ``run_museq``
    :param tumour_bam: path to tumour bam
    :param normal_bam: path to normal bam
    :param titan_mode: forwarded to ``run_museq``
    '''
    # NOTE(review): both bams default to None but are always passed to
    # get_sample_id below - confirm that helper accepts None.
    commands = []
    vcf_files = []

    for i, interval in enumerate(intervals):
        ival_temp_dir = os.path.join(tempdir, str(i))
        helpers.makedirs(ival_temp_dir)

        output = os.path.join(ival_temp_dir, 'museq.vcf')
        log = os.path.join(ival_temp_dir, 'museq.log')

        # return_cmd=True: only build the command, it is executed below
        command = run_museq(
            output, log, reference, interval, museq_params, ival_temp_dir,
            tumour_bam=tumour_bam, normal_bam=normal_bam,
            return_cmd=True, titan_mode=titan_mode
        )
        commands.append(command)
        # remember the output here instead of rebuilding the path later
        vcf_files.append(output)

    parallel_temp_dir = os.path.join(tempdir, 'gnu_parallel_temp')
    helpers.run_in_gnu_parallel(commands, parallel_temp_dir)

    merge_tempdir = os.path.join(tempdir, 'museq_merge')
    helpers.makedirs(merge_tempdir)
    temp_museq_vcf = os.path.join(merge_tempdir, 'temp_museq_merge.vcf')
    merge_vcfs(vcf_files, temp_museq_vcf, merge_tempdir)

    tumour_id = get_sample_id(tumour_bam)
    normal_id = get_sample_id(normal_bam)
    update_header_sample_ids(temp_museq_vcf, museq_vcf, tumour_id, normal_id)
def roh_calling(samtools_germlines, roh_output, tempdir):
    """
    Run ``bcftools roh`` on the germline calls and parse its output.

    :param samtools_germlines: input passed to ``bcftools roh``
    :param roh_output: destination handed to ``parse_roh_output``
    :param tempdir: scratch directory for the raw output, created if needed
    """
    helpers.makedirs(tempdir)

    raw_roh = os.path.join(tempdir, 'output.csv')

    # the '>' argument redirects the command's stdout into raw_roh
    pypeliner.commandline.execute(
        'bcftools', 'roh', '-G30', '--AF-dflt', 0.4,
        samtools_germlines, '>', raw_roh
    )

    parse_roh_output(raw_roh, roh_output)
def annotate_maf_with_oncokb(maf, api_key, tmpspace, annotated_maf):
    '''
    annotate maf with oncokb

    Parameters
    ----------
    maf : path of the maf to annotate
    api_key : key passed to ``ma.annotate`` (presumably the oncokb api key)
    tmpspace : directory that is created if missing; not otherwise used here
    annotated_maf : path of the annotated output maf

    Returns
    -------
    None
    '''
    # tmpspace is only created; ma.annotate does not receive it
    helpers.makedirs(tmpspace)
    ma.annotate(maf, annotated_maf, api_key)
def circos(titan_calls, remixt_calls, sample_id, sv_calls, circos_plot_remixt,
           circos_plot_titan, tempdir):
    """
    Run the ``scripts/circos.R`` script that sits next to this module.

    :param titan_calls: titan calls passed to the script
    :param remixt_calls: remixt calls passed to the script
    :param sample_id: sample identifier passed to the script
    :param sv_calls: structural variant calls passed to the script
    :param circos_plot_remixt: remixt plot path passed to the script
    :param circos_plot_titan: titan plot path passed to the script
    :param tempdir: directory that is created if missing
    """
    helpers.makedirs(tempdir)

    module_dir = os.path.dirname(os.path.realpath(__file__))
    script_path = os.path.join(module_dir, 'scripts', 'circos.R')

    pypeliner.commandline.execute(
        'Rscript', script_path,
        titan_calls, remixt_calls, sv_calls,
        circos_plot_remixt, circos_plot_titan, sample_id
    )
def bam_collect_insert_metrics(bam_filename, flagstat_metrics_filename,
                               metrics_filename, histogram_filename, tempdir,
                               mem="2G"):
    """
    Run picard ``CollectInsertSizeMetrics``, unless the bam has no properly
    paired reads.

    Flagstat is run first. If its 'properly paired' line starts with ``0 ``,
    a failure marker is written to ``metrics_filename``, an empty
    ``histogram_filename`` is created and picard is skipped.

    :param bam_filename: input bam
    :param flagstat_metrics_filename: path of the flagstat output
    :param metrics_filename: path given to picard as ``OUTPUT``
    :param histogram_filename: path given to picard as ``HISTOGRAM_FILE``
    :param tempdir: directory used as picard ``TMP_DIR``, created if needed
    :param mem: value used for both the ``-Xmx`` and ``-Xms`` JVM options
    :raises Exception: if flagstat has no 'properly paired' line
    """
    bam_flagstat(bam_filename, flagstat_metrics_filename)

    # Check if any paired reads exist; the last matching line decides
    has_paired = None
    with open(flagstat_metrics_filename) as flagstat:
        for line in flagstat:
            if 'properly paired' not in line:
                continue
            has_paired = not line.startswith('0 ')

    if has_paired is None:
        raise Exception(
            'Unable to determine number of properly paired reads from {}'.format(
                flagstat_metrics_filename))

    if not has_paired:
        with open(metrics_filename, 'w') as metrics:
            metrics.write('## FAILED: No properly paired reads\n')
        with open(histogram_filename, 'w'):
            pass
        return

    helpers.makedirs(tempdir)

    command = [
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectInsertSizeMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'HISTOGRAM_FILE=' + histogram_filename,
        'ASSUME_SORTED=True',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        'QUIET=true',
    ]
    pypeliner.commandline.execute(*command)
def merge_pdfs(infiles, outfile):
    """
    Merge several pdf files into a single pdf.

    :param infiles: iterable of pdf paths, or a dict whose values are pdf
        paths. Zero-byte files are skipped.
    :param outfile: path of the merged pdf; its parent directory is created
        if needed.
    """
    if isinstance(infiles, dict):
        infiles = infiles.values()

    merger = PdfFileMerger()
    opened = []

    try:
        for infile in infiles:
            # add it to list if not empty. skip empty files to avoid errors later
            if os.path.getsize(infile):
                handle = open(infile, 'rb')
                opened.append(handle)
                merger.append(handle)

        helpers.makedirs(outfile, isfile=True)

        with open(outfile, 'wb') as fout:
            merger.write(fout)
    finally:
        # Inputs stay open until the merged pdf is written, then are always
        # closed so the handles do not leak.
        for handle in opened:
            handle.close()
def parse_remixt_file(input, outputs, tables, tempdir):
    """
    Export tables from an HDF store to csv files.

    Tables and outputs are paired positionally. Each table is written to a
    temporary csv in ``tempdir`` and then finalized into its output path.

    :param input: path of the HDF store
    :param outputs: output csv paths, one per table
    :param tables: table keys to read from the store
    :param tempdir: scratch directory for the temporary csvs, created if needed
    """
    helpers.makedirs(tempdir)

    with pd.HDFStore(input) as store:
        for destination, key in zip(outputs, tables):
            # table keys may contain '/', which cannot appear in a file name
            csv_name = '{}.csv'.format(key.replace('/', '_'))
            staged_csv = os.path.join(tempdir, csv_name)

            table = store[key]
            if isinstance(table, pd.Series):
                # wrap a series as a one-column frame named after its key
                table = pd.DataFrame({key: table})

            table.to_csv(staged_csv, index=False)
            csvutils.finalize_csv(staged_csv, destination, sep=',')
def run_vcf2maf(
        vcf_file,
        maf_output,
        tempdir,
        reference,
        tumour_id=None,
        normal_id=None,
):
    """
    Convert a vcf to maf with the ``vcf2maf`` command.

    ``tempdir`` is wiped and recreated, the vcf is copied into it and
    gunzipped if its name ends in ``.gz``.

    :param vcf_file: input vcf, optionally gzipped
    :param maf_output: output maf path
    :param tempdir: scratch directory; any existing content is removed
    :param reference: reference directory; the fasta and the ExAC vcf are
        looked up inside it and the directory itself is also passed on
    :param tumour_id: if set, passed as ``--tumor-id``
    :param normal_id: if set, passed as ``--normal-id``
    """
    if os.path.exists(tempdir):
        helpers.rmdirs(tempdir)
    helpers.makedirs(tempdir)

    staged_vcf = os.path.join(tempdir, os.path.basename(vcf_file))
    shutil.copyfile(vcf_file, staged_vcf)

    vcf_unzipped = staged_vcf
    if vcf_file.endswith('.gz'):
        vcf_unzipped = os.path.join(tempdir, 'unzipped_vcf.vcf')
        gunzip_file(staged_vcf, vcf_unzipped)

    assert vcf_unzipped.endswith('.vcf')

    # presumably vcf2maf leaves a '<name>.vep.vcf' next to its input; make
    # sure no stale copy is around
    vep_vcf = vcf_unzipped[:-len('.vcf')] + '.vep.vcf'
    if os.path.exists(vep_vcf):
        os.remove(vep_vcf)

    fasta = os.path.join(
        reference, 'homo_sapiens', '99_GRCh37',
        'Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz'
    )
    exac_vcf = os.path.join(reference, 'ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz')

    cmd = ['vcf2maf', vcf_unzipped, maf_output, fasta, exac_vcf, reference]
    if tumour_id:
        cmd += ['--tumor-id', tumour_id]
    if normal_id:
        cmd += ['--normal-id', normal_id]

    pypeliner.commandline.execute(*cmd)
def parse_vcf(infile, primary_table, snpeff_table, ma_table, id_table,
              parse_config, chromosomes, tempdir):
    '''
    parses a vcf containing variant calls to a CSV.

    The four tables are first written to ``tempdir`` and then finalized
    into their output paths.

    :param infile: vcf containing calls and annotations
    :param primary_table: csv output filepath containing base calls
    :param snpeff_table: csv output filepath containing snpeff annotations
    :param ma_table: csv output filepath containing ma annotations
    :param id_table: csv output filepath containing id annotations
    :param parse_config: dict of filter settings; a truthy
        ``filter_low_mappability`` adds a LOW_MAPPABILITY filter and a truthy
        ``pr_threshold`` adds a PR filter with that threshold
    :param chromosomes: if non-empty, adds a CHROM 'notin' filter on this list
    :param tempdir: scratch directory for the temporary csvs
    '''
    helpers.makedirs(tempdir)

    primary_temp, snpeff_temp, ma_temp, ids_temp = [
        os.path.join(tempdir, name)
        for name in ('primary.csv', 'snpeff.csv', 'ma.csv', 'ids.csv')
    ]

    # (field, operator, value) triples handed to the vcf parser
    filter_out = []
    if parse_config.get('filter_low_mappability'):
        filter_out.append(('LOW_MAPPABILITY', 'eq', True))
    if chromosomes:
        filter_out.append(('CHROM', 'notin', chromosomes))
    if parse_config.get('pr_threshold'):
        filter_out.append(('PR', 'lt', parse_config['pr_threshold']))

    with vcfparser.VcfParser(
            infile, primary_temp, snpeff_temp, ma_temp, ids_temp, filter_out
    ) as vcf_parser:
        vcf_parser.write()

    staged_to_final = (
        (primary_temp, primary_table),
        (snpeff_temp, snpeff_table),
        (ma_temp, ma_table),
        (ids_temp, id_table),
    )
    for staged, final in staged_to_final:
        csvutils.finalize_csv(staged, final)
def svaba_cmd(tumor, normal, reference, tempdir, region=None, ncores=None,
              sample_id='sample'):
    """
    Build the ``svaba run`` command line (the command is not executed here).

    :param tumor: value for ``-t``
    :param normal: value for ``-n``
    :param reference: value for ``-G``
    :param tempdir: directory, created if needed, in which the ``-a`` value
        ``<tempdir>/<sample_id>`` lives
    :param region: if set, added as ``-k``
    :param ncores: if set, added as ``-p``
    :param sample_id: last path component of the ``-a`` value
    :returns: the command as a list of arguments
    """
    helpers.makedirs(tempdir)

    analysis_id = os.path.join(tempdir, sample_id)

    cmd = ['svaba', 'run']
    cmd.extend(['-t', tumor])
    cmd.extend(['-n', normal])
    cmd.extend(['-G', reference])
    cmd.append('-z')
    cmd.extend(['-a', analysis_id])

    if region:
        cmd.extend(['-k', region])
    if ncores:
        cmd.extend(['-p', ncores])

    return cmd
def split_by_rg(infile, read1_output, read2_output, tempdir,
                ignore_bamtofastq_exception):
    """
    Convert a bam to per-readgroup fastq pairs and move them into place.

    ``wgs_bamtofastq`` is run with ``tempdir`` as its output directory; every
    entry found in ``tempdir`` afterwards is treated as a readgroup directory
    holding ``R1.fastq.gz`` and ``R2.fastq.gz``.

    :param infile: input bam
    :param read1_output: mapping readgroup -> destination of the R1 fastq
    :param read2_output: mapping readgroup -> destination of the R2 fastq
    :param tempdir: working directory, created if needed
    :param ignore_bamtofastq_exception: when truthy, adds the
        ``--ignore_bamtofastq_exception`` flag to the command
    """
    helpers.makedirs(tempdir)

    command = ['wgs_bamtofastq', infile, tempdir]
    if ignore_bamtofastq_exception:
        command += ['--ignore_bamtofastq_exception']
    pypeliner.commandline.execute(*command)

    try:
        readgroups = os.listdir(tempdir)
    except OSError:
        # wait a minute and try exactly once more
        time.sleep(60)
        readgroups = os.listdir(tempdir)

    moves = (('R1.fastq.gz', read1_output), ('R2.fastq.gz', read2_output))
    for readgroup in readgroups:
        for fastq_name, destinations in moves:
            source = os.path.join(tempdir, readgroup, fastq_name)
            os.rename(source, destinations[readgroup])
def plot_hmm(
        tumour_copy, hmmcopy_res, correction_plots_dir, hmmcopy_plots_dir,
        bias_pdf, correction_pdf, hmmcopy_pdf, docker_image=None
):
    """
    Run ``plot_hmmcopy.R`` and merge the pdfs it writes into two pdfs.

    The hmmcopy plots are merged in chromosome order (1-22, X); any other
    pdf found in ``hmmcopy_plots_dir`` is appended afterwards. Only files
    that actually exist are merged.

    :param tumour_copy: passed to the R script
    :param hmmcopy_res: passed to the R script
    :param correction_plots_dir: directory for the correction plots
    :param hmmcopy_plots_dir: directory for the per-chromosome plots
    :param bias_pdf: passed to the R script
    :param correction_pdf: merged correction plots output
    :param hmmcopy_pdf: merged hmmcopy plots output
    :param docker_image: docker image forwarded to the command execution
    """
    helpers.makedirs(correction_plots_dir)
    helpers.makedirs(hmmcopy_plots_dir)

    cmd = [
        'plot_hmmcopy.R', tumour_copy, hmmcopy_res, correction_plots_dir,
        bias_pdf, hmmcopy_plots_dir,
    ]
    pypeliner.commandline.execute(*cmd, docker_image=docker_image)

    correction_pdfs = [
        os.path.join(correction_plots_dir, f)
        for f in os.listdir(correction_plots_dir) if f.endswith('.pdf')
    ]
    pdfutils.merge_pdfs(correction_pdfs, correction_pdf)

    # same '.pdf' filter as the correction plots above
    produced_pdfs = set(
        os.path.join(hmmcopy_plots_dir, f)
        for f in os.listdir(hmmcopy_plots_dir) if f.endswith('.pdf')
    )

    # just some sorting
    # (list comprehension instead of map(...) + list, which fails on Python 3)
    chromosomes = [str(chrom) for chrom in range(1, 23)] + ['X']
    human_pdfs = [
        os.path.join(hmmcopy_plots_dir, 'chr_{}.pdf'.format(chrom))
        for chrom in chromosomes
    ]

    # Keep only the chromosome plots that exist, then append the rest. The
    # old code diffed the filtered list against human_pdfs (always empty)
    # and merged human_pdfs itself, missing files included.
    ordered_pdfs = [pdf for pdf in human_pdfs if pdf in produced_pdfs]
    ordered_pdfs += sorted(produced_pdfs - set(human_pdfs))

    pdfutils.merge_pdfs(ordered_pdfs, hmmcopy_pdf)
def generate_submit_config_in_temp(args):
    """
    Generate the azure batch submit config and record its path in ``args``.

    Nothing is generated, and ``args`` is returned untouched, unless the
    ``submit`` argument is one of the azure batch queue names and the
    subcommand is not ``generate_config``. The yaml is placed under
    ``pipelinedir`` if given, else ``tmpdir``, else the current working
    directory (with a warning).

    :param args: dict of parsed command line arguments; must contain
        ``which``; ``submit``, ``tmpdir``, ``pipelinedir`` and
        ``config_override`` are read when present
    :returns: ``args``, with ``submit_config`` set when a config was written
    """
    azure_submit = [
        'azurebatch', 'pypeliner.contrib.azure.batchqueue.AzureJobQueue'
    ]
    if args.get("submit", None) not in azure_submit:
        return args

    if args['which'] == 'generate_config':
        return args

    batch_yaml = "batch.yaml"
    tmpdir = args.get("tmpdir", None)
    pipelinedir = args.get("pipelinedir", None)

    # use pypeliner tmpdir to store yaml
    if pipelinedir:
        batch_yaml = os.path.join(pipelinedir, batch_yaml)
    elif tmpdir:
        batch_yaml = os.path.join(tmpdir, batch_yaml)
    else:
        # Logger.warn is a deprecated alias of Logger.warning
        logging.getLogger("wgs.generate_batch_config").warning(
            "no tmpdir specified, generating configs in working dir"
        )
        batch_yaml = os.path.join(os.getcwd(), batch_yaml)

    helpers.makedirs(batch_yaml, isfile=True)

    batch_yaml = helpers.get_incrementing_filename(batch_yaml)

    params_override = args.get("config_override", {})
    config_params = get_batch_params(override=params_override)
    config = get_batch_config(config_params, override=params_override)
    write_config(config, batch_yaml)

    args["submit_config"] = batch_yaml

    return args