import logging
import os
import shutil

import pypeliner

# NOTE: the import paths below are assumptions. helpers and vcfutils are
# pipeline-local utility modules, and PdfFileMerger comes from PyPDF2.
from PyPDF2 import PdfFileMerger
from wgs.utils import helpers, vcfutils


def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir, **kwargs):
    helpers.makedirs(temp_dir)

    # fastqc writes <basename>_fastqc.html and <basename>_fastqc.zip
    # into the output directory.
    pypeliner.commandline.execute(
        'fastqc',
        '--outdir=' + temp_dir,
        fastq_filename,
        **kwargs)

    # Strip the fastq extension to recover the basename fastqc used.
    fastq_basename = os.path.basename(fastq_filename)
    if fastq_basename.endswith(".fastq.gz"):
        fastq_basename = fastq_basename[:-len(".fastq.gz")]
    elif fastq_basename.endswith(".fq.gz"):
        fastq_basename = fastq_basename[:-len(".fq.gz")]
    elif fastq_basename.endswith(".fq"):
        fastq_basename = fastq_basename[:-len(".fq")]
    elif fastq_basename.endswith(".fastq"):
        fastq_basename = fastq_basename[:-len(".fastq")]
    else:
        raise Exception("Unknown file type")

    output_basename = os.path.join(temp_dir, fastq_basename)

    shutil.move(output_basename + '_fastqc.zip', output_plots)
    shutil.move(output_basename + '_fastqc.html', output_html)
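
# Example usage (a sketch; paths are hypothetical, and the docker_image
# kwarg is forwarded to pypeliner.commandline.execute via **kwargs):
#
#     produce_fastqc_report(
#         '/data/sample_R1.fastq.gz',
#         '/results/sample_R1_fastqc.html',
#         '/results/sample_R1_fastqc.zip',
#         '/tmp/fastqc_sample_R1')
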
def bam_collect_wgs_metrics(
        bam_filename, ref_genome, metrics_filename, config, tempdir,
        mem="2G", docker_image=None):
    helpers.makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' + str(config['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' + str(config['min_mqual']),
        'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT',
        'COUNT_UNPAIRED=' + ('True' if config['count_unpaired'] else 'False'),
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        docker_image=docker_image)
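
# Example usage (a sketch; paths are hypothetical, and the config keys
# mirror those read above):
#
#     bam_collect_wgs_metrics(
#         '/data/sample.bam',
#         '/refdata/GRCh37.fa',
#         '/results/sample.wgs_metrics.txt',
#         {'min_bqual': 20, 'min_mqual': 20, 'count_unpaired': False},
#         '/tmp/picard_wgs')
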
def bam_collect_insert_metrics(
        bam_filename, flagstat_metrics_filename, metrics_filename,
        histogram_filename, tempdir, mem="2G",
        picard_docker=None, samtools_docker=None):
    bam_flagstat(
        bam_filename,
        flagstat_metrics_filename,
        docker_image=samtools_docker)

    # Check whether any properly paired reads exist; flagstat reports a
    # line such as "123 + 0 properly paired (...)".
    has_paired = None
    with open(flagstat_metrics_filename) as f:
        for line in f:
            if 'properly paired' in line:
                has_paired = not line.startswith('0 ')

    if has_paired is None:
        raise Exception(
            'Unable to determine number of properly paired reads from {}'.format(
                flagstat_metrics_filename))

    if not has_paired:
        # Insert size metrics are undefined without paired reads; write a
        # sentinel metrics file and an empty histogram, then return.
        with open(metrics_filename, 'w') as f:
            f.write('## FAILED: No properly paired reads\n')
        with open(histogram_filename, 'w'):
            pass
        return

    helpers.makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectInsertSizeMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'HISTOGRAM_FILE=' + histogram_filename,
        'ASSUME_SORTED=True',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        docker_image=picard_docker)
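
# Example usage (a sketch; paths are hypothetical, and bam_flagstat is the
# samtools-flagstat wrapper defined elsewhere in this module):
#
#     bam_collect_insert_metrics(
#         '/data/sample.bam',
#         '/results/sample.flagstat.txt',
#         '/results/sample.insert_metrics.txt',
#         '/results/sample.insert_histogram.pdf',
#         '/tmp/picard_insert')
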
def merge_pdfs(infiles, outfile):
    if isinstance(infiles, dict):
        infiles = infiles.values()

    merger = PdfFileMerger()

    for infile in infiles:
        # Append only non-empty files; empty PDFs would make the merger
        # raise when it tries to parse them. Passing the path rather than
        # an open handle lets PdfFileMerger manage the file itself, so no
        # handles are leaked.
        if os.path.getsize(infile):
            merger.append(infile)

    helpers.makedirs(outfile, isfile=True)

    with open(outfile, 'wb') as fout:
        merger.write(fout)
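
# Example usage (a sketch; paths are hypothetical). Both a list and a dict
# of inputs are accepted; for a dict, only the values are merged:
#
#     merge_pdfs(
#         ['/results/sample.gc_bias.pdf', '/results/sample.insert_histogram.pdf'],
#         '/results/sample.metrics.pdf')
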
def run_museq_one_job(
        tempdir, museq_vcf, reference, intervals, museq_params,
        tumour_bam=None, normal_bam=None, museq_docker_image=None,
        vcftools_docker_image=None, titan_mode=False):
    '''
    Run museq in a single job: build one command per interval, run the
    commands in parallel with GNU parallel, then merge the per-interval
    VCF files.

    :param tempdir: path to the temporary working directory
    :param museq_vcf: path to the merged output VCF file
    :param reference: path to the reference fasta
    :param intervals: list of genomic intervals to call over
    :param museq_params: museq parameter settings
    :param tumour_bam: path to the tumour bam
    :param normal_bam: path to the normal bam
    '''
    commands = []
    for i, interval in enumerate(intervals):
        ival_temp_dir = os.path.join(tempdir, str(i))
        helpers.makedirs(ival_temp_dir)
        output = os.path.join(ival_temp_dir, 'museq.vcf')
        log = os.path.join(ival_temp_dir, 'museq.log')

        command = run_museq(
            output, log, reference, interval, museq_params,
            tumour_bam=tumour_bam, normal_bam=normal_bam,
            return_cmd=True, titan_mode=titan_mode)

        commands.append(command)

    parallel_temp_dir = os.path.join(tempdir, 'gnu_parallel_temp')
    helpers.run_in_gnu_parallel(commands, parallel_temp_dir, museq_docker_image)

    vcf_files = [
        os.path.join(tempdir, str(i), 'museq.vcf')
        for i in range(len(intervals))]
    merge_tempdir = os.path.join(tempdir, 'museq_merge')
    helpers.makedirs(merge_tempdir)
    merge_vcfs(vcf_files, museq_vcf, merge_tempdir, docker_image=vcftools_docker_image)
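
# Example usage (a sketch; paths, the interval format, and museq_params are
# hypothetical, and run_museq is the single-interval wrapper defined
# elsewhere in this module):
#
#     run_museq_one_job(
#         '/tmp/museq_work',
#         '/results/museq.vcf',
#         '/refdata/GRCh37.fa',
#         ['1:1-10000000', '1:10000001-20000000'],
#         museq_params,
#         tumour_bam='/data/tumour.bam',
#         normal_bam='/data/normal.bam')
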
def generate_submit_config_in_temp(args):
    azure_submit = [
        'azurebatch',
        'pypeliner.contrib.azure.batchqueue.AzureJobQueue'
    ]
    if args.get("submit", None) not in azure_submit:
        return args

    if args['which'] == 'generate_config':
        return args

    batch_yaml = "batch.yaml"
    tmpdir = args.get("tmpdir", None)
    pipelinedir = args.get("pipelinedir", None)

    # use the pypeliner tmpdir to store the yaml
    if pipelinedir:
        batch_yaml = os.path.join(pipelinedir, batch_yaml)
    elif tmpdir:
        batch_yaml = os.path.join(tmpdir, batch_yaml)
    else:
        logging.getLogger("wgs.generate_batch_config").warning(
            "no tmpdir specified, generating configs in working dir")
        batch_yaml = os.path.join(os.getcwd(), batch_yaml)

    helpers.makedirs(batch_yaml, isfile=True)

    batch_yaml = helpers.get_incrementing_filename(batch_yaml)

    params_override = args.get("config_override", {})

    config_params = get_batch_params(override=params_override)
    config = get_batch_config(config_params, override=params_override)
    write_config(config, batch_yaml)

    args["submit_config"] = batch_yaml

    return args
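
# Example usage (a sketch; the dict keys mirror those read above, and the
# values are hypothetical):
#
#     args = {
#         'submit': 'azurebatch',
#         'which': 'alignment',
#         'tmpdir': '/tmp/pipeline',
#         'pipelinedir': None,
#         'config_override': {},
#     }
#     args = generate_submit_config_in_temp(args)
#     # args['submit_config'] now points at the generated batch.yaml
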
def bam_collect_gc_metrics(
        bam_filename, ref_genome, metrics_filename, summary_filename,
        chart_filename, tempdir, mem="2G", docker_image=None):
    helpers.makedirs(tempdir)

    pypeliner.commandline.execute(
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectGcBiasMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename,
        'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        docker_image=docker_image)
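
# Example usage (a sketch; paths are hypothetical):
#
#     bam_collect_gc_metrics(
#         '/data/sample.bam',
#         '/refdata/GRCh37.fa',
#         '/results/sample.gc_metrics.txt',
#         '/results/sample.gc_summary.txt',
#         '/results/sample.gc_bias.pdf',
#         '/tmp/picard_gc')
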
def merge_vcfs(inputs, outfile, tempdir, docker_image=None):
    helpers.makedirs(tempdir)
    mergedfile = os.path.join(tempdir, 'merged.vcf')
    # Concatenate the per-interval VCFs, then coordinate-sort the result.
    vcfutils.concatenate_vcf(inputs, mergedfile)
    vcfutils.sort_vcf(mergedfile, outfile, docker_image=docker_image)
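
# Example usage (a sketch; paths are hypothetical, following the layout
# run_museq_one_job produces):
#
#     merge_vcfs(
#         ['/tmp/museq_work/0/museq.vcf', '/tmp/museq_work/1/museq.vcf'],
#         '/results/museq.vcf',
#         '/tmp/museq_merge')
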