def plot_kernel_density(infile, output, tempdir, sep, colname, plot_title, multipliers):
    """Render one kernel-density PDF per multiplier and merge them into one PDF.

    :param infile: metrics file read by PlotKernelDensity.
    :param output: path of the merged output PDF.
    :param tempdir: scratch directory for the per-multiplier pages.
    :param sep: field separator of the input file.
    :param colname: column to plot.
    :param plot_title: base title; the multiplier is appended per page.
    :param multipliers: iterable of hmmcopy multipliers to plot.
    """
    helpers.makedirs(tempdir)

    per_multiplier_pdfs = []
    for mult in multipliers:
        pdf_path = os.path.join(tempdir, "{}.pdf".format(mult))
        per_multiplier_pdfs.append(pdf_path)

        # each multiplier has its own metrics table in the store
        plotter = PlotKernelDensity(
            infile, pdf_path, sep, colname,
            '{}({})'.format(plot_title, mult),
            tablename='/hmmcopy/metrics/{}'.format(mult))
        plotter.main()

    pdfutils.merge_pdfs(per_multiplier_pdfs, output)
def merge_pdf(in_filenames, outfilenames, metrics, cell_filters, tempdir, labels):
    """Bundle per-cell plot files for QC-passing cells into one tar per label.

    :param in_filenames: per-label dicts mapping cell id to input file path.
    :param outfilenames: per-label output tar paths (parallel to in_filenames).
    :param metrics: metrics file used to select good cells.
    :param cell_filters: filters applied to pick cells that pass QC.
    :param tempdir: scratch directory; one subdirectory is created per label.
    :param labels: labels naming each group, parallel to in/out filenames.
    """
    helpers.makedirs(tempdir)

    good_cells = get_good_cells(metrics, cell_filters, '/hmmcopy/metrics/0')

    # NOTE(review): the grouping result is computed but never used here —
    # kept to preserve behavior; confirm whether it can be dropped.
    grouped_data = group_cells_by_row(
        good_cells, metrics, '/hmmcopy/metrics/0', sort_by_col=True)

    for infiles, outfiles, label in zip(in_filenames, outfilenames, labels):
        # all files in a group share an extension; sample it from the first cell
        extension = os.path.splitext(infiles[good_cells[0]])[-1]

        plotdir = os.path.join(tempdir, label)
        helpers.makedirs(plotdir)

        for cell in good_cells:
            dest = os.path.join(plotdir, cell + "_" + label + extension)
            shutil.copyfile(infiles[cell], dest)

        helpers.make_tarfile(outfiles, plotdir)
def organism_filter(
        fastq_r1, fastq_r2, filtered_fastq_r1, filtered_fastq_r2,
        detailed_metrics, summary_metrics, tempdir, cell_id, params,
        reference, docker_image=None, filter_contaminated_reads=False,
):
    """Tag read pairs with fastq_screen organism hits, write count metrics,
    and emit either filtered or fully re-tagged fastqs.

    :param fastq_r1: input R1 fastq.
    :param fastq_r2: input R2 fastq.
    :param filtered_fastq_r1: output R1 fastq (filtered or re-tagged).
    :param filtered_fastq_r2: output R2 fastq (filtered or re-tagged).
    :param detailed_metrics: per-flag count metrics output.
    :param summary_metrics: summary count metrics output.
    :param tempdir: scratch directory (recreated on every run).
    :param cell_id: cell id recorded in the metrics.
    :param params: fastq_screen parameters.
    :param reference: reference passed to the read filter.
    :param docker_image: optional container for fastq_screen.
    :param filter_contaminated_reads: drop contaminated reads when True.
    """
    # fastq screen tries to skip if files from old runs are available,
    # so always start from a clean temp directory
    if os.path.exists(tempdir):
        shutil.rmtree(tempdir)
    helpers.makedirs(tempdir)

    tagged_fastq_r1, tagged_fastq_r2 = run_fastq_screen_paired_end(
        fastq_r1, fastq_r2, tempdir, params, docker_image=docker_image)

    reader = fastqutils.PairedTaggedFastqReader(tagged_fastq_r1, tagged_fastq_r2)
    counts = reader.gather_counts()

    write_detailed_counts(counts, detailed_metrics, cell_id)
    write_summary_counts(counts, summary_metrics, cell_id)

    if filter_contaminated_reads:
        filter_reads(
            tagged_fastq_r1, tagged_fastq_r2,
            filtered_fastq_r1, filtered_fastq_r2,
            reference)
    else:
        # use the full tagged fastq downstream,
        # carrying organism type information in the read names
        re_tag_reads(tagged_fastq_r1, filtered_fastq_r1)
        re_tag_reads(tagged_fastq_r2, filtered_fastq_r2)
def run_fastqc(fastq1, fastq2, reports, tempdir, config):
    """
    Run fastqc on both fastq files and bundle the reports into a tar.

    Zero-byte fastqs are skipped with a warning instead of being passed to
    fastqc.

    Bug fix: the empty-file warning for the R2 branch previously reported the
    R1 filename.

    :param fastq1: R1 fastq path.
    :param fastq2: R2 fastq path.
    :param reports: output tar of the fastqc reports.
    :param tempdir: scratch directory.
    :param config: pipeline config providing the fastqc container context.
    """
    container_ctx = helpers.get_container_ctx(
        config['containers'], 'fastqc', docker_only=True)

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    def _fastqc_one(fastq, tag):
        # skip empty fastqs rather than letting fastqc fail on them
        if os.path.getsize(fastq) == 0:
            warnings.warn("fastq file %s is empty, skipping fastqc" % fastq)
            return
        out_html = os.path.join(reports_dir, 'fastqc_{}.html'.format(tag))
        out_plot = os.path.join(reports_dir, 'fastqc_{}.zip'.format(tag))
        bamutils.produce_fastqc_report(
            fastq, out_html, out_plot, tempdir, **container_ctx)

    _fastqc_one(fastq1, 'R1')
    _fastqc_one(fastq2, 'R2')

    helpers.make_tarfile(reports, reports_dir)
def run_hmmcopy(
        bam_file,
        corrected_reads_filename,
        segments_filename,
        parameters_filename,
        metrics_filename,
        hmmcopy_tar,
        cell_id,
        hmmparams,
        tempdir,
        docker_image
):
    """Run GC correction and the hmmcopy R script on one cell's bam, then
    rewrite the four hmmcopy output CSVs with their declared dtypes and tar
    the working directory.

    :param bam_file: input cell bam.
    :param corrected_reads_filename: output corrected reads CSV.
    :param segments_filename: output segments CSV.
    :param parameters_filename: output parameters CSV.
    :param metrics_filename: output metrics CSV.
    :param hmmcopy_tar: tar of the full hmmcopy working directory.
    :param cell_id: cell identifier used to name the working directory.
    :param hmmparams: hmmcopy parameter set.
    :param tempdir: scratch directory.
    :param docker_image: container for the correction/hmmcopy commands.
    """
    helpers.makedirs(tempdir)

    # generate wig file for hmmcopy, then GC/map-corrected read counts
    readcount_wig = os.path.join(tempdir, 'readcounter.wig')
    corrected_reads = os.path.join(tempdir, 'corrected_reads.csv')
    run_correction_hmmcopy(
        bam_file, corrected_reads, readcount_wig, hmmparams, docker_image)

    hmmcopy_tempdir = os.path.join(tempdir, '{}_hmmcopy'.format(cell_id))
    helpers.makedirs(hmmcopy_tempdir)

    run_hmmcopy_script(
        corrected_reads, hmmcopy_tempdir, cell_id, hmmparams, docker_image)

    # hmmcopy writes its outputs under a "0" subdirectory
    hmmcopy_outdir = os.path.join(hmmcopy_tempdir, str(0))
    all_dtypes = dtypes()

    # rewrite each raw hmmcopy CSV to its final path with typed columns
    for src_name, dest_path, dtype_key in (
            ("reads.csv", corrected_reads_filename, 'reads'),
            ("params.csv", parameters_filename, 'params'),
            ("segs.csv", segments_filename, 'segs'),
            ("metrics.csv", metrics_filename, 'metrics'),
    ):
        csvutils.rewrite_csv_file(
            os.path.join(hmmcopy_outdir, src_name), dest_path,
            dtypes=all_dtypes[dtype_key])

    helpers.make_tarfile(hmmcopy_tar, hmmcopy_tempdir)
def bam_collect_gc_metrics(bam_filename, ref_genome, metrics_filename,
                           summary_filename, chart_filename, tempdir,
                           mem="2G", docker_image=None):
    """Run picard CollectGcBiasMetrics on a bam.

    :param bam_filename: input bam.
    :param ref_genome: reference fasta.
    :param metrics_filename: detailed GC-bias metrics output.
    :param summary_filename: summary metrics output.
    :param chart_filename: chart (pdf) output.
    :param tempdir: picard temp directory (created if missing).
    :param mem: JVM heap size for -Xmx/-Xms.
    :param docker_image: optional container for the picard command.
    """
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    cmd = [
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectGcBiasMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'S=' + summary_filename,
        'CHART_OUTPUT=' + chart_filename,
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
    ]
    pypeliner.commandline.execute(*cmd, docker_image=docker_image)
def bam_collect_wgs_metrics(bam_filename, ref_genome, metrics_filename,
                            config, tempdir, mem="2G", docker_image=None):
    """Run picard CollectWgsMetrics on a bam.

    :param bam_filename: input bam.
    :param ref_genome: reference fasta.
    :param metrics_filename: WGS metrics output.
    :param config: dict supplying min_bqual, min_mqual and count_unpaired.
    :param tempdir: picard temp directory (created if missing).
    :param mem: JVM heap size for -Xmx/-Xms.
    :param docker_image: optional container for the picard command.
    """
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    count_unpaired = 'True' if config['count_unpaired'] else 'False'

    cmd = [
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'CollectWgsMetrics',
        'INPUT=' + bam_filename,
        'OUTPUT=' + metrics_filename,
        'REFERENCE_SEQUENCE=' + ref_genome,
        'MINIMUM_BASE_QUALITY=' + str(config['min_bqual']),
        'MINIMUM_MAPPING_QUALITY=' + str(config['min_mqual']),
        'COVERAGE_CAP=500',
        'VALIDATION_STRINGENCY=LENIENT',
        'COUNT_UNPAIRED=' + count_unpaired,
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
    ]
    pypeliner.commandline.execute(*cmd, docker_image=docker_image)
def run_lumpy(tumour_disc, tumour_split, tumour_hist, tumour_mean_stdev,
              tumour_id, normal_disc, normal_split, normal_hist,
              normal_mean_stdev, normal_id, vcf, tempdir):
    """Run lumpy on paired tumour/normal discordant and split-read bams.

    Builds the per-sample `-pe`/`-sr` option strings from the insert-size
    metadata and shells out to lumpy, redirecting stdout to the output VCF.

    Fix: the lumpy temp path is now built with os.path.join instead of
    string concatenation.

    :param tumour_disc: tumour discordant-reads bam.
    :param tumour_split: tumour split-reads bam.
    :param tumour_hist: tumour insert-size histogram file.
    :param tumour_mean_stdev: file with tumour insert mean/stdev metadata.
    :param tumour_id: tumour sample id.
    :param normal_disc: normal discordant-reads bam.
    :param normal_split: normal split-reads bam.
    :param normal_hist: normal insert-size histogram file.
    :param normal_mean_stdev: file with normal insert mean/stdev metadata.
    :param normal_id: normal sample id.
    :param vcf: output VCF path (lumpy stdout).
    :param tempdir: scratch directory; lumpy's -t prefix lives under it.
    """
    tumour_mean, tumour_stdev = load_metadata(tumour_mean_stdev)
    normal_mean, normal_stdev = load_metadata(normal_mean_stdev)

    helpers.makedirs(tempdir)
    # NOTE(review): '-t' appears to be lumpy's temp *file prefix*, so only
    # the parent directory is created — confirm against the lumpy docs.
    tempdir = os.path.join(tempdir, 'lumpy')

    tumour_pe = 'id:{},bam_file:{},histo_file:{},mean:{},' \
                'stdev:{},read_length:101,min_non_overlap:101,' \
                'discordant_z:5,back_distance:10,weight:1,' \
                'min_mapping_threshold:20'.format(tumour_id, tumour_disc,
                                                  tumour_hist, tumour_mean,
                                                  tumour_stdev)

    tumour_sr = 'id:{},bam_file:{},back_distance:10,weight:1,' \
                'min_mapping_threshold:20'.format(tumour_id, tumour_split)

    normal_pe = 'id:{},bam_file:{},histo_file:{},mean:{},' \
                'stdev:{},read_length:101,min_non_overlap:101,' \
                'discordant_z:5,back_distance:10,weight:1,' \
                'min_mapping_threshold:20'.format(normal_id, normal_disc,
                                                  normal_hist, normal_mean,
                                                  normal_stdev)

    normal_sr = 'id:{},bam_file:{},back_distance:10,weight:1,' \
                'min_mapping_threshold:20'.format(normal_id, normal_split)

    cmd = [
        'lumpy', '-e', '-b',
        '-mw', 4,
        '-tt', 0,
        '-pe', tumour_pe,
        '-sr', tumour_sr,
        '-pe', normal_pe,
        '-sr', normal_sr,
        '-t', tempdir,
        '>', vcf,
    ]

    pypeliner.commandline.execute(*cmd)
def split_bam_file_by_reads(bam, bai, outbams, outbais, tempspace, intervals, kwargs):
    """Collate a bam by read name and split it into per-interval bams.

    NOTE(review): `bai` and `outbais` are accepted for pipeline wiring but
    are not used here; no index files are produced by this task.

    :param bam: input bam path.
    :param bai: input bam index (unused).
    :param outbams: dict mapping interval to output bam path.
    :param outbais: dict of output index paths (unused).
    :param tempspace: scratch directory.
    :param intervals: ordered intervals to split into.
    :param kwargs: extra keyword args forwarded to commandline.execute.
    """
    helpers.makedirs(tempspace)

    # dump the header; it is prepended to every split sam before conversion
    headerfile = os.path.join(tempspace, "bam_header.sam")
    pypeliner.commandline.execute(
        'samtools', 'view', '-H', bam, '-o', headerfile, **kwargs)

    collate_prefix = os.path.join(
        tempspace, os.path.basename(bam) + "_collate_temp")
    collated_bam = os.path.join(tempspace, "bam_file_collated_sam_format.sam")

    # sort bam by reads (samtools collate) and convert to sam via view
    pypeliner.commandline.execute(
        'samtools', 'collate', '-u', '-O', bam, collate_prefix,
        '|', 'samtools', 'view', '-', '-o', collated_bam,
        **kwargs)

    tempoutputs = [
        os.path.join(
            tempspace, os.path.basename(outbams[interval]) + ".split.temp")
        for interval in intervals
    ]

    split(collated_bam, tempoutputs, headerfile=headerfile)

    # convert each split sam back to bam at its final location
    for splitsam, interval in zip(tempoutputs, intervals):
        pypeliner.commandline.execute(
            'samtools', 'view', '-Sb', splitsam, '-o', outbams[interval],
            **kwargs)
def run_fastqc(fastq1, fastq2, reports, tempdir, containers):
    """
    Run fastqc on both fastq files and bundle the reports into a tar.

    If both fastqs are effectively empty (< 100 bytes) an empty report tar is
    produced immediately; individually empty files are skipped with a log
    warning.

    Bug fix: the empty-file warning for the R2 branch previously logged the
    R1 filename.

    :param fastq1: R1 fastq path.
    :param fastq2: R2 fastq path.
    :param reports: output tar of the fastqc reports.
    :param tempdir: scratch directory.
    :param containers: dict providing the 'fastqc' docker image.
    """
    logger = logging.getLogger("single_cell.align.tasks")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    # empty fastq files: ship an empty tar and bail out early
    if os.stat(fastq1).st_size < 100 and os.stat(fastq2).st_size < 100:
        helpers.make_tarfile(reports, reports_dir)
        return

    def _fastqc_one(fastq, tag):
        if os.path.getsize(fastq) == 0:
            logger.warn("fastq file %s is empty, skipping fastqc" % fastq)
            return
        out_html = os.path.join(reports_dir, 'fastqc_{}.html'.format(tag))
        out_plot = os.path.join(reports_dir, 'fastqc_{}.zip'.format(tag))
        bamutils.produce_fastqc_report(
            fastq, out_html, out_plot, tempdir,
            docker_image=containers['fastqc'])

    _fastqc_one(fastq1, 'R1')
    _fastqc_one(fastq2, 'R2')

    helpers.make_tarfile(reports, reports_dir)
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, docker_image=None):
    """Run the cell_cycle_classifier tool and merge its per-cell predictions
    into the hmmcopy metrics table.

    :param hmmcopy_reads: hmmcopy reads file fed to the classifier.
    :param hmmcopy_metrics: hmmcopy metrics file (also the merge base).
    :param alignment_metrics: alignment metrics file fed to the classifier.
    :param output: merged metrics csv+yaml output.
    :param tempdir: scratch directory.
    :param docker_image: optional container for the classifier command.
    """
    helpers.makedirs(tempdir)
    classifier_csv = os.path.join(tempdir, 'cell_cycle_output.csv')

    pypeliner.commandline.execute(
        'cell_cycle_classifier', 'train-classify',
        hmmcopy_reads, hmmcopy_metrics, alignment_metrics, classifier_csv,
        docker_image=docker_image)

    predictions = pd.read_csv(classifier_csv)
    metrics = csvutils.read_csv_and_yaml(hmmcopy_metrics)

    # outer merge keeps cells that appear in only one of the two tables
    merged = metrics.merge(predictions, on=['cell_id'], how='outer')

    csvutils.write_dataframe_to_csv_and_yaml(merged, output)
def generate_pipeline_config_in_temp(args):
    """Generate the pipeline config yaml (unless one was supplied or the
    subcommand doesn't need one) and record its path in args.

    :param args: parsed argument dict; mutated with 'config_file'.
    :returns: the (possibly updated) args dict.
    """
    # these subcommands never need a generated config
    if args['which'] in ['clean_sentinels', 'generate_config']:
        return args

    # user supplied a config: nothing to generate
    if args.get("config_file", None):
        return args

    config_yaml = "config.yaml"
    tmpdir = args.get("tmpdir", None)
    pipelinedir = args.get("pipelinedir", None)

    # prefer the pypeliner pipeline dir for the yaml, then tmpdir, then cwd
    if pipelinedir:
        config_yaml = os.path.join(pipelinedir, config_yaml)
    elif tmpdir:
        config_yaml = os.path.join(tmpdir, config_yaml)
    else:
        warnings.warn("no tmpdir specified, generating configs in working dir")
        config_yaml = os.path.join(os.getcwd(), config_yaml)

    # never clobber a yaml left over from a previous run
    config_yaml = helpers.get_incrementing_filename(config_yaml)

    params_override = args["config_override"]

    helpers.makedirs(config_yaml, isfile=True)

    config_params = pipeline_config.get_config_params(override=params_override)
    config = pipeline_config.get_singlecell_pipeline_config(config_params)
    pipeline_config.write_config(config, config_yaml)

    args["config_file"] = config_yaml

    return args
def add_quality(hmmcopy_metrics, alignment_metrics, multipliers, output,
                training_data, tempdir):
    """Train the quality classifier and append per-cell quality predictions
    to the hmmcopy metrics, one pass per multiplier table.

    :param hmmcopy_metrics: hmmcopy metrics store.
    :param alignment_metrics: alignment metrics store.
    :param multipliers: hmmcopy multipliers whose tables get classified.
    :param output: final typed metrics csv output.
    :param training_data: training set for the classifier.
    :param tempdir: scratch directory for intermediate csvs.
    """
    helpers.makedirs(tempdir)

    hmmcopy_tables = [
        '/hmmcopy/metrics/{}'.format(mult) for mult in multipliers
    ]

    model = classify.train_classifier(training_data)
    feature_names = model.feature_names_

    data = classify.load_data(
        hmmcopy_metrics, alignment_metrics,
        hmmcopy_tables, '/alignment/metrics',
        feature_names)

    for i, (hmmcopy_table, tabledata) in enumerate(data):
        intermediate_output = os.path.join(
            tempdir, '{}_metrics_with_quality.csv.gz'.format(i))

        predictions = classify.classify(model, tabledata)

        classify.write_to_output(
            hmmcopy_metrics, hmmcopy_table, intermediate_output, predictions)

    # NOTE(review): only the last intermediate csv is finalized, matching
    # the original behavior — verify this is intended for >1 multiplier.
    csvutils.prep_csv_files(
        intermediate_output, output, dtypes=dtypes()['metrics'])
def produce_fastqc_report(fastq_filename, output_html, output_plots, temp_dir,
                          **kwargs):
    """Run fastqc on one fastq and move its html/zip outputs to their
    final locations.

    :param fastq_filename: input fastq (.fastq/.fq, optionally gzipped).
    :param output_html: final path for the fastqc html report.
    :param output_plots: final path for the fastqc zip bundle.
    :param temp_dir: directory fastqc writes into.
    :param kwargs: forwarded to commandline.execute (e.g. docker_image).
    :raises Exception: if the fastq filename has an unrecognised extension.
    """
    makedirs(temp_dir)

    pypeliner.commandline.execute(
        'fastqc',
        '--outdir=' + temp_dir,
        fastq_filename,
        **kwargs)

    # strip the fastq extension the same way fastqc names its outputs;
    # longer suffixes are tried first so ".fastq.gz" wins over ".fastq"
    fastq_basename = os.path.basename(fastq_filename)
    for suffix in (".fastq.gz", ".fq.gz", ".fq", ".fastq"):
        if fastq_basename.endswith(suffix):
            fastq_basename = fastq_basename.replace(suffix, "")
            break
    else:
        raise Exception("Unknown file type")

    output_basename = os.path.join(temp_dir, fastq_basename)

    shutil.move(output_basename + '_fastqc.zip', output_plots)
    shutil.move(output_basename + '_fastqc.html', output_html)
def create_chromosome_seqdata(seqdata, bam_file, tempdir, config,
                              ref_data_dir, chromosomes=None):
    """Build remixt seqdata per chromosome from a bam and merge into one file.

    :param seqdata: merged seqdata output path.
    :param bam_file: input bam.
    :param tempdir: scratch directory for per-chromosome seqdata files.
    :param config: remixt config.
    :param ref_data_dir: remixt reference data directory.
    :param chromosomes: chromosomes to process; defaults to the config list.
    """
    helpers.makedirs(tempdir)

    if not chromosomes:
        chromosomes = remixt.config.get_chromosomes(config, ref_data_dir)

    snp_positions_filename = remixt.config.get_filename(
        config, ref_data_dir, 'snp_positions')

    # read-filtering parameters shared across all chromosomes
    bam_max_fragment_length = remixt.config.get_param(
        config, 'bam_max_fragment_length')
    bam_max_soft_clipped = remixt.config.get_param(
        config, 'bam_max_soft_clipped')
    bam_check_proper_pair = remixt.config.get_param(
        config, 'bam_check_proper_pair')

    all_seqdata = {}
    for chrom in chromosomes:
        chrom_seqdata = os.path.join(tempdir, "{}_seqdata.h5".format(chrom))
        all_seqdata[chrom] = chrom_seqdata

        remixt.seqdataio.create_chromosome_seqdata(
            chrom_seqdata, bam_file, snp_positions_filename, chrom,
            bam_max_fragment_length, bam_max_soft_clipped,
            bam_check_proper_pair)

    remixt.seqdataio.merge_seqdata(seqdata, all_seqdata)
def trim_fastqs(fastq1, fastq2, cell_id, tempdir, config):
    """Run trimgalore on a fastq pair and return the trimmed file paths.

    :param fastq1: input R1 fastq.
    :param fastq2: input R2 fastq.
    :param cell_id: cell id used to name the report files.
    :param tempdir: scratch directory; reports land in fastqc_reports/.
    :param config: dict providing 'adapter' and 'adapter2' sequences.
    :returns: tuple of (trimmed_r1, trimmed_r2) paths.
    """
    trim1 = os.path.join(tempdir, "fastq_R1_trimmed.fastq.gz")
    trim2 = os.path.join(tempdir, "fastq_R2_trimmed.fastq.gz")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    # per-cell trimgalore and qc report locations
    def _report(name):
        return os.path.join(reports_dir, '{}_{}'.format(cell_id, name))

    rep1 = _report('trimgalore_R1.html')
    rep2 = _report('trimgalore_R2.html')
    qcrep1 = _report('trimgalore_qc_R1.html')
    qcrep2 = _report('trimgalore_qc_R2.html')
    qczip1 = _report('trimgalore_qc_R1.zip')
    qczip2 = _report('trimgalore_qc_R2.zip')

    run_trimgalore(fastq1, fastq2, trim1, trim2, 'trim_galore', 'cutadapt',
                   tempdir, config['adapter'], config['adapter2'],
                   rep1, rep2, qcrep1, qcrep2, qczip1, qczip2)

    return trim1, trim2
def download_blob(blob_path, tempdir):
    """Download one azure blob into tempdir, mirroring the blob path.

    :param blob_path: blob path within the container.
    :param tempdir: local root directory for the download.
    :returns: the local path of the downloaded file.
    """
    local_path = os.path.join(tempdir, blob_path)
    # makedirs with isfile=True creates the parent directories only
    helpers.makedirs(local_path, isfile=True)

    storageutils.download_blob(blob_path, local_path, storage='azureblob')

    return local_path
def trim_fastqs(fastq1, fastq2, cell_id, tempdir, adapter, adapter2,
                trimgalore_docker):
    """Run trimgalore on a fastq pair and return the trimmed file paths.

    Empty inputs (no first line in R1) are returned untouched.

    :param fastq1: input R1 fastq.
    :param fastq2: input R2 fastq.
    :param cell_id: cell id used to name the report files.
    :param tempdir: scratch directory; reports land in fastqc_reports/.
    :param adapter: R1 adapter sequence.
    :param adapter2: R2 adapter sequence.
    :param trimgalore_docker: container for trimgalore/cutadapt.
    :returns: tuple of (trimmed_r1, trimmed_r2) paths, or the inputs when
        R1 is empty.
    """
    # empty fastq: trimgalore has nothing to do, hand back the originals
    with helpers.getFileHandle(fastq1) as reader:
        if not reader.readline():
            return fastq1, fastq2

    trim1 = os.path.join(tempdir, "fastq_R1_trimmed.fastq.gz")
    trim2 = os.path.join(tempdir, "fastq_R2_trimmed.fastq.gz")

    reports_dir = os.path.join(tempdir, 'fastqc_reports')
    if not os.path.exists(reports_dir):
        helpers.makedirs(reports_dir)

    # per-cell trimgalore and qc report locations
    def _report(name):
        return os.path.join(reports_dir, '{}_{}'.format(cell_id, name))

    rep1 = _report('trimgalore_R1.html')
    rep2 = _report('trimgalore_R2.html')
    qcrep1 = _report('trimgalore_qc_R1.html')
    qcrep2 = _report('trimgalore_qc_R2.html')
    qczip1 = _report('trimgalore_qc_R1.zip')
    qczip2 = _report('trimgalore_qc_R2.zip')

    run_tg = RunTrimGalore(fastq1, fastq2, trim1, trim2, 'trim_galore',
                           'cutadapt', tempdir, adapter, adapter2,
                           rep1, rep2, qcrep1, qcrep2, qczip1, qczip2,
                           trimgalore_docker)
    run_tg.run_trimgalore()
    run_tg.gather_outputs()

    return trim1, trim2
def merge_postprocess_bams(inputs, output, tempdir, containers):
    """Merge lane bams, sort, mark duplicates and index the result.

    :param inputs: lane bam paths to merge.
    :param output: final markduped bam path (indexed as output + '.bai').
    :param tempdir: scratch directory.
    :param containers: dict providing 'picard' and 'samtools' images.
    """
    helpers.makedirs(tempdir)

    picard_img = containers['picard']
    samtools_img = containers['samtools']

    # merge all lanes into one bam and index it
    merged_bam = os.path.join(tempdir, 'merged_lanes.bam')
    picardutils.merge_bams(inputs, merged_bam, docker_image=picard_img)
    bamutils.bam_index(merged_bam, merged_bam + '.bai',
                       docker_image=samtools_img)

    # coordinate-sort before duplicate marking
    sorted_bam = os.path.join(tempdir, 'sorted.bam')
    picardutils.bam_sort(merged_bam, sorted_bam, tempdir,
                         docker_image=picard_img)

    markdups_metrics = os.path.join(tempdir, 'markdups_metrics.txt')
    picardutils.bam_markdups(sorted_bam, output, markdups_metrics, tempdir,
                             docker_image=picard_img)

    bamutils.bam_index(output, output + '.bai', docker_image=samtools_img)
def cell_cycle_classifier(hmmcopy_reads, hmmcopy_metrics, alignment_metrics,
                          output, tempdir, genome_labels):
    """Run the cell_cycle_classifier tool, merge its predictions into the
    hmmcopy metrics, cast the new columns to their declared dtypes and write
    the result.

    :param hmmcopy_reads: hmmcopy reads file fed to the classifier.
    :param hmmcopy_metrics: hmmcopy metrics file (also the merge base).
    :param alignment_metrics: alignment metrics file fed to the classifier.
    :param output: merged, typed metrics csv+yaml output.
    :param tempdir: scratch directory.
    :param genome_labels: labels passed through to dtypes().
    """
    helpers.makedirs(tempdir)
    classifier_csv = os.path.join(tempdir, 'cell_cycle_output.csv')

    pypeliner.commandline.execute(
        'cell_cycle_classifier', 'train-classify',
        hmmcopy_reads, hmmcopy_metrics, alignment_metrics, classifier_csv)

    predictions = pd.read_csv(classifier_csv)
    prediction_cols = predictions.columns.values

    metrics = csvutils.read_csv_and_yaml(hmmcopy_metrics)
    # outer merge keeps cells that appear in only one of the two tables
    merged = metrics.merge(predictions, on=['cell_id'], how='outer')

    # coerce every classifier column to its declared dtype
    out_dtypes = dtypes(genome_labels)
    for col in prediction_cols:
        merged[col] = merged[col].astype(out_dtypes[col])

    csvutils.write_dataframe_to_csv_and_yaml(merged, output, out_dtypes)
def concatenate_vcf(in_files, out_file, tempdir, docker_config=None,
                    allow_overlap=False, bcf_index_file=None,
                    vcf_index_file=None):
    """
    Fast concatenation of VCF files using `bcftools`, followed by a sort and
    indexing of the result.

    Fix: the docker_config default was a shared mutable dict ({}); it is now
    None and materialized per call, which is backward compatible.

    :param in_files: dict with values being files to be concatenated.
        Files will be concatenated based on sorted order of keys.
    :param out_file: path where output file will be written in VCF format.
    :param tempdir: scratch directory for the unsorted merged vcf.
    :param docker_config: optional kwargs forwarded to commandline.execute.
    :param allow_overlap: pass -a to bcftools concat when True.
    :param bcf_index_file: optional bcf index output path.
    :param vcf_index_file: optional vcf index output path.
    """
    if docker_config is None:
        docker_config = {}

    helpers.makedirs(tempdir)
    merged_file = os.path.join(tempdir, 'merged.vcf')

    cmd = ['bcftools', 'concat']
    if allow_overlap:
        cmd.append('-a')
    cmd += ['-O', 'z', '-o', merged_file]
    cmd += flatten_input(in_files)

    pypeliner.commandline.execute(*cmd, **docker_config)

    # sort merged vcf file
    cmd = ['bcftools', 'sort', '-O', 'z', '-o', out_file, merged_file]
    pypeliner.commandline.execute(*cmd, **docker_config)

    index_vcf(out_file, docker_config, index_file=vcf_index_file)
    index_bcf(out_file, docker_config, index_file=bcf_index_file)
def picard_wgs_dup(
        input_bam,
        markdups_bam,
        markdups_metrics,
        tempdir,
        ref_genome,
        wgs_metrics,
        picard_wgs_params,
):
    """Mark duplicates and collect WGS metrics for one bam, each step in its
    own picard temp directory.

    :param input_bam: input bam.
    :param markdups_bam: duplicate-marked bam output.
    :param markdups_metrics: markdups metrics output.
    :param tempdir: scratch directory (markdups/ and wgs/ created inside).
    :param ref_genome: reference fasta.
    :param wgs_metrics: WGS metrics output.
    :param picard_wgs_params: params forwarded to CollectWgsMetrics.
    """
    markdups_dir = os.path.join(tempdir, 'markdups')
    helpers.makedirs(markdups_dir)
    picardutils.bam_markdups(
        input_bam, markdups_bam, markdups_metrics, markdups_dir)

    wgs_dir = os.path.join(tempdir, 'wgs')
    helpers.makedirs(wgs_dir)
    picardutils.bam_collect_wgs_metrics(
        input_bam, ref_genome, wgs_metrics, picard_wgs_params, wgs_dir)
def organism_filter(fastq_r1, fastq_r2, filtered_fastq_r1, filtered_fastq_r2,
                    detailed_metrics, summary_metrics, tempdir, cell_id,
                    params):
    """Tag read pairs with fastq_screen organism hits, write count metrics,
    and emit filtered fastqs.

    :param fastq_r1: input R1 fastq.
    :param fastq_r2: input R2 fastq.
    :param filtered_fastq_r1: filtered R1 fastq output.
    :param filtered_fastq_r2: filtered R2 fastq output.
    :param detailed_metrics: per-flag count metrics output.
    :param summary_metrics: summary count metrics output.
    :param tempdir: scratch directory (recreated on every run).
    :param cell_id: cell id recorded in the metrics.
    :param params: fastq_screen / filter parameters.
    """
    # fastq screen tries to skip if files from old runs are available,
    # so always start from a clean temp directory
    if os.path.exists(tempdir):
        shutil.rmtree(tempdir)
    helpers.makedirs(tempdir)

    tagged_fastq_r1, tagged_fastq_r2 = run_fastq_screen_paired_end(
        fastq_r1, fastq_r2, tempdir, params)

    reader = fastqutils.PairedTaggedFastqReader(tagged_fastq_r1, tagged_fastq_r2)
    counts = reader.gather_counts()

    write_detailed_counts(counts, detailed_metrics, cell_id, params)
    write_summary_counts(counts, summary_metrics, cell_id, params)

    utils.filter_tag_reads(
        tagged_fastq_r1, tagged_fastq_r2,
        filtered_fastq_r1, filtered_fastq_r2,
        params)
def write_svtyper_annotations(csv, output_paths, tempdir):
    """
    Write each svtyper annotation contained in the input csv to its own
    finalized csv file.

    :param csv: csv file containing annotations as features.
    :param output_paths: dict mapping annotation name to output path.
    :param tempdir: scratch directory for intermediate per-annotation csvs.
    """
    helpers.makedirs(tempdir)

    # svtyper annotation fields, one output file per entry
    annotations = [
        "AO", "AP", "AS", "ASC", "DP", "GQ", "QA",
        "QR", "RO", "RP", "RS", "SQ", "GL", "AB",
    ]

    # keep the parameter name intact; bind the frame to a separate local
    data = pd.read_csv(csv, delimiter=",")

    for annotation in annotations:
        temp_csv = os.path.join(tempdir, '{}.csv.gz'.format(annotation))
        write_svtyper_annotation(annotation, data, temp_csv)
        csvutils.finalize_csv(temp_csv, output_paths[annotation])
def create_hmmcopy_data_tar(infiles, tar_output, tempdir):
    """Unpack each input tar into a key-named subdirectory of tempdir and
    re-bundle everything as one tar.

    :param infiles: dict mapping key to input tar path.
    :param tar_output: combined output tar path.
    :param tempdir: scratch directory used as the tar root.
    """
    helpers.makedirs(tempdir)

    for key, tar_path in infiles.items():
        helpers.extract_tar(tar_path, os.path.join(tempdir, key))

    helpers.make_tarfile(tar_output, tempdir)
def picard_insert_gc_flagstat(input_bam, ref_genome, gc_metrics,
                              gc_metrics_summary, gc_metrics_pdf, tempdir,
                              flagstat_metrics, insert_metrics, insert_pdf,
                              picard_docker=None, samtools_docker=None):
    """Collect flagstat, GC-bias and insert-size metrics for one bam.

    :param input_bam: input bam.
    :param ref_genome: reference fasta.
    :param gc_metrics: GC-bias metrics output.
    :param gc_metrics_summary: GC-bias summary output.
    :param gc_metrics_pdf: GC-bias chart output.
    :param tempdir: scratch directory (gc/ and insert/ created inside).
    :param flagstat_metrics: samtools flagstat output (also feeds picard).
    :param insert_metrics: insert-size metrics output.
    :param insert_pdf: insert-size histogram output.
    :param picard_docker: optional picard container.
    :param samtools_docker: optional samtools container.
    """
    # flagstat first: its output is an input to the insert-metrics step
    bamutils.bam_flagstat(
        input_bam, flagstat_metrics, docker_image=samtools_docker)

    gc_tempdir = os.path.join(tempdir, 'gc')
    helpers.makedirs(gc_tempdir)
    picardutils.bam_collect_gc_metrics(
        input_bam, ref_genome, gc_metrics, gc_metrics_summary,
        gc_metrics_pdf, gc_tempdir, docker_image=picard_docker)

    insert_tempdir = os.path.join(tempdir, 'insert')
    helpers.makedirs(insert_tempdir)
    picardutils.bam_collect_insert_metrics(
        input_bam, flagstat_metrics, insert_metrics, insert_pdf,
        insert_tempdir, docker_image=picard_docker)
def bam_sort(bam_filename, sorted_bam_filename, tempdir, mem="2G"):
    """Coordinate-sort a bam with picard SortSam.

    :param bam_filename: input bam.
    :param sorted_bam_filename: sorted bam output.
    :param tempdir: picard temp directory (created if missing).
    :param mem: JVM heap size for -Xmx/-Xms.
    """
    if not os.path.exists(tempdir):
        makedirs(tempdir)

    cmd = [
        'picard', '-Xmx' + mem, '-Xms' + mem,
        '-XX:ParallelGCThreads=1',
        'SortSam',
        'INPUT=' + bam_filename,
        'OUTPUT=' + sorted_bam_filename,
        'SORT_ORDER=coordinate',
        'VALIDATION_STRINGENCY=LENIENT',
        'TMP_DIR=' + tempdir,
        'MAX_RECORDS_IN_RAM=150000',
        'QUIET=true',
    ]
    pypeliner.commandline.execute(*cmd)
def generate_qc_report(tempdir, reference_gc, fastqscreen_training_data,
                       metrics_df, gc_metrics_df, qc_report,
                       metrics_df_annotated):
    """Classify cells from fastqscreen training data and render the QC html
    report.

    :param tempdir: scratch directory for report generation.
    :param reference_gc: reference GC profile used in the report.
    :param fastqscreen_training_data: training data for the classifier.
    :param metrics_df: input metrics table.
    :param gc_metrics_df: input GC metrics table.
    :param qc_report: output html report path.
    :param metrics_df_annotated: annotated metrics output path.
    """
    helpers.makedirs(tempdir)

    fastqscreen_classify.classify_fastqscreen(
        fastqscreen_training_data, metrics_df, metrics_df_annotated,
        dtypes()['metrics'])

    generate_qc.generate_html_report(
        tempdir, qc_report, reference_gc, metrics_df, gc_metrics_df)
def tar_align_data(infiles, tar_output, tempdir):
    """Copy every alignment output file into tempdir, prefixing each with its
    key, then tar the directory.

    :param infiles: iterable of dicts mapping key to file path.
    :param tar_output: output tar path.
    :param tempdir: scratch directory used as the tar root.
    """
    helpers.makedirs(tempdir)

    for file_group in infiles:
        for key, filepath in file_group.items():
            dest = os.path.join(
                tempdir, '{}_{}'.format(key, os.path.basename(filepath)))
            helpers.copyfile(filepath, dest)

    helpers.make_tarfile(tar_output, tempdir)
def align_pe(fastq1, fastq2, output, reports_dir, tempdir, reference, trim,
             centre, sample_info, cell_id, lane_id, library_id, aligner,
             containers, adapter, adapter2, fastqscreen_detailed_metrics,
             fastqscreen_summary_metrics, fastqscreen_params):
    """Align one paired-end lane: fastq_screen filter, fastqc, optional
    trimming, bwa alignment, sort, and flagstat.

    :param fastq1: input R1 fastq.
    :param fastq2: input R2 fastq.
    :param output: sorted output bam.
    :param reports_dir: directory for fastqc/flagstat reports.
    :param tempdir: scratch directory.
    :param reference: reference genome.
    :param trim: run trimgalore before alignment (bwa-aln only).
    :param centre: sequencing centre for the read group.
    :param sample_info: sample metadata for the read group.
    :param cell_id: cell id.
    :param lane_id: lane id.
    :param library_id: library id.
    :param aligner: aligner name (e.g. "bwa-aln").
    :param containers: dict of docker images by tool name.
    :param adapter: R1 adapter sequence.
    :param adapter2: R2 adapter sequence.
    :param fastqscreen_detailed_metrics: fastq_screen detailed metrics output.
    :param fastqscreen_summary_metrics: fastq_screen summary metrics output.
    :param fastqscreen_params: fastq_screen parameter dict.
    """
    # organism filtering via fastq_screen
    screen_tempdir = os.path.join(tempdir, 'fastq_screen')
    helpers.makedirs(screen_tempdir)
    screened_r1 = os.path.join(screen_tempdir, "fastq_r1.fastq.gz")
    screened_r2 = os.path.join(screen_tempdir, "fastq_r2.fastq.gz")

    fastqscreen.organism_filter(
        fastq1, fastq2, screened_r1, screened_r2,
        fastqscreen_detailed_metrics, fastqscreen_summary_metrics,
        screen_tempdir, cell_id, fastqscreen_params, reference,
        docker_image=containers['fastq_screen'],
        filter_contaminated_reads=fastqscreen_params[
            'filter_contaminated_reads'],
    )

    readgroup = get_readgroup(
        lane_id, cell_id, library_id, centre, sample_info)

    run_fastqc(screened_r1, screened_r2, reports_dir, tempdir, containers)

    unsorted_bam = os.path.join(tempdir, "temp_alignments.bam")

    # trimming is only applied for bwa-aln when requested
    if aligner == "bwa-aln" and trim:
        screened_r1, screened_r2 = trim_fastqs(
            screened_r1, screened_r2, cell_id, tempdir,
            adapter, adapter2, containers['trimgalore'])

    align_pe_with_bwa(
        screened_r1, screened_r2, unsorted_bam, reference, readgroup,
        tempdir, containers, aligner=aligner)

    picardutils.bam_sort(
        unsorted_bam, output, tempdir, docker_image=containers['picard'])

    flagstat_out = os.path.join(reports_dir, 'flagstat_metrics.txt')
    bamutils.bam_flagstat(
        output, flagstat_out, docker_image=containers['samtools'])