def run_pipeline(self, pipeline_args, pipeline_config):
    # Instantiate options
    bam = pipeline_args['bam']
    output_dir = pipeline_args['output']
    logs_dir = os.path.join(output_dir, 'logs')
    cufflinks_lib_type = pipeline_args['cufflinks_lib_type']
    htseq_stranded = pipeline_args['htseq_stranded']

    # Create output, tmp, and logs directories
    tmp_dir = os.path.join(output_dir, 'tmp')
    subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir])

    # Keep list of items to delete
    staging_delete = [os.path.join(output_dir, 'tmp')]

    # Establish Software instances
    cufflinks = Software('Cufflinks', pipeline_config['cufflinks']['path'])
    htseq = Software('HTSeq', pipeline_config['htseq']['path'])

    cufflinks_output_dir = os.path.join(output_dir, 'cufflinks')
    subprocess.call(['mkdir', '-p', cufflinks_output_dir])
    cufflinks.run(
        Parameter('--GTF', pipeline_config['cufflinks']['transcriptome-gtf']),
        Parameter('-p', pipeline_config['cufflinks']['threads']),
        Parameter('--library-type', cufflinks_lib_type),
        Parameter('--upper-quartile-norm'),
        Parameter('-o', cufflinks_output_dir),
        Parameter('--max-bundle-frags', '1000000000'),
        Parameter(bam)
    )

    htseq_output_dir = os.path.join(output_dir, 'htseq')
    subprocess.call(['mkdir', '-p', htseq_output_dir])
    for id_attr in ['gene_id', 'gene_name']:
        for feature_type in ['gene', 'transcript', 'exon']:
            htseq.run(
                Parameter('-f', 'bam'),
                Parameter('-r', 'name'),
                Parameter('-s', htseq_stranded),
                Parameter('-t', feature_type),
                Parameter('-i', id_attr),
                Parameter(bam),
                Parameter(pipeline_config['htseq']['transcriptome-gtf']),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(htseq_output_dir,
                                           '{}.{}.counts'.format(feature_type, id_attr)))
            )

    # Delete temporary files
    for delete_file in staging_delete:
        subprocess.call(['rm', '-rf', delete_file])
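# The Software / Parameter / Redirect / Pipe classes used throughout these
# run_pipeline() methods come from the pipeline framework this repository is
# built on; their imports are not shown in this section. The hypothetical
# stand-in below only illustrates how each .run() call composes into a single
# shell command (parameters joined by spaces, redirects appended at the end);
# it is not the framework's actual implementation, and Pipe/.pipe() (used in
# the ATAC-seq and ribosome-profiling pipelines further down) is omitted.
import subprocess


class Parameter(object):
    """One or more whitespace-joined command-line tokens."""

    def __init__(self, *args):
        self.tokens = [str(arg) for arg in args]

    def __str__(self):
        return ' '.join(self.tokens)


class Redirect(object):
    """Stream redirection appended to the end of the command."""
    STDOUT, STDERR, BOTH = '1>', '2>', '&>'

    def __init__(self, stream='1>', dest='out.log'):
        self.stream, self.dest = stream, dest

    def __str__(self):
        return '{} {}'.format(self.stream, self.dest)


class Software(object):
    """Wraps an executable path; .run() joins its arguments and shells out."""

    def __init__(self, name, path):
        self.name, self.path = name, path

    def run(self, *args):
        cmd = '{} {}'.format(self.path, ' '.join(str(arg) for arg in args))
        subprocess.call(cmd, shell=True, executable='/bin/bash')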
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate options reads = pipeline_args['reads'] output_dir = pipeline_args['output'] logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = pipeline_args['step'] forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] run_is_stranded = pipeline_args['is_stranded'] # Determine if run is paired-end from input run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1 # Create output, tmp, and logs directories subprocess.call([ 'mkdir', '-p', output_dir, logs_dir, os.path.join(output_dir, 'tmp') ]) # Timing functions for getting running time start_time = datetime.now() # Gather QC data qc_data = { 'total_raw_reads_counts': [], 'trimmed_reads_counts': [], 'num_reads_mapped': '0', 'running_time_seconds': '', 'running_time_readable': '' } # Keep list of items to delete staging_delete = [os.path.join(output_dir, 'tmp')] # Establish software instances cat = Software('cat', '/bin/cat') cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) star = Software('STAR', pipeline_config['STAR']['path']) rsem_calculate_expression = Software( 'RSEM', pipeline_config['RSEM']['path-calculate-expression']) rsem_plot_model = Software('RSEM', pipeline_config['RSEM']['path-plot-model']) bedGraph_to_bw = Software('bedGraphToBigWig', pipeline_config['bedgraph_to_bw']['path']) bed_sort = Software('bedSort', pipeline_config['bedSort']['path']) samtools_flagstat = Software( 'samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat') # Step 1: If more than one reads pairs are provided, combine them if step <= 1 and len(reads) >= 2: if run_is_paired_end: # Aggregate read1s and read2s read1s, read2s = [], [] for reads_set in reads: read1, read2 = reads_set.split(':') read1s.append(read1) read2s.append(read2) # Combine reads groups combined_reads = [] for name, reads_group in [('read1', read1s), ('read2', read2s)]: combined_read_filename = os.path.join( output_dir, '{}.combined.{}.fastq.gz'.format(lib_prefix, name)) combined_reads.append(combined_read_filename) staging_delete.append(combined_read_filename) cat.run( Parameter(*[read for read in reads_group]), Redirect(stream=Redirect.STDOUT, dest=combined_read_filename)) # Update reads list reads = [':'.join(combined_reads)] else: # Combine reads combined_read_filename = os.path.join( output_dir, '{}.combined.fastq.gz'.format(lib_prefix)) staging_delete.append(combined_read_filename) cat.run( Parameter(*[read for read in reads]), Redirect(stream=Redirect.STDOUT, dest=combined_read_filename)) # Update reads list reads = [combined_read_filename] # Step 2: Trim adapters with cutadapt if step <= 2: reads_set = reads[FIRST_READS_PAIR] if run_is_paired_end: # Get paired-end reads, construct new filenames read1, read2 = reads_set.split(':') # QC: Get raw fastq read counts qc_data['total_raw_reads_counts'].extend([ str(int(self.count_gzipped_lines(read1)) / 4), str(int(self.count_gzipped_lines(read2)) / 4) ]) trimmed_read1_filename = os.path.join( output_dir, lib_prefix + '_read1.trimmed.fastq.gz') trimmed_read2_filename = os.path.join( output_dir, lib_prefix + '_read2.trimmed.fastq.gz') staging_delete.append(trimmed_read1_filename) staging_delete.append(trimmed_read2_filename) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format( pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read1_filename)), Parameter( 
'--paired-output={}'.format(trimmed_read2_filename)), Parameter('-a', forward_adapter), Parameter('-A', reverse_adapter), Parameter('-q', '30'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))) # QC: Get trimmed fastq read counts qc_data['trimmed_reads_counts'].extend([ str( int(self.count_gzipped_lines(trimmed_read1_filename)) / 4), str( int(self.count_gzipped_lines(trimmed_read2_filename)) / 4) ]) # Update reads list reads = ':'.join( [trimmed_read1_filename, trimmed_read2_filename]) else: # QC: Get raw fastq read count qc_data['total_raw_reads_counts'].append( str( int( self.count_gzipped_lines( os.path.join( output_dir, '{}.combined.fastq.gz'.format( lib_prefix)))) / 4)) # Construct new filename trimmed_read_filename = os.path.join( output_dir, lib_prefix + '.trimmed.fastq.gz') staging_delete.append(trimmed_read_filename) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format( pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read_filename)), Parameter('-a', forward_adapter), Parameter('-q', '30'), Parameter(reads[FIRST_READS_PAIR]), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary'))) # QC: Get trimmed fastq read count qc_data['trimmed_reads_counts'].append( str( int(self.count_gzipped_lines(trimmed_read_filename)) / 4)) # Update reads list reads = [trimmed_read_filename] # Step 3: Alignment if step <= 3: # Gets reads for paired-end and single-end if run_is_paired_end: read1, read2 = reads.split(':') else: read1 = reads[FIRST_READS_PAIR] read2 = '' # Set up STAR parameters star_outfile_prefix = os.path.join( output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '')) star_common = [ Parameter('--outFileNamePrefix', star_outfile_prefix), Parameter('--genomeDir', pipeline_config['STAR']['genome-dir']), Parameter('--readFilesIn', read1, read2), Parameter('--readFilesCommand', 'zcat'), Parameter('--outFilterType', 'BySJout'), Parameter('--outFilterMultimapNmax', '20'), Parameter('--alignSJoverhangMin', '8'), Parameter('--alignSJDBoverhangMin', '1'), Parameter('--outFilterMismatchNmax', '999'), Parameter('--alignIntronMin', '20'), Parameter('--alignIntronMax', '1000000'), Parameter('--alignMatesGapMax', '1000000'), Parameter('--outSAMunmapped', 'Within'), Parameter('--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD'), Parameter('--outFilterMismatchNoverReadLmax', '0.04'), Parameter('--sjdbScore', '1') ] star_run = [ Parameter('--runThreadN', pipeline_config['STAR']['threads']), #Parameter('--genomeLoad', 'LoadAndKeep'), #Parameter('--limitBAMsortRAM', '10000000000') ] star_bam = [ Parameter('--outSAMtype', 'BAM', 'SortedByCoordinate'), Parameter('--quantMode', 'TranscriptomeSAM') ] star_strand, star_wig = [], [] # STAR strandedness parameters if run_is_stranded: star_wig.append(Parameter('--outWigStrand', 'Stranded')) else: star_strand.append( Parameter('--outSAMstrandField', 'intronMotif')) star_wig.append(Parameter('--outWigStrand', 'Unstranded')) star_meta = [] # Run STAR alignment step star.run(*(star_common + star_run + star_bam + star_strand + star_meta)) # Store STAR output files star_output_bam = star_outfile_prefix + 'Aligned.sortedByCoord.out.bam' # QC: Get samtools flagstat samtools_flagstat.run( Parameter(star_output_bam), Redirect(stream=Redirect.STDOUT, dest=star_output_bam + '.flagstat')) # QC: Get number of mapped reads from this BAM with open(star_output_bam + '.flagstat') as flagstats: 
flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents) if target_line is not None: qc_data['num_reads_mapped'] = str( int(target_line.group(1)) / 2) # Generate bedGraph signal_output_dir = os.path.join(output_dir, 'signal') subprocess.call(['mkdir', '-p', signal_output_dir]) signal_output_prefix = os.path.join( signal_output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '')) # Run STAR for signal generation star.run(Parameter('--runMode', 'inputAlignmentsFromBAM'), Parameter('--inputBAMfile', star_output_bam), Parameter('--outWigType', 'bedGraph'), Parameter('--outFileNamePrefix', signal_output_prefix), Parameter('--outWigReferencesPrefix', 'chr'), *star_wig) # Convert bedGraph to bigWig chrNL_txt = os.path.join(output_dir, 'chrNL.txt') with open(chrNL_txt, 'w') as chrNL_filehandle: subprocess.call([ 'grep', '^chr', os.path.join(pipeline_config['STAR']['genome-dir'], 'chrNameLength.txt') ], stdout=chrNL_filehandle) # Generate temporary signal file path sig_tmp = os.path.join(output_dir, 'sig.tmp') staging_delete.append(sig_tmp) if run_is_stranded: strand = [None, '-', '+'] for i_strand in [1, 2]: for i_mult in ['Unique', 'UniqueMultiple']: # Get signal file for this iteration signal_file = '{}Signal.{}.str{}.out.bg'.format( signal_output_prefix, i_mult, str(i_strand)) # Write to temporary signal file with open(sig_tmp, 'w') as sig_tmp_filehandle: subprocess.call(['grep', '^chr', signal_file], stdout=sig_tmp_filehandle) # Sort sig.tmp with bedSort bed_sort.run(Parameter(sig_tmp), Parameter(sig_tmp)) # Run bedGraph to bigWig conversion bedGraph_to_bw.run( Parameter(sig_tmp), Parameter(chrNL_txt), Parameter('{}Signal.{}.strand{}.bw'.format( signal_output_prefix, i_mult, strand[i_strand]))) else: for i_mult in ['Unique', 'UniqueMultiple']: # Get signal file for this iteration signal_file = '{}Signal.{}.str1.out.bg'.format( signal_output_prefix, i_mult) # Write to temporary signal file with open(sig_tmp, 'w') as sig_tmp_filehandle: subprocess.call(['grep', '^chr', signal_file], stdout=sig_tmp_filehandle) # Sort sig.tmp with bedSort bed_sort.run(Parameter(sig_tmp), Parameter(sig_tmp)) # Run bedGraph to bigWig conversion bedGraph_to_bw.run( Parameter(sig_tmp), Parameter(chrNL_txt), Parameter('{}Signal.{}.unstranded.bw'.format( signal_output_prefix, i_mult))) # Step 4: Sort transcriptome BAM to ensure order of reads to make RSEM output deterministic if step <= 4: # Set BAM file paths, mv transcriptome BAM to temporary name star_outfile_prefix = os.path.join( output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' 
else '')) transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam' tr_bam = star_outfile_prefix + 'Tr.bam' staging_delete.append(tr_bam) subprocess.call(['mv', transcriptome_bam, tr_bam]) # Template command merge_cmd = 'cat <({input1}) <({input2}) | {compress} > {output}' input1_cmd = '{samtools} view -H {bam}' compress_cmd = 'samtools view -@ {threads} -bS -' if run_is_paired_end: input2_cmd = ( '{samtools} view -@ {threads} {bam} | ' + 'awk \'{{printf "%s", $0 " "; getline; print}}\' | ' + 'sort -S {ram} -T {tmpdir} | ' + 'tr \' \' \'\\n\'') else: input2_cmd = ('{samtools} view -@ {threads} {bam} | ' + 'sort -S {ram} -T {tmpdir}') print merge_cmd.format( input1=input1_cmd.format( samtools=pipeline_config['samtools']['path'], bam=tr_bam), input2=input2_cmd.format( samtools=pipeline_config['samtools']['path'], threads=pipeline_config['RSEM']['threads'], bam=tr_bam, ram=pipeline_config['sort']['memory'], tmpdir=os.path.join(output_dir, 'tmp')), compress=compress_cmd.format( threads=pipeline_config['RSEM']['threads']), output=transcriptome_bam) subprocess.call(merge_cmd.format( input1=input1_cmd.format( samtools=pipeline_config['samtools']['path'], bam=tr_bam), input2=input2_cmd.format( samtools=pipeline_config['samtools']['path'], threads=pipeline_config['RSEM']['threads'], bam=tr_bam, ram=pipeline_config['sort']['memory'], tmpdir=os.path.join(output_dir, 'tmp')), compress=compress_cmd.format( threads=pipeline_config['RSEM']['threads']), output=transcriptome_bam), shell=True, executable='/bin/bash') subprocess.call(['rm', tr_bam]) # Step 5: Run RSEM to get quantification if step <= 5: star_outfile_prefix = os.path.join( output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '')) transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam' # Set up RSEM parameters rsem_common = [ Parameter('--bam'), Parameter('--estimate-rspd'), Parameter('--calc-ci'), Parameter('--no-bam-output'), Parameter('--seed', '12345') ] rsem_run = [ Parameter('-p', pipeline_config['RSEM']['threads']), Parameter('--ci-memory', pipeline_config['RSEM']['memory']) ] rsem_type = [] if run_is_paired_end: rsem_type.append(Parameter('--paired-end')) if run_is_stranded: rsem_type.append(Parameter('--forward-prob', '0')) # Run RSEM quantification step rsem_calculate_expression.run( *(rsem_common + rsem_run + rsem_type + [ Parameter(transcriptome_bam), Parameter(pipeline_config['RSEM']['reference-dir']), Parameter(os.path.join(output_dir, 'RSEM_Quant')), Redirect(Redirect.BOTH, dest=os.path.join(logs_dir, 'Log.rsem')) ])) # Generate RSEM plot model rsem_plot_model.run( Parameter(os.path.join(output_dir, 'RSEM_Quant'), os.path.join(output_dir, 'Quant.pdf'))) # QC: Get time delta elapsed_time = datetime.now() - start_time qc_data['running_time_seconds'] = str(elapsed_time.seconds) qc_data['running_time_readable'] = str(elapsed_time) # QC: Output QC data to file with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file: qc_data_file.write(json.dumps(qc_data, indent=4) + '\n') # Delete temporary files for delete_file in staging_delete: subprocess.call(['rm', '-rf', delete_file]) print 'Complete' print 'Elapsed time: {}'.format(str(elapsed_time)) print 'Elapsed time seconds: {}'.format(str(elapsed_time.seconds))
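# count_gzipped_lines() is called by several pipelines in this file but is not
# defined in this section. A minimal sketch of such a helper (a method on the
# pipeline class), assuming it simply returns the number of lines in a gzipped
# FASTQ; callers divide by 4 to convert lines to reads and wrap the result in
# int(), so returning an int is compatible with how it is used above.
import gzip


def count_gzipped_lines(self, gzipped_filepath):
    """Return the number of lines in a gzipped text file."""
    num_lines = 0
    with gzip.open(gzipped_filepath, 'rb') as gzipped_file:
        for _ in gzipped_file:
            num_lines += 1
    return num_lines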
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate options reads = pipeline_args['reads'] output_dir = pipeline_args['output'] logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = pipeline_args['step'] forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] run_is_stranded = pipeline_args['is_stranded'] # Determine if run is paired-end from input run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1 # Create output, tmp, and logs directories tmp_dir = os.path.join(output_dir, 'tmp') subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir]) # Keep list of items to delete staging_delete = [os.path.join(output_dir, 'tmp')] qc_metrics = { 'total_raw_reads': [], 'total_trimmed_reads': [], 'percent_num_reads_mapped_genome': [], 'percent_num_reads_mapped_transcriptome': [], 'percent_duplicate_reads': '0', 'num_reads_multimapped': [], 'percent_num_reads_rrna': '', 'viral_rna': [] } synapse_metadata = { 'Assay': 'RNAseq', 'Individual_ID': '', 'Sample_ID': '', 'File_Name': '', 'BrodmannArea': '', 'BrainRegion': '', 'Hemisphere': '', 'CellType': 'NA', 'TissueState': '', 'RNAIsolationBatch': '', 'RIN': '', 'LibraryBatch': '', 'LibraryPrep': 'stranded, rRNA depletion', 'LibraryKit': 'Illumina RS-122-2301', 'ERCC_Added': '', 'RunType': 'paired-end', 'ReadLength': '100bp', 'FlowcellBatch': '', 'SequencingPlatform': '', 'TotalReads': '', 'MappedReads_Primary': '0', 'MappedReads_Multimapped': '0', 'rRNARate': '0', 'Notes': '' } # Establish Software instances cutadapt = Software('Cutadapt', pipeline_config['cutadapt']['path']) fastqc = Software('FastQC', pipeline_config['fastqc']['path']) star = Software('STAR Two-Pass', pipeline_config['STAR']['path']) novosort = Software('Novosort', pipeline_config['novosort']['path']) samtools_flagstat = Software( 'Samtools Flagstat', pipeline_config['samtools']['path'] + ' flagstat') samtools_index = Software( 'Samtools Index', pipeline_config['samtools']['path'] + ' index') samtools_faidx = Software( 'Samtools Faidx', pipeline_config['samtools']['path'] + ' faidx') picard_markduplicates = Software( 'Picard MarkDuplicates', 'java -Xmx{heap_size}g -jar {path} MarkDuplicates'.format( heap_size=pipeline_config['picard'].get( 'heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['picard']['path'])) picard_create_seq_dict = Software( 'Picard CreateSequenceDictionary', 'java -Xmx{heap_size}g -jar {path} CreateSequenceDictionary'. 
format(heap_size=pipeline_config['picard'].get( 'heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['picard']['path'])) rnaseqc = Software( 'RNAseQC', 'java -Xmx{heap_size}g -jar {path}'.format( heap_size=pipeline_config['picard'].get( 'heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['RNAseQC']['path'])) picard_add_read_groups = Software( 'Picard AddOrReplaceReadGroups', 'java -Xmx{heap_size}g -jar {path} AddOrReplaceReadGroups'.format( heap_size=pipeline_config['picard'].get( 'heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['picard']['path'])) bedtools_coverage = Software( 'Bedtools Coverage', pipeline_config['bedtools']['path'] + ' coverage') bedtools_bamtobed = Software( 'Bedtools Bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed') # Housekeeping star_output = [] novosort_outfile = '' # Step 1: Trimming | Cutadapt if step <= 1: for i, read in enumerate(reads): if run_is_paired_end: # Get paired-end reads, construct new filenames read1, read2 = read.split(':') # QC: Get raw fastq read counts qc_metrics['total_raw_reads'].append([ str(int(self.count_gzipped_lines(read1)) / 4), str(int(self.count_gzipped_lines(read2)) / 4) ]) trimmed_read1_filename = os.path.join( output_dir, lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i)) trimmed_read2_filename = os.path.join( output_dir, lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i)) staging_delete.extend( [trimmed_read1_filename, trimmed_read2_filename]) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format( pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter( '--output={}'.format(trimmed_read1_filename)), Parameter('--paired-output={}'.format( trimmed_read2_filename)), Parameter('-a', forward_adapter), Parameter('-A', reverse_adapter), Parameter('-q', '30'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary'))) # QC: Get trimmed fastq read counts qc_metrics['total_trimmed_reads'].append([ str( int( self.count_gzipped_lines( trimmed_read1_filename)) / 4), str( int( self.count_gzipped_lines( trimmed_read2_filename)) / 4) ]) # Update reads list reads[i] = ':'.join( [trimmed_read1_filename, trimmed_read2_filename]) else: # QC: Get raw fastq read counts qc_metrics['total_raw_reads'].append( [str(int(self.count_gzipped_lines(read)) / 4)]) # Construct new filename trimmed_read_filename = os.path.join( output_dir, lib_prefix + '_{}.trimmed.fastq.gz'.format(i)) staging_delete.append(trimmed_read_filename) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format( pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read_filename)), Parameter('-a', forward_adapter), Parameter('-q', '30'), Parameter(read), Redirect(stream=Redirect.STDOUT, dest=os.path.join( logs_dir, 'cutadapt.chicago.summary'))) # QC: Get trimmed fastq read counts qc_metrics['total_trimmed_reads'].append([ str( int(self.count_gzipped_lines( trimmed_read_filename)) / 4) ]) # Update reads list reads[i] = trimmed_read_filename # Step 2: FastQC if step <= 2: # Make FastQC directory fastqc_output_dir = os.path.join(output_dir, 'fastqc') subprocess.call(['mkdir', '-p', fastqc_output_dir]) all_fastqs = [] if run_is_paired_end: for read in reads: all_fastqs.extend(read.split(':')) else: all_fastqs.extend(reads) for fastq in all_fastqs: fastqc.run(Parameter('--outdir={}'.format(fastqc_output_dir)), Parameter(fastq)) # Step 3: Alignment | STAR 2-pass, Alignment Stats | 
samtools flagstat if step <= 3: # Set up common STAR parameters star_common = [ Parameter('--runMode', 'alignReads'), Parameter('--twopassMode', 'Basic'), Parameter('--runThreadN', pipeline_config['STAR']['threads']), Parameter('--genomeDir', pipeline_config['STAR']['genome-dir']), Parameter('--readFilesCommand', 'zcat'), Parameter('--quantMode', 'TranscriptomeSAM', 'GeneCounts'), Parameter('--outSAMtype', 'BAM', 'Unsorted'), Parameter('--outFilterType', 'BySJout'), Parameter('--outFilterMultimapNmax', '20'), Parameter('--alignSJoverhangMin', '8'), Parameter('--alignSJDBoverhangMin', '1'), Parameter('--outFilterMismatchNmax', '2'), Parameter('--alignIntronMin', '20'), Parameter('--alignIntronMax', '1000000'), Parameter('--alignMatesGapMax', '1000000'), (Parameter('--outFilterIntronMotifs', 'RemoveNoncanonical') if run_is_stranded else Parameter('--outSAMstrandField', 'intronMotif')) ] # Get STAR output file prefix star_outfile_prefix = os.path.join( output_dir, lib_prefix + ('_' if lib_prefix[-1] != '.' else '') + '{}.') # Align each read or read pair for i, read in enumerate(reads): star_output_bam = star_outfile_prefix.format( i) + 'Aligned.out.bam' star_output_transcriptome_bam = star_outfile_prefix.format( i) + 'Aligned.toTranscriptome.out.bam' star_output.append(star_output_bam) if run_is_paired_end: read1, read2 = read.split(':') star_paired_end = [ Parameter('--readFilesIn', read1, read2), Parameter('--outFileNamePrefix', star_outfile_prefix.format(i)) ] star.run(*(star_common + star_paired_end)) else: star_single_end = [ Parameter('--readFilesIn', read), Parameter('--outFileNamePrefix', star_outfile_prefix.format(i)) ] star.run(*(star_common + star_single_end)) # Get flagstats for both alignments samtools_flagstat.run( Parameter(star_output_bam), Redirect(stream=Redirect.STDOUT, dest=star_output_bam + '.flagstat')) samtools_flagstat.run( Parameter(star_output_transcriptome_bam), Redirect(stream=Redirect.STDOUT, dest=star_output_transcriptome_bam + '.flagstat')) # QC: Get number of mapped reads to the genome from this BAM try: with open(star_output_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() # Pull out mapped reads target_line = re.search( r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', flagstats_contents) if target_line is not None: num_mapped = int(target_line.group(1)) qc_metrics[ 'percent_num_reads_mapped_genome'].append([ str(num_mapped / 2), '{}%'.format(target_line.group(2)) ]) num_secondary = int( re.search(r'(\d+) \+ \d+ secondary', flagstats_contents).group(1)) num_supplementary = int( re.search(r'(\d+) \+ \d+ supplementary', flagstats_contents).group(1)) synapse_metadata['MappedReads_Primary'] = str( int(synapse_metadata['MappedReads_Primary']) + num_mapped - num_secondary - num_supplementary) synapse_metadata['MappedReads_Multimapped'] = str( int(synapse_metadata['MappedReads_Multimapped'] ) + num_secondary) else: qc_metrics[ 'percent_num_reads_mapped_genome'].append('0') # Pull out multimapped reads target_line = re.search(r'(\d+) \+ \d+ secondary', flagstats_contents) if target_line is not None: qc_metrics['num_reads_multimapped'].append( str(int(target_line.group(1)) / 2)) else: qc_metrics['num_reads_multimapped'].append('0') except: qc_metrics['percent_num_reads_mapped_genome'].append( 'Could not open flagstats for {}'.format( star_output_bam)) qc_metrics['num_reads_multimapped'].append( 'Could not open flagstats for {}'.format( star_output_bam)) # QC: Get number of mapped reads to the transcriptome from this BAM try: with 
open(star_output_transcriptome_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() target_line = re.search( r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', flagstats_contents) if target_line is not None: qc_metrics[ 'percent_num_reads_mapped_transcriptome'].append( [ str(int(target_line.group(1)) / 2), '{}%'.format(target_line.group(2)) ]) else: qc_metrics[ 'percent_num_reads_mapped_transcriptome'].append( '0') except: qc_metrics[ 'percent_num_reads_mapped_transcriptome'].append( 'Could not open flagstats for {}'.format( star_output_bam)) # Step 4: BAM Merge | Novosort if step <= 4: # Novosort to sort and merge BAM files novosort_outfile = os.path.join( output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '') + 'merged.Aligned.out.bam') novosort.run( Parameter('--tmpdir', os.path.join(output_dir, 'tmp')), Parameter(*[bam for bam in star_output]), Redirect(stream=Redirect.STDOUT, dest=novosort_outfile)) """ The step below was commented out on 27 June 2016. It was taking up large amounts of memory, more than Beagle could handle, and some samples were consistently failing as a result. I think RNAseQC does this step anyway, I only left it in because I figured it wasn't doing any harm. Well now it is, so it's gone. """ # QC: Get number of reads mapped to rRNA regions # aligned_bed_file = os.path.join(output_dir, str(uuid.uuid4()) + '.bed') # coverage_file = os.path.join(output_dir, str(uuid.uuid4()) + '.coverage.bed') # staging_delete.extend([aligned_bed_file, coverage_file]) # # bedtools_bamtobed.run( # Parameter('-i', novosort_outfile), # Redirect(stream=Redirect.STDOUT, dest=aligned_bed_file) # ) # bedtools_coverage.run( # Parameter('-s'), # Parameter('-counts'), # Parameter('-a', pipeline_config['qc']['rRNA-bed']), # Parameter('-b', aligned_bed_file), # Redirect(stream=Redirect.STDOUT, dest=coverage_file) # ) # try: # rRNA_count = 0 # with open(coverage_file) as coverage: # for line in coverage: # rRNA_count += int(line.strip().split('\t')[6]) # percent_rRNA = (rRNA_count / # float(sum([int(aln[MAPPED_READS_COUNT]) # for aln # in qc_metrics['percent_num_reads_mapped_transcriptome']])) # ) # qc_metrics['percent_num_reads_rrna'] = [str(rRNA_count), str(percent_rRNA)] # synapse_metadata['rRNARate'] = str(percent_rRNA) # except Exception as e: # qc_metrics['percent_num_reads_rrna'] = ['error', 'error', e.message] # Prepare genome fasta for RNAseQC genome_fa = pipeline_config['qc']['genome-fa'] genome_fai = genome_fa + '.fai' genome_dict = os.path.splitext(genome_fa)[0] + '.dict' if not os.path.isfile(genome_fai): samtools_faidx.run(Parameter(genome_fa)) if not os.path.isfile(genome_dict): picard_create_seq_dict.run( Parameter('REFERENCE={}'.format(genome_fa)), Parameter('OUTPUT={}'.format(genome_dict))) # Add read group to alignment file read_group_bam = os.path.join(output_dir, 'readgroup.bam') staging_delete.append(read_group_bam) picard_add_read_groups.run( Parameter('INPUT={}'.format(novosort_outfile)), Parameter('OUTPUT={}'.format(read_group_bam)), Parameter('RGLB={}'.format(lib_prefix)), Parameter('RGPL=Illumina'), Parameter('RGPU=1'), Parameter('RGSM=Sample')) # Generate BAM index for RNAseQC samtools_index.run(Parameter(read_group_bam)) staging_delete.append(read_group_bam + '.bai') # QC: Get RNAseQC output rnaseqc_output_dir = os.path.join(output_dir, 'RNAseQC') subprocess.call(['mkdir', '-p', rnaseqc_output_dir]) rnaseqc.run( Parameter('-o', rnaseqc_output_dir), Parameter('-r', genome_fa), Parameter('-t', pipeline_config['cufflinks']['transcriptome-gtf']), Parameter( 
'-s', '"{sample_id}|{bam_file}|{notes}"'.format( sample_id=lib_prefix, bam_file=read_group_bam, notes='None')), Parameter('-singleEnd') if not run_is_paired_end else Parameter()) # Picard MarkDuplicates to get duplicates metrics markduplicates_outfile = os.path.join( output_dir, '{}.processed.bam'.format(lib_prefix)) markduplicates_metrics_filepath = os.path.join( logs_dir, 'mark_dup.metrics') picard_markduplicates.run( Parameter('INPUT={}'.format(novosort_outfile)), Parameter('OUTPUT={}'.format(markduplicates_outfile)), Parameter('TMP_DIR={}'.format(tmp_dir)), Parameter( 'METRICS_FILE={}'.format(markduplicates_metrics_filepath)), Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))) # QC: Get percent duplicates try: with open(markduplicates_metrics_filepath) as markdup_metrics: for line in markdup_metrics: if line[FIRST_CHAR] == '#': continue record = line.strip().split('\t') if len(record) == 9: if re.match(r'\d\.\d+', record[7]) is not None: qc_metrics['percent_duplicate_reads'] = record[ 7] except Exception as e: qc_metrics['percent_duplicate_reads'] = [ 'Could not open MarkDuplicates metrics', e.message ] # Write out QC metrics to file with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file: qc_data_file.write(json.dumps(qc_metrics, indent=4) + '\n') # Populate Synapse QC matrix if re.match(r'\d{4}-\d{4}', lib_prefix.strip()) is not None: synapse_metadata['Individual_ID'] = lib_prefix synapse_metadata[ 'File_Name'] = 'PEC_BrainGVEX_UIC-UChicago_FC_mRNA_HiSeq2000_{}'.format( lib_prefix) re_raw_filename = re.match( r'\d{4}-\d{4}_.+_(.+)_.+_(.+_\d)_\d_sequence\.txt\.gz', os.path.basename(pipeline_args['reads'][0].split(':')[0])) if re_raw_filename is not None: sequencing_inst_name = re_raw_filename.group(1) if '673' in sequencing_inst_name or '484' in sequencing_inst_name: synapse_metadata['SequencingPlatform'] = 'HiSeq2000' elif '1070' in sequencing_inst_name: synapse_metadata['SequencingPlatform'] = 'HiSeq2500' flowcell_batch = re_raw_filename.group(2) synapse_metadata['FlowcellBatch'] = flowcell_batch total_raw_reads_end1 = sum( [int(count[0]) for count in qc_metrics['total_raw_reads']]) / 4 synapse_metadata['TotalReads'] = str(total_raw_reads_end1) # Write out Synapse metadata with open(os.path.join(logs_dir, 'synapse_metadata.txt'), 'w') as synapse_metadata_file: synapse_metadata_file.write( json.dumps(synapse_metadata, indent=4) + '\n') # Delete temporary files for delete_file in staging_delete: subprocess.call(['rm', '-rf', delete_file])
def run_pipeline(self, pipeline_args, pipeline_config):
    # create variables from parser if wanted
    bamFiles = pipeline_args['bam:lib']
    outputDir = pipeline_args['output']

    # Create output directory
    subprocess.call(['mkdir', outputDir])

    # Software
    picard = Software('picard', pipeline_config['picard']['path'])

    # Change these to just be done in python script?
    # Common software tools
    awk = Software('awk', 'awk')
    sort = Software('sort', 'sort')
    uniq = Software('uniq', 'uniq')
    paste = Software('paste', 'paste')
    cat = Software('cat', 'cat')
    grep = Software('grep', 'grep')

    # Directories and Files
    pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']

    # Keep track of Bids in pipeline
    # bid_list = []
    # bam_list = []
    # for bamLib in bamFiles:
    #     bid_list.append(bamLib.split(':')[1])
    #     bam_list.append(bamLib.split(':')[0])

    '''
    Picard tools
    java -jar picard.jar CollectMultipleMetrics I=2017-221.uniq_sorted.bam O=multiple_metrics R=GRCh37.p13.genome.fa
    java -jar picard.jar CollectGcBiasMetrics I=.uniq O=gc_bias_metrics.txt CHART=gc_bias_metrics.pdf S=summary_metrics.txt R=reference_sequence.fasta
    java -jar picard.jar CollectRnaSeqMetrics I=input.bam O=output.RNA_Metrics REF_FLAT=ref_flat.txt STRAND=FIRST_READ_TRANSCRIPTION_STRAND
    java -jar picard.jar MarkDuplicates I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt ASSUME_SORTED=true
    '''

    for bamLib in bamFiles:
        bam, bid = bamLib.split(':')
        newDir = new_dir(outputDir, bid)
        subprocess.call(['mkdir', newDir])
        # consider multithreading?
        picard.run(
            Parameter('CollectMultipleMetrics'),
            Parameter('I={}'.format(bam)),  # input
            Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)),  # output
            Parameter('R={}'.format(pathTo_genomeFasta))  # genome reference
        )
        picard.run(
            Parameter('CollectGcBiasMetrics'),
            Parameter('I={}'.format(bam)),  # input
            Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)),  # output
            Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(newDir, bid)),  # chart
            Parameter('S={}/{}.summary_metrics'.format(newDir, bid)),  # summary metrics
            Parameter('R={}'.format(pathTo_genomeFasta))  # genome reference
        )
        picard.run(
            Parameter('CollectRnaSeqMetrics'),
            Parameter('I={}'.format(bam)),  # input
            Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
            Parameter('REF_FLAT={}/{}'.format(newDir, bid)),  # ref_flat
            Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND')  # strandedness
        )
        picard.run(
            Parameter('MarkDuplicates'),
            Parameter('I={}'.format(bam)),  # input
            Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)),  # output
            Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
            Parameter('TMP_DIR={}'.format(newDir)),
            Parameter('ASSUME_SORTED=true'),  # sorted
            Parameter('VALIDATION_STRINGENCY=LENIENT'),
            Redirect(stream=Redirect.BOTH, dest=os.path.join(newDir, 'mark_dup.log'))
        )
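# new_dir() is used by this pipeline (and by the ribosome-profiling pipeline
# further down) but is not defined in this section. A minimal sketch, assuming
# it builds the per-library output path and creates the directory if it does
# not already exist (the ribosome-profiling pipeline writes into it without an
# explicit mkdir):
import os


def new_dir(output_dir, bid):
    """Return (and create, if missing) the per-library subdirectory path."""
    lib_dir = os.path.join(output_dir, bid)
    if not os.path.isdir(lib_dir):
        os.makedirs(lib_dir)
    return lib_dir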
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate variables from argparse read_pairs = pipeline_args['reads'] output_dir = os.path.abspath(pipeline_args['output']) logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = int(pipeline_args['step']) forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] # Create output, tmp, and logs directories tmp_dir = os.path.join(output_dir, 'tmp') subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir]) # Keep list of items to delete staging_delete = [tmp_dir] bwa_bam_outs = [] qc_data = { 'total_raw_reads_counts': [], 'trimmed_reads_counts': [], # TODO Find a better way to store FastQC results 'num_reads_mapped': [], 'percent_duplicate_reads': '0', 'num_unique_reads_mapped': [], # TODO This isn't implemented 'num_mtDNA_reads_mapped': [], # TODO This isn't implemented 'num_reads_mapped_after_filtering': '-1', # TODO This isn't implemented 'num_peaks_called': '-1', # TODO Get number of peaks in annotation sites } # Instantiate software instances cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) fastqc = Software('FastQC', pipeline_config['fastqc']['path']) bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln') bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe') samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view') samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat') samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index') novosort = Software('novosort', pipeline_config['novosort']['path']) picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates') picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics') bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed') bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort') bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge') bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect') homer_maketagdir = Software('HOMER makeTagDirectory', pipeline_config['makeTagDirectory']['path']) homer_findpeaks = Software('HOMER findPeaks', pipeline_config['findPeaks']['path']) homer_pos2bed = Software('HOMER pos2bed', pipeline_config['pos2bed']['path']) if step <= 1: for i, read_pair in enumerate(read_pairs): read1, read2 = read_pair.split(':') # QC: Get raw fastq read counts qc_data['total_raw_reads_counts'].append([ str(int(self.count_gzipped_lines(read1))/4), str(int(self.count_gzipped_lines(read2))/4) ]) trimmed_read1_filename = os.path.join(output_dir, lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i)) trimmed_read2_filename = os.path.join(output_dir, lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i)) cutadapt.run( Parameter('--quality-base=33'), Parameter('--minimum-length=5'), Parameter('-q', '30'), # Minimum quality score Parameter('--output={}'.format(trimmed_read1_filename)), Parameter('--paired-output={}'.format(trimmed_read2_filename)), Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'), Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 
'cutadapt.summary.log')) ) # QC: Get trimmed fastq read counts qc_data['trimmed_reads_counts'].append([ str(int(self.count_gzipped_lines(trimmed_read1_filename))/4), str(int(self.count_gzipped_lines(trimmed_read2_filename))/4) ]) staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename]) read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename]) if step <= 2: # Make FastQC directory fastqc_output_dir = os.path.join(output_dir, 'fastqc') subprocess.call(['mkdir', '-p', fastqc_output_dir]) for i, read_pair in enumerate(read_pairs): for read in read_pair.split(':'): fastqc.run( Parameter('--outdir={}'.format(fastqc_output_dir)), Parameter(read) ) bwa_aln.run( Parameter('-t', pipeline_config['bwa']['threads']), Parameter(pipeline_config['bwa']['index-dir']), Parameter(read), Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read)) ) staging_delete.append('{}.sai'.format(read)) if step <= 3: for i, read_pair in enumerate(read_pairs): read1, read2 = read_pair.split(':') bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i)) bwa_sampe.run( Parameter('-a', '2000'), # Maximum insert size Parameter('-n', '1'), Parameter(pipeline_config['bwa']['index-dir']), Parameter('{}.sai'.format(read1)), Parameter('{}.sai'.format(read2)), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')), Pipe( samtools_view.pipe( Parameter('-hSb'), Parameter('-o', bwa_bam_output), Parameter('-') # Get input from stdin ) ) ) bwa_bam_outs.append(bwa_bam_output) if step <= 4: for i, bwa_bam in enumerate(bwa_bam_outs): samtools_flagstat.run( Parameter(bwa_bam), Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat') ) # QC: Get number of mapped reads from this BAM try: with open(bwa_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents) if target_line is not None: qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2)) else: qc_data['num_reads_mapped'].append('0') except: qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format( bwa_bam + '.flagstat' )) sortmerged_bam = os.path.join(output_dir, '{}.sortmerged.bam'.format(lib_prefix)) steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix)) duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix)) unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix)) unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix)) chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix)) novosort.run( Parameter('--threads', pipeline_config['novosort']['threads']), Parameter('--tmpcompression', '6'), Parameter('--tmpdir', tmp_dir), Parameter(*[bam for bam in bwa_bam_outs]), Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam), Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log')) ) # This creates a dependency on PySam # Removes reads with template length < 38 due to steric hindrence samtools_index.run(Parameter(sortmerged_bam)) sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb') steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb', template=sortmerged_bam_alignmentfile) for read in sortmerged_bam_alignmentfile.fetch(): if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF: steric_filter_bam_alignmentfile.write(read) sortmerged_bam_alignmentfile.close() steric_filter_bam_alignmentfile.close() # Mark and 
remove duplicates markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics') picard_mark_dup.run( Parameter('INPUT={}'.format(steric_filter_bam)), Parameter('OUTPUT={}'.format(duprm_bam)), Parameter('TMP_DIR={}'.format(tmp_dir)), Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)), Parameter('REMOVE_DUPLICATES=true'), Parameter('VALIDATION_STRINGENCY=LENIENT'), Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log')) ) # QC: Get percent duplicates try: with open(markduplicates_metrics_filepath) as markdup_metrics: for line in markdup_metrics: if line[FIRST_CHAR] == '#': continue record = line.strip().split('\t') if len(record) == 9: if re.match(r'\d+', record[7]) is not None: qc_data['percent_duplicate_reads'] = record[7] except: qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics' # Filter down to uniquely mapped reads samtools_view.run( Parameter('-b'), Parameter('-F', '256'), Parameter('-q', '10'), Parameter('-o', unique_bam), Parameter(duprm_bam) ) # Remove unmapped reads samtools_view.run( Parameter('-b'), Parameter('-F', '12'), Parameter('-o', unmappedrm_bam), Parameter(unique_bam) ) # Create BAM index, then remove chrM samtools_index.run( Parameter(unmappedrm_bam) ) # Remove chrM all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in map(str, range(1, 23)) + ['X', 'Y']] samtools_view.run( Parameter('-b'), Parameter('-o', chrmrm_bam), Parameter(unmappedrm_bam), *all_chr ) # Stage delete for temporary files staging_delete.extend([ sortmerged_bam, sortmerged_bam + '.bai', # BAM index file steric_filter_bam, unique_bam, duprm_bam, unmappedrm_bam, unmappedrm_bam + '.bai', # BAM index file chrmrm_bam ]) if step <= 5: # Generate filename for final processed BAM and BED processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix)) unshifted_bed = os.path.join(output_dir, '{}.unshifted.bed'.format(lib_prefix)) processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix)) # staging_delete.append(unshifted_bed) # Generate filename for chrM removed BAM chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix)) # Remove blacklisted genomic regions bedtools_intersect.run( Parameter('-v'), Parameter('-abam', chrmrm_bam), Parameter('-b', pipeline_config['bedtools']['blacklist-bed']), Parameter('-f', '0.5'), Redirect(stream=Redirect.STDOUT, dest=processed_bam) ) # QC: Generate insert size metrics PDF picard_insert_metrics.run( Parameter('INPUT={}'.format(processed_bam)), Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))), Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf'))) ) # Generate index for processed BAM samtools_index.run( Parameter(processed_bam) ) # Convert BAM to BED bedtools_bamtobed.run( Parameter('-i', processed_bam), Redirect(stream=Redirect.STDOUT, dest=unshifted_bed) ) staging_delete.append(unshifted_bed) # Shifting + strand by 4 and - strand by -5, according to # the ATACseq paper # This used to be bedtools shift, but they are fired self.shift_reads( input_bed_filepath=unshifted_bed, output_bed_filepath=processed_bed, log_filepath=os.path.join(logs_dir, 'shift_reads.logs'), genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'], minus_strand_shift=MINUS_STRAND_SHIFT, plus_strand_shift=PLUS_STRAND_SHIFT ) if step <= 6: processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix)) homer_tagdir = os.path.join(output_dir, 
'{}_tagdir'.format(lib_prefix)) unsorted_peaks = os.path.join(output_dir, '{}.unsorted.peaks.bed'.format(lib_prefix)) sorted_peaks = os.path.join(output_dir, '{}.sorted.peaks.bed'.format(lib_prefix)) merged_peaks = os.path.join(output_dir, '{}.peaks.bed'.format(lib_prefix)) # Populate HOMER tag directory homer_maketagdir.run( Parameter(homer_tagdir), Parameter('-format', 'bed'), Parameter(processed_bed), Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'maketagdir.log')) ) # Run HOMER peak calling program homer_findpeaks.run( Parameter(homer_tagdir), Parameter('-fragLength', '0'), Parameter('-fdr', '0.01'), Parameter('-localSize', '50000'), Parameter('-o', 'auto'), Parameter('-style', 'dnase'), Parameter('-size', '150'), Parameter('-minDist', '50'), Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'findpeaks.log')) ) # Convert HOMER peaks file to bed format homer_pos2bed.run( Parameter(os.path.join(homer_tagdir, 'peaks.txt')), Redirect(stream=Redirect.STDOUT, dest=unsorted_peaks), Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'pos2bed.log')) ) # Sort called peaks bed file bedtools_sort.run( Parameter('-i', unsorted_peaks), Redirect(stream=Redirect.STDOUT, dest=sorted_peaks) ) # Merge peaks to create final peaks file bedtools_merge.run( Parameter('-i', sorted_peaks), Redirect(stream=Redirect.STDOUT, dest=merged_peaks) ) # Stage delete for temporary files staging_delete.extend([ unsorted_peaks, sorted_peaks ]) # QC: Output QC data to file with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file: qc_data_file.write(str(qc_data) + '\n') # Delete temporary files for delete_file in staging_delete: subprocess.call(['rm', '-rf', delete_file])
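# self.shift_reads() is called in step 5 above but not defined in this section.
# A minimal sketch, assuming the input is 6-column BED from bedtools bamtobed
# (strand in column 6), that both coordinates of each interval are shifted by
# plus_strand_shift on '+' reads and minus_strand_shift on '-' reads, and that
# shifted intervals are clamped to the chromosome bounds from the genome-sizes
# file ("<chrom>\t<length>" per line):
def shift_reads(self, input_bed_filepath, output_bed_filepath, log_filepath,
                genome_sizes_filepath, minus_strand_shift, plus_strand_shift):
    """Shift BED intervals per strand, clamped to chromosome bounds."""
    chrom_sizes = {}
    with open(genome_sizes_filepath) as genome_sizes:
        for line in genome_sizes:
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                chrom_sizes[fields[0]] = int(fields[1])

    with open(input_bed_filepath) as bed_in, \
            open(output_bed_filepath, 'w') as bed_out, \
            open(log_filepath, 'w') as log:
        for line in bed_in:
            record = line.strip().split('\t')
            chrom, start, end, strand = record[0], int(record[1]), int(record[2]), record[5]
            shift = plus_strand_shift if strand == '+' else minus_strand_shift
            chrom_size = chrom_sizes.get(chrom)
            if chrom_size is None:
                log.write('Skipped interval on unknown chromosome: ' + line)
                continue
            new_start = max(0, start + shift)
            new_end = min(chrom_size, end + shift)
            if new_start >= new_end:
                log.write('Dropped out-of-bounds interval: ' + line)
                continue
            record[1], record[2] = str(new_start), str(new_end)
            bed_out.write('\t'.join(record) + '\n')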
def run_pipeline(self, pipeline_args, pipeline_config): # create variables from parser if wanted bamFiles = pipeline_args['bam:lib'] outputDir = pipeline_args['output'] adapter = pipeline_args['adapter'] numThreads = pipeline_args['threads'] # Create output directory subprocess.call(['mkdir', outputDir]) # Software cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) star = Software('STAR', pipeline_config['STAR']['path']) bedtools = Software('bedtools', pipeline_config['bedtools']['path']) bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path']) samtools = Software('samtools', pipeline_config['samtools']['path']) samtools_sort = Software('samtools sort', pipeline_config['samtools']['path']) read_distribution = Software( 'read_distribution.py', pipeline_config['read_distribution']['path']) featureCounts = Software('featureCounts', pipeline_config['featureCounts']['path']) fastQC = Software('FastQC', pipeline_config['FastQC']['path']) picard = Software('picard', pipeline_config['picard']['path']) # Change these to just be done in python script? # Common software tools awk = Software('awk', 'awk') sort = Software('sort', 'sort') uniq = Software('uniq', 'uniq') paste = Software('paste', 'paste') cat = Software('cat', 'cat') grep = Software('grep', 'grep') # Directories and Files pathToGenomeDir = pipeline_config['STAR']['genomeDir'] pathToGenome = pipeline_config['bowtie2']['genome_ref'] pathToGtf = pipeline_config['STAR']['GTF_ref'] pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed'] pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100'] pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed'] pathTo_genomeFasta = pipeline_config['picard']['genomeFasta'] ''' remove adaptor and trim adaptor sequence: AGATCGGAAGAGCACACGTCT -m 25 discard any reads shorter than 25 nucleotides keep only reads that had the adaptor sequence --discard-untrimmed cutadapt -a AGATCGGAAGAGCACACGTCT -m 25 --discard-untrimmed {filename}.fastq.gz > {filename}_trimmed.fastq.gz 2> {filename}_report.txt Remove adapters Only keep reads with adapters, otherwise artifact Discard reads shorter than 25 bp ''' # Keep track of Bids in pipeline bid_list = [] for bamLib in bamFiles: bid_list.append(bamLib.split(':')[-1]) ''' Sort and extract uniquely mapped reads for QC and further analyses samtools view -H $file > header.sam samtools view $file | grep -w NH:i:1 | cat header.sam - | samtools view -bS - | samtools sort - ${filename}_uniq_sorted rm header.sam Using this file for the rest of the analysis ''' for bamLib in bamFiles: bam, bid = bamLib.split(':') newDir = new_dir(outputDir, bid) samtools.run( Parameter('view'), Parameter('-H'), Parameter(bam), # star outfile name Redirect(stream=Redirect.STDOUT, dest=os.path.join(newDir, '{}.header.sam'.format(bid)))) samtools.run( Parameter('view'), Parameter(bam), # star outfile name Pipe( grep.pipe( Parameter('-w'), Parameter('NH:i:1'), Pipe( cat.pipe( Parameter( os.path.join(newDir, '{}.header.sam'.format(bid)), '-'), Pipe( samtools.pipe( Parameter('view'), Parameter('-bS', '-'), Pipe( samtools.pipe( Parameter('sort'), Parameter( '-', '-o', '{}/{}.uniq_sorted.bam'. 
format(newDir, bid))))))))))) # subprocess.call(['rm', '{}/{}.header.sam'.format(newDir, bid)]) ''' SeQC to evaluate percent reads mapped to each genomic features read_distribution.py -r hg19_RefSeq.bed12 -i $file ''' for bid in bid_list: newDir = new_dir(outputDir, bid) read_distribution.run( Parameter('-r'), Parameter(pathTo_hg19_bed), Parameter('-i'), Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)), Redirect(stream=Redirect.STDOUT, dest=os.path.join( newDir, '{}.read_distribution.log'.format(bid))), shell=True) ''' codon periodicity annotation=/glusterfs/users/ashieh/annotations/hg19_ccds_exons_plus_start100.bed bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100 awk -v OFS='\t' '{print ($2-($14+100))}' ${filename}_intersect_start100.bed | sort | uniq -c > ${filename}_relative_pos_aggregate.table ''' # bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100 for bid in bid_list: newDir = new_dir(outputDir, bid) bedtools.run( Parameter('intersect'), Parameter('-a {}'.format(pathTo_hg19_bed_start100)), Parameter('-b {}/{}.uniq_sorted.bam'.format(newDir, bid)), Parameter('-s'), Parameter('-bed'), Parameter('-wa'), Parameter('-wb'), Redirect(stream=Redirect.STDOUT, dest=os.path.join( newDir, '{}.intersect_start100.bed'.format(bid))), shell=True) awk.run( Parameter('-v'), Parameter("OFS='\\t'"), Parameter('{print ($8-($2+100))}'), Parameter('{}/{}.intersect_start100.bed'.format(newDir, bid)), Pipe( sort.pipe( Pipe( uniq.pipe( Parameter('-c'), Redirect(stream=Redirect.STDOUT, dest=os.path.join( newDir, '{}_relative_pos_aggregate.table'. format(bid)))))))) for bid in bid_list: newDir = new_dir(outputDir, bid) rpaFile = open( '{dir}/{bid}_relative_pos_aggregate.table'.format(dir=newDir, bid=bid), 'rb') myDict = {} for i in range(-30, 31): myDict[i] = 0 for line in rpaFile: Frequency, start = line.strip().split(' ') if int(start) >= -30 and int(start) <= 30: print start myDict[int(start)] = Frequency # print times freqs = [] starts = [] for i in range(-30, 31): starts.append(i) freqs.append(myDict[i]) # print freqs fig, ax = plt.subplots() # plt.set_title('{} codon periodicity'.format(bid)) plt.xlabel("-30 to 30 relative position") plt.ylabel("Frequency") plt.bar(starts, freqs) fig.savefig('{dir}/{bid}_codon_periodicity_plot.png'.format( dir=newDir, bid=bid)) ''' Picard tools java -jar picard.jar CollectMultipleMetrics I=2017-221.uniq_sorted.bam O= multiple_metrics R=GRCh37.p13.genome.fa java -jar picard.jar CollectGcBiasMetrics I= .uniq O=gc_bias_metrics.txt CHART=gc_bias_metrics.pdf S=summary_metrics.txt R=reference_sequence.fasta java -jar picard.jar CollectRnaSeqMetrics I=input.bam O=output.RNA_Metrics REF_FLAT=ref_flat.txt STRAND=FIRST_READ_TRANSCRIPTION_STRAND java -jar picard.jar MarkDuplicates I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt ASSUME_SORTED=true ''' for bid in bid_list: newDir = new_dir(outputDir, bid) picard.run( Parameter('CollectMultipleMetrics'), Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)), # input Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)), # output Parameter('R={}'.format(pathTo_genomeFasta)) # genomeReference ) picard.run( Parameter('CollectGcBiasMetrics'), Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)), # input Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)), # output Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format( newDir, bid)), # chart Parameter('S={}/{}.summary_metrics'.format( newDir, bid)), # summary metrics Parameter( 
                'R={}'.format(pathTo_genomeFasta))  # genome reference
            )
            picard.run(
                Parameter('CollectRnaSeqMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)),  # input
                Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
                Parameter('REF_FLAT={}/{}'.format(newDir, bid)),  # ref_flat
                Parameter('STRAND=FIRST_READ_TRANSCRIPTION_STRAND')  # strandedness
            )
            picard.run(
                Parameter('MarkDuplicates'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)),  # input
                Parameter('O={}/{}.marked_duplicates.bam'.format(newDir, bid)),  # output
                Parameter('M={}/{}.marked_dup_metrics.txt'.format(newDir, bid)),  # marked dup metrics
                Parameter('ASSUME_SORTED=true')  # sorted
            )

        '''
        subread: featureCounts
        featureCounts -a /path_to_gtf/gencode.v19.annotation.gtf -o <bid>.featureCounts <bid>.uniq_sorted.bam
        '''
        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            featureCounts.run(
                Parameter('-a', '{}'.format(pathToGtf)),  # gtf
                Parameter('-s', '1'),  # strand-specific read counting
                Parameter('-o', '{}/{}.featureCounts'.format(newDir, bid)),  # output
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid))  # input
            )
def run_pipeline(self, pipeline_args, pipeline_config):
    # Instantiate options
    bam = pipeline_args['bam']
    output_dir = pipeline_args['output']
    logs_dir = os.path.join(output_dir, 'logs')

    # Create output, tmp, and logs directories
    subprocess.call([
        'mkdir', '-p', output_dir, logs_dir,
        os.path.join(output_dir, 'tmp')
    ])

    # Timing functions for getting running time
    start_time = datetime.now()

    # Gather QC data
    qc_data = {
        'total_raw_reads_counts': [],
        'trimmed_reads_counts': [],
        'num_reads_mapped': '0',
        'running_time_seconds': '',
        'running_time_readable': ''
    }

    # Keep list of items to delete
    staging_delete = [os.path.join(output_dir, 'tmp')]

    # Establish software instances
    rsem_calculate_expression = Software(
        'RSEM', pipeline_config['RSEM']['path-calculate-expression'])
    rsem_plot_model = Software('RSEM',
                               pipeline_config['RSEM']['path-plot-model'])

    # Set up RSEM parameters
    rsem_common = [
        Parameter('--bam'),
        Parameter('--estimate-rspd'),
        Parameter('--calc-ci'),
        Parameter('--no-bam-output'),
        Parameter('--seed', '12345')
    ]
    rsem_run = [
        Parameter('-p', pipeline_config['RSEM']['threads']),
        Parameter('--ci-memory', pipeline_config['RSEM']['memory'])
    ]
    rsem_type = []
    if pipeline_args['is_paired_end']:
        rsem_type.append(Parameter('--paired-end'))
    if pipeline_args['is_stranded']:
        rsem_type.append(Parameter('--forward-prob', '0'))

    # Run RSEM quantification step
    rsem_calculate_expression.run(*(rsem_common + rsem_run + rsem_type + [
        Parameter(bam),
        Parameter(pipeline_config['RSEM']['reference-dir']),
        Parameter(os.path.join(output_dir, 'RSEM_Quant')),
        Redirect(Redirect.BOTH, dest=os.path.join(logs_dir, 'Log.rsem'))
    ]))

    # Generate RSEM plot model
    rsem_plot_model.run(
        Parameter(os.path.join(output_dir, 'RSEM_Quant'),
                  os.path.join(output_dir, 'Quant.pdf')))

    # QC: Get time delta
    elapsed_time = datetime.now() - start_time
    qc_data['running_time_seconds'] = str(elapsed_time.seconds)
    qc_data['running_time_readable'] = str(elapsed_time)

    # QC: Output QC data to file
    with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
        qc_data_file.write(json.dumps(qc_data, indent=4) + '\n')

    # Delete temporary files
    for delete_file in staging_delete:
        subprocess.call(['rm', '-rf', delete_file])
def run_pipeline(self, pipeline_args, pipeline_config):
    # Instantiate Software instances
    fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
    rnaseqc = Software('RNA-SeQC', pipeline_config['RNA-SeQC']['path'])
    picard = {
        subprogram_name: Software(
            'picard {}'.format(subprogram_name),
            pipeline_config['picard']['path'] + ' {}'.format(subprogram_name))
        for subprogram_name in {
            'CreateSequenceDictionary', 'MarkDuplicates',
            'CollectRnaSeqMetrics', 'CollectInsertSizeMetrics',
            'CollectAlignmentSummaryMetrics', 'CollectGcBiasMetrics',
            'EstimateLibraryComplexity', 'AddOrReplaceReadGroups'
        }
    }
    preseq = {
        subprogram_name: Software(
            'preseq {}'.format(subprogram_name),
            pipeline_config['preseq']['path'] + ' {}'.format(subprogram_name))
        for subprogram_name in {'c_curve', 'lc_extrap', 'gc_extrap'}
    }
    bam2mr = Software('bam2mr', pipeline_config['preseq']['bam2mr'])
    featurecounts = Software('featureCounts',
                             pipeline_config['featureCounts']['path'])
    samtools_faidx = Software(
        'samtools faidx', pipeline_config['samtools']['path'] + ' faidx')
    novosort = Software('novosort', pipeline_config['novosort']['path'])

    # Create output directory
    subprocess.call('mkdir -p {}'.format(pipeline_args['output_dir']),
                    shell=True)
    subprocess.call('mkdir -p /mnt/analysis/tmp', shell=True)

    # Sort bam file
    sorted_bam = os.path.join(pipeline_args['output_dir'], 'sorted.tmp.bam')
    novosort.run(
        Parameter('--index'),
        Parameter('--output', sorted_bam),
        Parameter(pipeline_args['bam']))

    # Run FastQC
    self.run_fastqc(fastqc=fastqc, pipeline_args=pipeline_args)

    # Run RNA-SeQC
    self.run_rnaseqc(rnaseqc=rnaseqc,
                     picard=picard,
                     samtools_faidx=samtools_faidx,
                     pipeline_config=pipeline_config,
                     pipeline_args=pipeline_args,
                     sorted_bam=sorted_bam)

    # Run Picard suite
    self.run_picard_suite(picard=picard,
                          sorted_bam=sorted_bam,
                          pipeline_config=pipeline_config,
                          pipeline_args=pipeline_args)

    # self.run_preseq(
    #     preseq=preseq,
    #     bam2mr=bam2mr,
    #     sorted_bam=sorted_bam,
    #     pipeline_args=pipeline_args
    # )

    self.run_featurecounts(featurecounts=featurecounts,
                           sorted_bam=sorted_bam,
                           pipeline_args=pipeline_args,
                           pipeline_config=pipeline_config)

    self.run_chrm_percentage(sorted_bam=sorted_bam,
                             pipeline_args=pipeline_args)

    # Remove temporary sorted bam
    os.remove(sorted_bam)
    os.remove(sorted_bam + '.bai')
    subprocess.call('rm -rf /mnt/analysis/tmp', shell=True)
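# The self.run_*() helpers called above (run_fastqc, run_rnaseqc, run_picard_suite,
# run_featurecounts, run_chrm_percentage) live outside this section. A minimal,
# hypothetical sketch of the simplest of them, run_fastqc, assuming it mirrors the
# FastQC invocations used by the other pipelines in this file (os, subprocess, and
# Parameter as imported elsewhere in the codebase):
def run_fastqc(self, fastqc, pipeline_args):
    fastqc_output_dir = os.path.join(pipeline_args['output_dir'], 'fastqc')
    subprocess.call(['mkdir', '-p', fastqc_output_dir])
    fastqc.run(
        Parameter('--outdir={}'.format(fastqc_output_dir)),
        Parameter(pipeline_args['bam'])
    )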
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate options reads = pipeline_args['reads'] output_dir = pipeline_args['output'] logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = pipeline_args['step'] forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] run_is_stranded = pipeline_args['is_stranded'] # Determine if run is paired-end from input run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1 # Create output, tmp, and logs directories tmp_dir = os.path.join(output_dir, 'tmp') subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir]) # Keep list of items to delete staging_delete = [os.path.join(output_dir, 'tmp')] qc_metrics = { 'total_raw_reads': [], 'total_trimmed_reads': [], 'percent_num_reads_mapped_genome': [], 'percent_num_reads_mapped_transcriptome': [], 'percent_duplicate_reads': '0', 'num_reads_multimapped': [], 'percent_num_reads_rrna': '', 'viral_rna': [] } synapse_metadata = { 'Assay': 'RNAseq', 'Individual_ID': '', 'Sample_ID': '', 'File_Name': '', 'BrodmannArea': '', 'BrainRegion': '', 'Hemisphere': '', 'CellType': 'NA', 'TissueState': '', 'RNAIsolationBatch': '', 'RIN': '', 'LibraryBatch': '', 'LibraryPrep': 'stranded, rRNA depletion', 'LibraryKit': 'Illumina RS-122-2301', 'ERCC_Added': '', 'RunType': 'paired-end', 'ReadLength': '100bp', 'FlowcellBatch': '', 'SequencingPlatform': '', 'TotalReads': '', 'MappedReads_Primary': '0', 'MappedReads_Multimapped': '0', 'rRNARate': '0', 'Notes': '' } # Establish Software instances cutadapt = Software('Cutadapt', pipeline_config['cutadapt']['path']) fastqc = Software('FastQC', pipeline_config['fastqc']['path']) star = Software('STAR Two-Pass', pipeline_config['STAR']['path']) novosort = Software('Novosort', pipeline_config['novosort']['path']) samtools_flagstat = Software('Samtools Flagstat', pipeline_config['samtools']['path'] + ' flagstat') samtools_index = Software('Samtools Index', pipeline_config['samtools']['path'] + ' index') samtools_faidx = Software('Samtools Faidx', pipeline_config['samtools']['path'] + ' faidx') picard_markduplicates = Software('Picard MarkDuplicates', 'java -Xmx{heap_size}g -jar {path} MarkDuplicates'.format( heap_size=pipeline_config['picard'].get('heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['picard']['path'] )) picard_create_seq_dict = Software('Picard CreateSequenceDictionary', 'java -Xmx{heap_size}g -jar {path} CreateSequenceDictionary'.format( heap_size=pipeline_config['picard'].get('heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['picard']['path'] )) rnaseqc = Software('RNAseQC', 'java -Xmx{heap_size}g -jar {path}'.format( heap_size=pipeline_config['picard'].get('heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['RNAseQC']['path'] )) picard_add_read_groups = Software('Picard AddOrReplaceReadGroups', 'java -Xmx{heap_size}g -jar {path} AddOrReplaceReadGroups'.format( heap_size=pipeline_config['picard'].get('heap_size', JAVA_DEFAULT_HEAP_SIZE), path=pipeline_config['picard']['path'] )) bedtools_coverage = Software('Bedtools Coverage', pipeline_config['bedtools']['path'] + ' coverage') bedtools_bamtobed = Software('Bedtools Bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed') # Housekeeping star_output = [] novosort_outfile = '' # Step 1: Trimming | Cutadapt if step <= 1: for i, read in enumerate(reads): if run_is_paired_end: # Get paired-end reads, construct new filenames read1, read2 = read.split(':') # QC: Get raw fastq read counts 
qc_metrics['total_raw_reads'].append([ str(int(self.count_gzipped_lines(read1))/4), str(int(self.count_gzipped_lines(read2))/4) ]) trimmed_read1_filename = os.path.join(output_dir, lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i)) trimmed_read2_filename = os.path.join(output_dir, lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i)) staging_delete.extend([ trimmed_read1_filename, trimmed_read2_filename ]) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read1_filename)), Parameter('--paired-output={}'.format(trimmed_read2_filename)), Parameter('-a', forward_adapter), Parameter('-A', reverse_adapter), Parameter('-q', '30'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary')) ) # QC: Get trimmed fastq read counts qc_metrics['total_trimmed_reads'].append([ str(int(self.count_gzipped_lines(trimmed_read1_filename))/4), str(int(self.count_gzipped_lines(trimmed_read2_filename))/4) ]) # Update reads list reads[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename]) else: # QC: Get raw fastq read counts qc_metrics['total_raw_reads'].append([ str(int(self.count_gzipped_lines(read))/4) ]) # Construct new filename trimmed_read_filename = os.path.join(output_dir, lib_prefix + '_{}.trimmed.fastq.gz'.format(i)) staging_delete.append(trimmed_read_filename) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read_filename)), Parameter('-a', forward_adapter), Parameter('-q', '30'), Parameter(read), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.chicago.summary')) ) # QC: Get trimmed fastq read counts qc_metrics['total_trimmed_reads'].append([ str(int(self.count_gzipped_lines(trimmed_read_filename))/4) ]) # Update reads list reads[i] = trimmed_read_filename # Step 2: FastQC if step <= 2: # Make FastQC directory fastqc_output_dir = os.path.join(output_dir, 'fastqc') subprocess.call(['mkdir', '-p', fastqc_output_dir]) all_fastqs = [] if run_is_paired_end: for read in reads: all_fastqs.extend(read.split(':')) else: all_fastqs.extend(reads) for fastq in all_fastqs: fastqc.run( Parameter('--outdir={}'.format(fastqc_output_dir)), Parameter(fastq) ) # Step 3: Alignment | STAR 2-pass, Alignment Stats | samtools flagstat if step <= 3: # Set up common STAR parameters star_common = [ Parameter('--runMode', 'alignReads'), Parameter('--twopassMode', 'Basic'), Parameter('--runThreadN', pipeline_config['STAR']['threads']), Parameter('--genomeDir', pipeline_config['STAR']['genome-dir']), Parameter('--readFilesCommand', 'zcat'), Parameter('--quantMode', 'TranscriptomeSAM', 'GeneCounts'), Parameter('--outSAMtype', 'BAM', 'Unsorted'), Parameter('--outFilterType', 'BySJout'), Parameter('--outFilterMultimapNmax', '20'), Parameter('--alignSJoverhangMin', '8'), Parameter('--alignSJDBoverhangMin', '1'), Parameter('--outFilterMismatchNmax', '2'), Parameter('--alignIntronMin', '20'), Parameter('--alignIntronMax', '1000000'), Parameter('--alignMatesGapMax', '1000000'), ( Parameter('--outFilterIntronMotifs', 'RemoveNoncanonical') if run_is_stranded else Parameter('--outSAMstrandField', 'intronMotif') ) ] # Get STAR output file prefix star_outfile_prefix = os.path.join(output_dir, lib_prefix + ('_' if lib_prefix[-1] != '.' 
else '') + '{}.') # Align each read or read pair for i, read in enumerate(reads): star_output_bam = star_outfile_prefix.format(i) + 'Aligned.out.bam' star_output_transcriptome_bam = star_outfile_prefix.format(i) + 'Aligned.toTranscriptome.out.bam' star_output.append(star_output_bam) if run_is_paired_end: read1, read2 = read.split(':') star_paired_end = [ Parameter('--readFilesIn', read1, read2), Parameter('--outFileNamePrefix', star_outfile_prefix.format(i)) ] star.run(*(star_common + star_paired_end)) else: star_single_end = [ Parameter('--readFilesIn', read), Parameter('--outFileNamePrefix', star_outfile_prefix.format(i)) ] star.run(*(star_common + star_single_end)) # Get flagstats for both alignments samtools_flagstat.run( Parameter(star_output_bam), Redirect(stream=Redirect.STDOUT, dest=star_output_bam + '.flagstat') ) samtools_flagstat.run( Parameter(star_output_transcriptome_bam), Redirect(stream=Redirect.STDOUT, dest=star_output_transcriptome_bam + '.flagstat') ) # QC: Get number of mapped reads to the genome from this BAM try: with open(star_output_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() # Pull out mapped reads target_line = re.search(r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', flagstats_contents) if target_line is not None: num_mapped = int(target_line.group(1)) qc_metrics['percent_num_reads_mapped_genome'].append( [str(num_mapped/2), '{}%'.format(target_line.group(2))] ) num_secondary = int(re.search(r'(\d+) \+ \d+ secondary', flagstats_contents) .group(1) ) num_supplementary = int(re.search(r'(\d+) \+ \d+ supplementary', flagstats_contents) .group(1) ) synapse_metadata['MappedReads_Primary'] = str( int(synapse_metadata['MappedReads_Primary']) + num_mapped - num_secondary - num_supplementary ) synapse_metadata['MappedReads_Multimapped'] = str( int(synapse_metadata['MappedReads_Multimapped']) + num_secondary ) else: qc_metrics['percent_num_reads_mapped_genome'].append('0') # Pull out multimapped reads target_line = re.search(r'(\d+) \+ \d+ secondary', flagstats_contents) if target_line is not None: qc_metrics['num_reads_multimapped'].append( str(int(target_line.group(1))/2) ) else: qc_metrics['num_reads_multimapped'].append('0') except: qc_metrics['percent_num_reads_mapped_genome'].append( 'Could not open flagstats for {}'.format(star_output_bam) ) qc_metrics['num_reads_multimapped'].append( 'Could not open flagstats for {}'.format(star_output_bam) ) # QC: Get number of mapped reads to the transcriptome from this BAM try: with open(star_output_transcriptome_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', flagstats_contents) if target_line is not None: qc_metrics['percent_num_reads_mapped_transcriptome'].append( [str(int(target_line.group(1))/2), '{}%'.format(target_line.group(2))] ) else: qc_metrics['percent_num_reads_mapped_transcriptome'].append('0') except: qc_metrics['percent_num_reads_mapped_transcriptome'].append( 'Could not open flagstats for {}'.format(star_output_bam) ) # Step 4: BAM Merge | Novosort if step <= 4: # Novosort to sort and merge BAM files novosort_outfile = os.path.join(output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '') + 'merged.Aligned.out.bam') novosort.run( Parameter('--tmpdir', os.path.join(output_dir, 'tmp')), Parameter(*[bam for bam in star_output]), Redirect(stream=Redirect.STDOUT, dest=novosort_outfile) ) """ The step below was commented out on 27 June 2016. 
It was taking up large amounts of memory, more than Beagle could handle, and some samples were consistently failing as a result. I think RNAseQC does this step anyway, I only left it in because I figured it wasn't doing any harm. Well now it is, so it's gone. """ # QC: Get number of reads mapped to rRNA regions # aligned_bed_file = os.path.join(output_dir, str(uuid.uuid4()) + '.bed') # coverage_file = os.path.join(output_dir, str(uuid.uuid4()) + '.coverage.bed') # staging_delete.extend([aligned_bed_file, coverage_file]) # # bedtools_bamtobed.run( # Parameter('-i', novosort_outfile), # Redirect(stream=Redirect.STDOUT, dest=aligned_bed_file) # ) # bedtools_coverage.run( # Parameter('-s'), # Parameter('-counts'), # Parameter('-a', pipeline_config['qc']['rRNA-bed']), # Parameter('-b', aligned_bed_file), # Redirect(stream=Redirect.STDOUT, dest=coverage_file) # ) # try: # rRNA_count = 0 # with open(coverage_file) as coverage: # for line in coverage: # rRNA_count += int(line.strip().split('\t')[6]) # percent_rRNA = (rRNA_count / # float(sum([int(aln[MAPPED_READS_COUNT]) # for aln # in qc_metrics['percent_num_reads_mapped_transcriptome']])) # ) # qc_metrics['percent_num_reads_rrna'] = [str(rRNA_count), str(percent_rRNA)] # synapse_metadata['rRNARate'] = str(percent_rRNA) # except Exception as e: # qc_metrics['percent_num_reads_rrna'] = ['error', 'error', e.message] # Prepare genome fasta for RNAseQC genome_fa = pipeline_config['qc']['genome-fa'] genome_fai = genome_fa + '.fai' genome_dict = os.path.splitext(genome_fa)[0] + '.dict' if not os.path.isfile(genome_fai): samtools_faidx.run( Parameter(genome_fa) ) if not os.path.isfile(genome_dict): picard_create_seq_dict.run( Parameter('REFERENCE={}'.format(genome_fa)), Parameter('OUTPUT={}'.format(genome_dict)) ) # Add read group to alignment file read_group_bam = os.path.join(output_dir, 'readgroup.bam') staging_delete.append(read_group_bam) picard_add_read_groups.run( Parameter('INPUT={}'.format(novosort_outfile)), Parameter('OUTPUT={}'.format(read_group_bam)), Parameter('RGLB={}'.format(lib_prefix)), Parameter('RGPL=Illumina'), Parameter('RGPU=1'), Parameter('RGSM=Sample') ) # Generate BAM index for RNAseQC samtools_index.run( Parameter(read_group_bam) ) staging_delete.append(read_group_bam + '.bai') # QC: Get RNAseQC output rnaseqc_output_dir = os.path.join(output_dir, 'RNAseQC') subprocess.call(['mkdir', '-p', rnaseqc_output_dir]) rnaseqc.run( Parameter('-o', rnaseqc_output_dir), Parameter('-r', genome_fa), Parameter('-t', pipeline_config['cufflinks']['transcriptome-gtf']), Parameter('-s', '"{sample_id}|{bam_file}|{notes}"'.format( sample_id=lib_prefix, bam_file=read_group_bam, notes='None' )), Parameter('-singleEnd') if not run_is_paired_end else Parameter() ) # Picard MarkDuplicates to get duplicates metrics markduplicates_outfile = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix)) markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics') picard_markduplicates.run( Parameter('INPUT={}'.format(novosort_outfile)), Parameter('OUTPUT={}'.format(markduplicates_outfile)), Parameter('TMP_DIR={}'.format(tmp_dir)), Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)), Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log')) ) # QC: Get percent duplicates try: with open(markduplicates_metrics_filepath) as markdup_metrics: for line in markdup_metrics: if line[FIRST_CHAR] == '#': continue record = line.strip().split('\t') if len(record) == 9: if re.match(r'\d\.\d+', record[7]) is not None: 
qc_metrics['percent_duplicate_reads'] = record[7] except Exception as e: qc_metrics['percent_duplicate_reads'] = ['Could not open MarkDuplicates metrics', e.message] # Write out QC metrics to file with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file: qc_data_file.write(json.dumps(qc_metrics, indent=4) + '\n') # Populate Synapse QC matrix if re.match(r'\d{4}-\d{4}', lib_prefix.strip()) is not None: synapse_metadata['Individual_ID'] = lib_prefix synapse_metadata['File_Name'] = 'PEC_BrainGVEX_UIC-UChicago_FC_mRNA_HiSeq2000_{}'.format(lib_prefix) re_raw_filename = re.match(r'\d{4}-\d{4}_.+_(.+)_.+_(.+_\d)_\d_sequence\.txt\.gz', os.path.basename(pipeline_args['reads'][0].split(':')[0])) if re_raw_filename is not None: sequencing_inst_name = re_raw_filename.group(1) if '673' in sequencing_inst_name or '484' in sequencing_inst_name: synapse_metadata['SequencingPlatform'] = 'HiSeq2000' elif '1070' in sequencing_inst_name: synapse_metadata['SequencingPlatform'] = 'HiSeq2500' flowcell_batch = re_raw_filename.group(2) synapse_metadata['FlowcellBatch'] = flowcell_batch total_raw_reads_end1 = sum([int(count[0]) for count in qc_metrics['total_raw_reads']])/4 synapse_metadata['TotalReads'] = str(total_raw_reads_end1) # Write out Synapse metadata with open(os.path.join(logs_dir, 'synapse_metadata.txt'), 'w') as synapse_metadata_file: synapse_metadata_file.write(json.dumps(synapse_metadata, indent=4) + '\n') # Delete temporary files for delete_file in staging_delete: subprocess.call(['rm', '-rf', delete_file])
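# --- Illustrative sketch of the flagstat parsing used above ---
# Several QC steps above pull counts out of `samtools flagstat` text with ad hoc
# regular expressions (mapped, secondary, supplementary). The helper below
# bundles those same expressions into one place for clarity; it is a sketch for
# illustration, not a function the pipeline itself defines.
import re

def sketch_parse_flagstat(flagstat_path):
    """Return a dict of counts parsed from a samtools flagstat file."""
    with open(flagstat_path) as handle:
        text = handle.read()
    counts = {'mapped': 0, 'mapped_percent': '0', 'secondary': 0, 'supplementary': 0}
    mapped = re.search(r'(\d+) \+ \d+ mapped \(([0-9\.]+)%', text)
    if mapped is not None:
        counts['mapped'] = int(mapped.group(1))
        counts['mapped_percent'] = mapped.group(2)
    secondary = re.search(r'(\d+) \+ \d+ secondary', text)
    if secondary is not None:
        counts['secondary'] = int(secondary.group(1))
    supplementary = re.search(r'(\d+) \+ \d+ supplementary', text)
    if supplementary is not None:
        counts['supplementary'] = int(supplementary.group(1))
    # Primary mapped reads = mapped - secondary - supplementary, which is how
    # the Synapse MappedReads_Primary field is accumulated above
    counts['primary_mapped'] = counts['mapped'] - counts['secondary'] - counts['supplementary']
    return counts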
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate variable from argparse read_pairs = pipeline_args['reads'] output_dir = os.path.abspath(pipeline_args['output']) logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = int(pipeline_args['step']) forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] # Create output, tmp, and logs directories tmp_dir = os.path.join(output_dir, 'tmp') subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir]) #Keep list of items to delete staging_delete = [tmp_dir] bwa_bam_outs = [] qc_data = { 'total_raw_reads_counts': [], 'trimmed_reads_counts': [], 'num_reads_mapped': [], 'num_read_removed_steric_hinderence': '0', 'percent_duplicate_reads': '0', 'num_unique_reads_mapped': [], #implemented 'num_mtDNA_reads_mapped': [], 'percent_mtDNA_reads_mapped': '0' , 'num_reads_mapped_after_filtering': '-1', #TODO This isn't implemented 'num_peaks_called': '-1', #TODO Get number of peaks in annotation sites } #Instatiate software instances cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) fastqc = Software('FastQC', pipeline_config['fastqc']['path']) bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln') bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe') samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view') samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat') samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index') novosort = Software('novosort', pipeline_config['novosort']['path']) picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates') picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics') bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed') bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + 'sort') bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge') bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect') macs2_callpeak = Software('macs2 callpeak', pipeline_config['macs2']['path'] + ' callpeak') if step <= 1: for i, read_pair in enumerate(read_pairs): read1, read2 = read_pair.split(':') #QC: Get raw fastq read counts qc_data['total_raw_reads_counts'].append([ str(int(self.count_gzipped_lines(read1))/4), str(int(self.count_gzipped_lines(read2))/4) ]) trimmed_read1_filename = os.path.join(output_dir, lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i)) trimmed_read2_filename = os.path.join(output_dir, lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i)) cutadapt.run( Parameter('--quality-base=33'), Parameter('--minimum-length=5'), Parameter('-q', '30'), # Minimum quality score Parameter('--output={}'.format(trimmed_read1_filename)), Parameter('--paired-output={}'.format(trimmed_read2_filename)), Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'), Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log')) ) # QC: Get trimmed fastq read counts qc_data['trimmed_reads_counts'].append([ str(int(self.count_gzipped_lines(trimmed_read1_filename))/4), 
str(int(self.count_gzipped_lines(trimmed_read2_filename))/4) ]) staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename]) read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename]) if step <= 2: #Make FastQC Directory fastqc_output_dir = os.path.join(output_dir, 'fastqc') subprocess.call(['mkdir', '-p', fastqc_output_dir]) for i, read_pair in enumerate(read_pairs): for read in read_pair.split(':'): fastqc.run( Parameter('--outdir={}'.format(fastqc_output_dir)), Parameter(read) ) bwa_aln.run( Parameter('-t', pipeline_config['bwa']['threads']), Parameter(pipeline_config['bwa']['index-dir']), Parameter(read), Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read)) ) staging_delete.append('{}.sai'.format(read)) if step <= 3: for i, read_pair in enumerate(read_pairs): read1, read2 = read_pair.split(':') bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i)) bwa_sampe.run( Parameter('-a', '2000'), # Maximum insert size Parameter('-n', '1'), Parameter(pipeline_config['bwa']['index-dir']), Parameter('{}.sai'.format(read1)), Parameter('{}.sai'.format(read2)), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')), Pipe( samtools_view.pipe( Parameter('-hSb'), Parameter('-o', bwa_bam_output), Parameter('-') # Get input from stdin ) ) ) bwa_bam_outs.append(bwa_bam_output) if step <= 4: for i, bwa_bam in enumerate(bwa_bam_outs): samtools_flagstat.run( Parameter(bwa_bam), Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat') ) #QC: Get number of mapped reads from this bam try: with open(bwa_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents) if target_line is not None: qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2)) else: qc_data['num_reads_mapped'].append('0') except: qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format( bwa_bam + '.flagstat' )) sortmerged_bam = os.path.join(output_dir, '{}.sortmerged_bam'.format(lib_prefix)) steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix)) duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix)) unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix)) unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix)) chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix)) # binning read based off template size nucleosome_free_reads = os.path.join(output_dir, '{}.nucleosome_free.bam'.format(lib_prefix)) mononucleosome_reads = os.path.join(output_dir, '{}.mononucleosome.bam'.format(lib_prefix)) dinucleosome_reads = os.path.join(output_dir, '{}.dinucleosome.bam'.format(lib_prefix)) trinucleosome_reads = os.path.join(output_dir, '{}.trinucleosome.bam'.format(lib_prefix)) chrM_bam = os.path.join(output_dir, '{}.chrM.bam'.format(lib_prefix)) novosort.run( Parameter('--threads', pipeline_config['novosort']['threads']), Parameter('--tmpcompression', '6'), Parameter('--tmpdir', tmp_dir), Parameter(*[bam for bam in bwa_bam_outs]), Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam), Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log')) ) # This creates a dependency on pysam # Removes reads with template length < 38 due to steric hinderence samtools_index.run(Parameter(sortmerged_bam)) sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb') steric_filter_bam_alignmentfile = 
pysam.AlignmentFile(steric_filter_bam, 'wb', template=sortmerged_bam_alignmentfile) num_removed=0 for read in sortmerged_bam_alignmentfile.fetch(): if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF: steric_filter_bam_alignmentfile.write(read) else: num_removed += 1 qc_data['num_read_removed_steric_hinderence']=str(num_removed) sortmerged_bam_alignmentfile.close() steric_filter_bam_alignmentfile.close() # Mark and remove MarkDuplicates markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics') picard_mark_dup.run( Parameter('INPUT={}'.format(steric_filter_bam)), Parameter('OUTPUT={}'.format(duprm_bam)), Parameter('TMP_DIR={}'.format(tmp_dir)), Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)), Parameter('REMOVE_DUPLICATES=true'), Parameter('VALIDATION_STRINGENCY=LENIENT'), Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log')) ) #QC: Get percent MarkDuplicates try: with open(markduplicates_metrics_filepath) as markdup_metrics: for line in markdup_metrics: if line[FIRST_CHAR] == '#': continue record = line.strip().split('\t') if len(record) == 9: if re.match(r'\d+', record[7]) is not None: qc_data['percent_duplicate_reads'] = record[7] except: qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics' # Filter down to uniquely mapped reads samtools_view.run( Parameter('-b'), Parameter('-F', '256'), Parameter('-q', '10'), Parameter('-o', unique_bam), Parameter(duprm_bam) ) # gets statistics on uniquely mapped reads for i, unique_map in enumerate(unique_bam): samtools_flagstat.run( Parameter(unique_bam), Redirect(stream=Redirect.STDOUT, dest=unique_bam + '.flagstat') ) #QC: Get number of mapped reads from unique bams try: with open(unique_bam + '.flagstat') as flagstats: unique_flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped', unique_flagstats_contents) if target_line is not None: qc_data['num_unique_reads_mapped'].append(str(int(target_line.group(1))/2)) else: qc_data['num_unique_reads_mapped'].append('0') except: qc_data['num_unique_reads_mapped'] + '.flagstat' # make AlignmentFile object to extract binned reads and chrM reads from the unique bam samtools_index.run(Parameter(unique_bam)) unique_bam_alignmentfile = pysam.AlignmentFile(unique_bam, 'rb') # Bins reads into 4 categories depending on template length read is derived from: # 50-115 (nucleosome-free), 180-247 (mononucleosome), 315-473 (dinucleosome), 558-615 (trinucleosome) nucleosome_free_reads_alignmentfile = pysam.AlignmentFile(nucleosome_free_reads, 'wb', template=unique_bam_alignmentfile) mononucleosome_reads_alignmentfile = pysam.AlignmentFile(mononucleosome_reads, 'wb', template=unique_bam_alignmentfile) dinucleosome_reads_alignmentfile = pysam.AlignmentFile(dinucleosome_reads, 'wb', template=unique_bam_alignmentfile) trinucleosome_reads_alignmentfile = pysam.AlignmentFile(trinucleosome_reads, 'wb', template=unique_bam_alignmentfile) # Extract chrM into new BAM chrM_reads_alignmentfile = pysam.AlignmentFile(chrM_bam, 'wb', template=unique_bam_alignmentfile) # Binning of nucleosome reads for read in unique_bam_alignmentfile.fetch(): if abs(int(read.template_length)) >= 50 and abs(int(read.template_length)) <= 115: nucleosome_free_reads_alignmentfile.write(read) elif abs(int(read.template_length)) >= 180 and abs(int(read.template_length)) <= 247: mononucleosome_reads_alignmentfile.write(read) elif abs(int(read.template_length)) >= 315 and abs(int(read.template_length)) <= 473: 
dinucleosome_reads_alignmentfile.write(read) elif abs(int(read.template_length)) >= 558 and abs(int(read.template_length)) <= 615: trinucleosome_reads_alignmentfile.write(read) else: continue; #stores chrM reads in separate file for read in unique_bam_alignmentfile.fetch(): if read.reference_name == 'chrM': chrM_reads_alignmentfile.write(read) nucleosome_free_reads_alignmentfile.close() mononucleosome_reads_alignmentfile.close() dinucleosome_reads_alignmentfile.close() trinucleosome_reads_alignmentfile.close() chrM_reads_alignmentfile.close() # gets series of flagstats results for non-main files samtools_flagstat.run( Parameter(nucleosome_free_reads), Redirect(stream=Redirect.STDOUT, dest=nucleosome_free_reads + '.flagstat')) samtools_flagstat.run( Parameter(mononucleosome_reads), Redirect(stream=Redirect.STDOUT, dest=mononucleosome_reads + '.flagstat')) samtools_flagstat.run( Parameter(dinucleosome_reads), Redirect(stream=Redirect.STDOUT, dest=dinucleosome_reads + '.flagstat')) samtools_flagstat.run( Parameter(trinucleosome_reads), Redirect(stream=Redirect.STDOUT, dest=trinucleosome_reads + '.flagstat')) # gets statistics on chrM mapped reads samtools_index.run(Parameter(chrM_bam)) for i, chrM_map in enumerate(chrM_bam): samtools_flagstat.run( Parameter(chrM_bam), Redirect(stream=Redirect.STDOUT, dest=chrM_bam + '.flagstat') ) try: with open(chrM_bam + '.flagstat') as flagstats: chrM_flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped', chrM_flagstats_contents) if target_line is not None: qc_data['num_mtDNA_reads_mapped'].append(str(int(target_line.group(1))/2)) else: qc_data['num_mtDNA_reads_mapped'].append('0') except: qc_data['num_mtDNA_reads_mapped'] + '.flagstat' # Remove unmapped reads samtools_view.run( Parameter('-b'), Parameter('-F', '12'), Parameter('-o', unmappedrm_bam), Parameter(unique_bam) ) # Create BAM index, then remove chrM samtools_index.run( Parameter(unmappedrm_bam) ) # Remove chrM all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in map(str, range(1, 23)) + ['X', 'Y']] samtools_view.run( Parameter('-b'), Parameter('-o', chrmrm_bam), Parameter(unmappedrm_bam), *all_chr ) # Stage delete for temporary files staging_delete.extend([ sortmerged_bam, sortmerged_bam + '.bai', # BAM index file steric_filter_bam, unique_bam, duprm_bam, unmappedrm_bam, unmappedrm_bam + '.bai', # BAM index file chrmrm_bam ]) if step <= 5: # Generate filename for final processed BAM and BED processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix)) unshifted_bed = os.path.join(output_dir, '{}.unshifted_bed'.format(lib_prefix)) processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix)) # staging_delete.append(unshifted_bed) # Generate filename for chrM removed BAM chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix)) # Remove blacklisted genomic regions bedtools_intersect.run( Parameter('-v'), Parameter('-abam', chrmrm_bam), Parameter('-b', pipeline_config['bedtools']['blacklist-bed']), Parameter('-f', '0.5'), Redirect(stream=Redirect.STDOUT, dest=processed_bam) ) # QC: Generate insert size metrics PDF picard_insert_metrics.run( Parameter('INPUT={}'.format(processed_bam)), Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))), Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf'))) ) # Generate index for processed BAM samtools_index.run( Parameter(processed_bam) ) # Convert BAM to BED bedtools_bamtobed.run( 
            Parameter('-i', processed_bam),
            Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
        )
        staging_delete.append(unshifted_bed)

        # Shift + strand reads by 4 and - strand reads by -5, according to the ATAC-seq paper
        # This used to be done with bedtools shift, but that has been replaced by shift_reads()
        self.shift_reads(
            input_bed_filepath=unshifted_bed,
            output_bed_filepath=processed_bed,
            log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
            genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
            minus_strand_shift=MINUS_STRAND_SHIFT,
            plus_strand_shift=PLUS_STRAND_SHIFT
        )

    # Peak-calling | MACS2
    if step <= 6:
        # For regular peak calling, including narrow peaks, default q-value=0.01
        processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
        macs2_callpeak.run(
            Parameter('-t', processed_bed),
            Parameter('-f', 'BED'),
            Parameter('-g', 'hs'),
            Parameter('-n', str(processed_bed) + '_regular_peak_calls'),
            Parameter('--nomodel'),
            Parameter('--extsize', '200'),
            Parameter('--shift', '-100'),
            Parameter('-B', '--SPMR'),  # Generates pileup tracks, bedgraph, fragment pileup per million reads
            Parameter('--call-summits'),
            Parameter('--keep-dup', 'all')
        )
        # For broad peak calling, q-value=0.05 per MACS2 suggestion on broad peaks
        macs2_callpeak.run(
            Parameter('-t', processed_bed),
            Parameter('-f', 'BED'),
            Parameter('-g', 'hs'),
            Parameter('-n', str(processed_bed) + '_broad_peak_calls'),
            Parameter('-q', '0.05'),
            Parameter('--nomodel'),
            Parameter('--extsize', '200'),
            Parameter('--shift', '-100'),
            Parameter('--broad'),
            Parameter('--keep-dup', 'all')
        )

    # QC: Output QC data to file
    with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
        qc_data_file.write(str(qc_data) + '\n')

    # Delete temporary files
    for delete_file in staging_delete:
        subprocess.call(['rm', '-rf', delete_file])
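# --- Illustrative sketch, not the pipeline's own shift_reads() ---
# The ATAC-seq step above shifts read coordinates (+4 on the plus strand, -5 on
# the minus strand, per the original ATAC-seq protocol) via a shift_reads()
# method defined elsewhere in the class. A minimal version could look like the
# sketch below; the function name is hypothetical, the real method also takes a
# log_filepath (omitted here), the column layout assumes standard 6-column BED
# from bedtools bamtobed, and the genome-sizes file has "chrom<TAB>length" rows.
def sketch_shift_reads(input_bed, output_bed, genome_sizes,
                       plus_strand_shift=4, minus_strand_shift=-5):
    """Shift BED intervals by strand, clamping to chromosome bounds."""
    chrom_sizes = {}
    with open(genome_sizes) as sizes:
        for line in sizes:
            chrom, length = line.strip().split('\t')[:2]
            chrom_sizes[chrom] = int(length)
    with open(input_bed) as bed_in, open(output_bed, 'w') as bed_out:
        for line in bed_in:
            fields = line.strip().split('\t')
            chrom, start, end, strand = fields[0], int(fields[1]), int(fields[2]), fields[5]
            shift = plus_strand_shift if strand == '+' else minus_strand_shift
            start, end = start + shift, end + shift
            # Clamp the shifted interval to [0, chromosome length]
            limit = chrom_sizes.get(chrom)
            start = max(start, 0)
            if limit is not None:
                end = min(end, limit)
            if start < end:
                fields[1], fields[2] = str(start), str(end)
                bed_out.write('\t'.join(fields) + '\n')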
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate options reads = pipeline_args['reads'] output_dir = pipeline_args['output'] logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = pipeline_args['step'] forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] run_is_stranded = pipeline_args['is_stranded'] # Determine if run is paired-end from input run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1 # Create output, tmp, and logs directories subprocess.call(['mkdir', '-p', output_dir, logs_dir, os.path.join(output_dir, 'tmp')]) # Timing functions for getting running time start_time = datetime.now() # Gather QC data qc_data = { 'total_raw_reads_counts': [], 'trimmed_reads_counts': [], 'num_reads_mapped': '0', 'running_time_seconds': '', 'running_time_readable': '' } # Keep list of items to delete staging_delete = [os.path.join(output_dir, 'tmp')] # Establish software instances cat = Software('cat', '/bin/cat') cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) star = Software('STAR', pipeline_config['STAR']['path']) rsem_calculat_expression = Software('RSEM', pipeline_config['RSEM']['path-calculate-expression']) rsem_plot_model = Software('RSEM', pipeline_config['RSEM']['path-plot-model']) bedGraph_to_bw = Software('bedGraphToBigWig', pipeline_config['bedgraph_to_bw']['path']) bed_sort = Software('bedSort', pipeline_config['bedSort']['path']) samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat') # Step 1: If more than one reads pairs are provided, combine them if step <= 1 and len(reads) >= 2: if run_is_paired_end: # Aggregate read1s and read2s read1s, read2s = [], [] for reads_set in reads: read1, read2 = reads_set.split(':') read1s.append(read1) read2s.append(read2) # Combine reads groups combined_reads = [] for name, reads_group in [('read1', read1s), ('read2', read2s)]: combined_read_filename = os.path.join(output_dir, '{}.combined.{}.fastq.gz'.format(lib_prefix, name)) combined_reads.append(combined_read_filename) staging_delete.append(combined_read_filename) cat.run( Parameter(*[read for read in reads_group]), Redirect(stream=Redirect.STDOUT, dest=combined_read_filename) ) # Update reads list reads = [':'.join(combined_reads)] else: # Combine reads combined_read_filename = os.path.join(output_dir, '{}.combined.fastq.gz'.format(lib_prefix)) staging_delete.append(combined_read_filename) cat.run( Parameter(*[read for read in reads]), Redirect(stream=Redirect.STDOUT, dest=combined_read_filename) ) # Update reads list reads = [combined_read_filename] # Step 2: Trim adapters with cutadapt if step <= 2: reads_set = reads[FIRST_READS_PAIR] if run_is_paired_end: # Get paired-end reads, construct new filenames read1, read2 = reads_set.split(':') # QC: Get raw fastq read counts qc_data['total_raw_reads_counts'].extend([ str(int(self.count_gzipped_lines(read1))/4), str(int(self.count_gzipped_lines(read2))/4) ]) trimmed_read1_filename = os.path.join(output_dir, lib_prefix + '_read1.trimmed.fastq.gz') trimmed_read2_filename = os.path.join(output_dir, lib_prefix + '_read2.trimmed.fastq.gz') staging_delete.append(trimmed_read1_filename) staging_delete.append(trimmed_read2_filename) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read1_filename)), 
Parameter('--paired-output={}'.format(trimmed_read2_filename)), Parameter('-a', forward_adapter), Parameter('-A', reverse_adapter), Parameter('-q', '30'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log')) ) # QC: Get trimmed fastq read counts qc_data['trimmed_reads_counts'].extend([ str(int(self.count_gzipped_lines(trimmed_read1_filename))/4), str(int(self.count_gzipped_lines(trimmed_read2_filename))/4) ]) # Update reads list reads = ':'.join([trimmed_read1_filename, trimmed_read2_filename]) else: # QC: Get raw fastq read count qc_data['total_raw_reads_counts'].append( str(int(self.count_gzipped_lines( os.path.join(output_dir, '{}.combined.fastq.gz'.format(lib_prefix)) ))/4) ) # Construct new filename trimmed_read_filename = os.path.join(output_dir, lib_prefix + '.trimmed.fastq.gz') staging_delete.append(trimmed_read_filename) # Run cutadapt cutadapt.run( Parameter('--quality-base={}'.format(pipeline_config['cutadapt']['quality-base'])), Parameter('--minimum-length=5'), Parameter('--output={}'.format(trimmed_read_filename)), Parameter('-a', forward_adapter), Parameter('-q', '30'), Parameter(reads[FIRST_READS_PAIR]), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary')) ) # QC: Get trimmed fastq read count qc_data['trimmed_reads_counts'].append( str(int(self.count_gzipped_lines(trimmed_read_filename))/4) ) # Update reads list reads = [trimmed_read_filename] # Step 3: Alignment if step <= 3: # Gets reads for paired-end and single-end if run_is_paired_end: read1, read2 = reads.split(':') else: read1 = reads[FIRST_READS_PAIR] read2 = '' # Set up STAR parameters star_outfile_prefix = os.path.join(output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '')) star_common = [ Parameter('--outFileNamePrefix', star_outfile_prefix), Parameter('--genomeDir', pipeline_config['STAR']['genome-dir']), Parameter('--readFilesIn', read1, read2), Parameter('--readFilesCommand', 'zcat'), Parameter('--outFilterType', 'BySJout'), Parameter('--outFilterMultimapNmax', '20'), Parameter('--alignSJoverhangMin', '8'), Parameter('--alignSJDBoverhangMin', '1'), Parameter('--outFilterMismatchNmax', '999'), Parameter('--alignIntronMin', '20'), Parameter('--alignIntronMax', '1000000'), Parameter('--alignMatesGapMax', '1000000'), Parameter('--outSAMunmapped', 'Within'), Parameter('--outSAMattributes', 'NH', 'HI', 'AS', 'NM', 'MD'), Parameter('--outFilterMismatchNoverReadLmax', '0.04'), Parameter('--sjdbScore', '1') ] star_run = [ Parameter('--runThreadN', pipeline_config['STAR']['threads']), #Parameter('--genomeLoad', 'LoadAndKeep'), #Parameter('--limitBAMsortRAM', '10000000000') ] star_bam = [ Parameter('--outSAMtype', 'BAM', 'SortedByCoordinate'), Parameter('--quantMode', 'TranscriptomeSAM') ] star_strand, star_wig = [], [] # STAR strandedness parameters if run_is_stranded: star_wig.append(Parameter('--outWigStrand', 'Stranded')) else: star_strand.append(Parameter('--outSAMstrandField', 'intronMotif')) star_wig.append(Parameter('--outWigStrand', 'Unstranded')) # TODO Encode has SAM Header metadata here, but I'm going to skip it for now star_meta = [] # Run STAR alignment step star.run(*(star_common + star_run + star_bam + star_strand + star_meta)) # Store STAR output files star_output_bam = star_outfile_prefix + 'Aligned.sortedByCoord.out.bam' # QC: Get samtools flagstat samtools_flagstat.run( Parameter(star_output_bam), Redirect(stream=Redirect.STDOUT, dest=star_output_bam + '.flagstat') ) # QC: Get number of mapped 
reads from this BAM with open(star_output_bam + '.flagstat') as flagstats: flagstats_contents = flagstats.read() target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents) if target_line is not None: qc_data['num_reads_mapped'] = str(int(target_line.group(1))/2) # Generate bedGraph signal_output_dir = os.path.join(output_dir, 'signal') subprocess.call(['mkdir', '-p', signal_output_dir]) signal_output_prefix = os.path.join(signal_output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '')) # Run STAR for signal generation star.run( Parameter('--runMode', 'inputAlignmentsFromBAM'), Parameter('--inputBAMfile', star_output_bam), Parameter('--outWigType', 'bedGraph'), Parameter('--outFileNamePrefix', signal_output_prefix), Parameter('--outWigReferencesPrefix', 'chr'), *star_wig ) # Convert bedGraph to bigWig chrNL_txt = os.path.join(output_dir, 'chrNL.txt') with open(chrNL_txt, 'w') as chrNL_filehandle: subprocess.call(['grep', '^chr', os.path.join(pipeline_config['STAR']['genome-dir'], 'chrNameLength.txt') ], stdout=chrNL_filehandle) # Generate temporary signal file path sig_tmp = os.path.join(output_dir, 'sig.tmp') staging_delete.append(sig_tmp) if run_is_stranded: strand = [None, '-', '+'] for i_strand in [1, 2]: for i_mult in ['Unique', 'UniqueMultiple']: # Get signal file for this iteration signal_file = '{}Signal.{}.str{}.out.bg'.format(signal_output_prefix, i_mult, str(i_strand)) # Write to temporary signal file with open(sig_tmp, 'w') as sig_tmp_filehandle: subprocess.call(['grep', '^chr', signal_file], stdout=sig_tmp_filehandle) # Sort sig.tmp with bedSort bed_sort.run( Parameter(sig_tmp), Parameter(sig_tmp) ) # Run bedGraph to bigWig conversion bedGraph_to_bw.run( Parameter(sig_tmp), Parameter(chrNL_txt), Parameter('{}Signal.{}.strand{}.bw'.format( signal_output_prefix,i_mult, strand[i_strand] )) ) else: for i_mult in ['Unique', 'UniqueMultiple']: # Get signal file for this iteration signal_file = '{}Signal.{}.str1.out.bg'.format(signal_output_prefix, i_mult) # Write to temporary signal file with open(sig_tmp, 'w') as sig_tmp_filehandle: subprocess.call(['grep', '^chr', signal_file], stdout=sig_tmp_filehandle) # Sort sig.tmp with bedSort bed_sort.run( Parameter(sig_tmp), Parameter(sig_tmp) ) # Run bedGraph to bigWig conversion bedGraph_to_bw.run( Parameter(sig_tmp), Parameter(chrNL_txt), Parameter('{}Signal.{}.unstranded.bw'.format(signal_output_prefix, i_mult)) ) # Step 4: Sort transcriptome BAM to ensure order of reads to make RSEM output deterministic if step <= 4: # Set BAM file paths, mv transcriptome BAM to temporary name star_outfile_prefix = os.path.join(output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' 
else '')) transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam' tr_bam = star_outfile_prefix + 'Tr.bam' staging_delete.append(tr_bam) subprocess.call(['mv', transcriptome_bam, tr_bam]) # Template command merge_cmd = 'cat <({input1}) <({input2}) | {compress} > {output}' input1_cmd = '{samtools} view -H {bam}' compress_cmd = 'samtools view -@ {threads} -bS -' if run_is_paired_end: input2_cmd = ('{samtools} view -@ {threads} {bam} | ' + 'awk \'{{printf "%s", $0 " "; getline; print}}\' | ' + 'sort -S {ram} -T {tmpdir} | ' + 'tr \' \' \'\\n\'') else: input2_cmd = ('{samtools} view -@ {threads} {bam} | ' + 'sort -S {ram} -T {tmpdir}') print merge_cmd.format( input1=input1_cmd.format( samtools=pipeline_config['samtools']['path'], bam=tr_bam ), input2=input2_cmd.format( samtools=pipeline_config['samtools']['path'], threads=pipeline_config['RSEM']['threads'], bam=tr_bam, ram=pipeline_config['sort']['memory'], tmpdir=os.path.join(output_dir, 'tmp') ), compress=compress_cmd.format( threads=pipeline_config['RSEM']['threads'] ), output=transcriptome_bam ) subprocess.call(merge_cmd.format( input1=input1_cmd.format( samtools=pipeline_config['samtools']['path'], bam=tr_bam ), input2=input2_cmd.format( samtools=pipeline_config['samtools']['path'], threads=pipeline_config['RSEM']['threads'], bam=tr_bam, ram=pipeline_config['sort']['memory'], tmpdir=os.path.join(output_dir, 'tmp') ), compress=compress_cmd.format( threads=pipeline_config['RSEM']['threads'] ), output=transcriptome_bam ), shell=True, executable='/bin/bash') subprocess.call(['rm', tr_bam]) # Step 5: Run RSEM to get quantification if step <= 5: star_outfile_prefix = os.path.join(output_dir, lib_prefix + ('.' if lib_prefix[-1] != '.' else '')) transcriptome_bam = star_outfile_prefix + 'Aligned.toTranscriptome.out.bam' # Set up RSEM parameters rsem_common = [ Parameter('--bam'), Parameter('--estimate-rspd'), Parameter('--calc-ci'), Parameter('--no-bam-output'), Parameter('--seed', '12345') ] rsem_run = [ Parameter('-p', pipeline_config['RSEM']['threads']), Parameter('--ci-memory', pipeline_config['RSEM']['memory']) ] rsem_type = [] if run_is_paired_end: rsem_type.append(Parameter('--paired-end')) if run_is_stranded: rsem_type.append(Parameter('--forward-prob', '0')) # Run RSEM quantification step rsem_calculat_expression.run(*(rsem_common + rsem_run + rsem_type + [ Parameter(transcriptome_bam), Parameter(pipeline_config['RSEM']['reference-dir']), Parameter(os.path.join(output_dir, 'RSEM_Quant')), Redirect(Redirect.BOTH, dest=os.path.join(logs_dir, 'Log.rsem')) ])) # Generate RSEM plot model rsem_plot_model.run( Parameter(os.path.join(output_dir, 'RSEM_Quant'), os.path.join(output_dir, 'Quant.pdf')) ) # QC: Get time delta elapsed_time = datetime.now() - start_time qc_data['running_time_seconds'] = str(elapsed_time.seconds) qc_data['running_time_readable'] = str(elapsed_time) # QC: Output QC data to file with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file: qc_data_file.write(json.dumps(qc_data, indent=4) + '\n') # Delete temporary files for delete_file in staging_delete: subprocess.call(['rm', '-rf', delete_file])
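# --- Illustrative sketch, assuming a simple implementation of count_gzipped_lines ---
# The QC steps above call self.count_gzipped_lines(path) and divide by 4 to turn
# FASTQ line counts into read counts. The method's implementation is not shown in
# this excerpt; one straightforward way to do it is to stream the file through
# Python's gzip module, as below (the name sketch_count_gzipped_lines is a
# placeholder, and the real method may instead shell out to zcat | wc -l).
import gzip

def sketch_count_gzipped_lines(path):
    """Count lines in a gzipped file (e.g. a FASTQ); divide by 4 for reads."""
    count = 0
    with gzip.open(path, 'rb') as handle:
        for _ in handle:
            count += 1
    return count

# Example (commented out): reads = sketch_count_gzipped_lines('sample_read1.fastq.gz') / 4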
def add_pipeline_args(self, parser): parser.add_argument( '--fastq:lib', required=True, nargs='*', help='Fastq input for pipeline:library name(prefix for files)') parser.add_argument('--output', required=True, help='Where pipeline output should go') parser.add_argument('--adapter', default='AGATCGGAAGAGCACACGTCT', help='Adapter sequence for trimming') parser.add_argument( '--threads', default=defaultThreads, help='Threads to be used for multi-threaded programs. Default is 8' ) # chunky run RiboSeq_pipe.py --fastqs # /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz # --output /mnt/cinder/thomas/RiboSeq/test --threads # create variables from parser if wanted fastqFiles = pipeline_args['fastq:lib'] outputDir = pipeline_args['output'] adapter = pipeline_args['adapter'] numThreads = pipeline_args['threads'] # Create output directory subprocess.call(['mkdir', outputDir]) # Software cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) star = Software('STAR', pipeline_config['STAR']['path']) bedtools = Software('bedtools', pipeline_config['bedtools']['path']) bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path']) samtools = Software('samtools', pipeline_config['samtools']['path']) samtools_sort = Software('samtools sort', pipeline_config['samtools']['path']) read_distribution = Software( 'read_distribution.py', pipeline_config['read_distribution']['path']) featureCounts = Software('featureCounts', pipeline_config['featureCounts']['path']) fastQC = Software('FastQC', pipeline_config['FastQC']['path']) picard = Software('picard', pipeline_config['picard']['path']) # Change these to just be done in python script? # Common software tools awk = Software('awk', 'awk') sort = Software('sort', 'sort') uniq = Software('uniq', 'uniq') paste = Software('paste', 'paste') cat = Software('cat', 'cat') grep = Software('grep', 'grep') # Directories and Files pathToGenomeDir = pipeline_config['STAR']['genomeDir'] pathToGenome = pipeline_config['bowtie2']['genome_ref'] pathToGtf = pipeline_config['STAR']['GTF_ref'] pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed'] pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100'] pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed'] pathTo_genomeFasta = pipeline_config['picard']['genomeFasta'] pathTo_ref_flat = pipeline_config['picard']['refFlat'] ''' remove adaptor and trim adaptor sequence: AGATCGGAAGAGCACACGTCT -m 25 discard any reads shorter than 25 nucleotides keep only reads that had the adaptor sequence --discard-untrimmed cutadapt -a AGATCGGAAGAGCACACGTCT -m 25 --discard-untrimmed {filename}.fastq.gz > {filename}_trimmed.fastq.gz 2> {filename}_report.txt Remove adapters Only keep reads with adapters, otherwise artifact Discard reads shorter than 25 bp ''' # Keep track of Bids in pipeline bid_list = [] for fastqlib in fastqFiles: bid_list.append(fastqlib.split(':')[-1]) # Cutadapt for fastqlib in fastqFiles: fastq, bid = fastqlib.split(':') newDir = new_dir(outputDir, bid) # Make new directories to store data subprocess.call(['mkdir', newDir]) # consider multi-threading by splitting in multiple files and then combining cutadapt.run( Parameter('--quality-base=33'), Parameter('--minimum-length=25'), Parameter('--discard-untrimmed'), Parameter('--output={}/{}.trimmed.fastq.gz'.format( newDir, bid)), # Parameter('-a', forward_adapter if forward_adapter else 'AGATCGGAAGAGCACACGTCT'), Parameter('-a', adapter), Parameter(fastq), Redirect(stream=Redirect.STDOUT, dest=os.path.join( newDir, 
'{}.cutadapt.summary.log'.format(bid)))) ''' Bowtie2 bowtie2 --seedlen=23 --un-fq=${filename}_filtered.fq -x $genome -U $file -S | samtools view -Sb - > ${filename}.rts.bam Remove snoRNA, rRNA, tRNA, keep only mRna for alignment ''' for bid in bid_list: newDir = new_dir(outputDir, bid) bowtie2.run( Parameter('--seedlen=23'), Parameter('--threads', numThreads), Parameter('--un-gz {}/{}_filtered.fq.gz'.format(newDir, bid)), Parameter('-x', pathToGenome), # Path to rtsRNA_seqs files Parameter('-U', '{}/{}.trimmed.fastq.gz'.format(newDir, bid)), Parameter('-S'), Parameter('{}/{}.rts.sam'.format(newDir, bid)), Redirect(stream=Redirect.STDOUT, dest=os.path.join(newDir, '{}.bowtie2.log'.format(bid))), Redirect(stream=Redirect.STDERR, dest=os.path.join(newDir, '{}.bowtie2.log2'.format(bid))), shell=True # Look into changing ) # This doesn't work samtools.run( Parameter('view'), Parameter('-Sb'), Parameter('{}/{}.rts.sam'.format(newDir, bid)), Redirect(stream=Redirect.STDOUT, dest=os.path.join(newDir, '{}.rts.bam'.format(bid))), ) ''' Star STAR --runThreadN 6 --sjdbGTFfile gtfFile --outSAMtype BAM Unsorted --outFileNamePrefix {filename}_ --genomeDir /path/to/genome/index --genomeFastaFiles --readFilesIn {filename}_filtered.fq.gz --readFilesCommand zcat Basically RNAseq at this point Align the kept reads from bowtie to the genome ''' # Only load the genome one time: genomeLoad = 'LoadAndKeep'.....Doesn't really work for bid in bid_list: newDir = new_dir(outputDir, bid) # remove genome from memory on last run # genomeLoad = 'LoadAndRemove' star.run( Parameter( '--runThreadN', numThreads), # Change to command line parameter --threads Parameter('--sjdbGTFfile', pathToGtf), Parameter('--outSAMtype', 'BAM', 'Unsorted'), Parameter('--outFileNamePrefix', '{}/{}_'.format(newDir, bid)), Parameter('--genomeDir', pathToGenomeDir), # Parameter('--genomeLoad', genomeLoad), broken Parameter('--readFilesIn', '{}/{}_filtered.fq.gz'.format(newDir, bid)), Parameter('--readFilesCommand zcat') # reads gzipped files ) ''' Sort and extract uniquely mapped reads for QC and further analyses samtools view -H $file > header.sam samtools view $file | grep -w NH:i:1 | cat header.sam - | samtools view -bS - | samtools sort - ${filename}_uniq_sorted rm header.sam Using this file for the rest of the analysis ''' for bid in bid_list: newDir = new_dir(outputDir, bid) samtools.run( Parameter('view'), Parameter('-H'), Parameter('{}/{}_Aligned.out.bam'.format( newDir, bid)), # star outfile name Redirect(stream=Redirect.STDOUT, dest=os.path.join(newDir, '{}.header.sam'.format(bid)))) samtools.run( Parameter('view'), Parameter('{}/{}_Aligned.out.bam'.format( newDir, bid)), # star outfile name Pipe( grep.pipe( Parameter('-w'), Parameter('NH:i:1'), Pipe( cat.pipe( Parameter( os.path.join(newDir, '{}.header.sam'.format(bid)), '-'), Pipe( samtools.pipe( Parameter('view'), Parameter('-bS', '-'), Pipe( samtools.pipe( Parameter('sort'), Parameter( '-', '-o', '{}/{}.uniq_sorted.bam'. 
format(newDir, bid))))))))))) # subprocess.call(['rm', '{}/{}.header.sam'.format(newDir, bid)]) ''' rSeQC to evaluate percent reads mapped to each genomic features read_distribution.py -r hg19_RefSeq.bed12 -i $file ''' for bid in bid_list: newDir = new_dir(outputDir, bid) read_distribution.run( Parameter('-r'), Parameter(pathTo_hg19_bed), Parameter('-i'), Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)), Redirect(stream=Redirect.STDOUT, dest=os.path.join( newDir, '{}.read_distribution.log'.format(bid))), shell=True) ''' codon periodicity annotation=/glusterfs/users/ashieh/annotations/hg19_ccds_exons_plus_start100.bed bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100 awk -v OFS='\t' '{print ($2-($14+100))}' ${filename}_intersect_start100.bed | sort | uniq -c > ${filename}_relative_pos_aggregate.table ''' # bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100 for bid in bid_list: newDir = new_dir(outputDir, bid) bedtools.run( Parameter('intersect'), Parameter('-a {}'.format(pathTo_hg19_bed_start100)), Parameter('-b {}/{}.uniq_sorted.bam'.format(newDir, bid)), Parameter('-s'), Parameter('-bed'), Parameter('-wa'), Parameter('-wb'), Redirect(stream=Redirect.STDOUT, dest=os.path.join( newDir, '{}.intersect_start100.bed'.format(bid))), shell=True) start100_file = open( '{}/{}.intersect_start100.bed'.format(newDir, bid), 'rb') relativePos_file = open( '{}/{}_relative_pos_aggregate.table'.format(newDir, bid), 'wb') distanceList = [] for line in start100_file: splitLine = line.split('\t') # Really is relative start if len(splitLine) >= 7: distance = int(splitLine[7]) - (int(splitLine[1]) + 100) distanceList.append(distance) distanceList.sort() distanceCounting = Counter(distanceList) for key, value in distanceCounting.iteritems(): relativePos_file.write("{}\t{}\n".format(value, key)) # Create chart of relative_positions_aggregate to see codon periodicity for bid in bid_list: newDir = new_dir(outputDir, bid) rpaFile = open( '{dir}/{bid}_relative_pos_aggregate.table'.format(dir=newDir, bid=bid), 'rb') myDict = {} for i in range(-30, 31): myDict[i] = 0 for line in rpaFile: Frequency, start = line.strip().split(' ') if int(start) >= -30 and int(start) <= 30: # print start myDict[int(start)] = Frequency # Change to log scaling? 
freqs = [] starts = [] for i in range(-30, 31): starts.append(i) freqs.append(myDict[i]) # print freqs fig, ax = plt.subplots() # plt.set_title('{} codon periodicity'.format(bid)) plt.xlabel("-30 to 30 relative position") plt.ylabel("Frequency") plt.bar(starts, freqs) fig.savefig('{dir}/{bid}_codon_periodicity_plot.png'.format( dir=newDir, bid=bid)) ''' Picard tools java -jar picard.jar CollectMultipleMetrics I=2017-221.uniq_sorted.bam O= multiple_metrics R=GRCh37.p13.genome.fa java -jar picard.jar CollectGcBiasMetrics I= .uniq O=gc_bias_metrics.txt CHART=gc_bias_metrics.pdf S=summary_metrics.txt R=reference_sequence.fasta java -jar picard.jar CollectRnaSeqMetrics I=input.bam O=output.RNA_Metrics REF_FLAT=ref_flat.txt STRAND=FIRST_READ_TRANSCRIPTION_STRAND java -jar picard.jar MarkDuplicates I=input.bam O=marked_duplicates.bam M=marked_dup_metrics.txt ASSUME_SORTED=true ''' for bid in bid_list: newDir = new_dir(outputDir, bid) picard.run( Parameter('CollectMultipleMetrics'), Parameter('I={}'.format(bam)), # input Parameter('O={}/{}.multiple_metrics'.format(newDir, bid)), # output Parameter('R={}'.format(pathTo_genomeFasta)) # genomeReference ) picard.run( Parameter('CollectGcBiasMetrics'), Parameter('I={}'.format(bam)), # input Parameter('O={}/{}.gc_bias_metrics'.format(newDir, bid)), # output Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format( newDir, bid)), # chart Parameter('S={}/{}.summary_metrics'.format( newDir, bid)), # summary metrics Parameter( 'R={}'.format(pathTo_genomeFasta)) # genome reference ) picard.run( Parameter('CollectRnaSeqMetrics'), Parameter('I={}'.format(bam)), # input Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)), # output Parameter('REF_FLAT={}'.format( '{}'.format(pathTo_ref_flat))), # ref_flat Parameter( 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND') # strandedness ) picard.run( Parameter('MarkDuplicates'), Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)), # input Parameter('O={}/{}.marked_duplicates.bam'.format( newDir, bid)), # output Parameter('M={}/{}.marked_dup_metrics.txt'.format( newDir, bid)), # marked dup metrics Parameter('ASSUME_SORTED=true') # It is sorted ) ''' subread: featureCounts featureCounts -a /path_to_gtf/gencode.v19.annotation.gtf -o <bid>.featureCounts <bid>.uniq_sorted.bam ''' for bid in bid_list: newDir = new_dir(outputDir, bid) featureCounts.run( Parameter('-a', '{}'.format(pathToGtf)), # gtf Parameter('-s', '1'), # strand-specific read counting Parameter('-o', '{}/{}.featureCounts'.format(newDir, bid)), # output Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)) # input ) ''' FastQC fastqc --outdir=/path_to/<bid>/ /path_to_fastq/<bid>.fastq.gz ''' for fastqlib in fastqFiles: fastq, bid = fastqlib.split(':') newDir = new_dir(outputDir, bid) fastQC.run( Parameter('--outdir={}'.format(newDir)), # output Parameter('--t', numThreads), Parameter(fastq) # input )
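# --- Illustrative sketch of the new_dir() helper used above ---
# The RiboSeq pipeline above calls new_dir(outputDir, bid) to build a
# per-library output directory path, then creates it with a separate
# subprocess mkdir call. The helper's definition is not part of this excerpt;
# given how its return value is used ('{}/{}.trimmed.fastq.gz'.format(newDir, bid)),
# it is presumably just a path join, as sketched below with a placeholder name.
import os

def sketch_new_dir(output_dir, bid):
    """Return the per-library output directory path for a given BID."""
    return os.path.join(output_dir, bid)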
def run_pipeline(self, pipeline_args, pipeline_config): # Instantiate variable from argparse read_pairs = pipeline_args['reads'] output_dir = os.path.abspath(pipeline_args['output']) logs_dir = os.path.join(output_dir, 'logs') lib_prefix = pipeline_args['lib'] step = int(pipeline_args['step']) forward_adapter = pipeline_args['forward_adapter'] reverse_adapter = pipeline_args['reverse_adapter'] # Create output, tmp, and logs directories tmp_dir = os.path.join(output_dir, 'tmp') subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir]) #Keep list of items to delete staging_delete = [tmp_dir] bwa_bam_outs = [] qc_data = { 'total_raw_reads_counts': [], 'trimmed_reads_counts': [], 'num_reads_mapped': [], 'num_read_removed_steric_hinderence': '0', 'percent_duplicate_reads': '0', 'num_unique_reads_mapped': [], #implemented 'num_mtDNA_reads_mapped': [], 'percent_mtDNA_reads_mapped': '0' , 'num_reads_mapped_after_filtering': '-1', #TODO This isn't implemented 'num_peaks_called': '-1', #TODO Get number of peaks in annotation sites } #Instatiate software instances cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path']) fastqc = Software('FastQC', pipeline_config['fastqc']['path']) bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln') bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe') samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view') samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat') samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index') samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'] + ' sort') novosort = Software('novosort', pipeline_config['novosort']['path']) picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates') picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics') bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed') bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + 'sort') bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge') bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect') macs2_callpeak = Software('macs2 callpeak', pipeline_config['macs2']['path'] + ' callpeak') if step <= 1: for i, read_pair in enumerate(read_pairs): read1, read2 = read_pair.split(':') #QC: Get raw fastq read counts qc_data['total_raw_reads_counts'].append([ str(int(self.count_gzipped_lines(read1))/4), str(int(self.count_gzipped_lines(read2))/4) ]) trimmed_read1_filename = os.path.join(output_dir, lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i)) trimmed_read2_filename = os.path.join(output_dir, lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i)) cutadapt.run( Parameter('--quality-base=33'), Parameter('--minimum-length=5'), Parameter('-q', '30'), # Minimum quality score Parameter('--output={}'.format(trimmed_read1_filename)), Parameter('--paired-output={}'.format(trimmed_read2_filename)), Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'), Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'), Parameter(read1), Parameter(read2), Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log')) ) # QC: Get trimmed fastq read counts qc_data['trimmed_reads_counts'].append([ 
                str(int(self.count_gzipped_lines(trimmed_read1_filename)) / 4),
                str(int(self.count_gzipped_lines(trimmed_read2_filename)) / 4)
            ])

            staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
            read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

    if step <= 2:
        # Make FastQC directory
        fastqc_output_dir = os.path.join(output_dir, 'fastqc')
        subprocess.call(['mkdir', '-p', fastqc_output_dir])

        for i, read_pair in enumerate(read_pairs):
            for read in read_pair.split(':'):
                fastqc.run(
                    Parameter('--outdir={}'.format(fastqc_output_dir)),
                    Parameter(read)
                )

                bwa_aln.run(
                    Parameter('-t', pipeline_config['bwa']['threads']),
                    Parameter(pipeline_config['bwa']['index-dir']),
                    Parameter(read),
                    Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
                )

                staging_delete.append('{}.sai'.format(read))

    if step <= 3:
        for i, read_pair in enumerate(read_pairs):
            read1, read2 = read_pair.split(':')
            bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

            bwa_sampe.run(
                Parameter('-a', '2000'),  # Maximum insert size
                Parameter('-n', '1'),
                Parameter(pipeline_config['bwa']['index-dir']),
                Parameter('{}.sai'.format(read1)),
                Parameter('{}.sai'.format(read2)),
                Parameter(read1),
                Parameter(read2),
                Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
                Pipe(
                    samtools_view.pipe(
                        Parameter('-hSb'),
                        Parameter('-o', bwa_bam_output),
                        Parameter('-')  # Get input from stdin
                    )
                )
            )

            bwa_bam_outs.append(bwa_bam_output)

    if step <= 4:
        for i, bwa_bam in enumerate(bwa_bam_outs):
            samtools_flagstat.run(
                Parameter(bwa_bam),
                Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
            )

            # QC: Get number of mapped reads from this bam
            try:
                with open(bwa_bam + '.flagstat') as flagstats:
                    flagstats_contents = flagstats.read()
                    target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
                    if target_line is not None:
                        qc_data['num_reads_mapped'].append(str(int(target_line.group(1)) / 2))
                    else:
                        qc_data['num_reads_mapped'].append('0')
            except:
                qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
                    bwa_bam + '.flagstat'
                ))

        sortmerged_bam = os.path.join(output_dir, '{}.sortmerged_bam'.format(lib_prefix))
        steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
        duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
        unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
        unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
        chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

        # Binning reads based on template size
        nucleosome_free_reads = os.path.join(output_dir, '{}.nucleosome_free.bam'.format(lib_prefix))
        mononucleosome_reads = os.path.join(output_dir, '{}.mononucleosome.bam'.format(lib_prefix))
        dinucleosome_reads = os.path.join(output_dir, '{}.dinucleosome.bam'.format(lib_prefix))
        trinucleosome_reads = os.path.join(output_dir, '{}.trinucleosome.bam'.format(lib_prefix))
        chrM_bam = os.path.join(output_dir, '{}.chrM.bam'.format(lib_prefix))
        sorted_for_PE_bam = os.path.join(output_dir, '{}.sorted_for_PE'.format(lib_prefix))

        novosort.run(
            Parameter('--threads', pipeline_config['novosort']['threads']),
            Parameter('--tmpcompression', '6'),
            Parameter('--tmpdir', tmp_dir),
            Parameter(*[bam for bam in bwa_bam_outs]),
            Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
            Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
        )

        # This creates a dependency on pysam
        # Removes reads with template length < 38 due to steric hindrance
        samtools_index.run(Parameter(sortmerged_bam))
        sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
        steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
                                                              template=sortmerged_bam_alignmentfile)

        num_removed = 0
        for read in sortmerged_bam_alignmentfile.fetch():
            if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
                steric_filter_bam_alignmentfile.write(read)
            else:
                num_removed += 1
        qc_data['num_read_removed_steric_hinderence'] = str(num_removed)

        sortmerged_bam_alignmentfile.close()
        steric_filter_bam_alignmentfile.close()

        # Mark and remove duplicates with Picard MarkDuplicates
        markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics')
        picard_mark_dup.run(
            Parameter('INPUT={}'.format(steric_filter_bam)),
            Parameter('OUTPUT={}'.format(duprm_bam)),
            Parameter('TMP_DIR={}'.format(tmp_dir)),
            Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
            Parameter('REMOVE_DUPLICATES=true'),
            Parameter('VALIDATION_STRINGENCY=LENIENT'),
            Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
        )

        # QC: Get percent duplication from the MarkDuplicates metrics
        try:
            with open(markduplicates_metrics_filepath) as markdup_metrics:
                for line in markdup_metrics:
                    if line[FIRST_CHAR] == '#':
                        continue
                    record = line.strip().split('\t')
                    if len(record) == 9:
                        if re.match(r'\d+', record[7]) is not None:
                            qc_data['percent_duplicate_reads'] = record[7]
        except:
            qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

        # Filter down to uniquely mapped reads
        samtools_view.run(
            Parameter('-b'),
            Parameter('-F', '256'),
            Parameter('-q', '10'),
            Parameter('-o', unique_bam),
            Parameter(duprm_bam)
        )

        # Get statistics on uniquely mapped reads
        samtools_flagstat.run(
            Parameter(unique_bam),
            Redirect(stream=Redirect.STDOUT, dest=unique_bam + '.flagstat')
        )

        # QC: Get number of uniquely mapped reads from the flagstat output
        try:
            with open(unique_bam + '.flagstat') as flagstats:
                unique_flagstats_contents = flagstats.read()
                target_line = re.search(r'(\d+) \+ \d+ mapped', unique_flagstats_contents)
                if target_line is not None:
                    qc_data['num_unique_reads_mapped'].append(str(int(target_line.group(1)) / 2))
                else:
                    qc_data['num_unique_reads_mapped'].append('0')
        except:
            qc_data['num_unique_reads_mapped'].append('Could not open flagstats {}'.format(
                unique_bam + '.flagstat'
            ))

        # Make an AlignmentFile object to extract binned reads and chrM reads from the unique bam
        samtools_index.run(Parameter(unique_bam))
        unique_bam_alignmentfile = pysam.AlignmentFile(unique_bam, 'rb')

        # Bin reads into 4 categories depending on the template length the read is derived from:
        # 50-115 (nucleosome-free), 180-247 (mononucleosome), 315-473 (dinucleosome), 558-615 (trinucleosome)
        nucleosome_free_reads_alignmentfile = pysam.AlignmentFile(nucleosome_free_reads, 'wb',
                                                                  template=unique_bam_alignmentfile)
        mononucleosome_reads_alignmentfile = pysam.AlignmentFile(mononucleosome_reads, 'wb',
                                                                 template=unique_bam_alignmentfile)
        dinucleosome_reads_alignmentfile = pysam.AlignmentFile(dinucleosome_reads, 'wb',
                                                               template=unique_bam_alignmentfile)
        trinucleosome_reads_alignmentfile = pysam.AlignmentFile(trinucleosome_reads, 'wb',
                                                                template=unique_bam_alignmentfile)

        # Extract chrM into a new BAM
        chrM_reads_alignmentfile = pysam.AlignmentFile(chrM_bam, 'wb', template=unique_bam_alignmentfile)

        # Binning of nucleosome reads
        for read in unique_bam_alignmentfile.fetch():
            if abs(int(read.template_length)) >= 50 and abs(int(read.template_length)) <= 115:
                nucleosome_free_reads_alignmentfile.write(read)
            elif abs(int(read.template_length)) >= 180 and abs(int(read.template_length)) <= 247:
                mononucleosome_reads_alignmentfile.write(read)
            elif abs(int(read.template_length)) >= 315 and abs(int(read.template_length)) <= 473:
                dinucleosome_reads_alignmentfile.write(read)
            elif abs(int(read.template_length)) >= 558 and abs(int(read.template_length)) <= 615:
                trinucleosome_reads_alignmentfile.write(read)
            else:
                continue

        # Store chrM reads in a separate file
        for read in unique_bam_alignmentfile.fetch():
            if read.reference_name == 'chrM':
                chrM_reads_alignmentfile.write(read)

        nucleosome_free_reads_alignmentfile.close()
        mononucleosome_reads_alignmentfile.close()
        dinucleosome_reads_alignmentfile.close()
        trinucleosome_reads_alignmentfile.close()
        chrM_reads_alignmentfile.close()

        # Get flagstat results for the binned BAMs
        samtools_flagstat.run(
            Parameter(nucleosome_free_reads),
            Redirect(stream=Redirect.STDOUT, dest=nucleosome_free_reads + '.flagstat'))
        samtools_flagstat.run(
            Parameter(mononucleosome_reads),
            Redirect(stream=Redirect.STDOUT, dest=mononucleosome_reads + '.flagstat'))
        samtools_flagstat.run(
            Parameter(dinucleosome_reads),
            Redirect(stream=Redirect.STDOUT, dest=dinucleosome_reads + '.flagstat'))
        samtools_flagstat.run(
            Parameter(trinucleosome_reads),
            Redirect(stream=Redirect.STDOUT, dest=trinucleosome_reads + '.flagstat'))

        # Get statistics on chrM mapped reads
        samtools_index.run(Parameter(chrM_bam))
        samtools_flagstat.run(
            Parameter(chrM_bam),
            Redirect(stream=Redirect.STDOUT, dest=chrM_bam + '.flagstat')
        )

        # QC: Get number of mitochondrial reads mapped from the flagstat output
        try:
            with open(chrM_bam + '.flagstat') as flagstats:
                chrM_flagstats_contents = flagstats.read()
                target_line = re.search(r'(\d+) \+ \d+ mapped', chrM_flagstats_contents)
                if target_line is not None:
                    qc_data['num_mtDNA_reads_mapped'].append(str(int(target_line.group(1)) / 2))
                else:
                    qc_data['num_mtDNA_reads_mapped'].append('0')
        except:
            qc_data['num_mtDNA_reads_mapped'].append('Could not open flagstats {}'.format(
                chrM_bam + '.flagstat'
            ))

        # Remove unmapped reads
        samtools_view.run(
            Parameter('-b'),
            Parameter('-F', '12'),
            Parameter('-o', unmappedrm_bam),
            Parameter(unique_bam)
        )

        # Create BAM index, then remove chrM
        samtools_index.run(
            Parameter(unmappedrm_bam)
        )

        # Remove chrM
        all_chr = [Parameter('chr{}'.format(chromosome))
                   for chromosome in list(map(str, range(1, 23))) + ['X', 'Y']]
        samtools_view.run(
            Parameter('-b'),
            Parameter('-o', chrmrm_bam),
            Parameter(unmappedrm_bam),
            *all_chr
        )

        # Stage delete for temporary files
        staging_delete.extend([
            sortmerged_bam,
            sortmerged_bam + '.bai',  # BAM index file
            steric_filter_bam,
            unique_bam,
            duprm_bam,
            unmappedrm_bam,
            unmappedrm_bam + '.bai',  # BAM index file
            chrmrm_bam
        ])

    if step <= 5:
        # Generate filenames for the final processed BAM and BED
        processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
        unshifted_bed = os.path.join(output_dir, '{}.unshifted_bed'.format(lib_prefix))
        processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
        unshifted_bedpe = os.path.join(output_dir, '{}.unshifted_bedpe'.format(lib_prefix))
        processed_bedpe_to_bed = os.path.join(output_dir, '{}.processed_bedpe_to_bed'.format(lib_prefix))
        # staging_delete.append(unshifted_bed)

        # Generate filename for chrM removed BAM
        chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

        # Remove blacklisted genomic regions
        bedtools_intersect.run(
            Parameter('-v'),
            Parameter('-abam', chrmrm_bam),
            Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
            Parameter('-f', '0.5'),
            Redirect(stream=Redirect.STDOUT, dest=processed_bam)
        )

        # QC: Generate insert size metrics PDF
        picard_insert_metrics.run(
            Parameter('INPUT={}'.format(processed_bam)),
            Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
            Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
        )

        # Generate index for processed BAM
        samtools_index.run(
            Parameter(processed_bam)
        )

        # Convert BAM to BED
        bedtools_bamtobed.run(
            Parameter('-i', processed_bam),
            Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
        )

        # Convert BAM to BEDPE, with specific quality and only properly paired reads, sorted by name
        samtools_view.run(
            Parameter('-uf', '0x2'),
            Parameter('-F', '1548'),
            Parameter('-q', '30'),
            Parameter(processed_bam),
            Pipe(
                samtools_sort.pipe(
                    Parameter('-n'),
                    Parameter('-'),
                    Parameter(sorted_for_PE_bam)
                )
            )
        )

        # Convert name-sorted BAM to BEDPE
        bedtools_bamtobed.run(
            Parameter('-i', str(sorted_for_PE_bam) + '.bam'),
            Parameter('-bedpe'),
            Redirect(stream=Redirect.STDOUT, dest=unshifted_bedpe)
        )

        # Collapse each BEDPE record into a single BED interval spanning both mates
        unshifted_bedpe_to_bed_filepath = os.path.join(output_dir, '{}.unshifted_bedpe_to_bed'.format(lib_prefix))
        with open(unshifted_bedpe) as convert_to_bed, \
                open(unshifted_bedpe_to_bed_filepath, 'w') as unshifted_bedpe_to_bed:
            for line in convert_to_bed:
                (chrpos1, start1, end1, chrpos2, start2, end2,
                 name, score, strand1, strand2) = line.split('\t')
                bedformat = [chrpos1, start1, end2, name, score, strand1, strand2.rstrip('\n')]
                unshifted_bedpe_to_bed.write('\t'.join(bedformat) + '\n')

        staging_delete.append(unshifted_bed)
        staging_delete.append(unshifted_bedpe_to_bed_filepath)

        # Shift + strand reads by 4 and - strand reads by -5, according to the ATAC-seq paper
        # This used to be done with bedtools shift, but is now handled by the pipeline itself
        # (an illustrative sketch of this shift appears after this pipeline)
        self.shift_reads(
            input_bed_filepath=unshifted_bed,
            output_bed_filepath=processed_bed,
            log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
            genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
            minus_strand_shift=MINUS_STRAND_SHIFT,
            plus_strand_shift=PLUS_STRAND_SHIFT
        )

        # TODO Needs modification for the BEDPE format
        self.shift_reads_bedpe(
            input_bed_filepath=unshifted_bedpe_to_bed_filepath,
            output_bed_filepath=processed_bedpe_to_bed,
            log_filepath=os.path.join(logs_dir, 'shift_reads_bedpe_to_bed.logs'),
            genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
            minus_strand_shift=MINUS_STRAND_SHIFT,
            plus_strand_shift=PLUS_STRAND_SHIFT
        )

    # Peak calling with MACS2
    if step <= 6:
        # Regular (narrow) peak calling, default q-value = 0.01
        macs2_callpeak.run(
            Parameter('-t', processed_bed),
            Parameter('-f', 'BED'),
            Parameter('-g', 'hs'),
            Parameter('-n', str(processed_bed) + '_regular_peak_calls'),
            Parameter('--nomodel'),
            Parameter('--extsize', '200'),
            Parameter('--shift', '-100'),
            Parameter('-B', '--SPMR'),  # Generates pileup tracks, bedgraph, fragment pileup per million reads
            Parameter('--call-summits'),
            Parameter('--keep-dup', 'all')
        )

        # Broad peak calling, q-value = 0.05 per the MACS2 suggestion for broad peaks
        macs2_callpeak.run(
            Parameter('-t', processed_bed),
            Parameter('-f', 'BED'),
            Parameter('-g', 'hs'),
            Parameter('-n', str(processed_bed) + '_broad_peak_calls'),
            Parameter('-q', '0.05'),
            Parameter('--nomodel'),
            Parameter('--extsize', '200'),
            Parameter('--shift', '-100'),
            Parameter('--broad'),
            Parameter('--keep-dup', 'all')
        )

        # Regular (narrow) peak calling, default q-value = 0.01, for the processed BEDPE-to-BED file
        # NOTE: BEDPE for MACS2 is not the same format as BEDPE accepted by NGS/UCSC standards
        macs2_callpeak.run(
            Parameter('-t', processed_bedpe_to_bed),
            Parameter('-f', 'BEDPE'),
            Parameter('-g', 'hs'),
            Parameter('-n', str(processed_bedpe_to_bed) + '_regular_peak_calls'),
            Parameter('--nomodel'),
            Parameter('--extsize', '200'),
            Parameter('--shift', '-100'),
            Parameter('-B', '--SPMR'),  # Generates pileup tracks, bedgraph, fragment pileup per million reads
            Parameter('--call-summits'),
            Parameter('--keep-dup', 'all')
        )

        # Broad peak calling, q-value = 0.05 per the MACS2 suggestion, for the processed BEDPE-to-BED file
        # NOTE: BEDPE for MACS2 is not the same format as BEDPE accepted by NGS/UCSC standards
        macs2_callpeak.run(
            Parameter('-t', processed_bedpe_to_bed),
            Parameter('-f', 'BEDPE'),
            Parameter('-g', 'hs'),
            Parameter('-n', str(processed_bedpe_to_bed) + '_broad_peak_calls'),
            Parameter('-q', '0.05'),
            Parameter('--nomodel'),
            Parameter('--extsize', '200'),
            Parameter('--shift', '-100'),
            Parameter('--broad'),
            Parameter('--keep-dup', 'all')
        )

    # QC: Output QC data to file
    with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
        qc_data_file.write(str(qc_data) + '\n')

    # Delete temporary files
    for delete_file in staging_delete:
        subprocess.call(['rm', '-rf', delete_file])
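
# The calls to self.shift_reads() and self.shift_reads_bedpe() above refer to helper methods
# defined elsewhere in this class and not shown in this file. The function below is only an
# illustrative sketch of that step, under the assumption that it applies the Tn5 offset
# correction described in the comment above: + strand intervals are shifted by +4 bp, - strand
# intervals by -5 bp, clamped to the chromosome lengths in a UCSC-style genome sizes file.
# The function name and its exact signature are hypothetical, not the pipeline's real API.
def shift_reads_sketch(input_bed_filepath, output_bed_filepath, genome_sizes_filepath,
                       plus_strand_shift=4, minus_strand_shift=-5):
    # Load chromosome sizes from a two-column <chrom>\t<length> file
    chrom_sizes = {}
    with open(genome_sizes_filepath) as genome_sizes:
        for line in genome_sizes:
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                chrom_sizes[fields[0]] = int(fields[1])

    # Shift each BED6 record (as produced by bedtools bamtobed) by the strand-specific offset
    with open(input_bed_filepath) as in_bed, open(output_bed_filepath, 'w') as out_bed:
        for line in in_bed:
            record = line.strip().split('\t')
            chrom, start, end, strand = record[0], int(record[1]), int(record[2]), record[5]
            shift = plus_strand_shift if strand == '+' else minus_strand_shift
            # Clamp shifted coordinates to [0, chromosome length]
            shifted_start = max(start + shift, 0)
            shifted_end = min(end + shift, chrom_sizes.get(chrom, end + shift))
            out_bed.write('\t'.join([chrom, str(shifted_start), str(shifted_end)] + record[3:]) + '\n')
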
def run_pipeline(self, pipeline_args, pipeline_config):
    # Instantiate options
    reads = pipeline_args['reads']
    output_dir = pipeline_args['output']
    logs_dir = os.path.join(output_dir, 'logs')
    lib_prefix = pipeline_args['lib']
    forward_adapter = pipeline_args['forward_adapter']
    reverse_adapter = pipeline_args['reverse_adapter']
    sailfish_libtype = pipeline_args['sailfish_libtype']

    # Determine if run is paired-end from input
    run_is_paired_end = len(reads[FIRST_READS_PAIR].split(':')) > 1

    # Create output, tmp, and logs directories
    tmp_dir = os.path.join(output_dir, 'tmp')
    subprocess.call(['mkdir', '-p', output_dir, logs_dir, tmp_dir])

    # Keep list of items to delete
    staging_delete = [os.path.join(output_dir, 'tmp')]

    # Establish software instances
    cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
    kallisto = Software('kallisto', pipeline_config['kallisto']['path'])
    sailfish = Software('sailfish', pipeline_config['sailfish']['path'])

    # Combine reads with extra sequencing depth
    if run_is_paired_end:
        # Aggregate read1s and read2s
        read1s, read2s = [], []
        for read in reads:
            read1, read2 = read.split(':')
            read1s.append(read1)
            read2s.append(read2)

        # Combine reads groups
        combined_reads = []
        for name, reads_group in [('read1', read1s), ('read2', read2s)]:
            combined_read_filename = os.path.join(
                output_dir, '{}.combined.{}.fastq.gz'.format(lib_prefix, name))
            combined_reads.append(combined_read_filename)
            staging_delete.append(combined_read_filename)
            with open(combined_read_filename, 'w') as combined_reads_fastq:
                subprocess.call(['cat'] + [read for read in reads_group],
                                stdout=combined_reads_fastq)

        # Update reads
        reads = ':'.join(combined_reads)
    else:
        # Combine reads
        combined_read_filename = os.path.join(
            output_dir, '{}.combined.fastq.gz'.format(lib_prefix))
        staging_delete.append(combined_read_filename)
        with open(combined_read_filename, 'w') as combined_reads_fastq:
            subprocess.call(['cat'] + [read for read in reads],
                            stdout=combined_reads_fastq)

        # Update reads
        reads = combined_read_filename

    # Trim adapters with cutadapt
    cutadapt_common = [
        Parameter('--quality-base={}'.format(
            pipeline_config['cutadapt']['quality-base'])),
        Parameter('--minimum-length={}'.format(
            pipeline_config['cutadapt']['minimum-length'])),
        Parameter('-q', '30'),
        Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary'))
    ]

    if run_is_paired_end:
        read1, read2 = reads.split(':')
        trimmed_read1_filename = os.path.join(
            output_dir, lib_prefix + '_read1.trimmed.fastq.gz')
        trimmed_read2_filename = os.path.join(
            output_dir, lib_prefix + '_read2.trimmed.fastq.gz')
        staging_delete.append(trimmed_read1_filename)
        staging_delete.append(trimmed_read2_filename)

        cutadapt_specific = [
            Parameter('--output={}'.format(trimmed_read1_filename)),
            Parameter('--paired-output={}'.format(trimmed_read2_filename)),
            Parameter('-a', forward_adapter),
            Parameter('-A', reverse_adapter),
            Parameter(read1),
            Parameter(read2)
        ]

        # Update reads
        reads = ':'.join([trimmed_read1_filename, trimmed_read2_filename])
    else:
        # Construct new filename
        trimmed_read_filename = os.path.join(
            output_dir, lib_prefix + '.trimmed.fastq.gz')
        staging_delete.append(trimmed_read_filename)

        cutadapt_specific = [
            Parameter('--output={}'.format(trimmed_read_filename)),
            Parameter('-a', forward_adapter),
            Parameter(reads)
        ]

        # Update reads (a single filepath string, to match the paired-end branch)
        reads = trimmed_read_filename

    # Run cutadapt
    cutadapt.run(*(cutadapt_common + cutadapt_specific))

    # Step 3: Kallisto quantification
    kallisto_common = [
        Parameter('--index={}'.format(
            pipeline_config['kallisto']['index-path'])),
        Parameter('--output-dir={}'.format(
            os.path.join(output_dir, 'kallisto_quant')))
    ]

    if run_is_paired_end:
        read1, read2 = reads.split(':')
        kallisto_ended = [Parameter(read1), Parameter(read2)]
    else:
        kallisto_ended = [Parameter(reads)]

    # Run kallisto
    kallisto.run(*(kallisto_common + kallisto_ended))

    # Step 4: Sailfish quantification
    sailfish_common = [
        Parameter('--index', pipeline_config['sailfish']['index-path']),
        Parameter('--libType', '"{}"'.format(sailfish_libtype)),
        Parameter('--output', os.path.join(output_dir, 'sailfish_quant'))
    ]

    if run_is_paired_end:
        read1, read2 = reads.split(':')
        sailfish_ended = [
            Parameter('-1', '<(zcat {})'.format(read1)),
            Parameter('-2', '<(zcat {})'.format(read2)),
        ]
    else:
        sailfish_ended = [Parameter('-r', '<(zcat {})'.format(reads))]

    # Run sailfish
    sailfish.run(*(sailfish_common + sailfish_ended), shell=True)

    # Delete staged items
    for item in staging_delete:
        subprocess.call(['rm', '-rf', item])
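
# The Software/Parameter/Redirect/Pipe objects used throughout these pipelines come from the
# pipeline framework and are defined elsewhere; their implementation is not shown in this file.
# As a rough, assumed sketch of the pattern (not the framework's actual code), a run() call
# joins its parameters into a single command line. The shell=True form used for sailfish above
# is what allows bash process substitution such as <(zcat reads.fastq.gz) to be interpreted by
# the shell rather than passed literally. The helper name below is hypothetical.
import subprocess


def run_command_sketch(executable_path, *parameters):
    # Example: run_command_sketch('/path/to/sailfish quant', '--libType', '"IU"',
    #                             '-r', '<(zcat reads.fastq.gz)')
    command = ' '.join([executable_path] + [str(p) for p in parameters])
    # Use bash explicitly so that process substitution is available
    return subprocess.call(command, shell=True, executable='/bin/bash')
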
def run_pipeline(self, pipeline_args, pipeline_config):
    # Instantiate Software instances
    fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
    rnaseqc = Software('RNA-SeQC', pipeline_config['RNA-SeQC']['path'])
    picard = {
        subprogram_name: Software('picard {}'.format(subprogram_name),
                                  pipeline_config['picard']['path'] + ' {}'.format(subprogram_name))
        for subprogram_name in {'CreateSequenceDictionary', 'MarkDuplicates', 'CollectRnaSeqMetrics',
                                'CollectInsertSizeMetrics', 'CollectAlignmentSummaryMetrics',
                                'CollectGcBiasMetrics', 'EstimateLibraryComplexity',
                                'AddOrReplaceReadGroups'}
    }
    preseq = {
        subprogram_name: Software('preseq {}'.format(subprogram_name),
                                  pipeline_config['preseq']['path'] + ' {}'.format(subprogram_name))
        for subprogram_name in {'c_curve', 'lc_extrap', 'gc_extrap'}
    }
    bam2mr = Software('bam2mr', pipeline_config['preseq']['bam2mr'])
    featurecounts = Software('featureCounts', pipeline_config['featureCounts']['path'])
    samtools_faidx = Software('samtools faidx', pipeline_config['samtools']['path'] + ' faidx')
    novosort = Software('novosort', pipeline_config['novosort']['path'])

    # Create output and tmp directories
    subprocess.call('mkdir -p {}'.format(pipeline_args['output_dir']), shell=True)
    subprocess.call('mkdir -p {}'.format(pipeline_config['tmp-dir']), shell=True)

    # Sort bam file
    # sorted_bam = os.path.join(pipeline_args['output_dir'], 'sorted.tmp.bam')
    sorted_bam = os.path.join(pipeline_args['output_dir'],
                              '{}.sorted.tmp.bam'.format(pipeline_args['lib']))
    novosort.run(
        Parameter('--index'),
        Parameter('--output', sorted_bam),
        Parameter(pipeline_args['bam'])
    )

    # Run FastQC
    self.run_fastqc(
        fastqc=fastqc,
        pipeline_args=pipeline_args
    )

    # Run RNA-SeQC
    self.run_rnaseqc(
        rnaseqc=rnaseqc,
        picard=picard,
        samtools_faidx=samtools_faidx,
        pipeline_config=pipeline_config,
        pipeline_args=pipeline_args,
        sorted_bam=sorted_bam
    )

    # Run Picard suite
    self.run_picard_suite(
        picard=picard,
        sorted_bam=sorted_bam,
        pipeline_config=pipeline_config,
        pipeline_args=pipeline_args
    )

    # self.run_preseq(
    #     preseq=preseq,
    #     bam2mr=bam2mr,
    #     sorted_bam=sorted_bam,
    #     pipeline_args=pipeline_args
    # )

    # Run featureCounts
    self.run_featurecounts(
        featurecounts=featurecounts,
        sorted_bam=sorted_bam,
        pipeline_args=pipeline_args,
        pipeline_config=pipeline_config
    )

    # Compute percentage of chrM reads
    self.run_chrm_percentage(
        sorted_bam=sorted_bam,
        pipeline_args=pipeline_args
    )

    # Remove temporary sorted bam and its index
    os.remove(sorted_bam)
    os.remove(sorted_bam + '.bai')
    # subprocess.call('rm -rf /mnt/analysis/tmp', shell=True)
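
# run_chrm_percentage() above is a helper method defined elsewhere in this class. As an
# assumed sketch only (not the actual implementation), the mitochondrial-read fraction can be
# derived from an indexed BAM with pysam: total mapped reads come from the BAM index, and
# reads aligned to chrM can be counted directly. The function name below is hypothetical;
# it relies on the .bai index that the novosort --index call above produces.
import pysam


def chrm_percentage_sketch(sorted_bam_path):
    bam = pysam.AlignmentFile(sorted_bam_path, 'rb')
    total_mapped = bam.mapped          # mapped read count taken from the index
    chrm_mapped = bam.count('chrM')    # reads aligned to the mitochondrial contig
    bam.close()
    return 100.0 * chrm_mapped / total_mapped if total_mapped else 0.0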