    def run_pipeline(self, pipeline_args, pipeline_config):
        # create variables from parser if wanted
        bamFiles = pipeline_args['bam:lib']
        outputDir = pipeline_args['output']
        adapter = pipeline_args['adapter']
        numThreads = pipeline_args['threads']

        # Create output directory
        subprocess.call(['mkdir', outputDir])
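        # Note: a pure-Python alternative (sketch, not part of the original
        # pipeline) would be to create the directory with os.makedirs, which
        # avoids spawning a process and can be made idempotent:
        #
        #     if not os.path.isdir(outputDir):
        #         os.makedirs(outputDir)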

        # Software
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        star = Software('STAR', pipeline_config['STAR']['path'])
        bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
        bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path'])
        samtools = Software('samtools', pipeline_config['samtools']['path'])
        samtools_sort = Software('samtools sort',
                                 pipeline_config['samtools']['path'])
        read_distribution = Software(
            'read_distribution.py',
            pipeline_config['read_distribution']['path'])
        featureCounts = Software('featureCounts',
                                 pipeline_config['featureCounts']['path'])
        fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
        picard = Software('picard', pipeline_config['picard']['path'])

        # Change these to just be done in python script?

        # Common software tools
        awk = Software('awk', 'awk')
        sort = Software('sort', 'sort')
        uniq = Software('uniq', 'uniq')
        paste = Software('paste', 'paste')
        cat = Software('cat', 'cat')
        grep = Software('grep', 'grep')
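        # Sketch for the question above: the awk | sort | uniq -c step used later
        # for codon periodicity could be done in plain Python; the FASTQ variant
        # of this pipeline below does exactly that with collections.Counter,
        # roughly:
        #
        #     from collections import Counter
        #     counts = Counter(offsets)  # offsets: list of relative positions
        #     for offset, freq in counts.items():
        #         table_file.write('{}\t{}\n'.format(freq, offset))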

        # Directories and Files
        pathToGenomeDir = pipeline_config['STAR']['genomeDir']
        pathToGenome = pipeline_config['bowtie2']['genome_ref']
        pathToGtf = pipeline_config['STAR']['GTF_ref']
        pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
        pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
        pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
        pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
        pathTo_ref_flat = pipeline_config['picard']['refFlat']
        '''

      remove adaptor and trim
      adaptor sequence: AGATCGGAAGAGCACACGTCT
      -m 25 discard any reads shorter than 25 nucleotides
      keep only reads that had the adaptor sequence --discard-untrimmed

      cutadapt -a AGATCGGAAGAGCACACGTCT -m 25 --discard-untrimmed {filename}.fastq.gz
       > {filename}_trimmed.fastq.gz 2> {filename}_report.txt
      
      Remove adapters
      Only keep reads with adapters, otherwise artifact
      Discard reads shorter than 25 bp
      
    '''
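        # Trimming is not run in this BAM-input variant; a sketch of the cutadapt
        # call described above, as used in the FASTQ variant of this pipeline
        # (fastq, bid, newDir as defined there):
        #
        #     cutadapt.run(
        #         Parameter('--quality-base=33'),
        #         Parameter('--minimum-length=25'),
        #         Parameter('--discard-untrimmed'),
        #         Parameter('--output={}/{}.trimmed.fastq.gz'.format(newDir, bid)),
        #         Parameter('-a', adapter),
        #         Parameter(fastq),
        #         Redirect(stream=Redirect.STDOUT,
        #                  dest=os.path.join(newDir, '{}.cutadapt.summary.log'.format(bid))))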

        # Keep track of Bids in pipeline

        bid_list = []
        for bamLib in bamFiles:
            bid_list.append(bamLib.split(':')[-1])
        '''
      Sort and extract uniquely mapped reads for QC and further analyses
        samtools view -H $file > header.sam
        samtools view $file | grep -w NH:i:1 | cat header.sam - | samtools view -bS - | samtools sort - ${filename}_uniq_sorted
        rm header.sam

      Using this file for the rest of the analysis
    '''

        for bamLib in bamFiles:
            bam, bid = bamLib.split(':')
            newDir = new_dir(outputDir, bid)
            samtools.run(
                Parameter('view'),
                Parameter('-H'),
                Parameter(bam),  # star outfile name
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir,
                                           '{}.header.sam'.format(bid))))
            samtools.run(
                Parameter('view'),
                Parameter(bam),  # star outfile name
                Pipe(
                    grep.pipe(
                        Parameter('-w'), Parameter('NH:i:1'),
                        Pipe(
                            cat.pipe(
                                Parameter(
                                    os.path.join(newDir,
                                                 '{}.header.sam'.format(bid)),
                                    '-'),
                                Pipe(
                                    samtools.pipe(
                                        Parameter('view'),
                                        Parameter('-bS', '-'),
                                        Pipe(
                                            samtools.pipe(
                                                Parameter('sort'),
                                                Parameter(
                                                    '-', '-o',
                                                    '{}/{}.uniq_sorted.bam'.
                                                    format(newDir,
                                                           bid)))))))))))
            # subprocess.call(['rm', '{}/{}.header.sam'.format(newDir, bid)])
        '''
      rSeQC to evaluate percent of reads mapped to each genomic feature
        read_distribution.py -r hg19_RefSeq.bed12 -i $file
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            read_distribution.run(
                Parameter('-r'),
                Parameter(pathTo_hg19_bed),
                Parameter('-i'),
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.read_distribution.log'.format(bid))),
                shell=True)
        '''
      codon periodicity
        annotation=/glusterfs/users/ashieh/annotations/hg19_ccds_exons_plus_start100.bed

        bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100
        awk -v OFS='\t' '{print ($2-($14+100))}' ${filename}_intersect_start100.bed
         | sort | uniq -c > ${filename}_relative_pos_aggregate.table
    '''

        # bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            bedtools.run(
                Parameter('intersect'),
                Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
                Parameter('-b {}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Parameter('-s'),
                Parameter('-bed'),
                Parameter('-wa'),
                Parameter('-wb'),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.intersect_start100.bed'.format(bid))),
                shell=True)
            awk.run(
                Parameter('-v'), Parameter("OFS='\\t'"),
                Parameter('{print ($8-($2+100))}'),
                Parameter('{}/{}.intersect_start100.bed'.format(newDir, bid)),
                Pipe(
                    sort.pipe(
                        Pipe(
                            uniq.pipe(
                                Parameter('-c'),
                                Redirect(stream=Redirect.STDOUT,
                                         dest=os.path.join(
                                             newDir,
                                             '{}_relative_pos_aggregate.table'.
                                             format(bid))))))))
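        # Each *_relative_pos_aggregate.table holds `uniq -c` style lines
        # ("<count> <relative position>"), which the plotting loop below parses.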

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            rpaFile = open(
                '{dir}/{bid}_relative_pos_aggregate.table'.format(dir=newDir,
                                                                  bid=bid),
                'rb')
            myDict = {}

            for i in range(-30, 31):
                myDict[i] = 0

            for line in rpaFile:
                Frequency, start = line.strip().split(' ')
                if int(start) >= -30 and int(start) <= 30:
                    # print start
                    myDict[int(start)] = Frequency

            # print times

            freqs = []
            starts = []
            for i in range(-30, 31):
                starts.append(i)
                freqs.append(myDict[i])

            # print freqs

            fig, ax = plt.subplots()
            # plt.set_title('{} codon periodicity'.format(bid))
            plt.xlabel("-30 to 30 relative position")
            plt.ylabel("Frequency")
            plt.bar(starts, freqs)
            fig.savefig('{dir}/{bid}_codon_periodicity_plot.png'.format(
                dir=newDir, bid=bid))
        '''
    Picard tools

    java -jar picard.jar CollectMultipleMetrics 
    I=2017-221.uniq_sorted.bam 
    O= multiple_metrics 
    R=GRCh37.p13.genome.fa

    java -jar picard.jar CollectGcBiasMetrics
    I= .uniq
    O=gc_bias_metrics.txt 
    CHART=gc_bias_metrics.pdf 
    S=summary_metrics.txt 
    R=reference_sequence.fasta

    java -jar picard.jar CollectRnaSeqMetrics
    I=input.bam 
    O=output.RNA_Metrics 
    REF_FLAT=ref_flat.txt 
    STRAND=FIRST_READ_TRANSCRIPTION_STRAND

    java -jar picard.jar MarkDuplicates
    I=input.bam 
    O=marked_duplicates.bam 
    M=marked_dup_metrics.txt
    ASSUME_SORTED=true
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)

            picard.run(
                Parameter('CollectMultipleMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.multiple_metrics'.format(newDir,
                                                            bid)),  # output
                Parameter('R={}'.format(pathTo_genomeFasta))  # genomeReference
            )

            picard.run(
                Parameter('CollectGcBiasMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.gc_bias_metrics'.format(newDir,
                                                           bid)),  # output
                Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(
                    newDir, bid)),  # chart
                Parameter('S={}/{}.summary_metrics'.format(
                    newDir, bid)),  # summary metrics
                Parameter(
                    'R={}'.format(pathTo_genomeFasta))  # genome reference
            )

            picard.run(
                Parameter('CollectRnaSeqMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
                Parameter('REF_FLAT={}'.format(pathTo_ref_flat)),  # ref_flat
                Parameter(
                    'STRAND=FIRST_READ_TRANSCRIPTION_STRAND')  # strandedness
            )

            picard.run(
                Parameter('MarkDuplicates'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.marked_duplicates.bam'.format(
                    newDir, bid)),  # output
                Parameter('M={}/{}.marked_dup_metrics.txt'.format(
                    newDir, bid)),  # marked dup metrics
                Parameter('ASSUME_SORTED=true')  # sorted
            )
        '''
    subread: featureCounts

      featureCounts -a /path_to_gtf/gencode.v19.annotation.gtf -o <bid>.featureCounts <bid>.uniq_sorted.bam
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            featureCounts.run(
                Parameter('-a', '{}'.format(pathToGtf)),  # gtf
                Parameter('-s', '1'),  # strand-specific read counting 
                Parameter('-o', '{}/{}.featureCounts'.format(newDir,
                                                             bid)),  # output
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid))  # input
            )
    def run_pipeline(self, pipeline_args, pipeline_config):
        # Instantiate variables from argparse
        read_pairs = pipeline_args['reads']
        output_dir = os.path.abspath(pipeline_args['output'])
        logs_dir = os.path.join(output_dir, 'logs')
        lib_prefix = pipeline_args['lib']
        step = int(pipeline_args['step'])
        forward_adapter = pipeline_args['forward_adapter']
        reverse_adapter = pipeline_args['reverse_adapter']

        # Create output, tmp, and logs directories
        tmp_dir = os.path.join(output_dir, 'tmp')
        subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir])

        # Keep list of items to delete
        staging_delete = [tmp_dir]
        bwa_bam_outs = []
        qc_data = {
            'total_raw_reads_counts': [],
            'trimmed_reads_counts': [],
            # TODO Find a better way to store FastQC results
            'num_reads_mapped': [],
            'percent_duplicate_reads': '0',
            'num_unique_reads_mapped': [],  # TODO This isn't implemented
            'num_mtDNA_reads_mapped': [],  # TODO This isn't implemented
            'num_reads_mapped_after_filtering': '-1',  # TODO This isn't implemented
            'num_peaks_called': '-1',
            # TODO Get number of peaks in annotation sites
        }
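        # count_gzipped_lines (used below) is not shown in this excerpt; it
        # presumably counts lines in a gzipped FASTQ so read counts can be
        # derived by dividing by 4. A minimal sketch, assuming that behavior:
        #
        #     import gzip
        #     def count_gzipped_lines(filepath):
        #         with gzip.open(filepath) as f:
        #             return sum(1 for _ in f)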

        # Instantiate software instances
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
        bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln')
        bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe')
        samtools_view = Software('samtools view',
                                 pipeline_config['samtools']['path'] + ' view')
        samtools_flagstat = Software('samtools flagstat',
                                     pipeline_config['samtools']['path'] + ' flagstat')
        samtools_index = Software('samtools index',
                                  pipeline_config['samtools']['path'] + ' index')
        novosort = Software('novosort', pipeline_config['novosort']['path'])
        picard_mark_dup = Software('Picard MarkDuplicates',
                                   pipeline_config['picard']['path'] + ' MarkDuplicates')
        picard_insert_metrics = Software('Picard CollectInsertSizeMetrics',
                                         pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics')
        bedtools_bamtobed = Software('bedtools bamtobed',
                            pipeline_config['bedtools']['path'] + ' bamtobed')
        bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort')
        bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge')
        bedtools_intersect = Software('bedtools intersect',
                                      pipeline_config['bedtools']['path'] + ' intersect')
        homer_maketagdir = Software('HOMER makeTagDirectory',
                                    pipeline_config['makeTagDirectory']['path'])
        homer_findpeaks = Software('HOMER findPeaks', pipeline_config['findPeaks']['path'])
        homer_pos2bed = Software('HOMER pos2bed', pipeline_config['pos2bed']['path'])

        if step <= 1:
            for i, read_pair in enumerate(read_pairs):
                read1, read2 = read_pair.split(':')

                # QC: Get raw fastq read counts
                qc_data['total_raw_reads_counts'].append([
                    str(int(self.count_gzipped_lines(read1))/4),
                    str(int(self.count_gzipped_lines(read2))/4)
                ])

                trimmed_read1_filename = os.path.join(output_dir,
                                                      lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
                trimmed_read2_filename = os.path.join(output_dir,
                                                      lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))

                cutadapt.run(
                    Parameter('--quality-base=33'),
                    Parameter('--minimum-length=5'),
                    Parameter('-q', '30'),  # Minimum quality score
                    Parameter('--output={}'.format(trimmed_read1_filename)),
                    Parameter('--paired-output={}'.format(trimmed_read2_filename)),
                    Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'),
                    Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'),
                    Parameter(read1),
                    Parameter(read2),
                    Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
                )

                # QC: Get trimmed fastq read counts
                qc_data['trimmed_reads_counts'].append([
                    str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
                    str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
                ])

                staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
                read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

        if step <= 2:
            # Make FastQC directory
            fastqc_output_dir = os.path.join(output_dir, 'fastqc')
            subprocess.call(['mkdir', '-p', fastqc_output_dir])
            for i, read_pair in enumerate(read_pairs):
                for read in read_pair.split(':'):
                    fastqc.run(
                        Parameter('--outdir={}'.format(fastqc_output_dir)),
                        Parameter(read)
                    )

                    bwa_aln.run(
                        Parameter('-t', pipeline_config['bwa']['threads']),
                        Parameter(pipeline_config['bwa']['index-dir']),
                        Parameter(read),
                        Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
                    )

                    staging_delete.append('{}.sai'.format(read))

        if step <= 3:
            for i, read_pair in enumerate(read_pairs):
                read1, read2 = read_pair.split(':')
                bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

                bwa_sampe.run(
                    Parameter('-a', '2000'),  # Maximum insert size
                    Parameter('-n', '1'),
                    Parameter(pipeline_config['bwa']['index-dir']),
                    Parameter('{}.sai'.format(read1)),
                    Parameter('{}.sai'.format(read2)),
                    Parameter(read1),
                    Parameter(read2),
                    Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
                    Pipe(
                        samtools_view.pipe(
                            Parameter('-hSb'),
                            Parameter('-o', bwa_bam_output),
                            Parameter('-')  # Get input from stdin
                        )
                    )
                )

                bwa_bam_outs.append(bwa_bam_output)

        if step <= 4:
            for i, bwa_bam in enumerate(bwa_bam_outs):
                samtools_flagstat.run(
                    Parameter(bwa_bam),
                    Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
                )

                # QC: Get number of mapped reads from this BAM
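                # samtools flagstat reports a line of the form
                # "<N> + <M> mapped (...)"; the regex below captures N
                # (QC-passed mapped reads), and dividing by 2 approximates
                # read pairs.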
                try:
                    with open(bwa_bam + '.flagstat') as flagstats:
                        flagstats_contents = flagstats.read()
                        target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
                        if target_line is not None:
                            qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2))
                        else:
                            qc_data['num_reads_mapped'].append('0')
                except:
                    qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
                        bwa_bam + '.flagstat'
                    ))

            sortmerged_bam = os.path.join(output_dir, '{}.sortmerged.bam'.format(lib_prefix))
            steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
            duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
            unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
            unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
            chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

            novosort.run(
                Parameter('--threads', pipeline_config['novosort']['threads']),
                Parameter('--tmpcompression', '6'),
                Parameter('--tmpdir', tmp_dir),
                Parameter(*[bam for bam in bwa_bam_outs]),
                Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
                Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
            )

            # This creates a dependency on PySam
            # Removes reads with template length < 38 due to steric hindrance
            samtools_index.run(Parameter(sortmerged_bam))
            sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
            steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
                                                                  template=sortmerged_bam_alignmentfile)
            for read in sortmerged_bam_alignmentfile.fetch():
                if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
                    steric_filter_bam_alignmentfile.write(read)

            sortmerged_bam_alignmentfile.close()
            steric_filter_bam_alignmentfile.close()

            # Mark and remove duplicates
            markduplicates_metrics_filepath = os.path.join(logs_dir,
                                                           'mark_dup.metrics')
            picard_mark_dup.run(
                Parameter('INPUT={}'.format(steric_filter_bam)),
                Parameter('OUTPUT={}'.format(duprm_bam)),
                Parameter('TMP_DIR={}'.format(tmp_dir)),
                Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
                Parameter('REMOVE_DUPLICATES=true'),
                Parameter('VALIDATION_STRINGENCY=LENIENT'),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
            )

            # QC: Get percent duplicates
            try:
                with open(markduplicates_metrics_filepath) as markdup_metrics:
                    for line in markdup_metrics:
                        if line[FIRST_CHAR] == '#':
                            continue
                        record = line.strip().split('\t')
                        if len(record) == 9:
                            if re.match(r'\d+', record[7]) is not None:
                                qc_data['percent_duplicate_reads'] = record[7]
            except:
                qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

            # Filter down to uniquely mapped reads
            samtools_view.run(
                Parameter('-b'),
                Parameter('-F', '256'),
                Parameter('-q', '10'),
                Parameter('-o', unique_bam),
                Parameter(duprm_bam)
            )

            # Remove unmapped reads
            samtools_view.run(
                Parameter('-b'),
                Parameter('-F', '12'),
                Parameter('-o', unmappedrm_bam),
                Parameter(unique_bam)
            )

            # Create BAM index, then remove chrM
            samtools_index.run(
                Parameter(unmappedrm_bam)
            )

            # Remove chrM
            all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in map(str, range(1, 23)) + ['X', 'Y']]
            samtools_view.run(
                Parameter('-b'),
                Parameter('-o', chrmrm_bam),
                Parameter(unmappedrm_bam),
                *all_chr
            )

            # Stage delete for temporary files
            staging_delete.extend([
                sortmerged_bam,
                sortmerged_bam + '.bai',  # BAM index file
                steric_filter_bam,
                unique_bam,
                duprm_bam,
                unmappedrm_bam,
                unmappedrm_bam + '.bai',  # BAM index file
                chrmrm_bam
            ])

        if step <= 5:
            # Generate filename for final processed BAM and BED
            processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
            unshifted_bed = os.path.join(output_dir, '{}.unshifted.bed'.format(lib_prefix))
            processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))

            # staging_delete.append(unshifted_bed)

            # Generate filename for chrM removed BAM
            chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

            # Remove blacklisted genomic regions
            bedtools_intersect.run(
                Parameter('-v'),
                Parameter('-abam', chrmrm_bam),
                Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
                Parameter('-f', '0.5'),
                Redirect(stream=Redirect.STDOUT, dest=processed_bam)
            )

            # QC: Generate insert size metrics PDF
            picard_insert_metrics.run(
                Parameter('INPUT={}'.format(processed_bam)),
                Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
                Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
            )

            # Generate index for processed BAM
            samtools_index.run(
                Parameter(processed_bam)
            )

            # Convert BAM to BED
            bedtools_bamtobed.run(
                Parameter('-i', processed_bam),
                Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
            )

            staging_delete.append(unshifted_bed)

            # Shifting + strand by 4 and - strand by -5, according to
            # the ATACseq paper

            # This used to be done with bedtools shift, but that was dropped
            self.shift_reads(
                input_bed_filepath=unshifted_bed,
                output_bed_filepath=processed_bed,
                log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
                genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
                minus_strand_shift=MINUS_STRAND_SHIFT,
                plus_strand_shift=PLUS_STRAND_SHIFT
            )
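            # self.shift_reads is not shown in this excerpt. Based on the call
            # above, a minimal sketch of what it presumably does (shift + strand
            # reads by +4 and - strand reads by -5, clamped to the chromosome
            # sizes; logging omitted):
            #
            #     def shift_reads(self, input_bed_filepath, output_bed_filepath,
            #                     log_filepath, genome_sizes_filepath,
            #                     minus_strand_shift, plus_strand_shift):
            #         sizes = dict(line.split()[:2] for line in open(genome_sizes_filepath))
            #         with open(input_bed_filepath) as bed_in, open(output_bed_filepath, 'w') as bed_out:
            #             for line in bed_in:
            #                 fields = line.strip().split('\t')
            #                 shift = plus_strand_shift if fields[5] == '+' else minus_strand_shift
            #                 start = max(0, int(fields[1]) + shift)
            #                 end = min(int(sizes[fields[0]]), int(fields[2]) + shift)
            #                 fields[1], fields[2] = str(start), str(end)
            #                 bed_out.write('\t'.join(fields) + '\n')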

        if step <= 6:
            processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
            homer_tagdir = os.path.join(output_dir, '{}_tagdir'.format(lib_prefix))
            unsorted_peaks = os.path.join(output_dir, '{}.unsorted.peaks.bed'.format(lib_prefix))
            sorted_peaks = os.path.join(output_dir, '{}.sorted.peaks.bed'.format(lib_prefix))
            merged_peaks = os.path.join(output_dir, '{}.peaks.bed'.format(lib_prefix))

            # Populate HOMER tag directory
            homer_maketagdir.run(
                Parameter(homer_tagdir),
                Parameter('-format', 'bed'),
                Parameter(processed_bed),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'maketagdir.log'))
            )

            # Run HOMER peak calling program
            homer_findpeaks.run(
                Parameter(homer_tagdir),
                Parameter('-fragLength', '0'),
                Parameter('-fdr', '0.01'),
                Parameter('-localSize', '50000'),
                Parameter('-o', 'auto'),
                Parameter('-style', 'dnase'),
                Parameter('-size', '150'),
                Parameter('-minDist', '50'),
                Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'findpeaks.log'))
            )

            # Convert HOMER peaks file to bed format
            homer_pos2bed.run(
                Parameter(os.path.join(homer_tagdir, 'peaks.txt')),
                Redirect(stream=Redirect.STDOUT, dest=unsorted_peaks),
                Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'pos2bed.log'))
            )

            # Sort called peaks bed file
            bedtools_sort.run(
                Parameter('-i', unsorted_peaks),
                Redirect(stream=Redirect.STDOUT, dest=sorted_peaks)
            )

            # Merge peaks to create final peaks file
            bedtools_merge.run(
                Parameter('-i', sorted_peaks),
                Redirect(stream=Redirect.STDOUT, dest=merged_peaks)
            )

            # Stage delete for temporary files
            staging_delete.extend([
                unsorted_peaks,
                sorted_peaks
            ])

        # QC: Output QC data to file
        with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
            qc_data_file.write(str(qc_data) + '\n')

        # Delete temporary files
        for delete_file in staging_delete:
            subprocess.call(['rm', '-rf', delete_file])
    def add_pipeline_args(self, parser):
        parser.add_argument(
            '--fastq:lib',
            required=True,
            nargs='*',
            help='Fastq input for pipeline:library name(prefix for files)')
        parser.add_argument('--output',
                            required=True,
                            help='Where pipeline output should go')
        parser.add_argument('--adapter',
                            default='AGATCGGAAGAGCACACGTCT',
                            help='Adapter sequence for trimming')
        parser.add_argument(
            '--threads',
            default=defaultThreads,
            help='Threads to be used for multi-threaded programs. Default is 8'
        )

        # chunky run RiboSeq_pipe.py --fastqs
        #  /mnt/cinder/thomas/RiboSeq/Lane5/AWS-3_S3_L005_R1_001.fastq.gz
        #  --output /mnt/cinder/thomas/RiboSeq/test --threads

    def run_pipeline(self, pipeline_args, pipeline_config):
        # create variables from parser if wanted
        fastqFiles = pipeline_args['fastq:lib']
        outputDir = pipeline_args['output']
        adapter = pipeline_args['adapter']
        numThreads = pipeline_args['threads']

        # Create output directory
        subprocess.call(['mkdir', outputDir])

        # Software
        cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
        star = Software('STAR', pipeline_config['STAR']['path'])
        bedtools = Software('bedtools', pipeline_config['bedtools']['path'])
        bowtie2 = Software('bowtie2', pipeline_config['bowtie2']['path'])
        samtools = Software('samtools', pipeline_config['samtools']['path'])
        samtools_sort = Software('samtools sort',
                                 pipeline_config['samtools']['path'])
        read_distribution = Software(
            'read_distribution.py',
            pipeline_config['read_distribution']['path'])
        featureCounts = Software('featureCounts',
                                 pipeline_config['featureCounts']['path'])
        fastQC = Software('FastQC', pipeline_config['FastQC']['path'])
        picard = Software('picard', pipeline_config['picard']['path'])

        # Change these to just be done in python script?

        # Common software tools
        awk = Software('awk', 'awk')
        sort = Software('sort', 'sort')
        uniq = Software('uniq', 'uniq')
        paste = Software('paste', 'paste')
        cat = Software('cat', 'cat')
        grep = Software('grep', 'grep')

        # Directories and Files
        pathToGenomeDir = pipeline_config['STAR']['genomeDir']
        pathToGenome = pipeline_config['bowtie2']['genome_ref']
        pathToGtf = pipeline_config['STAR']['GTF_ref']
        pathTo_hg19_bed = pipeline_config['read_distribution']['hg19_bed']
        pathTo_hg19_bed_start100 = pipeline_config['bedtools']['hg19_start100']
        pathTo_grch37_bed = pipeline_config['bedtools']['grch37_bed']
        pathTo_genomeFasta = pipeline_config['picard']['genomeFasta']
        pathTo_ref_flat = pipeline_config['picard']['refFlat']
        '''

      remove adaptor and trim
      adaptor sequence: AGATCGGAAGAGCACACGTCT
      -m 25 discard any reads shorter than 25 nucleotides
      keep only reads that had the adaptor sequence --discard-untrimmed

      cutadapt -a AGATCGGAAGAGCACACGTCT -m 25 --discard-untrimmed {filename}.fastq.gz
       > {filename}_trimmed.fastq.gz 2> {filename}_report.txt
      
      Remove adapters
      Only keep reads with adapters, otherwise artifact
      Discard reads shorter than 25 bp
      
    '''

        # Keep track of Bids in pipeline

        bid_list = []
        for fastqlib in fastqFiles:
            bid_list.append(fastqlib.split(':')[-1])

        # Cutadapt

        for fastqlib in fastqFiles:
            fastq, bid = fastqlib.split(':')
            newDir = new_dir(outputDir, bid)
            # Make new directories to store data
            subprocess.call(['mkdir', newDir])

            # consider multi-threading by splitting in multiple files and then combining

            cutadapt.run(
                Parameter('--quality-base=33'),
                Parameter('--minimum-length=25'),
                Parameter('--discard-untrimmed'),
                Parameter('--output={}/{}.trimmed.fastq.gz'.format(
                    newDir, bid)),
                # Parameter('-a', forward_adapter if forward_adapter else 'AGATCGGAAGAGCACACGTCT'),
                Parameter('-a', adapter),
                Parameter(fastq),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.cutadapt.summary.log'.format(bid))))
        ''' 
    Bowtie2
    
    bowtie2 --seedlen=23 --un-fq=${filename}_filtered.fq -x $genome -U $file
     -S | samtools view -Sb - > ${filename}.rts.bam

    Remove snoRNA, rRNA, tRNA, keep only mRna for alignment

    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            bowtie2.run(
                Parameter('--seedlen=23'),
                Parameter('--threads', numThreads),
                Parameter('--un-gz {}/{}_filtered.fq.gz'.format(newDir, bid)),
                Parameter('-x', pathToGenome),  # Path to rtsRNA_seqs files
                Parameter('-U', '{}/{}.trimmed.fastq.gz'.format(newDir, bid)),
                Parameter('-S'),
                Parameter('{}/{}.rts.sam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir,
                                           '{}.bowtie2.log'.format(bid))),
                Redirect(stream=Redirect.STDERR,
                         dest=os.path.join(newDir,
                                           '{}.bowtie2.log2'.format(bid))),
                shell=True  # Look into changing     
            )

            # This doesn't work

            samtools.run(
                Parameter('view'),
                Parameter('-Sb'),
                Parameter('{}/{}.rts.sam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir, '{}.rts.bam'.format(bid))),
            )
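            # If the stdout redirect above is what fails, an alternative
            # (untested sketch) is to let samtools write the BAM itself via -o
            # instead of redirecting stdout:
            #
            #     samtools.run(
            #         Parameter('view'),
            #         Parameter('-Sb'),
            #         Parameter('-o', '{}/{}.rts.bam'.format(newDir, bid)),
            #         Parameter('{}/{}.rts.sam'.format(newDir, bid)))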
        '''
    Star 
      STAR --runThreadN 6 --sjdbGTFfile gtfFile --outSAMtype  BAM Unsorted 
        --outFileNamePrefix {filename}_ --genomeDir /path/to/genome/index 
        --genomeFastaFiles --readFilesIn 
        {filename}_filtered.fq.gz --readFilesCommand zcat

    Basically RNAseq at this point

    Align the kept reads from bowtie to the genome
    '''

        # Only load the genome one time: genomeLoad = 'LoadAndKeep' (doesn't really work)

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            # remove genome from memory on last run
            # genomeLoad = 'LoadAndRemove'
            star.run(
                Parameter(
                    '--runThreadN',
                    numThreads),  # Change to command line parameter --threads
                Parameter('--sjdbGTFfile', pathToGtf),
                Parameter('--outSAMtype', 'BAM', 'Unsorted'),
                Parameter('--outFileNamePrefix', '{}/{}_'.format(newDir, bid)),
                Parameter('--genomeDir', pathToGenomeDir),
                # Parameter('--genomeLoad', genomeLoad), broken
                Parameter('--readFilesIn',
                          '{}/{}_filtered.fq.gz'.format(newDir, bid)),
                Parameter('--readFilesCommand zcat')  # reads gzipped files
            )
        '''
      Sort and extract uniquely mapped reads for QC and further analyses
        samtools view -H $file > header.sam
        samtools view $file | grep -w NH:i:1 | cat header.sam - | samtools view -bS - | samtools sort - ${filename}_uniq_sorted
        rm header.sam

      Using this file for the rest of the analysis
    '''

        for bid in bid_list:

            newDir = new_dir(outputDir, bid)
            samtools.run(
                Parameter('view'),
                Parameter('-H'),
                Parameter('{}/{}_Aligned.out.bam'.format(
                    newDir, bid)),  # star outfile name
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(newDir,
                                           '{}.header.sam'.format(bid))))
            samtools.run(
                Parameter('view'),
                Parameter('{}/{}_Aligned.out.bam'.format(
                    newDir, bid)),  # star outfile name
                Pipe(
                    grep.pipe(
                        Parameter('-w'), Parameter('NH:i:1'),
                        Pipe(
                            cat.pipe(
                                Parameter(
                                    os.path.join(newDir,
                                                 '{}.header.sam'.format(bid)),
                                    '-'),
                                Pipe(
                                    samtools.pipe(
                                        Parameter('view'),
                                        Parameter('-bS', '-'),
                                        Pipe(
                                            samtools.pipe(
                                                Parameter('sort'),
                                                Parameter(
                                                    '-', '-o',
                                                    '{}/{}.uniq_sorted.bam'.
                                                    format(newDir,
                                                           bid)))))))))))
            # subprocess.call(['rm', '{}/{}.header.sam'.format(newDir, bid)])
        '''
      rSeQC to evaluate percent of reads mapped to each genomic feature
        read_distribution.py -r hg19_RefSeq.bed12 -i $file
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            read_distribution.run(
                Parameter('-r'),
                Parameter(pathTo_hg19_bed),
                Parameter('-i'),
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.read_distribution.log'.format(bid))),
                shell=True)
        '''
      codon periodicity
        annotation=/glusterfs/users/ashieh/annotations/hg19_ccds_exons_plus_start100.bed

        bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100
        awk -v OFS='\t' '{print ($2-($14+100))}' ${filename}_intersect_start100.bed
         | sort | uniq -c > ${filename}_relative_pos_aggregate.table
    '''

        # bedtools intersect -a {annotation} -b {uniq.bam} -s -bed -wa -wb > intersect_start100

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            bedtools.run(
                Parameter('intersect'),
                Parameter('-a {}'.format(pathTo_hg19_bed_start100)),
                Parameter('-b {}/{}.uniq_sorted.bam'.format(newDir, bid)),
                Parameter('-s'),
                Parameter('-bed'),
                Parameter('-wa'),
                Parameter('-wb'),
                Redirect(stream=Redirect.STDOUT,
                         dest=os.path.join(
                             newDir, '{}.intersect_start100.bed'.format(bid))),
                shell=True)
            start100_file = open(
                '{}/{}.intersect_start100.bed'.format(newDir, bid), 'rb')
            relativePos_file = open(
                '{}/{}_relative_pos_aggregate.table'.format(newDir, bid), 'wb')
            distanceList = []
            for line in start100_file:
                splitLine = line.split('\t')
                # Really is relative start
                if len(splitLine) >= 8:
                    distance = int(splitLine[7]) - (int(splitLine[1]) + 100)
                    distanceList.append(distance)
            distanceList.sort()
            distanceCounting = Counter(distanceList)
            for key, value in distanceCounting.iteritems():
                relativePos_file.write("{}\t{}\n".format(value, key))

        # Create chart of relative_positions_aggregate to see codon periodicity
        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            rpaFile = open(
                '{dir}/{bid}_relative_pos_aggregate.table'.format(dir=newDir,
                                                                  bid=bid),
                'rb')
            myDict = {}

            for i in range(-30, 31):
                myDict[i] = 0

            for line in rpaFile:
                Frequency, start = line.strip().split()
                if int(start) >= -30 and int(start) <= 30:
                    # print start
                    myDict[int(start)] = Frequency

            # Change to log scaling?

            freqs = []
            starts = []
            for i in range(-30, 31):
                starts.append(i)
                freqs.append(myDict[i])

            # print freqs

            fig, ax = plt.subplots()
            # plt.set_title('{} codon periodicity'.format(bid))
            plt.xlabel("-30 to 30 relative position")
            plt.ylabel("Frequency")
            plt.bar(starts, freqs)
            fig.savefig('{dir}/{bid}_codon_periodicity_plot.png'.format(
                dir=newDir, bid=bid))
        '''
    Picard tools

    java -jar picard.jar CollectMultipleMetrics 
    I=2017-221.uniq_sorted.bam 
    O= multiple_metrics 
    R=GRCh37.p13.genome.fa

    java -jar picard.jar CollectGcBiasMetrics
    I= .uniq
    O=gc_bias_metrics.txt 
    CHART=gc_bias_metrics.pdf 
    S=summary_metrics.txt 
    R=reference_sequence.fasta

    java -jar picard.jar CollectRnaSeqMetrics
    I=input.bam 
    O=output.RNA_Metrics 
    REF_FLAT=ref_flat.txt 
    STRAND=FIRST_READ_TRANSCRIPTION_STRAND

    java -jar picard.jar MarkDuplicates
    I=input.bam 
    O=marked_duplicates.bam 
    M=marked_dup_metrics.txt
    ASSUME_SORTED=true
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)

            picard.run(
                Parameter('CollectMultipleMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)),  # input
                Parameter('O={}/{}.multiple_metrics'.format(newDir,
                                                            bid)),  # output
                Parameter('R={}'.format(pathTo_genomeFasta))  # genomeReference
            )

            picard.run(
                Parameter('CollectGcBiasMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)),  # input
                Parameter('O={}/{}.gc_bias_metrics'.format(newDir,
                                                           bid)),  # output
                Parameter('CHART={}/{}.gc_bias_metrics.pdf'.format(
                    newDir, bid)),  # chart
                Parameter('S={}/{}.summary_metrics'.format(
                    newDir, bid)),  # summary metrics
                Parameter(
                    'R={}'.format(pathTo_genomeFasta))  # genome reference
            )

            picard.run(
                Parameter('CollectRnaSeqMetrics'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir, bid)),  # input
                Parameter('O={}/{}.RNA_Metrics'.format(newDir, bid)),  # output
                Parameter('REF_FLAT={}'.format(pathTo_ref_flat)),  # ref_flat
                Parameter(
                    'STRAND=FIRST_READ_TRANSCRIPTION_STRAND')  # strandedness
            )

            picard.run(
                Parameter('MarkDuplicates'),
                Parameter('I={}/{}.uniq_sorted.bam'.format(newDir,
                                                           bid)),  # input
                Parameter('O={}/{}.marked_duplicates.bam'.format(
                    newDir, bid)),  # output
                Parameter('M={}/{}.marked_dup_metrics.txt'.format(
                    newDir, bid)),  # marked dup metrics
                Parameter('ASSUME_SORTED=true')  # It is sorted
            )
        '''
    subread: featureCounts

      featureCounts -a /path_to_gtf/gencode.v19.annotation.gtf -o <bid>.featureCounts <bid>.uniq_sorted.bam
    '''

        for bid in bid_list:
            newDir = new_dir(outputDir, bid)
            featureCounts.run(
                Parameter('-a', '{}'.format(pathToGtf)),  # gtf
                Parameter('-s', '1'),  # strand-specific read counting 
                Parameter('-o', '{}/{}.featureCounts'.format(newDir,
                                                             bid)),  # output
                Parameter('{}/{}.uniq_sorted.bam'.format(newDir, bid))  # input
            )
        '''
    FastQC

      fastqc --outdir=/path_to/<bid>/ /path_to_fastq/<bid>.fastq.gz
    '''

        for fastqlib in fastqFiles:
            fastq, bid = fastqlib.split(':')
            newDir = new_dir(outputDir, bid)
            fastQC.run(
                Parameter('--outdir={}'.format(newDir)),  # output
                Parameter('--threads', numThreads),
                Parameter(fastq)  # input
            )
	def run_pipeline(self, pipeline_args, pipeline_config):
		# Instantiate variable from argparse
		read_pairs = pipeline_args['reads']
		output_dir = os.path.abspath(pipeline_args['output'])
		logs_dir = os.path.join(output_dir, 'logs')
		lib_prefix = pipeline_args['lib']
		step = int(pipeline_args['step'])
		forward_adapter = pipeline_args['forward_adapter']
		reverse_adapter = pipeline_args['reverse_adapter']

		# Create output, tmp, and logs directories
		tmp_dir = os.path.join(output_dir, 'tmp')
		subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir])

		#Keep list of items to delete
		staging_delete = [tmp_dir]
		bwa_bam_outs = []
		qc_data = {
			'total_raw_reads_counts': [],
			'trimmed_reads_counts': [],
			'num_reads_mapped': [],
			'num_read_removed_steric_hinderence': '0',
			'percent_duplicate_reads': '0',
			'num_unique_reads_mapped': [], #implemented
			'num_mtDNA_reads_mapped': [],
			'percent_mtDNA_reads_mapped': '0' ,
			'num_reads_mapped_after_filtering': '-1', #TODO This isn't implemented
			'num_peaks_called': '-1',
			#TODO Get number of peaks in annotation sites
		}

		# Instantiate software instances
		cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
		fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
		bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln')
		bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe')
		samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view')
		samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat')
		samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index')
		novosort = Software('novosort', pipeline_config['novosort']['path'])
		picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates')
		picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics')
		bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed')
		bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort')
		bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge')
		bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect')
		macs2_callpeak = Software('macs2 callpeak', pipeline_config['macs2']['path'] + ' callpeak')

		if step <= 1:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')

				#QC: Get raw fastq read counts 
				qc_data['total_raw_reads_counts'].append([
					str(int(self.count_gzipped_lines(read1))/4),
					str(int(self.count_gzipped_lines(read2))/4)
				])

				trimmed_read1_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
				trimmed_read2_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))

				cutadapt.run(
					Parameter('--quality-base=33'),
					Parameter('--minimum-length=5'),
					Parameter('-q',  '30'), # Minimum quality score
					Parameter('--output={}'.format(trimmed_read1_filename)),
					Parameter('--paired-output={}'.format(trimmed_read2_filename)),
					Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'),
					Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
				)

				# QC: Get trimmed fastq read counts
				qc_data['trimmed_reads_counts'].append([
					str(int(self.count_gzipped_lines(trimmed_read1_filename))/4),
					str(int(self.count_gzipped_lines(trimmed_read2_filename))/4)
					])

				staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
				read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

		if step <= 2:
			#Make FastQC Directory
			fastqc_output_dir = os.path.join(output_dir, 'fastqc')
			subprocess.call(['mkdir', '-p', fastqc_output_dir])
			for i, read_pair in enumerate(read_pairs):
				for read in read_pair.split(':'):
					fastqc.run(
						Parameter('--outdir={}'.format(fastqc_output_dir)),
						Parameter(read)
					)

					bwa_aln.run(
						Parameter('-t', pipeline_config['bwa']['threads']),
						Parameter(pipeline_config['bwa']['index-dir']),
						Parameter(read),
						Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
					)

					staging_delete.append('{}.sai'.format(read))

		if step <= 3:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')
				bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

				bwa_sampe.run(
					Parameter('-a', '2000'), # Maximum insert size
					Parameter('-n', '1'),
					Parameter(pipeline_config['bwa']['index-dir']),
					Parameter('{}.sai'.format(read1)),
					Parameter('{}.sai'.format(read2)),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
					Pipe(
						samtools_view.pipe(
							Parameter('-hSb'),
							Parameter('-o', bwa_bam_output),
							Parameter('-') # Get input from stdin
						)
					)
				)

				bwa_bam_outs.append(bwa_bam_output)

		if step <= 4:
			for i, bwa_bam in enumerate(bwa_bam_outs):
				samtools_flagstat.run(
					Parameter(bwa_bam),
					Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
				)

				#QC: Get number of mapped reads from this bam
				try:
					with open(bwa_bam + '.flagstat') as flagstats:
						flagstats_contents = flagstats.read()
						target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
						if target_line is not None:
							qc_data['num_reads_mapped'].append(str(int(target_line.group(1))/2))
						else:
							qc_data['num_reads_mapped'].append('0')
				except:
					qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
						bwa_bam + '.flagstat'
					))

			sortmerged_bam = os.path.join(output_dir, '{}.sortmerged.bam'.format(lib_prefix))
			steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
			duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
			unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
			unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))
			# Binning reads based on template size
			nucleosome_free_reads = os.path.join(output_dir, '{}.nucleosome_free.bam'.format(lib_prefix))
			mononucleosome_reads = os.path.join(output_dir, '{}.mononucleosome.bam'.format(lib_prefix))
			dinucleosome_reads = os.path.join(output_dir, '{}.dinucleosome.bam'.format(lib_prefix))
			trinucleosome_reads = os.path.join(output_dir, '{}.trinucleosome.bam'.format(lib_prefix))
			chrM_bam = os.path.join(output_dir, '{}.chrM.bam'.format(lib_prefix))
			
			novosort.run(
				Parameter('--threads', pipeline_config['novosort']['threads']),
				Parameter('--tmpcompression', '6'),
				Parameter('--tmpdir', tmp_dir),
				Parameter(*[bam for bam in bwa_bam_outs]),
				Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
				Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
			)

			# This creates a dependency on pysam
			# Removes reads with template length < 38 due to steric hindrance
			samtools_index.run(Parameter(sortmerged_bam))
			sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
			steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
																	template=sortmerged_bam_alignmentfile)
			
			num_removed=0
			for read in sortmerged_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
					steric_filter_bam_alignmentfile.write(read)
				else:
					num_removed += 1
			qc_data['num_read_removed_steric_hinderence']=str(num_removed)
			
			
			sortmerged_bam_alignmentfile.close()
			steric_filter_bam_alignmentfile.close()

			# Mark and remove duplicates
			markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics')
			picard_mark_dup.run(
				Parameter('INPUT={}'.format(steric_filter_bam)),
				Parameter('OUTPUT={}'.format(duprm_bam)),
				Parameter('TMP_DIR={}'.format(tmp_dir)),
				Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
				Parameter('REMOVE_DUPLICATES=true'),
				Parameter('VALIDATION_STRINGENCY=LENIENT'),
				Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
			)

			# QC: Get percent duplicate reads
			try:
				with open(markduplicates_metrics_filepath) as markdup_metrics:
					for line in markdup_metrics:
						if line[FIRST_CHAR] == '#':
							continue
						record = line.strip().split('\t')
						if len(record) == 9:
							if re.match(r'\d+', record[7]) is not None:
								qc_data['percent_duplicate_reads'] = record[7]
			except:
				qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

			# Filter down to uniquely mapped reads
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '256'),
				Parameter('-q', '10'),
				Parameter('-o', unique_bam),
				Parameter(duprm_bam)
			)

			# Get statistics on uniquely mapped reads
			samtools_flagstat.run(
				Parameter(unique_bam),
				Redirect(stream=Redirect.STDOUT, dest=unique_bam + '.flagstat')
			)

			# QC: Get number of mapped reads from the unique BAM
			try:
				with open(unique_bam + '.flagstat') as flagstats:
					unique_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', unique_flagstats_contents)
					if target_line is not None:
						qc_data['num_unique_reads_mapped'].append(str(int(target_line.group(1))/2))
					else:
						qc_data['num_unique_reads_mapped'].append('0')
			except:
				qc_data['num_unique_reads_mapped'].append('Could not open flagstats {}'.format(
					unique_bam + '.flagstat'
				))

			# make AlignmentFile object to extract binned reads and chrM reads from the unique bam
			samtools_index.run(Parameter(unique_bam))
			unique_bam_alignmentfile = pysam.AlignmentFile(unique_bam, 'rb')
			# Bin reads into 4 categories based on the template (fragment) length the read is derived from:
			# 50-115 (nucleosome-free), 180-247 (mononucleosome), 315-473 (dinucleosome), 558-615 (trinucleosome)
			nucleosome_free_reads_alignmentfile = pysam.AlignmentFile(nucleosome_free_reads, 'wb',
																	template=unique_bam_alignmentfile)
			mononucleosome_reads_alignmentfile = pysam.AlignmentFile(mononucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			dinucleosome_reads_alignmentfile = pysam.AlignmentFile(dinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			trinucleosome_reads_alignmentfile = pysam.AlignmentFile(trinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			
			# Extract chrM into new BAM
			chrM_reads_alignmentfile = pysam.AlignmentFile(chrM_bam, 'wb',
														template=unique_bam_alignmentfile)

			# Binning of nucleosome reads
			for read in unique_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= 50 and abs(int(read.template_length)) <= 115:
					nucleosome_free_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 180 and abs(int(read.template_length)) <= 247:
					mononucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 315 and abs(int(read.template_length)) <= 473:
					dinucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 558 and abs(int(read.template_length)) <= 615:
					trinucleosome_reads_alignmentfile.write(read)
				else:
					continue

			# Store chrM reads in a separate file
			for read in unique_bam_alignmentfile.fetch():
				if read.reference_name == 'chrM':
					chrM_reads_alignmentfile.write(read)
	
			nucleosome_free_reads_alignmentfile.close()
			mononucleosome_reads_alignmentfile.close()
			dinucleosome_reads_alignmentfile.close()
			trinucleosome_reads_alignmentfile.close()
			chrM_reads_alignmentfile.close()
			
			# QC: flagstat results for the binned nucleosome BAMs
			samtools_flagstat.run(
					Parameter(nucleosome_free_reads),
					Redirect(stream=Redirect.STDOUT, dest=nucleosome_free_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(mononucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=mononucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(dinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=dinucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(trinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=trinucleosome_reads + '.flagstat'))

			
			# QC: Get statistics on chrM mapped reads
			samtools_index.run(Parameter(chrM_bam))
			samtools_flagstat.run(
				Parameter(chrM_bam),
				Redirect(stream=Redirect.STDOUT, dest=chrM_bam + '.flagstat')
			)
			try:
				with open(chrM_bam + '.flagstat') as flagstats:
					chrM_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', chrM_flagstats_contents)
					if target_line is not None:
						qc_data['num_mtDNA_reads_mapped'].append(str(int(target_line.group(1)) // 2))
					else:
						qc_data['num_mtDNA_reads_mapped'].append('0')
			except:
				qc_data['num_mtDNA_reads_mapped'].append('Could not open flagstats {}'.format(chrM_bam + '.flagstat'))



			# Remove unmapped reads
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '12'),
				Parameter('-o', unmappedrm_bam),
				Parameter(unique_bam)
			)

			# Create BAM index, then remove chrM
			samtools_index.run(
				Parameter(unmappedrm_bam)
			)

			# Remove chrM
			all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in list(map(str, range(1, 23))) + ['X', 'Y']]
			samtools_view.run(
				Parameter('-b'),
				Parameter('-o', chrmrm_bam),
				Parameter(unmappedrm_bam),
				*all_chr
			)
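			# The call above is roughly equivalent to (illustrative):
			#   samtools view -b -o <lib>.chrmrm.bam <lib>.unmappedrm.bam chr1 chr2 ... chr22 chrX chrY
			# Restricting output to the listed chromosomes drops chrM (and any unplaced contigs);
			# the BAM index created above is required for this region-based extraction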

			# Stage delete for temporary files
			staging_delete.extend([
				sortmerged_bam,
				sortmerged_bam + '.bai', # BAM index file
				steric_filter_bam,
				unique_bam,
				duprm_bam,
				unmappedrm_bam,
				unmappedrm_bam + '.bai', # BAM index file
				chrmrm_bam
			])

		if step <= 5:
			# Generate filename for final processed BAM and BED
			processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
			unshifted_bed = os.path.join(output_dir, '{}.unshifted_bed'.format(lib_prefix))
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))

			# staging_delete.append(unshifted_bed)

			# Generate filename for chrM removed BAM
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

			# Remove blacklisted genomic regions
			bedtools_intersect.run(
				Parameter('-v'),
				Parameter('-abam', chrmrm_bam),
				Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
				Parameter('-f', '0.5'),
				Redirect(stream=Redirect.STDOUT, dest=processed_bam)
			)
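			# The call above is roughly equivalent to (illustrative):
			#   bedtools intersect -v -abam <lib>.chrmrm.bam -b <blacklist.bed> -f 0.5 > <lib>.processed.bam
			# -v keeps only reads that do NOT overlap the blacklist; -f 0.5 requires at least
			# 50% of the read to overlap a blacklisted region for it to be removed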

			# QC: Generate insert size metrics PDF
			picard_insert_metrics.run(
				Parameter('INPUT={}'.format(processed_bam)),
				Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
				Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
			)

			# Generate index for processed BAM
			samtools_index.run(
				Parameter(processed_bam)
			)

			# Convert BAM to BED
			bedtools_bamtobed.run(
				Parameter('-i', processed_bam),
				Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
			)

			staging_delete.append(unshifted_bed)

			# Shift + strand reads by 4 bp and - strand reads by -5 bp, per the original ATAC-seq paper

			# This used to be done with bedtools shift; it is now handled by a custom method
			self.shift_reads(
				input_bed_filepath=unshifted_bed,
				output_bed_filepath=processed_bed,
				log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
				genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
				minus_strand_shift=MINUS_STRAND_SHIFT,
				plus_strand_shift=PLUS_STRAND_SHIFT
			)
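			# Sketch of the per-record shift applied above (assumption; the actual
			# self.shift_reads implementation may differ):
			#   if strand == '+':
			#       start, end = start + plus_strand_shift, end + plus_strand_shift
			#   else:
			#       start, end = start + minus_strand_shift, end + minus_strand_shift
			#   # then clip [start, end) to the chromosome length from the genome-sizes file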

		# Peak-calling; MACS2
		if step <= 6:
			# Regular (narrow) peak calling with the default q-value of 0.01
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_regular_peak_calls'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('-B', '--SPMR'), # Generates pileup tracks, bedgraph, fragment pileup per million reads
				Parameter('--call-summits'),
				Parameter('--keep-dup', 'all')
			)
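			# The call above is roughly the command issued (illustrative):
			#   macs2 callpeak -t <lib>.processed.bed -f BED -g hs -n <processed.bed>_regular_peak_calls \
			#     --nomodel --extsize 200 --shift -100 -B --SPMR --call-summits --keep-dup all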

			# Broad peak calling with q-value 0.05, per the MACS2 recommendation for broad peaks
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_broad_peak_calls'),
				Parameter('-q', '0.05'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('--broad'),
				Parameter('--keep-dup', 'all')
			)

		# QC: Output QC data to file
		with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
			qc_data_file.write(str(qc_data) + '\n')

		# Delete temporary files
		for delete_file in staging_delete:
			subprocess.call(['rm', '-rf', delete_file])
	def run_pipeline(self, pipeline_args, pipeline_config):
		# Instantiate variable from argparse
		read_pairs = pipeline_args['reads']
		output_dir = os.path.abspath(pipeline_args['output'])
		logs_dir = os.path.join(output_dir, 'logs')
		lib_prefix = pipeline_args['lib']
		step = int(pipeline_args['step'])
		forward_adapter = pipeline_args['forward_adapter']
		reverse_adapter = pipeline_args['reverse_adapter']

		# Create output, tmp, and logs directories
		tmp_dir = os.path.join(output_dir, 'tmp')
		subprocess.call(['mkdir', '-p', output_dir, tmp_dir, logs_dir])

		#Keep list of items to delete
		staging_delete = [tmp_dir]
		bwa_bam_outs = []
		qc_data = {
			'total_raw_reads_counts': [],
			'trimmed_reads_counts': [],
			'num_reads_mapped': [],
			'num_read_removed_steric_hinderence': '0',
			'percent_duplicate_reads': '0',
			'num_unique_reads_mapped': [], #implemented
			'num_mtDNA_reads_mapped': [],
			'percent_mtDNA_reads_mapped': '0',
			'num_reads_mapped_after_filtering': '-1', #TODO This isn't implemented
			'num_peaks_called': '-1',
			#TODO Get number of peaks in annotation sites
		}

		# Instantiate software instances
		cutadapt = Software('cutadapt', pipeline_config['cutadapt']['path'])
		fastqc = Software('FastQC', pipeline_config['fastqc']['path'])
		bwa_aln = Software('BWA aln', pipeline_config['bwa']['path'] + ' aln')
		bwa_sampe = Software('BWA sampe', pipeline_config['bwa']['path'] + ' sampe')
		samtools_view = Software('samtools view', pipeline_config['samtools']['path'] + ' view')
		samtools_flagstat = Software('samtools flagstat', pipeline_config['samtools']['path'] + ' flagstat')
		samtools_index = Software('samtools index', pipeline_config['samtools']['path'] + ' index')
		samtools_sort = Software('samtools sort', pipeline_config['samtools']['path'] + ' sort')
		novosort = Software('novosort', pipeline_config['novosort']['path'])
		picard_mark_dup = Software('Picard MarkDuplicates', pipeline_config['picard']['path'] + ' MarkDuplicates')
		picard_insert_metrics = Software('Picard CollectInsertSizeMetrics', pipeline_config['picard']['path'] + ' CollectInsertSizeMetrics')
		bedtools_bamtobed = Software('bedtools bamtobed', pipeline_config['bedtools']['path'] + ' bamtobed')
		bedtools_sort = Software('bedtools sort', pipeline_config['bedtools']['path'] + ' sort')
		bedtools_merge = Software('bedtools merge', pipeline_config['bedtools']['path'] + ' merge')
		bedtools_intersect = Software('bedtools intersect', pipeline_config['bedtools']['path'] + ' intersect')
		macs2_callpeak = Software('macs2 callpeak', pipeline_config['macs2']['path'] + ' callpeak')

		if step <= 1:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')

				#QC: Get raw fastq read counts 
				qc_data['total_raw_reads_counts'].append([
					str(int(self.count_gzipped_lines(read1)) // 4),
					str(int(self.count_gzipped_lines(read2)) // 4)
				])

				trimmed_read1_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read1.trimmed.fastq.gz'.format(i))
				trimmed_read2_filename = os.path.join(output_dir,
														lib_prefix + '_{}_read2.trimmed.fastq.gz'.format(i))

				cutadapt.run(
					Parameter('--quality-base=33'),
					Parameter('--minimum-length=5'),
					Parameter('-q',  '30'), # Minimum quality score
					Parameter('--output={}'.format(trimmed_read1_filename)),
					Parameter('--paired-output={}'.format(trimmed_read2_filename)),
					Parameter('-a', forward_adapter if forward_adapter else 'ZZZ'),
					Parameter('-A', reverse_adapter if reverse_adapter else 'ZZZ'),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDOUT, dest=os.path.join(logs_dir, 'cutadapt.summary.log'))
				)
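				# The call above is roughly the command issued (illustrative):
				#   cutadapt --quality-base=33 --minimum-length=5 -q 30 \
				#     --output=<lib>_<i>_read1.trimmed.fastq.gz --paired-output=<lib>_<i>_read2.trimmed.fastq.gz \
				#     -a <forward_adapter> -A <reverse_adapter> <read1> <read2> > logs/cutadapt.summary.log
				# 'ZZZ' is passed as a dummy adapter when none is supplied, so cutadapt still runs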

				# QC: Get trimmed fastq read counts
				qc_data['trimmed_reads_counts'].append([
					str(int(self.count_gzipped_lines(trimmed_read1_filename)) // 4),
					str(int(self.count_gzipped_lines(trimmed_read2_filename)) // 4)
				])

				staging_delete.extend([trimmed_read1_filename, trimmed_read2_filename])
				read_pairs[i] = ':'.join([trimmed_read1_filename, trimmed_read2_filename])

		if step <= 2:
			#Make FastQC Directory
			fastqc_output_dir = os.path.join(output_dir, 'fastqc')
			subprocess.call(['mkdir', '-p', fastqc_output_dir])
			for i, read_pair in enumerate(read_pairs):
				for read in read_pair.split(':'):
					fastqc.run(
						Parameter('--outdir={}'.format(fastqc_output_dir)),
						Parameter(read)
					)

					bwa_aln.run(
						Parameter('-t', pipeline_config['bwa']['threads']),
						Parameter(pipeline_config['bwa']['index-dir']),
						Parameter(read),
						Redirect(stream=Redirect.STDOUT, dest='{}.sai'.format(read))
					)

					staging_delete.append('{}.sai'.format(read))

		if step <= 3:
			for i, read_pair in enumerate(read_pairs):
				read1, read2 = read_pair.split(':')
				bwa_bam_output = os.path.join(output_dir, '{}.{}.bam'.format(lib_prefix, i))

				bwa_sampe.run(
					Parameter('-a', '2000'), # Maximum insert size
					Parameter('-n', '1'),
					Parameter(pipeline_config['bwa']['index-dir']),
					Parameter('{}.sai'.format(read1)),
					Parameter('{}.sai'.format(read2)),
					Parameter(read1),
					Parameter(read2),
					Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'bwa_sampe.log')),
					Pipe(
						samtools_view.pipe(
							Parameter('-hSb'),
							Parameter('-o', bwa_bam_output),
							Parameter('-') # Get input from stdin
						)
					)
				)
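				# The call above is roughly the pipeline being run (illustrative):
				#   bwa sampe -a 2000 -n 1 <index> <read1>.sai <read2>.sai <read1> <read2> \
				#     2> logs/bwa_sampe.log | samtools view -hSb -o <lib>.<i>.bam -
				# -a 2000 caps the insert size; -n 1 limits the number of reported multi-hits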

				bwa_bam_outs.append(bwa_bam_output)

		if step <= 4:
			for i, bwa_bam in enumerate(bwa_bam_outs):
				samtools_flagstat.run(
					Parameter(bwa_bam),
					Redirect(stream=Redirect.STDOUT, dest=bwa_bam + '.flagstat')
				)

				#QC: Get number of mapped reads from this bam
				try:
					with open(bwa_bam + '.flagstat') as flagstats:
						flagstats_contents = flagstats.read()
						target_line = re.search(r'(\d+) \+ \d+ mapped', flagstats_contents)
						if target_line is not None:
							qc_data['num_reads_mapped'].append(str(int(target_line.group(1)) // 2))
						else:
							qc_data['num_reads_mapped'].append('0')
				except:
					qc_data['num_reads_mapped'].append('Could not open flagstats {}'.format(
						bwa_bam + '.flagstat'
					))

			sortmerged_bam = os.path.join(output_dir, '{}.sortmerged_bam'.format(lib_prefix))
			steric_filter_bam = os.path.join(output_dir, '{}.steric.bam'.format(lib_prefix))
			duprm_bam = os.path.join(output_dir, '{}.duprm.bam'.format(lib_prefix))
			unique_bam = os.path.join(output_dir, '{}.unique.bam'.format(lib_prefix))
			unmappedrm_bam = os.path.join(output_dir, '{}.unmappedrm.bam'.format(lib_prefix))
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))
			# binning read based off template size
			nucleosome_free_reads = os.path.join(output_dir, '{}.nucleosome_free.bam'.format(lib_prefix))
			mononucleosome_reads = os.path.join(output_dir, '{}.mononucleosome.bam'.format(lib_prefix))
			dinucleosome_reads = os.path.join(output_dir, '{}.dinucleosome.bam'.format(lib_prefix))
			trinucleosome_reads = os.path.join(output_dir, '{}.trinucleosome.bam'.format(lib_prefix))
			chrM_bam = os.path.join(output_dir, '{}.chrM.bam'.format(lib_prefix))
			sorted_for_PE_bam = os.path.join(output_dir, '{}.sorted_for_PE'.format(lib_prefix))

			novosort.run(
				Parameter('--threads', pipeline_config['novosort']['threads']),
				Parameter('--tmpcompression', '6'),
				Parameter('--tmpdir', tmp_dir),
				Parameter(*[bam for bam in bwa_bam_outs]),
				Redirect(stream=Redirect.STDOUT, dest=sortmerged_bam),
				Redirect(stream=Redirect.STDERR, dest=os.path.join(logs_dir, 'novosort.log'))
			)

			# NOTE: this step creates a dependency on pysam
			# Removes reads with template length < 38 bp due to steric hindrance
			samtools_index.run(Parameter(sortmerged_bam))
			sortmerged_bam_alignmentfile = pysam.AlignmentFile(sortmerged_bam, 'rb')
			steric_filter_bam_alignmentfile = pysam.AlignmentFile(steric_filter_bam, 'wb',
																	template=sortmerged_bam_alignmentfile)
			
			num_removed = 0
			for read in sortmerged_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= STERIC_HINDRANCE_CUTOFF:
					steric_filter_bam_alignmentfile.write(read)
				else:
					num_removed += 1
			qc_data['num_read_removed_steric_hinderence'] = str(num_removed)
			
			
			sortmerged_bam_alignmentfile.close()
			steric_filter_bam_alignmentfile.close()

			# Mark and remove duplicates with Picard MarkDuplicates
			markduplicates_metrics_filepath = os.path.join(logs_dir, 'mark_dup.metrics')
			picard_mark_dup.run(
				Parameter('INPUT={}'.format(steric_filter_bam)),
				Parameter('OUTPUT={}'.format(duprm_bam)),
				Parameter('TMP_DIR={}'.format(tmp_dir)),
				Parameter('METRICS_FILE={}'.format(markduplicates_metrics_filepath)),
				Parameter('REMOVE_DUPLICATES=true'),
				Parameter('VALIDATION_STRINGENCY=LENIENT'),
				Redirect(stream=Redirect.BOTH, dest=os.path.join(logs_dir, 'mark_dup.log'))
			)
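			# The call above is roughly the command issued (illustrative; the picard
			# invocation depends on the configured path):
			#   picard MarkDuplicates INPUT=<lib>.steric.bam OUTPUT=<lib>.duprm.bam TMP_DIR=<tmp> \
			#     METRICS_FILE=logs/mark_dup.metrics REMOVE_DUPLICATES=true VALIDATION_STRINGENCY=LENIENT
			# REMOVE_DUPLICATES=true drops duplicate reads from the output rather than only flagging them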

			# QC: Get percent duplication from the MarkDuplicates metrics
			try:
				with open(markduplicates_metrics_filepath) as markdup_metrics:
					for line in markdup_metrics:
						if line[FIRST_CHAR] == '#':
							continue
						record = line.strip().split('\t')
						if len(record) == 9:
							if re.match(r'\d+', record[7]) is not None:
								qc_data['percent_duplicate_reads'] = record[7]
			except:
				qc_data['percent_duplicate_reads'] = 'Could not open MarkDuplicates metrics'

			# Filter down to uniquely mapped reads
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '256'),
				Parameter('-q', '10'),
				Parameter('-o', unique_bam),
				Parameter(duprm_bam)
			)

			# QC: flagstat on the uniquely mapped reads
			samtools_flagstat.run(
				Parameter(unique_bam),
				Redirect(stream=Redirect.STDOUT, dest=unique_bam + '.flagstat')
			)

			# QC: Get number of mapped reads from the unique bam
			try:
				with open(unique_bam + '.flagstat') as flagstats:
					unique_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', unique_flagstats_contents)
					if target_line is not None:
						qc_data['num_unique_reads_mapped'].append(str(int(target_line.group(1)) // 2))
					else:
						qc_data['num_unique_reads_mapped'].append('0')
			except:
				qc_data['num_unique_reads_mapped'].append('Could not open flagstats {}'.format(unique_bam + '.flagstat'))

			# make AlignmentFile object to extract binned reads and chrM reads from the unique bam
			samtools_index.run(Parameter(unique_bam))
			unique_bam_alignmentfile = pysam.AlignmentFile(unique_bam, 'rb')
			# Bin reads into 4 categories based on the template (fragment) length the read is derived from:
			# 50-115 (nucleosome-free), 180-247 (mononucleosome), 315-473 (dinucleosome), 558-615 (trinucleosome)
			nucleosome_free_reads_alignmentfile = pysam.AlignmentFile(nucleosome_free_reads, 'wb',
																	template=unique_bam_alignmentfile)
			mononucleosome_reads_alignmentfile = pysam.AlignmentFile(mononucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			dinucleosome_reads_alignmentfile = pysam.AlignmentFile(dinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			trinucleosome_reads_alignmentfile = pysam.AlignmentFile(trinucleosome_reads, 'wb',
																	template=unique_bam_alignmentfile)
			
			# Extract chrM into new BAM
			chrM_reads_alignmentfile = pysam.AlignmentFile(chrM_bam, 'wb',
														template=unique_bam_alignmentfile)

			# Binning of nucleosome reads
			for read in unique_bam_alignmentfile.fetch():
				if abs(int(read.template_length)) >= 50 and abs(int(read.template_length)) <= 115:
					nucleosome_free_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 180 and abs(int(read.template_length)) <= 247:
					mononucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 315 and abs(int(read.template_length)) <= 473:
					dinucleosome_reads_alignmentfile.write(read)
				elif abs(int(read.template_length)) >= 558 and abs(int(read.template_length)) <= 615:
					trinucleosome_reads_alignmentfile.write(read)
				else:
					continue

			# Store chrM reads in a separate file
			for read in unique_bam_alignmentfile.fetch():
				if read.reference_name == 'chrM':
					chrM_reads_alignmentfile.write(read)
	
			nucleosome_free_reads_alignmentfile.close()
			mononucleosome_reads_alignmentfile.close()
			dinucleosome_reads_alignmentfile.close()
			trinucleosome_reads_alignmentfile.close()
			chrM_reads_alignmentfile.close()
			
			# QC: flagstat results for the binned nucleosome BAMs
			samtools_flagstat.run(
					Parameter(nucleosome_free_reads),
					Redirect(stream=Redirect.STDOUT, dest=nucleosome_free_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(mononucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=mononucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(dinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=dinucleosome_reads + '.flagstat'))

			samtools_flagstat.run(
					Parameter(trinucleosome_reads),
					Redirect(stream=Redirect.STDOUT, dest=trinucleosome_reads + '.flagstat'))

			
			# QC: Get statistics on chrM mapped reads
			samtools_index.run(Parameter(chrM_bam))
			samtools_flagstat.run(
				Parameter(chrM_bam),
				Redirect(stream=Redirect.STDOUT, dest=chrM_bam + '.flagstat')
			)
			try:
				with open(chrM_bam + '.flagstat') as flagstats:
					chrM_flagstats_contents = flagstats.read()
					target_line = re.search(r'(\d+) \+ \d+ mapped', chrM_flagstats_contents)
					if target_line is not None:
						qc_data['num_mtDNA_reads_mapped'].append(str(int(target_line.group(1)) // 2))
					else:
						qc_data['num_mtDNA_reads_mapped'].append('0')
			except:
				qc_data['num_mtDNA_reads_mapped'].append('Could not open flagstats {}'.format(chrM_bam + '.flagstat'))



			# Remove unmapped reads
			samtools_view.run(
				Parameter('-b'),
				Parameter('-F', '12'),
				Parameter('-o', unmappedrm_bam),
				Parameter(unique_bam)
			)

			# Create BAM index, then remove chrM
			samtools_index.run(
				Parameter(unmappedrm_bam)
			)

			# Remove chrM
			all_chr = [Parameter('chr{}'.format(chromosome)) for chromosome in list(map(str, range(1, 23))) + ['X', 'Y']]
			samtools_view.run(
				Parameter('-b'),
				Parameter('-o', chrmrm_bam),
				Parameter(unmappedrm_bam),
				*all_chr
			)

			# Stage delete for temporary files
			staging_delete.extend([
				sortmerged_bam,
				sortmerged_bam + '.bai', # BAM index file
				steric_filter_bam,
				unique_bam,
				duprm_bam,
				unmappedrm_bam,
				unmappedrm_bam + '.bai', # BAM index file
				chrmrm_bam
			])

		if step <= 5:
			# Generate filename for final processed BAM and BED
			processed_bam = os.path.join(output_dir, '{}.processed.bam'.format(lib_prefix))
			unshifted_bed = os.path.join(output_dir, '{}.unshifted_bed'.format(lib_prefix))
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
			unshifted_bedpe = os.path.join(output_dir, '{}.unshifted_bedpe'.format(lib_prefix))
			processed_bedpe_to_bed = os.path.join(output_dir, '{}.processed_bedpe_to_bed'.format(lib_prefix))
			# Regenerated here so the pipeline can be started from step 5
			sorted_for_PE_bam = os.path.join(output_dir, '{}.sorted_for_PE'.format(lib_prefix))
			# staging_delete.append(unshifted_bed)

			# Generate filename for chrM removed BAM
			chrmrm_bam = os.path.join(output_dir, '{}.chrmrm.bam'.format(lib_prefix))

			# Remove blacklisted genomic regions
			bedtools_intersect.run(
				Parameter('-v'),
				Parameter('-abam', chrmrm_bam),
				Parameter('-b', pipeline_config['bedtools']['blacklist-bed']),
				Parameter('-f', '0.5'),
				Redirect(stream=Redirect.STDOUT, dest=processed_bam)
			)

			# QC: Generate insert size metrics PDF
			picard_insert_metrics.run(
				Parameter('INPUT={}'.format(processed_bam)),
				Parameter('OUTPUT={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.metrics'))),
				Parameter('HISTOGRAM_FILE={}'.format(os.path.join(logs_dir, lib_prefix + '.insertsize.pdf')))
			)

			# Generate index for processed BAM
			samtools_index.run(
				Parameter(processed_bam)
			)

			# Convert BAM to BED
			bedtools_bamtobed.run(
				Parameter('-i', processed_bam),
				Redirect(stream=Redirect.STDOUT, dest=unshifted_bed)
			)

			# Filter to properly paired reads with MAPQ >= 30 and name-sort, in preparation for BEDPE conversion
			samtools_view.run(
				Parameter('-uf', '0x2'),
				Parameter('-F', '1548'),
				Parameter('-q', '30'),
				Parameter(processed_bam),
				Pipe(
					samtools_sort.pipe(
						Parameter('-n'),
						Parameter('-'),
						Parameter(sorted_for_PE_bam)
					)
				)
			)
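			# The call above is roughly the pipeline being run (illustrative); note that it
			# assumes the legacy samtools sort syntax, where the last argument is an output
			# prefix and '.bam' is appended automatically:
			#   samtools view -uf 0x2 -F 1548 -q 30 <lib>.processed.bam | samtools sort -n - <lib>.sorted_for_PE
			# -f 0x2 keeps properly paired reads; -F 1548 drops unmapped, mate-unmapped,
			# QC-failed, and duplicate reads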

			# convert bam to BEDPE
			bedtools_bamtobed.run(
				Parameter('-i', str(sorted_for_PE_bam)+'.bam'),
				Parameter('-bedpe'),
				Redirect(stream=Redirect.STDOUT, dest=unshifted_bedpe)
			)
			
			unshifted_bedpe_to_bed_path = os.path.join(output_dir, '{}.unshifted_bedpe_to_bed'.format(lib_prefix))

			# Collapse each BEDPE record into a single BED interval spanning the whole fragment
			# (chrom and start from mate 1, end from mate 2); 'with' closes both file handles
			with open(unshifted_bedpe) as convertToBed, open(unshifted_bedpe_to_bed_path, 'w') as unshifted_bedpe_to_bed:
				for line in convertToBed:
					chrpos1, start1, end1, chrpos2, start2, end2, name, score, strand1, strand2 = line.split('\t')
					bedformat = [chrpos1, start1, end2, name, score, strand1, strand2.rstrip('\n')]
					unshifted_bedpe_to_bed.write('\t'.join(bedformat) + '\n')

			staging_delete.append(unshifted_bed)
			staging_delete.append(unshifted_bedpe_to_bed_path)

			# Shift + strand reads by 4 bp and - strand reads by -5 bp, per the original ATAC-seq paper

			# This used to be done with bedtools shift; it is now handled by a custom method
			self.shift_reads(
				input_bed_filepath=unshifted_bed,
				output_bed_filepath=processed_bed,
				log_filepath=os.path.join(logs_dir, 'shift_reads.logs'),
				genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
				minus_strand_shift=MINUS_STRAND_SHIFT,
				plus_strand_shift=PLUS_STRAND_SHIFT
			)

			# TODO: shift_reads_bedpe needs modification for the BEDPE-derived format
			self.shift_reads_bedpe(
				input_bed_filepath=unshifted_bedpe_to_bed_path,
				output_bed_filepath=processed_bedpe_to_bed,
				log_filepath=os.path.join(logs_dir, 'shift_reads_bedpe_to_bed.logs'),
				genome_sizes_filepath=pipeline_config['bedtools']['genome-sizes'],
				minus_strand_shift=MINUS_STRAND_SHIFT,
				plus_strand_shift=PLUS_STRAND_SHIFT
			)

		# Peak-calling; MACS2
		if step <= 6:
			# Regular (narrow) peak calling with the default q-value of 0.01
			# Regenerate filenames so the pipeline can be started from step 6
			processed_bed = os.path.join(output_dir, '{}.processed.bed'.format(lib_prefix))
			processed_bedpe_to_bed = os.path.join(output_dir, '{}.processed_bedpe_to_bed'.format(lib_prefix))
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_regular_peak_calls'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('-B', '--SPMR'), # Generates pileup tracks, bedgraph, fragment pileup per million reads
				Parameter('--call-summits'),
				Parameter('--keep-dup', 'all')
			)

			# Broad peak calling with q-value 0.05, per the MACS2 recommendation for broad peaks
			macs2_callpeak.run(
				Parameter('-t', processed_bed),
				Parameter('-f', 'BED'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bed) + '_broad_peak_calls'),
				Parameter('-q', '0.05'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('--broad'),
				Parameter('--keep-dup', 'all')
			)

			# Regular (narrow) peak calling, default q-value 0.01, on the BEDPE-derived BED file
			# NOTE: BEDPE for MACS2 is not the same format as BEDPE accepted by NGS/UCSC standards
			macs2_callpeak.run(
				Parameter('-t', processed_bedpe_to_bed),
				Parameter('-f', 'BEDPE'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bedpe_to_bed) + '_regular_peak_calls'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('-B', '--SPMR'), # Generates pileup tracks, bedgraph, fragment pileup per million reads
				Parameter('--call-summits'),
				Parameter('--keep-dup', 'all')
			)

			# Broad peak calling, q-value 0.05 per the MACS2 recommendation, on the BEDPE-derived BED file
			# NOTE: BEDPE for MACS2 is not the same format as BEDPE accepted by NGS/UCSC standards
			macs2_callpeak.run(
				Parameter('-t', processed_bedpe_to_bed),
				Parameter('-f', 'BEDPE'),
				Parameter('-g', 'hs'),
				Parameter('-n', str(processed_bedpe_to_bed) + '_broad_peak_calls'),
				Parameter('-q', '0.05'),
				Parameter('--nomodel'),
				Parameter('--extsize', '200'),
				Parameter('--shift', '-100'),
				Parameter('--broad'),
				Parameter('--keep-dup', 'all')
			)


		# QC: Output QC data to file
		with open(os.path.join(logs_dir, 'qc_metrics.txt'), 'w') as qc_data_file:
			qc_data_file.write(str(qc_data) + '\n')

		# Delete temporary files
		for delete_file in staging_delete:
			subprocess.call(['rm', '-rf', delete_file])