def align_pe(fastq, sai, reference, fastq_basename):
    '''Use BWA to align PE data.'''

    sam_filename = "%s.sam" % (fastq_basename)
    badcigar_filename = "%s.badReads" % (fastq_basename)
    bam_filename = '%s.srt.bam' % (fastq_basename)

    # Remove read pairs with bad CIGAR strings and sort by position
    steps = [
        "bwa sampe -P %s %s %s %s %s" % (
            reference, sai[0], sai[1], fastq[0], fastq[1]),
        "tee %s" % (sam_filename),
        r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
        "sort",
        "uniq"]

    out, err = utils.run_pipe(steps, badcigar_filename)
    if err:
        logger.error("sampe error: %s", err)

    steps = [
        "cat %s" % (sam_filename),
        "grep -v -F -f %s" % (badcigar_filename),
        "samtools view -@%d -Su -" % (cpu_count()),
        "samtools sort -@%d -o %s" % (cpu_count(), bam_filename)]

    out, err = utils.run_pipe(steps)
    if err:
        logger.error("samtools error: %s", err)

    return bam_filename
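# utils.run_pipe is defined elsewhere in this repository. Based solely on
# how it is called in this file (a list of shell command strings chained
# with pipes, an optional output filename for the final step, and a
# (stdout, stderr) return value), a minimal hypothetical sketch might look
# like the function below; the real implementation may differ.
# subprocess and shlex are already imported for convert_mapped_pe below.
def run_pipe_sketch(steps, outfile=None):
    '''Hypothetical sketch of chaining shell steps with pipes.'''
    procs = []
    prev_stdout = None
    for i, step in enumerate(steps):
        is_last = (i == len(steps) - 1)
        # Redirect the final step to outfile when one is given
        stdout = open(outfile, 'wb') if (is_last and outfile) else subprocess.PIPE
        proc = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                                stdout=stdout, stderr=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # let upstream processes receive SIGPIPE
        prev_stdout = proc.stdout
        procs.append(proc)
    # out is None when the final step was redirected to outfile
    out, err = procs[-1].communicate()
    return out, err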
def convert_mapped(bam, tag_filename):
    '''Use bedtools to convert to tagAlign.'''

    out, err = utils.run_pipe([
        "bamToBed -i %s" % (bam),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "gzip -nc"],
        outfile=tag_filename)
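# Example of the transformation above on one illustrative record (made-up
# coordinates): bamToBed emits BED6, and the awk step blanks the read name
# and fixes the score before compression.
#   bamToBed:  chr1  100  136  read1/1  37    +
#   tagAlign:  chr1  100  136  N        1000  +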
def filter_mapped_pe(bam, bam_basename):
    '''Use samtools to filter unmapped reads for PE data.'''

    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"

    # Remove unmapped reads, reads with unmapped mates,
    # non-primary alignments, and reads failing platform QC
    # Remove low-MAPQ reads
    # Only keep properly paired reads
    # Obtain name-sorted BAM file
    out, err = utils.run_pipe([
        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
        # -q 30 exclude MAPQ < 30; -u uncompressed output
        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
        # alignments, not passing platform quality checks, PCR or optical
        # duplicates
        # require FLAG 2: properly aligned
        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
        # sort: -n sort by name; - take input from stdin;
        # out to specified filename
        # Will produce name-sorted BAM
        "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
    if err:
        logger.error("samtools filter error: %s" % (err))

    # Remove orphan reads (whose pair was removed)
    # and read pairs mapping to different chromosomes
    # Obtain position-sorted BAM
    out, err = utils.run_pipe([
        # fill in mate coordinates, ISIZE and mate-related flags
        # fixmate requires name-sorted alignment; -r removes secondary and
        # unmapped reads (largely redundant with the filter above)
        # - send output to stdout
        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
        # repeat filtering after mate repair
        "samtools view -F 1804 -f 2 -u -",
        # produce the coordinate-sorted BAM
        "samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)])

    os.remove(tmp_filt_bam_filename)
    return filt_bam_filename
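# Quick sanity check of the -F 1804 exclusion mask used above, spelled out
# with the standard SAM FLAG bit values (constants defined here only for
# illustration):
SAM_UNMAPPED = 0x4        # read unmapped
SAM_MATE_UNMAPPED = 0x8   # mate unmapped
SAM_SECONDARY = 0x100     # not primary alignment
SAM_QC_FAIL = 0x200       # read fails platform/vendor quality checks
SAM_DUPLICATE = 0x400     # PCR or optical duplicate
assert (SAM_UNMAPPED | SAM_MATE_UNMAPPED | SAM_SECONDARY
        | SAM_QC_FAIL | SAM_DUPLICATE) == 1804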
def compute_complexity(bam, paired, bam_basename):
    '''Calculate library complexity.'''

    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
    tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)

    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics

    # PBC file output columns:
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    pbc_headers = [
        'TotalReadPairs',
        'DistinctReadPairs',
        'OneReadPair',
        'TwoReadPairs',
        'NRF',
        'PBC1',
        'PBC2']

    if paired:
        steps = [
            "samtools sort -@%d -n %s" % (cpu_count(), bam),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (bam),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
    ])

    out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s", err)

    # Add headers
    pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
                           names=pbc_headers)
    pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)
    os.remove(bam)
    os.remove(bam + '.bai')
    os.remove(tmp_pbc_file_qc_filename)
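# Worked example of the PBC arithmetic above, with made-up counts: given
# mt=100 total read pairs falling on m0=80 distinct positions, of which
# m1=70 are seen exactly once and m2=5 exactly twice, the awk step reports
#   NRF  = m0/mt = 80/100 = 0.80
#   PBC1 = m1/m0 = 70/80  = 0.875
#   PBC2 = m1/m2 = 70/5   = 14.0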
def self_psuedoreplication(tag_file, prefix, paired):
    '''Make 2 self-pseudoreplicates.'''

    # Get total number of reads
    no_lines = utils.count_lines(tag_file)

    # Number of lines to split into (integer division so that
    # `split -l` receives a whole number)
    lines_per_rep = (no_lines + 1) // 2

    # Make a dict of pseudoreplicate file names
    pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
                            for r in [0, 1]}

    # Shuffle and split file into equal parts
    # by using the input to seed shuf we ensure multiple runs with the same
    # input will produce the same output
    # Produces two files named splits_prefix0n, n=0,1
    splits_prefix = 'temp_split'

    out, err = utils.run_pipe([
        'gzip -dc %s' % (tag_file),
        'shuf --random-source=%s' % (tag_file),
        'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])

    # Convert read pairs to reads in a standard tagAlign file
    for index in [0, 1]:
        string_index = '0' + str(index)
        steps = ['cat %s' % (splits_prefix + string_index)]
        if paired:
            steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
        steps.extend(['gzip -cn'])
        out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[index])
        os.remove(splits_prefix + string_index)

    return pseudoreplicate_dict
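# Example call with hypothetical file names; repeated runs give identical
# splits because shuf draws its randomness from the input file itself:
# pr_dict = self_psuedoreplication('rep1.tagAlign.gz', 'rep1', paired=False)
# pr_dict -> {0: 'rep1.pr0.bedse.tagAlign.gz', 1: 'rep1.pr1.bedse.tagAlign.gz'}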
def align_se(fastq, sai, reference, fastq_basename):
    '''Use BWA to align SE data.'''

    bam_filename = '%s.srt.bam' % (fastq_basename)

    steps = [
        "bwa samse %s %s %s" % (reference, sai[0], fastq[0]),
        "samtools view -@%d -Su -" % (cpu_count()),
        "samtools sort -@%d -o %s" % (cpu_count(), bam_filename)]

    out, err = utils.run_pipe(steps)
    if err:
        logger.error("samse/samtools error: %s", err)

    return bam_filename
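# Example call with hypothetical inputs, where sample1.sai came from an
# earlier `bwa aln` step:
# bam = align_se(['sample1.fq.gz'], ['sample1.sai'], 'GRCh38.fa', 'sample1')
# bam -> 'sample1.srt.bam'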
def pool(tag_files, outfile, paired):
    '''Pool files together.'''

    if paired:
        file_extension = '.bedpe.gz'
    else:
        file_extension = '.bedse.gz'

    pooled_filename = outfile + file_extension

    # Merge files
    out, err = utils.run_pipe([
        'gzip -dc %s' % (' '.join(tag_files)),
        'gzip -cn'],
        outfile=pooled_filename)

    return pooled_filename
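# Example call with hypothetical file names:
# pooled = pool(['rep1.bedse.gz', 'rep2.bedse.gz'], 'expt1_pooled', paired=False)
# pooled -> 'expt1_pooled.bedse.gz'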
def convert_mapped_pe(bam, bam_basename):
    '''Use bedtools to convert to tagAlign PE data.'''

    bedpe_filename = bam_basename + ".bedpe.gz"

    # Name sort bam to make BEDPE
    nmsrt_bam_filename = bam_basename + ".nmsrt.bam"
    samtools_sort_command = \
        "samtools sort -n -@%d -o %s %s" \
        % (cpu_count(), nmsrt_bam_filename, bam)

    logger.info(samtools_sort_command)
    subprocess.check_output(shlex.split(samtools_sort_command))

    out, err = utils.run_pipe(
        ["bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
         "gzip -nc"],
        outfile=bedpe_filename)

    return bedpe_filename
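# One illustrative BEDPE record emitted above (made-up coordinates), with
# columns chrom1/start1/end1, chrom2/start2/end2, name, score, strand1,
# strand2; -mate1 makes bamToBed report mate 1's interval first:
#   chr1  100  136  chr1  250  286  read1  37  +  -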
def overlap(experiment, design):
    '''Calculate the overlap of peaks.'''

    logger.info("Determining consensus peaks for experiment %s.", experiment)

    # Output file names
    peak_type = 'narrowPeak'
    overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)

    # Intermediate file names
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Assign pooled and pseudoreplicate peaks
    pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
    pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
    pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]

    # Remove non true replicate rows
    not_replicates = ['1_pr', '2_pr', 'pooled']
    design_true_reps = design[~design['replicate'].isin(not_replicates)]
    true_rep_peaks = design_true_reps.peaks.unique()

    # Find overlaps
    awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
    cut_command = 'cut -f 1-10'

    # Find pooled peaks that overlap Rep1 and Rep2
    # where overlap is defined as the fractional overlap
    # with any one of the overlapping peak pairs >= 0.5
    steps_true = [
        'intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0]),
        awk_command,
        cut_command,
        'sort -u']

    if len(true_rep_peaks) > 1:
        for true_peak in true_rep_peaks[1:]:
            steps_true.extend([
                'intersectBed -wo -a stdin -b %s' % (true_peak),
                awk_command,
                cut_command,
                'sort -u'])

    out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (utils.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
    # where overlap is defined as the fractional overlap
    # with any one of the overlapping peak pairs >= 0.5
    steps_pseudo = [
        'intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pr2_peaks),
        awk_command,
        cut_command,
        'sort -u']

    out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (utils.count_lines(overlap_pr_fn)))

    # Make union of peak lists
    out, err = utils.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'],
        overlapping_peaks_fn)
    print("%d peaks overlap with true replicates or with pooled pseudoreplicates" %
          (utils.count_lines(overlapping_peaks_fn)))

    # Make rejected peak list
    out, err = utils.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)],
        rejected_peaks_fn)
    print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))

    # Remove temporary files
    os.remove(overlap_tr_fn)
    os.remove(overlap_pr_fn)

    return overlapping_peaks_fn
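# Worked example of the awk fractional-overlap filter above, with made-up
# coordinates: a pooled peak chr1:100-200 (s1=100) overlapping a replicate
# peak chr1:180-400 (s2=220) by $21=20 bp gives 20/100=0.2 and 20/220~0.09,
# both < 0.5, so the pair is dropped; against chr1:120-190 the overlap is
# $21=70 bp, and 70/100=0.7 >= 0.5, so the pooled peak is kept.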
def call_peaks_macs(experiment, xcor, control, prefix,
                    genome_size, chrom_sizes):
    '''Call peaks and generate signal tracks using MACS2.'''

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    with open(xcor, 'r') as xcor_fh:
        firstline = xcor_fh.readline()
        frag_lengths = firstline.split()[2]  # third column
        fragment_length = frag_lengths.split(',')[0]  # grab first value
        logger.info("Fraglen %s" % (fragment_length))

    # Generate narrow peaks and preliminary signal tracks
    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' % (experiment, control) + \
              '-f BED -n %s ' % (prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genome_size, fragment_length)

    logger.info(command)
    returncode = utils.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.
    # Remove coordinates outside chromosome sizes; chrom_sizes is a
    # tab-delimited file with 2 columns: Col1 (chromosome name),
    # Col2 (chromosome size in bp)
    narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
    clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)
    steps = ['slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
             'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)]

    out, err = utils.run_pipe(steps)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = utils.rescale_scores(
        clipped_narrowpeak_fn, scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4
    # with Peak_<peakRank>
    steps = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""]

    out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))

    # For fold enrichment signal tracks
    command = 'macs2 bdgcmp ' + \
              '-t %s_treat_pileup.bdg ' % (prefix) + \
              '-c %s_control_lambda.bdg ' % (prefix) + \
              '-o %s_FE.bdg ' % (prefix) + \
              '-m FE'

    logger.info(command)
    returncode = utils.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (MACS2 bug)
    fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
    fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
    fc_signal_fn = "%s.fc_signal.bw" % (prefix)
    steps = ['slopBed -i %s_FE.bdg -g %s -b 0' % (prefix, chrom_sizes),
             'bedClip stdin %s %s' % (chrom_sizes, fc_bedgraph_fn)]

    out, err = utils.run_pipe(steps)

    # Sort file
    out, err = utils.run_pipe([
        'bedSort %s %s' % (fc_bedgraph_fn, fc_bedgraph_sorted_fn)])

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s ' % (fc_bedgraph_sorted_fn) + \
              '%s ' % (chrom_sizes) + \
              '%s' % (fc_signal_fn)

    logger.info(command)
    returncode = utils.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    # For -log10(p-value) signal tracks
    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
    chip_reads = out.strip()
    out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
    control_reads = out.strip()
    sval = str(min(float(chip_reads), float(control_reads)) / 1000000)

    logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
                (chip_reads, control_reads, sval))

    command = 'macs2 bdgcmp ' + \
              '-t %s_treat_pileup.bdg ' % (prefix) + \
              '-c %s_control_lambda.bdg ' % (prefix) + \
              '-o %s_ppois.bdg ' % (prefix) + \
              '-m ppois -S %s' % (sval)

    logger.info(command)
    returncode = utils.block_on(command)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (MACS2 bug)
    pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
    pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
    pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
    steps = ['slopBed -i %s_ppois.bdg -g %s -b 0' % (prefix, chrom_sizes),
             'bedClip stdin %s %s' % (chrom_sizes, pvalue_bedgraph_fn)]

    out, err = utils.run_pipe(steps)

    # Sort file
    out, err = utils.run_pipe([
        'bedSort %s %s' % (pvalue_bedgraph_fn, pvalue_bedgraph_sorted_fn)])

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s ' % (pvalue_bedgraph_sorted_fn) + \
              '%s ' % (chrom_sizes) + \
              '%s' % (pvalue_signal_fn)

    logger.info(command)
    returncode = utils.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    # Remove temporary files
    os.remove(clipped_narrowpeak_fn)
    os.remove(rescaled_narrowpeak_fn)
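# Example of the sval arithmetic above, with made-up read counts: for
# 25,000,000 ChIP reads and 20,000,000 control reads,
# sval = min(25000000, 20000000) / 1000000 = 20.0. That value is passed to
# `macs2 bdgcmp -S`, so the -log10(p-value) track reflects the depth of the
# smaller library in millions of reads (consistent with the --SPMR
# per-million scaling used in the callpeak step).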