Example #1
0
def align_pe(fastq, sai, reference, fastq_basename):
    '''Align paired-end reads with BWA and write a position-sorted BAM.

    The SAM stream produced by ``bwa sampe`` is saved to disk while the
    names of reads whose CIGAR-derived length differs from their sequence
    length are collected. A second pipeline drops those reads and sorts
    what is left into a BAM file.

    Args:
        fastq: indexable holding the two FASTQ paths (index 0 and 1).
        sai: indexable holding the two BWA .sai paths (index 0 and 1).
        reference: reference argument handed to ``bwa sampe``.
        fastq_basename: prefix of every file written by this function.

    Returns:
        Name of the sorted BAM file, ``<fastq_basename>.srt.bam``.
    '''

    raw_sam = "%s.sam" % (fastq_basename)
    bad_read_names = "%s.badReads" % (fastq_basename)
    sorted_bam = '%s.srt.bam' % (fastq_basename)

    # Pass 1: align, keep a copy of the SAM with tee, and list the reads
    # with bad CIGAR strings (one unique name per line).
    sampe_cmd = "bwa sampe -P %s %s %s %s %s" % (
        reference, sai[0], sai[1], fastq[0], fastq[1])
    keep_sam_cmd = "tee %s" % (raw_sam)
    # For every non-header record with a CIGAR: strip deletions, add up the
    # remaining operation lengths and report the read name when the total
    # does not equal the length of the sequence column.
    bad_cigar_cmd = r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'"""

    out, err = utils.run_pipe(
        [sampe_cmd, keep_sam_cmd, bad_cigar_cmd, "sort", "uniq"],
        bad_read_names)
    if err:
        logger.error("sampe error: %s", err)

    # Pass 2: drop the listed reads from the saved SAM, then convert to BAM
    # and sort by position.
    threads = cpu_count()
    filter_and_sort = [
        "cat %s" % (raw_sam),
        "grep -v -F -f %s" % (bad_read_names),
        "samtools view -@%d -Su -" % (threads),
        "samtools sort -@%d -o %s" % (threads, sorted_bam),
    ]

    out, err = utils.run_pipe(filter_and_sort)
    if err:
        logger.error("samtools error: %s", err)

    return sorted_bam
Example #2
0
def convert_mapped(bam, tag_filename):
    '''Convert a BAM file to a gzipped tagAlign file with bedtools.

    Args:
        bam: path of the BAM file handed to ``bamToBed``.
        tag_filename: path that receives the gzip output of the pipeline.
    '''

    to_bed = "bamToBed -i %s" % (bam)
    # Overwrite column 4 with "N" and column 5 with "1000" on every record.
    to_tagalign = r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""
    compress = "gzip -nc"

    out, err = utils.run_pipe([to_bed, to_tagalign, compress],
                              outfile=tag_filename)
Example #3
0
def filter_mapped_pe(bam, bam_basename):
    '''Use samtools to filter unmapped reads for PE data.

    Two pipelines are run. The first filters the input and writes a
    name-sorted temporary BAM; the second repairs mate information,
    filters again and writes the position-sorted result. The temporary
    BAM is deleted before returning.

    Args:
        bam: path of the input BAM file.
        bam_basename: prefix of the output file.

    Returns:
        Name of the filtered BAM file, ``<bam_basename>.filt.srt.bam``.
    '''

    filt_bam_prefix = bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)
    tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"

    # Remove unmapped, mate unmapped
    # not primary alignment, reads failing platform
    # Remove low MAPQ reads
    # Only keep properly paired reads
    # Obtain name sorted BAM file
    out, err = utils.run_pipe([
        # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
        # -q 30 exclude MAPQ < 30; -u uncompressed output
        # exclude FLAG 1804: unmapped, next segment unmapped, secondary
        # alignments, not passing platform q, PCR or optical duplicates
        # require FLAG 2: properly aligned
        "samtools view -F 1804 -f 2 -q 30 -u %s" % (bam),
        # sort: -n sort by name; input is taken from the pipe;
        # out to specified filename
        # Will produce name sorted BAM
        "samtools sort -n -@ %d -o %s" % (cpu_count(), tmp_filt_bam_filename)])
    if err:
        logger.error("samtools filter error: %s", err)

    # Remove orphan reads (pair was removed)
    # and read pairs mapping to different chromosomes
    # Obtain position sorted BAM
    out, err = utils.run_pipe([
        # fill in mate coordinates, ISIZE and mate-related flags
        # fixmate requires name-sorted alignment; -r removes secondary and
        # unmapped (redundant here because already done above?)
        # - send output to stdout
        "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
        # repeat filtering after mate repair
        "samtools view -F 1804 -f 2 -u -",
        # produce the coordinate-sorted BAM
        "samtools sort -@ %d -o %s" % (cpu_count(), filt_bam_filename)])
    # Report problems from the second pipeline as well, so a failed mate
    # repair is not silently ignored.
    if err:
        logger.error("samtools fixmate error: %s", err)

    os.remove(tmp_filt_bam_filename)
    return filt_bam_filename
Example #4
0
def compute_complexity(bam, paired, bam_basename):
    '''Calculate library complexity and write a PBC QC table.

    Fragment (paired) or read (single-end) coordinates are extracted from
    the BAM, records matching ``chrM`` are dropped, and identical
    positions are counted to derive the complexity statistics. The table
    is written with a header row to
    ``<bam_basename>.filt.nodup.pbc.qc``.

    Note that this function deletes its input: ``bam`` and ``bam + '.bai'``
    are removed once the table has been written.

    Args:
        bam: path of the BAM file to analyse (deleted afterwards).
        paired: truthy for paired-end data, falsy for single-end data.
        bam_basename: prefix of the QC output file.
    '''

    pbc_file_qc_filename = bam_basename + ".filt.nodup.pbc.qc"
    tmp_pbc_file_qc_filename = "tmp.%s" % (pbc_file_qc_filename)

    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics

    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    pbc_headers = [
        'TotalReadPairs',
        'DistinctReadPairs',
        'OneReadPair',
        'TwoReadPairs',
        'NRF',
        'PBC1',
        'PBC2']

    if paired:
        steps = [
            "samtools sort -@%d -n %s" % (cpu_count(), bam),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (bam),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]

    # Every ratio is guarded against a zero denominator: an unguarded
    # division aborts awk on libraries without any position seen exactly
    # twice (m2 == 0) or on empty input, leaving no QC row to read back.
    # NRF and PBC1 fall back to 0; an undefined PBC2 is reported as -1.0.
    pbc_awk = (
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} """
        r"""($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} """
        r"""END{nrf=0; if (mt>0) nrf=m0/mt; """
        r"""pbc1=0; if (m0>0) pbc1=m1/m0; """
        r"""pbc2=-1.0; if (m2>0) pbc2=m1/m2; """
        r"""printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n","""
        r"""mt,m0,m1,m2,nrf,pbc1,pbc2}'"""
    )

    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        pbc_awk,
    ])
    out, err = utils.run_pipe(steps, tmp_pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s", err)

    # Add headers
    pbc_file = pd.read_csv(tmp_pbc_file_qc_filename, sep='\t', header=None,
                           names=pbc_headers)
    pbc_file.to_csv(pbc_file_qc_filename, header=True, sep='\t', index=False)

    # Clean up: the input BAM, its index and the headerless table
    os.remove(bam)
    os.remove(bam + '.bai')
    os.remove(tmp_pbc_file_qc_filename)
Example #5
0
def self_psuedoreplication(tag_file, prefix, paired):
    '''Make 2 self-psuedoreplicates.

    The gzipped tag file is shuffled and split into two halves, and each
    half is written out as a gzipped tagAlign file. Temporary split files
    are created in the current working directory and removed afterwards.

    Args:
        tag_file: path of the gzipped tag file to split.
        prefix: prefix of the two pseudoreplicate file names.
        paired: truthy when each input line describes a read pair that
            must be expanded into two tagAlign records.

    Returns:
        Dict mapping 0 and 1 to the pseudoreplicate file names
        ``<prefix>.pr<N>.bedse.tagAlign.gz``.
    '''

    # Get total number of reads
    no_lines = utils.count_lines(tag_file)

    # Number of lines per pseudoreplicate, rounded up so that two parts
    # always cover the whole file. Floor division keeps this an integer
    # on Python 3 instead of relying on %d truncating a float.
    lines_per_rep = (no_lines + 1) // 2

    # Map each pseudoreplicate number to its output file name
    pseudoreplicate_dict = {r: prefix + '.pr' + str(r) + '.bedse.tagAlign.gz'
                            for r in [0, 1]}

    # Shuffle and split file into equal parts
    # by using the input to seed shuf we ensure multiple runs with the same
    # input will produce the same output
    # Produces two files named splits_prefix0n, n=0,1

    splits_prefix = 'temp_split'

    out, err = utils.run_pipe([
        'gzip -dc %s' % (tag_file),
        'shuf --random-source=%s' % (tag_file),
        'split -d -l %d - %s' % (lines_per_rep, splits_prefix)])

    # Convert read pairs to reads into standard tagAlign file

    for index in [0, 1]:
        split_filename = splits_prefix + '0' + str(index)
        steps = ['cat %s' % (split_filename)]
        if paired:
            # Emit two records per pair: columns 1-3 with column 9, then
            # columns 4-6 with column 10, each with name "N" and score 1000
            steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
        steps.extend(['gzip -cn'])
        out, err = utils.run_pipe(steps, outfile=pseudoreplicate_dict[index])
        os.remove(split_filename)

    return pseudoreplicate_dict
Example #6
0
def align_se(fastq, sai, reference, fastq_basename):
    '''Align single-end reads with BWA and write a sorted BAM.

    Args:
        fastq: indexable whose first item is the FASTQ path.
        sai: indexable whose first item is the BWA .sai path.
        reference: reference argument handed to ``bwa samse``.
        fastq_basename: prefix of the output file.

    Returns:
        Name of the sorted BAM file, ``<fastq_basename>.srt.bam``.
    '''

    sorted_bam = '%s.srt.bam' % (fastq_basename)

    # The alignment is piped through samtools view and then sorted into
    # the output file.
    samse_cmd = "bwa samse %s %s %s" % (reference, sai[0], fastq[0])
    to_bam_cmd = "samtools view -@%d -Su -" % (cpu_count())
    sort_cmd = "samtools sort -@%d -o %s" % (cpu_count(), sorted_bam)

    out, err = utils.run_pipe([samse_cmd, to_bam_cmd, sort_cmd])
    if err:
        logger.error("samse/samtools error: %s", err)

    return sorted_bam
Example #7
0
def pool(tag_files, outfile, paired):
    '''Concatenate several gzipped tag files into one gzipped file.

    Args:
        tag_files: iterable of gzipped file paths to merge, in order.
        outfile: output path without its extension.
        paired: truthy selects the ``.bedpe.gz`` extension, otherwise
            ``.bedse.gz`` is used.

    Returns:
        Name of the pooled file (``outfile`` plus the chosen extension).
    '''

    file_extension = '.bedpe.gz' if paired else '.bedse.gz'
    pooled_filename = outfile + file_extension

    # Decompress every input into a single stream and recompress it
    decompress_all = 'gzip -dc %s' % (' '.join(tag_files))
    recompress = 'gzip -cn'
    out, err = utils.run_pipe([decompress_all, recompress],
                              outfile=pooled_filename)

    return pooled_filename
Example #8
0
def convert_mapped_pe(bam, bam_basename):
    '''Convert a paired-end BAM file to gzipped BEDPE with bedtools.

    The BAM is first sorted by read name into
    ``<bam_basename>.nmsrt.bam`` (which is left on disk), then converted
    to ``<bam_basename>.bedpe.gz``.

    Args:
        bam: path of the input BAM file.
        bam_basename: prefix of the files written by this function.
    '''

    bedpe_filename = bam_basename + ".bedpe.gz"
    nmsrt_bam_filename = bam_basename + ".nmsrt.bam"

    # Name sort bam to make BEDPE
    sort_by_name = "samtools sort -n -@%d -o %s %s" % (
        cpu_count(), nmsrt_bam_filename, bam)
    logger.info(sort_by_name)
    subprocess.check_output(shlex.split(sort_by_name))

    bedpe_pipeline = [
        "bamToBed -bedpe -mate1 -i %s" % (nmsrt_bam_filename),
        "gzip -nc",
    ]
    out, err = utils.run_pipe(bedpe_pipeline, outfile=bedpe_filename)
def overlap(experiment, design):
    '''Calculate the overlap of peaks.

    Pooled peaks are kept when they overlap a peak in every true
    replicate, or a peak in both pooled pseudoreplicates. The union of the
    two sets is written to ``<experiment>.replicated.narrowPeak`` and the
    remaining pooled peaks to ``<experiment>.rejected.narrowPeak``. Peak
    counts are printed to stdout.

    Args:
        experiment: experiment name, used in the log message and as the
            prefix of the output file names.
        design: table indexed like a pandas DataFrame with ``replicate``
            and ``peaks`` columns. Rows whose replicate is ``pooled``,
            ``1_pr`` or ``2_pr`` supply the pooled and pseudoreplicate
            peak files; every other row is treated as a true replicate.

    Returns:
        Name of the file holding the replicated peaks.

    Raises:
        IndexError: presumably, when the pooled, pseudoreplicate or true
            replicate rows are missing (the first element of an empty
            selection is requested) - verify against the caller.
    '''

    logger.info("Determining consenus peaks for experiment %s.", experiment)

    # Output File names
    peak_type = 'narrowPeak'
    overlapping_peaks_fn = '%s.replicated.%s' % (experiment, peak_type)
    rejected_peaks_fn = '%s.rejected.%s' % (experiment, peak_type)

    # Intermediate File names
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Assign Pooled and Psuedoreplicate peaks
    pool_peaks = design.loc[design.replicate == 'pooled', 'peaks'].values[0]
    pr1_peaks = design.loc[design.replicate == '1_pr', 'peaks'].values[0]
    pr2_peaks = design.loc[design.replicate == '2_pr', 'peaks'].values[0]

    # Remove non true replicate rows
    not_replicates = ['1_pr', '2_pr', 'pooled']
    design_true_reps = design[~design['replicate'].isin(not_replicates)]
    true_rep_peaks = design_true_reps.peaks.unique()

    # Find overlaps
    # Keep a record when the overlap (column 21) covers at least half of
    # either interval (columns 2-3 or columns 12-13), then trim the record
    # back to its first ten columns and de-duplicate.
    # NOTE(review): the column numbers assume 10-column peak files on both
    # sides of intersectBed -wo - confirm.
    awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
    cut_command = 'cut -f 1-10'
    keep_overlapping = [awk_command, cut_command, 'sort -u']

    # Find pooled peaks that overlap every true replicate
    # where overlap is defined as the fractional overlap
    # with any one of the overlapping peak pairs  >= 0.5
    # The first replicate is intersected with the pooled peaks; each
    # further replicate is intersected with the survivors on stdin.

    steps_true = [
        'intersectBed -wo -a %s -b %s' % (pool_peaks, true_rep_peaks[0])
    ] + keep_overlapping

    for true_peak in true_rep_peaks[1:]:
        steps_true.append('intersectBed -wo -a stdin -b %s' % (true_peak))
        steps_true.extend(keep_overlapping)

    out, err = utils.run_pipe(steps_true, outfile=overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (utils.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2
    # where overlap is defined as the fractional overlap
    # with any one of the overlapping peak pairs  >= 0.5

    steps_pseudo = (
        ['intersectBed -wo -a %s -b %s' % (pool_peaks, pr1_peaks)]
        + keep_overlapping
        + ['intersectBed -wo -a stdin -b %s' % (pr2_peaks)]
        + keep_overlapping
    )

    out, err = utils.run_pipe(steps_pseudo, outfile=overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (utils.count_lines(overlap_pr_fn)))

    # Make union of peak lists
    out, err = utils.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudorepliates"
        % (utils.count_lines(overlapping_peaks_fn)))

    # Make rejected peak list
    out, err = utils.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pool_peaks, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (utils.count_lines(rejected_peaks_fn)))

    # Remove temporary files
    os.remove(overlap_tr_fn)
    os.remove(overlap_pr_fn)

    return overlapping_peaks_fn
Example #10
0
def _run_checked(command, tool_name):
    '''Log and run a shell command, failing when it exits non-zero.'''
    logger.info(command)
    returncode = utils.block_on(command)
    logger.info("%s exited with returncode %d", tool_name, returncode)
    # Raised explicitly instead of through an assert statement so that the
    # check is still performed when Python runs with -O.
    if returncode != 0:
        raise AssertionError("%s non-zero return" % (tool_name))


def _bedgraph_to_bigwig(bedgraph_fn, chrom_sizes, clipped_fn, sorted_fn,
                        bigwig_fn):
    '''Clip a bedGraph to the chromosome sizes, sort it and make a bigWig.'''
    # Remove coordinates outside chromosome sizes (MACS2 bug)
    out, err = utils.run_pipe([
        'slopBed -i %s -g %s -b 0' % (bedgraph_fn, chrom_sizes),
        'bedClip stdin %s %s' % (chrom_sizes, clipped_fn)
    ])

    # Sort file
    out, err = utils.run_pipe(['bedSort %s %s' % (clipped_fn, sorted_fn)])

    # Convert bedgraph to bigwig
    _run_checked(
        'bedGraphToBigWig %s %s %s' % (sorted_fn, chrom_sizes, bigwig_fn),
        'bedGraphToBigWig')


def call_peaks_macs(experiment, xcor, control, prefix, genome_size,
                    chrom_sizes):
    '''Call narrow peaks with MACS2 and build the signal tracks.

    Produces ``<prefix>_peaks.narrowPeak`` (clipped to the chromosome
    sizes, rescaled, ranked and renamed), the fold-enrichment track
    ``<prefix>.fc_signal.bw`` and the p-value track
    ``<prefix>.pvalue_signal.bw``. The intermediate bedGraph files are
    left on disk.

    Args:
        experiment: gzipped tag file of the ChIP sample.
        xcor: cross-correlation scores file; the fragment length is the
            first comma-separated value of the third column of its first
            line.
        control: gzipped tag file of the control sample.
        prefix: prefix of every file written by MACS2 and this function.
        genome_size: value passed to the ``-g`` option of MACS2.
        chrom_sizes: chromosome sizes file, tab delimited with
            Col1 (chromosome name) and Col2 (chromosome size in bp).

    Raises:
        AssertionError: when MACS2 or bedGraphToBigWig exits non-zero.
    '''

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    with open(xcor, 'r') as xcor_fh:
        firstline = xcor_fh.readline()
    frag_lengths = firstline.split()[2]  # third column
    fragment_length = frag_lengths.split(',')[0]  # grab first value
    logger.info("Fraglen %s" % (fragment_length))

    # Generate narrow peaks and preliminary signal tracks
    command = (
        'macs2 callpeak '
        '-t %s -c %s '
        '-f BED -n %s '
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s '
        '--keep-dup all -B --SPMR'
    ) % (experiment, control, prefix, genome_size, fragment_length)
    _run_checked(command, 'MACS2')

    # MACS2 sometimes calls features off the end of chromosomes.
    # Remove coordinates outside chromosome sizes

    narrowpeak_fn = '%s_peaks.narrowPeak' % (prefix)
    clipped_narrowpeak_fn = 'clipped-%s' % (narrowpeak_fn)

    steps = [
        'slopBed -i %s -g %s -b 0' % (narrowpeak_fn, chrom_sizes),
        'bedClip stdin %s %s' % (chrom_sizes, clipped_narrowpeak_fn)
    ]

    out, err = utils.run_pipe(steps)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = utils.rescale_scores(clipped_narrowpeak_fn,
                                                  scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4
    # with Peak_<peakRank>; this overwrites the narrowPeak file from MACS2
    steps = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'"""
    ]

    out, err = utils.run_pipe(steps, '%s' % (narrowpeak_fn))

    # For Fold enrichment signal tracks
    command = (
        'macs2 bdgcmp '
        '-t %s_treat_pileup.bdg '
        '-c %s_control_lambda.bdg '
        '-o %s_FE.bdg '
        '-m FE'
    ) % (prefix, prefix, prefix)
    _run_checked(command, 'MACS2')

    fc_bedgraph_fn = '%s.fc.signal.bedgraph' % (prefix)
    fc_bedgraph_sorted_fn = 'sorted-%s' % (fc_bedgraph_fn)
    fc_signal_fn = "%s.fc_signal.bw" % (prefix)
    _bedgraph_to_bigwig('%s_FE.bdg' % (prefix), chrom_sizes, fc_bedgraph_fn,
                        fc_bedgraph_sorted_fn, fc_signal_fn)

    # For -log10(p-value) signal tracks

    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = utils.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
    chip_reads = out.strip()
    out, err = utils.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
    control_reads = out.strip()
    sval = str(min(float(chip_reads), float(control_reads)) / 1000000)

    logger.info("chip_reads = %s, control_reads = %s, sval = %s" %
                (chip_reads, control_reads, sval))

    command = (
        'macs2 bdgcmp '
        '-t %s_treat_pileup.bdg '
        '-c %s_control_lambda.bdg '
        '-o %s_ppois.bdg '
        '-m ppois -S %s'
    ) % (prefix, prefix, prefix, sval)
    _run_checked(command, 'MACS2')

    # The p-value track must be built from the clipped p-value bedGraph;
    # sorting the fold-enrichment bedGraph here would silently duplicate
    # the fold-enrichment signal under the p-value file name.
    pvalue_bedgraph_fn = '%s.pval.signal.bedgraph' % (prefix)
    pvalue_bedgraph_sorted_fn = 'sort-%s' % (pvalue_bedgraph_fn)
    pvalue_signal_fn = "%s.pvalue_signal.bw" % (prefix)
    _bedgraph_to_bigwig('%s_ppois.bdg' % (prefix), chrom_sizes,
                        pvalue_bedgraph_fn, pvalue_bedgraph_sorted_fn,
                        pvalue_signal_fn)

    # Remove temporary files
    os.remove(clipped_narrowpeak_fn)
    os.remove(rescaled_narrowpeak_fn)