Code Example #1
File: 02chip_filterQC.py  Project: wkl1990/ChIP-seq
def scrub_fun(in_filepath, out_filepath):
    # Check the input.
    logger.debug("Input flagstat for %s" % (in_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (in_filepath)))
    # Set up the paths to inputs and outputs.
    dirname = os.path.dirname(out_filepath)
    header_path = os.path.join(dirname, "header.txt")
    sam_path = os.path.join(dirname, "scrubbed.sam")
    # Cache the header.
    shell_command("samtools view -H %s -o %s" % (in_filepath, header_path))
    # Scrub the sequence information from these fields:
    # 6 = CIGAR, 10 = query sequence, 11 = PHRED, and suppress optional tags
    # For example, unscrubbed read might look like:
    # SPADE:8:33:220:1107#0 0 chr21 8994907 37 9M1I26M * 0 0 ATTGTTGACAAAAACTCGACAAACAATTGGAGAATC    bbbR]`T`^]TTSSS^_W`BBBBBBBBBBBBBBBBB    X0:i:1  X1:i:0  MD:Z:35 PG:Z:MarkDuplicates     XG:i:1  NM:i:1  XM:i:0  XO:i:1  XT:A:U
    # Scrubbed version would look like:
    # SPADE:8:33:220:1107#0 0 chr21 8994907 37 36M     * 0 0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN    *
    common.run_pipe([
        'samtools view %s' % (in_filepath),
        r"""awk '{OFS="\t"} {s=""; for(i=1;i<=length($10);i++) s=(s "N"); $6=(i-1 "M"); $10=s; $11="*"; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11}'"""
    ], sam_path)
    # Add back the header.
    common.run_pipe([
        'cat %s %s' % (header_path, sam_path),
        'samtools view -S -b - -o %s' % (out_filepath)
    ])
    # Check the output.
    logger.debug("Output flagstat for %s" % (out_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (out_filepath)))
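Note: every snippet on this page funnels its shell stages through common.run_pipe, which is not itself shown. The following is a minimal sketch of what it plausibly does, inferred from the call sites here (a list of commands, an optional outfile, an (out, err) return); it is an assumption, not the actual ENCODE-DCC implementation.

import shlex
import subprocess

def run_pipe(steps, outfile=None):
    # Hypothetical stand-in for common.run_pipe: chain each command's
    # stdout into the next command's stdin, like a shell pipeline.
    processes = []
    prev_stdout = None
    for i, step in enumerate(steps):
        is_last = (i == len(steps) - 1)
        if is_last and outfile:
            stdout = open(outfile, 'wb')
        else:
            stdout = subprocess.PIPE
        p = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                             stdout=stdout,
                             stderr=subprocess.PIPE if is_last else None)
        if prev_stdout is not None:
            prev_stdout.close()  # let SIGPIPE reach upstream stages
        prev_stdout = p.stdout
        processes.append(p)
    out, err = processes[-1].communicate()
    if outfile:
        stdout.close()  # out is None in this case; callers ignore it
    return out, err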
Code Example #2
File: scrub.py  Project: ENCODE-DCC/chip-seq-pipeline
def scrub(in_filepath, out_filepath):
    # Check the input.
    logger.debug("Input flagstat for %s" % (in_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (in_filepath)))
    # Set up the paths to inputs and outputs.
    dirname = os.path.dirname(out_filepath)
    header_path = os.path.join(dirname, "header.txt")
    sam_path = os.path.join(dirname, "scrubbed.sam")
    # Cache the header.
    shell_command("samtools view -H %s -o %s" % (in_filepath, header_path))
    # Scrub the sequence information from these fields:
    # 6 = CIGAR, 10 = query sequence, 11 = PHRED, and suppress optional tags
    # For example, unscrubbed read might look like:
    # SPADE:8:33:220:1107#0 0 chr21 8994907 37 9M1I26M * 0 0 ATTGTTGACAAAAACTCGACAAACAATTGGAGAATC    bbbR]`T`^]TTSSS^_W`BBBBBBBBBBBBBBBBB    X0:i:1  X1:i:0  MD:Z:35 PG:Z:MarkDuplicates     XG:i:1  NM:i:1  XM:i:0  XO:i:1  XT:A:U
    # Scrubbed version would look like:
    # SPADE:8:33:220:1107#0 0 chr21 8994907 37 36M     * 0 0 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN    *
    common.run_pipe([
        'samtools view %s' % (in_filepath),
        r"""awk '{OFS="\t"} {s=""; for(i=1;i<=length($10);i++) s=(s "N"); $6=(i-1 "M"); $10=s; $11="*"; print $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11}'"""
        ], sam_path)
    # Add back the header.
    common.run_pipe([
        'cat %s %s' % (header_path, sam_path),
        'samtools view -S -b - -o %s' % (out_filepath)])
    # Check the output.
    logger.debug("Output flagstat for %s" % (out_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (out_filepath)))
Code Example #3
def main(input_bam, paired_end):

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # note: rstrip('.bam') strips a character set, not a suffix, so remove it explicitly
    input_bam_basename = input_bam_file.name
    if input_bam_basename.endswith('.bam'):
        input_bam_basename = input_bam_basename[:-len('.bam')]
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))

        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)

    return output
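The awk stage above only blanks the read name and pins the score to 1000; here is what it does to one hypothetical bamToBed record, for reference.

# One BED6 record from bamToBed, before and after the awk stage.
rec = ['chr1', '100', '136', 'HWI-ST354:8:1101:1234:5678', '60', '+']
rec[3], rec[4] = 'N', '1000'
print '\t'.join(rec)   # chr1  100  136  N  1000  +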
Code Example #4
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None):
    return_string = \
        "\t\ttrack %s%d\n" %(accession,n) + \
        "\t\tbigDataUrl %s\n" %(url) + \
        "\t\tshortLabel %s\n" %(name[:17]) + \
        "\t\tparent %sviewpeaks on\n" %(accession) + \
        "\t\ttype %s\n" %(tracktype) + \
        "\t\tvisibility dense\n" + \
        "\t\tview PK\n" + \
        "\t\tpriority %d\n\n" %(n)
    n_stanzas = 1
    if not lowpass:
        lowpass = []
    if isinstance(lowpass,int):
        lowpass = [lowpass]
    extra_stanza_count = 0
    for (i, cutoff) in enumerate(lowpass,start=1):
        fn = dx.get_id()
        if not os.path.isfile(fn):
            dxpy.download_dxfile(dx.get_id(),fn)
        cutoffstr = '-lt%d' %(cutoff)
        outfn = fn + cutoffstr
        print fn, os.path.getsize(fn), subprocess.check_output('wc -l %s' %(fn), shell=True).split()[0]
        bed_fn = fn + '.bed'
        common.block_on('bigBedToBed %s %s' %(fn, bed_fn))
        common.run_pipe([
            'cat %s' %(bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" %(cutoff)], outfn)
        print outfn, os.path.getsize(outfn), subprocess.check_output('wc -l %s' %(outfn), shell=True).split()[0]
        if tracktype =='bigBed 6 +':
            as_file = 'narrowPeak.as'
        elif tracktype == 'bigBed 12 +':
            as_file = 'gappedPeak.as'
        else:
            print "Cannot match tracktype %s to any .as file" %(tracktype)
        bb_fn = common.bed2bb(outfn,'mm10.chrom.sizes',as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
        new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True)

        new_lines = [
            "\t\ttrack %s%d" %(accession,n+i),
            "\t\tbigDataUrl %s" %(new_url),
            "\t\tshortLabel %s" %(name[:17-len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" %(accession),
            "\t\ttype %s" %(tracktype),
            "\t\tvisibility dense",
            "\t\tview PK",
            "\t\tpriority %d\n\n" %(n+i)]
        new_stanza = '\n'.join(new_lines)
        return_string += new_stanza
        n_stanzas += 1
        os.remove(bed_fn)
        os.remove(bb_fn)
        os.remove(outfn)
        os.remove(fn)

    return(return_string, n_stanzas)
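With hypothetical arguments accession='ENCFF000XYZ', url='https://example.org/peaks.bb', name='H3K4me3 peaks', n=1, and no lowpass cutoffs, the stanza built above renders as follows (leading whitespace standing in for the two literal tabs):

		track ENCFF000XYZ1
		bigDataUrl https://example.org/peaks.bb
		shortLabel H3K4me3 peaks
		parent ENCFF000XYZviewpeaks on
		type bigBed 6 +
		visibility dense
		view PK
		priority 1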
Code Example #5
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None):
	return_string = \
		"\t\ttrack %s%d\n" %(accession,n) + \
		"\t\tbigDataUrl %s\n" %(url) + \
		"\t\tshortLabel %s\n" %(name[:17]) + \
		"\t\tparent %sviewpeaks on\n" %(accession) + \
		"\t\ttype %s\n" %(tracktype) + \
		"\t\tvisibility dense\n" + \
		"\t\tview PK\n" + \
		"\t\tpriority %d\n\n" %(n)
	n_stanzas = 1
	if not lowpass:
		lowpass = []
	if isinstance(lowpass,int):
		lowpass = [lowpass]
	extra_stanza_count = 0
	for (i, cutoff) in enumerate(lowpass,start=1):
		fn = dx.get_id()
		if not os.path.isfile(fn):
			dxpy.download_dxfile(dx.get_id(),fn)
		cutoffstr = '-lt%d' %(cutoff)
		outfn = fn + cutoffstr
		print fn, os.path.getsize(fn), subprocess.check_output('wc -l %s' %(fn), shell=True).split()[0]
		bed_fn = fn + '.bed'
		common.block_on('bigBedToBed %s %s' %(fn, bed_fn))
		common.run_pipe([
			'cat %s' %(bed_fn),
			r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" %(cutoff)], outfn)
		print outfn, os.path.getsize(outfn), subprocess.check_output('wc -l %s' %(outfn), shell=True).split()[0]
		if tracktype =='bigBed 6 +':
			as_file = 'narrowPeak.as'
		elif tracktype == 'bigBed 12 +':
			as_file = 'gappedPeak.as'
		else:
			print "Cannot match tracktype %s to any .as file" %(tracktype)
		bb_fn = common.bed2bb(outfn,'mm10.chrom.sizes',as_file)
		newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
		new_url, headers = newdx.get_download_url(duration=sys.maxint, preauthenticated=True)

		new_lines = [
			"\t\ttrack %s%dp%d" %(accession,n,i),
			"\t\tbigDataUrl %s" %(new_url),
			"\t\tshortLabel %s" %(name[:17-len(cutoffstr)] + cutoffstr),
			"\t\tparent %sviewpeaks on" %(accession),
			"\t\ttype %s" %(tracktype),
			"\t\tvisibility dense",
			"\t\tview PK",
			"\t\tpriority %d.%d\n\n" %(n,i)]
		new_stanza = '\n'.join(new_lines)
		return_string += new_stanza
		n_stanzas += 1

	return(return_string, n_stanzas)
Code Example #6
def count_lines(filename):
    if filename.endswith(('.Z', '.gz', '.bz', '.bz2')):
        catcommand = 'gzip -dc'
    else:
        catcommand = 'cat'
    out, err = common.run_pipe(['%s %s' % (catcommand, filename), 'wc -l'])
    return int(out)
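For comparison, a pure-Python line counter that avoids the subprocess round-trip; this is a sketch, not part of the pipeline, and unlike the shell's 'gzip -dc' Python's gzip module does not cover .Z or .bz2.

import gzip

def count_lines_py(filename):
    # Pick an opener by extension; gzip.open handles only the .gz case.
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rb') as fh:
        return sum(1 for _ in fh)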
Code Example #7
def main(inputs, prefix=None):

    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    output = {
        "pooled": dxpy.dxlink(pooled)
    }

    return output
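The nested splitext calls peel off two extensions and keep the inner one; a quick demonstration with a hypothetical filename:

from os.path import splitext

fn = 'rep1.tagAlign.gz'
stem, outer = splitext(fn)      # ('rep1.tagAlign', '.gz')
inner = splitext(stem)[1]       # '.tagAlign'
assert splitext(splitext(fn)[0])[1] == '.tagAlign'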
Code Example #8
def count_lines(filename):
    if filename.endswith(('.Z', '.gz', '.bz', '.bz2')):
        catcommand = 'gzip -dc'
    else:
        catcommand = 'cat'
    out, err = common.run_pipe([
        '%s %s' % (catcommand, filename),
        'wc -l'
    ])
    return int(out)
Code Example #9
File: spp.py  Project: weng-lab/chip-seq-pipeline
	def process(self, resource_dir):

		# Define output directory
		peaks_dirname = "peaks_spp"
		if not os.path.exists(peaks_dirname): os.makedirs(peaks_dirname)

		# Define output filenames
		# strip the .gz and .tagAlign suffixes (rstrip would strip character sets, not suffixes)
		prefix = self.experiment.name
		for suffix in ('.gz', '.tagAlign'):
			if prefix.endswith(suffix):
				prefix = prefix[:-len(suffix)]
		self.peaks_fn       = prefix + '.regionPeak'
		self.final_peaks_fn = self.peaks_fn + '.gz'
		self.xcor_plot_fn   = prefix + '.pdf'
		self.xcor_scores_fn = prefix + '.ccscores'
		self.fixed_peaks_fn = prefix + '.fixcoord.regionPeak'

		# fragment length is third column in cross-correlation input file
		fragment_length = int(open(self.xcor_scores_input.name, 'r').readline().split('\t')[2])
		print "Read fragment length: %d" % fragment_length

		# install SPP
		ca_tarball  = '%s/caTools/caTools_1.17.1.tar.gz' % resource_dir
		spp_tarball = '%s/phantompeakqualtools/spp_1.10.1.tar.gz' % resource_dir
		bitops_tarball = '%s/bitops/bitops_1.0-6.tar.gz' % resource_dir
		run_spp = '%s/phantompeakqualtools/run_spp_nodups.R' % resource_dir if self.nodups else '%s/phantompeakqualtools/run_spp.R' % resource_dir
		if not os.path.exists(os.path.expanduser("~/R-libs")): os.mkdir(os.path.expanduser("~/R-libs"))
		print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s' % (os.path.expanduser("~/R-libs"), bitops_tarball)), stderr=subprocess.STDOUT)
		print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s' % (os.path.expanduser("~/R-libs"), ca_tarball)), stderr=subprocess.STDOUT)
		print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s/snow/snow_0.4-1.tar.gz' % (os.path.expanduser("~/R-libs"), resource_dir)), stderr=subprocess.STDOUT)
		print subprocess.check_output(shlex.split('R CMD INSTALL -l %s %s' % (os.path.expanduser("~/R-libs"), spp_tarball)), stderr=subprocess.STDOUT)

		# run SPP
		spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" % (run_spp, self.cpu_count(), self.experiment.name, self.control.name, self.npeaks, fragment_length, self.peaks_fn, self.xcor_plot_fn, self.xcor_scores_fn)
		print spp_command
		process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
		for line in iter(process.stdout.readline, ''):
			sys.stdout.write(line)

		# various fixes to ensure that coordinates fall within chr boundaries and are in the correct format
		common.run_pipe([
			"gzip -dc %s" % self.final_peaks_fn,
			"tee %s" % self.peaks_fn,
			r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
			'slopBed -i stdin -g %s -b 0' % self.chrom_sizes.name,
			'bedClip stdin %s %s' % (self.chrom_sizes.name, self.fixed_peaks_fn)])
Code Example #10
def blacklist_filter(input_fname, output_fname, input_blacklist_fname):
	
	with open(input_fname, 'rb') as fh:
		gzipped = fh.read(2) == b'\x1f\x8b'
	if gzipped:
		peaks_fname = 'peaks.bed'
		out,err = common.run_pipe(['gzip -dc %s' %(input_fname)], peaks_fname)
	else:
		peaks_fname = input_fname

	with open(input_blacklist_fname, 'rb') as fh:
		gzipped = fh.read(2) == b'\x1f\x8b'
	if gzipped:
		blacklist_fname = 'blacklist.bed'
		out, err = common.run_pipe(['gzip -dc %s' %(input_blacklist_fname)], blacklist_fname)
	else:
		blacklist_fname = input_blacklist_fname

	out, err = common.run_pipe([
		'subtractBed -A -a %s -b %s' %(peaks_fname, blacklist_fname)
		], output_fname)
Code Example #11
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
	n_peaks = common.count_lines(fn)
	sorted_fn = 'sorted-%s' %(fn)
	rescaled_fn = 'rescaled-%s' %(fn)
	out,err = common.run_pipe([
		'sort -k %dgr,%dgr %s' %(scores_col, scores_col, fn),
		r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
		sorted_fn)
	out, err = common.run_pipe([
		'head -n 1 %s' %(sorted_fn),
		'cut -f %s' %(scores_col)])
	max_score = float(out.strip())
	out, err = common.run_pipe([
		'tail -n 1 %s' %(sorted_fn),
		'cut -f %s' %(scores_col)])
	min_score = float(out.strip())
	out,err = common.run_pipe([
		'cat %s' %(sorted_fn),
		r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" %(scores_col, min_score, max_score, new_min, new_max) + \
		r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" %(scores_col)],
		rescaled_fn)
	return rescaled_fn
Code Example #12
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
    n_peaks = common.count_lines(fn)
    sorted_fn = 'sorted-%s' % (fn)
    rescaled_fn = 'rescaled-%s' % (fn)
    out, err = common.run_pipe([
        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn),
        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""
    ], sorted_fn)
    out, err = common.run_pipe(
        ['head -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
    max_score = float(out.strip())
    out, err = common.run_pipe(
        ['tail -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
    min_score = float(out.strip())
    out, err = common.run_pipe([
        'cat %s' % (sorted_fn),
        r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" % (scores_col, min_score, max_score, new_min, new_max) +
        r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" % (scores_col)],
        rescaled_fn)
    return rescaled_fn
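The awk stage applies the usual min-max linear map new = x + (n - a) * (y - x) / (b - a), with [a, b] the observed score range and [x, y] = [new_min, new_max]. A quick arithmetic check with made-up scores:

def rescale(n, a, b, x=10, y=1000):
    # Same formula as the awk stage: map [a, b] linearly onto [x, y].
    return int(((n - a) * (y - x) / float(b - a)) + x)

assert rescale(2, 2, 50) == 10      # the minimum score maps to new_min
assert rescale(50, 2, 50) == 1000   # the maximum score maps to new_max
assert rescale(26, 2, 50) == 505    # 10 + 24 * 990 / 48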
Code Example #13
def blacklist_filter(input_fname, output_fname, input_blacklist_fname):

    with open(input_fname, 'rb') as fh:
        gzipped = fh.read(2) == b'\x1f\x8b'
    if gzipped:
        peaks_fname = 'peaks.bed'
        out, err = common.run_pipe(['gzip -dc %s' % (input_fname)],
                                   peaks_fname)
    else:
        peaks_fname = input_fname

    with open(input_blacklist_fname, 'rb') as fh:
        gzipped = fh.read(2) == b'\x1f\x8b'
    if gzipped:
        blacklist_fname = 'blacklist.bed'
        out, err = common.run_pipe(['gzip -dc %s' % (input_blacklist_fname)],
                                   blacklist_fname)
    else:
        blacklist_fname = input_blacklist_fname

    out, err = common.run_pipe(
        ['subtractBed -A -a %s -b %s' % (peaks_fname, blacklist_fname)],
        output_fname)
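Both versions of blacklist_filter sniff the two-byte gzip magic number rather than trusting the file extension; that check, extracted as a stand-alone helper (hypothetical name):

def is_gzipped(path):
    # Every gzip stream starts with the magic bytes 0x1f 0x8b.
    with open(path, 'rb') as fh:
        return fh.read(2) == b'\x1f\x8b'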
Code Example #14
def scrub(in_filepath, out_filepath):
    # Check the input.
    logger.debug("Input flagstat for %s" % (in_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (in_filepath)))
    # Set up the paths to inputs and outputs.
    dirname = os.path.dirname(out_filepath)
    header_path = os.path.join(dirname, "header.txt")
    sam_path = os.path.join(dirname, "scrubbed.sam")
    # Cache the header.
    shell_command("samtools view -H %s -o %s" % (in_filepath, header_path))
    # Scrub the sequence from field 10 with awk.
    common.run_pipe([
        'samtools view %s' % (in_filepath),
        r"""awk '{OFS="\t"} {s=""; for(i=1;i<=length($10);i++) s=(s "N"); $10=s; $11="*"; print}'"""
    ], sam_path)
    # Add back the header.
    common.run_pipe([
        'cat %s %s' % (header_path, sam_path),
        'samtools view -S -b - -o %s' % (out_filepath)
    ])
    # Check the output.
    logger.debug("Output flagstat for %s" % (out_filepath))
    logger.debug(shell_command("samtools flagstat %s" % (out_filepath)))
Code Example #15
def pool(inputs, prefix=None):
    input_filenames = list(inputs)
    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = common.run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -cn'],
        outfile=pooled_filename)
    pooled = pooled_filename
    output = {"pooled": pooled}
    return output
Code Example #16
def pool(inputs, prefix=None):

    input_filenames = inputs

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    # outfile needs to be reduced to basename to direct cromwell
    # output to the correct place
    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=os.path.basename(pooled_filename))

    output = {
        "pooled": pooled_filename
    }

    return output
Code Example #17
def main(input_bam, paired_end):

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # note: rstrip('.bam') strips a character set, not a suffix, so remove it explicitly
    input_bam_basename = input_bam_file.name
    if input_bam_basename.endswith('.bam'):
        input_bam_basename = input_bam_basename[:-len('.bam')]
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    samtools = SAMTOOLS_PATH["1.0"]

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "%s sort -@ %d -n %s %s" \
            % (samtools, cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))

        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)

    return output
Code Example #18
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor,
                                     paired_end, chrom_sizes, as_file,
                                     peak_type, prefix, fragment_length=None):

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - each is defined explicitly because input files
    # could share the same name, in which case a later download would
    # overwrite an earlier one
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn       = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis
    # overlapping peaks are just based on pseudoreps of the one pool
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
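In the narrowPeak case above, intersectBed -wo emits the 10 pooled-peak columns, the 10 replicate-peak columns, and the overlap in bp as $21; the awk filter keeps a pair when the overlap covers at least half of either peak. The same test in plain Python, with made-up coordinates:

def passes_overlap(pooled_start, pooled_end, rep_start, rep_end, overlap_bp):
    # Keep the pair if the overlap is >= 50% of either peak's length.
    s1 = pooled_end - pooled_start
    s2 = rep_end - rep_start
    return overlap_bp / float(s1) >= 0.5 or overlap_bp / float(s2) >= 0.5

assert passes_overlap(100, 200, 150, 400, 50)     # 50/100 >= 0.5
assert not passes_overlap(100, 300, 0, 1000, 60)  # 60/200 and 60/1000 < 0.5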
Code Example #19
File: xcor.py  Project: ENCODE-DCC/chip-seq-pipeline
def main(input_bam, paired_end, spp_version):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # note: rstrip('.bam') strips a character set, not a suffix, so remove it explicitly
    input_bam_basename = input_bam_file.name
    if input_bam_basename.endswith('.bam'):
        input_bam_basename = input_bam_basename[:-len('.bam')]
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================

    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # ================================
    logger.info(
        "Intermediate tA md5: %s" % (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info(
        "Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)])
    out, err = common.run_pipe([
        r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe([
        "mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
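xcor_parse is called above but never shown on this page; here is a minimal sketch consistent with the CC_SCORE layout documented in the comments (an assumption about the real helper, not its actual source):

def xcor_parse(fname):
    # Map the 11 tab-separated CC_SCORE fields to their documented names.
    keys = ['Filename', 'numReads', 'estFragLen', 'corr_estFragLen',
            'PhantomPeak', 'corr_phantomPeak', 'argmin_corr', 'min_corr',
            'phantomPeakCoef', 'relPhantomPeakCoef', 'QualityTag']
    with open(fname) as fh:
        values = fh.readline().rstrip('\n').split('\t')
    return dict(zip(keys, values))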
Code Example #20
def main(input_tags, prefix=None):

    input_tags_file = dxpy.DXFile(input_tags)

    input_tags_filename = input_tags_file.name
    dxpy.download_dxfile(input_tags_file.get_id(), input_tags_filename)

    # introspect the file to determine tagAlign (thus SE) or BEDPE (thus PE)
    # strip extension as appropriate

    subprocess.check_output('ls', shell=True)
    with gzip.open(input_tags_filename) as f:
        firstline = f.readline()
    logger.info('First line of input_tags:\n%s' % (firstline))

    se_cols = 6
    pe_cols = 10
    if re.match('^(\S+[\t\n]){%d}$' % (se_cols), firstline):
        paired_end = False
        input_tags_basename = prefix or re.sub(r'\.tagAlign\.gz$', '', input_tags_filename)
        filename_infix = 'SE'
        logger.info("Detected single-end data")
    elif re.match('^(\S+[\t\n]){%d}$' % (pe_cols), firstline):
        paired_end = True
        input_tags_basename = prefix or re.sub(r'\.bedpe\.gz$', '', input_tags_filename)
        filename_infix = 'PE2SE'
        logger.info("Detected paired-end data")
    else:
        raise IOError(
            "%s is neither a BEDPE or tagAlign file" % (input_tags_filename))

    pr_ta_filenames = \
        [input_tags_basename + ".%s.pr1.tagAlign.gz" % (filename_infix),
         input_tags_basename + ".%s.pr2.tagAlign.gz" % (filename_infix)]

    # count lines in the file
    out, err = common.run_pipe([
        'gzip -dc %s' % (input_tags_filename),
        'wc -l'])
    # number of lines in each split
    nlines = (int(out)+1)/2
    # Shuffle and split BEDPE file into 2 equal parts
    # by using the input to seed shuf we ensure multiple runs with the same
    # input will produce the same output
    # Produces two files named splits_prefix0n, n=1,2
    splits_prefix = 'temp_split'
    out, err = common.run_pipe([
        'gzip -dc %s' % (input_tags_filename),
        'shuf --random-source=%s' % (input_tags_filename),
        'split -a 2 -d -l %d - %s' % (nlines, splits_prefix)])
    # Convert read pairs to reads into standard tagAlign file
    for i, index in enumerate(['00', '01']):  # could be made multi-threaded
        steps = ['cat %s' % (splits_prefix+index)]
        if paired_end:
            steps.extend([r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""])
        steps.extend(['gzip -cn'])
        out, err = common.run_pipe(steps, outfile=pr_ta_filenames[i])

    pseudoreplicate1_file = dxpy.upload_local_file(pr_ta_filenames[0])
    pseudoreplicate2_file = dxpy.upload_local_file(pr_ta_filenames[1])

    output = {
        "pseudoreplicate1": dxpy.dxlink(pseudoreplicate1_file),
        "pseudoreplicate2": dxpy.dxlink(pseudoreplicate2_file)
    }

    return output
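The paired-end awk stage above turns one 10-column BEDPE row into two 6-column tagAlign rows, mate 1 first; the same transformation in plain Python, for reference:

def bedpe_to_tagalign(line):
    # Fields 1-3 and 9 describe mate 1, fields 4-6 and 10 describe mate 2
    # ($9/$10 are the strands); read name and score are fixed to N/1000.
    f = line.rstrip('\n').split('\t')
    mate1 = '\t'.join([f[0], f[1], f[2], 'N', '1000', f[8]])
    mate2 = '\t'.join([f[3], f[4], f[5], 'N', '1000', f[9]])
    return mate1 + '\n' + mate2 + '\n'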
Code Example #21
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as,
         gappedpeak_as, broadpeak_as, genomesize):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    #Define the output filenames

    peaks_dirname = 'peaks'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    #Extract the fragment length estimate from column 3 of the cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  #third column
        print "Fraglen %s" % (fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores('%s/%s_peaks.narrowPeak' %
                                                   (peaks_dirname, prefix),
                                                   scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to broadPeak.as format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores('%s/%s_peaks.broadPeak' %
                                                  (peaks_dirname, prefix),
                                                  scores_col=5)

    # Sort by Col8 (for broadPeak) or Col14 (for gappedPeak) in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # Rescale Col5 scores to range 10-1000 to conform to gappedPeak.as format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores('%s/%s_peaks.gappedPeak' %
                                                   (peaks_dirname, prefix),
                                                   scores_col=5)

    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================

    # This file is a tab delimited file with 2 columns Col1 (chromosome name), Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
        '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' %
        (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000

    out, err = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    print "chipReads = %s, controlReads = %s, sval = %s" % (chipReads,
                                                            controlReads, sval)

    returncode = common.block_on(
     'macs2 bdgcmp ' + \
     '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
     '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
     '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
     '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' %
        (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigBeds from the peak beds to support trackhub visualization
    #============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes.name,
                                        narrowPeak_as.name,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes.name,
                                        gappedPeak_as.name,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes.name,
                                       broadPeak_as.name,
                                       bed_type='bed6+3')

    #Temporary during development to create empty files just to get the applet to exit
    for fn in [
            narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn,
            gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn
    ]:
        common.block_on('touch %s' % (fn))

    # Upload the file outputs

    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.

    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
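The -S argument handed to macs2 bdgcmp is simply the smaller library size expressed in millions of reads; a quick check with hypothetical counts:

chipReads, controlReads = 21500000.0, 18200000.0
sval = str(min(chipReads, controlReads) / 1000000)
assert sval == '18.2'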
Code Example #22
def main(input_bam, fastqs, debug):
    # create a file handler
    if len(fastqs) > 1:
        paired_end = True
    else:
        paired_end = False
    handler = logging.FileHandler('xcor.log')

    if debug:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    input_bam_filename = input_bam
    # take the basename, then strip the .bam suffix (rstrip strips character sets)
    input_bam_basename = input_bam.split('/')[-1]
    if input_bam_basename.endswith('.bam'):
        input_bam_basename = input_bam_basename[:-len('.bam')]

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================

    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)
    samtools = SAMTOOLS_PATH
    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "%s sort -n -@%d -o %s %s" \
            % (samtools, cpu_count(), final_nmsrt_bam_filename, input_bam_filename)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # ================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    # run spp
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (SPP_TOOL_PATH, subsampled_TA_filename, cpu_count(), CC_plot_filename,
         CC_scores_filename)
    ])
    out, err = common.run_pipe(
        [r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    tagAlign_file = final_TA_filename
    if paired_end:
        BEDPE_file = final_BEDPE_filename

    CC_scores_file = CC_scores_filename
    CC_plot_file = CC_plot_filename
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": tagAlign_file,
        "CC_scores_file": CC_scores_file,
        "CC_plot_file": CC_plot_file,
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": int(xcor_qc.get('estFragLen'))
    }
    with open('xcor.json', 'w') as f:
        json.dump(output, f, sort_keys=True, indent=4, separators=(',', ': '))
    if paired_end:
        output.update({"BEDPE_file": BEDPE_file})

    return output
Code Example #23
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep2_ta_file         = dxpy.DXFile(rep2_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file       = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - each is defined explicitly because input files
    # could share the same name, in which case a later download would
    # overwrite an earlier one
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn         = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn       = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn       = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            '(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If the fragment length was given by the user we skip the
    # pooled_replicates_xcor_subjob, set pool_xcor_filename to None, and set
    # the fragment_length_given_by_user flag. Otherwise, run the subjob so
    # the fragment length can be extracted from the cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in        = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out       = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected  = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
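A note on the overlap test above: intersectBed -wo prints peak A, peak B, and then the overlap in bp as the last column, and the awk filters keep a pair when the overlap covers at least half of either peak. A minimal Python sketch of the same test (hypothetical helper, not part of the applet; field indices assume narrowPeak, i.e. 10 columns per peak):

def overlaps_enough(fields, b_offset=10, min_frac=0.5):
    """fields: one tab-split 'intersectBed -wo' line for narrowPeak inputs."""
    size_a = int(fields[2]) - int(fields[1])                        # $3 - $2
    size_b = int(fields[b_offset + 2]) - int(fields[b_offset + 1])  # $13 - $12
    overlap = float(fields[-1])                                     # $21: overlap in bp
    return overlap / size_a >= min_frac or overlap / size_b >= min_frac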
コード例 #24
0
def postprocess(crop_length, reference_tar, debug, reads_files):

    handler = logging.FileHandler('post_mapping.log')

    if debug:
        handler.setLevel(logging.DEBUG)
    else:
        handler.setLevel(logging.INFO)
    logger.addHandler(handler)

    samtools = SAMTOOLS_PATH
    bwa = BWA_PATH
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    indexed_reads = []
    unmapped_reads = []
    figure_out_sort(reads_files, unmapped_reads, indexed_reads)
    indexed_reads_filenames = []
    unmapped_reads_filenames = []

    for i, reads in enumerate(indexed_reads):
        read_pair_number = i + 1
        logger.info("indexed_reads %d: %s" % (read_pair_number, reads))
        indexed_reads_filenames.append(reads)

        unmapped = unmapped_reads[i]
        logger.info("unmapped reads %d: %s" % (read_pair_number, unmapped))
        unmapped_reads_filenames.append(unmapped)

    reference_tar_filename = reference_tar
    logger.info("reference_tar: %s" % (reference_tar_filename))
    # extract the reference files from the tar
    reference_dirname = '.'

    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)

    logger.info("Using reference file: %s" % (reference_filename))

    paired_end = len(indexed_reads) == 2

    # fixing the directories
    if paired_end:
        r1_basename = (strip_extensions(unmapped_reads_filenames[0],
                                        STRIP_EXTENSIONS)).split('/')[-1]
        r2_basename = (strip_extensions(unmapped_reads_filenames[1],
                                        STRIP_EXTENSIONS)).split('/')[-1]
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = (strip_extensions(unmapped_reads_filenames[0],
                                           STRIP_EXTENSIONS)).split('/')[-1]

    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"
        steps = [
            "%s sampe -P %s %s %s %s %s" %
            (bwa, reference_filename, reads1_filename, reads2_filename,
             unmapped_reads1_filename, unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort", "uniq"
        ]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))

        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)
        ]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s" %
            (bwa, reference_filename, reads_filename, unmapped_reads_filename)
        ]

    steps.extend([
        "%s view -@%d -Su -" % (samtools, cpu_count()),
        "%s sort -@%d -o %s" % (samtools, cpu_count(), raw_bam_filename)
    ])  # samtools adds .bam

    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)

    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(shlex.split("%s flagstat %s" %
                                          (samtools, raw_bam_filename)),
                              stdout=fh)
    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = raw_bam_filename
    mapping_statistics = raw_bam_mapstats_filename
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)

    output = {
        'mapped_reads': mapped_reads,
        'mapping_statistics': mapping_statistics,
        'n_mapped_reads': flagstat_qc.get('mapped')[0],  # 0 is hi-q reads
        "crop_length": crop_length,
        "paired_end": paired_end
    }
    with open('post_mapping.json', 'w') as f:
        json.dump(output, f, sort_keys=True, indent=4, separators=(',', ': '))
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
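The awk one-liner feeding badreads.tmp flags reads whose CIGAR operation lengths (after dropping deletions) do not sum to the read length, a symptom of corrupt bwa sampe records. A rough Python equivalent of the same test (hypothetical helper, not part of the applet):

import re

def has_bad_cigar(cigar, seq):
    # mirror the awk guards: header lines are excluded upstream; skip
    # unmapped records (CIGAR "*")
    if cigar == '*':
        return False
    trimmed = re.sub(r'[0-9]+D', '', cigar)       # gsub("[0-9]+D", "", cigar)
    op_lengths = [int(n) for n in re.findall(r'[0-9]+', trimmed)]
    return sum(op_lengths) != len(seq)            # s != seqlen => bad read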
コード例 #25
0
def main(experiment,
         control,
         xcor_scores_input,
         chrom_sizes,
         narrowpeak_as,
         gappedpeak_as,
         broadpeak_as,
         genomesize,
         prefix=None,
         fragment_length=None):

    narrowPeak_as = narrowpeak_as
    gappedPeak_as = gappedpeak_as
    broadPeak_as = broadpeak_as

    # Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    if not prefix:
        prefix = experiment
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    # if the fragment_length argument is given, use that instead
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % fraglen)
    else:
        with open(xcor_scores_input, 'r') as fh:
            firstline = fh.readline()
            fraglen = firstline.split()[2]  # third column
            logger.info("Fraglen %s" % (fraglen))

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' % (experiment, control) + \
              '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_narrowpeak_fn = common.slop_clip(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix), chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format
    # (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn,
                                                   scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4
    # with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn), 'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' % (experiment, control) + \
              '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' % (genomesize, fraglen)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_broadpeak_fn = common.slop_clip(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix), chrom_sizes)

    # Rescale Col5 scores to range 10-1000 to conform to broadPeak.as format
    # (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn,
                                                  scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn), 'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak' %
                                              (peaks_dirname, prefix),
                                              chrom_sizes,
                                              bed_type='gappedPeak')

    # Rescale Col5 scores to range 10-1000 to conform to gappedPeak.as format
    # (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn,
                                                   scores_col=5)

    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn), 'gzip -cn'
    ]
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================

    # chrom_sizes is a tab-delimited file with 2 columns: Col1 (chromosome
    # name) and Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' % (peaks_dirname, prefix) + \
              '-m FE'
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' %
        (chrom_sizes, peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)

    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' % (peaks_dirname, prefix) + \
              '%s ' % (chrom_sizes) + \
              '%s' % (fc_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================

    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    out, err = common.run_pipe(['gzip -dc %s' % (experiment), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    logger.info("chipReads = %s, controlReads = %s, sval = %s" %
                (chipReads, controlReads, sval))

    returncode = common.block_on('macs2 bdgcmp ' +
                                 '-t %s/%s_treat_pileup.bdg ' %
                                 (peaks_dirname, prefix) +
                                 '-c %s/%s_control_lambda.bdg ' %
                                 (peaks_dirname, prefix) +
                                 '--outdir %s -o %s_ppois.bdg ' %
                                 (peaks_dirname, prefix) + '-m ppois -S %s' %
                                 (sval))
    logger.info("MACS2 exited with returncode %d" % (returncode))
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' %
        (chrom_sizes, peaks_dirname, prefix)
    ]
    out, err = common.run_pipe(pipe)

    # rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' % (peaks_dirname, prefix) + \
              '%s ' % (chrom_sizes) + \
              '%s' % (pvalue_signal_fn)
    logger.info(command)
    returncode = common.block_on(command)
    logger.info("bedGraphToBigWig exited with returncode %d" % (returncode))
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    # rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    # ===========================================
    # Generate bigBeds from beds to support trackhub visualization of peak files
    # ============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes,
                                        narrowPeak_as,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes,
                                        gappedPeak_as,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes,
                                       broadPeak_as,
                                       bed_type='bed6+3')

    # Temporary during development to create empty files just to get the applet
    # to exit
    # narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    # gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    # broadPeak_bb_fn  = "%s.bb" % (broadPeak_fn)

    output = {
        "narrowpeaks": narrowPeak_gz_fn,
        "gappedpeaks": gappedPeak_gz_fn,
        "broadpeaks": broadPeak_gz_fn,
        "narrowpeaks_bb": narrowPeak_bb_fname,
        "gappedpeaks_bb": gappedPeak_bb_fname,
        "broadpeaks_bb": broadPeak_bb_fname,
        "fc_signal": fc_signal_fn,
        "pvalue_signal": pvalue_signal_fn
    }

    return output
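common.rescale_scores is called above but not shown. A plausible sketch, assuming it linearly maps the observed score range onto [10, 1000] so the column-5 values satisfy the .as score constraint (this is an assumption about the helper, not its actual code):

def rescale_scores_sketch(scores, lo=10, hi=1000):
    # linear min-max rescale of a list of numeric scores into [lo, hi]
    smin, smax = min(scores), max(scores)
    if smax == smin:
        return [lo for _ in scores]              # degenerate: all scores equal
    span = float(hi - lo) / (smax - smin)
    return [int(lo + (s - smin) * span) for s in scores]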
コード例 #26
0
def main(input_tagAlign, paired_end, spp_version):

    input_tagAlign_file = dxpy.DXFile(input_tagAlign)

    input_tagAlign_filename = input_tagAlign_file.name
    input_tagAlign_basename = input_tagAlign_file.name.rstrip('.gz')
    dxpy.download_dxfile(input_tagAlign_file.get_id(), input_tagAlign_filename)

    uncompressed_TA_filename = input_tagAlign_basename
    out, err = common.run_pipe(['gzip -d %s' % (input_tagAlign_filename)])

    # =================================
    # Subsample tagAlign file
    # ================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_tagAlign_basename + \
        ".sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (uncompressed_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp_nodups.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (run_spp_command, subsampled_TA_filename, cpu_count(),
         CC_plot_filename, CC_scores_filename)
    ])
    out, err = common.run_pipe(
        [r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }

    return output
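The CC_SCORE comment block above documents eleven tab-separated fields on a single line. A parser sketch consistent with that documented format (the applet's actual xcor_parse may differ):

CC_FIELDS = [
    'Filename', 'numReads', 'estFragLen', 'corr_estFragLen', 'PhantomPeak',
    'corr_phantomPeak', 'argmin_corr', 'min_corr', 'phantomPeakCoef',
    'relPhantomPeakCoef', 'QualityTag',
]

def parse_cc_scores(path):
    # read the single data line and pair each value with its field name
    with open(path) as fh:
        values = fh.readline().rstrip('\n').split('\t')
    return dict(zip(CC_FIELDS, values))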
コード例 #27
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

        # The following line(s) download your file inputs to the local file system
        # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    output_filename_prefix = experiment_filename.rstrip(".gz").rstrip(".tagAlign")
    peaks_filename = output_filename_prefix + ".regionPeak"
    final_peaks_filename = peaks_filename + ".gz"  # spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + ".pdf"
    xcor_scores_filename = output_filename_prefix + ".ccscores"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3  # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, "r") as f:
        line = f.readline()
        fragment_length = int(line.split("\t")[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

        # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = "/phantompeakqualtools/spp_1.10.1.tar.gz"
    if nodups:
        run_spp = "/phantompeakqualtools/run_spp_nodups.R"
    else:
        run_spp = "/phantompeakqualtools/run_spp.R"
        # install spp
    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split("R CMD INSTALL %s" % (spp_tarball)), stderr=subprocess.STDOUT)
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" % (
        run_spp,
        cpu_count(),
        experiment_filename,
        control_filename,
        npeaks,
        fragment_length,
        peaks_filename,
        xcor_plot_filename,
        xcor_scores_filename,
    )
    print spp_command
    process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ""):
        sys.stdout.write(line)

        # when one of the peak coordinates is an exact multiple of 10, spp (R) outputs the coordinate in scientific notation
        # this changes any such coordinates to decimal notation
        # this assumes 10-column output and that the 2nd and 3rd columns are coordinates
        # slopBed adjusts feature end coordinates that go off the end of the chromosome
        # bedClip removes any features that are still not within the boundaries of the chromosome

    fix_coordinate_peaks_filename = output_filename_prefix + ".fixcoord.regionPeak"

    out, err = common.run_pipe(
        [
            "gzip -dc %s" % (final_peaks_filename),
            "tee %s" % (peaks_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
            "slopBed -i stdin -g %s -b 0" % (chrom_sizes_filename),
            "bedClip stdin %s %s" % (chrom_sizes_filename, fix_coordinate_peaks_filename),
        ]
    )

    # These lines transfer the peaks files to the temporary workspace for debugging later
    # Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)
    )
    print "First 50 peaks"
    print subprocess.check_output("head -50 %s" % (fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split("gzip %s" % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + ".gz"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
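The awk stage above rewrites columns 2 and 3 with sprintf("%i", ...) because spp can emit coordinates in scientific notation (e.g. 1e+06). The same normalization as a hypothetical Python helper:

def fix_peak_coordinates(line):
    # rewrite start/end (columns 2 and 3) as plain integers
    fields = line.rstrip('\n').split('\t')
    fields[1] = '%i' % float(fields[1])   # sprintf("%i", $2)
    fields[2] = '%i' % float(fields[2])   # sprintf("%i", $3)
    return '\t'.join(fields)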
コード例 #28
0
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks,
         chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks      = dxpy.DXFile(rep1_peaks)
    rep2_peaks      = dxpy.DXFile(rep2_peaks)
    pooled_peaks    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes     = dxpy.DXFile(chrom_sizes)
    as_file         = dxpy.DXFile(as_file)

    # Input filenames - define each explicitly because input files could have
    # the same name, in which case a later download would overwrite an
    # earlier file
    rep1_peaks_fn       = 'rep1-%s' %(rep1_peaks.name)
    rep2_peaks_fn       = 'rep2-%s' %(rep2_peaks.name)
    pooled_peaks_fn     = 'pooled-%s' %(pooled_peaks.name)
    pooledpr1_peaks_fn  = 'pooledpr1-%s' %(pooledpr1_peaks.name)
    pooledpr2_peaks_fn  = 'pooledpr2-%s' %(pooledpr2_peaks.name)
    chrom_sizes_fn      = 'chrom.sizes'
    as_file_fn          = '%s.as' %(peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type), pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn    = '%s.replicated.%s' %(basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' %(basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn   = 'replicated_tr.%s' %(peak_type)
    overlap_pr_fn   = 'replicated_pr.%s' %(peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    #the only difference between the peak_types is how the extra columns are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where overlap is defined as the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' %(overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))

    #rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %(pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" %(common.count_lines(rejected_peaks_fn))

    npeaks_in       = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out      = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    #make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn    = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn    = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        'npeaks_rejected'       : npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
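common.bed2bb is used here but not shown. A plausible sketch, assuming it coordinate-sorts the BED and then wraps UCSC bedToBigBed with the chrom.sizes table and the .as autosql definition (an assumption about the helper, not its actual code):

import subprocess

def bed2bb_sketch(bed_fn, chrom_sizes_fn, as_fn, bed_type='bed6+4'):
    bb_fn = bed_fn + '.bb'
    sorted_fn = bed_fn + '.sorted'
    # bedToBigBed requires input sorted by chromosome, then start position
    with open(sorted_fn, 'w') as fh:
        subprocess.check_call(['sort', '-k1,1', '-k2,2n', bed_fn], stdout=fh)
    subprocess.check_call([
        'bedToBigBed', '-type=%s' % (bed_type), '-as=%s' % (as_fn),
        sorted_fn, chrom_sizes_fn, bb_fn])
    return bb_fn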
コード例 #29
0
def postprocess(indexed_reads, unmapped_reads, reference_tar,
                bwa_version, samtools_version, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (samtools_version)
    bwa = BWA_PATH.get(bwa_version)
    assert bwa, "BWA version %s is not supported" % (bwa_version)
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    indexed_reads_filenames = []
    unmapped_reads_filenames = []
    for i, reads in enumerate(indexed_reads):
        read_pair_number = i+1

        fn = dxpy.describe(reads)['name']
        logger.info("indexed_reads %d: %s" % (read_pair_number, fn))
        indexed_reads_filenames.append(fn)
        dxpy.download_dxfile(reads, fn)

        unmapped = unmapped_reads[i]
        fn = dxpy.describe(unmapped)['name']
        logger.info("unmapped reads %d: %s" % (read_pair_number, fn))
        unmapped_reads_filenames.append(fn)
        dxpy.download_dxfile(unmapped, fn)

    reference_tar_filename = dxpy.describe(reference_tar)['name']
    logger.info("reference_tar: %s" % (reference_tar_filename))
    dxpy.download_dxfile(reference_tar, reference_tar_filename)
    # extract the reference files from the tar
    reference_dirname = 'reference_files'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))

    paired_end = len(indexed_reads) == 2

    if paired_end:
        r1_basename = strip_extensions(
            unmapped_reads_filenames[0], STRIP_EXTENSIONS)
        r2_basename = strip_extensions(
            unmapped_reads_filenames[1], STRIP_EXTENSIONS)
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = strip_extensions(
            unmapped_reads_filenames[0], STRIP_EXTENSIONS)
    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"
        steps = [
            "%s sampe -P %s %s %s %s %s"
            % (bwa, reference_filename, reads1_filename, reads2_filename,
               unmapped_reads1_filename, unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort",
            "uniq"]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))

        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s"
            % (bwa, reference_filename,
               reads_filename, unmapped_reads_filename)]

    if samtools_version == "0.1.9":
        steps.extend([
            "%s view -Su -" % (samtools),
            "%s sort - %s"
            % (samtools, raw_bam_filename.rstrip('.bam'))])  # samtools adds .bam
    else:
        steps.extend([
            "%s view -@%d -Su -" % (samtools, cpu_count()),
            "%s sort -@%d - %s"
            % (samtools, cpu_count(), raw_bam_filename.rstrip('.bam'))])  # samtools adds .bam

    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)

    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(
            shlex.split("%s flagstat %s" % (samtools, raw_bam_filename)),
            stdout=fh)
    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = dxpy.upload_local_file(raw_bam_filename)
    mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename)
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)

    output = {
        'mapped_reads': dxpy.dxlink(mapped_reads),
        'mapping_statistics': dxpy.dxlink(mapping_statistics),
        'n_mapped_reads': flagstat_qc.get('mapped')[0]  # 0 is hi-q reads
    }
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
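The version switch above exists because legacy samtools (0.1.x) sort takes an output prefix and appends ".bam" itself, while 1.x builds accept a thread count and, in current releases, an explicit -o output path. A sketch of the same dispatch (hypothetical helper; the -o form is an alternative to the prefix form this applet runs):

def sort_command(samtools, samtools_version, out_bam, threads=1):
    if samtools_version.startswith('0.1.'):
        # old syntax: output *prefix*; samtools appends .bam itself
        prefix = out_bam[:-4] if out_bam.endswith('.bam') else out_bam
        return '%s sort - %s' % (samtools, prefix)
    # newer syntax: threaded, explicit output path, read from stdin
    return '%s sort -@%d -o %s -' % (samtools, threads, out_bam)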
コード例 #30
0
ファイル: 02chip_filterQC.py プロジェクト: wkl1990/ChIP-seq
def main(input_bam, paired_end, samtools_version, samtools_params,
         picard_version, scrub, debug):
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (
        samtools_version)
    picard = PICARD_PATH.get(picard_version)
    assert picard, "picard version %s is not supported" % (picard_version)
    logger.info("In postprocess with samtools %s and picard %s" %
                (samtools, picard))

    raw_bam_file = input_bam
    raw_bam_filename = raw_bam_file
    raw_bam_basename = raw_bam_file.rstrip('.bam')
    raw_bam_file_mapstats_filename = raw_bam_basename + '.flagstat.qc'
    subprocess.check_output('set -x; ls -l', shell=True)

    # Generate initial mapping statistics
    with open(raw_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "%s flagstat %s" % (samtools, raw_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove unmapped reads, mate-unmapped reads,
        # non-primary alignments, and reads failing platform QC
        # Remove low-MAPQ reads
        # Only keep properly paired reads
        # Obtain name-sorted BAM file
        # =============================
        tmp_filt_bam_prefix = "%s.tmp" % (filt_bam_prefix
                                          )  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform QC, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "%s view -F 1804 -f 2 %s -u %s" %
            (samtools, samtools_params, raw_bam_filename),
            # sort:  -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "%s sort -n - %s" % (samtools, tmp_filt_bam_prefix)
        ])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "%s fixmate -r %s -" % (samtools, tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "%s view -F 1804 -f 2 -u -" % (samtools),
            # produce the coordinate-sorted BAM
            "%s sort - %s" % (samtools, filt_bam_prefix)
        ])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped reads,
        # non-primary alignments, and reads failing platform QC
        # Remove low-MAPQ reads
        # Obtain filtered BAM file
        # =============================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = (
                "%s view -F 1804 %s -b %s" %
                (samtools, samtools_params, raw_bam_filename))
            logger.info(samtools_filter_command)
            subprocess.check_call(shlex.split(samtools_filter_command),
                                  stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar %s" % (picard),
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT", "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
    ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "%s view -F 1804 -f2 -b %s" % (samtools, filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "%s view -F 1804 -b %s" % (samtools, filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(shlex.split(samtools_dedupe_command), stdout=fh)
    # Index final bam file
    samtools2 = SAMTOOLS_PATH.get("1.3.1")
    samtools_index_command = \
        "%s index %s %s" % (samtools2, final_bam_filename, final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "%s flagstat %s" % (samtools, final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "%s sort -no %s -" % (samtools, filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""
        ]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""
        ]
    steps.extend([
        "grep -v 'chrM'", "sort", "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{if(m2){printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}else{printf "%d\t%d\t%d\t%d\t%f\t%f\t%s\n",mt,m0,m1,m2,m0/mt,m1/m0,"Inf"}}'"""
    ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    output = {}
    logger.info("Uploading results files to the project")
    filtered_bam = final_bam_filename
    filtered_bam_index = final_bam_index_filename
    output.update({
        "filtered_bam": filtered_bam,
        "filtered_bam_index": filtered_bam_index
    })

    # If the scrub parameter is true, pass the bams to the scrub applet.
    if scrub:
        scrub_subjob = scrub_main([input_bam, filtered_bam])
        scrubbed_unfiltered_bam = scrub_subjob.get("scrubbed_bams")[0]
        scrubbed_filtered_bam = scrub_subjob.get("scrubbed_bams")[1]
        # Add the optional scrubbed outputs.
        output.update({
            "scrubbed_unfiltered_bam": scrubbed_unfiltered_bam,
            "scrubbed_filtered_bam": scrubbed_filtered_bam
        })

    # Upload or calculate the remaining outputs.
    filtered_mapstats = final_bam_file_mapstats_filename
    dup_file = dup_file_qc_filename
    pbc_file = pbc_file_qc_filename

    logger.info("Calcualting QC metrics")
    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    initial_mapstats_qc = flagstat_parse(raw_bam_file_mapstats_filename)
    final_mapstats_qc = flagstat_parse(final_bam_file_mapstats_filename)
    if paired_end:
        useable_fragments = final_mapstats_qc.get('in_total')[0] / 2
    else:
        useable_fragments = final_mapstats_qc.get('in_total')[0]
    logger.info("initial_mapstats_qc: %s" % (initial_mapstats_qc)),
    logger.info("final_mapstats_qc: %s" % (final_mapstats_qc)),
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files and values.
    output.update({
        "filtered_mapstats": filtered_mapstats,
        "dup_file_qc": dup_file,
        "pbc_file_qc": pbc_file,
        "paired_end": paired_end,
        "n_reads_input": str(initial_mapstats_qc.get('in_total')[0]),
        "picard_read_pairs_examined": str(dup_qc.get('read_pairs_examined')),
        "picard_unpaired_reads_examined":
            str(dup_qc.get('unpaired_reads_examined')),
        "picard_read_pair_duplicates":
            str(dup_qc.get('read_pair_duplicates')),
        "picard_unpaired_read_duplicates":
            str(dup_qc.get('unpaired_read_duplicates')),
        "useable_fragments": str(useable_fragments),
        "NRF": str(pbc_qc.get('NRF')),
        "PBC1": str(pbc_qc.get('PBC1')),
        "PBC2": str(pbc_qc.get('PBC2')),
        "duplicate_fraction": str(dup_qc.get('percent_duplication'))
    })
    parse_file = final_bam_prefix + ".parse"
    with open(parse_file, "w") as fh:
        for key, val in output.items():
            if isinstance(val, list):
                fh.write(": ".join([key, ", ".join(val)]) + "\n")
            else:
                fh.write(": ".join([key, str(val)]) + "\n")
    logger.info("Exiting with output:\n%s" % (pformat(output)))
    return output
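The PBC comment block above defines the complexity metrics the awk END block prints. The same arithmetic as a hypothetical Python helper (mt = total read pairs, m0 = distinct locations, m1/m2 = locations seen exactly once/twice):

def pbc_metrics(mt, m0, m1, m2):
    return {
        'NRF':  float(m0) / mt if mt else float('nan'),   # Distinct / Total
        'PBC1': float(m1) / m0 if m0 else float('nan'),   # OnePair / Distinct
        'PBC2': float(m1) / m2 if m2 else float('inf'),   # OnePair / TwoPair
    }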
コード例 #31
0
def main(input_tagAlign, paired_end):

    input_tagAlign_filename = input_tagAlign
    input_tagAlign_basename = input_tagAlign_filename.rstrip('.gz')

    uncompressed_TA_filename = input_tagAlign_basename
    out, err = common.run_pipe(['gzip -d %s' % (input_tagAlign_filename)])

    # =================================
    # Subsample tagAlign file
    # ================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_tagAlign_basename + \
        ".sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (uncompressed_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    # spp will be installed in the docker container, so this is not needed
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    # for testing, this could refer to the cwd instead
    # does this really have to use the _nodups variant?
    run_spp_command = SPP_TOOL_PATH
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (run_spp_command, subsampled_TA_filename, cpu_count(),
         CC_plot_filename, CC_scores_filename)
    ])
    out, err = common.run_pipe(
        [r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "CC_scores_file": CC_scores_filename,
        "CC_plot_file": CC_plot_filename,
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }

    return output
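The sed stage above strips comma-separated candidate lists (e.g. an estFragLen of "155,175,200") down to the first value in each tab-delimited field. A one-line Python equivalent, as a sketch:

import re

def keep_first_value(line):
    # drop ",rest" within each tab-delimited field, mirroring
    # sed -r 's/,[^\t]+//g'
    return re.sub(r',[^\t]+', '', line)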
コード例 #32
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)

    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)

    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(
        xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(subprocess.check_output(
        'ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        logger.info("Read fragment length: %d" % (fragment_length))

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coordinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome

    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename),
        shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
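
Every example shells out through common.run_pipe(). Its implementation is never shown here; a rough sketch of the behavior implied by the call sites — a list of shell commands piped together, an optional outfile, and the last step's stdout/stderr returned — under those assumptions only:

import shlex
import subprocess


def run_pipe(steps, outfile=None):
    # Emulate "cmd1 | cmd2 | ... [> outfile]": chain Popen objects so each
    # step reads the previous step's stdout.
    prev = None
    for step in steps:
        p = subprocess.Popen(
            shlex.split(step),
            stdin=prev.stdout if prev else None,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        if prev:
            prev.stdout.close()  # let SIGPIPE reach earlier steps
        prev = p
    out, err = prev.communicate()
    if outfile:
        with open(outfile, 'wb') as fh:
            fh.write(out)
    return out, err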
コード例 #33
0
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version,
                samtools_version, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (
        samtools_version)
    bwa = BWA_PATH.get(bwa_version)
    assert bwa, "BWA version %s is not supported" % (bwa_version)
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    indexed_reads_filenames = []
    unmapped_reads_filenames = []
    for i, reads in enumerate(indexed_reads):
        read_pair_number = i + 1

        fn = dxpy.describe(reads)['name']
        logger.info("indexed_reads %d: %s" % (read_pair_number, fn))
        indexed_reads_filenames.append(fn)
        dxpy.download_dxfile(reads, fn)

        unmapped = unmapped_reads[i]
        fn = dxpy.describe(unmapped)['name']
        logger.info("unmapped reads %d: %s" % (read_pair_number, fn))
        unmapped_reads_filenames.append(fn)
        dxpy.download_dxfile(unmapped, fn)

    reference_tar_filename = dxpy.describe(reference_tar)['name']
    logger.info("reference_tar: %s" % (reference_tar_filename))
    dxpy.download_dxfile(reference_tar, reference_tar_filename)
    # extract the reference files from the tar
    reference_dirname = 'reference_files'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))

    paired_end = len(indexed_reads) == 2

    if paired_end:
        r1_basename = strip_extensions(unmapped_reads_filenames[0],
                                       STRIP_EXTENSIONS)
        r2_basename = strip_extensions(unmapped_reads_filenames[1],
                                       STRIP_EXTENSIONS)
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = strip_extensions(unmapped_reads_filenames[0],
                                          STRIP_EXTENSIONS)
    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"
        steps = [
            "%s sampe -P %s %s %s %s %s" %
            (bwa, reference_filename, reads1_filename, reads2_filename,
             unmapped_reads1_filename, unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort", "uniq"
        ]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))

        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)
        ]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s" %
            (bwa, reference_filename, reads_filename, unmapped_reads_filename)
        ]

    if samtools_version == "0.1.9":
        steps.extend([
            "%s view -Su -" % (samtools),
            "%s sort - %s" % (samtools, raw_bam_filename.rstrip('.bam'))
        ])  # samtools adds .bam
    else:
        steps.extend([
            "%s view -@%d -Su -" % (samtools, cpu_count()),
            "%s sort -@%d - %s" %
            (samtools, cpu_count(), raw_bam_filename.rstrip('.bam'))
        ])  # samtools adds .bam

    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)

    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(shlex.split("%s flagstat %s" %
                                          (samtools, raw_bam_filename)),
                              stdout=fh)
    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = dxpy.upload_local_file(raw_bam_filename)
    mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename)
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)

    output = {
        'mapped_reads': dxpy.dxlink(mapped_reads),
        'mapping_statistics': dxpy.dxlink(mapping_statistics),
        'n_mapped_reads': flagstat_qc.get('mapped')[0]  # 0 is hi-q reads
    }
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
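
postprocess() above builds basenames with strip_extensions() and a STRIP_EXTENSIONS constant defined elsewhere. A plausible sketch; the extension list here is a guess:

# Guessed extension list; the real STRIP_EXTENSIONS may differ.
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '.fa', '.fasta']


def strip_extensions(filename, extensions):
    # Peel off matching suffixes repeatedly, so "reads.fastq.gz" -> "reads".
    basename = filename
    stripped = True
    while stripped:
        stripped = False
        for ext in extensions:
            if basename.endswith(ext):
                basename = basename[:-len(ext)]
                stripped = True
    return basename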
コード例 #34
0
ファイル: spp.py プロジェクト: zozo123/chip-seq-pipeline
def main(experiment,
         control,
         xcor_scores_input,
         npeaks,
         nodups,
         bigbed,
         chrom_sizes,
         spp_version,
         as_file=None,
         prefix=None,
         fragment_length=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)

    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)

    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    # if fragment_length is provided, use that. Else read
    # fragment length from xcor file
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % (fraglen))
    else:
        fraglen_column = 3
        with open(xcor_scores_input_filename, 'r') as f:
            line = f.readline()
            fraglen = line.split('\t')[fraglen_column - 1]
            logger.info("Read fragment length: %s" % (fraglen))

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    run_spp = '/phantompeakqualtools/run_spp.R'
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%s -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fraglen, peaks_filename, xcor_plot_filename, xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates are an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation
    # this changes any such coordinates to decimal notation
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates
    # the ($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly chrM) and will cause slopBed
    # to halt at that line, truncating the output of the pipe
    # slopBed adjusts feature end coordinates that go off the end of the
    # chromosome
    # bedClip removes any features that are still not within the boundaries of
    # the chromosome

    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' %
        (chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates" %
        (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output('head -50 %s' % (fix_coordinate_peaks_filename),
                            shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
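
コード例 #32 resolves its spp tarball through SPP_VERSION_MAP (the same lookup is commented out in コード例 #34 and #35), but the constant itself never appears in these examples. A plausible shape, with the 1.10.1 path taken from コード例 #37:

SPP_VERSION_MAP = {
    # This path appears verbatim in コード例 #37; any further entries in
    # the real pipeline are not shown here.
    "1.10.1": "/phantompeakqualtools/spp_1.10.1.tar.gz",
}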
コード例 #35
0
def main(input_tagAlign, paired_end, spp_version):

    input_tagAlign_file = dxpy.DXFile(input_tagAlign)

    input_tagAlign_filename = input_tagAlign_file.name
    input_tagAlign_basename = input_tagAlign_file.name.rstrip('.gz')
    dxpy.download_dxfile(input_tagAlign_file.get_id(), input_tagAlign_filename)

    uncompressed_TA_filename = input_tagAlign_basename
    out, err = common.run_pipe(['gzip -d %s' % (input_tagAlign_filename)])

    # =================================
    # Subsample tagAlign file
    # =================================
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_tagAlign_basename + \
        ".sample.%d.%s.tagAlign.gz" % (NREADS // 1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (uncompressed_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, uncompressed_TA_filename)]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s"
        % (run_spp_command, subsampled_TA_filename, cpu_count(),
           CC_plot_filename, CC_scores_filename)])
    out, err = common.run_pipe([
        r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe([
        "mv temp %s" % (CC_scores_filename)])

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }

    return output
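
The grep/shuf/gzip pipeline above is deterministic because shuf is seeded with the input file itself (--random-source). A pure-Python mimic of that property, suitable only for small test inputs since it holds the file in memory (illustrative, not pipeline code):

import gzip
import hashlib
import random


def subsample_ta(path_in, path_out, nreads=15000000):
    # Drop chrM tags, shuffle deterministically, keep the first nreads.
    with open(path_in) as f:
        lines = [l for l in f if 'chrM' not in l]
    # Seeding from a digest of the input reproduces "same input -> same
    # sample", mirroring shuf --random-source=<input>.
    seed = hashlib.md5(''.join(lines).encode()).hexdigest()
    random.Random(seed).shuffle(lines)
    with gzip.open(path_out, 'wt') as f:
        f.writelines(lines[:nreads])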
コード例 #36
0
def main(input_bam, paired_end, samtools_params, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # input_json is no longer used
    # # if there is input_JSON, it over-rides any explicit parameters
    # if input_JSON:
    #     if 'input_bam' in input_JSON:
    #         input_bam = input_JSON['input_bam']
    #     if 'paired_end' in input_JSON:
    #         paired_end = input_JSON['paired_end']
    #     if 'samtools_params' in input_JSON:
    #         samtools_params = input_JSON['samtools_params']

    # this is now handled by the platform input validator
    # if not input_bam:
    #     logger.error('input_bam is required')
    #     raise Exception
    # assert paired_end is not None, 'paired_end is required, explicitly or in input_JSON'

    raw_bam_file = dxpy.DXFile(input_bam)
    raw_bam_filename = raw_bam_file.name
    raw_bam_basename = raw_bam_file.name.rstrip('.bam')
    dxpy.download_dxfile(raw_bam_file.get_id(), raw_bam_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove  unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Only keep properly paired reads
        # Obtain name sorted BAM file
        # ==================
        tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix
                                          )  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "samtools view -F 1804 -f 2 %s -u %s" %
            (samtools_params, raw_bam_filename),
            # sort:  -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "samtools sort -n - %s" % (tmp_filt_bam_prefix)
        ])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "samtools view -F 1804 -f 2 -u -",
            # produce the coordinate-sorted BAM
            "samtools sort - %s" % (filt_bam_prefix)
        ])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Obtain name sorted BAM file
        # ==================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = ("samtools view -F 1804 %s -b %s" %
                                       (samtools_params, raw_bam_filename))
            logger.info(samtools_filter_command)
            subprocess.check_call(shlex.split(samtools_filter_command),
                                  stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar /picard/MarkDuplicates.jar",
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT", "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
    ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "samtools view -F 1804 -f2 -b %s" % (filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(shlex.split(samtools_dedupe_command), stdout=fh)
    # Index final bam file
    samtools_index_command = \
        "samtools index %s %s" % (final_bam_filename, final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "samtools sort -no %s -" % (filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""
        ]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""
        ]
    steps.extend([
        # TODO this should be implemented as an explicit list of allowable
        # names, so that mapping can be done to a complete reference
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
    ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    logger.info("Uploading results files to the project")
    filtered_bam = dxpy.upload_local_file(final_bam_filename)
    filtered_bam_index = dxpy.upload_local_file(final_bam_index_filename)
    filtered_mapstats = \
        dxpy.upload_local_file(final_bam_file_mapstats_filename)
    dup_file = dxpy.upload_local_file(dup_file_qc_filename)
    pbc_file = dxpy.upload_local_file(pbc_file_qc_filename)
    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files
    output = {
        "filtered_bam": dxpy.dxlink(filtered_bam),
        "filtered_bam_index": dxpy.dxlink(filtered_bam_index),
        "filtered_mapstats": dxpy.dxlink(filtered_mapstats),
        "dup_file_qc": dxpy.dxlink(dup_file),
        "pbc_file_qc": dxpy.dxlink(pbc_file),
        "paired_end": paired_end,
        "NRF": pbc_qc.get('NRF'),
        "PBC1": pbc_qc.get('PBC1'),
        "PBC2": pbc_qc.get('PBC2'),
        "duplicate_fraction": dup_qc.get('percent_duplication')
    }
    logger.info("Exiting with output:\n%s" % (pprint(output)))
    return output
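
コード例 #36 parses the Picard metrics with dup_parse(), which is not shown. A minimal sketch under the assumption that the METRICS_FILE follows Picard's usual layout ("##" comment lines, one header row, one data row):

def dup_parse(fname):
    # Lower-cased header names give keys like 'percent_duplication' and
    # 'read_pairs_examined', matching the lookups in the examples above.
    with open(fname) as f:
        rows = [l.rstrip('\n') for l in f
                if l.strip() and not l.startswith('#')]
    header, values = rows[0].split('\t'), rows[1].split('\t')
    return dict(zip([h.lower() for h in header], values))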
コード例 #37
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    final_peaks_filename = peaks_filename + '.gz' #spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3 # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        print "Read fragment length: %d" %(fragment_length)

    #run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    #install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' %(spp_tarball)))
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" %(run_spp, cpu_count(), experiment_filename, control_filename, npeaks, fragment_length, peaks_filename, xcor_plot_filename, xcor_scores_filename)
    print spp_command
    # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    #when one of the peak coordinates are an exact multiple of 10, spp (R) outputs the coordinate in scientific notation
    #this changes any such coordinates to decimal notation
    #this assumes 10-column output and that the 2nd and 3rd columns are coordinates
    #slopBed adjusts feature end coordinates that go off the end of the chromosome
    #bedClip removes any features that are still not within the boundaries of the chromosome

    fix_coordinate_peaks_filename = output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" %(final_peaks_filename),
        "tee %s" %(peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' %(chrom_sizes_filename),
        'bedClip stdin %s %s' %(chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    #These lines transfer the peaks files to the temporary workspace for debugging later
    #Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" %(n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" %(n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output('head -50 %s' %(fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split('gzip %s' %(fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
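
The companion helper pbc_parse(), called in コード例 #36 and #38, is also not shown. A sketch that simply names the columns documented in those examples' PBC comments (an assumed implementation):

PBC_FIELDS = ['TotalReadPairs', 'DistinctReadPairs', 'OneReadPair',
              'TwoReadPairs', 'NRF', 'PBC1', 'PBC2']


def pbc_parse(fname):
    # The PBC QC file is a single tab-separated line; the column order
    # follows the format comments in コード例 #36 and #38.
    with open(fname) as f:
        values = f.readline().rstrip('\n').split('\t')
    return dict(zip(PBC_FIELDS, values))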
コード例 #38
0
def main(input_bam, paired_end, samtools_params, scrub, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    raw_bam_file = dxpy.DXFile(input_bam)
    raw_bam_filename = raw_bam_file.name
    raw_bam_basename = raw_bam_file.name.rstrip('.bam')
    raw_bam_file_mapstats_filename = raw_bam_basename + '.flagstat.qc'
    dxpy.download_dxfile(raw_bam_file.get_id(), raw_bam_filename)
    subprocess.check_output('set -x; ls -l', shell=True)

    # Generate initial mapping statistics
    with open(raw_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (raw_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove  unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Only keep properly paired reads
        # Obtain name sorted BAM file
        # ==================
        tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "samtools view -F 1804 -f 2 %s -u %s" % (samtools_params, raw_bam_filename),
            # sort:  -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "samtools sort -n - %s" % (tmp_filt_bam_prefix)])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "samtools view -F 1804 -f 2 -u -",
            # produce the coordinate-sorted BAM
            "samtools sort - %s" % (filt_bam_prefix)])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped, mate unmapped
        # not primary alignment, reads failing platform
        # Remove low MAPQ reads
        # Obtain name sorted BAM file
        # ==================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = (
                "samtools view -F 1804 %s -b %s"
                % (samtools_params, raw_bam_filename)
                )
            logger.info(samtools_filter_command)
            subprocess.check_call(
                shlex.split(samtools_filter_command),
                stdout=fh)

    # ========================
    # Mark duplicates
    # ======================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar /picard/MarkDuplicates.jar",
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT",
        "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
        ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "samtools view -F 1804 -f2 -b %s" % (filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)
    # Index final bam file
    samtools_index_command = \
        "samtools index %s %s" % (final_bam_filename, final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "samtools sort -no %s -" % (filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
        ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    output = {}
    logger.info("Uploading results files to the project")
    filtered_bam = dxpy.upload_local_file(final_bam_filename)
    filtered_bam_index = dxpy.upload_local_file(final_bam_index_filename)
    output.update({
        "filtered_bam": dxpy.dxlink(filtered_bam),
        "filtered_bam_index": dxpy.dxlink(filtered_bam_index)
    })

    # If the scrub parameter is true, pass the bams to the scrub applet.
    if scrub:
        scrub_applet = dxpy.find_one_data_object(
            classname='applet',
            name='scrub',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        scrub_subjob = \
            scrub_applet.run(
                {"input_bams": [input_bam, dxpy.dxlink(filtered_bam)]},
                name='Scrub bams')
        scrubbed_unfiltered_bam = scrub_subjob.get_output_ref("scrubbed_bams", index=0)
        scrubbed_filtered_bam = scrub_subjob.get_output_ref("scrubbed_bams", index=1)
        # Add the optional scrubbed outputs.
        output.update({
            "scrubbed_unfiltered_bam": dxpy.dxlink(scrubbed_unfiltered_bam),
            "scrubbed_filtered_bam": dxpy.dxlink(scrubbed_filtered_bam)
        })

    # Upload or calculate the remaining outputs.
    filtered_mapstats = \
        dxpy.upload_local_file(final_bam_file_mapstats_filename)
    dup_file = dxpy.upload_local_file(dup_file_qc_filename)
    pbc_file = dxpy.upload_local_file(pbc_file_qc_filename)

    logger.info("Calcualting QC metrics")
    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    initial_mapstats_qc = flagstat_parse(raw_bam_file_mapstats_filename)
    final_mapstats_qc = flagstat_parse(final_bam_file_mapstats_filename)
    if paired_end:
        # integer division: usable fragments are whole read pairs
        useable_fragments = final_mapstats_qc.get('in_total')[0] // 2
    else:
        useable_fragments = final_mapstats_qc.get('in_total')[0]
    logger.info("initial_mapstats_qc: %s" % (initial_mapstats_qc)),
    logger.info("final_mapstats_qc: %s" % (final_mapstats_qc)),
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files and values.
    output.update({
        "filtered_mapstats": dxpy.dxlink(filtered_mapstats),
        "dup_file_qc": dxpy.dxlink(dup_file),
        "pbc_file_qc": dxpy.dxlink(pbc_file),
        "paired_end": paired_end,
        "n_reads_input": str(initial_mapstats_qc.get('in_total')[0]),
        "picard_read_pairs_examined": str(dup_qc.get('read_pairs_examined')),
        "picard_unpaired_reads_examined": str(dup_qc.get('unpaired_reads_examined')),
        "picard_read_pair_duplicates": str(dup_qc.get('read_pair_duplicates')),
        "picard_unpaired_read_duplicates": str(dup_qc.get('unpaired_read_duplicates')),
        "useable_fragments": str(useable_fragments),
        "NRF": str(pbc_qc.get('NRF')),
        "PBC1": str(pbc_qc.get('PBC1')),
        "PBC2": str(pbc_qc.get('PBC2')),
        "duplicate_fraction": str(dup_qc.get('percent_duplication'))
    })
    logger.info("Exiting with output:\n%s" % (pformat(output)))
    return output
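
コード例 #33 and #38 index flagstat_parse() results with [0]. A guess at such a parser, assuming samtools flagstat's usual "N + M label" line format, which makes the [0] (QC-passed) indexing explicit:

import re


def flagstat_parse(fname):
    # Each flagstat line looks like "123 + 4 in total (...)"; the two
    # numbers are QC-passed and QC-failed counts, stored as a pair. Only
    # the 'in_total' and 'mapped' keys matter to the call sites above.
    qc = {}
    with open(fname) as f:
        for line in f:
            m = re.match(r'(\d+) \+ (\d+) (.*)', line)
            if not m:
                continue
            label = m.group(3).split('(')[0].strip()
            key = 'in_total' if label.startswith('in total') \
                else label.split()[0]
            qc[key] = [int(m.group(1)), int(m.group(2))]
    return qc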
コード例 #39
0
def main(input_tags, prefix=None):
    input_tags_file = input_tags
    input_tags_filename = input_tags_file
    # introspect the file to determine tagAlign (thus SE) or BEDPE (thus PE)
    # strip extension as appropriate
    subprocess.check_output('ls', shell=True)
    with gzip.open(input_tags_filename) as f:
        firstline = f.readline()
    logger.info('First line of input_tags:\n%s' % (firstline))

    se_cols = 6
    pe_cols = 10
    firstline = firstline.decode("utf-8")
    if re.match(r'^(\S+[\t\n]){%d}$' % (se_cols), firstline):
        paired_end = False
        input_tags_basename = prefix or input_tags_filename.rstrip(
            '.tagAlign.gz')
        filename_infix = 'SE'
        logger.info("Detected single-end data")
    elif re.match(r'^(\S+[\t\n]){%d}$' % (pe_cols), firstline):
        paired_end = True
        input_tags_basename = prefix or input_tags_filename.rstrip('.bedpe.gz')
        filename_infix = 'PE2SE'
        logger.info("Detected paired-end data")
    else:
        raise IOError("%s is neither a BEDPE or tagAlign file" %
                      (input_tags_filename))

    pr_ta_filenames = \
        [input_tags_basename + ".%s.pr1.tagAlign.gz" % (filename_infix),
         input_tags_basename + ".%s.pr2.tagAlign.gz" % (filename_infix)]

    # count lines in the file
    out, err = common.run_pipe(
        ['gzip -dc %s' % (input_tags_filename), 'wc -l'])
    # number of lines in each split (integer division, rounded up so no
    # tag is lost when the count is odd)
    nlines = (int(out) + 1) // 2
    # Shuffle and split BEDPE file into 2 equal parts
    # by using the input to seed shuf we ensure multiple runs with the same
    # input will produce the same output
    # Produces two files named splits_prefix0n, n=1,2
    splits_prefix = 'temp_split'
    out, err = common.run_pipe([
        'gzip -dc %s' % (input_tags_filename),
        'shuf --random-source=%s' % (input_tags_filename),
        'split -a 2 -d -l %d - %s' % (nlines, splits_prefix)
    ])
    # Convert read pairs to reads into standard tagAlign file
    for i, index in enumerate(['00', '01']):  # could be made multi-threaded
        steps = ['cat %s' % (splits_prefix + index)]
        if paired_end:
            steps.extend([
                r"""awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\tN\t1000\t%s\n%s\t%s\t%s\tN\t1000\t%s\n",$1,$2,$3,$9,$4,$5,$6,$10}'"""
            ])
        steps.extend(['gzip -cn'])
        out, err = common.run_pipe(steps, outfile=pr_ta_filenames[i])

    pseudoreplicate1_file = pr_ta_filenames[0]
    pseudoreplicate2_file = pr_ta_filenames[1]

    output = {
        "pseudoreplicate1": pseudoreplicate1_file,
        "pseudoreplicate2": pseudoreplicate2_file
    }

    return output
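
The awk program in the loop above emits two tagAlign reads per BEDPE record. The same column mapping in plain Python, to make it explicit (illustrative):

def bedpe_to_tagalign(bedpe_line):
    # BEDPE columns: $1-$3 mate1 coords, $4-$6 mate2 coords, $9/$10 the
    # strands. Each mate becomes a 6-column tagAlign record with name "N"
    # and score 1000, matching the awk one-liner above.
    f = bedpe_line.rstrip('\n').split('\t')
    mate1 = (f[0], f[1], f[2], 'N', '1000', f[8])
    mate2 = (f[3], f[4], f[5], 'N', '1000', f[9])
    return ['\t'.join(mate1), '\t'.join(mate2)]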
コード例 #40
0
def main(rep1_peaks,
         rep2_peaks,
         pooled_peaks,
         pooledpr1_peaks,
         pooledpr2_peaks,
         chrom_sizes,
         as_file,
         peak_type,
         prefix=None,
         rep1_signal=None,
         rep2_signal=None,
         pooled_signal=None):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames - each defined explicitly because input files could
    # share a name, in which case a later download would overwrite an
    # earlier file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks.name)  # strip off the peak and compression extensions
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    # the only difference between the peak_types is how the extra columns are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap means the
    # fractional overlap w.r.t. either peak in the pair is > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where overlap
    # means the fractional overlap w.r.t. either peak in the pair is > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn    = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # Build the output object; keys must match the applet's declared
    # output fields.

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
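
The awk filters above implement a 50% fractional-overlap rule. Restated in Python for a single narrowPeak pair (a sketch of the rule, not pipeline code):

def passes_overlap(pooled_start, pooled_end, rep_start, rep_end, overlap_bp):
    # Keep a pooled/replicate peak pair when the overlap covers at least
    # half of either peak. For narrowPeak input, $2/$3 are the pooled
    # coords, $12/$13 the replicate coords, and $21 the overlap in bases
    # reported by intersectBed -wo.
    s1 = pooled_end - pooled_start
    s2 = rep_end - rep_start
    return float(overlap_bp) / s1 >= 0.5 or float(overlap_bp) / s2 >= 0.5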
コード例 #41
0
def main(input_bam, paired_end, spp_version):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    input_bam_basename = input_bam_file.name.rstrip('.bam')
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================

    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"
    ], outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"
        ], outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # =================================
    logger.info("Intermediate tA md5: %s" %
                (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info("Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp_nodups.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (run_spp_command, subsampled_TA_filename, cpu_count(),
         CC_plot_filename, CC_scores_filename)
    ])
    out, err = common.run_pipe(
        [r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
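The applet above calls xcor_parse() without defining it. Assuming it simply maps the tab-separated columns of the first CC_SCORE line to the field names documented in the comment block, a minimal sketch would be:

# Hedged sketch of an xcor_parse-style helper; the field list is an
# assumption taken from the CC_SCORE format comment above.
def xcor_parse_sketch(fname):
    fields = ["Filename", "numReads", "estFragLen", "corr_estFragLen",
              "PhantomPeak", "corr_phantomPeak", "argmin_corr", "min_corr",
              "phantomPeakCoef", "relPhantomPeakCoef", "QualityTag"]
    with open(fname) as fh:
        values = fh.readline().rstrip("\n").split("\t")
    return dict(zip(fields, values))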
Code example #42
def replicated_overlap(rep1_peaks,
                       rep2_peaks,
                       pooled_peaks,
                       pooledpr1_peaks,
                       pooledpr2_peaks,
                       rep1_ta,
                       rep1_xcor,
                       rep2_ta,
                       rep2_xcor,
                       paired_end,
                       chrom_sizes,
                       as_file,
                       peak_type,
                       prefix,
                       fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - prefix each one explicitly because two inputs could
    # share the same name, in which case a later download would overwrite
    # an earlier file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # the input is a dxlink, so use the DXFile handler's name here
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If the fragment length was given by the user, skip the
    # pooled_replicates_xcor_subjob, set pool_xcor_filename to None, and set
    # the flag fragment_length_given_by_user. Otherwise, run the subjob
    # to extract the fragment length from the cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined as
    # fractional overlap >= 0.5 with respect to either peak in the pair
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where overlap
    # is defined as fractional overlap >= 0.5 with respect to either peak in
    # the pair
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename,
        pool_xcor_filename,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
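Every example here chains shell commands through common.run_pipe. A plausible minimal sketch, assuming each step's stdout feeds the next step's stdin and the final stdout is optionally redirected to outfile (the pipeline's real implementation may differ in error handling):

import shlex
import subprocess

# Hedged sketch of a run_pipe-style helper as used throughout these examples.
def run_pipe_sketch(steps, outfile=None):
    procs = []
    prev_stdout = None
    for i, step in enumerate(steps):
        if i == len(steps) - 1 and outfile:
            with open(outfile, 'w') as fh:
                proc = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                                        stdout=fh, stderr=subprocess.PIPE)
        else:
            proc = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # let upstream see SIGPIPE if downstream exits
        prev_stdout = proc.stdout
        procs.append(proc)
    out, err = procs[-1].communicate()
    return out, err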
Code example #43
        def process(self):
                '''
                #find pooled peaks that are in (rep1 AND rep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
                        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
                        ], overlap_tr_fn)
                print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

                #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
                        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
                        ], overlap_pr_fn)
                print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

                #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
                        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
                        ], overlapping_peaks_fn)
                print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
                '''
                #the only difference between the peak_types is how the extra columns are handled
                if self.peak_type == "narrowPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-10'
                        bed_type = 'bed6+4'
                elif self.peak_type == "gappedPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-15'
                        bed_type = 'bed12+3'
                elif self.peak_type == "broadPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-9'
                        bed_type = 'bed6+3'
                else:
                        print "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak."
                        sys.exit()

                # Find pooled peaks that overlap Rep1 and Rep2, where overlap means fractional overlap >= 0.5 with respect to either peak
                out, err = common.run_pipe([
                        'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.rep1_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u',
                        'intersectBed -wo -a stdin -b %s' %(self.rep2_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u'
                        ], self.overlap_tr_fn)
                print "%d peaks overlap with both true replicates" %(common.count_lines(self.overlap_tr_fn))

                # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where overlap means fractional overlap >= 0.5 with respect to either peak
                out, err = common.run_pipe([
                        'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.pooledpr1_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u',
                        'intersectBed -wo -a stdin -b %s' %(self.pooledpr2_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u'
                        ], self.overlap_pr_fn)
                print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(self.overlap_pr_fn))

                # Combine peak lists
                out, err = common.run_pipe([
                        'cat %s %s' %(self.overlap_tr_fn, self.overlap_pr_fn),
                        'sort -u'
                        ], self.overlapping_peaks_fn)
                print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(self.overlapping_peaks_fn))

                #rejected peaks
                out, err = common.run_pipe([
                        'intersectBed -wa -v -a %s -b %s' %(self.pooled_peaks_fn, self.overlapping_peaks_fn)
                        ], self.rejected_peaks_fn)
                print "%d peaks were rejected" %(common.count_lines(self.rejected_peaks_fn))

                self.npeaks_in = common.count_lines(common.uncompress(self.pooled_peaks_fn))
                self.npeaks_out = common.count_lines(self.overlapping_peaks_fn)
                self.npeaks_rejected = common.count_lines(self.rejected_peaks_fn)

                # make bigBed files for visualization
                self.overlapping_peaks_bb_fn = common.bed2bb(self.overlapping_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
                self.rejected_peaks_bb_fn = common.bed2bb(self.rejected_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
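To decode the narrowPeak branch above: intersectBed -wo on two bed6+4 files emits the ten columns of peak A, the ten columns of peak B, and the overlap in bp as column 21, so the awk keeps pairs whose overlap covers at least half of either peak. The same predicate in plain Python, for illustration only:

# Illustrative Python rendition of the narrowPeak fractional-overlap filter.
def passes_overlap_filter(line, n_cols_a=10, n_cols_b=10):
    f = line.rstrip('\n').split('\t')
    len_a = int(f[2]) - int(f[1])                        # $3 - $2
    len_b = int(f[n_cols_a + 2]) - int(f[n_cols_a + 1])  # $13 - $12
    overlap = int(f[n_cols_a + n_cols_b])                # $21
    return (overlap >= 0.5 * len_a) or (overlap >= 0.5 * len_b)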
Code example #44
def internal_pseudoreplicate_overlap(rep1_peaks,
                                     rep2_peaks,
                                     pooled_peaks,
                                     rep1_ta,
                                     rep1_xcor,
                                     paired_end,
                                     chrom_sizes,
                                     as_file,
                                     peak_type,
                                     prefix,
                                     fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - prefix each one explicitly because two inputs could
    # share the same name, in which case a later download would overwrite
    # an earlier file
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        # the input is a dxlink, so use the DXFile handler's name here
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined as
    # fractional overlap >= 0.5 with respect to either peak in the pair
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis: the overlapping peaks are based solely
    # on the pseudoreplicates of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn,
        rep1_xcor_fn,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
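common.frip is not shown in these excerpts. Conceptually it computes the Fraction of Reads in Peaks: tags are extended toward the estimated fragment length, intersected with the final peak set, and the overlapping tag count is divided by the total. A rough sketch of that idea (the slop arithmetic is simplified and the helper name is hypothetical):

# Rough, hedged sketch of a FRiP computation; the extension arithmetic is
# simplified relative to whatever the pipeline's common.frip actually does.
def frip_sketch(ta_fn, peaks_fn, chrom_sizes_fn, fraglen, reads_in_peaks_fn):
    n_reads = common.count_lines(ta_fn)
    # Extend each tag downstream toward fragment length, keep tags in peaks.
    out, err = common.run_pipe([
        'slopBed -i %s -g %s -s -l 0 -r %d' % (ta_fn, chrom_sizes_fn, int(fraglen)),
        'intersectBed -u -a stdin -b %s' % (peaks_fn)
    ], reads_in_peaks_fn)
    n_reads_in_peaks = common.count_lines(reads_in_peaks_fn)
    return n_reads, n_reads_in_peaks, float(n_reads_in_peaks) / n_reads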
Code example #45
File: macs2.py  Project: jmrinaldi/chip-seq-pipeline
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment        = dxpy.DXFile(experiment)
    control           = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes       = dxpy.DXFile(chrom_sizes)
    narrowPeak_as     = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as     = dxpy.DXFile(gappedpeak_as)
    broadPeak_as      = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    dxpy.download_dxfile(experiment.get_id(),        experiment.name)
    dxpy.download_dxfile(control.get_id(),           control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(),       chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(),     narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(),     gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(),      broadPeak_as.name)

    #Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn    = "%s/%s.narrowPeak" %(peaks_dirname, prefix)
    gappedPeak_fn    = "%s/%s.gappedPeak" %(peaks_dirname, prefix)
    broadPeak_fn     = "%s/%s.broadPeak"  %(peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn  = broadPeak_fn  + ".gz"
    narrowPeak_bb_fn = "%s.bb" %(narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" %(gappedPeak_fn)
    broadPeak_bb_fn  = "%s.bb" %(broadPeak_fn)
    fc_signal_fn     = "%s/%s.fc_signal.bw"     %(peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" %(peaks_dirname, prefix)

    #Extract the fragment length estimate from column 3 of the cross-correlation scores file
    with open(xcor_scores_input.name,'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2] #third column
        print "Fraglen %s" %(fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_narrowpeak_fn = common.slop_clip('%s/%s_peaks.narrowPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores(clipped_narrowpeak_fn, scores_col=5)

    # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_narrowpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(narrowPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(narrowPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    command = 'macs2 callpeak ' + \
              '-t %s -c %s ' %(experiment.name, control.name) + \
              '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
              '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_broadpeak_fn = common.slop_clip('%s/%s_peaks.broadPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores(clipped_broadpeak_fn, scores_col=5)

    # Sort by Col8 (for broadPeak) or Col 14(for gappedPeak)  in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = ['sort -k 8gr,8gr %s' %(rescaled_broadpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(broadPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(broadPeak_gz_fn))

    # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
    clipped_gappedpeaks_fn = common.slop_clip('%s/%s_peaks.gappedPeak' %(peaks_dirname, prefix), chrom_sizes.name)

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores(clipped_gappedpeaks_fn, scores_col=5)

    pipe = ['sort -k 14gr,14gr %s' %(rescaled_gappedpeak_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' %(gappedPeak_fn),
            'gzip -c']
    print pipe
    out,err = common.run_pipe(pipe,'%s' %(gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================

    # chrom_sizes is a tab-delimited file with 2 columns: Col1 (chromosome name), Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
              '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
              '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
              '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
              '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"
    
    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = ['slopBed -i %s/%s_FE.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.fc.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph
    
    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000

    out, err = common.run_pipe([
        'gzip -dc %s' %(experiment.name),
        'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe([
        'gzip -dc %s' %(control.name),
        'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    print "chipReads = %s, controlReads = %s, sval = %s" %(chipReads, controlReads, sval)

    returncode = common.block_on(
        'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
        '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" %(returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = ['slopBed -i %s/%s_ppois.bdg -g %s -b 0' %(peaks_dirname, prefix, chrom_sizes.name),
            'bedClip stdin %s %s/%s.pval.signal.bedgraph' %(chrom_sizes.name, peaks_dirname, prefix)]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
              '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
              '%s ' %(chrom_sizes.name) + \
              '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" %(returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigWigs from beds to support trackhub visualization of peak files
    #============================================

    narrowPeak_bb_fname = common.bed2bb('%s' %(narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' %(gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3')
    broadPeak_bb_fname =  common.bed2bb('%s' %(broadPeak_fn),  chrom_sizes.name, broadPeak_as.name,  bed_type='bed6+3')

    #Temporary during development to create empty files just to get the applet to exit 
    # for fn in [narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn, gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn]:
    #     common.block_on('touch %s' %(fn))

    # Upload the file outputs

    narrowPeak    = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak    = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak     = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb  = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal     = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.

    output = {
        "narrowpeaks":    dxpy.dxlink(narrowPeak),
        "gappedpeaks":    dxpy.dxlink(gappedPeak),
        "broadpeaks":     dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb":  dxpy.dxlink(broadPeak_bb),
        "fc_signal":     dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
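common.rescale_scores appears three times above to force the column-5 scores into the 10-1000 range that the .as schemas demand. A plausible min-max rescaling sketch; the helper name and return convention are assumptions:

# Hedged sketch of a rescale_scores-style helper: min-max rescale one column.
def rescale_scores_sketch(fn, scores_col=5, new_min=10, new_max=1000):
    with open(fn) as fh:
        rows = [line.rstrip('\n').split('\t') for line in fh]
    scores = [float(r[scores_col - 1]) for r in rows]
    lo, hi = min(scores), max(scores)
    span = (hi - lo) or 1.0  # guard against a constant score column
    out_fn = fn + '.rescaled'
    with open(out_fn, 'w') as out:
        for r, s in zip(rows, scores):
            r[scores_col - 1] = str(int(new_min + (s - lo) * (new_max - new_min) / span))
            out.write('\t'.join(r) + '\n')
    return out_fn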
Code example #46
def main(input_bam, paired_end, samtools_params, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # input_json is no longer used
    # # if there is input_JSON, it over-rides any explicit parameters
    # if input_JSON:
    #     if 'input_bam' in input_JSON:
    #         input_bam = input_JSON['input_bam']
    #     if 'paired_end' in input_JSON:
    #         paired_end = input_JSON['paired_end']
    #     if 'samtools_params' in input_JSON:
    #         samtools_params = input_JSON['samtools_params']

    # this is now handled by the platform input validator
    # if not input_bam:
    #     logger.error('input_bam is required')
    #     raise Exception
    # assert paired_end is not None, 'paired_end is required, explicitly or in input_JSON'

    raw_bam_file = dxpy.DXFile(input_bam)
    raw_bam_filename = raw_bam_file.name
    # Strip the '.bam' suffix; str.rstrip() strips characters, not a suffix.
    raw_bam_basename = raw_bam_filename[:-4] if raw_bam_filename.endswith('.bam') else raw_bam_filename
    dxpy.download_dxfile(raw_bam_file.get_id(), raw_bam_filename)

    subprocess.check_output('set -x; ls -l', shell=True)

    filt_bam_prefix = raw_bam_basename + ".filt.srt"
    filt_bam_filename = filt_bam_prefix + ".bam"
    if paired_end:
        # =============================
        # Remove unmapped and mate-unmapped reads,
        # not primary alignments, and reads failing platform QC
        # Remove low-MAPQ reads
        # Only keep properly paired reads
        # Obtain name-sorted BAM file
        # =============================
        tmp_filt_bam_prefix = "tmp.%s" % (filt_bam_prefix)  # was tmp.prefix.nmsrt
        tmp_filt_bam_filename = tmp_filt_bam_prefix + ".bam"
        out, err = common.run_pipe([
            # filter: -F 1804 FLAG bits to exclude; -f 2 FLAG bits to require;
            # -q 30 exclude MAPQ < 30; -u uncompressed output
            # exclude FLAG 1804: unmapped, next segment unmapped, secondary
            # alignments, not passing platform q, PCR or optical duplicates
            # require FLAG 2: properly aligned
            "samtools view -F 1804 -f 2 %s -u %s" % (samtools_params, raw_bam_filename),
            # sort:  -n sort by name; - take input from stdin;
            # out to specified filename
            # Will produce name sorted BAM
            "samtools sort -n - %s" % (tmp_filt_bam_prefix)])
        if err:
            logger.error("samtools error: %s" % (err))
        # Remove orphan reads (pair was removed)
        # and read pairs mapping to different chromosomes
        # Obtain position sorted BAM
        subprocess.check_output('set -x; ls -l', shell=True)
        out, err = common.run_pipe([
            # fill in mate coordinates, ISIZE and mate-related flags
            # fixmate requires name-sorted alignment; -r removes secondary and
            # unmapped (redundant here because already done above?)
            # - send output to stdout
            "samtools fixmate -r %s -" % (tmp_filt_bam_filename),
            # repeat filtering after mate repair
            "samtools view -F 1804 -f 2 -u -",
            # produce the coordinate-sorted BAM
            "samtools sort - %s" % (filt_bam_prefix)])
        subprocess.check_output('set -x; ls -l', shell=True)
    else:  # single-end data
        # =============================
        # Remove unmapped reads,
        # not primary alignments, and reads failing platform QC
        # Remove low-MAPQ reads
        # (single-end: no mate or name-sort steps apply)
        # =============================
        with open(filt_bam_filename, 'w') as fh:
            samtools_filter_command = (
                "samtools view -F 1804 %s -b %s"
                % (samtools_params, raw_bam_filename)
                )
            logger.info(samtools_filter_command)
            subprocess.check_call(
                shlex.split(samtools_filter_command),
                stdout=fh)

    # ========================
    # Mark duplicates
    # ========================
    tmp_filt_bam_filename = raw_bam_basename + ".dupmark.bam"
    dup_file_qc_filename = raw_bam_basename + ".dup.qc"
    picard_string = ' '.join([
        "java -Xmx4G -jar /picard/MarkDuplicates.jar",
        "INPUT=%s" % (filt_bam_filename),
        "OUTPUT=%s" % (tmp_filt_bam_filename),
        "METRICS_FILE=%s" % (dup_file_qc_filename),
        "VALIDATION_STRINGENCY=LENIENT",
        "ASSUME_SORTED=true",
        "REMOVE_DUPLICATES=false"
        ])
    logger.info(picard_string)
    subprocess.check_output(shlex.split(picard_string))
    os.rename(tmp_filt_bam_filename, filt_bam_filename)

    if paired_end:
        final_bam_prefix = raw_bam_basename + ".filt.srt.nodup"
    else:
        final_bam_prefix = raw_bam_basename + ".filt.nodup.srt"
    final_bam_filename = final_bam_prefix + ".bam"  # To be stored
    final_bam_index_filename = final_bam_filename + ".bai"  # To be stored
    # QC file
    final_bam_file_mapstats_filename = final_bam_prefix + ".flagstat.qc"

    if paired_end:
        samtools_dedupe_command = \
            "samtools view -F 1804 -f2 -b %s" % (filt_bam_filename)
    else:
        samtools_dedupe_command = \
            "samtools view -F 1804 -b %s" % (filt_bam_filename)

    # ============================
    # Remove duplicates
    # Index final position sorted BAM
    # ============================
    with open(final_bam_filename, 'w') as fh:
        logger.info(samtools_dedupe_command)
        subprocess.check_call(
            shlex.split(samtools_dedupe_command),
            stdout=fh)
    # Index final bam file
    samtools_index_command = \
        "samtools index %s %s" % (final_bam_filename, final_bam_index_filename)
    logger.info(samtools_index_command)
    subprocess.check_output(shlex.split(samtools_index_command))

    # Generate mapping statistics
    with open(final_bam_file_mapstats_filename, 'w') as fh:
        flagstat_command = "samtools flagstat %s" % (final_bam_filename)
        logger.info(flagstat_command)
        subprocess.check_call(shlex.split(flagstat_command), stdout=fh)

    # =============================
    # Compute library complexity
    # =============================
    # Sort by name
    # convert to bedPE and obtain fragment coordinates
    # sort by position and strand
    # Obtain unique count statistics
    pbc_file_qc_filename = final_bam_prefix + ".pbc.qc"
    # PBC File output
    # TotalReadPairs [tab]
    # DistinctReadPairs [tab]
    # OneReadPair [tab]
    # TwoReadPairs [tab]
    # NRF=Distinct/Total [tab]
    # PBC1=OnePair/Distinct [tab]
    # PBC2=OnePair/TwoPair
    if paired_end:
        steps = [
            "samtools sort -no %s -" % (filt_bam_filename),
            "bamToBed -bedpe -i stdin",
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$4,$6,$9,$10}'"""]
    else:
        steps = [
            "bamToBed -i %s" % (filt_bam_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$6}'"""]
    steps.extend([
        # TODO this should be implemented as an explicit list of allowable
        # names, so that mapping can be done to a complete reference
        "grep -v 'chrM'",
        "sort",
        "uniq -c",
        r"""awk 'BEGIN{mt=0;m0=0;m1=0;m2=0} ($1==1){m1=m1+1} ($1==2){m2=m2+1} {m0=m0+1} {mt=mt+$1} END{printf "%d\t%d\t%d\t%d\t%f\t%f\t%f\n",mt,m0,m1,m2,m0/mt,m1/m0,m1/m2}'"""
        ])
    out, err = common.run_pipe(steps, pbc_file_qc_filename)
    if err:
        logger.error("PBC file error: %s" % (err))

    logger.info("Uploading results files to the project")
    filtered_bam = dxpy.upload_local_file(final_bam_filename)
    filtered_bam_index = dxpy.upload_local_file(final_bam_index_filename)
    filtered_mapstats = \
        dxpy.upload_local_file(final_bam_file_mapstats_filename)
    dup_file = dxpy.upload_local_file(dup_file_qc_filename)
    pbc_file = dxpy.upload_local_file(pbc_file_qc_filename)
    dup_qc = dup_parse(dup_file_qc_filename)
    pbc_qc = pbc_parse(pbc_file_qc_filename)
    logger.info("dup_qc: %s" % (dup_qc))
    logger.info("pbc_qc: %s" % (pbc_qc))

    # Return links to the output files
    output = {
        "filtered_bam": dxpy.dxlink(filtered_bam),
        "filtered_bam_index": dxpy.dxlink(filtered_bam_index),
        "filtered_mapstats": dxpy.dxlink(filtered_mapstats),
        "dup_file_qc": dxpy.dxlink(dup_file),
        "pbc_file_qc": dxpy.dxlink(pbc_file),
        "paired_end": paired_end,
        "NRF": pbc_qc.get('NRF'),
        "PBC1": pbc_qc.get('PBC1'),
        "PBC2": pbc_qc.get('PBC2'),
        "duplicate_fraction": dup_qc.get('percent_duplication')
    }
    logger.info("Exiting with output:\n%s" % (pprint(output)))
    return output
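For reference, the closing awk one-liner condenses the library-complexity math: mt is the total read (pair) count, m0 the number of distinct positions, m1 the positions seen exactly once, and m2 those seen exactly twice, giving NRF = m0/mt, PBC1 = m1/m0, and PBC2 = m1/m2. An equivalent Python sketch over the "uniq -c" output:

# Python rendition of the PBC awk one-liner; like the awk, it will divide
# by zero if no position occurs exactly twice (m2 == 0).
def pbc_metrics(uniq_c_lines):
    mt = m0 = m1 = m2 = 0
    for line in uniq_c_lines:
        count = int(line.split()[0])  # leading count emitted by uniq -c
        mt += count
        m0 += 1
        if count == 1:
            m1 += 1
        elif count == 2:
            m2 += 1
    return mt, m0, m1, m2, float(m0) / mt, float(m1) / m0, float(m1) / m2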