Ejemplo n.º 1
0
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None):
    """Build the trackDb stanza text for a peaks track and its low-pass variants.

    The first stanza always describes the track served from ``url``.  For each
    width cutoff in ``lowpass`` one more stanza is appended: the file behind
    ``dx`` is downloaded, converted to BED with ``bigBedToBed``, reduced to the
    rows whose width (column 3 minus column 2) is below the cutoff, converted
    back with ``common.bed2bb``, uploaded to the "/tracks" folder and
    referenced through a preauthenticated download URL.

    Args:
        accession: Prefix of every track name; also names the parent view
            (``<accession>viewpeaks``).
        url: ``bigDataUrl`` of the unfiltered track.
        name: Track label, truncated to 17 characters for ``shortLabel``.
        n: Integer used as track-name suffix and priority of the first stanza;
            filtered stanzas use ``n + 1``, ``n + 2``, ...
        tracktype: trackDb ``type`` value.  Filtering supports only
            'bigBed 6 +' (narrowPeak.as) and 'bigBed 12 +' (gappedPeak.as).
        lowpass: A single integer cutoff, a list of cutoffs, or a falsy value
            for "no filtered tracks".  The default list is never mutated.
        dx: Object exposing ``get_id()``; required when ``lowpass`` is given.

    Returns:
        Tuple ``(stanza_text, n_stanzas)`` where ``n_stanzas`` counts the
        stanzas contained in ``stanza_text``.

    Raises:
        ValueError: If cutoffs are requested but ``tracktype`` has no matching
            .as file, or ``dx`` is missing.
    """
    return_string = (
        "\t\ttrack %s%d\n" % (accession, n) +
        "\t\tbigDataUrl %s\n" % (url) +
        "\t\tshortLabel %s\n" % (name[:17]) +
        "\t\tparent %sviewpeaks on\n" % (accession) +
        "\t\ttype %s\n" % (tracktype) +
        "\t\tvisibility dense\n" +
        "\t\tview PK\n" +
        "\t\tpriority %d\n\n" % (n)
    )
    n_stanzas = 1

    # Normalise lowpass to a list: falsy -> no cutoffs, bare int -> one cutoff.
    if not lowpass:
        lowpass = []
    if isinstance(lowpass, int):
        lowpass = [lowpass]
    if not lowpass:
        return (return_string, n_stanzas)

    # Resolve the .as file before doing any download or conversion so that an
    # unsupported track type fails immediately with a clear error, instead of
    # hitting an unbound variable after the expensive steps.
    as_file = {
        'bigBed 6 +': 'narrowPeak.as',
        'bigBed 12 +': 'gappedPeak.as',
    }.get(tracktype)
    if as_file is None:
        raise ValueError("Cannot match tracktype %s to any .as file" % (tracktype))
    if dx is None:
        raise ValueError("dx is required when lowpass cutoffs are given")

    # sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere.
    url_duration = getattr(sys, 'maxint', sys.maxsize)

    fn = dx.get_id()
    bed_fn = fn + '.bed'
    try:
        # The download and the bigBed -> BED conversion do not depend on the
        # cutoff, so they are done once for all cutoffs.
        if not os.path.isfile(fn):
            dxpy.download_dxfile(fn, fn)
        print("%s %s %s" % (
            fn, os.path.getsize(fn),
            subprocess.check_output(['wc', '-l', fn]).split()[0]))
        common.block_on('bigBedToBed %s %s' % (fn, bed_fn))

        for (i, cutoff) in enumerate(lowpass, start=1):
            cutoffstr = '-lt%d' % (cutoff)
            outfn = fn + cutoffstr
            # Keep only features narrower than the cutoff.
            common.run_pipe([
                'cat %s' % (bed_fn),
                r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" % (cutoff)], outfn)
            print("%s %s %s" % (
                outfn, os.path.getsize(outfn),
                subprocess.check_output(['wc', '-l', outfn]).split()[0]))

            bb_fn = common.bed2bb(outfn, 'mm10.chrom.sizes', as_file)
            newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
            new_url, _headers = newdx.get_download_url(duration=url_duration, preauthenticated=True)

            new_lines = [
                "\t\ttrack %s%d" % (accession, n + i),
                "\t\tbigDataUrl %s" % (new_url),
                # Shorten the label so label + cutoff suffix stays within 17 chars.
                "\t\tshortLabel %s" % (name[:17 - len(cutoffstr)] + cutoffstr),
                "\t\tparent %sviewpeaks on" % (accession),
                "\t\ttype %s" % (tracktype),
                "\t\tvisibility dense",
                "\t\tview PK",
                "\t\tpriority %d\n\n" % (n + i)]
            return_string += '\n'.join(new_lines)
            n_stanzas += 1

            os.remove(bb_fn)
            os.remove(outfn)
    finally:
        # Remove the shared working files even when a step above failed.
        for path in (bed_fn, fn):
            if os.path.isfile(path):
                os.remove(path)

    return (return_string, n_stanzas)
Ejemplo n.º 2
0
def peaks_stanza(accession, url, name, n, tracktype='bigBed 6 +', lowpass=[], dx=None):
    """Build the trackDb stanza text for a peaks track and its low-pass variants.

    The first stanza always describes the track served from ``url``.  For each
    width cutoff in ``lowpass`` one more stanza is appended, pointing at a
    filtered copy of the file behind ``dx``: the file is converted to BED with
    ``bigBedToBed``, reduced to the rows whose width (column 3 minus column 2)
    is below the cutoff, converted back with ``common.bed2bb``, uploaded to
    the "/tracks" folder and referenced through a preauthenticated URL.

    Filtered tracks are named ``<accession><n>p<i>`` and get priority
    ``<n>.<i>``, where ``i`` is the 1-based position of the cutoff.

    Args:
        accession: Prefix of every track name; also names the parent view
            (``<accession>viewpeaks``).
        url: ``bigDataUrl`` of the unfiltered track.
        name: Track label, truncated to 17 characters for ``shortLabel``.
        n: Integer used as track-name suffix and priority of the first stanza.
        tracktype: trackDb ``type`` value.  Filtering supports only
            'bigBed 6 +' (narrowPeak.as) and 'bigBed 12 +' (gappedPeak.as).
        lowpass: A single integer cutoff, a list of cutoffs, or a falsy value
            for "no filtered tracks".  The default list is never mutated.
        dx: Object exposing ``get_id()``; required when ``lowpass`` is given.

    Returns:
        Tuple ``(stanza_text, n_stanzas)`` where ``n_stanzas`` counts the
        stanzas contained in ``stanza_text``.

    Raises:
        ValueError: If cutoffs are requested but ``tracktype`` has no matching
            .as file, or ``dx`` is missing.
    """
    return_string = (
        "\t\ttrack %s%d\n" % (accession, n) +
        "\t\tbigDataUrl %s\n" % (url) +
        "\t\tshortLabel %s\n" % (name[:17]) +
        "\t\tparent %sviewpeaks on\n" % (accession) +
        "\t\ttype %s\n" % (tracktype) +
        "\t\tvisibility dense\n" +
        "\t\tview PK\n" +
        "\t\tpriority %d\n\n" % (n)
    )
    n_stanzas = 1

    # Normalise lowpass to a list: falsy -> no cutoffs, bare int -> one cutoff.
    if not lowpass:
        lowpass = []
    if isinstance(lowpass, int):
        lowpass = [lowpass]
    if not lowpass:
        return (return_string, n_stanzas)

    # Resolve the .as file before doing any download or conversion so that an
    # unsupported track type fails immediately with a clear error, instead of
    # hitting an unbound variable after the expensive steps.
    as_file = {
        'bigBed 6 +': 'narrowPeak.as',
        'bigBed 12 +': 'gappedPeak.as',
    }.get(tracktype)
    if as_file is None:
        raise ValueError("Cannot match tracktype %s to any .as file" % (tracktype))
    if dx is None:
        raise ValueError("dx is required when lowpass cutoffs are given")

    # sys.maxint exists only on Python 2; fall back to sys.maxsize elsewhere.
    url_duration = getattr(sys, 'maxint', sys.maxsize)

    # The download and the bigBed -> BED conversion do not depend on the
    # cutoff, so they are done once for all cutoffs.
    # NOTE(review): as in the original, the downloaded file and the
    # intermediate files are left on disk; confirm whether callers rely on the
    # cached download before adding cleanup.
    fn = dx.get_id()
    if not os.path.isfile(fn):
        dxpy.download_dxfile(fn, fn)
    print("%s %s %s" % (
        fn, os.path.getsize(fn),
        subprocess.check_output(['wc', '-l', fn]).split()[0]))
    bed_fn = fn + '.bed'
    common.block_on('bigBedToBed %s %s' % (fn, bed_fn))

    for (i, cutoff) in enumerate(lowpass, start=1):
        cutoffstr = '-lt%d' % (cutoff)
        outfn = fn + cutoffstr
        # Keep only features narrower than the cutoff.
        common.run_pipe([
            'cat %s' % (bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'""" % (cutoff)], outfn)
        print("%s %s %s" % (
            outfn, os.path.getsize(outfn),
            subprocess.check_output(['wc', '-l', outfn]).split()[0]))

        bb_fn = common.bed2bb(outfn, 'mm10.chrom.sizes', as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn, folder="/tracks", wait_on_close=True)
        new_url, _headers = newdx.get_download_url(duration=url_duration, preauthenticated=True)

        new_lines = [
            "\t\ttrack %s%dp%d" % (accession, n, i),
            "\t\tbigDataUrl %s" % (new_url),
            # Shorten the label so label + cutoff suffix stays within 17 chars.
            "\t\tshortLabel %s" % (name[:17 - len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" % (accession),
            "\t\ttype %s" % (tracktype),
            "\t\tvisibility dense",
            "\t\tview PK",
            "\t\tpriority %d.%d\n\n" % (n, i)]
        return_string += '\n'.join(new_lines)
        n_stanzas += 1

    return (return_string, n_stanzas)
Ejemplo n.º 3
0
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize):
    """Call MACS2 peaks and build signal tracks for one experiment/control pair.

    All file arguments are wrapped in ``dxpy.DXFile`` and downloaded into the
    working directory under their own names.  The function then

    1. runs ``macs2 callpeak`` for narrow peaks (with ``-B --SPMR`` so the
       pileup/lambda bedGraphs exist) and again with ``--broad``;
    2. clips, rescales, ranks and gzips the narrowPeak, broadPeak and
       gappedPeak files;
    3. builds fold-enrichment and p-value bigWig signal tracks with
       ``macs2 bdgcmp``, ``slopBed``/``bedClip`` and ``bedGraphToBigWig``;
    4. converts the three peak files to bigBed and uploads every output.

    Args:
        experiment: Reference accepted by ``dxpy.DXFile`` for the treatment
            reads (passed to MACS2 as ``-f BED`` and read with ``gzip -dc``).
        control: Same, for the control reads.
        xcor_scores_input: Cross-correlation scores file; the third
            whitespace-separated field of its first line is the fragment length.
        chrom_sizes: Chromosome sizes file.
        narrowpeak_as: .as file for narrowPeak bigBed conversion.
        gappedpeak_as: .as file for gappedPeak bigBed conversion.
        broadpeak_as: .as file for broadPeak bigBed conversion.
        genomesize: Value passed to ``macs2 callpeak -g``.

    Returns:
        Dict of ``dxpy.dxlink`` values keyed by "narrowpeaks", "gappedpeaks",
        "broadpeaks", "narrowpeaks_bb", "gappedpeaks_bb", "broadpeaks_bb",
        "fc_signal" and "pvalue_signal".

    Raises:
        AssertionError: If MACS2 or bedGraphToBigWig exits non-zero.
    """

    def run_checked(command, tool):
        # Echo and run one shell command, failing on a non-zero exit status.
        print(command)
        returncode = common.block_on(command)
        print("%s exited with returncode %d" % (tool, returncode))
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # while callers still see the same exception type and message.
        if returncode != 0:
            raise AssertionError("%s non-zero return" % (tool))

    def callpeak(mode_flags):
        # Run macs2 callpeak on the experiment/control pair with extra flags.
        run_checked(
            'macs2 callpeak ' +
            '-t %s -c %s ' % (experiment.name, control.name) +
            '-f BED -n %s/%s ' % (peaks_dirname, prefix) +
            '-g %s -p 1e-2 %s' % (genomesize, mode_flags),
            'MACS2')

    def rank_and_gzip(peaks_suffix, sort_column, ranked_fn, gz_fn):
        # Clip, rescale, rank and gzip one MACS2 peaks file.
        # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
        clipped_fn = common.slop_clip(
            '%s/%s_peaks.%s' % (peaks_dirname, prefix, peaks_suffix), chrom_sizes.name)
        # Per the original notes: rescale Col5 scores to range 10-1000 to
        # conform to the .as format.
        rescaled_fn = common.rescale_scores(clipped_fn, scores_col=5)
        # Sort descending on the given column and replace the long peak names
        # in column 4 with Peak_<peakRank>; tee keeps the uncompressed copy.
        pipe = ['sort -k %dgr,%dgr %s' % (sort_column, sort_column, rescaled_fn),
                r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
                'tee %s' % (ranked_fn),
                'gzip -c']
        print(pipe)
        common.run_pipe(pipe, '%s' % (gz_fn))

    def bdgcmp(tag, method_args):
        # Compare treatment pileup with control lambda into <prefix>_<tag>.bdg.
        run_checked(
            'macs2 bdgcmp ' +
            '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) +
            '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) +
            '--outdir %s -o %s_%s.bdg ' % (peaks_dirname, prefix, tag) +
            '-m %s' % (method_args),
            'MACS2')

    def bedgraph_to_bigwig(tag, signal_name, bigwig_fn):
        # Clip <prefix>_<tag>.bdg to the chromosome sizes and convert to bigWig.
        bedgraph_fn = '%s/%s.%s.signal.bedgraph' % (peaks_dirname, prefix, signal_name)
        # Remove coordinates outside chromosome sizes (MACS2 bug).
        pipe = ['slopBed -i %s/%s_%s.bdg -g %s -b 0' % (peaks_dirname, prefix, tag, chrom_sizes.name),
                'bedClip stdin %s %s' % (chrom_sizes.name, bedgraph_fn)]
        print(pipe)
        common.run_pipe(pipe)
        run_checked(
            'bedGraphToBigWig %s %s %s' % (bedgraph_fn, chrom_sizes.name, bigwig_fn),
            'bedGraphToBigWig')

    def count_lines(gz_fn):
        # Return the line count of a gzipped file as a stripped string.
        out, _err = common.run_pipe(['gzip -dc %s' % (gz_fn), 'wc -l'])
        return out.strip()

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment        = dxpy.DXFile(experiment)
    control           = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes       = dxpy.DXFile(chrom_sizes)
    narrowPeak_as     = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as     = dxpy.DXFile(gappedpeak_as)
    broadPeak_as      = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    for dxfile in (experiment, control, xcor_scores_input, chrom_sizes,
                   narrowPeak_as, gappedPeak_as, broadPeak_as):
        dxpy.download_dxfile(dxfile.get_id(), dxfile.name)

    # Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn    = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn    = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn     = "%s/%s.broadPeak"  % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn  = broadPeak_fn  + ".gz"
    fc_signal_fn     = "%s/%s.fc_signal.bw"     % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  # third column
        print("Fraglen %s" % (fraglen))

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================

    callpeak('--nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (fraglen))
    rank_and_gzip('narrowPeak', 8, narrowPeak_fn, narrowPeak_gz_fn)

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================

    callpeak('--broad --nomodel --shift 0 --extsize %s --keep-dup all' % (fraglen))
    # broadPeak is ranked on column 8, gappedPeak on column 14.
    rank_and_gzip('broadPeak', 8, broadPeak_fn, broadPeak_gz_fn)
    rank_and_gzip('gappedPeak', 14, gappedPeak_fn, gappedPeak_gz_fn)

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================

    bdgcmp('FE', 'FE')
    bedgraph_to_bigwig('FE', 'fc', fc_signal_fn)

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000

    chipReads = count_lines(experiment.name)
    controlReads = count_lines(control.name)
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    print("chipReads = %s, controlReads = %s, sval = %s" % (chipReads, controlReads, sval))

    bdgcmp('ppois', 'ppois -S %s' % (sval))
    bedgraph_to_bigwig('ppois', 'pval', pvalue_signal_fn)

    # ===========================================
    # Generate bigBeds from beds to support trackhub visualization of peak files
    # ============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn), chrom_sizes.name, narrowPeak_as.name, bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn), chrom_sizes.name, gappedPeak_as.name, bed_type='bed12+3')
    broadPeak_bb_fname  = common.bed2bb('%s' % (broadPeak_fn),  chrom_sizes.name, broadPeak_as.name,  bed_type='bed6+3')

    # Upload the file outputs.  The bigBed uploads use the paths reported by
    # common.bed2bb instead of separately constructed "<peaks>.bb" names, so
    # the upload cannot drift from what the converter actually wrote.

    narrowPeak    = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak    = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak     = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fname)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fname)
    broadPeak_bb  = dxpy.upload_local_file(broadPeak_bb_fname)
    fc_signal     = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.

    output = {
        "narrowpeaks":    dxpy.dxlink(narrowPeak),
        "gappedpeaks":    dxpy.dxlink(gappedPeak),
        "broadpeaks":     dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb":  dxpy.dxlink(broadPeak_bb),
        "fc_signal":      dxpy.dxlink(fc_signal),
        "pvalue_signal":  dxpy.dxlink(pvalue_signal)
    }

    return output
Ejemplo n.º 4
0
def main(experiment,
         control,
         xcor_scores_input,
         chrom_sizes,
         narrowpeak_as,
         gappedpeak_as,
         broadpeak_as,
         genomesize,
         prefix=None,
         fragment_length=None):
    """Call MACS2 peaks and build signal tracks from local input files.

    The function

    1. runs ``macs2 callpeak`` for narrow peaks (with ``-B --SPMR`` so the
       pileup/lambda bedGraphs exist) and again with ``--broad``;
    2. clips, rescales, ranks and gzips the narrowPeak, broadPeak and
       gappedPeak files;
    3. builds fold-enrichment and p-value bigWig signal tracks with
       ``macs2 bdgcmp``, ``slopBed``/``bedClip`` and ``bedGraphToBigWig``;
    4. converts the three peak files to bigBed.

    All outputs are written below the ``peaks_macs`` directory, which is
    created if needed.

    Args:
        experiment: Path of the treatment reads (passed to MACS2 as
            ``-f BED`` and read with ``gzip -dc``).
        control: Path of the control reads.
        xcor_scores_input: Path of the cross-correlation scores file; only
            read when ``fragment_length`` is None.
        chrom_sizes: Path of the chromosome sizes file.
        narrowpeak_as: .as file for narrowPeak bigBed conversion.
        gappedpeak_as: .as file for gappedPeak bigBed conversion.
        broadpeak_as: .as file for broadPeak bigBed conversion.
        genomesize: Value passed to ``macs2 callpeak -g``.
        prefix: Basename for the outputs; defaults to ``experiment``.  A
            trailing ".gz" is removed.
        fragment_length: Fragment length for ``--extsize``; when None, the
            third whitespace-separated field of the first line of
            ``xcor_scores_input`` is used.

    Returns:
        Dict of output paths keyed by "narrowpeaks", "gappedpeaks",
        "broadpeaks", "narrowpeaks_bb", "gappedpeaks_bb", "broadpeaks_bb",
        "fc_signal" and "pvalue_signal".

    Raises:
        AssertionError: If MACS2 or bedGraphToBigWig exits non-zero.
    """

    def run_checked(command, tool):
        # Log and run one shell command, failing on a non-zero exit status.
        logger.info("%s", command)
        returncode = common.block_on(command)
        logger.info("%s exited with returncode %d", tool, returncode)
        # Raised explicitly (not via `assert`) so the check survives `python -O`
        # while callers still see the same exception type and message.
        if returncode != 0:
            raise AssertionError("%s non-zero return" % (tool))

    def callpeak(mode_flags):
        # Run macs2 callpeak on the experiment/control pair with extra flags.
        run_checked(
            'macs2 callpeak ' +
            '-t %s -c %s ' % (experiment, control) +
            '-f BED -n %s/%s ' % (peaks_dirname, prefix) +
            '-g %s -p 1e-2 %s' % (genomesize, mode_flags),
            'MACS2')

    def rank_and_gzip(peaks_suffix, sort_column, ranked_fn, gz_fn, **clip_kwargs):
        # Clip, rescale, rank and gzip one MACS2 peaks file.
        # MACS2 sometimes calls features off the end of chromosomes.  Fix that.
        clipped_fn = common.slop_clip(
            '%s/%s_peaks.%s' % (peaks_dirname, prefix, peaks_suffix),
            chrom_sizes, **clip_kwargs)
        # Per the original notes: rescale Col5 scores to range 10-1000 to
        # conform to the .as format.
        rescaled_fn = common.rescale_scores(clipped_fn, scores_col=5)
        # Sort descending on the given column and replace the long peak names
        # in column 4 with Peak_<peakRank>; tee keeps the uncompressed copy.
        pipe = [
            'sort -k %dgr,%dgr %s' % (sort_column, sort_column, rescaled_fn),
            r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
            'tee %s' % (ranked_fn), 'gzip -cn'
        ]
        common.run_pipe(pipe, '%s' % (gz_fn))

    def bdgcmp(tag, method_args):
        # Compare treatment pileup with control lambda into <prefix>_<tag>.bdg.
        run_checked(
            'macs2 bdgcmp ' +
            '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) +
            '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) +
            '--outdir %s -o %s_%s.bdg ' % (peaks_dirname, prefix, tag) +
            '-m %s' % (method_args),
            'MACS2')

    def bedgraph_to_bigwig(tag, signal_name, bigwig_fn):
        # Clip <prefix>_<tag>.bdg to the chromosome sizes and convert to bigWig.
        bedgraph_fn = '%s/%s.%s.signal.bedgraph' % (peaks_dirname, prefix, signal_name)
        # Remove coordinates outside chromosome sizes (MACS2 bug).
        pipe = [
            'slopBed -i %s/%s_%s.bdg -g %s -b 0' %
            (peaks_dirname, prefix, tag, chrom_sizes),
            'bedClip stdin %s %s' % (chrom_sizes, bedgraph_fn)
        ]
        common.run_pipe(pipe)
        run_checked(
            'bedGraphToBigWig %s %s %s' % (bedgraph_fn, chrom_sizes, bigwig_fn),
            'bedGraphToBigWig')

    def count_lines(gz_fn):
        # Return the line count of a gzipped file as a stripped string.
        out, _err = common.run_pipe(['gzip -dc %s' % (gz_fn), 'wc -l'])
        return out.strip()

    # Define the output filenames

    peaks_dirname = 'peaks_macs'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    if not prefix:
        prefix = experiment
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    # if the fragment_length argument is given, use that instead
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s", fraglen)
    else:
        with open(xcor_scores_input, 'r') as fh:
            firstline = fh.readline()
            fraglen = firstline.split()[2]  # third column
            logger.info("Fraglen %s", fraglen)

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================

    callpeak('--nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' % (fraglen))
    rank_and_gzip('narrowPeak', 8, narrowPeak_fn, narrowPeak_gz_fn)

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================

    callpeak('--broad --nomodel --shift 0 --extsize %s --keep-dup all' % (fraglen))
    # broadPeak is ranked on column 8, gappedPeak on column 14.
    rank_and_gzip('broadPeak', 8, broadPeak_fn, broadPeak_gz_fn)
    rank_and_gzip('gappedPeak', 14, gappedPeak_fn, gappedPeak_gz_fn,
                  bed_type='gappedPeak')

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================

    bdgcmp('FE', 'FE')
    bedgraph_to_bigwig('FE', 'fc', fc_signal_fn)

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================

    # Compute sval =
    # min(no. of reads in ChIP, no. of reads in control) / 1,000,000
    chipReads = count_lines(experiment)
    controlReads = count_lines(control)
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    logger.info("chipReads = %s, controlReads = %s, sval = %s",
                chipReads, controlReads, sval)

    bdgcmp('ppois', 'ppois -S %s' % (sval))
    bedgraph_to_bigwig('ppois', 'pval', pvalue_signal_fn)

    # ===========================================
    # Generate bigBeds from beds to support trackhub visualization of peak files
    # ============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes,
                                        narrowpeak_as,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes,
                                        gappedpeak_as,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes,
                                       broadpeak_as,
                                       bed_type='bed6+3')

    output = {
        "narrowpeaks": narrowPeak_gz_fn,
        "gappedpeaks": gappedPeak_gz_fn,
        "broadpeaks": broadPeak_gz_fn,
        "narrowpeaks_bb": narrowPeak_bb_fname,
        "gappedpeaks_bb": gappedPeak_bb_fname,
        "broadpeaks_bb": broadPeak_bb_fname,
        "fc_signal": fc_signal_fn,
        "pvalue_signal": pvalue_signal_fn
    }

    return output
Ejemplo n.º 5
0
def _run_checked(command, tool):
    """Echo a shell command, run it with common.block_on and require exit 0.

    Args:
        command: The shell command line to execute.
        tool: Label used in the log line and in the assertion message.

    Raises:
        AssertionError: If common.block_on reports a non-zero return code.
    """
    print(command)
    returncode = common.block_on(command)
    print("%s exited with returncode %d" % (tool, returncode))
    assert returncode == 0, "%s non-zero return" % (tool)


def _rank_and_compress_peaks(macs2_peaks_fn, sort_col, ranked_fn,
                             ranked_gz_fn):
    """Rescale, rank and rename the peaks in one MACS2 peak file.

    The file is passed through common.rescale_scores on column 5, sorted on
    `sort_col` (general-numeric, descending), and column 4 of every row is
    replaced with Peak_<row number>.  The result is written uncompressed to
    `ranked_fn` (via tee) and gzip-compressed to `ranked_gz_fn`.

    Args:
        macs2_peaks_fn: Path of the peak file written by MACS2.
        sort_col: 1-based column number used as the sort key.
        ranked_fn: Path for the uncompressed ranked peaks.
        ranked_gz_fn: Path for the gzip-compressed ranked peaks.
    """
    # Per the original comments, this rescales Col5 scores to the range
    # 10-1000 so they conform to the .as format.
    rescaled_fn = common.rescale_scores(macs2_peaks_fn, scores_col=5)
    pipe = [
        'sort -k %dgr,%dgr %s' % (sort_col, sort_col, rescaled_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (ranked_fn),
        'gzip -c'
    ]
    print(pipe)
    common.run_pipe(pipe, ranked_gz_fn)


def _build_signal_track(peaks_dirname, prefix, chrom_sizes_fn, method,
                        bedgraph_label, bigwig_fn, method_args=''):
    """Build one bigWig signal track from the MACS2 pileup/lambda bedgraphs.

    Runs `macs2 bdgcmp -m <method>` on <prefix>_treat_pileup.bdg and
    <prefix>_control_lambda.bdg, pipes the result through slopBed and bedClip
    against the chromosome sizes, and converts the clipped bedgraph with
    bedGraphToBigWig.

    Args:
        peaks_dirname: Directory holding the MACS2 outputs.
        prefix: File name prefix of the MACS2 outputs.
        chrom_sizes_fn: Path of the chromosome sizes file.
        method: bdgcmp method; also names the intermediate <prefix>_<method>.bdg.
        bedgraph_label: Tag used in <prefix>.<label>.signal.bedgraph.
        bigwig_fn: Path of the bigWig file to write.
        method_args: Extra text appended verbatim to the bdgcmp command.

    Raises:
        AssertionError: If bdgcmp or bedGraphToBigWig exits non-zero.
    """
    bdg_name = '%s_%s.bdg' % (prefix, method)
    bedgraph_fn = '%s/%s.%s.signal.bedgraph' % (
        peaks_dirname, prefix, bedgraph_label)

    _run_checked(
        'macs2 bdgcmp ' +
        '-t %s/%s_treat_pileup.bdg ' % (peaks_dirname, prefix) +
        '-c %s/%s_control_lambda.bdg ' % (peaks_dirname, prefix) +
        '--outdir %s -o %s ' % (peaks_dirname, bdg_name) +
        '-m %s' % (method) + method_args,
        'MACS2')

    # Remove coordinates outside chromosome sizes (MACS2 bug, per the
    # original comment).
    pipe = [
        'slopBed -i %s/%s -g %s -b 0' %
        (peaks_dirname, bdg_name, chrom_sizes_fn),
        'bedClip stdin %s %s' % (chrom_sizes_fn, bedgraph_fn)
    ]
    print(pipe)
    common.run_pipe(pipe)

    # Convert bedgraph to bigwig
    _run_checked(
        'bedGraphToBigWig %s %s %s' % (bedgraph_fn, chrom_sizes_fn, bigwig_fn),
        'bedGraphToBigWig')


def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as,
         gappedpeak_as, broadpeak_as, genomesize):
    """Call peaks with MACS2 and upload peak files and signal tracks.

    Downloads the inputs, runs `macs2 callpeak` in narrow and broad mode,
    ranks and compresses the narrowPeak/broadPeak/gappedPeak files, builds
    fold-enrichment and p-value bigWig tracks, converts the peak files with
    common.bed2bb, and uploads the results.

    Args:
        experiment: Platform file reference for the experiment reads; wrapped
            in dxpy.DXFile.  Its name (minus a trailing '.gz') becomes the
            output prefix, and it is read with `gzip -dc` to count reads.
        control: Platform file reference for the control reads.
        xcor_scores_input: Platform file reference for the cross-correlation
            scores; the third whitespace-separated field of its first line is
            used as the fragment length.
        chrom_sizes: Platform file reference for the chromosome sizes file.
        narrowpeak_as: Platform file reference for the narrowPeak .as file.
        gappedpeak_as: Platform file reference for the gappedPeak .as file.
        broadpeak_as: Platform file reference for the broadPeak .as file.
        genomesize: Value passed to `macs2 callpeak -g`.

    Returns:
        dict mapping "narrowpeaks", "gappedpeaks", "broadpeaks",
        "narrowpeaks_bb", "gappedpeaks_bb", "broadpeaks_bb", "fc_signal" and
        "pvalue_signal" to the dxlinks of the uploaded files.

    Raises:
        AssertionError: If MACS2 or bedGraphToBigWig exits non-zero.
    """
    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.
    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.
    for dxfile in (experiment, control, xcor_scores_input, chrom_sizes,
                   narrowPeak_as, gappedPeak_as, broadPeak_as):
        dxpy.download_dxfile(dxfile.get_id(), dxfile.name)

    # Define the output filenames
    peaks_dirname = 'peaks'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  # third column
        print("Fraglen %s" % (fraglen))

    # Pieces shared by both macs2 callpeak invocations.
    callpeak_base = 'macs2 callpeak ' + \
        '-t %s -c %s ' % (experiment.name, control.name) + \
        '-f BED -n %s/%s ' % (peaks_dirname, prefix) + \
        '-g %s -p 1e-2 ' % (genomesize)
    callpeak_shared = \
        '--nomodel --shift 0 --extsize %s --keep-dup all' % (fraglen)

    # ===========================================
    # Generate narrow peaks and preliminary signal tracks
    # ============================================

    # -B --SPMR makes this run write the pileup/lambda bedgraphs that the
    # signal-track steps below read.
    _run_checked(callpeak_base + callpeak_shared + ' -B --SPMR', 'MACS2')

    # Sort by Col8 in descending order and replace long peak names in
    # Column 4 with Peak_<peakRank>
    _rank_and_compress_peaks(
        '%s/%s_peaks.narrowPeak' % (peaks_dirname, prefix),
        8, narrowPeak_fn, narrowPeak_gz_fn)

    # ===========================================
    # Generate Broad and Gapped Peaks
    # ============================================

    _run_checked(callpeak_base + '--broad ' + callpeak_shared, 'MACS2')

    # Sort by Col8 (for broadPeak) or Col14 (for gappedPeak) in descending
    # order and replace long peak names in Column 4 with Peak_<peakRank>
    _rank_and_compress_peaks(
        '%s/%s_peaks.broadPeak' % (peaks_dirname, prefix),
        8, broadPeak_fn, broadPeak_gz_fn)
    _rank_and_compress_peaks(
        '%s/%s_peaks.gappedPeak' % (peaks_dirname, prefix),
        14, gappedPeak_fn, gappedPeak_gz_fn)

    # ===========================================
    # For Fold enrichment signal tracks
    # ============================================

    _build_signal_track(peaks_dirname, prefix, chrom_sizes.name,
                        'FE', 'fc', fc_signal_fn)

    # ===========================================
    # For -log10(p-value) signal tracks
    # ============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control)
    # / 1,000,000
    out, _ = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l'])
    chipReads = out.strip()
    out, _ = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    print("chipReads = %s, controlReads = %s, sval = %s" % (chipReads,
                                                            controlReads,
                                                            sval))

    _build_signal_track(peaks_dirname, prefix, chrom_sizes.name,
                        'ppois', 'pval', pvalue_signal_fn,
                        method_args=' -S %s' % (sval))

    # ===========================================
    # Generate bigBeds from beds to support trackhub visualization of peak
    # files
    # ============================================

    # NOTE(review): the paths returned by common.bed2bb are not used; the
    # uploads below assume bed2bb writes to "<bed file>.bb" -- confirm.
    common.bed2bb('%s' % (narrowPeak_fn), chrom_sizes.name,
                  narrowPeak_as.name, bed_type='bed6+4')
    common.bed2bb('%s' % (gappedPeak_fn), chrom_sizes.name,
                  gappedPeak_as.name, bed_type='bed12+3')
    common.bed2bb('%s' % (broadPeak_fn), chrom_sizes.name,
                  broadPeak_as.name, bed_type='bed6+3')

    # Temporary during development to create empty files just to get the
    # applet to exit
    for fn in [
            narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn,
            gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn
    ]:
        common.block_on('touch %s' % (fn))

    # Upload the file outputs, in a fixed order, then build the output
    # structure from the uploaded handles.
    outputs = [
        ("narrowpeaks", narrowPeak_gz_fn),
        ("gappedpeaks", gappedPeak_gz_fn),
        ("broadpeaks", broadPeak_gz_fn),
        ("narrowpeaks_bb", narrowPeak_bb_fn),
        ("gappedpeaks_bb", gappedPeak_bb_fn),
        ("broadpeaks_bb", broadPeak_bb_fn),
        ("fc_signal", fc_signal_fn),
        ("pvalue_signal", pvalue_signal_fn),
    ]
    uploaded = [(key, dxpy.upload_local_file(fn)) for key, fn in outputs]

    output = dict((key, dxpy.dxlink(dxfile)) for key, dxfile in uploaded)

    return output