def __init__(self, ref_prefix, max_n):
        self.max_n = max_n  # use n-grams up to length max_n
        # reference files
        self.files = self.get_ref_files(ref_prefix)
        # number of references per file
        self.nref = count_lines(self.files[0])
        for filename in self.files:
            n = count_lines(filename)
            assert self.nref == n, '%s has %s lines' % (filename, n)
        # counters for ngrams
        self.counters = [RefCounter(max_n) for i in range(self.nref)]

        self.load()
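All of these examples lean on a count_lines helper that the snippets themselves never define. A minimal sketch of what such a helper typically looks like (an assumption, not the definition from any of these repositories):

def count_lines(filename):
    # stream the file and count lines; hypothetical stand-in for the
    # helper the examples import
    with open(filename) as f:
        return sum(1 for _ in f)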
Example #2
    def __init__(self,
                 ffilename,
                 efilename,
                 afilename,
                 outputdir,
                 alpha,
                 threshold,
                 length_factor=False,
                 lexical_weighter=None,
                 maximize_derivation=False):
        self.ffilename = ffilename
        self.efilename = efilename
        self.afilename = afilename
        self.outputdir = outputdir
        self.alpha = alpha
        self.threshold = threshold
        self.length_factor = length_factor
        self.lexical_weighter = lexical_weighter
        self.maximize_derivation = maximize_derivation

        self.counter = RuleCounter()
        self.corpus_size = count_lines(ffilename)

        system('rm -rf %s' % outputdir)
        system('mkdir %s' % outputdir)
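A side note: the rm -rf/mkdir pair above shells out; the standard library can do the same thing portably. An equivalent sketch (not how this snippet's repository does it):

import os
import shutil

shutil.rmtree(outputdir, ignore_errors=True)  # like rm -rf
os.mkdir(outputdir)                           # like mkdir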
Example #3
def read_table(json_table_path):
    with open(json_table_path) as table_file:
        tables = {}
        for line in tqdm(table_file, total=count_lines(json_table_path)):
            d = json.loads(line)
            tables[d['id']] = d

    return tables
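Hypothetical usage, assuming a JSON-lines file in which every record carries an 'id' field (the path is illustrative):

tables = read_table('data/tables.jsonl')
print('%d tables loaded' % len(tables))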
Example #4
def __init__(self, total=-1, input='', file=stdout):
    # accept exactly one of 'total' or 'input'
    assert total == -1 or input == '', \
        "specify only one of 'total' or 'input'"
    if total != -1:
        self.total = total
    elif input:
        self.total = count_lines(input)
    else:
        assert False, "please specify either 'total' or 'input'"
    self.percent = 0
    self.file = file
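Only __init__ is shown, but the constructor's contract is clear: pass exactly one of total or input, and the denominator is either taken directly or derived via count_lines. A hypothetical usage sketch (the class name Progress is assumed; it does not appear above):

bar = Progress(total=1000)          # denominator given directly
bar = Progress(input='corpus.txt')  # denominator counted from the file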
Example #5
	def upload(self, uploader):

		# Information about called peaks
		n_spp_peaks = common.count_lines(self.peaks_fn)
		print("%s peaks called by spp" % n_spp_peaks)
		print("%s of those peaks removed due to bad coordinates"
			% (n_spp_peaks - common.count_lines(self.fixed_peaks_fn)))
		print("First 50 peaks")
		print(subprocess.check_output('head -50 %s' % self.fixed_peaks_fn,
			shell=True, stderr=subprocess.STDOUT))

		# Upload bigBed if applicable
		if self.bigbed:
			self.peaks_bb_fn = common.bed2bb(self.fixed_peaks_fn, self.chrom_sizes.name, self.as_file.name)
			if self.peaks_bb_fn:
				self.peaks_bb = uploader.upload(self.peaks_bb_fn)

		if not filecmp.cmp(self.peaks_fn, self.fixed_peaks_fn):
			print("Returning peaks with fixed coordinates")

		# Upload peaks
		print(subprocess.check_output(shlex.split("gzip %s" % self.fixed_peaks_fn)))
		self.peaks = uploader.upload(self.fixed_peaks_fn + ".gz")

		# Upload cross-correlations
		self.xcor_plot = uploader.upload(self.xcor_plot)
		self.xcor_scores = uploader.upload(self.xcor_scores)
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
	n_peaks = common.count_lines(fn)
	sorted_fn = 'sorted-%s' %(fn)
	rescaled_fn = 'rescaled-%s' %(fn)
	out,err = common.run_pipe([
		'sort -k %dgr,%dgr %s' %(scores_col, scores_col, fn),
		r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""],
		sorted_fn)
	out, err = common.run_pipe([
		'head -n 1 %s' %(sorted_fn),
		'cut -f %s' %(scores_col)])
	max_score = float(out.strip())
	out, err = common.run_pipe([
		'tail -n 1 %s' %(sorted_fn),
		'cut -f %s' %(scores_col)])
	min_score = float(out.strip())
	out,err = common.run_pipe([
		'cat %s' %(sorted_fn),
		r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" %(scores_col, min_score, max_score, new_min, new_max) + \
		r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" %(scores_col)],
		rescaled_fn)
	return rescaled_fn
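The awk one-liner above maps each score n linearly from the observed range [a, b] = [min_score, max_score] onto [x, y] = [new_min, new_max], truncating to an integer. The same arithmetic in plain Python (an illustration, not part of the pipeline):

def rescale_score(n, a, b, x, y):
    # linear map of n from [a, b] onto [x, y], truncated like awk's int()
    return int((n - a) * (y - x) / (b - a) + x)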
Example #7
def rescale_scores(fn, scores_col, new_min=10, new_max=1000):
    n_peaks = common.count_lines(fn)
    sorted_fn = 'sorted-%s' % (fn)
    rescaled_fn = 'rescaled-%s' % (fn)
    out, err = common.run_pipe([
        'sort -k %dgr,%dgr %s' % (scores_col, scores_col, fn),
        r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (NF != 0) print $0}'"""
    ], sorted_fn)
    out, err = common.run_pipe(
        ['head -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
    max_score = float(out.strip())
    out, err = common.run_pipe(
        ['tail -n 1 %s' % (sorted_fn),
         'cut -f %s' % (scores_col)])
    min_score = float(out.strip())
    out, err = common.run_pipe([
        'cat %s' % (sorted_fn),
        r"""awk 'BEGIN{OFS="\t"}{n=$%d;a=%d;b=%d;x=%d;y=%d}""" % (scores_col, min_score, max_score, new_min, new_max) +
        r"""{$%d=int(((n-a)*(y-x)/(b-a))+x) ; print $0}'""" % (scores_col)
    ], rescaled_fn)
    return rescaled_fn
def internal_pseudoreplicate_overlap(rep1_peaks,
                                     rep2_peaks,
                                     pooled_peaks,
                                     rep1_ta,
                                     rep1_xcor,
                                     paired_end,
                                     chrom_sizes,
                                     as_file,
                                     peak_type,
                                     prefix,
                                     fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - defined explicitly because input files could share a
    # name, in which case a later download would overwrite an earlier one
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as fractional overlap, w.r.t. either peak in the pair, >= 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # this is a simplicate analysis, so overlapping peaks are based solely
    # on the pseudoreplicates of the one pool
    out, err = common.run_pipe(['cat %s' % (overlap_tr_fn), 'sort -u'],
                               overlapping_peaks_fn)
    print("%d peaks overlap" % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn,
        rep1_xcor_fn,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
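# The awk commands in both overlap functions implement the same criterion:
# two peaks "overlap" when the intersected bases cover at least half of
# either peak. A condensed sketch of that test (illustrative only; the
# real work is done by intersectBed plus the awk filters above):
def fractionally_overlaps(len_a, len_b, overlap_bp):
    return (float(overlap_bp) / len_a >= 0.5) or \
           (float(overlap_bp) / len_b >= 0.5)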
def replicated_overlap(rep1_peaks,
                       rep2_peaks,
                       pooled_peaks,
                       pooledpr1_peaks,
                       pooledpr2_peaks,
                       rep1_ta,
                       rep1_xcor,
                       rep2_ta,
                       rep2_xcor,
                       paired_end,
                       chrom_sizes,
                       as_file,
                       peak_type,
                       prefix,
                       fragment_length=None):

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)

    # Input filenames - defined explicitly because input files could share a
    # name, in which case a later download would overwrite an earlier one
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If the fragment length was given by the user, we skip the
    # pooled_replicates_xcor_subjob, set pool_xcor_filename to None, and set
    # the fragment_length_given_by_user flag. Otherwise, run the subjob so
    # the fragment length can be extracted from the cross-correlations.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe(
        )['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap is defined
    # as fractional overlap, w.r.t. either peak in the pair, >= 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %
        (pooled_peaks_fn, rep1_peaks_fn), awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (rep2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_tr_fn)
    print("%d peaks overlap with both true replicates" %
          (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where overlap
    # is defined as fractional overlap, w.r.t. either peak in the pair, >= 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command, cut_command, 'sort -u',
        'intersectBed -wo -a stdin -b %s' %
        (pooledpr2_peaks_fn), awk_command, cut_command, 'sort -u'
    ], overlap_pr_fn)
    print("%d peaks overlap with both pooled pseudoreplicates" %
          (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename,
        pool_xcor_filename,
        overlapping_peaks_fn,
        chrom_sizes_fn,
        fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        "npeaks_rejected": npeaks_rejected,
        "frip_nreads": n_reads,
        "frip_nreads_in_peaks": n_reads_in_peaks,
        "frip_score": frip_score,
        "fragment_length_used": fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
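Both overlap functions derive the output basename with the same regular expression, which strips one or more peak-type extensions plus a compression extension. A quick demonstration with an illustrative filename:

import re

peak_type = 'narrowPeak'
m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
             'pooled.narrowPeak.gz')  # filename is hypothetical
print(m.group(1))  # -> 'pooled'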
Example #10
def main(experiment,
         control,
         xcor_scores_input,
         npeaks,
         nodups,
         bigbed,
         chrom_sizes,
         spp_version,
         as_file=None,
         prefix=None,
         fragment_length=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)

    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)

    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(),
                         xcor_scores_input_filename)

    if not prefix:
        output_filename_prefix = \
            experiment_filename.rstrip('.gz').rstrip('.tagAlign')
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # If fragment_length is provided, use it; otherwise read the fragment
    # length from the third column of the cross-correlation scores file.
    if fragment_length is not None:
        fraglen = str(fragment_length)
        logger.info("User given fragment length %s" % (fraglen))
    else:
        fraglen_column = 3
        with open(xcor_scores_input_filename, 'r') as f:
            line = f.readline()
            fraglen = line.split('\t')[fraglen_column - 1]
            logger.info("Read fragment length: %s" % (fraglen))

    # spp_tarball = SPP_VERSION_MAP.get(spp_version)
    # assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    # subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    run_spp = '/phantompeakqualtools/run_spp.R'
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%s -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fraglen, peaks_filename, xcor_plot_filename, xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # when one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation;
    # this changes any such coordinates to decimal notation.
    # this assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates.
    # the (($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly on chrM), which would cause
    # slopBed to halt at that line, truncating the output of the pipe.
    # slopBed adjusts feature end coordinates that run off the end of the
    # chromosome.
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome.

    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' %
        (chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates" %
        (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    subprocess.check_output('head -50 %s' % (fix_coordinate_peaks_filename),
                            shell=True)

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(
            shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
Example #11
def read_and_write_query(query_question_path,
                         tables,
                         question_output_path,
                         sql_output_path,
                         table_path,
                         valid_file_counter,
                         debug=False,
                         do_append=False):
    sql_parser = SQLParser()
    if do_append:
        sql_writer = open(sql_output_path, 'a')
        question_writer = open(question_output_path, 'a')
        table_writer = open(table_path, 'a')
    else:
        sql_writer = open(sql_output_path, 'w')
        question_writer = open(question_output_path, 'w')
        table_writer = open(table_path, 'w')
    num_of_unicode_error = 0
    num_of_non_parsable_error = 0
    with open(query_question_path) as qq_file:
        queries = []
        questions = []
        counter = 0
        for line in tqdm(qq_file, total=count_lines(query_question_path)):
            data = json.loads(line)
            question = data['question']
            table_id = data['table_id']
            table = tables[table_id]
            column_names = table["header"]
            #print(column_names)
            sql = data['sql']
            select_col = table["header"][int(sql["sel"])]
            agg = agg_ops[int(sql["agg"])]
            conditions = sql["conds"]
            use_column_name = True
            query = Query(int(sql["sel"]), int(sql["agg"]), column_names,
                          use_column_name, conditions)
            #print("select col: " + select_col)
            #print("agg: " + agg)
            #print(question)
            #print(sql)

            #print(col_names)
            hasError = False
            try:
                sql_query = repr(query)
                col_names = " COL_END COL_START ".join(
                    str(x) for x in column_names)
            except:
                if debug:
                    print("ERROR in line unicode" + str(counter))
                hasError = True
                num_of_unicode_error += 1
            if not hasError:
                try:
                    #new_query, orig_table_name = fix_table_name(query)
                    parse_tree, rule_list = sql_parser.parse(sql_query,
                                                             get_rules=True)
                    sql_writer.write(sql_query + "\n")
                    question_writer.write(question + " COL_START " +
                                          col_names + " COL_END\n")
                    valid_file_counter += 1
                except:
                    if debug:
                        print("ERROR in line " + str(counter) + " :" +
                              str(sql_query))
                    num_of_non_parsable_error += 1
                counter += 1
            #if counter == 10:
            #	break
        print("Unicode error: " + str(num_of_unicode_error))
        print("Nonparsable error: " + str(num_of_non_parsable_error))
        return valid_file_counter
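A hypothetical invocation of the reader above, assuming WikiSQL-style JSON-lines inputs and the tables dict produced by read_table earlier on this page (all paths illustrative):

tables = read_table('data/train.tables.jsonl')
n_valid = read_and_write_query('data/train.jsonl', tables,
                               'train.questions', 'train.sql',
                               'train.tables', valid_file_counter=0)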
def main(rep1_ta,
         ctl1_ta,
         rep1_xcor,
         rep1_paired_end,
         npeaks,
         nodups,
         chrom_sizes,
         spp_version,
         rep2_ta=None,
         ctl2_ta=None,
         rep2_xcor=None,
         rep2_paired_end=None,
         as_file=None,
         idr_peaks=False,
         fragment_length=None,
         spp_instance=None):

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info(
            "No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info(
            "Rep1 and rep2 tags specified so processing as a replicated experiment."
        )

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name

    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                spp_version,
                name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info(
                "Using one control for both replicate 1 and 2 and for the pool."
            )
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {
                "inputs": [ctl1_ta, ctl2_ta],
                "prefix": "PL_ctls"
            },
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                )
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                )
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default.  May be changed later.
                rep2_control = ctl2_ta  # default.  May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    common_args = {
        'chrom_sizes': chrom_sizes,
        'spp_version': spp_version,
        'as_file': as_file,
        'spp_instance': spp_instance
    }
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})
    rep1_peaks_subjob = spp(rep1_ta,
                            rep1_control,
                            rep1_xcor,
                            bigbed=True,
                            name='Rep1 peaks vs %s' % (rep1_ctl_msg),
                            prefix='R1',
                            **common_args)

    if not simplicate_experiment:
        rep2_peaks_subjob = spp(rep2_ta,
                                rep2_control,
                                rep2_xcor,
                                bigbed=True,
                                name='Rep2 peaks vs %s' % (rep2_ctl_msg),
                                prefix='R2',
                                **common_args)

        pooled_peaks_subjob = spp(
            pooled_replicates,
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            bigbed=True,
            name='Pooled peaks vs %s' % (pool_ctl_msg),
            prefix='PL',
            **common_args)

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores")
    }

    if not simplicate_experiment:
        output.update({
            'rep2_peaks':
            rep2_peaks_subjob.get_output_ref("peaks"),
            'rep2_peaks_bb':
            rep2_peaks_subjob.get_output_ref("peaks_bb"),
            'rep2_xcor_plot':
            rep2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2_xcor_scores':
            rep2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooled_peaks':
            pooled_peaks_subjob.get_output_ref("peaks"),
            'pooled_peaks_bb':
            pooled_peaks_subjob.get_output_ref("peaks_bb"),
            'pooled_xcor_plot':
            pooled_peaks_subjob.get_output_ref("xcor_plot"),
            'pooled_xcor_scores':
            pooled_peaks_subjob.get_output_ref("xcor_scores")
        })

    if idr_peaks:  # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = \
            dxpy.find_one_data_object(
               classname='applet',
               name='pseudoreplicator',
               project=dxpy.PROJECT_CONTEXT_ID,
               zero_ok=False,
               more_ok=False,
               return_handler=True)

        rep1_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep1_ta,
                 "prefix": 'R1PR'},
                name='Pseudoreplicate rep1 -> R1PR1,2')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR1',
            **common_args)

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR2',
            **common_args)

        output.update({
            'rep1pr1_peaks':
            rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_peaks':
            rep1pr2_peaks_subjob.get_output_ref("peaks")
        })

        if not simplicate_experiment:
            rep2_pr_subjob = \
                pseudoreplicator_applet.run(
                    {"input_tags": rep2_ta,
                     "prefix": 'R2PR'},
                    name='Pseudoreplicate rep2 -> R2PR1,2')

            pool_pr1_subjob = pool_applet.run(
                {
                    "inputs": [
                        rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate1")
                    ],
                    "prefix":
                    'PPR1'
                },
                name='Pool R1PR1+R2PR1 -> PPR1')

            pool_pr2_subjob = pool_applet.run(
                {
                    "inputs": [
                        rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate2")
                    ],
                    "prefix":
                    'PPR2'
                },
                name='Pool R1PR2+R2PR2 -> PPR2')

            rep2pr1_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR1',
                **common_args)

            rep2pr2_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR2',
                **common_args)

            pooledpr1_peaks_subjob = spp(
                pool_pr1_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR1 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR1',
                **common_args)

            pooledpr2_peaks_subjob = spp(
                pool_pr2_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR2 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR2',
                **common_args)

            output.update({
                'rep2pr1_peaks':
                rep2pr1_peaks_subjob.get_output_ref("peaks"),
                'rep2pr2_peaks':
                rep2pr2_peaks_subjob.get_output_ref("peaks"),
                'pooledpr1_peaks':
                pooledpr1_peaks_subjob.get_output_ref("peaks"),
                'pooledpr2_peaks':
                pooledpr2_peaks_subjob.get_output_ref("peaks"),
            })

    return output
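The control-selection branching above (repeated in Example #13 below) reduces to a small decision rule: pool the controls whenever their read depths differ by more than 1.2x; otherwise fall back to each replicate's own control unless that control is shallower than its experiment. A condensed sketch of the rule (illustrative, not the applet code):

def choose_controls(ntags_ctl1, ntags_ctl2, ntags_rep1, ntags_rep2,
                    ratio_cutoff=1.2):
    ratio = float(ntags_ctl1) / float(ntags_ctl2)
    if ratio < 1:
        ratio = 1 / ratio
    if ratio > ratio_cutoff:
        return 'pooled', 'pooled'   # control depths differ too much
    if ntags_ctl1 < ntags_rep1:
        return 'pooled', 'ctl2'     # rep1's control is too shallow
    if ntags_ctl2 < ntags_rep2:
        return 'ctl1', 'pooled'     # rep2's control is too shallow
    return 'ctl1', 'ctl2'           # distinct controls are fine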
Example #13
def main(rep1_ta,
         ctl1_ta,
         rep1_xcor,
         rep1_paired_end,
         chrom_sizes,
         genomesize,
         narrowpeak_as,
         gappedpeak_as,
         broadpeak_as,
         rep2_ta=None,
         ctl2_ta=None,
         rep2_xcor=None,
         rep2_paired_end=None,
         fragment_length=None):
    rep1_ta_file = rep1_ta
    rep1_ta_filename = rep1_ta_file
    ntags_rep1 = common.count_lines(rep1_ta_filename)
    #
    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info(
            "No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info(
            "Rep1 and rep2 tags specified so processing as a replicated experiment."
        )
    #
    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = rep2_ta
        rep2_ta_filename = rep2_ta_file
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end
    #
    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = ctl1_ta
    ctl1_ta_filename = ctl1_ta_file
    #
    if not unary_control:
        ctl2_ta_file = ctl2_ta
        ctl2_ta_filename = ctl2_ta_file
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file
    #
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"
    #
    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))
    #
    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #
    if not simplicate_experiment:
        #Pool replicates
        pool_replicates_subjob = pool(inputs=[rep1_ta, rep2_ta],
                                      prefix='pooled_reps')
        pooled_replicates = pool_replicates_subjob.get("pooled")
        #Pool cross-correlation
        pooled_replicates_xcor_subjob = xcor_only(pooled_replicates,
                                                  paired_end)
    #
    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info(
                "Using one control for both replicate 1 and 2 and for the pool."
            )
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        #Pool controls
        pool_controls_subjob = pool(inputs=[ctl1_ta, ctl2_ta],
                                    prefix="PL_ctls")
        pooled_controls = pool_controls_subjob.get("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"
        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                )
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                )
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default.  May be changed later.
                rep2_control = ctl2_ta  # default.  May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"
    #
    rep1_pr_subjob = pseudoreplicator(input_tags=rep1_ta)
    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator(input_tags=rep2_ta)
        #
        pool_pr1_subjob = pool(inputs=[
            rep1_pr_subjob.get("pseudoreplicate1"),
            rep2_pr_subjob.get("pseudoreplicate1")
        ],
                               prefix='PPR1')
        pool_pr2_subjob = pool(inputs=[
            rep1_pr_subjob.get("pseudoreplicate2"),
            rep2_pr_subjob.get("pseudoreplicate2")
        ],
                               prefix='PPR2')
    #
    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }
    # if the fragment_length argument is given, update macs2 input
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})
    #macs2(experiment, control, xcor_scores_input, chrom_sizes,narrowpeak_as, gappedpeak_as, broadpeak_as, genomesize, prefix=None,fragment_length=None)
    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args)
    #
    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(rep1_pr_subjob.get("pseudoreplicate1"),
                                 rep1_control, rep1_xcor, **common_args)
    #
    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(rep1_pr_subjob.get("pseudoreplicate2"),
                                 rep1_control, rep1_xcor, **common_args)
    #
    if not simplicate_experiment:
        common_args.update({'prefix': 'r2'})
        rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor,
                                  **common_args)
        #
        common_args.update({'prefix': 'r2pr1'})
        rep2pr1_peaks_subjob = macs2(rep2_pr_subjob.get("pseudoreplicate1"),
                                     rep2_control, rep2_xcor, **common_args)
        #
        common_args.update({'prefix': 'r2pr2'})
        rep2pr2_peaks_subjob = macs2(rep2_pr_subjob.get("pseudoreplicate2"),
                                     rep2_control, rep2_xcor, **common_args)
        #
        common_args.update({'prefix': 'pool'})
        pooled_peaks_subjob = macs2(
            pooled_replicates, control_for_pool,
            pooled_replicates_xcor_subjob.get("CC_scores_file"), **common_args)
        #
        common_args.update({'prefix': 'ppr1'})
        pooledpr1_peaks_subjob = macs2(
            pool_pr1_subjob.get("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get("CC_scores_file"), **common_args)
        #
        common_args.update({'prefix': 'ppr2'})
        pooledpr2_peaks_subjob = macs2(
            pool_pr2_subjob.get("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get("CC_scores_file"), **common_args)
    #
    output = {
        'rep1_narrowpeaks': rep1_peaks_subjob.get("narrowpeaks"),
        'rep1_gappedpeaks': rep1_peaks_subjob.get("gappedpeaks"),
        'rep1_broadpeaks': rep1_peaks_subjob.get("broadpeaks"),
        'rep1_narrowpeaks_bb': rep1_peaks_subjob.get("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb': rep1_peaks_subjob.get("gappedpeaks_bb"),
        'rep1_broadpeaks_bb': rep1_peaks_subjob.get("broadpeaks_bb"),
        'rep1_fc_signal': rep1_peaks_subjob.get("fc_signal"),
        'rep1_pvalue_signal': rep1_peaks_subjob.get("pvalue_signal"),
        #
        'rep1pr1_narrowpeaks': rep1pr1_peaks_subjob.get("narrowpeaks"),
        'rep1pr1_gappedpeaks': rep1pr1_peaks_subjob.get("gappedpeaks"),
        'rep1pr1_broadpeaks': rep1pr1_peaks_subjob.get("broadpeaks"),
        'rep1pr1_fc_signal': rep1pr1_peaks_subjob.get("fc_signal"),
        'rep1pr1_pvalue_signal': rep1pr1_peaks_subjob.get("pvalue_signal"),
        #
        'rep1pr2_narrowpeaks': rep1pr2_peaks_subjob.get("narrowpeaks"),
        'rep1pr2_gappedpeaks': rep1pr2_peaks_subjob.get("gappedpeaks"),
        'rep1pr2_broadpeaks': rep1pr2_peaks_subjob.get("broadpeaks"),
        'rep1pr2_fc_signal': rep1pr2_peaks_subjob.get("fc_signal"),
        'rep1pr2_pvalue_signal': rep1pr2_peaks_subjob.get("pvalue_signal")
    }
    #
    if not simplicate_experiment:
        output.update({
            'rep2_narrowpeaks':
            rep2_peaks_subjob.get("narrowpeaks"),
            'rep2_gappedpeaks':
            rep2_peaks_subjob.get("gappedpeaks"),
            'rep2_broadpeaks':
            rep2_peaks_subjob.get("broadpeaks"),
            'rep2_narrowpeaks_bb':
            rep2_peaks_subjob.get("narrowpeaks_bb"),
            'rep2_gappedpeaks_bb':
            rep2_peaks_subjob.get("gappedpeaks_bb"),
            'rep2_broadpeaks_bb':
            rep2_peaks_subjob.get("broadpeaks_bb"),
            'rep2_fc_signal':
            rep2_peaks_subjob.get("fc_signal"),
            'rep2_pvalue_signal':
            rep2_peaks_subjob.get("pvalue_signal"),
            #
            'rep2pr1_narrowpeaks':
            rep2pr1_peaks_subjob.get("narrowpeaks"),
            'rep2pr1_gappedpeaks':
            rep2pr1_peaks_subjob.get("gappedpeaks"),
            'rep2pr1_broadpeaks':
            rep2pr1_peaks_subjob.get("broadpeaks"),
            'rep2pr1_fc_signal':
            rep2pr1_peaks_subjob.get("fc_signal"),
            'rep2pr1_pvalue_signal':
            rep2pr1_peaks_subjob.get("pvalue_signal"),
            #
            'rep2pr2_narrowpeaks':
            rep2pr2_peaks_subjob.get("narrowpeaks"),
            'rep2pr2_gappedpeaks':
            rep2pr2_peaks_subjob.get("gappedpeaks"),
            'rep2pr2_broadpeaks':
            rep2pr2_peaks_subjob.get("broadpeaks"),
            'rep2pr2_fc_signal':
            rep2pr2_peaks_subjob.get("fc_signal"),
            'rep2pr2_pvalue_signal':
            rep2pr2_peaks_subjob.get("pvalue_signal"),
            #
            'pooled_narrowpeaks':
            pooled_peaks_subjob.get("narrowpeaks"),
            'pooled_gappedpeaks':
            pooled_peaks_subjob.get("gappedpeaks"),
            'pooled_broadpeaks':
            pooled_peaks_subjob.get("broadpeaks"),
            'pooled_narrowpeaks_bb':
            pooled_peaks_subjob.get("narrowpeaks_bb"),
            'pooled_gappedpeaks_bb':
            pooled_peaks_subjob.get("gappedpeaks_bb"),
            'pooled_broadpeaks_bb':
            pooled_peaks_subjob.get("broadpeaks_bb"),
            'pooled_fc_signal':
            pooled_peaks_subjob.get("fc_signal"),
            'pooled_pvalue_signal':
            pooled_peaks_subjob.get("pvalue_signal"),
            #
            'pooledpr1_narrowpeaks':
            pooledpr1_peaks_subjob.get("narrowpeaks"),
            'pooledpr1_gappedpeaks':
            pooledpr1_peaks_subjob.get("gappedpeaks"),
            'pooledpr1_broadpeaks':
            pooledpr1_peaks_subjob.get("broadpeaks"),
            'pooledpr1_fc_signal':
            pooledpr1_peaks_subjob.get("fc_signal"),
            'pooledpr1_pvalue_signal':
            pooledpr1_peaks_subjob.get("pvalue_signal"),
            #
            'pooledpr2_narrowpeaks':
            pooledpr2_peaks_subjob.get("narrowpeaks"),
            'pooledpr2_gappedpeaks':
            pooledpr2_peaks_subjob.get("gappedpeaks"),
            'pooledpr2_broadpeaks':
            pooledpr2_peaks_subjob.get("broadpeaks"),
            'pooledpr2_fc_signal':
            pooledpr2_peaks_subjob.get("fc_signal"),
            'pooledpr2_pvalue_signal':
            pooledpr2_peaks_subjob.get("pvalue_signal")
        })
    peaks_dirname = '%s_%s_peaks_macs' % (rep1_ta_filename.split(
        "/")[-1].split(".")[0], ctl1_ta_filename.split("/")[-1].split(".")[0])
    prefix = rep1_ta_filename.split("/")[-1]
    peak_file = "%s/%s.peaksfile" % (peaks_dirname, prefix)
    with open(peak_file, "w") as fh:
        for key, val in output.items():
            if isinstance(val, list):
                fh.write(": ".join([key, ", ".join(val)]) + "\n")
            else:
                fh.write(": ".join([key, str(val)]) + "\n")
    return output
def internal_pseudoreplicate_IDR(experiment, r1pr_peaks, rep1_ta, rep1_xcor,
                                 paired_end, chrom_sizes, as_file, blacklist,
                                 rep1_signal, fragment_length=None):

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override the appropriate values.
    # Calculate, or set, the actually used fragment-length value and
    # set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info(
            "%d peaks blacklisted from the stable set" % (N1-Nsb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP

    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    r1pr_peaks_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update(
            {"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
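
blacklist_filter is called above but defined elsewhere. In pipelines like this it is typically a negative bedtools intersection; a minimal sketch under that assumption, reusing common.run_pipe as the surrounding code does (the real helper may also normalize columns or cap scores):

def blacklist_filter_sketch(input_fn, output_fn, blacklist_fn):
    # Keep only peaks with no overlap with any blacklisted region
    # (intersectBed -v reports -a features that miss every -b feature).
    common.run_pipe([
        'intersectBed -v -a %s -b %s' % (input_fn, blacklist_fn)
    ], output_fn)
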
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    # Caution: str.rstrip() strips a character *set*, not a suffix; this
    # happens to work for names ending in ".tagAlign.gz".
    output_filename_prefix = experiment_filename.rstrip(".gz").rstrip(".tagAlign")
    peaks_filename = output_filename_prefix + ".regionPeak"
    final_peaks_filename = peaks_filename + ".gz"  # spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + ".pdf"
    xcor_scores_filename = output_filename_prefix + ".ccscores"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3  # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, "r") as f:
        line = f.readline()
        fragment_length = int(line.split("\t")[fraglen_column - 1])
        print "Read fragment length: %d" % (fragment_length)

    # run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = "/phantompeakqualtools/spp_1.10.1.tar.gz"
    if nodups:
        run_spp = "/phantompeakqualtools/run_spp_nodups.R"
    else:
        run_spp = "/phantompeakqualtools/run_spp.R"
    # install spp
    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    print subprocess.check_output(shlex.split("R CMD INSTALL %s" % (spp_tarball)), stderr=subprocess.STDOUT)
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" % (
        run_spp,
        cpu_count(),
        experiment_filename,
        control_filename,
        npeaks,
        fragment_length,
        peaks_filename,
        xcor_plot_filename,
        xcor_scores_filename,
    )
    print spp_command
    process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    for line in iter(process.stdout.readline, ""):
        sys.stdout.write(line)

    # When one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation; the pipeline below
    # changes any such coordinates to decimal notation. It assumes 10-column
    # output and that the 2nd and 3rd columns are coordinates.
    # slopBed adjusts feature end coordinates that run off the end of the
    # chromosome, and bedClip removes any features that are still not within
    # the boundaries of the chromosome.

    fix_coordinate_peaks_filename = output_filename_prefix + ".fixcoord.regionPeak"

    out, err = common.run_pipe(
        [
            "gzip -dc %s" % (final_peaks_filename),
            "tee %s" % (peaks_filename),
            r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
            "slopBed -i stdin -g %s -b 0" % (chrom_sizes_filename),
            "bedClip stdin %s %s" % (chrom_sizes_filename, fix_coordinate_peaks_filename),
        ]
    )

    # These lines transfer the peaks files to the temporary workspace for debugging later
    # Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" % (n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" % (
        n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)
    )
    print "First 50 peaks"
    print subprocess.check_output("head -50 %s" % (fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split("gzip %s" % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + ".gz"

    print subprocess.check_output("ls -l", shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
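
The awk stage in the pipeline above rewrites coordinates that spp (R) emitted in scientific notation back into plain integers via sprintf("%i", ...). A stand-alone illustration of the same conversion (not part of the applet):

# Scientific-notation coordinates as spp might emit them, converted the
# same way the awk stage does.
for raw in ["1.5e+07", "2e+06", "123456"]:
    print("%s -> %i" % (raw, float(raw)))
# 1.5e+07 -> 15000000
# 2e+06 -> 2000000
# 123456 -> 123456
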
def main(rep1_peaks, rep2_peaks, pooled_peaks, pooledpr1_peaks, pooledpr2_peaks,
         chrom_sizes, as_file, peak_type, prefix=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks      = dxpy.DXFile(rep1_peaks)
    rep2_peaks      = dxpy.DXFile(rep2_peaks)
    pooled_peaks    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes     = dxpy.DXFile(chrom_sizes)
    as_file         = dxpy.DXFile(as_file)

    # Input filenames. Define each explicitly because input files could
    # share a name, in which case a later download would overwrite an
    # earlier one.
    rep1_peaks_fn       = 'rep1-%s' %(rep1_peaks.name)
    rep2_peaks_fn       = 'rep2-%s' %(rep2_peaks.name)
    pooled_peaks_fn     = 'pooled-%s' %(pooled_peaks.name)
    pooledpr1_peaks_fn  = 'pooledpr1-%s' %(pooledpr1_peaks.name)
    pooledpr2_peaks_fn  = 'pooledpr2-%s' %(pooledpr2_peaks.name)
    chrom_sizes_fn      = 'chrom.sizes'
    as_file_fn          = '%s.as' %(peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn    = '%s.replicated.%s' %(basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' %(basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn   = 'replicated_tr.%s' %(peak_type)
    overlap_pr_fn   = 'replicated_pr.%s' %(peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)

    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    # The only difference between the peak types is how the extra columns are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap means the
    # intersection covers at least half of either peak (fractional overlap
    # >= 0.5 with respect to either peak in the pair)
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, using the
    # same fractional-overlap criterion (>= 0.5 with respect to either peak)
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' %(pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' %(overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))

    # Rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %(pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print "%d peaks were rejected" %(common.count_lines(rejected_peaks_fn))

    npeaks_in       = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out      = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # Make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn    = common.bed2bb(rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn    = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # Build the output object, using variables with the same names as the
    # output fields.

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        'npeaks_rejected'       : npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
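
The awk filters above encode the overlap criterion applied to every intersectBed -wo line: keep a pooled/replicate peak pair when the intersection covers at least half of either peak. The same test as a plain Python function (a hypothetical helper, for illustration only):

def passes_fractional_overlap(a_start, a_end, b_start, b_end, overlap_bp,
                              min_fraction=0.5):
    # Mirror of the awk test: s1 and s2 are the two peak lengths and
    # overlap_bp is the overlap column reported by intersectBed -wo.
    s1 = a_end - a_start
    s2 = b_end - b_start
    return (float(overlap_bp) / s1 >= min_fraction
            or float(overlap_bp) / s2 >= min_fraction)

# 50 bp of overlap covers half of the 100 bp peak, so the pair is kept.
print(passes_fractional_overlap(100, 200, 150, 400, 50))  # True
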
Exemple #17
0
def internal_pseudoreplicate_IDR(experiment,
                                 r1pr_peaks,
                                 rep1_ta,
                                 rep1_xcor,
                                 paired_end,
                                 chrom_sizes,
                                 as_file,
                                 blacklist,
                                 rep1_signal,
                                 fragment_length=None):

    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    rep1_ta = dxpy.DXFile(rep1_ta)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(rep1_ta.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)
    # If fragment_length is given, override the appropriate values.
    # Calculate, or set, the actually used fragment-length value and
    # set the fragment_length_given_by_user flag appropriately.
    if fragment_length is not None:
        rep1_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_given_by_user = True
    else:
        rep1_xcor = dxpy.DXFile(rep1_xcor)
        rep1_xcor_filename = 'r1xc_%s' % (rep1_xcor.name)
        dxpy.download_dxfile(rep1_xcor.get_id(), rep1_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_given_by_user = False

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    stable_set_filename = "%s_stable.narrowPeak" % (experiment)
    if blacklist is not None:
        blacklist_filter(r1pr_peaks_filename, stable_set_filename,
                         blacklist_filename)
        Nsb = common.count_lines(stable_set_filename)
        logger.info("%d peaks blacklisted from the stable set" % (N1 - Nsb))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (r1pr_peaks_filename, stable_set_filename)))
        Nsb = N1
        logger.info("No blacklist filter applied to the stable set")

    # calculate FRiP

    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, stable_set_filename,
        chrom_sizes_filename, fragment_length_used_rep1)

    output = {
        "rep1_frip_nreads": n_reads,
        "rep1_frip_nreads_in_peaks": n_reads_in_peaks,
        "F1": frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_stable_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(r1pr_peaks_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    stable_set_bb_filename = \
        common.bed2bb(stable_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if stable_set_bb_filename:
        stable_set_bb_output = \
            dxpy.upload_local_file(stable_set_bb_filename)
        output.update({"stable_set_bb": dxpy.dxlink(stable_set_bb_output)})

    output.update({
        "N1": N1,
        "stable_set": dxpy.dxlink(
            dxpy.upload_local_file(common.compress(stable_set_filename))),
        "Ns": Nsb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})

    return output
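
common.frip itself is not shown in this listing. Conceptually, FRiP is the fraction of tags that fall inside the peak set; a minimal sketch under that assumption, reusing common.count_lines and common.run_pipe, and ignoring the fragment-length extension the real helper presumably applies:

def frip_sketch(ta_filename, peaks_filename):
    # Fraction of Reads in Peaks: tags overlapping any peak / all tags.
    n_reads = common.count_lines(ta_filename)
    out, err = common.run_pipe([
        'intersectBed -u -a %s -b %s' % (ta_filename, peaks_filename),
        'wc -l'])
    n_reads_in_peaks = int(out.strip())
    return n_reads, n_reads_in_peaks, float(n_reads_in_peaks) / n_reads
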
Exemple #18
0
def replicated_IDR(experiment,
                   reps_peaks,
                   r1pr_peaks,
                   r2pr_peaks,
                   pooledpr_peaks,
                   rep1_ta,
                   rep1_xcor,
                   rep2_ta,
                   rep2_xcor,
                   paired_end,
                   chrom_sizes,
                   as_file,
                   blacklist,
                   rep1_signal,
                   rep2_signal,
                   pooled_signal,
                   fragment_length=None):

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # next call could be on 267 and save time?
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, calculate the fragment length
    # using cross-correlation; otherwise use the override value. Set
    # pool_xcor_filename to None to accommodate the common.frip calls.
    # Calculate, or set, the actually used fragment lengths for the
    # different cases, and set the flag indicating whether the fragment
    # length was given by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = \
            pooled_replicates_xcor_subjob.describe()['output'].get(
                "CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info("%d peaks blacklisted from the conservative set" %
                    (Nt - Ncb))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longer of the IDR
    # peaks list from the true reps or the IDR peaks list from the
    # pseudoreplicates of the pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No - Nob))
    else:
        subprocess.check_output(
            shlex.split('cp %s %s' %
                        (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
            dxpy.dxlink(
                dxpy.upload_local_file(common.compress(reps_peaks_filename))),
            "pre_bl_optimal_set":
            dxpy.dxlink(
                dxpy.upload_local_file(
                    common.compress(peaks_to_filter_filename)))
        })

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(
            common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(
            common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
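
The reproducibility decision above reduces to two ratios and a single threshold. Pulled out as a pure function (illustrative only):

def reproducibility_test(Nt, N1, N2, Np):
    # Same rule as above: both ratios <= 2 passes, exactly one > 2 is
    # borderline, both > 2 fails.
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        return 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        return 'borderline'
    return 'pass'

print(reproducibility_test(Nt=10000, N1=9000, N2=4000, Np=11000))  # borderline
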
Exemple #19
0
def main(rep1_ta, ctl1_ta, rep1_paired_end,
         rep2_ta=None, ctl2_ta=None, rep2_paired_end=None):
    rep1_ta_filename = rep1_ta
    ntags_rep1 = common.count_lines(rep1_ta_filename)
    output = {'rep1_ta': rep1_ta_filename}

    simplicate_experiment = rep1_ta and not rep2_ta
    output.update({'simplicate_experiment': simplicate_experiment})
    if simplicate_experiment:
        logger.info("No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.")
        output.update({'rep2_ta': rep2_ta})

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_filename = rep2_ta
        ntags_rep2 = common.count_lines(rep2_ta_filename)
        output.update({'rep2_ta': rep2_ta_filename})
    paired_end = rep1_paired_end
    output.update({'paired_end': paired_end})
    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_filename = ctl1_ta

    if not unary_control:
        ctl2_ta_filename = ctl2_ta
    else:
        ctl2_ta_filename = ctl1_ta

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep2_control = ctl2_ta  # default.  May be changed later.
    output.update({'rep1_control': rep1_control,
                   'rep2_control': rep2_control})

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend(
        [(ntags_ctl1, 'control 1', ctl1_ta_filename),
         (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    logger.info(subprocess.check_output('ls -l', shell=True,
                                        stderr=subprocess.STDOUT))

    if not simplicate_experiment:
        pool_replicates_subjob = \
            pool(**{"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'})
        pooled_replicates = pool_replicates_subjob.get("pooled")
        output.update({'pooled_replicates': pooled_replicates})
        # this needs to go to the other image
        '''
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                name='Pool cross-correlation')
        '''

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info("Using one control for both replicate 1 and 2 and for the pool.")
        rep2_control = rep1_control
        control_for_pool = rep1_control
        output.update({'rep2_control': rep2_control,
                       'control_for_pool': rep1_control})
    else:
        pool_controls_subjob = pool(
            **{"inputs": [ctl1_ta, ctl2_ta],
               "prefix": "PL_ctls"})
        pooled_controls = pool_controls_subjob.get("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        output.update({'control_for_pool': control_for_pool})
        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Numbers of reads in the controls differ by more than a "
                "factor of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
            output.update({'rep1_control': pooled_controls,
                           'rep2_control': pooled_controls})
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment "
                    "replicate 1.  Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                output.update({'rep1_control': pooled_controls})
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment "
                    "replicate 2.  Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                output.update({'rep2_control': pooled_controls})
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta
                rep2_control = ctl2_ta
                output.update({'rep1_control': ctl1_ta,
                               'rep2_control': ctl2_ta})

    rep1_pr_subjob = pseudoreplicator(**{"input_tags": rep1_ta})
    r1pr1 = rep1_pr_subjob.get('pseudoreplicate1')
    r1pr2 = rep1_pr_subjob.get('pseudoreplicate2')
    output.update({'r1pr1': r1pr1,
                   'r1pr2': r1pr2})

    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator(**{"input_tags": rep2_ta})
        r2pr1 = rep2_pr_subjob.get('pseudoreplicate1')
        r2pr2 = rep2_pr_subjob.get('pseudoreplicate2')
        output.update({'r2pr1': r2pr1,
                       'r2pr2': r2pr2})
        pool_pr1_subjob = pool(
            **{"inputs": [rep1_pr_subjob.get("pseudoreplicate1"),
                          rep2_pr_subjob.get("pseudoreplicate1")],
               "prefix": 'PPR1'})
        pool_pr2_subjob = pool(
            **{"inputs": [rep1_pr_subjob.get("pseudoreplicate2"),
                          rep2_pr_subjob.get("pseudoreplicate2")],
               "prefix": 'PPR2'})
        ppr1 = pool_pr1_subjob.get('pooled')
        ppr2 = pool_pr2_subjob.get('pooled')
        output.update({'ppr1': ppr1,
                       'ppr2': ppr2})
    # Should there be an indication of the simplicateness of the
    # experiment in the output json? This could be a good way to
    # direct the next step without putting too much logic into the
    # workflow. ADDED.
    # It turns out Cromwell does not support reading .json. Instead it
    # offers a read_map function that accepts two-column TSVs.
    with open('pool_and_pseudoreplicate_outfiles.mapping', 'w') as f:
        for key in output:
            f.write('%s\t%s\n' % (key, output[key]))

    return output
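
pseudoreplicator is called above but defined elsewhere. The usual ENCODE construction shuffles the tags and splits them into two halves; a minimal local sketch under that assumption (the real applet works on gzipped tagAlign files and returns dxlinks):

import random

def pseudoreplicator_sketch(input_tags, prefix='pr'):
    # Shuffle the tagAlign lines, then split them into two
    # pseudoreplicates of (nearly) equal size.
    with open(input_tags) as fh:
        tags = fh.readlines()
    random.shuffle(tags)
    half = len(tags) // 2
    names = ['%s1.tagAlign' % prefix, '%s2.tagAlign' % prefix]
    for name, chunk in zip(names, [tags[:half], tags[half:]]):
        with open(name, 'w') as out:
            out.writelines(chunk)
    return {'pseudoreplicate1': names[0], 'pseudoreplicate2': names[1]}
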
def main(rep1_peaks,
         rep2_peaks,
         pooled_peaks,
         pooledpr1_peaks,
         pooledpr2_peaks,
         chrom_sizes,
         as_file,
         peak_type,
         prefix=None,
         rep1_signal=None,
         rep2_signal=None,
         pooled_signal=None):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances

    rep1_peaks = dxpy.DXFile(rep1_peaks)
    rep2_peaks = dxpy.DXFile(rep2_peaks)
    pooled_peaks = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks = dxpy.DXFile(pooledpr2_peaks)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    as_file = dxpy.DXFile(as_file)

    # Input filenames. Define each explicitly because input files could
    # share a name, in which case a later download would overwrite an
    # earlier one.
    rep1_peaks_fn = 'rep1-%s' % (rep1_peaks.name)
    rep2_peaks_fn = 'rep2-%s' % (rep2_peaks.name)
    pooled_peaks_fn = 'pooled-%s' % (pooled_peaks.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks.name)
    chrom_sizes_fn = 'chrom.sizes'
    as_file_fn = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # Strip off the peak and compression extensions
        m = re.match(r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
                     pooled_peaks.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks.name

    overlapping_peaks_fn = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file.get_id(), as_file_fn)
    '''
    #find pooled peaks that are in (rep1 AND rep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
        ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

    #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
        ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

    #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
    out, err = common.run_pipe([
        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
        ], overlapping_peaks_fn)
    print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
    '''
    # The only difference between the peak types is how the extra columns are handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in [
            'narrowPeak', 'gappedPeak', 'broadPeak'
        ], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (
            peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2, where overlap means the
    # intersection covers at least half of either peak (fractional overlap
    # >= 0.5 with respect to either peak in the pair)
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_tr_fn)
    print "%d peaks overlap with both true replicates" % (
        common.count_lines(overlap_tr_fn))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, using the
    # same fractional-overlap criterion (>= 0.5 with respect to either peak)
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
    ], overlap_pr_fn)
    print "%d peaks overlap with both pooled pseudoreplicates" % (
        common.count_lines(overlap_pr_fn))

    # Combine peak lists
    out, err = common.run_pipe(
        ['cat %s %s' % (overlap_tr_fn, overlap_pr_fn), 'sort -u'],
        overlapping_peaks_fn)
    print "%d peaks overlap with true replicates or with pooled pseudorepliates" % (
        common.count_lines(overlapping_peaks_fn))

    # Rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' %
        (pooled_peaks_fn, overlapping_peaks_fn)
    ], rejected_peaks_fn)
    print "%d peaks were rejected" % (common.count_lines(rejected_peaks_fn))

    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # Make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(overlapping_peaks_fn,
                                            chrom_sizes_fn,
                                            as_file_fn,
                                            bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(rejected_peaks_fn,
                                         chrom_sizes_fn,
                                         as_file_fn,
                                         bed_type=bed_type)
    # overlapping_peaks_bb_fn = common.bed2bb(common.slop_clip(overlapping_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    # rejected_peaks_bb_fn    = common.bed2bb(common.slop_clip(rejected_peaks_fn, chrom_sizes_fn, "gappedPeak"), chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(
        common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    # Build the output object, using variables with the same names as the
    # output fields.

    output = {
        "overlapping_peaks": dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb": dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks": dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb": dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in": npeaks_in,
        "npeaks_out": npeaks_out,
        'npeaks_rejected': npeaks_rejected
    }

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
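
common.bed2bb wraps UCSC bedToBigBed and, per the "bedtobigbed often fails" comments elsewhere in this listing, returns nothing on failure so callers can skip the bigBed output. A plausible minimal version (it assumes the input bed is already coordinate-sorted, which bedToBigBed requires; the real helper may differ):

import shlex
import subprocess

def bed2bb_sketch(bed_fn, chrom_sizes_fn, as_fn, bed_type='bed6+4'):
    # Convert a bed to bigBed; return the .bb filename, or None if
    # bedToBigBed fails (callers then omit the bigBed output).
    bb_fn = bed_fn + '.bb'
    command = 'bedToBigBed -type=%s -as=%s %s %s %s' % (
        bed_type, as_fn, bed_fn, chrom_sizes_fn, bb_fn)
    try:
        subprocess.check_output(shlex.split(command))
    except subprocess.CalledProcessError:
        return None
    return bb_fn
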
Exemple #21
0
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
         chrom_sizes, as_file, blacklist=None,
         rep1_signal=None, rep2_signal=None, pooled_signal=None):

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is based on the longer of the IDR
    # peaks list from the true reps or the IDR peaks list from the
    # pseudoreplicates of the pool
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio            = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio  = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    logger.info("Exiting with output: %s", output)
    return output
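
# blacklist_filter() is called above but defined elsewhere in this applet.
# A minimal sketch of the assumed behavior -- drop every peak that overlaps
# a blacklisted region -- using bedtools; the _sketch name and the use of
# intersectBed are assumptions, not the applet's actual implementation:
import subprocess

def blacklist_filter_sketch(input_fn, output_fn, blacklist_fn):
    # intersectBed -v emits only those features of -a with no overlap in -b;
    # assumes bedtools is on PATH, as it is elsewhere in these applets
    with open(output_fn, 'w') as out:
        subprocess.check_call(
            ['intersectBed', '-v', '-a', input_fn, '-b', blacklist_fn],
            stdout=out)
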
def replicated_IDR(experiment,
                   reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks,
                   rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                   paired_end, chrom_sizes, as_file, blacklist,
                   rep1_signal, rep2_signal, pooled_signal,
                   fragment_length=None):

    # TODO for now just taking the peak files.  This applet should actually
    # call IDR instead of putting that in the workflow populator script

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    rep1_ta_filename = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_filename = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_filename = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_filename = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_filename)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_filename)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_filename)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # TODO: this wait could be deferred until the pooled output is actually
    # needed below, which would save time
    pool_replicates_subjob.wait_on_done()
    # If fragment_length is not given, estimate it from cross-correlation;
    # otherwise use the override value and set pool_xcor_filename to None
    # to accommodate the common.frip calls below.  Either way, record the
    # fragment lengths actually used and set a flag indicating whether the
    # fragment length was supplied by the user.
    if fragment_length is not None:
        pool_xcor_filename = None
        fragment_length_used_rep1 = fragment_length
        fragment_length_used_rep2 = fragment_length
        fragment_length_used_pool = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fragment_length_used_rep1 = common.xcor_fraglen(rep1_xcor_filename)
        fragment_length_used_rep2 = common.xcor_fraglen(rep2_xcor_filename)
        fragment_length_used_pool = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    Nt = common.count_lines(reps_peaks_filename)
    logger.info("%d peaks from true replicates (Nt)" % (Nt))
    N1 = common.count_lines(r1pr_peaks_filename)
    logger.info("%d peaks from rep1 self-pseudoreplicates (N1)" % (N1))
    N2 = common.count_lines(r2pr_peaks_filename)
    logger.info("%d peaks from rep2 self-pseudoreplicates (N2)" % (N2))
    Np = common.count_lines(pooledpr_peaks_filename)
    logger.info("%d peaks from pooled pseudoreplicates (Np)" % (Np))

    # generate the conservative set, which is always based on the IDR peaks
    # from true replicates
    conservative_set_filename = \
        '%s_final_conservative.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
        Ncb = common.count_lines(conservative_set_filename)
        logger.info(
            "%d peaks blacklisted from the conservative set" % (Nt-Ncb))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (reps_peaks_filename, conservative_set_filename)))
        Ncb = Nt
        logger.info("No blacklist filter applied to the conservative set")

    # generate the optimal set, which is the longer of the IDR peaks list
    # from the true replicates and the IDR peaks list from the pooled
    # pseudoreplicates
    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np
    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
        Nob = common.count_lines(optimal_set_filename)
        logger.info("%d peaks blacklisted from the optimal set" % (No-Nob))
    else:
        subprocess.check_output(shlex.split(
            'cp %s %s' % (peaks_to_filter_filename, optimal_set_filename)))
        Nob = No
        logger.info("No blacklist filter applied to the optimal set")

    rescue_ratio            = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio  = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    # FRiP (fraction reads in peaks)
    # rep1 stable peaks comparing internal pseudoreplicates
    rep1_n_reads, rep1_n_reads_in_peaks, rep1_frip_score = common.frip(
        rep1_ta_filename, rep1_xcor_filename, r1pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # rep2 stable peaks comparing internal pseudoreplicates
    rep2_n_reads, rep2_n_reads_in_peaks, rep2_frip_score = common.frip(
        rep2_ta_filename, rep2_xcor_filename, r2pr_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing true reps
    true_n_reads, true_n_reads_in_peaks, true_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, reps_peaks_filename,
        chrom_sizes_filename, fragment_length)
    # comparing pooled pseudoreplicates
    pr_n_reads, pr_n_reads_in_peaks, pr_frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, pooledpr_peaks_filename,
        chrom_sizes_filename, fragment_length)

    output = {
        "rep1_frip_nreads": rep1_n_reads,
        "rep1_frip_nreads_in_peaks": rep1_n_reads_in_peaks,
        "F1": rep1_frip_score,
        "rep2_frip_nreads": rep2_n_reads,
        "rep2_frip_nreads_in_peaks": rep2_n_reads_in_peaks,
        "F2": rep2_frip_score,
        "true_frip_nreads": true_n_reads,
        "true_frip_nreads_in_peaks": true_n_reads_in_peaks,
        "Ft": true_frip_score,
        "pr_frip_nreads": pr_n_reads,
        "pr_frip_nreads_in_peaks": pr_n_reads_in_peaks,
        "Fp": pr_frip_score,
        "fragment_length_used_rep1": fragment_length_used_rep1,
        "fragment_length_used_rep2": fragment_length_used_rep2,
        "fragment_length_used_pool": fragment_length_used_pool,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    # These are optional outputs to see what's being removed by the blacklist
    if blacklist:
        output.update({
            "pre_bl_conservative_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    reps_peaks_filename))),
            "pre_bl_optimal_set":
                dxpy.dxlink(dxpy.upload_local_file(common.compress(
                    peaks_to_filter_filename)))}
        )

    # bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = \
        common.bed2bb(conservative_set_filename, chrom_sizes_filename,
                      as_file_filename)
    optimal_set_bb_filename = \
        common.bed2bb(optimal_set_filename, chrom_sizes_filename,
                      as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = \
            dxpy.upload_local_file(conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update(
            {"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility,
        "No": Nob,
        "Nc": Ncb
    })

    # These are just passed through for convenience so that signals and tracks
    # are available in one place.  Both input and output are optional.
    if rep1_signal:
        output.update({"rep1_signal": rep1_signal})
    if rep2_signal:
        output.update({"rep2_signal": rep2_signal})
    if pooled_signal:
        output.update({"pooled_signal": pooled_signal})

    return output
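
# A self-contained sketch of the reproducibility criteria applied above,
# with a worked example; the function name is ours, and the thresholds of 2
# mirror the code:
def reproducibility_sketch(Nt, Np, N1, N2):
    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
    if rescue_ratio > 2 and self_consistency_ratio > 2:
        return 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        return 'borderline'
    else:
        return 'pass'

# Example: Nt=25000 and Np=30000 give a rescue ratio of 1.2, while N1=12000
# and N2=36000 give a self-consistency ratio of 3.0; only one ratio exceeds
# 2, so the experiment rates 'borderline'.
assert reproducibility_sketch(25000, 30000, 12000, 36000) == 'borderline'
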
Example #23
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        # str.rstrip() strips a character set, not a suffix; remove the
        # extensions explicitly to avoid over-trimming some filenames
        output_filename_prefix = experiment_filename
        for extension in ('.gz', '.tagAlign'):
            if output_filename_prefix.endswith(extension):
                output_filename_prefix = output_filename_prefix[:-len(extension)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    final_peaks_filename = peaks_filename + '.gz' #spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3 # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        print "Read fragment length: %d" %(fragment_length)

    #run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    #install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' %(spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s "
        "-rf -out=%s" % (
            run_spp, cpu_count(), experiment_filename, control_filename,
            npeaks, fragment_length, peaks_filename, xcor_plot_filename,
            xcor_scores_filename))
    print spp_command
    # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    # when a peak coordinate is an exact multiple of 10, spp (R) outputs it
    # in scientific notation; the pipe below rewrites such coordinates in
    # decimal notation.  This assumes 10-column output with coordinates in
    # the 2nd and 3rd columns.  slopBed adjusts feature end coordinates that
    # run off the end of the chromosome, and bedClip removes any features
    # that are still not within the boundaries of the chromosome.

    fix_coordinate_peaks_filename = output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" %(final_peaks_filename),
        "tee %s" %(peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' %(chrom_sizes_filename),
        'bedClip stdin %s %s' %(chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])

    #These lines transfer the peaks files to the temporary workspace for debugging later
    #Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" %(n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" %(n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output('head -50 %s' %(fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split('gzip %s' %(fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
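
# The awk step above rewrites spp's scientific-notation coordinates (e.g.
# 1e+05) in decimal notation.  A pure-Python sketch of the same per-line
# normalization; the function name is ours, not part of the applet:
def fix_peak_coordinates_sketch(line):
    fields = line.rstrip('\n').split('\t')
    fields[1] = '%i' % float(fields[1])  # start coordinate
    fields[2] = '%i' % float(fields[2])  # end coordinate
    return '\t'.join(fields)

# '1e+05' becomes '100000'; every other column passes through untouched.
assert fix_peak_coordinates_sketch(
    'chr1\t1e+05\t100500\tpeak1\t0\t.\t5.0\t-1\t-1\t250'
).split('\t')[1] == '100000'
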
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end, chrom_sizes, genomesize,
         narrowpeak_as, gappedpeak_as, broadpeak_as,
         rep2_ta=None, ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None,
         fragment_length=None):
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info("No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.")

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name

    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend(
        [(ntags_ctl1, 'control 1', ctl1_ta_filename),
         (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_applet = dxpy.find_one_data_object(
                classname='applet',
                name='pool',
                project=dxpy.PROJECT_CONTEXT_ID,
                zero_ok=False,
                more_ok=False,
                return_handler=True)
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info("Using one control for both replicate 1 and 2 and for the pool.")
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {"inputs": [ctl1_ta, ctl2_ta],
             "prefix": "PL_ctls"},
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1)/float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1/ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Numbers of reads in controls differ by more than a factor "
                "of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep1_ctl_msg = "pooled controls"
            rep2_control = pooled_controls
            rep2_ctl_msg = "pooled controls"
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment "
                    "replicate 1.  Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment "
                    "replicate 2.  Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta
                rep2_control = ctl2_ta
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pseudoreplicator',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})

        pool_pr1_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
             "prefix": 'PPR1'})
        pool_pr2_subjob = pool_applet.run(
            {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
             "prefix": 'PPR2'})

    common_args = {
        'chrom_sizes':      chrom_sizes,
        'genomesize':       genomesize,
        'narrowpeak_as':    narrowpeak_as,
        'gappedpeak_as':    gappedpeak_as,
        'broadpeak_as':     broadpeak_as
        }
    # if the fragment_length argument is given, update macs2 input
    if fragment_length is not None:
        common_args.update({'fragment_length' : fragment_length})

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob      = macs2( rep1_ta,
                                    rep1_control,
                                    rep1_xcor, **common_args)

    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob   = macs2( rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                                    rep1_control,
                                    rep1_xcor, **common_args)

    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob   = macs2( rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                                    rep1_control,
                                    rep1_xcor, **common_args)

    if not simplicate_experiment:
        common_args.update({'prefix': 'r2'})
        rep2_peaks_subjob      = macs2( rep2_ta,
                                        rep2_control,
                                        rep2_xcor, **common_args)

        common_args.update({'prefix': 'r2pr1'})
        rep2pr1_peaks_subjob   = macs2( rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                                        rep2_control,
                                        rep2_xcor, **common_args)

        common_args.update({'prefix': 'r2pr2'})
        rep2pr2_peaks_subjob   = macs2( rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                                        rep2_control,
                                        rep2_xcor, **common_args)

        common_args.update({'prefix': 'pool'})
        pooled_peaks_subjob    = macs2( pooled_replicates,
                                        control_for_pool,   
                                        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

        common_args.update({'prefix': 'ppr1'})
        pooledpr1_peaks_subjob = macs2( pool_pr1_subjob.get_output_ref("pooled"),
                                        control_for_pool,
                                        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

        common_args.update({'prefix': 'ppr2'})
        pooledpr2_peaks_subjob = macs2( pool_pr2_subjob.get_output_ref("pooled"),
                                        control_for_pool,
                                        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    output = {
        'rep1_narrowpeaks':         rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks':         rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks':          rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb':      rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb':      rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb':       rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal':           rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal':       rep1_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep1pr1_narrowpeaks':      rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks':      rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks':       rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal':        rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal':    rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep1pr2_narrowpeaks':      rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks':      rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks':       rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal':        rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal':    rep1pr2_peaks_subjob.get_output_ref("pvalue_signal")
    }

    if not simplicate_experiment:
        output.update({
            'rep2_narrowpeaks':         rep2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2_gappedpeaks':         rep2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2_broadpeaks':          rep2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2_narrowpeaks_bb':      rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'rep2_gappedpeaks_bb':      rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'rep2_broadpeaks_bb':       rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'rep2_fc_signal':           rep2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2_pvalue_signal':       rep2_peaks_subjob.get_output_ref("pvalue_signal"),

            'rep2pr1_narrowpeaks':      rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr1_gappedpeaks':      rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr1_broadpeaks':       rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr1_fc_signal':        rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr1_pvalue_signal':    rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),

            'rep2pr2_narrowpeaks':      rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr2_gappedpeaks':      rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr2_broadpeaks':       rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr2_fc_signal':        rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr2_pvalue_signal':    rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),

            'pooled_narrowpeaks':       pooled_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooled_gappedpeaks':       pooled_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooled_broadpeaks':        pooled_peaks_subjob.get_output_ref("broadpeaks"),
            'pooled_narrowpeaks_bb':    pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'pooled_gappedpeaks_bb':    pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'pooled_broadpeaks_bb':     pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'pooled_fc_signal':         pooled_peaks_subjob.get_output_ref("fc_signal"),
            'pooled_pvalue_signal':     pooled_peaks_subjob.get_output_ref("pvalue_signal"),

            'pooledpr1_narrowpeaks':    pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr1_gappedpeaks':    pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr1_broadpeaks':     pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr1_fc_signal':      pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr1_pvalue_signal':  pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),

            'pooledpr2_narrowpeaks':    pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr2_gappedpeaks':    pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr2_broadpeaks':     pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr2_fc_signal':      pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr2_pvalue_signal':  pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
        })

    return output
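
# The control-selection policy above, condensed into a self-contained
# sketch (names are ours): call both replicates against pooled controls
# when the control depths differ by more than ratio_cutoff; otherwise pool
# only for a replicate whose control is shallower than its experiment.
def choose_controls_sketch(ntags_rep1, ntags_rep2, ntags_ctl1, ntags_ctl2,
                           ratio_cutoff=1.2):
    ratio = float(ntags_ctl1) / float(ntags_ctl2)
    if ratio < 1:
        ratio = 1 / ratio
    if ratio > ratio_cutoff:
        return ('pooled', 'pooled')
    if ntags_ctl1 < ntags_rep1:
        return ('pooled', 'ctl2')
    elif ntags_ctl2 < ntags_rep2:
        return ('ctl1', 'pooled')
    return ('ctl1', 'ctl2')

# Controls of 10M and 14M tags differ by a factor of 1.4 > 1.2, so both
# replicates would be called against the pooled controls.
assert choose_controls_sketch(8e6, 9e6, 10e6, 14e6) == ('pooled', 'pooled')
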
Example #25
0
def main(rep1_ta,
         rep2_ta,
         ctl1_ta,
         ctl2_ta,
         rep1_xcor,
         rep2_xcor,
         npeaks,
         nodups,
         rep1_paired_end,
         rep2_paired_end,
         chrom_sizes,
         as_file=None,
         idr_peaks=False):

    if not rep1_paired_end == rep2_paired_end:
        raise ValueError('Mixed PE/SE not supported (yet)')
    paired_end = rep1_paired_end

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name

    ntags_rep1 = common.count_lines(rep1_ta_filename)
    ntags_rep2 = common.count_lines(rep2_ta_filename)
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename),
                              (ntags_rep2, 'replicate 2', rep2_ta_filename),
                              (ntags_ctl1, 'control 1', ctl1_ta_filename),
                              (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        print("Found %d tags in %s file %s" % (n, name, filename))

    print(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta]},
            name='Pool replicates')
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
    pooled_replicates_xcor_subjob = \
        xcor_only(
            pooled_replicates,
            paired_end,
            name='Pool cross-correlation')

    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"

    if unary_control:
        print(
            "Only one control supplied.  Using it for both replicate 1 and 2 and for the pool."
        )
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = \
            pool_applet.run(
                {"inputs": [ctl1_ta, ctl2_ta]},
                name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of
        # rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            print(
                "Numbers of reads in controls differ by more than a factor "
                "of %f. Using pooled controls." % (ratio_cutoff))
            rep1_control = pooled_controls
            rep1_ctl_msg = "pooled controls"
            rep2_control = pooled_controls
            rep2_ctl_msg = "pooled controls"
        else:
            if ntags_ctl1 < ntags_rep1:
                print(
                    "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                )
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif ntags_ctl2 < ntags_rep2:
                print(
                    "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                )
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                print("Using distinct controls for replicate 1 and 2.")
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    rep1_peaks_subjob = spp(rep1_ta,
                            rep1_control,
                            rep1_xcor,
                            chrom_sizes=chrom_sizes,
                            bigbed=True,
                            as_file=as_file,
                            name='Rep1 peaks vs %s' % (rep1_ctl_msg))

    rep2_peaks_subjob = spp(rep2_ta,
                            rep2_control,
                            rep2_xcor,
                            chrom_sizes=chrom_sizes,
                            bigbed=True,
                            as_file=as_file,
                            name='Rep2 peaks vs %s' % (rep2_ctl_msg))

    pooled_peaks_subjob = spp(
        pooled_replicates,
        control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        chrom_sizes=chrom_sizes,
        bigbed=True,
        as_file=as_file,
        name='Pooled peaks vs %s' % (pool_ctl_msg))

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores"),
        'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"),
        'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"),
        'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"),
        'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),
        'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"),
        'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"),
        'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"),
        'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
    }

    if idr_peaks:  # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = \
            dxpy.find_one_data_object(
               classname='applet',
               name='pseudoreplicator',
               project=dxpy.PROJECT_CONTEXT_ID,
               zero_ok=False,
               more_ok=False,
               return_handler=True)

        rep1_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep1_ta},
                name='Pseudoreplicate rep1 -> R1PR1,2')
        rep2_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep2_ta},
                name='Pseudoreplicate rep2 -> R2PR1,2')

        pool_pr1_subjob = pool_applet.run(
            {
                "inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate1")
                ]
            },
            name='Pool R1PR1+R2PR1 -> PPR1')

        pool_pr2_subjob = pool_applet.run(
            {
                "inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate2")
                ]
            },
            name='Pool R1PR2+R2PR2 -> PPR2')

        rep1_pr1_xcor_subjob = xcor_only(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            paired_end,
            name='R1PR1 cross-correlation')
        rep1_pr2_xcor_subjob = xcor_only(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            paired_end,
            name='R1PR2 cross-correlation')
        rep2_pr1_xcor_subjob = xcor_only(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            paired_end,
            name='R2PR1 cross-correlation')
        rep2_pr2_xcor_subjob = xcor_only(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            paired_end,
            name='R2PR2 cross-correlation')
        pool_pr1_xcor_subjob = xcor_only(
            pool_pr1_subjob.get_output_ref("pooled"),
            paired_end,
            name='PPR1 cross-correlation')
        pool_pr2_xcor_subjob = xcor_only(
            pool_pr2_subjob.get_output_ref("pooled"),
            paired_end,
            name='PPR2 cross-correlation')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg))

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg))

        rep2pr1_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep2_control,
            rep2_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False,
            name='R2PR1 peaks vs %s' % (rep2_ctl_msg))

        rep2pr2_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep2_control,
            rep2_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False,
            name='R2PR2 peaks vs %s' % (rep2_ctl_msg))

        pooledpr1_peaks_subjob = spp(
            pool_pr1_subjob.get_output_ref("pooled"),
            control_for_pool,
            pool_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False,
            name='PPR1 peaks vs %s' % (pool_ctl_msg))

        pooledpr2_peaks_subjob = spp(
            pool_pr2_subjob.get_output_ref("pooled"),
            control_for_pool,
            pool_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False,
            name='PPR2 peaks vs %s' % (pool_ctl_msg))

        output.update({
            'rep1pr1_peaks': rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr1_xcor_plot': rep1pr1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1pr1_xcor_scores': rep1pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep1pr2_peaks': rep1pr2_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_xcor_plot': rep1pr2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1pr2_xcor_scores': rep1pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr1_peaks': rep2pr1_peaks_subjob.get_output_ref("peaks"),
            'rep2pr1_xcor_plot': rep2pr1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2pr1_xcor_scores': rep2pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr2_peaks': rep2pr2_peaks_subjob.get_output_ref("peaks"),
            'rep2pr2_xcor_plot': rep2pr2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2pr2_xcor_scores': rep2pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr1_peaks': pooledpr1_peaks_subjob.get_output_ref("peaks"),
            'pooledpr1_xcor_plot': pooledpr1_peaks_subjob.get_output_ref("xcor_plot"),
            'pooledpr1_xcor_scores': pooledpr1_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr2_peaks': pooledpr2_peaks_subjob.get_output_ref("peaks"),
            'pooledpr2_xcor_plot': pooledpr2_peaks_subjob.get_output_ref("xcor_plot"),
            'pooledpr2_xcor_scores': pooledpr2_peaks_subjob.get_output_ref("xcor_scores"),
        })

    return output
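
# common.xcor_fraglen(), used by the IDR and overlap applets, is assumed to
# read the estimated fragment length from the third tab-delimited column of
# a cross-correlation scores file -- the same column the spp applet above
# parses by hand.  A minimal sketch under that assumption:
def xcor_fraglen_sketch(xcor_scores_filename):
    with open(xcor_scores_filename) as f:
        line = f.readline()
    # column 3 may hold comma-separated candidate estimates; take the first
    return int(line.split('\t')[2].split(',')[0])
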
def main(experiment,
         reps_peaks,
         r1pr_peaks,
         r2pr_peaks,
         pooledpr_peaks,
         chrom_sizes,
         as_file,
         blacklist=None):

    #TODO for now just taking the peak files.  This applet should actually call IDR instead of
    #putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.

    #Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print subprocess.check_output('ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" % (Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" % (N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" % (N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" % (Np)

    conservative_set_filename = '%s_final_conservative.narrowPeak' % (
        experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" % (Nt - Ncb)

    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" % (No - Nob)

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    #bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename,
                                                 chrom_sizes_filename,
                                                 as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename,
                                            chrom_sizes_filename,
                                            as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(
            conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt": Nt,
        "N1": N1,
        "N2": N2,
        "Np": Np,
        "conservative_set": dxpy.dxlink(dxpy.upload_local_file(
            common.compress(conservative_set_filename))),
        "optimal_set": dxpy.dxlink(dxpy.upload_local_file(
            common.compress(optimal_set_filename))),
        "rescue_ratio": rescue_ratio,
        "self_consistency_ratio": self_consistency_ratio,
        "reproducibility_test": reproducibility
    })

    logging.info("Exiting with output: %s", output)
    return output
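
# Every example on this page leans on common.count_lines().  A minimal
# sketch of the assumed behavior -- count records the way `wc -l` does;
# any decompression the real helper performs is not reproduced here:
def count_lines_sketch(filename):
    n = 0
    with open(filename) as f:
        for _ in f:
            n += 1
    return n
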
def replicated_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                       pooledpr1_peaks, pooledpr2_peaks,
                       rep1_ta, rep1_xcor, rep2_ta, rep2_xcor,
                       paired_end, chrom_sizes, as_file, peak_type, prefix,
                       fragment_length=None):

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    pooledpr1_peaks_file = dxpy.DXFile(pooledpr1_peaks)
    pooledpr2_peaks_file = dxpy.DXFile(pooledpr2_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep2_ta_file         = dxpy.DXFile(rep2_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file       = dxpy.DXFile(rep2_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - prefixes are necessary because two input files could
    # have the same name, in which case a later download would overwrite an
    # earlier one
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    pooledpr1_peaks_fn = 'pooledpr1-%s' % (pooledpr1_peaks_file.name)
    pooledpr2_peaks_fn = 'pooledpr2-%s' % (pooledpr2_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep2_ta_fn         = 'r2ta_%s' % (rep2_ta_file.name)
    rep1_xcor_fn       = 'r1cc_%s' % (rep1_xcor_file.name)
    rep2_xcor_fn       = 'r2cc_%s' % (rep2_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(pooledpr1_peaks_file.get_id(), pooledpr1_peaks_fn)
    dxpy.download_dxfile(pooledpr2_peaks_file.get_id(), pooledpr2_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
    pool_replicates_subjob = \
        pool_applet.run(
            {"inputs": [rep1_ta, rep2_ta],
             "prefix": 'pooled_reps'},
            name='Pool replicates')
    # If the fragment length was given by the user, skip
    # pooled_replicates_xcor_subjob, set pool_xcor_filename to None, and set
    # the fragment_length_given_by_user flag.  Otherwise, run the subjob so
    # the fragment length can be extracted from the cross-correlation scores.
    if fragment_length is not None:
        pool_xcor_filename = None
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pool_replicates_subjob.get_output_ref("pooled"),
                paired_end,
                spp_version=None,
                name='Pool cross-correlation')
        pooled_replicates_xcor_subjob.wait_on_done()
        pool_xcor_link = pooled_replicates_xcor_subjob.describe()['output'].get("CC_scores_file")
        pool_xcor_file = dxpy.get_handler(pool_xcor_link)
        pool_xcor_filename = 'poolcc_%s' % (pool_xcor_file.name)
        dxpy.download_dxfile(pool_xcor_file.get_id(), pool_xcor_filename)
        fraglen = common.xcor_fraglen(pool_xcor_filename)
        fragment_length_given_by_user = False

    pool_replicates_subjob.wait_on_done()
    pool_ta_link = pool_replicates_subjob.describe()['output'].get("pooled")
    pool_ta_file = dxpy.get_handler(pool_ta_link)
    pool_ta_filename = 'poolta_%s' % (pool_ta_file.name)
    dxpy.download_dxfile(pool_ta_file.get_id(), pool_ta_filename)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap Rep1 and Rep2 where overlap is defined as
    # the fractional overlap wrt any one of the overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both true replicates"
        % (common.count_lines(overlap_tr_fn)))

    # Find pooled peaks that overlap PseudoRep1 and PseudoRep2 where
    # overlap is defined as the fractional overlap wrt any one of the
    # overlapping peak pairs  > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, pooledpr1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (pooledpr2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_pr_fn)
    print(
        "%d peaks overlap with both pooled pseudoreplicates"
        % (common.count_lines(overlap_pr_fn)))

    # Combine peak lists
    out, err = common.run_pipe([
        'cat %s %s' % (overlap_tr_fn, overlap_pr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap with true replicates or with pooled pseudoreplicates"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        pool_ta_filename, pool_xcor_filename, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen, reads_in_peaks_fn=reads_in_peaks_fn)
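    # common.frip is assumed to implement the standard definition
    # FRiP = n_reads_in_peaks / n_reads, i.e. the fraction of tags falling
    # within the final peak set; e.g. 2.0e6 of 2.0e7 tags in peaks gives
    # frip_score = 0.1 (illustrative numbers only).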

    # count peaks
    npeaks_in        = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out       = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected  = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks       = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb    = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks          = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb       = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
def internal_pseudoreplicate_overlap(rep1_peaks, rep2_peaks, pooled_peaks,
                                     rep1_ta, rep1_xcor,
                                     paired_end, chrom_sizes, as_file,
                                     peak_type, prefix, fragment_length=None):

    rep1_peaks_file      = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file      = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file    = dxpy.DXFile(pooled_peaks)
    rep1_ta_file         = dxpy.DXFile(rep1_ta)
    rep1_xcor_file       = dxpy.DXFile(rep1_xcor)
    chrom_sizes_file     = dxpy.DXFile(chrom_sizes)
    as_file_file         = dxpy.DXFile(as_file)

    # Input filenames - necessary to define each explicitly because input files
    # could have the same name, in which case subsequent
    # file would overwrite previous file
    rep1_peaks_fn      = 'rep1-%s' % (rep1_peaks_file.name)
    rep2_peaks_fn      = 'rep2-%s' % (rep2_peaks_file.name)
    pooled_peaks_fn    = 'pooled-%s' % (pooled_peaks_file.name)
    rep1_ta_fn         = 'r1ta_%s' % (rep1_ta_file.name)
    rep1_xcor_fn       = 'r1xc_%s' % (rep1_xcor_file.name)
    chrom_sizes_fn     = 'chrom.sizes'
    as_file_fn         = '%s.as' % (peak_type)

    # Output filenames
    if prefix:
        basename = prefix
    else:
        # strip off the peak and compression extensions
        m = re.match(
            r'(.*)(\.%s)+(\.((gz)|(Z)|(bz)|(bz2)))' % (peak_type),
            pooled_peaks_file.name)
        if m:
            basename = m.group(1)
        else:
            basename = pooled_peaks_file.name
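    # Example of the basename extraction (hypothetical filename): for a
    # narrowPeak input named 'pooled-sample.narrowPeak.gz' the pattern
    # captures 'pooled-sample' in group(1); names without the expected
    # extensions fall through and are used verbatim.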

    overlapping_peaks_fn    = '%s.replicated.%s' % (basename, peak_type)
    overlapping_peaks_bb_fn = overlapping_peaks_fn + '.bb'
    rejected_peaks_fn       = '%s.rejected.%s' % (basename, peak_type)
    rejected_peaks_bb_fn    = rejected_peaks_fn + '.bb'

    # Intermediate filenames
    overlap_tr_fn = 'replicated_tr.%s' % (peak_type)
    overlap_pr_fn = 'replicated_pr.%s' % (peak_type)

    # Download file inputs to the local file system with local filenames

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_fn)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_fn)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_fn)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_fn)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_fn)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_fn)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_fn)

    logger.info(subprocess.check_output('set -x; ls -l', shell=True))

    # the only difference between the peak_types is how the extra columns are
    # handled
    if peak_type == "narrowPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-10'
        bed_type = 'bed6+4'
    elif peak_type == "gappedPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-15'
        bed_type = 'bed12+3'
    elif peak_type == "broadPeak":
        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
        cut_command = 'cut -f 1-9'
        bed_type = 'bed6+3'
    else:
        assert peak_type in ['narrowPeak', 'gappedPeak', 'broadPeak'], \
            "%s is unrecognized. peak_type should be narrowPeak, gappedPeak or broadPeak." % (peak_type)

    # Find pooled peaks that overlap both self-pseudoreplicates, where overlap
    # is defined as fractional overlap (w.r.t. either peak in the overlapping
    # pair) > 0.5
    out, err = common.run_pipe([
        'intersectBed -wo -a %s -b %s' % (pooled_peaks_fn, rep1_peaks_fn),
        awk_command,
        cut_command,
        'sort -u',
        'intersectBed -wo -a stdin -b %s' % (rep2_peaks_fn),
        awk_command,
        cut_command,
        'sort -u'
        ], overlap_tr_fn)
    print(
        "%d peaks overlap with both self-pseudoreplicates"
        % (common.count_lines(overlap_tr_fn)))

    # This is a simplicate analysis, so the overlapping peaks are based only
    # on the self-pseudoreplicates of the single true replicate.
    out, err = common.run_pipe([
        'cat %s' % (overlap_tr_fn),
        'sort -u'
        ], overlapping_peaks_fn)
    print(
        "%d peaks overlap"
        % (common.count_lines(overlapping_peaks_fn)))

    # rejected peaks
    out, err = common.run_pipe([
        'intersectBed -wa -v -a %s -b %s' % (pooled_peaks_fn, overlapping_peaks_fn)
        ], rejected_peaks_fn)
    print("%d peaks were rejected" % (common.count_lines(rejected_peaks_fn)))

    # calculate FRiP (Fraction of Reads in Peaks)

    # Extract the fragment length estimate from column 3 of the
    # cross-correlation scores file or use the user-defined
    # fragment_length if given.
    if fragment_length is not None:
        fraglen = fragment_length
        fragment_length_given_by_user = True
    else:
        fraglen = common.xcor_fraglen(rep1_xcor_fn)
        fragment_length_given_by_user = False

    # FRiP
    reads_in_peaks_fn = 'reads_in_%s.ta' % (peak_type)
    n_reads, n_reads_in_peaks, frip_score = common.frip(
        rep1_ta_fn, rep1_xcor_fn, overlapping_peaks_fn,
        chrom_sizes_fn, fraglen,
        reads_in_peaks_fn=reads_in_peaks_fn)

    # count peaks
    npeaks_in = common.count_lines(common.uncompress(pooled_peaks_fn))
    npeaks_out = common.count_lines(overlapping_peaks_fn)
    npeaks_rejected = common.count_lines(rejected_peaks_fn)

    # make bigBed files for visualization
    overlapping_peaks_bb_fn = common.bed2bb(
        overlapping_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)
    rejected_peaks_bb_fn = common.bed2bb(
        rejected_peaks_fn, chrom_sizes_fn, as_file_fn, bed_type=bed_type)

    # Upload file outputs from the local file system.

    overlapping_peaks = dxpy.upload_local_file(common.compress(overlapping_peaks_fn))
    overlapping_peaks_bb = dxpy.upload_local_file(overlapping_peaks_bb_fn)
    rejected_peaks = dxpy.upload_local_file(common.compress(rejected_peaks_fn))
    rejected_peaks_bb = dxpy.upload_local_file(rejected_peaks_bb_fn)

    output = {
        "overlapping_peaks"     : dxpy.dxlink(overlapping_peaks),
        "overlapping_peaks_bb"  : dxpy.dxlink(overlapping_peaks_bb),
        "rejected_peaks"        : dxpy.dxlink(rejected_peaks),
        "rejected_peaks_bb"     : dxpy.dxlink(rejected_peaks_bb),
        "npeaks_in"             : npeaks_in,
        "npeaks_out"            : npeaks_out,
        "npeaks_rejected"       : npeaks_rejected,
        "frip_nreads"           : n_reads,
        "frip_nreads_in_peaks"  : n_reads_in_peaks,
        "frip_score"            : frip_score,
        "fragment_length_used"  : fraglen,
        "fragment_length_given_by_user": fragment_length_given_by_user
    }

    return output
Exemple #29
def main(rep1_ta,
         ctl1_ta,
         rep1_xcor,
         rep1_paired_end,
         chrom_sizes,
         genomesize,
         narrowpeak_as,
         gappedpeak_as,
         broadpeak_as,
         rep2_ta=None,
         ctl2_ta=None,
         rep2_xcor=None,
         rep2_paired_end=None,
         fragment_length=None):
    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info(
            "No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info(
            "Rep1 and rep2 tags specified so processing as a replicated experiment."
        )

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name

    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # the pool applet is also needed for pooled controls and pooled
    # pseudoreplicates, so look it up unconditionally
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    if not simplicate_experiment:
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info(
                "Using one control for both replicate 1 and 2 and for the pool."
            )
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {
                "inputs": [ctl1_ta, ctl2_ta],
                "prefix": "PL_ctls"
            },
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                )
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                )
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default.  May be changed later.
                rep2_control = ctl2_ta  # default.  May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"
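    # Worked example of the control-selection policy above (illustrative
    # counts): ntags_ctl1=3.0e7 and ntags_ctl2=2.0e7 gives ratio 1.5, which
    # exceeds the 1.2 cutoff, so both replicates are compared against the
    # pooled controls.  With 2.2e7 vs 2.0e7 (ratio 1.1), each replicate keeps
    # its own control unless that control has fewer tags than the matching
    # experiment replicate.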

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pseudoreplicator',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    if not simplicate_experiment:
        rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})

        pool_pr1_subjob = pool_applet.run({
            "inputs": [
                rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_pr_subjob.get_output_ref("pseudoreplicate1")
            ],
            "prefix":
            'PPR1'
        })
        pool_pr2_subjob = pool_applet.run({
            "inputs": [
                rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_pr_subjob.get_output_ref("pseudoreplicate2")
            ],
            "prefix":
            'PPR2'
        })

    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }
    # if the fragment_length argument is given, update macs2 input
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args)

    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control,
        rep1_xcor, **common_args)

    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control,
        rep1_xcor, **common_args)

    if not simplicate_experiment:
        common_args.update({'prefix': 'r2'})
        rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor,
                                  **common_args)

        common_args.update({'prefix': 'r2pr1'})
        rep2pr1_peaks_subjob = macs2(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_control,
            rep2_xcor, **common_args)

        common_args.update({'prefix': 'r2pr2'})
        rep2pr2_peaks_subjob = macs2(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_control,
            rep2_xcor, **common_args)

        common_args.update({'prefix': 'pool'})
        pooled_peaks_subjob = macs2(
            pooled_replicates, control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)

        common_args.update({'prefix': 'ppr1'})
        pooledpr1_peaks_subjob = macs2(
            pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)

        common_args.update({'prefix': 'ppr2'})
        pooledpr2_peaks_subjob = macs2(
            pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            **common_args)

    output = {
        'rep1_narrowpeaks':
        rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks':
        rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks':
        rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb':
        rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb':
        rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb':
        rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal':
        rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal':
        rep1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr1_narrowpeaks':
        rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks':
        rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks':
        rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal':
        rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal':
        rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr2_narrowpeaks':
        rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks':
        rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks':
        rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal':
        rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal':
        rep1pr2_peaks_subjob.get_output_ref("pvalue_signal")
    }

    if not simplicate_experiment:
        output.update({
            'rep2_narrowpeaks':
            rep2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2_gappedpeaks':
            rep2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2_broadpeaks':
            rep2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2_narrowpeaks_bb':
            rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'rep2_gappedpeaks_bb':
            rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'rep2_broadpeaks_bb':
            rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'rep2_fc_signal':
            rep2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2_pvalue_signal':
            rep2_peaks_subjob.get_output_ref("pvalue_signal"),
            'rep2pr1_narrowpeaks':
            rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr1_gappedpeaks':
            rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr1_broadpeaks':
            rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr1_fc_signal':
            rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr1_pvalue_signal':
            rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),
            'rep2pr2_narrowpeaks':
            rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'rep2pr2_gappedpeaks':
            rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'rep2pr2_broadpeaks':
            rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
            'rep2pr2_fc_signal':
            rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
            'rep2pr2_pvalue_signal':
            rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooled_narrowpeaks':
            pooled_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooled_gappedpeaks':
            pooled_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooled_broadpeaks':
            pooled_peaks_subjob.get_output_ref("broadpeaks"),
            'pooled_narrowpeaks_bb':
            pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
            'pooled_gappedpeaks_bb':
            pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
            'pooled_broadpeaks_bb':
            pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
            'pooled_fc_signal':
            pooled_peaks_subjob.get_output_ref("fc_signal"),
            'pooled_pvalue_signal':
            pooled_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooledpr1_narrowpeaks':
            pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr1_gappedpeaks':
            pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr1_broadpeaks':
            pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr1_fc_signal':
            pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr1_pvalue_signal':
            pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),
            'pooledpr2_narrowpeaks':
            pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
            'pooledpr2_gappedpeaks':
            pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
            'pooledpr2_broadpeaks':
            pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
            'pooledpr2_fc_signal':
            pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
            'pooledpr2_pvalue_signal':
            pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
        })

    return output
def main(rep1_ta, ctl1_ta, rep1_xcor, rep1_paired_end,
         npeaks, nodups,  chrom_sizes, spp_version,
         rep2_ta=None, ctl2_ta=None, rep2_xcor=None, rep2_paired_end=None,
         as_file=None, idr_peaks=False, fragment_length=None, spp_instance=None):

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info("No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info("Rep1 and rep2 tags specified so processing as a replicated experiment.")

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name

    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend(
        [(ntags_ctl1, 'control 1', ctl1_ta_filename),
         (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    # the pool applet is also needed for pooled controls and pooled
    # pseudoreplicates, so look it up unconditionally
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    if not simplicate_experiment:
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                spp_version,
                name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info("Using one control for both replicate 1 and 2 and for the pool.")
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
        pool_controls_subjob = pool_applet.run(
            {"inputs": [ctl1_ta, ctl2_ta],
             "prefix": "PL_ctls"},
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment "
                    "replicate 1.  Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment "
                    "replicate 2.  Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info(
                    "Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default.  May be changed later.
                rep2_control = ctl2_ta  # default.  May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    common_args = {
        'chrom_sizes': chrom_sizes,
        'spp_version': spp_version,
        'as_file':     as_file,
        'spp_instance': spp_instance
        }
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})
    rep1_peaks_subjob = spp(
        rep1_ta,
        rep1_control,
        rep1_xcor,
        bigbed=True,
        name='Rep1 peaks vs %s' % (rep1_ctl_msg),
        prefix='R1', **common_args)

    if not simplicate_experiment:
        rep2_peaks_subjob = spp(
            rep2_ta,
            rep2_control,
            rep2_xcor,
            bigbed=True,
            name='Rep2 peaks vs %s' % (rep2_ctl_msg),
            prefix='R2', **common_args)

        pooled_peaks_subjob = spp(
            pooled_replicates,
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            bigbed=True,
            name='Pooled peaks vs %s' % (pool_ctl_msg),
            prefix='PL', **common_args)

    output = {
        'rep1_peaks':       rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb':    rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot':   rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores")
    }

    if not simplicate_experiment:
        output.update({
            'rep2_peaks':       rep2_peaks_subjob.get_output_ref("peaks"),
            'rep2_peaks_bb':    rep2_peaks_subjob.get_output_ref("peaks_bb"),
            'rep2_xcor_plot':   rep2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),

            'pooled_peaks':       pooled_peaks_subjob.get_output_ref("peaks"),
            'pooled_peaks_bb':    pooled_peaks_subjob.get_output_ref("peaks_bb"),
            'pooled_xcor_plot':   pooled_peaks_subjob.get_output_ref("xcor_plot"),
            'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
        })

    if idr_peaks:  # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = \
            dxpy.find_one_data_object(
               classname='applet',
               name='pseudoreplicator',
               project=dxpy.PROJECT_CONTEXT_ID,
               zero_ok=False,
               more_ok=False,
               return_handler=True)

        rep1_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep1_ta,
                 "prefix": 'R1PR'},
                name='Pseudoreplicate rep1 -> R1PR1,2')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR1', **common_args)

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR2', **common_args)

        output.update({
            'rep1pr1_peaks':         rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_peaks':         rep1pr2_peaks_subjob.get_output_ref("peaks")
            })

        if not simplicate_experiment:
            rep2_pr_subjob = \
                pseudoreplicator_applet.run(
                    {"input_tags": rep2_ta,
                     "prefix": 'R2PR'},
                    name='Pseudoreplicate rep2 -> R2PR1,2')

            pool_pr1_subjob = pool_applet.run({
                "inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
                "prefix": 'PPR1'},
                name='Pool R1PR1+R2PR1 -> PPR1')

            pool_pr2_subjob = pool_applet.run({
                "inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
                "prefix": 'PPR2'},
                name='Pool R1PR2+R2PR2 -> PPR2')

            rep2pr1_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR1', **common_args)

            rep2pr2_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR2', **common_args)

            pooledpr1_peaks_subjob = spp(
                pool_pr1_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR1 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR1', **common_args)

            pooledpr2_peaks_subjob = spp(
                pool_pr2_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR2 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR2', **common_args)

            output.update({
                'rep2pr1_peaks':         rep2pr1_peaks_subjob.get_output_ref("peaks"),
                'rep2pr2_peaks':         rep2pr2_peaks_subjob.get_output_ref("peaks"),
                'pooledpr1_peaks':       pooledpr1_peaks_subjob.get_output_ref("peaks"),
                'pooledpr2_peaks':       pooledpr2_peaks_subjob.get_output_ref("peaks"),
            })

    return output
Exemple #31
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed,
         chrom_sizes, spp_version, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)

    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)

    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(
        xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        # strip the .tagAlign / .gz suffixes (str.rstrip strips a character
        # set, not a suffix, so it cannot be used safely here)
        output_filename_prefix = experiment_filename
        for ext in ('.gz', '.tagAlign'):
            if output_filename_prefix.endswith(ext):
                output_filename_prefix = output_filename_prefix[:-len(ext)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    # spp adds .gz, so this is the file name that's actually created
    final_peaks_filename = peaks_filename + '.gz'
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    logger.info(subprocess.check_output(
        'ls -l', shell=True, stderr=subprocess.STDOUT))

    # third column in the cross-correlation scores input file
    fraglen_column = 3
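    # The scores file is assumed to follow the phantompeakqualtools layout,
    # one tab-delimited record per tagAlign:
    #   Filename <tab> numReads <tab> estFragLen <tab> ...
    # e.g. a line 'rep1.tagAlign.gz\t20000000\t195\t...' yields a fragment
    # length of 195.  (A single estFragLen value is assumed; a comma-separated
    # candidate list would make int() fail.)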
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        logger.info("Read fragment length: %d" % (fragment_length))

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    spp_command = (
        "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s"
        % (run_spp, cpu_count(), experiment_filename, control_filename, npeaks,
           fragment_length, peaks_filename, xcor_plot_filename,
           xcor_scores_filename))
    logger.info(spp_command)
    subprocess.check_call(shlex.split(spp_command))

    # When one of the peak coordinates is an exact multiple of 10, spp (R)
    # outputs the coordinate in scientific notation; this changes any such
    # coordinates to decimal notation.
    # This assumes 10-column output and that the 2nd and 3rd columns are
    # coordinates.
    # The (($2>0)?$2:0) is needed because spp sometimes calls peaks with a
    # negative start coordinate (particularly on chrM), which would cause
    # slopBed to halt at that line, truncating the output of the pipe.
    # slopBed adjusts feature end coordinates that run off the end of the
    # chromosome.
    # bedClip removes any features that are still not within the boundaries
    # of the chromosome.

    fix_coordinate_peaks_filename = \
        output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" % (final_peaks_filename),
        "tee %s" % (peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",($2>0)?$2:0),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' % (chrom_sizes_filename),
        'bedClip stdin %s %s' % (chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])
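    # Illustration of the coordinate fix-up above: spp can emit a start such
    # as '1e+06', which awk's sprintf("%i", ...) rewrites as '1000000'; a
    # negative start (seen on chrM) is clamped to 0; slopBed and bedClip then
    # trim or drop any feature still outside the chromosome bounds.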

    # These lines transfer the peaks files to the temporary workspace for
    # debugging later
    # Only at the end are the final files uploaded that will be returned from
    # the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    logger.info("%s peaks called by spp" % (n_spp_peaks))
    logger.info(
        "%s of those peaks removed due to bad coordinates"
        % (n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename)))
    print("First 50 peaks")
    print(subprocess.check_output(
        'head -50 %s' % (fix_coordinate_peaks_filename),
        shell=True))

    if bigbed:
        peaks_bb_filename = \
            common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename, fix_coordinate_peaks_filename):
        logger.info("Returning peaks with fixed coordinates")
        subprocess.check_call(shlex.split('gzip -n %s' % (fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    subprocess.check_call('ls -l', shell=True)
    # print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    # print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
Exemple #32
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor,
         rep1_paired_end, rep2_paired_end, chrom_sizes, genomesize,
         narrowpeak_as, gappedpeak_as, broadpeak_as):

    assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
    paired_end = rep1_paired_end

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    # not necessary to actually download these - just pass through
    # rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    # rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    # not necessary to actually download these - just pass through
    # dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    # dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name
    # not necessary to actually download these - just pass through
    # rep1_xcor_filename = rep1_xcor_file.name
    # rep2_xcor_filename = rep2_xcor_file.name

    ntags_rep1 = common.count_lines(rep1_ta_filename)
    ntags_rep2 = common.count_lines(rep2_ta_filename)
    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)

    for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename),
                              (ntags_rep2, 'replicate 2', rep2_ta_filename),
                              (ntags_ctl1, 'control 1', ctl1_ta_filename),
                              (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    logger.info(
        subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT))

    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    pool_replicates_subjob = pool_applet.run({
        "inputs": [rep1_ta, rep2_ta],
        "prefix": "PL_reps"})
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")

    rep1_control = ctl1_ta  # default.  May be changed later.
    rep2_control = ctl2_ta  # default.  May be changed later.

    if unary_control:
        logger.info("Only one control supplied.  Using it for both replicate 1 and 2 and for the pool.")
        control_for_pool = rep1_control
    else:
        pool_controls_subjob = pool_applet.run({
            "inputs": [ctl1_ta, ctl2_ta],
            "prefix": "PL_ctls"})
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls

        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment "
                    "replicate 1.  Using pooled controls for replicate 1.")
                rep1_control = pooled_controls
            elif ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment "
                    "replicate 2.  Using pooled controls for replicate 2.")
                rep2_control = pooled_controls
            else:
                logger.info(
                    "Using distinct controls for replicate 1 and 2.")

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pseudoreplicator',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})

    pool_pr1_subjob = pool_applet.run(
        {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
         "prefix": 'PPR1'})
    pool_pr2_subjob = pool_applet.run(
        {"inputs": [rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
         "prefix": 'PPR2'})
    pooled_replicates_xcor_subjob = xcor_only(pooled_replicates, paired_end)

    # no longer calculated - now we take the cross-correlation metrics for the
    # pseudoreplicates as those from the true reps
    # rep1_pr1_xcor_subjob = \
    #     xcor_only(rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
    # rep1_pr2_xcor_subjob = \
    #     xcor_only(rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
    # rep2_pr1_xcor_subjob = \
    #     xcor_only(rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
    # rep2_pr2_xcor_subjob = \
    #     xcor_only(rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
    # pool_pr1_xcor_subjob = \
    #     xcor_only(pool_pr1_subjob.get_output_ref("pooled"), paired_end)
    # pool_pr2_xcor_subjob = \
    #     xcor_only(pool_pr2_subjob.get_output_ref("pooled"), paired_end)

    common_args = {
        'chrom_sizes':      chrom_sizes,
        'genomesize':       genomesize,
        'narrowpeak_as':    narrowpeak_as,
        'gappedpeak_as':    gappedpeak_as,
        'broadpeak_as':     broadpeak_as
        }

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob      = macs2( rep1_ta,
                                    rep1_control,
                                    rep1_xcor, **common_args)

    common_args.update({'prefix': 'r2'})
    rep2_peaks_subjob      = macs2( rep2_ta,
                                    rep2_control,
                                    rep2_xcor, **common_args)

    common_args.update({'prefix': 'pool'})
    pooled_peaks_subjob    = macs2( pooled_replicates,
                                    control_for_pool,
                                    pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob   = macs2( rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                                    rep1_control,
                                    rep1_xcor, **common_args)

    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob   = macs2( rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                                    rep1_control,
                                    rep1_xcor, **common_args)

    common_args.update({'prefix': 'r2pr1'})
    rep2pr1_peaks_subjob   = macs2( rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                                    rep2_control,
                                    rep2_xcor, **common_args)

    common_args.update({'prefix': 'r2pr2'})
    rep2pr2_peaks_subjob   = macs2( rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                                    rep2_control,
                                    rep2_xcor, **common_args)

    common_args.update({'prefix': 'ppr1'})
    pooledpr1_peaks_subjob = macs2( pool_pr1_subjob.get_output_ref("pooled"),
                                    control_for_pool,
                                    pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'ppr2'})
    pooledpr2_peaks_subjob = macs2( pool_pr2_subjob.get_output_ref("pooled"),
                                    control_for_pool,
                                    pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    output = {
        'rep1_narrowpeaks':         rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks':         rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks':          rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb':      rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb':      rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb':       rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal':           rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal':       rep1_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep2_narrowpeaks':         rep2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2_gappedpeaks':         rep2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2_broadpeaks':          rep2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2_narrowpeaks_bb':      rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep2_gappedpeaks_bb':      rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep2_broadpeaks_bb':       rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep2_fc_signal':           rep2_peaks_subjob.get_output_ref("fc_signal"),
        'rep2_pvalue_signal':       rep2_peaks_subjob.get_output_ref("pvalue_signal"),

        'pooled_narrowpeaks':       pooled_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooled_gappedpeaks':       pooled_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooled_broadpeaks':        pooled_peaks_subjob.get_output_ref("broadpeaks"),
        'pooled_narrowpeaks_bb':    pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'pooled_gappedpeaks_bb':    pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'pooled_broadpeaks_bb':     pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'pooled_fc_signal':         pooled_peaks_subjob.get_output_ref("fc_signal"),
        'pooled_pvalue_signal':     pooled_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep1pr1_narrowpeaks':      rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks':      rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks':       rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal':        rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal':    rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep1pr2_narrowpeaks':      rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks':      rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks':       rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal':        rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal':    rep1pr2_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep2pr1_narrowpeaks':      rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2pr1_gappedpeaks':      rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2pr1_broadpeaks':       rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2pr1_fc_signal':        rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep2pr1_pvalue_signal':    rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),

        'rep2pr2_narrowpeaks':      rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2pr2_gappedpeaks':      rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2pr2_broadpeaks':       rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2pr2_fc_signal':        rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep2pr2_pvalue_signal':    rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),

        'pooledpr1_narrowpeaks':    pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooledpr1_gappedpeaks':    pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooledpr1_broadpeaks':     pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
        'pooledpr1_fc_signal':      pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
        'pooledpr1_pvalue_signal':  pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),

        'pooledpr2_narrowpeaks':    pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooledpr2_gappedpeaks':    pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooledpr2_broadpeaks':     pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
        'pooledpr2_fc_signal':      pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
        'pooledpr2_pvalue_signal':  pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
    }

    return output
def main(experiment, reps_peaks, r1pr_peaks, r2pr_peaks, pooledpr_peaks, chrom_sizes, as_file, blacklist=None):

	# TODO: for now this just takes the peak files.  This applet should
	# actually call IDR itself instead of leaving that to the workflow
	# populator script.

	# Initialize the data object inputs on the platform into
	# dxpy.DXDataObject instances.

	reps_peaks_file = dxpy.DXFile(reps_peaks)
	r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
	r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
	pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
	chrom_sizes_file = dxpy.DXFile(chrom_sizes)
	as_file_file = dxpy.DXFile(as_file)
	if blacklist is not None:
		blacklist_file = dxpy.DXFile(blacklist)
		blacklist_filename = 'blacklist_%s' %(blacklist_file.name)
		dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
		blacklist_filename = common.uncompress(blacklist_filename)

	# Download the file inputs to the local file system.

	#Need to prepend something to ensure the local filenames will be unique
	reps_peaks_filename = 'true_%s' %(reps_peaks_file.name)
	r1pr_peaks_filename = 'r1pr_%s' %(r1pr_peaks_file.name)
	r2pr_peaks_filename = 'r2pr_%s' %(r2pr_peaks_file.name)
	pooledpr_peaks_filename = 'pooledpr_%s' %(pooledpr_peaks_file.name)
	chrom_sizes_filename = chrom_sizes_file.name
	as_file_filename = as_file_file.name

	dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
	dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
	dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
	dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
	dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
	dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

	print subprocess.check_output('ls -l', shell=True)

	reps_peaks_filename = common.uncompress(reps_peaks_filename)
	r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
	r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
	pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

	Nt = common.count_lines(reps_peaks_filename)
	print "%d peaks from true replicates" %(Nt)
	N1 = common.count_lines(r1pr_peaks_filename)
	print "%d peaks from rep1 self-pseudoreplicates" %(N1)
	N2 = common.count_lines(r2pr_peaks_filename)
	print "%d peaks from rep2 self-pseudoreplicates" %(N2)
	Np = common.count_lines(pooledpr_peaks_filename)
	print "%d peaks from pooled pseudoreplicates" %(Np)

	conservative_set_filename = '%s_final_conservative.narrowPeak' %(experiment)
	if blacklist is not None:
		blacklist_filter(reps_peaks_filename, conservative_set_filename, blacklist_filename)
	else:
		conservative_set_filename = reps_peaks_filename
	Ncb = common.count_lines(conservative_set_filename)
	print "%d peaks blacklisted from the conservative set" %(Nt-Ncb)

	if Nt >= Np:
		peaks_to_filter_filename = reps_peaks_filename
		No = Nt
	else:
		peaks_to_filter_filename = pooledpr_peaks_filename
		No = Np

	optimal_set_filename = '%s_final_optimal.narrowPeak' %(experiment)
	if blacklist is not None:
		blacklist_filter(peaks_to_filter_filename, optimal_set_filename, blacklist_filename)
	else:
		optimal_set_filename = peaks_to_filter_filename
	Nob = common.count_lines(optimal_set_filename)
	print "%d peaks blacklisted from the optimal set" %(No-Nob)

	rescue_ratio            = float(max(Np,Nt)) / float(min(Np,Nt))
	self_consistency_ratio  = float(max(N1,N2)) / float(min(N1,N2))

	if rescue_ratio > 2 and self_consistency_ratio > 2:
		reproducibility = 'fail'
	elif rescue_ratio > 2 or self_consistency_ratio > 2:
		reproducibility = 'borderline'
	else:
		reproducibility = 'pass'
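
	# Worked example of the criteria above: with Nt=20000, Np=30000,
	# N1=18000 and N2=8000,
	#   rescue_ratio           = 30000.0 / 20000 = 1.5   (<= 2)
	#   self_consistency_ratio = 18000.0 / 8000  = 2.25  (> 2)
	# Exactly one ratio exceeds 2, so reproducibility = 'borderline'.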

	output = {}

	#bedToBigBed often fails, so skip creating the bigBed if it does
	conservative_set_bb_filename = common.bed2bb(conservative_set_filename, chrom_sizes_filename, as_file_filename)
	optimal_set_bb_filename = common.bed2bb(optimal_set_filename, chrom_sizes_filename, as_file_filename)
	if conservative_set_bb_filename:
		conservative_set_bb_output = dxpy.upload_local_file(conservative_set_bb_filename)
		output.update({"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
	if optimal_set_bb_filename:
		optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
		output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

	output.update({
		"Nt": Nt,
		"N1": N1,
		"N2": N2,
		"Np": Np,
		"conservative_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(conservative_set_filename))),
		"optimal_set": dxpy.dxlink(dxpy.upload_local_file(common.compress(optimal_set_filename))),
		"rescue_ratio": rescue_ratio,
		"self_consistency_ratio": self_consistency_ratio,
		"reproducibility_test": reproducibility,
		"No": Nob,
		"Nc": Ncb
	})

	logging.info("Exiting with output: %s", output)
	return output

        def process(self):
                '''
                Earlier version of the overlap logic, superseded by the
                intersectBed -wo pipelines below:

                #find pooled peaks that are in (rep1 AND rep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, rep1_peaks_fn),
                        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(rep2_peaks_fn)
                        ], overlap_tr_fn)
                print "%d peaks overlap with both true replicates" %(common.count_lines(overlap_tr_fn))

                #pooled peaks that are in (pooledpseudorep1 AND pooledpseudorep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -f 0.50 -r -a %s -b %s' %(pooled_peaks_fn, pooledpr1_peaks_fn),
                        'intersectBed -wa -f 0.50 -r -a stdin -b %s' %(pooledpr2_peaks_fn)
                        ], overlap_pr_fn)
                print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(overlap_pr_fn))

                #combined pooled peaks in (rep1 AND rep2) OR (pooledpseudorep1 AND pooledpseudorep2)
                out, err = common.run_pipe([
                        'intersectBed -wa -a %s -b %s %s' %(pooled_peaks_fn, overlap_tr_fn, overlap_pr_fn),
                        'intersectBed -wa -u -a %s -b stdin' %(pooled_peaks_fn)
                        ], overlapping_peaks_fn)
                print "%d peaks overall with true replicates or with pooled pseudorepliates" %(common.count_lines(overlapping_peaks_fn))
                '''
                #the only difference between the peak_types is how the extra columns are handled
                if self.peak_type == "narrowPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$13-$12; if (($21/s1 >= 0.5) || ($21/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-10'
                        bed_type = 'bed6+4'
                elif self.peak_type == "gappedPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$18-$17; if (($31/s1 >= 0.5) || ($31/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-15'
                        bed_type = 'bed12+3'
                elif self.peak_type == "broadPeak":
                        awk_command = r"""awk 'BEGIN{FS="\t";OFS="\t"}{s1=$3-$2; s2=$12-$11; if (($19/s1 >= 0.5) || ($19/s2 >= 0.5)) {print $0}}'"""
                        cut_command = 'cut -f 1-9'
                        bed_type = 'bed6+3'
                else:
                        print "%s is unrecognized.  peak_type should be narrowPeak, gappedPeak or broadPeak."
                        sys.exit()
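
                # The column numbers in the awk commands above come from the
                # 'intersectBed -wo' output layout: all of A's columns, then all
                # of B's, then the overlap length in bp as the last column.  For
                # narrowPeak (10 columns per file), B's chromStart/chromEnd are
                # $12/$13 and the overlap is $21, so each awk keeps pairs where
                # the overlap covers >= 50% of either peak.  The same test for a
                # single A/B pair, as a hypothetical sketch:
                #
                #   def half_overlap(a_start, a_end, b_start, b_end, overlap_bp):
                #       return (overlap_bp >= 0.5 * (a_end - a_start) or
                #               overlap_bp >= 0.5 * (b_end - b_start))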

                # Find pooled peaks that overlap Rep1 and Rep2, where overlap means >= 50% of the length of either peak
                out, err = common.run_pipe([
                        'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.rep1_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u',
                        'intersectBed -wo -a stdin -b %s' %(self.rep2_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u'
                        ], self.overlap_tr_fn)
                print "%d peaks overlap with both true replicates" %(common.count_lines(self.overlap_tr_fn))

                # Find pooled peaks that overlap PseudoRep1 and PseudoRep2, where overlap means >= 50% of the length of either peak
                out, err = common.run_pipe([
                        'intersectBed -wo -a %s -b %s' %(self.pooled_peaks_fn, self.pooledpr1_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u',
                        'intersectBed -wo -a stdin -b %s' %(self.pooledpr2_peaks_fn),
                        awk_command,
                        cut_command,
                        'sort -u'
                        ], self.overlap_pr_fn)
                print "%d peaks overlap with both pooled pseudoreplicates" %(common.count_lines(self.overlap_pr_fn))

                # Combine peak lists
                out, err = common.run_pipe([
                        'cat %s %s' %(self.overlap_tr_fn, self.overlap_pr_fn),
                        'sort -u'
                        ], self.overlapping_peaks_fn)
                print "%d peaks overlap with true replicates or with pooled pseudorepliates" %(common.count_lines(self.overlapping_peaks_fn))

                #rejected peaks
                out, err = common.run_pipe([
                        'intersectBed -wa -v -a %s -b %s' %(self.pooled_peaks_fn, self.overlapping_peaks_fn)
                        ], self.rejected_peaks_fn)
                print "%d peaks were rejected" %(common.count_lines(self.rejected_peaks_fn))

                self.npeaks_in       = common.count_lines(common.uncompress(self.pooled_peaks_fn))
                self.npeaks_out      = common.count_lines(self.overlapping_peaks_fn)
                self.npeaks_rejected = common.count_lines(self.rejected_peaks_fn)

                #make bigBed files for visualization
                self.overlapping_peaks_bb_fn = common.bed2bb(self.overlapping_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
                self.rejected_peaks_bb_fn    = common.bed2bb(self.rejected_peaks_fn, self.chrom_sizes_fn, self.as_file_fn, bed_type=bed_type)
Exemple #35
0
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor,
         npeaks, nodups, rep1_paired_end, rep2_paired_end, chrom_sizes,
         spp_version, as_file=None, idr_peaks=False):

        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported (yet)'
        paired_end = rep1_paired_end

        rep1_ta_file = dxpy.DXFile(rep1_ta)
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        unary_control = ctl1_ta == ctl2_ta
        ctl1_ta_file = dxpy.DXFile(ctl1_ta)
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        rep1_xcor_file = dxpy.DXFile(rep1_xcor)
        rep2_xcor_file = dxpy.DXFile(rep2_xcor)

        dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
        dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

        rep1_ta_filename = rep1_ta_file.name
        rep2_ta_filename = rep2_ta_file.name
        ctl1_ta_filename = ctl1_ta_file.name
        ctl2_ta_filename = ctl2_ta_file.name

        ntags_rep1 = common.count_lines(rep1_ta_filename)
        ntags_rep2 = common.count_lines(rep2_ta_filename)
        ntags_ctl1 = common.count_lines(ctl1_ta_filename)
        ntags_ctl2 = common.count_lines(ctl2_ta_filename)

        for n, name, filename in [
                    (ntags_rep1, 'replicate 1', rep1_ta_filename),
                    (ntags_rep2, 'replicate 2', rep2_ta_filename),
                    (ntags_ctl1, 'control 1', ctl1_ta_filename),
                    (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
            logger.info("Found %d tags in %s file %s" % (n, name, filename))

        subprocess.check_call('ls -l', shell=True)

        pool_applet = dxpy.find_one_data_object(
                classname='applet',
                name='pool',
                project=dxpy.PROJECT_CONTEXT_ID,
                zero_ok=False,
                more_ok=False,
                return_handler=True)
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                spp_version,
                name='Pool cross-correlation')

        rep1_control = ctl1_ta  # default.  May be changed later.
        rep1_ctl_msg = "control rep1"
        rep2_control = ctl2_ta  # default.  May be changed later.
        rep2_ctl_msg = "control rep2"

        if unary_control:
            logger.info("Only one control supplied.  Using it for both replicate 1 and 2 and for the pool.")
            control_for_pool = rep1_control
            pool_ctl_msg = "one control"
        else:
            pool_controls_subjob = \
                pool_applet.run(
                    {"inputs": [ctl1_ta, ctl2_ta],
                     "prefix": 'pooled_ctls'},
                    name='Pool controls')
            pooled_controls = pool_controls_subjob.get_output_ref("pooled")
            # always use the pooled controls for the pool
            control_for_pool = pooled_controls
            pool_ctl_msg = "pooled controls"

            # Use the pooled controls for the individual reps if the two
            # controls are depth-imbalanced, or if a control is shallower
            # than its own replicate
            ratio_ctl_reads = float(ntags_ctl1)/float(ntags_ctl2)
            if ratio_ctl_reads < 1:
                    ratio_ctl_reads = 1/ratio_ctl_reads
            ratio_cutoff = 1.2
            if ratio_ctl_reads > ratio_cutoff:
                    logger.info(
                        "Number of reads in controls differ by > factor of %f. Using pooled controls."
                        % (ratio_cutoff))
                    rep1_control = pooled_controls
                    rep1_ctl_msg = "pooled controls"
                    rep2_control = pooled_controls
                    rep2_ctl_msg = "pooled controls"
            else:
                    if ntags_ctl1 < ntags_rep1:
                            logger.info("Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1.")
                            rep1_control = pooled_controls
                            rep1_ctl_msg = "pooled controls"
                    elif ntags_ctl2 < ntags_rep2:
                            logger.info("Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2.")
                            rep2_control = pooled_controls
                            rep2_ctl_msg = "pooled controls"
                    else:
                        logger.info("Using distinct controls for replicate 1 and 2.")
                        rep1_ctl_msg = "control rep1"
                        rep2_ctl_msg = "control rep2"

        rep1_peaks_subjob = spp(
            rep1_ta,
            rep1_control,
            rep1_xcor,
            chrom_sizes,
            spp_version,
            bigbed=True,
            as_file=as_file,
            name='Rep1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1')

        rep2_peaks_subjob = spp(
            rep2_ta,
            rep2_control,
            rep2_xcor,
            chrom_sizes,
            spp_version,
            bigbed=True,
            as_file=as_file,
            name='Rep2 peaks vs %s' % (rep2_ctl_msg),
            prefix='R2')

        pooled_peaks_subjob = spp(
            pooled_replicates,
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes,
            spp_version,
            bigbed=True,
            as_file=as_file,
            name='Pooled peaks vs %s' % (pool_ctl_msg),
            prefix='PL')

        output = {
            'rep1_peaks':       rep1_peaks_subjob.get_output_ref("peaks"),
            'rep1_peaks_bb':    rep1_peaks_subjob.get_output_ref("peaks_bb"),
            'rep1_xcor_plot':   rep1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores"),

            'rep2_peaks':       rep2_peaks_subjob.get_output_ref("peaks"),
            'rep2_peaks_bb':    rep2_peaks_subjob.get_output_ref("peaks_bb"),
            'rep2_xcor_plot':   rep2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),

            'pooled_peaks':       pooled_peaks_subjob.get_output_ref("peaks"),
            'pooled_peaks_bb':    pooled_peaks_subjob.get_output_ref("peaks_bb"),
            'pooled_xcor_plot':   pooled_peaks_subjob.get_output_ref("xcor_plot"),
            'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
        }

        if idr_peaks:  # also call peaks on pseudoreplicates for IDR
            pseudoreplicator_applet = \
                dxpy.find_one_data_object(
                   classname='applet',
                   name='pseudoreplicator',
                   project=dxpy.PROJECT_CONTEXT_ID,
                   zero_ok=False,
                   more_ok=False,
                   return_handler=True)

            rep1_pr_subjob = \
                pseudoreplicator_applet.run(
                    {"input_tags": rep1_ta,
                     "prefix": 'R1PR'},
                    name='Pseudoreplicate rep1 -> R1PR1,2')
            rep2_pr_subjob = \
                pseudoreplicator_applet.run(
                    {"input_tags": rep2_ta,
                     "prefix": 'R2PR'},
                    name='Pseudoreplicate rep2 -> R2PR1,2')

            pool_pr1_subjob = pool_applet.run({
                "inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate1")],
                "prefix": 'PPR1'},
                name='Pool R1PR1+R2PR1 -> PPR1')

            pool_pr2_subjob = pool_applet.run({
                "inputs": [
                    rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                    rep2_pr_subjob.get_output_ref("pseudoreplicate2")],
                "prefix": 'PPR2'},
                name='Pool R1PR2+R2PR2 -> PPR2')

            # rep1_pr1_xcor_subjob = \
            #     xcor_only(
            #         rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            #         paired_end,
            #         spp_version,
            #         name='R1PR1 cross-correlation')
            # rep1_pr2_xcor_subjob = \
            #     xcor_only(
            #         rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            #         paired_end,
            #         spp_version,
            #         name='R1PR2 cross-correlation')
            # rep2_pr1_xcor_subjob = \
            #     xcor_only(
            #         rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            #         paired_end,
            #         spp_version,
            #         name='R2PR1 cross-correlation')
            # rep2_pr2_xcor_subjob = \
            #     xcor_only(
            #         rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            #         paired_end,
            #         spp_version,
            #         name='R2PR2 cross-correlation')
            # pool_pr1_xcor_subjob = \
            #     xcor_only(
            #         pool_pr1_subjob.get_output_ref("pooled"),
            #         paired_end,
            #         spp_version,
            #         name='PPR1 cross-correlation')
            # pool_pr2_xcor_subjob = \
            #     xcor_only(
            #         pool_pr2_subjob.get_output_ref("pooled"),
            #         paired_end,
            #         spp_version,
            #         name='PPR2 cross-correlation')

            rep1pr1_peaks_subjob = spp(
                rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep1_control,
                rep1_xcor,
                chrom_sizes,
                spp_version,
                bigbed=False,
                name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
                prefix='R1PR1')

            rep1pr2_peaks_subjob = spp(
                rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep1_control,
                rep1_xcor,
                chrom_sizes,
                spp_version,
                bigbed=False,
                name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
                prefix='R1PR2')

            rep2pr1_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_control,
                rep2_xcor,
                chrom_sizes,
                spp_version,
                bigbed=False,
                name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR1')

            rep2pr2_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_control,
                rep2_xcor,
                chrom_sizes,
                spp_version,
                bigbed=False,
                name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR2')

            pooledpr1_peaks_subjob = spp(
                pool_pr1_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                chrom_sizes,
                spp_version,
                bigbed=False,
                name='PPR1 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR1')

            pooledpr2_peaks_subjob = spp(
                pool_pr2_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                chrom_sizes,
                spp_version,
                bigbed=False,
                name='PPR2 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR2')

            output.update({
                'rep1pr1_peaks':         rep1pr1_peaks_subjob.get_output_ref("peaks"),
                # 'rep1pr1_xcor_plot':     rep1pr1_peaks_subjob.get_output_ref("xcor_plot"),
                # 'rep1pr1_xcor_scores':   rep1pr1_peaks_subjob.get_output_ref("xcor_scores"),
                'rep1pr2_peaks':         rep1pr2_peaks_subjob.get_output_ref("peaks"),
                # 'rep1pr2_xcor_plot':     rep1pr2_peaks_subjob.get_output_ref("xcor_plot"),
                # 'rep1pr2_xcor_scores':   rep1pr2_peaks_subjob.get_output_ref("xcor_scores"),
                'rep2pr1_peaks':         rep2pr1_peaks_subjob.get_output_ref("peaks"),
                # 'rep2pr1_xcor_plot':     rep2pr1_peaks_subjob.get_output_ref("xcor_plot"),
                # 'rep2pr1_xcor_scores':   rep2pr1_peaks_subjob.get_output_ref("xcor_scores"),
                'rep2pr2_peaks':         rep2pr2_peaks_subjob.get_output_ref("peaks"),
                # 'rep2pr2_xcor_plot':     rep2pr2_peaks_subjob.get_output_ref("xcor_plot"),
                # 'rep2pr2_xcor_scores':   rep2pr2_peaks_subjob.get_output_ref("xcor_scores"),
                'pooledpr1_peaks':       pooledpr1_peaks_subjob.get_output_ref("peaks"),
                # 'pooledpr1_xcor_plot':   pooledpr1_peaks_subjob.get_output_ref("xcor_plot"),
                # 'pooledpr1_xcor_scores': pooledpr1_peaks_subjob.get_output_ref("xcor_scores"),
                'pooledpr2_peaks':       pooledpr2_peaks_subjob.get_output_ref("peaks"),
                # 'pooledpr2_xcor_plot':   pooledpr2_peaks_subjob.get_output_ref("xcor_plot"),
                # 'pooledpr2_xcor_scores': pooledpr2_peaks_subjob.get_output_ref("xcor_scores"),
            })

        return output