Example #1
def main():
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    pid = project.get_id()

    counts = {}
    n = 0
    summaries = dxpy.find_data_objects(classname='file', folder='/runs', name='*_summary.txt', recurse=True, name_mode='glob', project=pid, return_handler=False)
    # find_data_objects returns a generator; iterate it directly
    for summary in summaries:
        flink = dxpy.dxlink(summary)
        n += 1
        fd = dxpy.describe(flink)
        fn = "fastqc/%s" % fd['name']
        if not os.path.isfile(fn):
            print 'Downloading: %s from %s' % (fn, fd['folder'])
            try:
                dxpy.download_dxfile(flink, fn)
            except Exception as e:
                print "Error %s" % e

        parse_summary(fn, counts)
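
parse_summary is not defined in this excerpt. A minimal sketch, assuming each FastQC summary.txt line has the usual "STATUS<TAB>Module<TAB>filename" layout and that counts tallies statuses per module:

def parse_summary(fn, counts):
    # Tally PASS/WARN/FAIL per FastQC module across all downloaded summaries.
    with open(fn) as fh:
        for line in fh:
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 2:
                continue
            status, module = parts[0], parts[1]
            counts.setdefault(module, {}).setdefault(status, 0)
            counts[module][status] += 1
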
def main(outfn, assembly, debug, key, keyfile, dryrun, force, analysis_ids=None, infile=None, project=None):

	if debug:
		logger.setLevel(logging.DEBUG)
	else:
		logger.setLevel(logging.INFO)

	if infile is not None:
		infile = dxpy.DXFile(infile)
		dxpy.download_dxfile(infile.get_id(), "infile")
		ids = [line.strip() for line in open("infile", 'r')]  # strip trailing newlines from the ids
	elif analysis_ids is not None:
		ids = analysis_ids
	else:
		logger.error("Must supply one of --infile or a list of one or more analysis-ids")
		return

	authid, authpw, server = common.processkey(key, keyfile)
	keypair = (authid,authpw)

	for (i, analysis_id) in enumerate(ids):
		logger.info('%s' %(analysis_id))
		accessioned_files = accession_analysis(analysis_id, keypair, server, assembly, dryrun, force)

	print accessioned_files
	common.touch(outfn)
	outfile = dxpy.upload_local_file(outfn)

	output = {}
	output["outfile"] = dxpy.dxlink(outfile)

	return output
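
common.processkey is not shown here. A plausible sketch, assuming a JSON keypairs file of the form {"default": {"key": ..., "secret": ..., "server": ...}} as used by the ENCODE DCC tools (the real helper may differ):

import json

def processkey(key, keyfile):
    # Look up one named keypair entry and return (authid, authpw, server).
    with open(keyfile) as fh:
        keys = json.load(fh)
    entry = keys[key]
    return entry['key'], entry['secret'], entry['server']
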
def main(quants_a, quants_b):

    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(['tool_versions.py', '-a', APP_SCRIPT, '-av', APP_VER])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b")

    print "* Runnning MAD.R..."
    mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b'])
    quants_a_name = dxfile_a.name.split('.')
    quants_b_name = dxfile_b.name.split('.')
    filename = quants_a_name[0] + '_' + quants_b_name[0] + '_' + quants_a_name[1] + '_mad_plot.png'
    subprocess.check_call(['mv', "MAplot.png", filename])
    
    print "* package properties..."
    qc_metrics = {}
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics,indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Upload Plot..."
    plot_dxfile = dxpy.upload_local_file(filename,properties=props,details=qc_metrics)
    
    return { "metadata": meta_string, "mad_plot": plot_dxfile }
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):

    inputFiles = []
    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    name = dxpy.DXFile(BAMs[0]).describe()['name']
    if name.endswith(".bam"):
        name = name[:-4]  # rstrip(".bam") would strip characters, not the suffix

    # Fill in your application code here.

    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    BAM = dxpy.upload_local_file("%s.bam" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["BAM"] = dxpy.dxlink(BAM)

    return output
Example #5
def main(inputs, prefix=None):

    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # uses last extension - presumably they are all the same
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    if prefix:
        pooled_filename = prefix + "_pooled%s.gz" % (extension)
    else:
        pooled_filename = \
            '-'.join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = common.run_pipe([
        'gzip -dc %s' % (' '.join(input_filenames)),
        'gzip -cn'],
        outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    output = {
        "pooled": dxpy.dxlink(pooled)
    }

    return output
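
common.run_pipe is another helper that appears throughout these examples without its definition. A minimal sketch of what such a helper typically does, chaining shell commands with pipes and optionally redirecting the last stage to a file (the real ENCODE common.py version may differ in details):

import shlex
import subprocess

def run_pipe(steps, outfile=None):
    # Wire stdout of each step to stdin of the next; the final step writes
    # to outfile if given, otherwise to a pipe that communicate() drains.
    processes = []
    prev_stdout = None
    out_fh = open(outfile, 'wb') if outfile else None
    for i, step in enumerate(steps):
        last = (i == len(steps) - 1)
        stdout = out_fh if (last and out_fh) else subprocess.PIPE
        stderr = subprocess.PIPE if last else None
        p = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                             stdout=stdout, stderr=stderr)
        if prev_stdout is not None:
            prev_stdout.close()  # let SIGPIPE propagate to upstream stages
        prev_stdout = p.stdout
        processes.append(p)
    out, err = processes[-1].communicate()
    if out_fh:
        out_fh.close()
    return out, err
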
def main(input_file):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_file = dxpy.DXFile(input_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Fill in your application code here.

    subprocess.check_call("fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file", shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output_file = dxpy.upload_local_file("output_file")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["output_file"] = dxpy.dxlink(output_file)

    return output
Example #7
def unpack_tarball(input_tarball):
    """
    Unpacks the tarball with the specified file ID and returns a string
    containing the directory where it was unpacked.
    """

    tempdir = tempfile.mkdtemp()
    print "Working in " + tempdir

    tarball_filename = os.path.join(tempdir, "input.tar.gz")

    dxpy.download_dxfile(input_tarball, tarball_filename)

    checkout_dir = os.path.join(tempdir, "unpackdest")
    os.mkdir(checkout_dir)

    subprocess.check_call(['tar', '-xzf', tarball_filename, '-C', checkout_dir, '--warning=no-timestamp'])

    # TODO: instead of guessing the directory name to be a name that
    # generates no warnings, have the client send the directory name
    # that was used on its end.
    try:
        appname = json.load(open(os.path.join(tempdir, "unpackdest", "dxapp.json"))).get("name", "unpackdest")
        if appname != "unpackdest":
            os.rename(os.path.join(tempdir, "unpackdest"), os.path.join(tempdir, appname))
            checkout_dir = os.path.join(tempdir, appname)
    except Exception:
        pass

    return checkout_dir
Example #8
def main(psmcfa, psmc, outname, xchr, timemax, window):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    psmcfa = dxpy.DXFile(psmcfa)
    psmc = dxpy.DXFile(psmc)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(psmcfa.get_id(), "psmcfa")
    dxpy.download_dxfile(psmc.get_id(), "psmc")

    # Fill in your application code here.
    (tmaxNew, parfile) = writeRecalFile('psmc', timemax, window, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew,4)), '-i', parfile, '-o', outname, 'psmcfa'])
    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    outfile = dxpy.upload_local_file(outname)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["outfile"] = dxpy.dxlink(outfile)

    return output
Example #9
def postprocess(**inputs):
    kwargs = inputs["kwargs"]
    subjob_outputs = inputs["subjob_outputs"] 
    print "\nMerging outputs from {n} subjobs".format(n=len(subjob_outputs))

    output_prefix = kwargs["output_prefix"]
    variant_suffixes = kwargs["variant_suffixes"]
    
    app_output_fn = {}
    for subjob_output in subjob_outputs:
        for variant_type, link in subjob_output.iteritems():  # avoid shadowing builtins type/id
            file_id = link["$dnanexus_link"]
            filename = output_prefix + "_" + variant_suffixes[variant_type]

            print "Downloading " + str(file_id) + " into " + filename
            dxpy.download_dxfile(dxid=file_id, filename=filename, append=True)
            app_output_fn[variant_type] = filename

    postprocess_outputs = {}
    need_to_renumber = ["deletions", "short_inserts", "tandem_duplications", "inversions", "large_inserts"]
    for variant_type, fn in app_output_fn.iteritems():
        out_fn = fn
        if variant_type in need_to_renumber:
            out_fn = RenumberMergedOutput(fn, fn + "_renumbered")
        print "\nUploading {file} as {fn}".format(file=out_fn, fn=fn)
        postprocess_outputs[variant_type] = dxpy.dxlink(dxpy.upload_local_file(out_fn, name=fn))

    if kwargs["export_vcf"]:
        DownloadRefFasta(kwargs["reference_fasta"])
        postprocess_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_prefix, ref_fn="reference_fasta") 
        
    return postprocess_outputs
Example #10
def main(input_bams):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    input_bams = [dxpy.DXFile(item) for item in input_bams]

    # Download each file input to a new directory in the local file system
    # using variable names for the filenames.
    # Construct output filenames.
    # Dispatch jobs to a pool of workers.

    out_paths = []
    pool = Pool()  # default is pool of cpu_count() workers

    for i, bam in enumerate(input_bams):
        dirname = str(i)
        filename = bam.name
        os.mkdir(dirname)
        in_path = os.path.join(dirname, filename)
        dxpy.download_dxfile(bam.get_id(), in_path)
        out_path = os.path.join(dirname, "scrub-" + filename)
        out_paths.append(out_path)
        pool.apply_async(scrub, (in_path, out_path))

    # Close the worker pool and block until all jobs are complete.
    pool.close()
    pool.join()

    # Populate output fields and return.
    scrubbed_bams = [dxpy.upload_local_file(path) for path in out_paths]
    output = {
        "scrubbed_bams": [dxpy.dxlink(output_bam) for output_bam in scrubbed_bams]
    }
    return output
def _install_dep_bundle(self, bundle):
    if bundle["id"].get("$dnanexus_link", "").startswith("file-"):
        self.log("Downloading bundled file {name}".format(**bundle))
        dxpy.download_dxfile(bundle["id"], bundle["name"])
        self.run("dx-unpack '{}'".format(bundle["name"]))
    else:
        self.log('Skipping bundled dependency "{name}" because it does not refer to a file'.format(**bundle))
def scatter(orig_reads, split_size):
    # Fill in code here to do whatever is necessary to scatter the
    # input.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    splitsize = split_size * 1000000 * 4  # split_size is in millions of reads; each FASTQ read is 4 lines
    os.mkdir('splits')

    for f in orig_reads:
        reads_filename = dxpy.describe(f)['name']
        reads_basename = strip_extensions(reads_filename, STRIP_EXTENSIONS)
        dxpy.download_dxfile(dxpy.DXFile(f).get_id(), reads_filename)

        reads_root_name = simplify_name() or reads_basename

        split_cmd = '/bin/zcat %s | /usr/bin/split -l %d -d - %s' % (reads_filename, splitsize, 'splits/' + reads_root_name)
        logger.info('* RUNNING %s' % split_cmd)
        split_out = subprocess.check_output(split_cmd, shell=True)  # can't use shlex.split because of the pipe

    logger.info(split_out)
    splits = os.listdir('splits')
    logger.info("* Return from scatter: %s *" % splits)

    # Should we gzip here?
    return {
        "array_of_scattered_input": [ 
            dxpy.dxlink(dxpy.upload_local_file('splits/' + split_file)) for split_file in splits]
        }
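
strip_extensions and STRIP_EXTENSIONS come from elsewhere in this applet. A plausible sketch (the extension list here is illustrative), which strips a whole suffix rather than a character set:

STRIP_EXTENSIONS = ['.fastq.gz', '.fq.gz', '.fastq', '.fq', '.gz']

def strip_extensions(filename, extensions):
    # Remove the first matching suffix; str.rstrip would strip a set of
    # characters rather than a whole suffix.
    for ext in extensions:
        if filename.endswith(ext):
            return filename[:-len(ext)]
    return filename
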
def merge_map_reports(map_report_set, target_root):
    '''Merges techrep map_reports.'''

    # Working on map_reports now
    all_reports=""
    biorep_map_report = target_root + '_map_report.txt'
    append_line("### Combined Bismark map report for several technical replicates ###\n",biorep_map_report)
    for techrep_map_report_dlink in map_report_set:
        file_desc = dxpy.describe(techrep_map_report_dlink)
        file_root = file_desc['name']
        file_root = file_root.replace('_techrep_bismark_map_report.txt','') 
        file_root = file_root.replace('_bismark_map_report.txt','') 
        file_root = file_root.replace('_map_report.txt','')
        techrep_map_report = file_root + '_techrep_map_report.txt' 
        append_line("###################################",biorep_map_report)
        append_line("### Map report for ${file_root} ###",biorep_map_report)
        print "* Downloading %s_techrep_bismark_map_report.txt file..." % file_root
        dxpy.download_dxfile(techrep_map_report_dlink, techrep_map_report)
        run_cmd('cat ' + techrep_map_report, out=biorep_map_report,append=True)
        if len(all_reports) == 0:
            all_reports = techrep_map_report
        else:
            all_reports += ',' + techrep_map_report
        
    if all_reports == techrep_map_report: # only one
        run_cmd('mv %s %s' % (techrep_map_report,biorep_map_report) )
        all_reports = biorep_map_report
        
    return (biorep_map_report,all_reports)
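
append_line and run_cmd are shared helpers that the Bismark examples rely on but never define. A minimal sketch, assuming run_cmd shells out and optionally appends stdout to a file:

import subprocess

def append_line(line, filename):
    # Append one line of text to the given report file.
    with open(filename, 'a') as fh:
        fh.write(line + '\n')

def run_cmd(cmd, out=None, append=False, silent=False):
    # Run a shell command, optionally capturing stdout into a file.
    if not silent:
        print "* Running: %s" % cmd
    if out is None:
        subprocess.check_call(cmd, shell=True)
    else:
        mode = 'a' if append else 'w'
        with open(out, mode) as fh:
            subprocess.check_call(cmd, shell=True, stdout=fh)
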
def main(sam_file, probability):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    sam_file = dxpy.DXFile(sam_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(sam_file.get_id(), "sam_file")

    if probability < 0 or probability > 1:
        raise dxpy.AppError("Probability parameter determines % of mappings included in output. Must be between 0 and 1.")

    subprocess.check_call(" ".join(["java", "-Xmx2g", "-jar", "/usr/local/bin/DownsampleSam.jar", "INPUT=sam_file", "OUTPUT=downsampled_sam", "PROBABILITY="+str(probability)]), shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    downsampled_sam = dxpy.upload_local_file("downsampled_sam")
    downsampled_sam.rename(sam_file.describe()['name']+"_downsampled")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["downsampled_sam"] = dxpy.dxlink(downsampled_sam)

    return output
def _download_one_file(file_rec, idir):
    src_file = file_rec['src_file_id']
    trg_file = os.path.join(idir, file_rec['trg_fname'])
    print("downloading file: " + src_file + " to filesystem: " + trg_file)
    sys.stdout.flush()
    dxpy.download_dxfile(src_file, trg_file)
    return file_rec
def main(input_bam, paired=True, params=''):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'], [".bam", ".BAM", ".sam", ".SAM"])


    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.

    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name

    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Arguments:
        countDXlinks (list[dict]): list of DXlinks to process job output files.
        resultfn (str): Filename to use for job output file.

    Returns:
        DXLink for the main function to return as the job output.

    Note: Only the DXLinks are passed as parameters.
    Subjobs work on a fresh instance so files must be downloaded to the machine
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    sum_reads = 0
    with open(resultfn, 'w') as f:
        for i, dxlink in enumerate(countDXlinks):
            dxfile = dxpy.DXFile(dxlink)
            filename = "countfile{0}".format(i)
            dxpy.download_dxfile(dxfile, filename)
            with open(filename, 'r') as fsub:
                for line in fsub:
                    sum_reads += parse_line_for_readcount(line)
                    f.write(line)
        f.write('Total Reads: {0}'.format(sum_reads))

    countDXFile = dxpy.upload_local_file(resultfn)
    countDXlink = dxpy.dxlink(countDXFile.get_id())

    return {"countDXLink": countDXlink}
def main(quants_a, quants_b):

    # tool_versions.py --dxjson dnanexus-executable.json
    sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b")

    # Create an appropriate name for output files
    out_root = root_name_from_pair(dxfile_a.name.split('.')[0],dxfile_b.name.split('.')[0])
    mad_plot_file = out_root + '_mad_plot.png'
        
    # DX/ENCODE independent script is found in resources/usr/bin
    print "* Runnning MAD.R..."
    mad_output = subprocess.check_output(['Rscript', '/usr/bin/MAD.R', 'quants_a', 'quants_b'])
    subprocess.check_call(['mv', "MAplot.png", mad_plot_file])
    
    print "* package properties..."
    qc_metrics = {}
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics,indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Upload Plot..."
    plot_dxfile = dxpy.upload_local_file(mad_plot_file,properties=props,details=qc_metrics)
    
    return { "metadata": meta_string, "mad_plot": plot_dxfile }
Example #20
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker
    # all mappings are loaded into RAM so can only do 5 million at a time
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|", "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam", ">", "header_only.sam"]))
    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header 
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b", "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob({"BAM_file":dxpy.dxlink(split_bam.get_id()), "BED_file":BED_file}, "run_gbc"))
            
    run_shell( "ls -l" )

    gbc_agg_input = {"sub_reports":[]}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job":j.get_id(), "field":"file"})

    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()
    
    return {"results":{"job":agg_job, "field":"cover"}}
def bedmethyl_io(cx_report_dxlink, chrom_sizes_dxlink, target_root, qc_metrics, props):
    '''subjob runs cxrepo-bed.py, bedToBigBed on mem3_hdd2_x8'''

    print "* bedmethyl_io(): Retrieve CX report and chrom.sizes..."
    run_cmd('mkdir -p output/')
    cx_report = target_root + ".CX_report.txt"
    chrom_sizes = "chrom.sizes"
    dxpy.download_dxfile(chrom_sizes_dxlink, chrom_sizes)
    dxpy.download_dxfile(cx_report_dxlink, 'output/' + cx_report)

    (CpG_bed,CHG_bed,CHH_bed,CpG_bb,CHG_bb,CHH_bb) = bedmethyl(target_root, cx_report, chrom_sizes)
    
    print "* bedmethyl_io(): Storing bedmethyl results..."
    CpG_bed_dxfile = dxpy.upload_local_file(CpG_bed,properties=props,details=qc_metrics)
    CHG_bed_dxfile = dxpy.upload_local_file(CHG_bed,properties=props,details=qc_metrics)
    CHH_bed_dxfile = dxpy.upload_local_file(CHH_bed,properties=props,details=qc_metrics)

    CpG_bb_dxfile = dxpy.upload_local_file(CpG_bb,properties=props,details=qc_metrics)
    CHG_bb_dxfile = dxpy.upload_local_file(CHG_bb,properties=props,details=qc_metrics)
    CHH_bb_dxfile = dxpy.upload_local_file(CHH_bb,properties=props,details=qc_metrics)

    print "* bedmethyl_io(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "CpG_bed_dxlink": dxpy.dxlink(CpG_bed_dxfile),
        "CHG_bed_dxlink": dxpy.dxlink(CHG_bed_dxfile),
        "CHH_bed_dxlink": dxpy.dxlink(CHH_bed_dxfile),
        "CpG_bb_dxlink":  dxpy.dxlink(CpG_bb_dxfile),
        "CHG_bb_dxlink":  dxpy.dxlink(CHG_bb_dxfile),
        "CHH_bb_dxlink":  dxpy.dxlink(CHH_bb_dxfile)
    }
def test_alignment_count(applet_id, project_id, folder, tmpdir):
    """Run BWA on a FASTQ file and verify that the number of
    alignments produced is correct.
    """

    # Recall that applet_id is set in the associated conftest.py, which either
    # gets it from the command line or builds the applet and retrieves its id.

    # And tmpdir is some pytest magic. Its type is py.path.local.LocalPath.
    # Its strpath property just returns a string.

    applet = dxpy.DXApplet(applet_id)
    input_dict = {"fastq": dxpy.dxlink(SAMPLE_FASTQ),
                  "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)}

    job = applet.run(input_dict, instance_type="mem1_ssd1_x16",
                         folder=folder, project=project_id)

    job.wait_on_done()
    
    output_bam_dxfile = dxpy.DXFile(job.describe()["output"]["bam"])
    local_filename = os.path.join(tmpdir.strpath, "test.bam")
    dxpy.download_dxfile(output_bam_dxfile.get_id(),
                         local_filename)
    count_alignments_cmd = "samtools view {bam} | wc -l".format(
        bam=local_filename)
    num_alignments = int(subprocess.check_output(count_alignments_cmd,
                                                 shell=True))
    assert num_alignments == 1951476
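
The applet_id, project_id, and folder fixtures come from the associated conftest.py. A minimal sketch of what such a conftest might provide (the wiring is assumed; only the fixture names come from the test):

# conftest.py (sketch)
import json
import subprocess
import pytest

def pytest_addoption(parser):
    parser.addoption("--applet-id", default=None,
                     help="reuse an existing applet instead of building one")

@pytest.fixture(scope="session")
def applet_id(request):
    applet = request.config.getoption("--applet-id")
    if applet is None:
        # dx build prints a JSON record containing the new applet's id
        built = subprocess.check_output(["dx", "build", "-f", "."])
        applet = json.loads(built)["id"]
    return applet
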
def main(inputs):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    extension = splitext(splitext(input_filenames[-1])[0])[1]  # uses last extension - presumably they are all the same
    pooled_filename = "-".join([splitext(splitext(fn)[0])[0] for fn in input_filenames]) + "_pooled%s.gz" % (extension)
    out, err = run_pipe(["gzip -dc %s" % (" ".join(input_filenames)), "gzip -c"], outfile=pooled_filename)

    pooled = dxpy.upload_local_file(pooled_filename)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["pooled"] = dxpy.dxlink(pooled)

    return output
Example #24
def main(fastq, genomeindex_targz):

    print "something else"
    fastq_dxfile = dxpy.DXFile(fastq)
    dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq")

    genome_dxfile = dxpy.DXFile(genomeindex_targz)
    dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz")
    os.makedirs("genome")
    tar_cmd = "tar xzvf genome.tar.gz -C genome"
    subprocess.check_call(tar_cmd, shell=True)
    genome_file = glob.glob("genome/*.bwt")[0]
    genome_file = re.sub(r"\.bwt$", "", genome_file)

    bwa_cmd = (
        "bwa mem -t {nproc} {genome} {fastq} | "
        "samtools view -u -S - | "
        "samtools sort -m 256M -@ {nproc} - output".format(
            nproc=multiprocessing.cpu_count(), genome=genome_file, fastq="input.fastq"
        )
    )
    subprocess.check_call(bwa_cmd, shell=True)

    bam = dxpy.upload_local_file("output.bam")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["bam"] = dxpy.dxlink(bam)

    return output
Example #25
def read_duplication(BAM_file):
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    run_shell( " ".join(["read_duplication.py", "-i mappings.bam", "-o read_dup"]))
    run_shell( " ".join(["cat", "read_dup.pos.DupRate.xls", "read_dup.seq.DupRate.xls", ">", "read_dup.txt"]))
    results_id = dxpy.upload_local_file("read_dup.txt", wait_on_close=True).get_id()
    return {"results":results_id}
def merge_extract(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam, props):
    '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32'''

    (target_root,biorep_bam) = merge_bams(bam_set, 32)
    (biorep_map,all_reports) = merge_map_reports(map_report_set, target_root)
    (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports)
    
    print "* merge_extract(): Retrieve and uncompress index..."
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)
    run_cmd('tar -zxf ' + dme_ix)

    # NOTE: Better to use sam and let extractor use more threads, but this takes up precious storage
    (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root)

    bismark_simple_extract(target_root, alignments, ncores)
    qc_metrics = bismark_qc_metrics(target_root, qc_metrics)

    print "* Retrieve split report..."
    append_line("\n===== bismark_methylation_extractor: splitting_report =====",biorep_bam_qc)
    run_cmd('cat %s_splitting_report.txt' % target_root,out=biorep_bam_qc,append=True,silent=True)

    # TODO: Is this even needed?  Currently we do to get the size!
    #if len(bam_set) > 1:  # Wouldn't need to do this unless there is a merge
    #    print "* merge_extract(): Storing biorep bam..."
    #    props_ex = props.copy()
    #    props_ex.update({ 'reads': str(reads) })
    #    biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,properties=props_ex,details=qc_metrics,wait_on_close=True))
    #else:
    #    biorep_bam_dxlink = bam_set[0]

    print "* merge_extract(): Storing extraction results..."
    biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc,properties=props,details=qc_metrics)
    biorep_map_dxfile    = dxpy.upload_local_file(biorep_map,   properties=props,details=qc_metrics)
    split_report_dxfile  = dxpy.upload_local_file(target_root+'_splitting_report.txt')
    chrom_sizes_dxfile   = dxpy.upload_local_file('input/chrom.sizes')
    mbias_report_dxfile  = dxpy.upload_local_file(target_root+'_mbias_report.txt',properties=props,details=qc_metrics)
    CpG_context_dxfile   = dxpy.upload_local_file('output/CpG_context_%s.txt' % (target_root))
    CHG_context_dxfile   = dxpy.upload_local_file('output/CHG_context_%s.txt' % (target_root))
    CHH_context_dxfile   = dxpy.upload_local_file('output/CHH_context_%s.txt' % (target_root))

    print "* merge_extract(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        #"biorep_bam_dxlink":    biorep_bam_dxfile,
        "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile),
        "biorep_map_dxlink":    dxpy.dxlink(biorep_map_dxfile),
        "CpG_context_dxlink":   dxpy.dxlink(CpG_context_dxfile),
        "CHG_context_dxlink":   dxpy.dxlink(CHG_context_dxfile),
        "CHH_context_dxlink":   dxpy.dxlink(CHH_context_dxfile),
        "split_report_dxlink":  dxpy.dxlink(split_report_dxfile),
        "chrom_sizes_dxlink":   dxpy.dxlink(chrom_sizes_dxfile),
        "mbias_report_dxlink":  dxpy.dxlink(mbias_report_dxfile),
        "target_root":          target_root,
        "qc_metrics":           qc_metrics
    }
Example #27
def main(input_bam, paired_end):

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # rstrip('.bam') would strip characters, not the suffix; slice it off instead
    input_bam_basename = input_bam_filename[:-4] if input_bam_filename.endswith('.bam') else input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))

        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)

    return output
Example #28
def read_distribution(BAM_file, BED_file):
    dxpy.download_dxfile(BAM_file, "mappings.bam")
    dxpy.download_dxfile(BED_file, "genes.bed")

    run_shell(" ".join(["read_distribution.py", "-i mappings.bam", "-r genes.bed", ">", "read_dist.txt"]))

    results_id = dxpy.upload_local_file("read_dist.txt", wait_on_close=True).get_id()
    return {"results":results_id}
Example #29
def run_gbc(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    run_shell( " ".join(["geneBody_coverage.py", "-i mappings.bam", "-r genes.bed", "-o geneBody"]))

    results_id = dxpy.upload_local_file("geneBody.geneBodyCoverage.txt", wait_on_close=True).get_id()
    return {"file":results_id}
Example #30
def inner_distance(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    run_shell( " ".join(["inner_distance.py", "-i mappings.bam", "-r genes.bed", "-o inner", "-l -303", "-u 5002"]))
    
    results_id = dxpy.upload_local_file("inner.inner_distance_freq.txt", wait_on_close=True).get_id()
    return {"results":results_id}
Example #31
def signal_io(bedgraph_gz_dxlink, chrom_sizes_dxlink, target_root, qc_metrics,
              props):
    '''subjob runs bedGraphToBigWig on mem3_hdd2_x8'''

    print "* signal_io(): Retrieve bedgraph and chrom.sizes..."
    bedGraph = target_root + ".bedGraph"
    bedGraph_gz = bedGraph + ".gz"
    chrom_sizes = "chrom.sizes"
    dxpy.download_dxfile(bedgraph_gz_dxlink, bedGraph_gz)
    dxpy.download_dxfile(chrom_sizes_dxlink, chrom_sizes)

    bigWig = signal(target_root, bedGraph_gz, chrom_sizes)

    print "* signal_io(): Storing signal results..."
    bigWig_dxfile = dxpy.upload_local_file(bigWig,
                                           properties=props,
                                           details=qc_metrics,
                                           cleanup=True)

    print "* signal_io(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {"bigWig_dxlink": dxpy.dxlink(bigWig_dxfile)}
def main(tumor_bam, normal_bam, reference, params='-F vcf'):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    tumor_bam = dxpy.DXFile(tumor_bam)
    normal_bam = dxpy.DXFile(normal_bam)
    reference = dxpy.DXFile(reference)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(tumor_bam.get_id(), "tumor.bam")
    dxpy.download_dxfile(normal_bam.get_id(), "normal.bam")
    dxpy.download_dxfile(reference.get_id(), "ref.fa.gz")

    # The following line extracts the name from the file object so that
    # outputs can be named intelligently. It is not automatically generated by
    # the app wizard.

    name = tumor_bam.describe()['name'] + "_vs_" + normal_bam.describe()['name']

    # Append file extension based on whether the output will be VCF or not
    if "-F vcf" in params:
        name += ".vcf"
    else:
        name += ".snp"

    # Fill in your application code here.

    subprocess.check_call("gzip -d ref.fa.gz", shell=True)
    subprocess.check_call(
        "bam-somaticsniper -f ref.fa %s tumor.bam normal.bam %s" %
        (params, name),
        shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    snps = dxpy.upload_local_file(name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["snps"] = dxpy.dxlink(snps)

    return output
Example #34
def process(file_obj, file_meta):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    print file_obj
    print file_meta
    filename = dxpy.describe(file_obj)['name']
    basename = filename[:-3] if filename.endswith('.gz') else filename  # rstrip would strip characters, not the suffix
    dxpy.download_dxfile(file_obj, filename)

    print "Run Validate Files"
    validate_args = validate_map.get(file_meta['file_format'])
    assembly = file_meta.get('assembly')
    if assembly:
        chromInfo = ['-chromInfo=%s/%s/chrom.sizes' % (encValData, assembly)]
    else:
        chromInfo = ['-chromInfo=%s/hg19/chrom.sizes' % encValData]

    print subprocess.check_output(['ls','-l'])
    valid = "Not validated yet"
    if validate_args is not None:
        print("Validating file.")
        validation_command = ['validateFiles'] + ['-verbose=2'] + validate_args + chromInfo + ['-doReport'] + [filename]
        try:
            print " ".join(validation_command)
            valid = subprocess.check_output(validation_command)
        except subprocess.CalledProcessError as e:
            pass
            #valid = "Process Error"
            print(e.output)
            #raise

    print valid
    print subprocess.check_output(['ls','-l'])
    print "Upload result"
    report_dxfile = dxpy.upload_local_file("%s.report" % filename)
    print report_dxfile
    ## is_valid == 'Error count 0'
    return {
        "report": report_dxfile,
        "validation": valid
    }
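
validate_map and encValData are module-level definitions not shown here. A plausible sketch of the mapping from file_format to validateFiles arguments (the formats and flags below are illustrative, not the exact production table):

encValData = 'encValData'  # assumed: a checkout of the ENCODE-DCC/encValData repository

validate_map = {
    'fastq': ['-type=fastq'],
    'bam': ['-type=bam'],
    'bigWig': ['-type=bigWig'],
    'bed_narrowPeak': ['-type=bigBed6+4', '-as=%s/as/narrowPeak.as' % encValData],
}
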
def process(fastq):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    print fastq
    reads_filename = dxpy.describe(fastq)['name']
    reads_basename = reads_filename
    # strip suffixes in order (str.rstrip strips characters, not suffixes)
    for ext in ['.gz', '.fq', '.fastq']:
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]
    dxpy.download_dxfile(fastq, "fastq.gz")

    subprocess.check_call(['mkdir', 'output'])
    print "Run QC"
    fqc_command = "/usr/bin/FastQC/fastqc fastq.gz -o output"
    print fqc_command
    stdio = subprocess.check_output(shlex.split(fqc_command))
    print stdio
    print subprocess.check_output(['ls', '-l', 'output'])
    subprocess.check_call(['unzip', 'output/fastq_fastqc.zip'])
    print "Upload results"
    subprocess.check_call(
        ['mv', 'fastq_fastqc/fastqc_data.txt',
         "%s_data.txt" % reads_basename])
    subprocess.check_call(
        ['mv', 'fastq_fastqc/summary.txt',
         "%s_summary.txt" % reads_basename])
    subprocess.check_call(
        ['mv', 'output/fastq_fastqc.zip',
         "%s_fastqc.zip" % reads_basename])
    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename)
    zip_dxfile = dxpy.upload_local_file("%s_fastqc.zip" % reads_basename)
    print report_dxfile
    return {
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
Example #36
def main(bam_file,
         ref_vcf_file,
         eval_vcf_file,
         qual_cutoff,
         depth_cutoff,
         bed_file=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    bam_file = dxpy.DXFile(bam_file)
    if bed_file is not None:
        bed_file = dxpy.DXFile(bed_file)
    ref_vcf_file = dxpy.DXFile(ref_vcf_file)
    eval_vcf_file = dxpy.DXFile(eval_vcf_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(bam_file.get_id(), "bam_file")
    dxpy.download_dxfile(ref_vcf_file.get_id(), "ref_vcf_file")
    dxpy.download_dxfile(eval_vcf_file.get_id(), "eval_vcf_file")
    if bed_file is not None:
        dxpy.download_dxfile(bed_file.get_id(), "bed_file")

    # Fill in your application code here.

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    sites_for_manual_review = dxpy.upload_local_file("sites_for_manual_review")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["sites_for_manual_review"] = dxpy.dxlink(sites_for_manual_review)
    output["number_of_missed_sites"] = number_of_missed_sites
    output["found_sites"] = found_sites
    output["Sensitivity"] = Sensitivity
    output["specificity"] = specificity

    return output
Example #37
def main(quants_a, quants_b, annotations):

    # tool_versions.py --dxjson dnanexus-executable.json
    sw_versions = subprocess.check_output(['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    dxfile_a = dxpy.DXFile(quants_a)
    dxfile_b = dxpy.DXFile(quants_b)
    dxfile_anno = dxpy.DXFile(annotations)

    print "* Downloading files..."
    dxpy.download_dxfile(dxfile_a.get_id(), "quants_a.tsv")
    dxpy.download_dxfile(dxfile_b.get_id(), "quants_b.tsv")
    dxpy.download_dxfile(dxfile_anno.get_id(), "annotations.gtf.gz")
    
    # Create an appropriate name for output files
    out_root = root_name_from_pair(dxfile_a.name.split('.')[0],dxfile_b.name.split('.')[0])
    print "* Expecting output: '"+out_root+"_srna_mad_plot.png'"
    
    # Must move sub-scripts into current dir so they will be found by srna-mad-qc.sh
    subprocess.check_call(['mv', "/usr/bin/extract_gene_ids.awk", '.'])
    subprocess.check_call(['mv', "/usr/bin/sum_srna_expression.awk", '.'])
    subprocess.check_call(['mv', "/usr/bin/MAD.R", '.'])
    
    # DX/ENCODE independent script is found in resources/usr/bin
    print "* ===== Calling DNAnexus and ENCODE independent script... ====="
    subprocess.check_call(['srna_mad_qc.sh','annotations.gtf.gz','quants_a.tsv','quants_b.tsv',out_root])
    print "* ===== Returned from dnanexus and encodeD independent script ====="
    mad_plot_file = out_root + '_mad_plot.png'
    mad_qc_file = out_root + '_mad_qc.txt'

    print "* package properties..."
    qc_metrics = {}
    f_qc = open(mad_qc_file, 'r')
    mad_output = f_qc.read()
    f_qc.close()
    mad_output = mad_output.replace("NA","-1")
    qc_metrics["MAD.R"] = json.loads(mad_output)
    meta_string = json.dumps(qc_metrics)
    print json.dumps(qc_metrics,indent=4)
    props = {}
    props["SW"] = sw_versions

    print "* Upload Plot..."
    plot_dxfile = dxpy.upload_local_file(mad_plot_file,properties=props,details=qc_metrics)
    
    return { "metadata": meta_string, "mad_plot": plot_dxfile }
Example #38
def run_bwa_backtrack_paired(fastq_file, fastq_file2, genome_fasta_file,
                             genome_index_file, mark_duplicates, logger):
    """Runs BWA-backtrack on a pair of FASTQ files."""

    fastq_file = dxpy.DXFile(fastq_file)
    fastq_file2 = dxpy.DXFile(fastq_file2)
    genome_fasta_file = dxpy.DXFile(genome_fasta_file)
    genome_index_file = dxpy.DXFile(genome_index_file)

    dxpy.download_dxfile(fastq_file.get_id(), "sample.fastq.gz")
    dxpy.download_dxfile(fastq_file2.get_id(), "sample_2.fastq.gz")
    dxpy.download_dxfile(genome_fasta_file.get_id(), "genome.fa.gz")
    dxpy.download_dxfile(genome_index_file.get_id(), "genome.tar.gz")

    subprocess.check_call("tar xzvf genome.tar.gz", shell=True)
    num_cores = str(cpu_count())

    run_cmd(
        "bwa-0.6.2 aln -t " + num_cores +
        " genome.fa.gz sample.fastq.gz > sample.sai", logger)
    run_cmd(
        "bwa-0.6.2 aln -t " + num_cores +
        " genome.fa.gz sample_2.fastq.gz > sample_2.sai", logger)
    run_cmd(
        "bwa-0.6.2 sampe -P genome.fa.gz sample.sai sample_2.sai sample.fastq.gz sample_2.fastq.gz"
        + " > sample0.sam", logger)
    run_cmd("java -jar /CleanSam.jar INPUT=sample0.sam OUTPUT=sample1.bam",
            logger)
    run_cmd("samtools sort -@ " + num_cores + " sample1.bam sample", logger)

    if mark_duplicates:
        run_cmd(
            "java -jar /MarkDuplicates.jar " +
            "INPUT=sample.bam OUTPUT=sample_deduped.bam METRICS_FILE=/dev/null",
            logger)
        subprocess.check_call("mv sample_deduped.bam sample.bam", shell=True)
def coverage(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink,
             dme_ix_dxlink, target_root):
    '''subjob runs bismark2bedGraph and coverage2cytosine on mem3_hdd2_x8'''

    print "* coverage(): Retrieve context files and index..."
    CpG_context = 'CpG_context_%s.txt' % target_root
    CHG_context = 'CHG_context_%s.txt' % target_root
    CHH_context = 'CHH_context_%s.txt' % target_root
    run_cmd('mkdir -p output/')
    dxpy.download_dxfile(CpG_context_dxlink, 'output/%s.gz' % CpG_context)
    dxpy.download_dxfile(CHG_context_dxlink, 'output/%s.gz' % CHG_context)
    dxpy.download_dxfile(CHH_context_dxlink, 'output/%s.gz' % CHH_context)
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)

    print "* coverage(): Uncompress..."
    run_cmd('tar -zxf ' + dme_ix)
    run_cmd('gunzip output/%s.gz' % CpG_context)
    run_cmd('gunzip output/%s.gz' % CHG_context)
    run_cmd('gunzip output/%s.gz' % CHH_context)

    (bedGraph_gz, cx_report) = bismark_coverage(target_root, CpG_context,
                                                CHG_context, CHH_context)

    print "* coverage(): Storing coverage results..."
    cx_report_dxfile = dxpy.upload_local_file(cx_report)
    bedgraph_gz_dxfile = dxpy.upload_local_file(bedGraph_gz)

    print "* coverage(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "cx_report_dxlink": dxpy.dxlink(cx_report_dxfile),
        "bedgraph_gz_dxlink": dxpy.dxlink(bedgraph_gz_dxfile)
    }
Example #40
def main(bam1, bam2, RE_site_bed):

    dxpy.download_dxfile(bam1, "input1.bam")
    dxpy.download_dxfile(bam2, "input2.bam")
    dxpy.download_dxfile(RE_site_bed, "RE.bed")

    command = "cp -r /miniconda ~; cp -r /.conda ~; bash -c 'PATH=/home/dnanexus/miniconda/miniconda2/bin:$PATH; source activate gitar; which python; python /usr/bin/HiCtool_hifive.arg.py input1.bam input2.bam RE.bed .'"
    print(command)
    subprocess.call(command, shell=True)

    fend_object_hdf5_filename = "./fend_object.hdf5"
    HiC_data_object_hdf5_filename = "./HiC_data_object.hdf5"
    HiC_distance_function_hdf5_filename = "./HiC_distance_function.hdf5"
    HiC_norm_binning_hdf5_filename = "./HiC_norm_binning.hdf5"
    HiC_project_object_hdf5_filename = "./HiC_project_object.hdf5"

    #fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename, folder=outdir)
    #HiC_data_object_hdf5_file = dxpy.upload_local_file(HiC_data_object_hdf5_filename, folder=outdir)
    #HiC_distance_function_hdf5_file= dxpy.upload_local_file(HiC_distance_function_hdf5_filename, folder=outdir)
    #HiC_norm_binning_hdf5_file= dxpy.upload_local_file(HiC_norm_binning_hdf5_filename, folder=outdir)
    #HiC_project_object_hdf5_file= dxpy.upload_local_file(HiC_project_object_hdf5_filename, folder=outdir)

    fend_object_hdf5_file = dxpy.upload_local_file(fend_object_hdf5_filename)
    HiC_data_object_hdf5_file = dxpy.upload_local_file(
        HiC_data_object_hdf5_filename)
    HiC_distance_function_hdf5_file = dxpy.upload_local_file(
        HiC_distance_function_hdf5_filename)
    HiC_norm_binning_hdf5_file = dxpy.upload_local_file(
        HiC_norm_binning_hdf5_filename)
    HiC_project_object_hdf5_file = dxpy.upload_local_file(
        HiC_project_object_hdf5_filename)

    return {
        "fend_object_hdf5": fend_object_hdf5_file,
        "HiC_data_object_hdf5": HiC_data_object_hdf5_file,
        "HiC_distance_function_hdf5": HiC_distance_function_hdf5_file,
        "HiC_norm_binning_hdf5": HiC_norm_binning_hdf5_file,
        "HiC_project_object_hdf5": HiC_project_object_hdf5_file
    }
def main(rep1_peaks, rep2_peaks, pooled_peaks, idr_threshold, rank, interactive):

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.

    idr_version = 1

    rep1_peaks_file = dxpy.DXFile(rep1_peaks)
    rep2_peaks_file = dxpy.DXFile(rep2_peaks)
    pooled_peaks_file = dxpy.DXFile(pooled_peaks)

    rep1_peaks_filename = rep1_peaks_file.name
    rep2_peaks_filename = rep2_peaks_file.name
    pooled_peaks_filename = pooled_peaks_file.name

    # Download the file inputs to the local file system.

    dxpy.download_dxfile(rep1_peaks_file.get_id(), rep1_peaks_filename)
    dxpy.download_dxfile(rep2_peaks_file.get_id(), rep2_peaks_filename)
    dxpy.download_dxfile(pooled_peaks_file.get_id(), pooled_peaks_filename)    


    rep1_peaks_filename = uncompress(rep1_peaks_filename)
    rep2_peaks_filename = uncompress(rep2_peaks_filename)
    pooled_peaks_filename = uncompress(pooled_peaks_filename)

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    #rep1_vs_rep2_prefix = '%s_vs_%s.IDRv%d' %(os.path.basename(rep1_peaks_filename), os.path.basename(rep2_peaks_filename), idr_version)
    rep1_vs_rep2_prefix = '%sv%s.IDRv%d' %(os.path.basename(rep1_peaks_filename)[0:11], os.path.basename(rep2_peaks_filename)[0:11], idr_version)

    pooled_common_peaks_IDR_filename, IDR_overlap_narrowpeak_filename = run_idr(
        rep1_peaks_filename,
        rep2_peaks_filename,
        pooled_peaks_filename,
        rep1_vs_rep2_prefix,
        rank=rank,
        idr_version=idr_version,
        interactive=interactive)

    # =============================
    # Get peaks passing the IDR threshold
    # =============================
    if idr_version == 1:
        awk_string = r"""awk 'BEGIN{OFS="\t"} $14<=%2.2f {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}'""" %(idr_threshold)
    elif idr_version == 2:
        awk_string = r"""awk 'BEGIN{OFS="\t"} $12>=%2.2f {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10}'""" %(-math.log10(idr_threshold))
    final_IDR_thresholded_filename = rep1_vs_rep2_prefix + '.IDR%2.2f.narrowPeak' %(idr_threshold)
    run_pipe([
        'cat %s' %(pooled_common_peaks_IDR_filename),
        awk_string,
        'sort -k7n,7n'
        #'gzip -c'
    ], final_IDR_thresholded_filename)

    npeaks_pass_filename = rep1_vs_rep2_prefix + '-npeaks-aboveIDR.txt'
    wc_output = subprocess.check_output(shlex.split('wc -l %s' %(final_IDR_thresholded_filename)))
    with open(npeaks_pass_filename, 'w') as fh:
        fh.write(wc_output)
    line_count = wc_output.split()[0]
    n_peaks = int(line_count)

    #TODO batch consistency plot

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    output = {}

    if idr_version == 1:
        IDR_overlap_narrowpeak_filename = compress(IDR_overlap_narrowpeak_filename)
        overlapped_peaks = dxpy.upload_local_file(IDR_overlap_narrowpeak_filename)
        EM_fit_output = dxpy.upload_local_file(rep1_vs_rep2_prefix + '-em.sav')
        empirical_curves_output = dxpy.upload_local_file(rep1_vs_rep2_prefix + '-uri.sav')
        EM_parameters_log = dxpy.upload_local_file(rep1_vs_rep2_prefix + '-Rout.txt')
        output.update({
            "EM_fit_output": dxpy.dxlink(EM_fit_output),
            "empirical_curves_output": dxpy.dxlink(empirical_curves_output),
            "overlapped_peaks": dxpy.dxlink(overlapped_peaks)
        })
    elif idr_version == 2:
        EM_fit_output = None
        empirical_curves_output = None
        overlapped_peaks = None
        EM_parameters_log = dxpy.upload_local_file(rep1_vs_rep2_prefix + '.log.txt')
        IDR2_plot = dxpy.upload_local_file(pooled_common_peaks_IDR_filename + '.png')
        output.update({
            "IDR2_plot": dxpy.dxlink(IDR2_plot)
            })

    npeaks_pass = dxpy.upload_local_file(npeaks_pass_filename)
    IDR_output = dxpy.upload_local_file(compress(pooled_common_peaks_IDR_filename))
    IDR_peaks = dxpy.upload_local_file(compress(final_IDR_thresholded_filename))

    #
    # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    output.update({
        "EM_parameters_log": dxpy.dxlink(EM_parameters_log),
        "npeaks_pass": dxpy.dxlink(npeaks_pass),
        "IDR_output": dxpy.dxlink(IDR_output),
        "IDR_peaks": dxpy.dxlink(IDR_peaks),
        "N": n_peaks
    })

    logging.info("Exiting with output: %s", output)
    return output
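
uncompress and compress are helpers not shown in this excerpt. A minimal sketch, assuming they gunzip/gzip in place and return the resulting filename:

import subprocess

def uncompress(filename):
    # gunzip in place if needed; return the uncompressed filename.
    if filename.endswith('.gz'):
        subprocess.check_call(['gzip', '-d', filename])
        return filename[:-3]
    return filename

def compress(filename):
    # gzip in place if needed; return the compressed filename.
    if filename.endswith('.gz'):
        return filename
    subprocess.check_call(['gzip', '-n', filename])
    return filename + '.gz'
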
Example #42
def main(experiment, control, xcor_scores_input, chrom_sizes, narrowpeak_as,
         gappedpeak_as, broadpeak_as, genomesize):

    # Initialize data object inputs on the platform
    # into dxpy.DXDataObject instances.

    experiment = dxpy.DXFile(experiment)
    control = dxpy.DXFile(control)
    xcor_scores_input = dxpy.DXFile(xcor_scores_input)
    chrom_sizes = dxpy.DXFile(chrom_sizes)
    narrowPeak_as = dxpy.DXFile(narrowpeak_as)
    gappedPeak_as = dxpy.DXFile(gappedpeak_as)
    broadPeak_as = dxpy.DXFile(broadpeak_as)

    # Download the file inputs to the local file system
    # and use their own filenames.

    dxpy.download_dxfile(experiment.get_id(), experiment.name)
    dxpy.download_dxfile(control.get_id(), control.name)
    dxpy.download_dxfile(xcor_scores_input.get_id(), xcor_scores_input.name)
    dxpy.download_dxfile(chrom_sizes.get_id(), chrom_sizes.name)
    dxpy.download_dxfile(narrowPeak_as.get_id(), narrowPeak_as.name)
    dxpy.download_dxfile(gappedPeak_as.get_id(), gappedPeak_as.name)
    dxpy.download_dxfile(broadPeak_as.get_id(), broadPeak_as.name)

    #Define the output filenames

    peaks_dirname = 'peaks'
    if not os.path.exists(peaks_dirname):
        os.makedirs(peaks_dirname)
    prefix = experiment.name
    if prefix.endswith('.gz'):
        prefix = prefix[:-3]

    narrowPeak_fn = "%s/%s.narrowPeak" % (peaks_dirname, prefix)
    gappedPeak_fn = "%s/%s.gappedPeak" % (peaks_dirname, prefix)
    broadPeak_fn = "%s/%s.broadPeak" % (peaks_dirname, prefix)
    narrowPeak_gz_fn = narrowPeak_fn + ".gz"
    gappedPeak_gz_fn = gappedPeak_fn + ".gz"
    broadPeak_gz_fn = broadPeak_fn + ".gz"
    narrowPeak_bb_fn = "%s.bb" % (narrowPeak_fn)
    gappedPeak_bb_fn = "%s.bb" % (gappedPeak_fn)
    broadPeak_bb_fn = "%s.bb" % (broadPeak_fn)
    fc_signal_fn = "%s/%s.fc_signal.bw" % (peaks_dirname, prefix)
    pvalue_signal_fn = "%s/%s.pvalue_signal.bw" % (peaks_dirname, prefix)

    #Extract the fragment length estimate from column 3 of the cross-correlation scores file
    with open(xcor_scores_input.name, 'r') as fh:
        firstline = fh.readline()
        fraglen = firstline.split()[2]  #third column
        print "Fraglen %s" % (fraglen)

    #===========================================
    # Generate narrow peaks and preliminary signal tracks
    #============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --nomodel --shift 0 --extsize %s --keep-dup all -B --SPMR' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to narrowPeak.as format (score must be <1000)
    rescaled_narrowpeak_fn = common.rescale_scores('%s/%s_peaks.narrowPeak' %
                                                   (peaks_dirname, prefix),
                                                   scores_col=5)
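
    # common.rescale_scores is assumed to min-max scale the column, e.g.
    #   new = 10 + 990 * (old - min) / (max - min)
    # so every score lands in the 10-1000 range that bigBed tools accept.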

    # Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_narrowpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (narrowPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (narrowPeak_gz_fn))
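
    # Illustration of the awk stage above: a line such as
    #   chr1  100  350  reads.tagAlign_peak_1  680  .  5.1  8.2  6.3  75
    # leaves with its name in Column 4 rewritten to Peak_1 (its sort rank).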

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # Generate Broad and Gapped Peaks
    #============================================

    command = 'macs2 callpeak ' + \
        '-t %s -c %s ' %(experiment.name, control.name) + \
        '-f BED -n %s/%s ' %(peaks_dirname, prefix) + \
        '-g %s -p 1e-2 --broad --nomodel --shift 0 --extsize %s --keep-dup all' %(genomesize, fraglen)
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Rescale Col5 scores to range 10-1000 to conform to broadPeak.as format (score must be <1000)
    rescaled_broadpeak_fn = common.rescale_scores('%s/%s_peaks.broadPeak' %
                                                  (peaks_dirname, prefix),
                                                  scores_col=5)

    # Sort by Col8 (for broadPeak) or Col14 (for gappedPeak) in descending order and replace long peak names in Column 4 with Peak_<peakRank>
    pipe = [
        'sort -k 8gr,8gr %s' % (rescaled_broadpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (broadPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (broadPeak_gz_fn))

    # Rescale Col5 scores to range 10-1000 to conform to gappedPeak.as format (score must be <1000)
    rescaled_gappedpeak_fn = common.rescale_scores('%s/%s_peaks.gappedPeak' %
                                                   (peaks_dirname, prefix),
                                                   scores_col=5)

    pipe = [
        'sort -k 14gr,14gr %s' % (rescaled_gappedpeak_fn),
        r"""awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}'""",
        'tee %s' % (gappedPeak_fn), 'gzip -c'
    ]
    print pipe
    out, err = common.run_pipe(pipe, '%s' % (gappedPeak_gz_fn))

    # remove additional files
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.xls ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_peaks.bed ${peakFile}_summits.bed

    #===========================================
    # For Fold enrichment signal tracks
    #============================================

    # chrom_sizes is a tab-delimited file with 2 columns: Col1 (chromosome name), Col2 (chromosome size in bp).

    command = 'macs2 bdgcmp ' + \
        '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
        '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
        '--outdir %s -o %s_FE.bdg ' %(peaks_dirname, prefix) + \
        '-m FE'
    print command
    returncode = common.block_on(command)
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_FE.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.fc.signal.bedgraph' %
        (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_FE.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.fc.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(fc_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.fc.signal.bedgraph

    #===========================================
    # For -log10(p-value) signal tracks
    #============================================

    # Compute sval = min(no. of reads in ChIP, no. of reads in control) / 1,000,000

    out, err = common.run_pipe(['gzip -dc %s' % (experiment.name), 'wc -l'])
    chipReads = out.strip()
    out, err = common.run_pipe(['gzip -dc %s' % (control.name), 'wc -l'])
    controlReads = out.strip()
    sval = str(min(float(chipReads), float(controlReads)) / 1000000)

    print "chipReads = %s, controlReads = %s, sval = %s" % (chipReads,
                                                            controlReads, sval)
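
    # Worked example: 30,000,000 ChIP reads and 25,000,000 control reads give
    # sval = 25000000 / 1000000 = 25.0, used below as the -S scaling factor.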

    returncode = common.block_on(
     'macs2 bdgcmp ' + \
     '-t %s/%s_treat_pileup.bdg ' %(peaks_dirname, prefix) + \
     '-c %s/%s_control_lambda.bdg ' %(peaks_dirname, prefix) + \
     '--outdir %s -o %s_ppois.bdg ' %(peaks_dirname, prefix) + \
     '-m ppois -S %s' %(sval))
    print "MACS2 exited with returncode %d" % (returncode)
    assert returncode == 0, "MACS2 non-zero return"

    # Remove coordinates outside chromosome sizes (stupid MACS2 bug)
    pipe = [
        'slopBed -i %s/%s_ppois.bdg -g %s -b 0' %
        (peaks_dirname, prefix, chrom_sizes.name),
        'bedClip stdin %s %s/%s.pval.signal.bedgraph' %
        (chrom_sizes.name, peaks_dirname, prefix)
    ]
    print pipe
    out, err = common.run_pipe(pipe)

    #rm -rf ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_ppois.bdg

    # Convert bedgraph to bigwig
    command = 'bedGraphToBigWig ' + \
        '%s/%s.pval.signal.bedgraph ' %(peaks_dirname, prefix) + \
        '%s ' %(chrom_sizes.name) + \
        '%s' %(pvalue_signal_fn)
    print command
    returncode = common.block_on(command)
    print "bedGraphToBigWig exited with returncode %d" % (returncode)
    assert returncode == 0, "bedGraphToBigWig non-zero return"

    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}.pval.signal.bedgraph
    #rm -f ${PEAK_OUTPUT_DIR}/${CHIP_TA_PREFIX}_treat_pileup.bdg ${peakFile}_control_lambda.bdg

    #===========================================
    # Generate bigBeds from beds to support trackhub visualization of peak files
    #============================================

    narrowPeak_bb_fname = common.bed2bb('%s' % (narrowPeak_fn),
                                        chrom_sizes.name,
                                        narrowPeak_as.name,
                                        bed_type='bed6+4')
    gappedPeak_bb_fname = common.bed2bb('%s' % (gappedPeak_fn),
                                        chrom_sizes.name,
                                        gappedPeak_as.name,
                                        bed_type='bed12+3')
    broadPeak_bb_fname = common.bed2bb('%s' % (broadPeak_fn),
                                       chrom_sizes.name,
                                       broadPeak_as.name,
                                       bed_type='bed6+3')

    #Temporary during development to create empty files just to get the applet to exit
    for fn in [
            narrowPeak_fn, gappedPeak_fn, broadPeak_fn, narrowPeak_bb_fn,
            gappedPeak_bb_fn, broadPeak_bb_fn, fc_signal_fn, pvalue_signal_fn
    ]:
        common.block_on('touch %s' % (fn))

    # Upload the file outputs

    narrowPeak = dxpy.upload_local_file(narrowPeak_gz_fn)
    gappedPeak = dxpy.upload_local_file(gappedPeak_gz_fn)
    broadPeak = dxpy.upload_local_file(broadPeak_gz_fn)
    narrowPeak_bb = dxpy.upload_local_file(narrowPeak_bb_fn)
    gappedPeak_bb = dxpy.upload_local_file(gappedPeak_bb_fn)
    broadPeak_bb = dxpy.upload_local_file(broadPeak_bb_fn)
    fc_signal = dxpy.upload_local_file(fc_signal_fn)
    pvalue_signal = dxpy.upload_local_file(pvalue_signal_fn)

    # Build the output structure.

    output = {
        "narrowpeaks": dxpy.dxlink(narrowPeak),
        "gappedpeaks": dxpy.dxlink(gappedPeak),
        "broadpeaks": dxpy.dxlink(broadPeak),
        "narrowpeaks_bb": dxpy.dxlink(narrowPeak_bb),
        "gappedpeaks_bb": dxpy.dxlink(gappedPeak_bb),
        "broadpeaks_bb": dxpy.dxlink(broadPeak_bb),
        "fc_signal": dxpy.dxlink(fc_signal),
        "pvalue_signal": dxpy.dxlink(pvalue_signal)
    }

    return output
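
# common.run_pipe appears throughout these applets but its definition is not
# part of this excerpt. A minimal sketch of what it is assumed to do: chain
# the listed shell commands into a Unix pipeline, optionally redirect the
# final stdout to outfile, and return (out, err). The real helper may differ.
import shlex
import subprocess

def run_pipe(steps, outfile=None):
    '''Sketch of common.run_pipe: steps is a list of shell command strings.'''
    processes = []
    prev_stdout = None
    for i, step in enumerate(steps):
        if i == len(steps) - 1 and outfile:  # last step writes to outfile
            stdout = open(outfile, 'w')
        else:
            stdout = subprocess.PIPE
        process = subprocess.Popen(shlex.split(step), stdin=prev_stdout,
                                   stdout=stdout, stderr=subprocess.PIPE)
        if prev_stdout is not None:
            prev_stdout.close()  # let SIGPIPE propagate to upstream stages
        prev_stdout = process.stdout
        processes.append(process)
    out, err = processes[-1].communicate()
    return out, err
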
def main(rep1_ta,
         ctl1_ta,
         rep1_xcor,
         rep1_paired_end,
         npeaks,
         nodups,
         chrom_sizes,
         spp_version,
         rep2_ta=None,
         ctl2_ta=None,
         rep2_xcor=None,
         rep2_paired_end=None,
         as_file=None,
         idr_peaks=False,
         fragment_length=None,
         spp_instance=None):

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    rep1_ta_filename = rep1_ta_file.name
    ntags_rep1 = common.count_lines(rep1_ta_filename)

    simplicate_experiment = rep1_ta and not rep2_ta
    if simplicate_experiment:
        logger.info(
            "No rep2 tags specified so processing as a simplicate experiment.")
    else:
        logger.info(
            "Rep1 and rep2 tags specified so processing as a replicated experiment."
        )

    if not simplicate_experiment:
        assert rep1_paired_end == rep2_paired_end, 'Mixed PE/SE not supported'
        rep2_ta_file = dxpy.DXFile(rep2_ta)
        dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
        rep2_ta_filename = rep2_ta_file.name
        ntags_rep2 = common.count_lines(rep2_ta_filename)
    paired_end = rep1_paired_end

    unary_control = (ctl1_ta == ctl2_ta) or not ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    ctl1_ta_filename = ctl1_ta_file.name

    if not unary_control:
        ctl2_ta_file = dxpy.DXFile(ctl2_ta)
        dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
        ctl2_ta_filename = ctl2_ta_file.name
    else:
        ctl2_ta_file = ctl1_ta_file
        ctl2_ta_filename = ctl1_ta_file.name

    ntags_ctl1 = common.count_lines(ctl1_ta_filename)
    ntags_ctl2 = common.count_lines(ctl2_ta_filename)
    rep1_control = ctl1_ta  # default.  May be changed later.
    rep1_ctl_msg = "control rep1"
    rep2_control = ctl2_ta  # default.  May be changed later.
    rep2_ctl_msg = "control rep2"

    rep_info = [(ntags_rep1, 'replicate 1', rep1_ta_filename)]
    if not simplicate_experiment:
        rep_info.append((ntags_rep2, 'replicate 2', rep2_ta_filename))
    rep_info.extend([(ntags_ctl1, 'control 1', ctl1_ta_filename),
                     (ntags_ctl2, 'control 2', ctl2_ta_filename)])
    for n, name, filename in rep_info:
        logger.info("Found %d tags in %s file %s" % (n, name, filename))

    subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    if not simplicate_experiment:
        pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        pool_replicates_subjob = \
            pool_applet.run(
                {"inputs": [rep1_ta, rep2_ta],
                 "prefix": 'pooled_reps'},
                name='Pool replicates')
        pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
        pooled_replicates_xcor_subjob = \
            xcor_only(
                pooled_replicates,
                paired_end,
                spp_version,
                name='Pool cross-correlation')

    if unary_control:
        logger.info("Only one control supplied.")
        if not simplicate_experiment:
            logger.info(
                "Using one control for both replicate 1 and 2 and for the pool."
            )
        rep2_control = rep1_control
        control_for_pool = rep1_control
        pool_ctl_msg = "one control"
    else:
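        # NOTE: pool_applet is only looked up above for replicated
        # experiments, so a simplicate experiment supplied with two distinct
        # controls would hit a NameError here.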
        pool_controls_subjob = pool_applet.run(
            {
                "inputs": [ctl1_ta, ctl2_ta],
                "prefix": "PL_ctls"
            },
            name='Pool controls')
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        # always use the pooled controls for the pool
        control_for_pool = pooled_controls
        pool_ctl_msg = "pooled controls"

        # use the pooled controls for the reps depending on the ratio of rep to
        # control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            logger.info(
                "Number of reads in controls differ by > factor of %f. Using pooled controls."
                % (ratio_cutoff))
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                logger.info(
                    "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                )
                rep1_control = pooled_controls
                rep1_ctl_msg = "pooled controls"
            elif not simplicate_experiment and ntags_ctl2 < ntags_rep2:
                logger.info(
                    "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                )
                rep2_control = pooled_controls
                rep2_ctl_msg = "pooled controls"
            else:
                logger.info("Using distinct controls for replicate 1 and 2.")
                rep1_control = ctl1_ta  # default.  May be changed later.
                rep2_control = ctl2_ta  # default.  May be changed later.
                rep1_ctl_msg = "control rep1"
                rep2_ctl_msg = "control rep2"

    common_args = {
        'chrom_sizes': chrom_sizes,
        'spp_version': spp_version,
        'as_file': as_file,
        'spp_instance': spp_instance
    }
    if fragment_length is not None:
        common_args.update({'fragment_length': fragment_length})
    rep1_peaks_subjob = spp(rep1_ta,
                            rep1_control,
                            rep1_xcor,
                            bigbed=True,
                            name='Rep1 peaks vs %s' % (rep1_ctl_msg),
                            prefix='R1',
                            **common_args)

    if not simplicate_experiment:
        rep2_peaks_subjob = spp(rep2_ta,
                                rep2_control,
                                rep2_xcor,
                                bigbed=True,
                                name='Rep2 peaks vs %s' % (rep2_ctl_msg),
                                prefix='R2',
                                **common_args)

        pooled_peaks_subjob = spp(
            pooled_replicates,
            control_for_pool,
            pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
            bigbed=True,
            name='Pooled peaks vs %s' % (pool_ctl_msg),
            prefix='PL',
            **common_args)

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores")
    }

    if not simplicate_experiment:
        output.update({
            'rep2_peaks':
            rep2_peaks_subjob.get_output_ref("peaks"),
            'rep2_peaks_bb':
            rep2_peaks_subjob.get_output_ref("peaks_bb"),
            'rep2_xcor_plot':
            rep2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2_xcor_scores':
            rep2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooled_peaks':
            pooled_peaks_subjob.get_output_ref("peaks"),
            'pooled_peaks_bb':
            pooled_peaks_subjob.get_output_ref("peaks_bb"),
            'pooled_xcor_plot':
            pooled_peaks_subjob.get_output_ref("xcor_plot"),
            'pooled_xcor_scores':
            pooled_peaks_subjob.get_output_ref("xcor_scores")
        })

    if idr_peaks:  # also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = \
            dxpy.find_one_data_object(
               classname='applet',
               name='pseudoreplicator',
               project=dxpy.PROJECT_CONTEXT_ID,
               zero_ok=False,
               more_ok=False,
               return_handler=True)

        rep1_pr_subjob = \
            pseudoreplicator_applet.run(
                {"input_tags": rep1_ta,
                 "prefix": 'R1PR'},
                name='Pseudoreplicate rep1 -> R1PR1,2')

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR1 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR1',
            **common_args)

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_xcor,
            bigbed=False,
            name='R1PR2 peaks vs %s' % (rep1_ctl_msg),
            prefix='R1PR2',
            **common_args)

        output.update({
            'rep1pr1_peaks':
            rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_peaks':
            rep1pr2_peaks_subjob.get_output_ref("peaks")
        })

        if not simplicate_experiment:
            rep2_pr_subjob = \
                pseudoreplicator_applet.run(
                    {"input_tags": rep2_ta,
                     "prefix": 'R2PR'},
                    name='Pseudoreplicate rep2 -> R2PR1,2')

            pool_pr1_subjob = pool_applet.run(
                {
                    "inputs": [
                        rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate1")
                    ],
                    "prefix":
                    'PPR1'
                },
                name='Pool R1PR1+R2PR1 -> PPR1')

            pool_pr2_subjob = pool_applet.run(
                {
                    "inputs": [
                        rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                        rep2_pr_subjob.get_output_ref("pseudoreplicate2")
                    ],
                    "prefix":
                    'PPR2'
                },
                name='Pool R1PR2+R2PR2 -> PPR2')

            rep2pr1_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR1 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR1',
                **common_args)

            rep2pr2_peaks_subjob = spp(
                rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_control,
                rep2_xcor,
                bigbed=False,
                name='R2PR2 peaks vs %s' % (rep2_ctl_msg),
                prefix='R2PR2',
                **common_args)

            pooledpr1_peaks_subjob = spp(
                pool_pr1_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR1 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR1',
                **common_args)

            pooledpr2_peaks_subjob = spp(
                pool_pr2_subjob.get_output_ref("pooled"),
                control_for_pool,
                pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
                bigbed=False,
                name='PPR2 peaks vs %s' % (pool_ctl_msg),
                prefix='PPR2',
                **common_args)

            output.update({
                'rep2pr1_peaks':
                rep2pr1_peaks_subjob.get_output_ref("peaks"),
                'rep2pr2_peaks':
                rep2pr2_peaks_subjob.get_output_ref("peaks"),
                'pooledpr1_peaks':
                pooledpr1_peaks_subjob.get_output_ref("peaks"),
                'pooledpr2_peaks':
                pooledpr2_peaks_subjob.get_output_ref("peaks"),
            })

    return output
Exemple #44
0
def main(experiment, control, xcor_scores_input, npeaks, nodups, bigbed, chrom_sizes, as_file=None, prefix=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    experiment_file = dxpy.DXFile(experiment)
    control_file = dxpy.DXFile(control)
    xcor_scores_input_file = dxpy.DXFile(xcor_scores_input)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    chrom_sizes_filename = chrom_sizes_file.name
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    if bigbed:
        as_file_file = dxpy.DXFile(as_file)
        as_file_filename = as_file_file.name
        dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    experiment_filename = experiment_file.name
    dxpy.download_dxfile(experiment_file.get_id(), experiment_filename)
    control_filename = control_file.name
    dxpy.download_dxfile(control_file.get_id(), control_filename)
    xcor_scores_input_filename = xcor_scores_input_file.name
    dxpy.download_dxfile(xcor_scores_input_file.get_id(), xcor_scores_input_filename)

    if not prefix:
        # str.rstrip() strips a set of characters rather than a suffix, so
        # rstrip('.gz').rstrip('.tagAlign') can mangle some names; strip the
        # extensions explicitly instead.
        output_filename_prefix = experiment_filename
        for extension in ['.gz', '.tagAlign']:
            if output_filename_prefix.endswith(extension):
                output_filename_prefix = output_filename_prefix[:-len(extension)]
    else:
        output_filename_prefix = prefix
    peaks_filename = output_filename_prefix + '.regionPeak'
    final_peaks_filename = peaks_filename + '.gz' #spp adds .gz, so this is the file name that's actually created
    xcor_plot_filename = output_filename_prefix + '.pdf'
    xcor_scores_filename = output_filename_prefix + '.ccscores'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)

    fraglen_column = 3 # third column in the cross-correlation scores input file
    with open(xcor_scores_input_filename, 'r') as f:
        line = f.readline()
        fragment_length = int(line.split('\t')[fraglen_column-1])
        print "Read fragment length: %d" %(fragment_length)

    #run_spp_command = subprocess.check_output('which run_spp.R', shell=True)
    spp_tarball = '/phantompeakqualtools/spp_1.10.1.tar.gz'
    if nodups:
        run_spp = '/phantompeakqualtools/run_spp_nodups.R'
    else:
        run_spp = '/phantompeakqualtools/run_spp.R'
    #install spp
    subprocess.check_call('ls -l', shell=True)
    subprocess.check_call(shlex.split('R CMD INSTALL %s' %(spp_tarball)))
    spp_command = "Rscript %s -p=%d -c=%s -i=%s -npeak=%d -speak=%d -savr=%s -savp=%s -rf -out=%s" %(run_spp, cpu_count(), experiment_filename, control_filename, npeaks, fragment_length, peaks_filename, xcor_plot_filename, xcor_scores_filename)
    print spp_command
    # process = subprocess.Popen(shlex.split(spp_command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
    # for line in iter(process.stdout.readline, ''):
    #     sys.stdout.write(line)
    subprocess.check_call(shlex.split(spp_command))

    # When one of the peak coordinates is an exact multiple of 10, spp (R) outputs the coordinate in scientific notation.
    # This changes any such coordinates to decimal notation.
    # It assumes 10-column output and that the 2nd and 3rd columns are coordinates.
    # slopBed adjusts feature end coordinates that run off the end of the chromosome.
    # bedClip removes any features that are still not within the boundaries of the chromosome.

    fix_coordinate_peaks_filename = output_filename_prefix + '.fixcoord.regionPeak'

    out, err = common.run_pipe([
        "gzip -dc %s" %(final_peaks_filename),
        "tee %s" %(peaks_filename),
        r"""awk 'BEGIN{OFS="\t"}{print $1,sprintf("%i",$2),sprintf("%i",$3),$4,$5,$6,$7,$8,$9,$10}'""",
        'slopBed -i stdin -g %s -b 0' %(chrom_sizes_filename),
        'bedClip stdin %s %s' %(chrom_sizes_filename, fix_coordinate_peaks_filename)
    ])
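
    # Illustration: spp may emit a start coordinate as "2e+06"; the awk
    # sprintf("%i", $2) above rewrites it in decimal as "2000000".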

    #These lines transfer the peaks files to the temporary workspace for debugging later
    #Only at the end are the final files uploaded that will be returned from the applet
    dxpy.upload_local_file(peaks_filename)
    dxpy.upload_local_file(fix_coordinate_peaks_filename)

    n_spp_peaks = common.count_lines(peaks_filename)
    print "%s peaks called by spp" %(n_spp_peaks)
    print "%s of those peaks removed due to bad coordinates" %(n_spp_peaks - common.count_lines(fix_coordinate_peaks_filename))
    print "First 50 peaks"
    print subprocess.check_output('head -50 %s' %(fix_coordinate_peaks_filename), shell=True, stderr=subprocess.STDOUT)

    if bigbed:
        peaks_bb_filename = common.bed2bb(fix_coordinate_peaks_filename, chrom_sizes_filename, as_file_filename)
        if peaks_bb_filename:
            peaks_bb = dxpy.upload_local_file(peaks_bb_filename)

    if not filecmp.cmp(peaks_filename,fix_coordinate_peaks_filename):
        print "Returning peaks with fixed coordinates"
        print subprocess.check_output(shlex.split('gzip %s' %(fix_coordinate_peaks_filename)))
        final_peaks_filename = fix_coordinate_peaks_filename + '.gz'

    print subprocess.check_output('ls -l', shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(final_peaks_filename), shell=True, stderr=subprocess.STDOUT)
    #print subprocess.check_output('head %s' %(xcor_scores_filename), shell=True, stderr=subprocess.STDOUT)

    peaks = dxpy.upload_local_file(final_peaks_filename)
    xcor_plot = dxpy.upload_local_file(xcor_plot_filename)
    xcor_scores = dxpy.upload_local_file(xcor_scores_filename)

    output = {}
    output["peaks"] = dxpy.dxlink(peaks)
    output["xcor_plot"] = dxpy.dxlink(xcor_plot)
    output["xcor_scores"] = dxpy.dxlink(xcor_scores)
    if bigbed and peaks_bb_filename:
        output["peaks_bb"] = dxpy.dxlink(peaks_bb)

    return output
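
# common.bed2bb (used in several applets here) is also defined elsewhere. A
# plausible sketch, assuming it wraps UCSC bedToBigBed: sort the bed, run
# bedToBigBed with the given autoSql file and bed type, and return the .bb
# filename, or None if conversion fails.
import subprocess

def bed2bb(bed_filename, chrom_sizes, as_file, bed_type='bed6+4'):
    '''Sketch of common.bed2bb: convert a BED file to bigBed.'''
    bb_filename = bed_filename + '.bb'
    sorted_fn = bed_filename + '.sorted'
    with open(sorted_fn, 'w') as fh:
        subprocess.check_call(['sort', '-k1,1', '-k2,2n', bed_filename],
                              stdout=fh)
    try:
        subprocess.check_call([
            'bedToBigBed', '-type=%s' % (bed_type), '-as=%s' % (as_file),
            sorted_fn, chrom_sizes, bb_filename])
    except subprocess.CalledProcessError:
        return None
    return bb_filename
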
Exemple #45
0
def post_extraction(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink,
                    dme_ix_dxlink, target_root, qc_metrics, props):
    '''runs everything after bismark simple extraction in the main instance'''

    print "* post_extraction(): Retrieve context files and index..."
    CpG_context = 'CpG_context_%s.txt' % target_root
    CHG_context = 'CHG_context_%s.txt' % target_root
    CHH_context = 'CHH_context_%s.txt' % target_root
    run_cmd('mkdir -p output/')
    dxpy.download_dxfile(CpG_context_dxlink, 'output/%s.gz' % CpG_context)
    dxpy.download_dxfile(CHG_context_dxlink, 'output/%s.gz' % CHG_context)
    dxpy.download_dxfile(CHH_context_dxlink, 'output/%s.gz' % CHH_context)
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)

    print "* post_extraction(): Uncompress..."
    run_cmd('tar -zxf ' + dme_ix)
    run_cmd('mv input/chrom.sizes .')
    chrom_sizes = "chrom.sizes"
    run_cmd('gunzip output/%s.gz' % CpG_context)
    run_cmd('gunzip output/%s.gz' % CHG_context)
    run_cmd('gunzip output/%s.gz' % CHH_context)

    # First coverage:
    (bedGraph, cx_report) = bismark_coverage(target_root,
                                             CpG_context,
                                             CHG_context,
                                             CHH_context,
                                             gzip=False,
                                             cleanup=True)

    # Next beds
    (CpG_bed, CHG_bed, CHH_bed, CpG_bb, CHG_bb,
     CHH_bb) = bedmethyl(target_root, cx_report, chrom_sizes, cleanup=True)

    # Finally signal
    bigWig = signal(target_root, bedGraph, chrom_sizes, cleanup=True)

    print "* post_extraction(): Storing results..."
    CpG_bed_dxfile = dxpy.upload_local_file(CpG_bed,
                                            properties=props,
                                            details=qc_metrics)
    CHG_bed_dxfile = dxpy.upload_local_file(CHG_bed,
                                            properties=props,
                                            details=qc_metrics)
    CHH_bed_dxfile = dxpy.upload_local_file(CHH_bed,
                                            properties=props,
                                            details=qc_metrics)

    CpG_bb_dxfile = dxpy.upload_local_file(CpG_bb,
                                           properties=props,
                                           details=qc_metrics)
    CHG_bb_dxfile = dxpy.upload_local_file(CHG_bb,
                                           properties=props,
                                           details=qc_metrics)
    CHH_bb_dxfile = dxpy.upload_local_file(CHH_bb,
                                           properties=props,
                                           details=qc_metrics)

    bigWig_dxfile = dxpy.upload_local_file(bigWig,
                                           properties=props,
                                           details=qc_metrics)

    print "* post_extraction(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "CpG_bed_dxlink": dxpy.dxlink(CpG_bed_dxfile),
        "CHG_bed_dxlink": dxpy.dxlink(CHG_bed_dxfile),
        "CHH_bed_dxlink": dxpy.dxlink(CHH_bed_dxfile),
        "CpG_bb_dxlink": dxpy.dxlink(CpG_bb_dxfile),
        "CHG_bb_dxlink": dxpy.dxlink(CHG_bb_dxfile),
        "CHH_bb_dxlink": dxpy.dxlink(CHH_bb_dxfile),
        "bigWig_dxlink": dxpy.dxlink(bigWig_dxfile)
    }
Exemple #46
0
def crop(reads1_file, reads2_file, crop_length, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    if crop_length == 'native':
        output = dict(
            zip(["cropped_reads1", "cropped_reads2"],
                [reads1_file, reads2_file]))
    else:
        reads1_filename = dxpy.describe(reads1_file)['name']
        reads1_basename = strip_extensions(reads1_filename, STRIP_EXTENSIONS)
        dxpy.download_dxfile(reads1_file, reads1_filename)
        if reads2_file:
            end_string = "PE"
            reads2_filename = dxpy.describe(reads2_file)['name']
            reads2_basename = \
                strip_extensions(reads2_filename, STRIP_EXTENSIONS)
            dxpy.download_dxfile(reads2_file, reads2_filename)
            output_fwd_paired_filename = reads1_basename + '-crop-paired.fq.gz'
            output_fwd_unpaired_filename = \
                reads1_basename + '-crop-unpaired.fq.gz'
            output_rev_paired_filename = reads2_basename + '-crop-paired.fq.gz'
            output_rev_unpaired_filename = \
                reads2_basename + '-crop-unpaired.fq.gz'
            SE_output_filename = None
        else:
            end_string = "SE"
            reads2_filename = None
            reads2_basename = None
            output_fwd_paired_filename = None
            output_fwd_unpaired_filename = None
            output_rev_paired_filename = None
            output_rev_unpaired_filename = None
            SE_output_filename = reads1_basename + "-crop.fq.gz"

        crop_command = ' '.join([
            s for s in [
                'java -jar', TRIMMOMATIC_PATH, end_string,
                '-threads %d' % (cpu_count()), reads1_filename,
                reads2_filename, SE_output_filename,
                output_fwd_paired_filename, output_fwd_unpaired_filename,
                output_rev_paired_filename, output_rev_unpaired_filename,
                'MINLEN:%s' % (crop_length),
                'CROP:%s' % (crop_length)
            ] if s
        ])

        logger.info("Cropping with: %s" % (crop_command))
        print(subprocess.check_output(shlex.split(crop_command)))
        print(subprocess.check_output(shlex.split('ls -l')))

        if SE_output_filename:
            SE_output = dxpy.upload_local_file(SE_output_filename)
            cropped_reads = [dxpy.dxlink(SE_output), None]
        else:
            output_fwd_paired = \
                dxpy.upload_local_file(output_fwd_paired_filename)
            output_rev_paired = \
                dxpy.upload_local_file(output_rev_paired_filename)
            cropped_reads = [
                dxpy.dxlink(output_fwd_paired),
                dxpy.dxlink(output_rev_paired)
            ]

        output = dict(zip(["cropped_reads1", "cropped_reads2"], cropped_reads))

    logger.info("returning from crop with output %s" % (output))
    return output
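
# strip_extensions and STRIP_EXTENSIONS are module-level definitions that are
# not shown in this excerpt. A minimal sketch, assuming STRIP_EXTENSIONS is an
# ordered list of filename suffixes to peel off (the actual list is an
# assumption):
STRIP_EXTENSIONS = ['.gz', '.fq', '.fastq', '.fa', '.fasta']  # hypothetical

def strip_extensions(filename, extensions):
    '''Repeatedly strip any matching suffix, e.g. reads.fq.gz -> reads.'''
    basename = filename
    stripped = True
    while stripped:
        stripped = False
        for extension in extensions:
            if basename.endswith(extension):
                basename = basename[:-len(extension)]
                stripped = True
    return basename
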
Exemple #47
0
def merge_extract_full(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam,
                       props):
    '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32'''

    (target_root, biorep_bam) = merge_bams(bam_set, 32)
    (biorep_map, all_reports) = merge_map_reports(map_report_set, target_root)
    (qc_metrics, reads,
     biorep_bam_qc) = biorep_bam_qc_metrics(target_root, all_reports)

    print "* merge_extract_full(): Retrieve and uncompress index..."
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)
    run_cmd('tar -zxf ' + dme_ix)

    # NOTE: Better to use sam and let extractor use more threads, but this takes up precious storage
    (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root)

    bismark_full_extract(target_root, alignments, ncores)
    qc_metrics = bismark_qc_metrics(target_root, qc_metrics)

    print "* merge_extract_full(): Retrieve split report..."
    append_line(
        "\n===== bismark_methylation_extractor: splitting_report =====",
        biorep_bam_qc)
    run_cmd('cat %s_splitting_report.txt' % target_root,
            out=biorep_bam_qc,
            append=True,
            silent=True)

    # TODO: Is this even needed?  Currently we do it just to get the size!
    #if len(bam_set) > 1:  # Wouldn't need to do this unless there is a merge
    #    print "* merge_extract(): Storing biorep bam..."
    #    props_ex = props.copy()
    #    props_ex.update({ 'reads': str(reads) })
    #    biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,properties=props_ex,details=qc_metrics,wait_on_close=True))
    #else:
    #    biorep_bam_dxlink = bam_set[0]

    print "* merge_extract_full(): Storing extraction results..."
    biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc,
                                                  properties=props,
                                                  details=qc_metrics)
    biorep_map_dxfile = dxpy.upload_local_file(biorep_map,
                                               properties=props,
                                               details=qc_metrics)
    run_cmd('pigz output/%s.CX_report.txt' % target_root)
    cx_report_dxfile = dxpy.upload_local_file('output/%s.CX_report.txt.gz' %
                                              target_root)
    bedgraph_gz_dxfile = dxpy.upload_local_file('output/%s.bedGraph.gz' %
                                                target_root)
    chrom_sizes_dxfile = dxpy.upload_local_file('input/chrom.sizes')
    split_report_dxfile = dxpy.upload_local_file(target_root +
                                                 '_splitting_report.txt')
    mbias_report_dxfile = dxpy.upload_local_file(target_root +
                                                 '_mbias_report.txt',
                                                 properties=props,
                                                 details=qc_metrics)

    print "* merge_extract_full(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        #"biorep_bam_dxlink":    biorep_bam_dxfile,
        "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile),
        "biorep_map_dxlink": dxpy.dxlink(biorep_map_dxfile),
        "split_report_dxlink": dxpy.dxlink(split_report_dxfile),
        "cx_report_dxlink": dxpy.dxlink(cx_report_dxfile),
        "bedgraph_gz_dxlink": dxpy.dxlink(bedgraph_gz_dxfile),
        "chrom_sizes_dxlink": dxpy.dxlink(chrom_sizes_dxfile),
        "mbias_report_dxlink": dxpy.dxlink(mbias_report_dxfile),
        "target_root": target_root,
        "qc_metrics": qc_metrics
    }
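
# run_cmd is another helper outside this excerpt. Judging from its call sites
# (plain command strings plus out=, append= and silent= keywords), a sketch
# might look like the following; the real helper may log or retry differently.
import shlex
import subprocess

def run_cmd(command, out=None, append=False, silent=False):
    '''Sketch of run_cmd: run a command, optionally redirecting stdout.'''
    if not silent:
        print "* Running: %s" % (command)
    if out is not None:
        with open(out, 'a' if append else 'w') as fh:
            subprocess.check_call(shlex.split(command), stdout=fh)
    else:
        subprocess.check_call(shlex.split(command))
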
Exemple #48
0
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version,
                samtools_version, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    samtools = SAMTOOLS_PATH.get(samtools_version)
    assert samtools, "samtools version %s is not supported" % (
        samtools_version)
    bwa = BWA_PATH.get(bwa_version)
    assert bwa, "BWA version %s is not supported" % (bwa_version)
    logger.info("In postprocess with samtools %s and bwa %s" % (samtools, bwa))

    indexed_reads_filenames = []
    unmapped_reads_filenames = []
    for i, reads in enumerate(indexed_reads):
        read_pair_number = i + 1

        fn = dxpy.describe(reads)['name']
        logger.info("indexed_reads %d: %s" % (read_pair_number, fn))
        indexed_reads_filenames.append(fn)
        dxpy.download_dxfile(reads, fn)

        unmapped = unmapped_reads[i]
        fn = dxpy.describe(unmapped)['name']
        logger.info("unmapped reads %d: %s" % (read_pair_number, fn))
        unmapped_reads_filenames.append(fn)
        dxpy.download_dxfile(unmapped, fn)

    reference_tar_filename = dxpy.describe(reference_tar)['name']
    logger.info("reference_tar: %s" % (reference_tar_filename))
    dxpy.download_dxfile(reference_tar, reference_tar_filename)
    # extract the reference files from the tar
    reference_dirname = 'reference_files'
    reference_filename = \
        resolve_reference(reference_tar_filename, reference_dirname)
    logger.info("Using reference file: %s" % (reference_filename))

    paired_end = len(indexed_reads) == 2

    if paired_end:
        r1_basename = strip_extensions(unmapped_reads_filenames[0],
                                       STRIP_EXTENSIONS)
        r2_basename = strip_extensions(unmapped_reads_filenames[1],
                                       STRIP_EXTENSIONS)
        reads_basename = r1_basename + r2_basename
    else:
        reads_basename = strip_extensions(unmapped_reads_filenames[0],
                                          STRIP_EXTENSIONS)
    raw_bam_filename = '%s.raw.srt.bam' % (reads_basename)
    raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' % (reads_basename)

    if paired_end:
        reads1_filename = indexed_reads_filenames[0]
        reads2_filename = indexed_reads_filenames[1]
        unmapped_reads1_filename = unmapped_reads_filenames[0]
        unmapped_reads2_filename = unmapped_reads_filenames[1]
        raw_sam_filename = reads_basename + ".raw.sam"
        badcigar_filename = "badreads.tmp"
        steps = [
            "%s sampe -P %s %s %s %s %s" %
            (bwa, reference_filename, reads1_filename, reads2_filename,
             unmapped_reads1_filename, unmapped_reads2_filename),
            "tee %s" % (raw_sam_filename),
            r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
            "sort", "uniq"
        ]
        out, err = common.run_pipe(steps, badcigar_filename)
        print(out)
        if err:
            logger.error("sampe error: %s" % (err))

        steps = [
            "cat %s" % (raw_sam_filename),
            "grep -v -F -f %s" % (badcigar_filename)
        ]
    else:  # single end
        reads_filename = indexed_reads_filenames[0]
        unmapped_reads_filename = unmapped_reads_filenames[0]
        steps = [
            "%s samse %s %s %s" %
            (bwa, reference_filename, reads_filename, unmapped_reads_filename)
        ]

    if samtools_version == "0.1.9":
        steps.extend([
            "%s view -Su -" % (samtools),
            "%s sort - %s" % (samtools, raw_bam_filename.rstrip('.bam'))
        ])  # samtools adds .bam
    else:
        steps.extend([
            "%s view -@%d -Su -" % (samtools, cpu_count()),
            "%s sort -@%d - %s" %
            (samtools, cpu_count(), raw_bam_filename.rstrip('.bam'))
        ])  # samtools adds .bam

    logger.info("Running pipe: %s" % (steps))
    out, err = common.run_pipe(steps)

    if out:
        print(out)
    if err:
        logger.error("samtools error: %s" % (err))

    with open(raw_bam_mapstats_filename, 'w') as fh:
        subprocess.check_call(shlex.split("%s flagstat %s" %
                                          (samtools, raw_bam_filename)),
                              stdout=fh)
    print(subprocess.check_output('ls -l', shell=True))

    mapped_reads = dxpy.upload_local_file(raw_bam_filename)
    mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename)
    flagstat_qc = flagstat_parse(raw_bam_mapstats_filename)

    output = {
        'mapped_reads': dxpy.dxlink(mapped_reads),
        'mapping_statistics': dxpy.dxlink(mapping_statistics),
        'n_mapped_reads': flagstat_qc.get('mapped')[0]  # 0 is hi-q reads
    }
    logger.info("Returning from postprocess with output: %s" % (output))
    return output
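
# flagstat_parse is defined elsewhere; from its use above it evidently maps
# each samtools flagstat category to [QC-passed, QC-failed] counts (hence
# flagstat_qc.get('mapped')[0]). A rough sketch for the 'mapped' line only:
import re

def flagstat_parse(fname):
    '''Sketch: parse "1234 + 56 mapped (95.00% : N/A)" into {'mapped': [1234, 56]}.'''
    qc = {}
    with open(fname) as fh:
        for line in fh:
            match = re.match(r'(\d+) \+ (\d+) mapped', line)
            if match:
                qc['mapped'] = [int(match.group(1)), int(match.group(2))]
    return qc
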
Exemple #49
0
def merge_bams(bam_set, ncores):
    '''Merges techrep bams into biorep bam.'''
    # NOTE: dme-align produces *_techrep_bismark.bam and dme-extract merges 1+ techrep bams into a *_bismark_biorep.bam.
    #       The reason for the name 'word' order is so that older *_bismark.bam alignments are recognizable as techrep bams.

    target_root = ""
    merged = ""
    tech_reps = ""
    exp_id = ""
    rep_tech = ""

    for techrep_bam_dlink in reversed(bam_set):
        file_desc = dxpy.describe(techrep_bam_dlink)
        file_root = file_desc['name']
        print "* Working on '" + str(techrep_bam_dlink) + "' " + file_root
        file_root = file_root.replace('_techrep_bismark.bam', '')
        file_root = file_root.replace('_bismark.bam', '')
        if len(target_root) == 0:
            target_root = file_root
        else:
            target_root = file_root + '_' + target_root
            if len(merged) == 0:
                target_root += '_bismark_biorep'
                merged = 's merged as'

        # Try to simplify the names
        if os.path.isfile('/usr/bin/parse_property.py'):
            if len(exp_id) == 0:
                file_path = file_desc['folder'] + '/' + file_desc['name']
                exp_id = subprocess.check_output(shlex.split('parse_property.py -f %s --project %s --exp_id -q' \
                                                                        % (file_desc['id'], file_desc['project']) ))
                exp_id = ''.join(exp_id.split())  # Remove \n, etc.
                if len(exp_id) > 0:
                    print "* Discovered exp_id: '%s'" % exp_id
            if len(exp_id) > 0:
                rep_tech = subprocess.check_output(shlex.split('parse_property.py -f %s --project %s --rep_tech -q' \
                                                                        % (file_desc['id'], file_desc['project']) ))
                rep_tech = ''.join(rep_tech.split())  # Remove \n, etc.
        if len(rep_tech) > 0:
            print "* Discovered rep_tech: '%s'" % rep_tech
            if len(tech_reps) > 0:
                tech_reps = tech_reps + '_' + rep_tech
            else:
                tech_reps = rep_tech

        print "* Downloading %s_techrep_bismark.bam file..." % file_root
        dxpy.download_dxfile(techrep_bam_dlink,
                             file_root + '_techrep_bismark.bam')

        if not os.path.isfile("sofar.bam"):
            run_cmd('mv %s_techrep_bismark.bam sofar.bam' % file_root)
        else:
            print "* Merging in %s_techrep_bismark.bam..." % file_root
            # NOTE: keeps the first header
            run_cmd('samtools cat sofar.bam %s_techrep_bismark.bam' %
                    file_root,
                    out='merging.bam')
            run_cmd('mv merging.bam sofar.bam')
            run_cmd('rm %s_techrep_bismark.bam' %
                    file_root)  # STORAGE IS LIMITED

    if len(exp_id) > 0 and len(tech_reps) > 0:
        target_root = '%s_%s_bismark_biorep' % (exp_id, tech_reps)
        print "* Name biorep bam as: %s.bam" % target_root
    else:
        print "* Long biorep bam to be named: %s.bam" % target_root

    # At this point there is a 'sofar.bam' with one or more input bams
    if len(merged) == 0:
        target_root = file_root + "_bismark_biorep"
        run_cmd('mv sofar.bam %s.bam' % target_root)
        print "* Only one input file '%s.bam', no merging required." % target_root
    else:
        # sorting needed due to samtools cat
        print "* Sorting merged bam..."
        run_cmd('samtools sort -@ %d -m 1600M -f sofar.bam %s.bam' %
                (ncores, target_root))
        run_cmd('rm sofar.bam')  # STORAGE IS LIMITED
        print "* Files merged into '%s.bam'" % target_root

    return (target_root, target_root + '.bam')
def peaks_stanza(accession,
                 url,
                 name,
                 n,
                 tracktype='bigBed 6 +',
                 lowpass=None,
                 dx=None):
    return_string = \
        "\t\ttrack %s%d\n" %(accession,n) + \
        "\t\tbigDataUrl %s\n" %(url) + \
        "\t\tshortLabel %s\n" %(name[:17]) + \
        "\t\tparent %sviewpeaks on\n" %(accession) + \
        "\t\ttype %s\n" %(tracktype) + \
        "\t\tvisibility dense\n" + \
        "\t\tview PK\n" + \
        "\t\tpriority %d\n\n" %(n)
    n_stanzas = 1
    if not lowpass:
        lowpass = []
    if isinstance(lowpass, int):
        lowpass = [lowpass]
    extra_stanza_count = 0
    for (i, cutoff) in enumerate(lowpass, start=1):
        fn = dx.get_id()
        if not os.path.isfile(fn):
            dxpy.download_dxfile(dx.get_id(), fn)
        cutoffstr = '-lt%d' % (cutoff)
        outfn = fn + cutoffstr
        print fn, os.path.getsize(fn), subprocess.check_output(
            'wc -l %s' % (fn), shell=True).split()[0]
        bed_fn = fn + '.bed'
        common.block_on('bigBedToBed %s %s' % (fn, bed_fn))
        common.run_pipe([
            'cat %s' % (bed_fn),
            r"""awk 'BEGIN{FS="\t";OFS="\t"}{if (($3-$2) < %d) {print $0}}'"""
            % (cutoff)
        ], outfn)
        print outfn, os.path.getsize(outfn), subprocess.check_output(
            'wc -l %s' % (outfn), shell=True).split()[0]
        if tracktype == 'bigBed 6 +':
            as_file = 'narrowPeak.as'
        elif tracktype == 'bigBed 12 +':
            as_file = 'gappedPeak.as'
        else:
            print "Cannot match tracktype %s to any .as file" % (tracktype)
            continue  # no as_file to pass to bed2bb below, so skip this cutoff
        bb_fn = common.bed2bb(outfn, 'mm10.chrom.sizes', as_file)
        newdx = dxpy.upload_local_file(filename=bb_fn,
                                       folder="/tracks",
                                       wait_on_close=True)
        new_url, headers = newdx.get_download_url(duration=sys.maxint,
                                                  preauthenticated=True)

        new_lines = [
            "\t\ttrack %s%d" % (accession, n + i),
            "\t\tbigDataUrl %s" % (new_url),
            "\t\tshortLabel %s" % (name[:17 - len(cutoffstr)] + cutoffstr),
            "\t\tparent %sviewpeaks on" % (accession),
            "\t\ttype %s" % (tracktype), "\t\tvisibility dense", "\t\tview PK",
            "\t\tpriority %d\n\n" % (n + i)
        ]
        new_stanza = '\n'.join(new_lines)
        return_string += new_stanza
        n_stanzas += 1
        os.remove(bed_fn)
        os.remove(bb_fn)
        os.remove(outfn)
        os.remove(fn)

    return (return_string, n_stanzas)
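
# For reference, the first stanza built above renders roughly like this for a
# hypothetical accession ENCSR000AAA with n=1 (values illustrative only):
#
#		track ENCSR000AAA1
#		bigDataUrl http://.../rep1_narrowpeaks.bb
#		shortLabel rep1_narrowpeaks
#		parent ENCSR000AAAviewpeaks on
#		type bigBed 6 +
#		visibility dense
#		view PK
#		priority 1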
Exemple #51
0
def main(pipe_file, file_meta, key=None, debug=False, skipvalidate=True):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    #files = [dxpy.DXFile(item) for item in files]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    #for i, f in enumerate(files):
    #    dxpy.download_dxfile(f.get_id(), "files-" + str(i))

    encd.logger = logging.getLogger("Applet.dxe")
    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    (AUTHID, AUTHPW, SERVER) = encd.processkey(key)

    f_des = dxpy.describe(pipe_file)
    filename = f_des['name']
    fid = f_des['id']
    folder = dxpy.DXFile(fid, project=dxpy.PROJECT_CONTEXT_ID).folder
    logger.info("* Downloading file from dx to local...")
    start = datetime.now()
    dx_file = dxpy.download_dxfile(pipe_file, filename)
    end = datetime.now()
    duration = end - start
    logger.info("* Download in %.2f seconds" % duration.seconds)

    if filename.endswith('.bed') or filename.endswith('.gff'):
        subprocess.check_call(['gzip', filename])
        filename = filename + '.gz'

    # gathering metadata
    file_meta['submitted_file_name'] = "%s/%s" % (folder, filename)
    file_meta['md5sum'] = dx.calc_md5(filename).hexdigest()
    file_meta['file_size'] = os.path.getsize(filename)
    if "aliases" not in file_meta:
        file_meta["aliases"] = []
    file_meta["aliases"].append("dnanexus:" + fid)
    if file_meta.get('accession') is not None:
        # Can only repost to the same accession if status is "upload failed".
        file_meta["status"] = "upload failed"

    if not skipvalidate:
        logger.info("* Validating: %s (%s)" % (filename, folder))
        start = datetime.now()
        v = validate(filename, file_meta)
        end = datetime.now()
        duration = end - start
        logger.info("* Validated in %.2f seconds" % duration.seconds)
    else:
        v = {'validation': 'Not Run'}

    if v['validation'] == "Error count 0\n" or v['validation'].find(
            'Not Run') == 0:  ## yes with CR

        logger.info("* Posting file and metadata to ENCODEd...")
        f_obj = encd.post_file(filename, file_meta, SERVER, AUTHID, AUTHPW)
        v['accession'] = f_obj.get('accession', "NOT POSTED")
        if v['accession'] == "NOT POSTED":
            v['accession'] = f_obj.get("external_accession", "NOT POSTED")
        if v['accession'] == "NOT POSTED":
            v['accession'] = file_meta.get("external_accession", "NOT POSTED")
            print "* Returned f_obj..."
            print json.dumps(f_obj, indent=4, sort_keys=True)
            # A bare raise is invalid outside an except block; raise an
            # explicit error so that splashdown doesn't continue uploading.
            raise RuntimeError("Post to ENCODEd returned no accession")

        post_status = f_obj.get('status', 'upload failed')
        if post_status == 'upload failed':
            logger.info("* Post ERROR on %s to '%s': %s" %
                        (filename, v['accession'], post_status))
            # NOTE: need to set the accession to dx file nonetheless, since the file object was created in encodeD
        else:
            logger.info("* Posted %s to '%s'" % (filename, v['accession']))

        # update pipe_file md5sum and accession properties
        dx.file_set_property(fid,
                             'md5sum',
                             file_meta['md5sum'],
                             proj_id=dxpy.PROJECT_CONTEXT_ID,
                             verbose=True)
        acc_key = dx.property_accesion_key(SERVER)
        if post_status == 'upload failed':
            acc_key = acc_key + ' upload failed'
        acc = dx.file_set_property(fid,
                                   acc_key,
                                   v['accession'],
                                   proj_id=dxpy.PROJECT_CONTEXT_ID,
                                   verbose=True)
        if acc is None or acc != v['accession']:
            logger.info("* Failed to update '%s' to '%s' in file properties" %
                        (acc_key, v['accession']))
        else:
            logger.info("* Updated '%s' to '%s' in file properties" %
                        (acc_key, acc))
        #logger.debug(json.dumps(f_obj, indent=4, sort_keys=True))

        if post_status == 'upload failed':
            # Explicit error ensures splashdown doesn't continue uploading.
            raise RuntimeError("Post status is 'upload failed'")

    else:
        logger.info("* File invalid: %s" % v['validation'])
        v['accession'] = "NOT POSTED"

    return v
def histone(args, analysis, experiment_accession, first_analysis):
    authid, authpw, server = processkey(args.key)
    keypair = (authid, authpw)

    stages = analysis.get('stages')
    peaks_stage = next(
        stage for stage in stages
        if stage['execution']['name'] == "ENCODE Peaks")['execution']
    replicated_stages = [
        stage['execution'] for stage in stages
        if 'Final' in stage['execution']['name']
    ]

    # this is just a cheap way of determining singlicate or replicate analysis
    # singlicate analyses have no rescue_ratio
    singlicate_analysis = all(stage['output'].get('rep2_signal') is None
                              for stage in replicated_stages)

    output_names = [
        'rep1_narrowpeaks_bb',
        'rep1_gappedpeaks_bb',
        'rep1_pvalue_signal',
        'rep1_fc_signal',
    ] if singlicate_analysis else [
        'rep1_narrowpeaks_bb', 'rep2_narrowpeaks_bb', 'pooled_narrowpeaks_bb',
        'rep1_gappedpeaks_bb', 'rep2_gappedpeaks_bb', 'pooled_gappedpeaks_bb',
        'rep1_pvalue_signal', 'rep2_pvalue_signal', 'pooled_pvalue_signal',
        'rep1_fc_signal', 'rep2_fc_signal', 'pooled_fc_signal'
    ]

    outputs = dict(
        zip(output_names,
            [{
                'dx': dxpy.DXFile(peaks_stage['output'][output_name])
            } for output_name in output_names]))

    output_names.insert(3, 'replicated_narrowpeaks_bb')
    outputs.update({
        'replicated_narrowpeaks_bb': {
            'dx':
            dxpy.DXFile(
                next(stage['execution']['output']['overlapping_peaks_bb']
                     for stage in stages
                     if stage['execution']['name'] == 'Final narrowpeaks'))
        }
    })
    output_names.insert(7, 'replicated_gappedpeaks_bb')
    outputs.update({
        'replicated_gappedpeaks_bb': {
            'dx':
            dxpy.DXFile(
                next(stage['execution']['output']['overlapping_peaks_bb']
                     for stage in stages
                     if stage['execution']['name'] == 'Final gappedpeaks'))
        }
    })

    track_directory = os.path.join(args.ddir, experiment_accession)
    url_base = urlparse.urljoin(args.turl, experiment_accession + '/')
    #print "url_base %s" %(url_base)
    if not args.nodownload and not os.path.exists(track_directory):
        os.makedirs(track_directory)
    if first_analysis:
        if os.path.exists(args.tdbpath):
            if args.truncate:
                trackDb = open(args.tdbpath, 'w')
            else:
                trackDb = open(args.tdbpath, 'a')
        else:
            if not os.path.exists(os.path.dirname(args.tdbpath)):
                os.makedirs(os.path.dirname(args.tdbpath))
            trackDb = open(args.tdbpath, 'w')
    else:
        trackDb = open(args.tdbpath, 'a')

    for (output_name, output) in outputs.iteritems():
        local_path = os.path.join(track_directory, output['dx'].name)
        print output_name, output['dx'].get_id(), local_path
        if not args.nodownload:
            dxpy.download_dxfile(output['dx'].get_id(), local_path)
        outputs[output_name].update({'local_path': local_path})
        #print "Joining %s and %s" %(url_base, os.path.basename(local_path))
        if args.dxf:
            url, headers = output['dx'].get_download_url(duration=sys.maxint,
                                                         preauthenticated=True)
            outputs[output_name].update({'url': url})
        else:
            outputs[output_name].update({
                'url':
                urlparse.urljoin(url_base, os.path.basename(local_path))
            })
        #print outputs[output_name]['url']

    experiment = encoded_get(
        urlparse.urljoin(server, '/experiments/%s' % (experiment_accession)),
        keypair)
    description = '%s %s %s %s' % (
        experiment['target']['label'], experiment['replicates'][0]['library']
        ['biosample']['biosample_term_name'],
        experiment['replicates'][0]['library']['biosample'].get('life_stage'),
        experiment['replicates'][0]['library']['biosample'].get('age_display'))
    longLabel = 'E3 Histone ChIP - %s - %s' % (experiment_accession,
                                               description)
    if args.tag:
        longLabel += ' - %s' % (args.tag)
    trackDb.write(composite_stanza(experiment_accession, longLabel))

    first_peaks = True
    first_signal = True
    priority = 1
    for (n, output_name) in enumerate(output_names, start=1):
        if output_name.endswith('narrowpeaks_bb'):
            if first_peaks:
                trackDb.write(viewpeaks_stanza(experiment_accession))
                first_peaks = False
            stanzas, n_stanzas = peaks_stanza(experiment_accession,
                                              outputs[output_name]['url'],
                                              output_name,
                                              priority,
                                              tracktype="bigBed 6 +",
                                              lowpass=args.lowpass,
                                              dx=outputs[output_name]['dx'])
            trackDb.write(stanzas)
            priority += n_stanzas
        elif output_name.endswith('gappedpeaks_bb'):
            if first_peaks:
                trackDb.write(viewpeaks_stanza(experiment_accession))
                first_peaks = False
            stanzas, n_stanzas = peaks_stanza(experiment_accession,
                                              outputs[output_name]['url'],
                                              output_name,
                                              priority,
                                              tracktype="bigBed 12 +",
                                              lowpass=args.lowpass,
                                              dx=outputs[output_name]['dx'])
            trackDb.write(stanzas)
            priority += n_stanzas
        elif output_name.endswith('_signal'):
            if first_signal:
                trackDb.write(viewsignal_stanza(experiment_accession))
                first_signal = False
            trackDb.write(
                signal_stanza(experiment_accession,
                              outputs[output_name]['url'],
                              output_name,
                              priority,
                              tracktype="bigWig"))
            priority += 1

    trackDb.close()
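
# composite_stanza, viewpeaks_stanza, peaks_stanza and signal_stanza are
# defined elsewhere in this tool. Purely as an illustration of the trackDb
# text being assembled above, a composite_stanza helper following standard
# UCSC trackDb conventions might emit something like this; the exact fields
# are assumptions, not the pipeline's actual stanzas.
def composite_stanza(accession, long_label):
    # Subtrack stanzas written later would reference this composite via
    # "parent <accession>".
    return ("track %s\n"
            "compositeTrack on\n"
            "shortLabel %s\n"
            "longLabel %s\n"
            "visibility full\n\n") % (accession, accession, long_label)
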
def main(input_vcf,
         reference,
         input_bam=None,
         annotation_vcf=None,
         comparison_vcf=None,
         dbsnp=None,
         genes=None,
         gatk_annotator_params='',
         snpeff_build_params='-gtf22 -v',
         snpeff_annotate_params='-v -onlyCoding true -i vcf -o vcf'):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_vcf = dxpy.DXFile(input_vcf)
    reference = dxpy.DXFile(reference)
    ref_name = reference.describe()['name'].replace(".gz", "")

    if genes is not None:
        genes = dxpy.DXFile(genes)
        genes_name = genes.describe()['name']

    if annotation_vcf is not None:
        annotation_vcf = dxpy.DXFile(annotation_vcf)
        annotation_name = annotation_vcf.describe()['name']

    if comparison_vcf is not None:
        comparison_vcf = dxpy.DXFile(comparison_vcf)
        comparison_name = comparison_vcf.describe()['name']

    if dbsnp is not None:
        print "dbsnp present"
        dbsnp = dxpy.DXFile(dbsnp)
        dbsnp_name = dbsnp.describe()['name']

    if input_bam is not None:
        input_bam = dxpy.DXFile(input_bam)
        bam_name = input_bam.describe()['name']

    base_name = input_vcf.describe()['name'].replace(".vcf", '')
    vcf_name = input_vcf.describe()['name']

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_vcf.get_id(), "%s" % vcf_name)
    dxpy.download_dxfile(reference.get_id(), "%s.gz" % ref_name)
    if genes is not None:
        dxpy.download_dxfile(genes.get_id(), "%s" % genes_name)
    if annotation_vcf is not None:
        dxpy.download_dxfile(annotation_vcf.get_id(), "%s" % annotation_name)
    if comparison_vcf is not None:
        dxpy.download_dxfile(comparison_vcf.get_id(), "%s" % comparison_name)
    if dbsnp is not None:
        dxpy.download_dxfile(dbsnp.get_id(), "%s" % dbsnp_name)
    if input_bam is not None:
        dxpy.download_dxfile(input_bam.get_id(), "%s" % bam_name)

    # Fill in your application code here.

    subprocess.check_call("gzip -d %s.gz" % ref_name, shell=True)

    if genes is not None:
        subprocess.check_call("mv %s /snpEff_2_0_5/data/genomes/%s" %
                              (ref_name, ref_name),
                              shell=True)
        genes_file = open("/snpEff_2_0_5/snpEff.config", "a+")
        genes_file.write("\n%s.genome : Custom_species\n" %
                         ref_name.replace(".fa", ""))
        genes_file.close()
        subprocess.check_call("mkdir /snpEff_2_0_5/data/%s" %
                              ref_name.replace(".fa", ""),
                              shell=True)
        subprocess.check_call(
            "mv %s /snpEff_2_0_5/data/%s/%s" %
            (genes_name, ref_name.replace(".fa", ""), genes_name),
            shell=True)
        #Build the snpeff database
        subprocess.check_call(
            "java -Xmx4g -jar /snpEff_2_0_5/snpEff.jar build -c /snpEff_2_0_5/snpEff.config %s %s"
            % (snpeff_build_params, ref_name.replace(".fa", "")),
            shell=True)
        # Produce snpeff annotation file
        subprocess.check_call(
            "java -Xmx4g -jar /snpEff_2_0_5/snpEff.jar -c /snpEff_2_0_5/snpEff.config %s %s %s > snpeff.vcf"
            % (snpeff_annotate_params, ref_name.replace(".fa", ""), vcf_name),
            shell=True)
        ref_name = "/snpEff_2_0_5/data/genomes/%s"

    if dbsnp is not None:
        try:
            subprocess.check_call("tabix -p vcf %s" % dbsnp_name, shell=True)
        except subprocess.CalledProcessError:
            print "Tried tabix indexing the dbsnp file and failed. Proceeding as though it is an uncompressed VCF"

    annotate_command = "java -Xmx4g -jar /opt/jar/GenomeAnalysisTK.jar -T VariantAnnotator -R %s --variant %s -L %s -o %s_annotated.vcf %s" % (
        ref_name, vcf_name, vcf_name, base_name, gatk_annotator_params)
    if dbsnp is not None:
        annotate_command += " --dbsnp %s" % dbsnp_name
    if input_bam is not None:
        annotate_command += " -I %s" % bam_name
    if genes is not None:
        annotate_command += " -A SnpEff --snpEffFile snpeff.vcf"
    if annotation_vcf is not None:
        annotate_command += " -resource %s" % annotation_name
    if comparison_vcf is not None:
        annotate_command += " -comp %s" % comparison_name

    subprocess.check_call(annotate_command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    annotated_variants = dxpy.upload_local_file("%s_annotated.vcf" % base_name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["annotated_variants"] = dxpy.dxlink(annotated_variants)

    return output
Exemple #54
0
def postprocess(indexed_reads, unmapped_reads, reference_tar, bwa_version, samtools_version):

	print "In postprocess with:"

	if samtools_version == "0.1.19":
		samtools = "/usr/local/bin/samtools-0.1.19/samtools"
	elif samtools_version == "1.0":
		samtools = "/usr/local/bin/samtools-1.0/bin/samtools"
	else:
		samtools = "/usr/local/bin/samtools-0.1.19/samtools"

	if bwa_version == "0.7.7":
		bwa = "bwa0.7.7"
	elif bwa_version == "0.7.10":
		bwa = "bwa0.7.10"
	else:
		print "BWA version %s not supported, defaulting to 0.7.7"
		bwa = "bwa0.7.7"

	print "samtools version: %s" %(samtools)
	print "bwa version %s" %(bwa)

	indexed_reads_filenames = []
	unmapped_reads_filenames = []
	for i,reads in enumerate(indexed_reads):
		read_pair_number = i+1
		
		fn = dxpy.describe(reads)['name']
		print "indexed_reads %d: %s" %(read_pair_number, fn)
		indexed_reads_filenames.append(fn)
		dxpy.download_dxfile(reads,fn)

		unmapped = unmapped_reads[i]
		fn = dxpy.describe(unmapped)['name']
		print "unmapped reads %d: %s" %(read_pair_number, fn)
		unmapped_reads_filenames.append(fn)
		dxpy.download_dxfile(unmapped,fn)

	reference_tar_filename = dxpy.describe(reference_tar)['name']
	print "reference_tar: %s" %(reference_tar_filename)
	dxpy.download_dxfile(reference_tar, reference_tar_filename)
	# extract the reference files from the tar
	if reference_tar_filename.endswith('.gz') or reference_tar_filename.endswith('.tgz'):
		tar_command = 'tar -xzvf %s' %(reference_tar_filename)
	else:
		tar_command = 'tar -xvf %s' %(reference_tar_filename)
	print "Unpacking %s" %(reference_tar_filename)
	print subprocess.check_output(shlex.split(tar_command))
	reference_filename = resolve_reference()

	paired_end = len(indexed_reads) == 2

	# rstrip() strips a *set of characters*, not a filename suffix, so remove
	# the fastq extensions explicitly.
	def strip_exts(fn):
		for ext in ('.gz', '.fastq', '.fq'):
			if fn.endswith(ext):
				fn = fn[:-len(ext)]
		return fn
	if paired_end:
		reads_basename = (strip_exts(unmapped_reads_filenames[0]) +
						  strip_exts(unmapped_reads_filenames[1]))
	else:
		reads_basename = strip_exts(unmapped_reads_filenames[0])
	raw_bam_filename = '%s.raw.srt.bam' %(reads_basename)
	raw_bam_mapstats_filename = '%s.raw.srt.bam.flagstat.qc' %(reads_basename)

	if paired_end:
		reads1_filename = indexed_reads_filenames[0]
		reads2_filename = indexed_reads_filenames[1]
		unmapped_reads1_filename = unmapped_reads_filenames[0]
		unmapped_reads2_filename = unmapped_reads_filenames[1]
		raw_sam_filename = reads_basename + ".raw.sam"
		badcigar_filename = "badreads.tmp"
		steps = [ "%s sampe -P %s %s %s %s %s" %(bwa, reference_filename, reads1_filename, reads2_filename, unmapped_reads1_filename, unmapped_reads2_filename),
				  "tee %s" %(raw_sam_filename),
				  r"""awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t" ; }'""",
				  "sort",
				  "uniq" ]
		out,err = run_pipe(steps,badcigar_filename)
		if err:
			print "sampe error: %s" %(err)

		steps = [ "cat %s" %(raw_sam_filename),
				  "grep -v -F -f %s" %(badcigar_filename)]
	else: #single end
		reads_filename = indexed_reads_filenames[0]
		unmapped_reads_filename = unmapped_reads_filenames[0]
		steps = [ "%s samse %s %s %s" %(bwa, reference_filename, reads_filename, unmapped_reads_filename) ]
	if samtools_version == "0.1.9":
		steps.extend(["%s view -Su -" %(samtools),
					  "%s sort - %s" %(samtools, raw_bam_filename.rstrip('.bam')) ]) # samtools adds .bam
	else:
		steps.extend(["%s view -@%d -Su -" %(samtools, cpu_count()),
					  "%s sort -@%d - %s" %(samtools, cpu_count(), raw_bam_filename.rstrip('.bam')) ]) # samtools adds .bam
	print "Running pipe:"
	print steps
	out,err = run_pipe(steps)

	if out:
		print "samtools output: %s" %(out)
	if err:
		print "samtools error: %s" %(err)

	with open(raw_bam_mapstats_filename, 'w') as fh:
		subprocess.check_call(shlex.split("%s flagstat %s" \
			%(samtools, raw_bam_filename)), stdout=fh)

	print subprocess.check_output('ls', shell=True)
	mapped_reads = dxpy.upload_local_file(raw_bam_filename)
	mapping_statistics = dxpy.upload_local_file(raw_bam_mapstats_filename)

	output = { "mapped_reads": dxpy.dxlink(mapped_reads),
			   "mapping_statistics": dxpy.dxlink(mapping_statistics) }
	print "Returning from post with output: %s" %(output)
	return output
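
# run_pipe is a shared helper that is not shown in this excerpt. Judging from
# its call sites (a list of shell commands joined into a pipeline, an optional
# output file, and an (out, err) return), a minimal sketch might be:
import shlex
import subprocess

def run_pipe(steps, outfile=None):
    # Chain the commands as "step1 | step2 | ... [> outfile]" and return the
    # final process's (stdout, stderr). Sketch only, not the original helper.
    procs = []
    prev_out = None
    for i, step in enumerate(steps):
        last = (i == len(steps) - 1)
        stdout = open(outfile, 'w') if (last and outfile) else subprocess.PIPE
        stderr = subprocess.PIPE if last else None
        proc = subprocess.Popen(shlex.split(step), stdin=prev_out,
                                stdout=stdout, stderr=stderr)
        if prev_out is not None:
            prev_out.close()  # let SIGPIPE reach the upstream command
        prev_out = proc.stdout
        procs.append(proc)
    out, err = procs[-1].communicate()
    return out, err
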
def main(input_bam, paired_end, spp_version):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    input_bam_file = dxpy.DXFile(input_bam)

    input_bam_filename = input_bam_file.name
    # rstrip() strips a character set, not a suffix; slice off '.bam' instead
    input_bam_basename = input_bam_filename[:-len('.bam')] \
        if input_bam_filename.endswith('.bam') else input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    # ===================
    # Create tagAlign file
    # ===================
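    # bamToBed emits one BED line per alignment; the awk step blanks the read
    # name and fixes the score so every tagAlign record is uniform.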

    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename), "gzip -cn"
    ],
                               outfile=final_TA_filename)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        # need namesorted bam to make BEDPE
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        samtools_sort_command = \
            "samtools sort -n %s %s" % (input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(samtools_sort_command)
        subprocess.check_output(shlex.split(samtools_sort_command))
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" %
            (final_nmsrt_bam_filename), "gzip -cn"
        ],
                                   outfile=final_BEDPE_filename)

    # =================================
    # Subsample tagAlign file
    # ================================
    logger.info("Intermediate tA md5: %s" %
                (common.md5(intermediate_TA_filename)))
    NREADS = 15000000
    if paired_end:
        end_infix = 'MATE1'
    else:
        end_infix = 'SE'
    subsampled_TA_filename = \
        input_bam_basename + \
        ".filt.nodup.sample.%d.%s.tagAlign.gz" % (NREADS/1000000, end_infix)
    steps = [
        'grep -v "chrM" %s' % (intermediate_TA_filename),
        'shuf -n %d --random-source=%s' % (NREADS, intermediate_TA_filename)
    ]
    if paired_end:
        steps.extend([r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'"""])
    steps.extend(['gzip -cn'])
    out, err = common.run_pipe(steps, outfile=subsampled_TA_filename)
    logger.info("Subsampled tA md5: %s" % (common.md5(subsampled_TA_filename)))

    # Calculate Cross-correlation QC scores
    CC_scores_filename = subsampled_TA_filename + ".cc.qc"
    CC_plot_filename = subsampled_TA_filename + ".cc.plot.pdf"

    # CC_SCORE FILE format
    # Filename <tab>
    # numReads <tab>
    # estFragLen <tab>
    # corr_estFragLen <tab>
    # PhantomPeak <tab>
    # corr_phantomPeak <tab>
    # argmin_corr <tab>
    # min_corr <tab>
    # phantomPeakCoef <tab>
    # relPhantomPeakCoef <tab>
    # QualityTag
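
    # SPP_VERSION_MAP is defined elsewhere in this applet; presumably it maps
    # version strings to tarball paths, e.g.
    # {'1.10.1': '/phantompeakqualtools/spp_1.10.1.tar.gz'} (assumed values).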

    spp_tarball = SPP_VERSION_MAP.get(spp_version)
    assert spp_tarball, "spp version %s is not supported" % (spp_version)
    # install spp
    subprocess.check_output(shlex.split('R CMD INSTALL %s' % (spp_tarball)))
    # run spp
    run_spp_command = '/phantompeakqualtools/run_spp_nodups.R'
    out, err = common.run_pipe([
        "Rscript %s -c=%s -p=%d -filtchr=chrM -savp=%s -out=%s" %
        (run_spp_command, subsampled_TA_filename, cpu_count(),
         CC_plot_filename, CC_scores_filename)
    ])
    out, err = common.run_pipe(
        [r"""sed -r  's/,[^\t]+//g' %s""" % (CC_scores_filename)],
        outfile="temp")
    out, err = common.run_pipe(["mv temp %s" % (CC_scores_filename)])

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    CC_scores_file = dxpy.upload_local_file(CC_scores_filename)
    CC_plot_file = dxpy.upload_local_file(CC_plot_filename)
    xcor_qc = xcor_parse(CC_scores_filename)

    # Return the outputs
    output = {
        "tagAlign_file": dxpy.dxlink(tagAlign_file),
        "CC_scores_file": dxpy.dxlink(CC_scores_file),
        "CC_plot_file": dxpy.dxlink(CC_plot_file),
        "paired_end": paired_end,
        "RSC": float(xcor_qc.get('relPhantomPeakCoef')),
        "NSC": float(xcor_qc.get('phantomPeakCoef')),
        "est_frag_len": float(xcor_qc.get('estFragLen'))
    }
    if paired_end:
        output.update({"BEDPE_file": dxpy.dxlink(BEDPE_file)})
    return output
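
# xcor_parse is not shown in this excerpt. Given the CC_SCORE file layout
# documented above, a plausible sketch that yields the keys used in the
# output (estFragLen, phantomPeakCoef, relPhantomPeakCoef) is:
def xcor_parse(fname):
    # Map the 11 tab-separated CC_SCORE columns to their documented names.
    fields = ['Filename', 'numReads', 'estFragLen', 'corr_estFragLen',
              'PhantomPeak', 'corr_phantomPeak', 'argmin_corr', 'min_corr',
              'phantomPeakCoef', 'relPhantomPeakCoef', 'QualityTag']
    with open(fname) as fh:
        values = fh.read().strip().split('\t')
    return dict(zip(fields, values))
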
def main(experiment,
         reps_peaks,
         r1pr_peaks,
         r2pr_peaks,
         pooledpr_peaks,
         chrom_sizes,
         as_file,
         blacklist=None):

    #TODO for now just taking the peak files.  This applet should actually call IDR instead of
    #putting that in the workflow populator script

    # Initialize the data object inputs on the platform into
    # dxpy.DXDataObject instances.

    reps_peaks_file = dxpy.DXFile(reps_peaks)
    r1pr_peaks_file = dxpy.DXFile(r1pr_peaks)
    r2pr_peaks_file = dxpy.DXFile(r2pr_peaks)
    pooledpr_peaks_file = dxpy.DXFile(pooledpr_peaks)
    chrom_sizes_file = dxpy.DXFile(chrom_sizes)
    as_file_file = dxpy.DXFile(as_file)
    if blacklist is not None:
        blacklist_file = dxpy.DXFile(blacklist)
        blacklist_filename = 'blacklist_%s' % (blacklist_file.name)
        dxpy.download_dxfile(blacklist_file.get_id(), blacklist_filename)
        blacklist_filename = common.uncompress(blacklist_filename)

    # Download the file inputs to the local file system.

    #Need to prepend something to ensure the local filenames will be unique
    reps_peaks_filename = 'true_%s' % (reps_peaks_file.name)
    r1pr_peaks_filename = 'r1pr_%s' % (r1pr_peaks_file.name)
    r2pr_peaks_filename = 'r2pr_%s' % (r2pr_peaks_file.name)
    pooledpr_peaks_filename = 'pooledpr_%s' % (pooledpr_peaks_file.name)
    chrom_sizes_filename = chrom_sizes_file.name
    as_file_filename = as_file_file.name

    dxpy.download_dxfile(reps_peaks_file.get_id(), reps_peaks_filename)
    dxpy.download_dxfile(r1pr_peaks_file.get_id(), r1pr_peaks_filename)
    dxpy.download_dxfile(r2pr_peaks_file.get_id(), r2pr_peaks_filename)
    dxpy.download_dxfile(pooledpr_peaks_file.get_id(), pooledpr_peaks_filename)
    dxpy.download_dxfile(chrom_sizes_file.get_id(), chrom_sizes_filename)
    dxpy.download_dxfile(as_file_file.get_id(), as_file_filename)

    print subprocess.check_output('ls -l', shell=True)

    reps_peaks_filename = common.uncompress(reps_peaks_filename)
    r1pr_peaks_filename = common.uncompress(r1pr_peaks_filename)
    r2pr_peaks_filename = common.uncompress(r2pr_peaks_filename)
    pooledpr_peaks_filename = common.uncompress(pooledpr_peaks_filename)

    Nt = common.count_lines(reps_peaks_filename)
    print "%d peaks from true replicates" % (Nt)
    N1 = common.count_lines(r1pr_peaks_filename)
    print "%d peaks from rep1 self-pseudoreplicates" % (N1)
    N2 = common.count_lines(r2pr_peaks_filename)
    print "%d peaks from rep2 self-pseudoreplicates" % (N2)
    Np = common.count_lines(pooledpr_peaks_filename)
    print "%d peaks from pooled pseudoreplicates" % (Np)

    conservative_set_filename = '%s_final_conservative.narrowPeak' % (
        experiment)
    if blacklist is not None:
        blacklist_filter(reps_peaks_filename, conservative_set_filename,
                         blacklist_filename)
    else:
        conservative_set_filename = reps_peaks_filename
    Ncb = common.count_lines(conservative_set_filename)
    print "%d peaks blacklisted from the conservative set" % (Nt - Ncb)

    if Nt >= Np:
        peaks_to_filter_filename = reps_peaks_filename
        No = Nt
    else:
        peaks_to_filter_filename = pooledpr_peaks_filename
        No = Np

    optimal_set_filename = '%s_final_optimal.narrowPeak' % (experiment)
    if blacklist is not None:
        blacklist_filter(peaks_to_filter_filename, optimal_set_filename,
                         blacklist_filename)
    else:
        optimal_set_filename = peaks_to_filter_filename
    Nob = common.count_lines(optimal_set_filename)
    print "%d peaks blacklisted from the optimal set" % (No - Nob)

    rescue_ratio = float(max(Np, Nt)) / float(min(Np, Nt))
    self_consistency_ratio = float(max(N1, N2)) / float(min(N1, N2))
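
    # Reproducibility convention: both ratios <= 2 passes, exactly one > 2 is
    # borderline, and both > 2 fail.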

    if rescue_ratio > 2 and self_consistency_ratio > 2:
        reproducibility = 'fail'
    elif rescue_ratio > 2 or self_consistency_ratio > 2:
        reproducibility = 'borderline'
    else:
        reproducibility = 'pass'

    output = {}

    #bedtobigbed often fails, so skip creating the bb if it does
    conservative_set_bb_filename = common.bed2bb(conservative_set_filename,
                                                 chrom_sizes_filename,
                                                 as_file_filename)
    optimal_set_bb_filename = common.bed2bb(optimal_set_filename,
                                            chrom_sizes_filename,
                                            as_file_filename)
    if conservative_set_bb_filename:
        conservative_set_bb_output = dxpy.upload_local_file(
            conservative_set_bb_filename)
        output.update(
            {"conservative_set_bb": dxpy.dxlink(conservative_set_bb_output)})
    if optimal_set_bb_filename:
        optimal_set_bb_output = dxpy.upload_local_file(optimal_set_bb_filename)
        output.update({"optimal_set_bb": dxpy.dxlink(optimal_set_bb_output)})

    output.update({
        "Nt":
        Nt,
        "N1":
        N1,
        "N2":
        N2,
        "Np":
        Np,
        "conservative_set":
        dxpy.dxlink(
            dxpy.upload_local_file(
                common.compress(conservative_set_filename))),
        "optimal_set":
        dxpy.dxlink(
            dxpy.upload_local_file(common.compress(optimal_set_filename))),
        "rescue_ratio":
        rescue_ratio,
        "self_consistency_ratio":
        self_consistency_ratio,
        "reproducibility_test":
        reproducibility
    })

    logging.info("Exiting with output: %s", output)
    return output
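
# blacklist_filter is defined elsewhere. One common way to implement it,
# assuming bedtools is available on the worker, is to drop every peak that
# overlaps a blacklisted region; the real pipeline may use different flags.
import subprocess

def blacklist_filter(peaks_fn, out_fn, blacklist_fn):
    # -v keeps only records in -a with no overlap in -b.
    with open(out_fn, 'w') as out_fh:
        subprocess.check_call(
            ['bedtools', 'intersect', '-v', '-a', peaks_fn, '-b',
             blacklist_fn],
            stdout=out_fh)
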
Exemple #57
0
def main(rep1_ta, rep2_ta, ctl1_ta, ctl2_ta, rep1_xcor, rep2_xcor,
         rep1_paired_end, rep2_paired_end, chrom_sizes, genomesize,
         narrowpeak_as, gappedpeak_as, broadpeak_as):

    if rep1_paired_end != rep2_paired_end:
        raise ValueError('Mixed PE/SE not supported (yet)')
    paired_end = rep1_paired_end
    # The following lines initialize the data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name
    rep1_xcor_filename = rep1_xcor_file.name
    rep2_xcor_filename = rep2_xcor_file.name

    ntags_rep1 = count_lines(rep1_ta_filename)
    ntags_rep2 = count_lines(rep2_ta_filename)
    ntags_ctl1 = count_lines(ctl1_ta_filename)
    ntags_ctl2 = count_lines(ctl2_ta_filename)

    for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename),
                              (ntags_rep2, 'replicate 2', rep2_ta_filename),
                              (ntags_ctl1, 'control 1', ctl1_ta_filename),
                              (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        print "Found %d tags in %s file %s" % (n, name, filename)

    print subprocess.check_output('ls -l',
                                  shell=True,
                                  stderr=subprocess.STDOUT)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = pool_applet.run({"inputs": [rep1_ta, rep2_ta]})
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")

    rep1_control = ctl1_ta  #default.  May be changed later.
    rep2_control = ctl2_ta  #default.  May be changed later.

    if unary_control:
        print "Only one control supplied.  Using it for both replicate 1 and 2 and for the pool."
        control_for_pool = rep1_control
    else:
        pool_controls_subjob = pool_applet.run({"inputs": [ctl1_ta, ctl2_ta]})
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        #always use the pooled controls for the pool
        control_for_pool = pooled_controls

        #use the pooled controls for the reps depending on the ratio of rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            print "Number of reads in controls differ by > factor of %f. Using pooled controls." % (
                ratio_cutoff)
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                print "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                rep1_control = pooled_controls
            elif ntags_ctl2 < ntags_rep2:
                print "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                rep2_control = pooled_controls
            else:
                print "Using distinct controls for replicate 1 and 2."

    pseudoreplicator_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pseudoreplicator',
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
    rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})

    pool_pr1_subjob = pool_applet.run({
        "inputs": [
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep2_pr_subjob.get_output_ref("pseudoreplicate1")
        ]
    })
    pool_pr2_subjob = pool_applet.run({
        "inputs": [
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep2_pr_subjob.get_output_ref("pseudoreplicate2")
        ]
    })

    pooled_replicates_xcor_subjob = xcor_only(pooled_replicates, paired_end)
    rep1_pr1_xcor_subjob = xcor_only(
        rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
    rep1_pr2_xcor_subjob = xcor_only(
        rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
    rep2_pr1_xcor_subjob = xcor_only(
        rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
    rep2_pr2_xcor_subjob = xcor_only(
        rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
    pool_pr1_xcor_subjob = xcor_only(pool_pr1_subjob.get_output_ref("pooled"),
                                     paired_end)
    pool_pr2_xcor_subjob = xcor_only(pool_pr2_subjob.get_output_ref("pooled"),
                                     paired_end)

    common_args = {
        'chrom_sizes': chrom_sizes,
        'genomesize': genomesize,
        'narrowpeak_as': narrowpeak_as,
        'gappedpeak_as': gappedpeak_as,
        'broadpeak_as': broadpeak_as
    }

    common_args.update({'prefix': 'r1'})
    rep1_peaks_subjob = macs2(rep1_ta, rep1_control, rep1_xcor, **common_args)

    common_args.update({'prefix': 'r2'})
    rep2_peaks_subjob = macs2(rep2_ta, rep2_control, rep2_xcor, **common_args)

    common_args.update({'prefix': 'pool'})
    pooled_peaks_subjob = macs2(
        pooled_replicates, control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        **common_args)

    common_args.update({'prefix': 'r1pr1'})
    rep1pr1_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate1"), rep1_control,
        rep1_pr1_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'r1pr2'})
    rep1pr2_peaks_subjob = macs2(
        rep1_pr_subjob.get_output_ref("pseudoreplicate2"), rep1_control,
        rep1_pr2_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'r2pr1'})
    rep2pr1_peaks_subjob = macs2(
        rep2_pr_subjob.get_output_ref("pseudoreplicate1"), rep2_control,
        rep2_pr1_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'r2pr2'})
    rep2pr2_peaks_subjob = macs2(
        rep2_pr_subjob.get_output_ref("pseudoreplicate2"), rep2_control,
        rep2_pr2_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'ppr1'})
    pooledpr1_peaks_subjob = macs2(
        pool_pr1_subjob.get_output_ref("pooled"), control_for_pool,
        pool_pr1_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    common_args.update({'prefix': 'ppr2'})
    pooledpr2_peaks_subjob = macs2(
        pool_pr2_subjob.get_output_ref("pooled"), control_for_pool,
        pool_pr2_xcor_subjob.get_output_ref("CC_scores_file"), **common_args)

    output = {
        'rep1_narrowpeaks':
        rep1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1_gappedpeaks':
        rep1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1_broadpeaks':
        rep1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1_narrowpeaks_bb':
        rep1_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep1_gappedpeaks_bb':
        rep1_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep1_broadpeaks_bb':
        rep1_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep1_fc_signal':
        rep1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1_pvalue_signal':
        rep1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep2_narrowpeaks':
        rep2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2_gappedpeaks':
        rep2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2_broadpeaks':
        rep2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2_narrowpeaks_bb':
        rep2_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'rep2_gappedpeaks_bb':
        rep2_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'rep2_broadpeaks_bb':
        rep2_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'rep2_fc_signal':
        rep2_peaks_subjob.get_output_ref("fc_signal"),
        'rep2_pvalue_signal':
        rep2_peaks_subjob.get_output_ref("pvalue_signal"),
        'pooled_narrowpeaks':
        pooled_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooled_gappedpeaks':
        pooled_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooled_broadpeaks':
        pooled_peaks_subjob.get_output_ref("broadpeaks"),
        'pooled_narrowpeaks_bb':
        pooled_peaks_subjob.get_output_ref("narrowpeaks_bb"),
        'pooled_gappedpeaks_bb':
        pooled_peaks_subjob.get_output_ref("gappedpeaks_bb"),
        'pooled_broadpeaks_bb':
        pooled_peaks_subjob.get_output_ref("broadpeaks_bb"),
        'pooled_fc_signal':
        pooled_peaks_subjob.get_output_ref("fc_signal"),
        'pooled_pvalue_signal':
        pooled_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr1_narrowpeaks':
        rep1pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr1_gappedpeaks':
        rep1pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr1_broadpeaks':
        rep1pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr1_fc_signal':
        rep1pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr1_pvalue_signal':
        rep1pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep1pr2_narrowpeaks':
        rep1pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep1pr2_gappedpeaks':
        rep1pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep1pr2_broadpeaks':
        rep1pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep1pr2_fc_signal':
        rep1pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep1pr2_pvalue_signal':
        rep1pr2_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep2pr1_narrowpeaks':
        rep2pr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2pr1_gappedpeaks':
        rep2pr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2pr1_broadpeaks':
        rep2pr1_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2pr1_fc_signal':
        rep2pr1_peaks_subjob.get_output_ref("fc_signal"),
        'rep2pr1_pvalue_signal':
        rep2pr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'rep2pr2_narrowpeaks':
        rep2pr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'rep2pr2_gappedpeaks':
        rep2pr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'rep2pr2_broadpeaks':
        rep2pr2_peaks_subjob.get_output_ref("broadpeaks"),
        'rep2pr2_fc_signal':
        rep2pr2_peaks_subjob.get_output_ref("fc_signal"),
        'rep2pr2_pvalue_signal':
        rep2pr2_peaks_subjob.get_output_ref("pvalue_signal"),
        'pooledpr1_narrowpeaks':
        pooledpr1_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooledpr1_gappedpeaks':
        pooledpr1_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooledpr1_broadpeaks':
        pooledpr1_peaks_subjob.get_output_ref("broadpeaks"),
        'pooledpr1_fc_signal':
        pooledpr1_peaks_subjob.get_output_ref("fc_signal"),
        'pooledpr1_pvalue_signal':
        pooledpr1_peaks_subjob.get_output_ref("pvalue_signal"),
        'pooledpr2_narrowpeaks':
        pooledpr2_peaks_subjob.get_output_ref("narrowpeaks"),
        'pooledpr2_gappedpeaks':
        pooledpr2_peaks_subjob.get_output_ref("gappedpeaks"),
        'pooledpr2_broadpeaks':
        pooledpr2_peaks_subjob.get_output_ref("broadpeaks"),
        'pooledpr2_fc_signal':
        pooledpr2_peaks_subjob.get_output_ref("fc_signal"),
        'pooledpr2_pvalue_signal':
        pooledpr2_peaks_subjob.get_output_ref("pvalue_signal")
    }

    return output
Exemple #58
0
def main(base_name,
         At,
         An,
         Bt,
         Bn,
         Ct,
         Cn,
         Dt,
         Dn,
         Et=None,
         En=None,
         Ft=None,
         Fn=None,
         Gt=None,
         Gn=None,
         Ht=None,
         Hn=None,
         It=None,
         In=None,
         Jt=None,
         Jn=None,
         Kt=None,
         Kn=None,
         Lt=None,
         Ln=None,
         Mt=None,
         Mn=None,
         Nt=None,
         Nn=None,
         Ot=None,
         On=None,
         Pt=None,
         Pn=None,
         Qt=None,
         Qn=None,
         Rt=None,
         Rn=None,
         St=None,
         Sn=None,
         Tt=None,
         Tn=None):

    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    At = dxpy.DXFile(At)
    Bt = dxpy.DXFile(Bt)
    Ct = dxpy.DXFile(Ct)
    Dt = dxpy.DXFile(Dt)
    # The remaining TPM inputs are optional; wrap each one that was supplied.
    (Et, Ft, Gt, Ht, It, Jt, Kt, Lt, Mt, Nt, Ot, Pt, Qt, Rt, St, Tt) = [
        dxpy.DXFile(x) if x is not None else None
        for x in (Et, Ft, Gt, Ht, It, Jt, Kt, Lt, Mt, Nt, Ot, Pt, Qt, Rt, St,
                  Tt)
    ]

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(At.get_id(), "At")

    dxpy.download_dxfile(Bt.get_id(), "Bt")

    dxpy.download_dxfile(Ct.get_id(), "Ct")

    dxpy.download_dxfile(Dt.get_id(), "Dt")

    if Et is not None:
        dxpy.download_dxfile(Et.get_id(), "Et")

    if Ft is not None:
        dxpy.download_dxfile(Ft.get_id(), "Ft")

    if Gt is not None:
        dxpy.download_dxfile(Gt.get_id(), "Gt")

    if Ht is not None:
        dxpy.download_dxfile(Ht.get_id(), "Ht")

    if It is not None:
        dxpy.download_dxfile(It.get_id(), "It")

    if Jt is not None:
        dxpy.download_dxfile(Jt.get_id(), "Jt")

    if Kt is not None:
        dxpy.download_dxfile(Kt.get_id(), "Kt")

    if Lt is not None:
        dxpy.download_dxfile(Lt.get_id(), "Lt")

    if Mt is not None:
        dxpy.download_dxfile(Mt.get_id(), "Mt")

    if Nt is not None:
        dxpy.download_dxfile(Nt.get_id(), "Nt")

    if Ot is not None:
        dxpy.download_dxfile(Ot.get_id(), "Ot")

    if Pt is not None:
        dxpy.download_dxfile(Pt.get_id(), "Pt")

    if Qt is not None:
        dxpy.download_dxfile(Qt.get_id(), "Qt")

    if Rt is not None:
        dxpy.download_dxfile(Rt.get_id(), "Rt")

    if St is not None:
        dxpy.download_dxfile(St.get_id(), "St")

    if Tt is not None:
        dxpy.download_dxfile(Tt.get_id(), "Tt")

    # Fill in your application code here.

    total_t = ['At', 'Bt', 'Ct', 'Dt']
    total_t += [local_name for local_name, handle in optional_handles
                if handle is not None]

    total_n = [
        An, Bn, Cn, Dn, En, Fn, Gn, Hn, In, Jn, Kn, Ln, Mn, Nn, On, Pn, Qn, Rn,
        Sn, Tn
    ]
    total_n = [x for x in total_n if x is not None]

    TPM = list(map(read_TPM, total_t))
    cts = generate_table(TPM, total_n)
    meta = generate_meta(list(total_n))
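
    # Drive DESeq2 through rpy2: build the dataset from the count table and
    # sample metadata, fit the model, apply the variance-stabilizing
    # transform, then plot and export the PCA.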

    with localconverter(ro.default_converter + pandas2ri.converter):
        r.assign("cts", cts)
        r.assign("meta", meta)
        r('dds <- DESeqDataSetFromMatrix(countData = cts,\
                          colData = meta,\
                          design = ~ condition)')
        r('dds <- DESeq(dds)')
        r('vsd <- vst(dds, blind=FALSE)')
        r('plotPCA(vsd, intgroup=c("condition"))')
        string = 'ggsave("' + base_name + '_PCA.pdf")'
        r(string)
        r('PCA_information <- plotPCA(vsd, intgroup=c("condition"),returnData=TRUE)'
          )
        string = 'write.csv(as.data.frame(PCA_information),file="' + base_name + '_table_PCA.csv' + '")'
        r(string)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    plot = dxpy.upload_local_file(base_name + "_PCA.pdf")
    csv = dxpy.upload_local_file(base_name + "_table_PCA.csv")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["plot"] = dxpy.dxlink(plot)
    output["csv"] = dxpy.dxlink(csv)

    return output
def main(rep1_ta,
         rep2_ta,
         ctl1_ta,
         ctl2_ta,
         rep1_xcor,
         rep2_xcor,
         npeaks,
         nodups,
         rep1_paired_end,
         rep2_paired_end,
         chrom_sizes,
         as_file=None,
         idr_peaks=False):

    if rep1_paired_end != rep2_paired_end:
        raise ValueError('Mixed PE/SE not supported (yet)')
    paired_end = rep1_paired_end
    # The following lines initialize the data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.

    rep1_ta_file = dxpy.DXFile(rep1_ta)
    rep2_ta_file = dxpy.DXFile(rep2_ta)
    unary_control = ctl1_ta == ctl2_ta
    ctl1_ta_file = dxpy.DXFile(ctl1_ta)
    ctl2_ta_file = dxpy.DXFile(ctl2_ta)
    rep1_xcor_file = dxpy.DXFile(rep1_xcor)
    rep2_xcor_file = dxpy.DXFile(rep2_xcor)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(rep1_ta_file.get_id(), rep1_ta_file.name)
    dxpy.download_dxfile(rep2_ta_file.get_id(), rep2_ta_file.name)
    dxpy.download_dxfile(ctl1_ta_file.get_id(), ctl1_ta_file.name)
    dxpy.download_dxfile(ctl2_ta_file.get_id(), ctl2_ta_file.name)
    dxpy.download_dxfile(rep1_xcor_file.get_id(), rep1_xcor_file.name)
    dxpy.download_dxfile(rep2_xcor_file.get_id(), rep2_xcor_file.name)

    rep1_ta_filename = rep1_ta_file.name
    rep2_ta_filename = rep2_ta_file.name
    ctl1_ta_filename = ctl1_ta_file.name
    ctl2_ta_filename = ctl2_ta_file.name
    rep1_xcor_filename = rep1_xcor_file.name
    rep2_xcor_filename = rep2_xcor_file.name

    ntags_rep1 = count_lines(rep1_ta_filename)
    ntags_rep2 = count_lines(rep2_ta_filename)
    ntags_ctl1 = count_lines(ctl1_ta_filename)
    ntags_ctl2 = count_lines(ctl2_ta_filename)

    for n, name, filename in [(ntags_rep1, 'replicate 1', rep1_ta_filename),
                              (ntags_rep2, 'replicate 2', rep2_ta_filename),
                              (ntags_ctl1, 'control 1', ctl1_ta_filename),
                              (ntags_ctl2, 'control 2', ctl2_ta_filename)]:
        print "Found %d tags in %s file %s" % (n, name, filename)

    print subprocess.check_output('ls -l',
                                  shell=True,
                                  stderr=subprocess.STDOUT)

    pool_applet = dxpy.find_one_data_object(classname='applet',
                                            name='pool',
                                            project=dxpy.PROJECT_CONTEXT_ID,
                                            zero_ok=False,
                                            more_ok=False,
                                            return_handler=True)
    pool_replicates_subjob = pool_applet.run({"inputs": [rep1_ta, rep2_ta]})
    pooled_replicates = pool_replicates_subjob.get_output_ref("pooled")
    pooled_replicates_xcor_subjob = xcor_only(pooled_replicates, paired_end)

    rep1_control = ctl1_ta  #default.  May be changed later.
    rep2_control = ctl2_ta  #default.  May be changed later.

    if unary_control:
        print "Only one control supplied.  Using it for both replicate 1 and 2 and for the pool."
        control_for_pool = rep1_control
    else:
        pool_controls_subjob = pool_applet.run({"inputs": [ctl1_ta, ctl2_ta]})
        pooled_controls = pool_controls_subjob.get_output_ref("pooled")
        #always use the pooled controls for the pool
        control_for_pool = pooled_controls

        #use the pooled controls for the reps depending on the ratio of rep to control reads
        ratio_ctl_reads = float(ntags_ctl1) / float(ntags_ctl2)
        if ratio_ctl_reads < 1:
            ratio_ctl_reads = 1 / ratio_ctl_reads
        ratio_cutoff = 1.2
        if ratio_ctl_reads > ratio_cutoff:
            print "Number of reads in controls differ by > factor of %f. Using pooled controls." % (
                ratio_cutoff)
            rep1_control = pooled_controls
            rep2_control = pooled_controls
        else:
            if ntags_ctl1 < ntags_rep1:
                print "Fewer reads in control replicate 1 than experiment replicate 1.  Using pooled controls for replicate 1."
                rep1_control = pooled_controls
            elif ntags_ctl2 < ntags_rep2:
                print "Fewer reads in control replicate 2 than experiment replicate 2.  Using pooled controls for replicate 2."
                rep2_control = pooled_controls
            else:
                print "Using distinct controls for replicate 1 and 2."

    rep1_peaks_subjob = spp(rep1_ta,
                            rep1_control,
                            rep1_xcor,
                            chrom_sizes=chrom_sizes,
                            bigbed=True,
                            as_file=as_file)

    rep2_peaks_subjob = spp(rep2_ta,
                            rep2_control,
                            rep2_xcor,
                            chrom_sizes=chrom_sizes,
                            bigbed=True,
                            as_file=as_file)

    pooled_peaks_subjob = spp(
        pooled_replicates,
        control_for_pool,
        pooled_replicates_xcor_subjob.get_output_ref("CC_scores_file"),
        chrom_sizes=chrom_sizes,
        bigbed=True,
        as_file=as_file)

    output = {
        'rep1_peaks': rep1_peaks_subjob.get_output_ref("peaks"),
        'rep1_peaks_bb': rep1_peaks_subjob.get_output_ref("peaks_bb"),
        'rep1_xcor_plot': rep1_peaks_subjob.get_output_ref("xcor_plot"),
        'rep1_xcor_scores': rep1_peaks_subjob.get_output_ref("xcor_scores"),
        'rep2_peaks': rep2_peaks_subjob.get_output_ref("peaks"),
        'rep2_peaks_bb': rep2_peaks_subjob.get_output_ref("peaks_bb"),
        'rep2_xcor_plot': rep2_peaks_subjob.get_output_ref("xcor_plot"),
        'rep2_xcor_scores': rep2_peaks_subjob.get_output_ref("xcor_scores"),
        'pooled_peaks': pooled_peaks_subjob.get_output_ref("peaks"),
        'pooled_peaks_bb': pooled_peaks_subjob.get_output_ref("peaks_bb"),
        'pooled_xcor_plot': pooled_peaks_subjob.get_output_ref("xcor_plot"),
        'pooled_xcor_scores': pooled_peaks_subjob.get_output_ref("xcor_scores")
    }

    if idr_peaks:  #also call peaks on pseudoreplicates for IDR
        pseudoreplicator_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pseudoreplicator',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)

        rep1_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep1_ta})
        rep2_pr_subjob = pseudoreplicator_applet.run({"input_tags": rep2_ta})

        pool_pr1_subjob = pool_applet.run({
            "inputs": [
                rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
                rep2_pr_subjob.get_output_ref("pseudoreplicate1")
            ]
        })

        pool_pr2_subjob = pool_applet.run({
            "inputs": [
                rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
                rep2_pr_subjob.get_output_ref("pseudoreplicate2")
            ]
        })

        rep1_pr1_xcor_subjob = xcor_only(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
        rep1_pr2_xcor_subjob = xcor_only(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
        rep2_pr1_xcor_subjob = xcor_only(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"), paired_end)
        rep2_pr2_xcor_subjob = xcor_only(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"), paired_end)
        pool_pr1_xcor_subjob = xcor_only(
            pool_pr1_subjob.get_output_ref("pooled"), paired_end)
        pool_pr2_xcor_subjob = xcor_only(
            pool_pr2_subjob.get_output_ref("pooled"), paired_end)

        rep1pr1_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep1_control,
            rep1_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False)

        rep1pr2_peaks_subjob = spp(
            rep1_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep1_control,
            rep1_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False)

        rep2pr1_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate1"),
            rep2_control,
            rep2_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False)

        rep2pr2_peaks_subjob = spp(
            rep2_pr_subjob.get_output_ref("pseudoreplicate2"),
            rep2_control,
            rep2_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False)

        pooledpr1_peaks_subjob = spp(
            pool_pr1_subjob.get_output_ref("pooled"),
            control_for_pool,
            pool_pr1_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False)

        pooledpr2_peaks_subjob = spp(
            pool_pr2_subjob.get_output_ref("pooled"),
            control_for_pool,
            pool_pr2_xcor_subjob.get_output_ref("CC_scores_file"),
            chrom_sizes=chrom_sizes,
            bigbed=False)

        output.update({
            'rep1pr1_peaks':
            rep1pr1_peaks_subjob.get_output_ref("peaks"),
            'rep1pr1_xcor_plot':
            rep1pr1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1pr1_xcor_scores':
            rep1pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep1pr2_peaks':
            rep1pr2_peaks_subjob.get_output_ref("peaks"),
            'rep1pr2_xcor_plot':
            rep1pr2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep1pr2_xcor_scores':
            rep1pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr1_peaks':
            rep2pr1_peaks_subjob.get_output_ref("peaks"),
            'rep2pr1_xcor_plot':
            rep2pr1_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2pr1_xcor_scores':
            rep2pr1_peaks_subjob.get_output_ref("xcor_scores"),
            'rep2pr2_peaks':
            rep2pr2_peaks_subjob.get_output_ref("peaks"),
            'rep2pr2_xcor_plot':
            rep2pr2_peaks_subjob.get_output_ref("xcor_plot"),
            'rep2pr2_xcor_scores':
            rep2pr2_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr1_peaks':
            pooledpr1_peaks_subjob.get_output_ref("peaks"),
            'pooledpr1_xcor_plot':
            pooledpr1_peaks_subjob.get_output_ref("xcor_plot"),
            'pooledpr1_xcor_scores':
            pooledpr1_peaks_subjob.get_output_ref("xcor_scores"),
            'pooledpr2_peaks':
            pooledpr2_peaks_subjob.get_output_ref("peaks"),
            'pooledpr2_xcor_plot':
            pooledpr2_peaks_subjob.get_output_ref("xcor_plot"),
            'pooledpr2_xcor_scores':
            pooledpr2_peaks_subjob.get_output_ref("xcor_scores"),
        })

    return output
Exemple #60
0
def main(input_SAM,
         deviations=None,
         histogram_width=None,
         min_percent=None,
         metric_acc_level=None,
         ref=None,
         is_sorted=None,
         stop_after=None):

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.

    dxpy.download_dxfile(input_SAM, "input")
    if ref is not None:
        dxpy.download_dxfile(ref, "ref.fa")

    command = "java -Xmx2g -jar /CollectInsertSizeMetrics.jar"
    command += " INPUT=input"
    command += " OUTPUT=insert_distribution.txt"
    command += " HISTOGRAM_FILE=histogram.pdf"
    if deviations is not None:
        command += " DEVIATIONS=" + str(deviations)
    if histogram_width is not None:
        command += " HISTOGRAM_WIDTH=" + str(histogram_width)
    if min_percent is not None:
        command += " MINIMUM_PCT=" + str(min_percent)
    if metric_acc_level is not None:
        for level in metric_acc_level:
            command += " METRIC_ACCUMULATION_LEVEL=" + str(level)
    if ref is not None:
        command += " REFERENCE_SEQUENCE=ref.fa"
    if is_sorted is not None:
        command += " ASSUME_SORTED=" + ("true" if is_sorted else "false")
    if stop_after is not None:
        command += " STOP_AFTER=" + str(stop_after)

    print "Executing:"
    print command

    # CALL the command here:
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system.  It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.

    histogram = dxpy.upload_local_file("histogram.pdf")
    histogram.rename(
        dxpy.DXFile(input_SAM).describe()['name'] + "_histogram.pdf")
    output_dist = dxpy.upload_local_file("insert_distribution.txt")
    output_dist.rename(
        dxpy.DXFile(input_SAM).describe()['name'] + "_insert_dist.txt")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.

    output = {}
    output["histogram"] = dxpy.dxlink(histogram)
    output["output"] = dxpy.dxlink(output_dist)

    return output