def process(scattered_input, dme_ix, ncpus, reads_root):
    """Align one scattered reads file and upload the BAM and map report.

    The index archive is downloaded as ``index.tgz`` and the reads file
    under its platform name; ``ALIGN_SCRIPT`` is then run on both.

    Args:
        scattered_input: DNAnexus link/ID of the reads file to align.
        dme_ix: DNAnexus link/ID of the index archive.
        ncpus: Value passed to the alignment script as its third argument.
        reads_root: Not used by this function.

    Returns:
        dict with DNAnexus links under "bam_file" and "report_file".

    Raises:
        SystemExit: status 1 when ``ALIGN_SCRIPT`` is not an existing file.
        subprocess.CalledProcessError: when the alignment script fails.
    """
    logger.setLevel(logging.DEBUG if DEBUG else logging.INFO)
    debugging = logger.isEnabledFor(logging.DEBUG)

    index_file = dxpy.DXFile(dme_ix)
    dxpy.download_dxfile(index_file.get_id(), "index.tgz")

    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)
    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    if debugging:
        # Diagnostics only: never let them run (or fail) outside debug mode.
        logger.debug("** DIR: %s", os.listdir('./'))
        logger.debug(subprocess.check_output(['head', name]))

    if not os.path.isfile(ALIGN_SCRIPT):
        # Fatal condition: report it at a level that is always visible.
        logger.error("** Executable %s DOES NOT exist", ALIGN_SCRIPT)
        raise SystemExit(1)
    logger.debug("** Executable %s exists", ALIGN_SCRIPT)

    # An argv list (instead of splitting a formatted string) keeps file
    # names containing whitespace intact.
    align_cmd = [ALIGN_SCRIPT, 'index.tgz', name, str(ncpus), bam_root,
                 'no_stats']
    logger.debug('** command line: %s', ' '.join(align_cmd))
    # stdout of the script is captured and discarded, as before.
    subprocess.check_output(align_cmd)
    logger.info("* === Returned from dname_align_se  ===")

    if debugging:
        logger.debug("** DIR: %s", os.listdir('./'))
        logger.debug("** OUTPUT DIR: %s", os.listdir('output/'))

    bam_name = bam_root + '.bam'
    os.rename(bam_root + '_bismark.bam', bam_name)
    return {
        "bam_file":
        dxpy.dxlink(dxpy.upload_local_file(bam_name)),
        "report_file":
        dxpy.dxlink(
            dxpy.upload_local_file(bam_root + '_bismark_map_report.txt'))
    }
 def download_qc_report(self, download_dir):
     """
     Fetch the QC report PDF from the DNAnexus project onto local disk.

     The report is looked up by the glob "*_QC_Report.pdf" inside the folder
     self.DX_QC_REPORT_FOLDER of project self.dx_project_id. The target
     directory is created first when it does not exist.

     Args:
         download_dir: `str` - Local directory that receives the report.

     Returns:
         `str`. Path of the downloaded report inside download_dir.
     """
     if not os.path.isdir(download_dir):
         os.makedirs(download_dir)

     # The hit is a dict holding the "project" and "id" of the report; per
     # the original note, the search raises dxpy.exceptions.DXSearchError
     # when nothing matches.
     hit = dxpy.find_one_data_object(name="*_QC_Report.pdf",
                                     name_mode="glob",
                                     project=self.dx_project_id,
                                     folder=self.DX_QC_REPORT_FOLDER)
     report = dxpy.DXFile(dxid=hit["id"], project=hit["project"])
     report_name = report.name
     local_path = os.path.join(download_dir, report_name)

     what = "{filename} to {download_dir}.".format(filename=report_name,
                                                   download_dir=download_dir)
     debug_logger.debug("Downloading " + what)
     dxpy.bindings.dxfile_functions.download_dxfile(dxid=report,
                                                    filename=local_path)
     success_logger.info("Downloaded " + what)
     return local_path
def main(input_bam, paired=True, params=''):
    """Convert a BAM/SAM file to FASTQ with SamToFastq.jar and upload it.

    Args:
        input_bam: DNAnexus link/ID of the input alignment file.
        paired: When true, a second FASTQ (``F2=``) is also produced and
            uploaded.
        params: Not used by this function.

    Returns:
        dict with a link under "fastq_file" and, when ``paired`` is true,
        another under "paired_fastq_file".

    Raises:
        subprocess.CalledProcessError: when the java command fails.
    """
    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'],
                                  [".bam", ".BAM", ".sam", ".SAM"])

    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    first_fastq = "%s_1.fastq" % base_name
    second_fastq = "%s_2.fastq" % base_name

    # base_name comes from the uploaded file's name, so it must not be
    # interpreted by a shell: pass an argv list and skip the shell entirely.
    command = ["java", "-Xmx6g", "-jar", "/opt/jar/SamToFastq.jar",
               "INPUT=input.bam", "F=" + first_fastq]
    if paired:
        command.append("F2=" + second_fastq)

    subprocess.check_call(command)

    output = {"fastq_file": dxpy.dxlink(dxpy.upload_local_file(first_fastq))}
    if paired:
        output["paired_fastq_file"] = dxpy.dxlink(
            dxpy.upload_local_file(second_fastq))

    return output
def calc_mismatch_per_cycle_stats(bam_file,
                                  aligner,
                                  output_project,
                                  output_folder,
                                  properties=None):
    """Run /bwa_mismatches on a BAM file and upload the resulting stats.

    Args:
        bam_file: DNAnexus link/ID of the BAM file to analyse.
        aligner: Key into ``ALIGNERS``; the mapped value is passed as ``-m``.
        output_project: Project that receives the uploaded stats file.
        output_folder: Parent folder; the file goes to its ``miscellany``
            subfolder.
        properties: Optional dict of properties for the uploaded file. It is
            copied, so neither the caller's dict nor a shared default is
            modified.

    Returns:
        dict with 'mismatch_per_cycle_stats' (the object returned by
        ``dxpy.upload_local_file``) and 'tools_used' (the list handed to
        ``run_cmd`` for each command).
    """
    tools_used = []
    misc_subfolder = output_folder + '/miscellany'

    # Work on a private copy: the previous ``properties={}`` default was
    # mutated in place and therefore shared between calls.
    upload_properties = dict(properties) if properties else {}
    upload_properties['file_type'] = 'mismatch_stats'

    bam_file = dxpy.DXFile(bam_file)
    bam_filename = bam_file.describe()['name']
    dxpy.download_dxfile(bam_file.get_id(), bam_filename)
    ofn = os.path.splitext(bam_filename)[0] + '.mm_stats'

    # Change permissions
    run_cmd('chmod +x /bwa_mismatches', tools_used)
    cmd = '/bwa_mismatches -o {0} -m {1} {2}'.format(ofn, ALIGNERS[aligner],
                                                     bam_filename)
    run_cmd(cmd, tools_used)

    mismatch_per_cycle_stats = dxpy.upload_local_file(
        filename=ofn,
        project=output_project,
        folder=misc_subfolder,
        properties=upload_properties,
        parents=True)

    return {
        'mismatch_per_cycle_stats': mismatch_per_cycle_stats,
        "tools_used": tools_used
    }
Example #5
0
def main(inputs):
    """Pool several gzipped inputs into one gzipped file and upload it.

    Every input is downloaded under its platform name. The pooled file is
    named after the double-extension-stripped input names joined by '-',
    followed by "_pooled", the inner extension of the last input and ".gz".

    Args:
        inputs: Iterable of DNAnexus links/IDs of the files to pool.

    Returns:
        dict with a link to the pooled file under "pooled".
    """
    local_names = []
    for link in inputs:
        handle = dxpy.DXFile(link)
        local_names.append(handle.name)
        dxpy.download_dxfile(handle.get_id(), handle.name)

    # uses last extension - presumably they are all the same
    inner_ext = splitext(splitext(local_names[-1])[0])[1]
    stems = [splitext(splitext(fn)[0])[0] for fn in local_names]
    pooled_filename = '-'.join(stems) + "_pooled%s.gz" % (inner_ext)

    decompress_all = 'gzip -dc %s' % (' '.join(local_names))
    _stdout, _stderr = run_pipe([decompress_all, 'gzip -c'],
                                outfile=pooled_filename)

    uploaded = dxpy.upload_local_file(pooled_filename)
    return {"pooled": dxpy.dxlink(uploaded)}
Example #6
0
    def test_upload_download_remove(self):
        """Round-trip a generated dataset: upload, download, verify, remove."""
        obj_count = 10
        smallest_size = 50

        # Build the dataset in a scratch directory and upload its contents.
        with NamedTemporaryDirectory(change_dir=True) as staging:
            self.create_dataset(staging, obj_count, smallest_size)
            self.test_dir.upload(['.'])

        # wait for the last object to go to closed state
        last_name = self.get_dataset_obj_names(obj_count)[-1]
        last_path = self.test_dir / last_name
        closing_handle = dxpy.DXFile(dxid=last_path.canonical_resource,
                                     project=last_path.canonical_project)
        closing_handle.wait_on_close(20)

        for obj_name in self.get_dataset_obj_names(obj_count):
            self.assertTrue((self.test_dir / obj_name).exists())

        with NamedTemporaryDirectory(change_dir=True) as landing:
            self.test_dir.download(landing)
            for obj_name in self.get_dataset_obj_names(obj_count):
                self.assertCorrectObjectContents(obj_name, obj_name,
                                                 smallest_size)
                (self.test_dir / obj_name).remove()

                # consistency check: poll until the removal is visible
                while (self.test_dir / obj_name).exists():
                    time.sleep(.5)
                self.assertFalse((self.test_dir / obj_name).exists())
def _download_symbolic_link(dxid, md5digest, project, dest_filename):
    """Download a symbolic-link file with wget and optionally verify its MD5.

    Args:
        dxid: ID of the DNAnexus file object.
        md5digest: Expected MD5 digest, or None to skip verification.
        project: Project passed to ``get_download_url``.
        dest_filename: Local path the content is written to. If a file is
            already present there, wget is asked to resume it.
    """
    dxfile = dxpy.DXFile(dxid)
    url, _headers = dxfile.get_download_url(preauthenticated=True,
                                            duration=6 * 3600,
                                            project=project)

    # Follow the redirection
    print('Following redirect for ' + url)

    wget_exe = _which("wget")
    if wget_exe is None:
        err_exit("wget is not installed on this system")

    cmd = ["wget", "--tries=5", "--quiet"]
    # The check must look at the local destination; the previous code tested
    # the file ID (dxid) as a path, so resuming was effectively never used.
    if os.path.isfile(dest_filename):
        # file already exists, resume download.
        cmd.append("--continue")
    cmd += ["-O", dest_filename, url]

    try:
        print("Downloading symbolic link with wget")
        subprocess.check_call(cmd, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is normally None
        # and msg stays empty; the guard is kept for safety.
        msg = ""
        if e and e.output:
            msg = e.output.strip()
        err_exit("Failed to call wget: {cmd}\n{msg}\n".format(cmd=str(cmd),
                                                              msg=msg))

    if md5digest is not None:
        _verify(dest_filename, md5digest)
def flagstat_parse(dxlink):
    """Parse a flagstat report stored on DNAnexus into a dict of counts.

    For every metric, the first line of the report matching the metric's
    regular expression is taken; the text before the match is expected to
    look like "<hiq> + <lowq>".

    Args:
        dxlink: DNAnexus link/ID accepted by ``dxpy.describe``.

    Returns:
        dict mapping metric name to ``[hiq, lowq]`` (two ints), or None when
        the opened file object is falsy.

    Raises:
        StopIteration: when no line matches one of the metric patterns.
        ValueError: when a matching line does not have the expected shape.
    """
    desc = dxpy.describe(dxlink)
    with dxpy.DXFile(desc['id'], mode='r') as flagstat_file:
        if not flagstat_file:
            return None
        # Read while the handle is still open (it used to be read after the
        # with block had already exited).
        flagstat_lines = flagstat_file.read().splitlines()

    # metric name -> regular expression locating its line
    qc_patterns = {
        'in_total': r'in total',
        'duplicates': r'duplicates',
        'mapped': r'mapped',
        'paired_in_sequencing': r'paired in sequencing',
        'read1': r'read1',
        'read2': r'read2',
        'properly_paired': r'properly paired',
        'with_self_mate_mapped': r'with itself and mate mapped',
        'singletons': r'singletons',
        # anchored: i.e. at the end of the line
        'mate_mapped_different_chr': r'with mate mapped to a different chr$',
        # parentheses are escaped because this is a regular expression
        'mate_mapped_different_chr_hiQ':
        r'with mate mapped to a different chr \(mapQ>=5\)',
    }

    # Results go into a separate dict so the pattern table is not
    # overwritten while it is being iterated.
    qc_dict = {}
    for qc_key, qc_pattern in qc_patterns.items():
        qc_metrics = next(
            re.split(qc_pattern, line) for line in flagstat_lines
            if re.search(qc_pattern, line))
        hiq, lowq = qc_metrics[0].split(' + ')
        qc_dict[qc_key] = [int(hiq.rstrip()), int(lowq.rstrip())]

    return qc_dict
Example #9
0
def group_files_by_read(fastq_files):
    """
    Function : Groups a list of FASTQ files by the value of their "read" property, which indicates the read number.
               Returns a dict mapping each observed value of the property (or 'none' if a file does not have a value
               for the property) to a list of DNAnexus links to the files with that value. Within each group the
               links keep the order of the input list; no sorting by chunk is performed.
    Args     : fastq_files - a list of values accepted by dxpy.DXFile (links/IDs) representing FASTQ files.
    Returns  : dict.
    Raises   : dxpy.AppError if a file's "read" property is not one of "1", "2" or "none".
    """
    read_dict = {}

    for item in fastq_files:
        fastq_dxfile = dxpy.DXFile(item)
        # A file without the property falls into the 'none' group, as the
        # contract above promises (a plain lookup raised KeyError instead).
        read_num = fastq_dxfile.get_properties().get("read", "none")
        if read_num not in ("1", "2", "none"):
            raise dxpy.AppError("%s has invalid Read property: %s" %
                                (fastq_dxfile.get_id(), read_num))
        read_dict.setdefault(read_num, []).append(dxpy.dxlink(fastq_dxfile))

    return read_dict
Example #10
0
def main():
    """List the file inputs of an app/applet that a workflow does not set.

    Command-line arguments (colon-separated references; the last component
    is used as the ID):
        sys.argv[1]: file reference; if its folder contains "/Results",
            nothing is written.
        sys.argv[2]: workflow reference; the inputs of its first stage count
            as already provided.
        sys.argv[3]: app or applet ID (an applet when it contains "applet").

    Writes one input name per line to ``inputs_stats.txt``. The file is
    created (empty) even when the function returns early.
    """
    # The context manager closes the file on every path, including the
    # early return below, which used to leak the handle.
    with open("inputs_stats.txt", 'w') as inputs_file:
        print(sys.argv[2])

        workflow = dxpy.DXWorkflow(sys.argv[2].split(":")[-1])
        fh = dxpy.DXFile(sys.argv[1].split(":")[-1])

        if "/Results" in fh.describe()['folder']:
            return

        app_id = sys.argv[3]
        if "applet" in app_id:
            app = dxpy.DXApplet(app_id)
        else:
            app = dxpy.DXApp(app_id)

        existing_inputs = list(workflow.describe()['stages'][0]['input'])
        print(existing_inputs)

        for spec in app.describe()['inputSpec']:
            print(spec)
            if spec['class'] == 'file' and spec['name'] not in existing_inputs:
                inputs_file.write(spec['name'] + "\n")
Example #11
0
def test_alignment_count(applet_id, project_id, folder, tmpdir):
    """Run BWA on a FASTQ file and verify that the number of
    alignments produced is correct.
    """
    # applet_id is provided by the associated conftest.py, which either takes
    # it from the command line or builds the applet and retrieves its id.
    # tmpdir is a pytest fixture of type py.path.local.LocalPath; its strpath
    # property is the path as a plain string.
    job_inputs = {
        "fastq": dxpy.dxlink(SAMPLE_FASTQ),
        "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)
    }
    job = dxpy.DXApplet(applet_id).run(job_inputs,
                                       instance_type="mem1_ssd1_x16",
                                       folder=folder,
                                       project=project_id)
    job.wait_on_done()

    bam_handle = dxpy.DXFile(job.describe()["output"]["bam"])
    bam_path = os.path.join(tmpdir.strpath, "test.bam")
    dxpy.download_dxfile(bam_handle.get_id(), bam_path)

    # Count the alignment records of the downloaded BAM.
    counting_cmd = "samtools view {bam} | wc -l".format(bam=bam_path)
    counted = subprocess.check_output(counting_cmd, shell=True)
    assert int(counted) == 1951476
def main(input_file):
    """Run fastq_quality_trimmer on one input file and upload the result.

    The input is downloaded as ``input_file``; the trimmer writes
    ``output_file``, which is uploaded.

    Args:
        input_file: DNAnexus link/ID of the file to trim.

    Returns:
        dict with a link to the trimmed file under "output_file".
    """
    source = dxpy.DXFile(input_file)
    dxpy.download_dxfile(source.get_id(), "input_file")

    trim_cmd = "fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file"
    subprocess.check_call(trim_cmd, shell=True)

    trimmed = dxpy.upload_local_file("output_file")
    return {"output_file": dxpy.dxlink(trimmed)}
Example #13
0
    def download_url_create_symlink(self, url, sym_name):
        """Create a platform symlink file for ``url`` with a matching MD5.

        The URL is fetched with wget into a scratch file only to compute its
        checksum; the scratch file is removed afterwards.

        Args:
            url: Address the symlink points at.
            sym_name: Name of the new file object.

        Returns:
            dxpy.DXFile for the created object in project ``self.proj_id``.
        """
        print("url = {}".format(url))

        scratch = "localfile"
        wget_cmd = ["wget", "--tries=5", "--quiet", "-O", scratch, url]

        try:
            print("Downloading original link with wget")
            subprocess.check_call(wget_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            details = e.output.strip() if (e and e.output) else ""
            err_exit("Failed to download with wget: {cmd}\n{msg}\n".format(
                cmd=str(wget_cmd), msg=details))

        # checksum of the fetched content, then drop the scratch copy
        digest = md5_checksum(scratch)
        os.remove(scratch)

        # create a symlink on the platform, with the correct checksum
        request = {
            'name': sym_name,
            'project': self.proj_id,
            'drive': "drive-PUBLISHED",
            'md5sum': digest,
            'symlinkPath': {'object': url},
        }
        created = dxpy.api.file_new(input_params=request)
        return dxpy.DXFile(dxid=created["id"], project=self.proj_id)
Example #14
0
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Downloads every count file, copies its lines into one result file and
    appends a final 'Total Reads' line with the summed read counts.

    Arguments:
        countDXlinks (list[dict]): DXlinks to the count files produced by
            the process jobs.
        resultfn (str): Name of the result file; a trailing ".bam" is
            replaced by ".txt".

    Returns:
        dict with the DXLink of the uploaded result under "countDXLink".

    Note: subjobs only receive the links, so each file has to be downloaded
    onto this machine first.
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    total_reads = 0
    with open(resultfn, 'w') as merged:
        for index, link in enumerate(countDXlinks):
            local_name = "countfile{0}".format(index)
            dxpy.download_dxfile(dxpy.DXFile(link), local_name)
            with open(local_name, 'r') as part:
                for row in part:
                    total_reads += parse_line_for_readcount(row)
                    merged.write(row)
        merged.write('Total Reads: {0}'.format(total_reads))

    uploaded = dxpy.upload_local_file(resultfn)
    return {"countDXLink": dxpy.dxlink(uploaded.get_id())}
Example #15
0
def main(**job_inputs):
    """Entry point: make sure an index exists, then launch mapping subjobs.

    Args:
        **job_inputs: Job inputs. 'genome_fastagz' is read when 'genome_mmi'
            is absent; 'datatype' and 'reads' are always read.

    Returns:
        dict with 'genome_mmi', 'bam_files' and 'bai_files'; the latter two
        are lists of job output references.

    Raises:
        dxpy.AppError: when the name of the first reads file does not end in
            fastq/fasta/fa/fq, optionally followed by ".gz" (case
            insensitive).
    """
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref(
            'genome_mmi')
    output = {'genome_mmi': job_inputs['genome_mmi']}

    # Not used below; the lookup still raises KeyError if the input is absent.
    datatype = job_inputs['datatype']

    # Validate the file type of the first reads file. The dot of ".gz" is now
    # escaped so that it only matches a literal dot.
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    if re.search(r"(fastq|fasta|fa|fq)(\.gz)?$", one_reads_file,
                 flags=re.I) is None:
        raise dxpy.AppError("Invalid filetype extension supplied.")

    # for fasta and fastq inputs, run jobs using native minimap2
    jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [
        j.get_output_ref('mapped_reads_index') for j in jobs
    ]

    return output
Example #16
0
    def _move(self, dest):
        """Relocate this data object to another folder of the same project.

        Nothing happens when ``dest`` equals this path. The object is renamed
        to ``dest.name`` when ``_prep_for_copy`` reports that a rename is
        needed.

        Args:
            dest (Path): The destination file/folder path within same project

        Raises:
            ValueError: If this path has no resource (it refers to a project)
            DNAnexusError: If ``dest`` lies in a different project
        """
        if not self.resource:
            raise ValueError('Cannot move project ({})'.format(self))
        if dest.canonical_project != self.canonical_project:
            # This can be implemented by clone and remove original
            raise DNAnexusError('Cannot move across different projects')
        if self == dest:
            return

        handle = dxpy.DXFile(dxid=self.canonical_resource,
                             project=self.canonical_project)
        target, rename_needed = self._prep_for_copy(dest)
        folder = '/' + (target.parent.resource or '')

        with _wrap_dx_calls():
            handle.move(folder)
            if rename_needed:
                handle.rename(dest.name)
        self.clear_cached_properties()
Example #17
0
    def _clone(self, dest):
        """Copy this data object into a path of another project.

        The original file is retained. The copy is renamed to ``dest.name``
        when ``_prep_for_copy`` reports that a rename is needed.

        Args:
            dest (Path): The destination file/folder path in a different project

        Raises:
            ValueError: If this path has no resource (it refers to a project)
            DNAnexusError: If ``dest`` lies in the same project
        """
        if not self.resource:
            raise ValueError('Cannot clone project ({})'.format(self))
        if dest.canonical_project == self.canonical_project:
            raise DNAnexusError('Cannot clone within same project')

        source = dxpy.DXFile(dxid=self.canonical_resource,
                             project=self.canonical_project)
        target, rename_needed = self._prep_for_copy(dest)

        with _wrap_dx_calls():
            copy = source.clone(
                project=dest.canonical_project,
                folder='/' + (target.parent.resource or ''))
            # no rename needed when the destination already carries the
            # original name
            if rename_needed:
                copy.rename(dest.name)
Example #18
0
def main(inputs, prefix=None):
    """Pool several gzipped inputs into one gzipped file and upload it.

    Args:
        inputs: Iterable of DNAnexus links/IDs of the files to pool.
        prefix: Optional stem for the pooled file name. When falsy, the
            double-extension-stripped input names joined by '-' are used.

    Returns:
        dict with a link to the pooled file under "pooled".
    """
    local_names = []
    for link in inputs:
        handle = dxpy.DXFile(link)
        local_names.append(handle.name)
        dxpy.download_dxfile(handle.get_id(), handle.name)

    # uses last extension - presumably they are all the same
    inner_ext = splitext(splitext(local_names[-1])[0])[1]
    suffix = "_pooled%s.gz" % (inner_ext)
    if prefix:
        stem = prefix
    else:
        stem = '-'.join([splitext(splitext(fn)[0])[0] for fn in local_names])
    pooled_filename = stem + suffix

    pipeline = ['gzip -dc %s' % (' '.join(local_names)), 'gzip -cn']
    _stdout, _stderr = common.run_pipe(pipeline, outfile=pooled_filename)

    uploaded = dxpy.upload_local_file(pooled_filename)
    return {"pooled": dxpy.dxlink(uploaded)}
Example #19
0
def download_and_gunzip_file(input_file, skip_decompress=False, additional_pipe=None, create_named_pipe=False, input_filename=None):
    """Stream a DNAnexus file to disk, unpacking it on the way.

    The command is a pipeline starting with ``dx download … -o -``:
    names ending in ".tar.gz" are piped through ``tar -zxvf -``, other
    ".gz" names through ``gunzip`` unless ``skip_decompress`` is set.

    Args:
        input_file: DNAnexus link/ID of the file.
        skip_decompress: Leave plain ".gz" files compressed.
        additional_pipe: Extra command appended to the pipeline.
        create_named_pipe: Create the output with ``mkfifo`` first and run
            the pipeline in the background ('&').
        input_filename: Name to use instead of the platform file name.

    Returns:
        Name of the local output file the pipeline is redirected to.
    """
    input_file = dxpy.DXFile(input_file)
    if input_filename is None:
        input_filename = input_file.describe()['name']
    ofn = input_filename

    pieces = ['dx download ' + input_file.get_id() + ' -o - ']
    is_tarball = input_filename.endswith('.tar.gz')
    if is_tarball:
        ofn = 'tar_output_{0}'.format(ofn.replace('.tar.gz', ''))
        pieces.append('| tar -zxvf - ')
    elif not skip_decompress and os.path.splitext(input_filename)[-1] == '.gz':
        pieces.append('| gunzip ')
        ofn = os.path.splitext(ofn)[0]
    if additional_pipe is not None:
        pieces.append('| ' + additional_pipe)
    pieces.append(' > "{0}"'.format(ofn))

    if create_named_pipe:
        run_cmd('mkfifo {0}'.format(ofn))
        pieces.append('&')

    run_cmd(''.join(pieces))

    return ofn
def merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads):
    """Download BAM files and combine them into ``<bam_root>.bam``.

    A single input is simply renamed. Several inputs are combined either
    with ``samtools cat`` (``use_cat``) or ``samtools merge``, and the
    result is optionally sorted (``use_sort``).

    Args:
        bam_files: DNAnexus links/IDs of the BAM files.
        bam_root: Output file name without the ".bam" extension.
        use_cat: Combine with ``samtools cat`` instead of ``samtools merge``.
        use_sort: Sort the combined file with ``samtools sort``.
        nthreads: Value passed to ``samtools sort -@``.

    Returns:
        Name of the merged BAM file (``bam_root + '.bam'``).

    Raises:
        subprocess.CalledProcessError: when a samtools command fails.
    """
    fnames = []
    for bam in bam_files:
        dxfn = dxpy.DXFile(bam).describe()['name']
        logger.info("* Downloading %s... *" % dxfn)
        dxpy.download_dxfile(bam, dxfn)
        fnames.append(dxfn)

    merged_name = bam_root + '.bam'
    logger.info("* Merged alignments file will be: %s *" % merged_name)

    if len(fnames) == 1:
        # UNTESTED
        logger.info("* Only one input file (%s), no merging required." %
                    fnames[0])
        os.rename(fnames[0], merged_name)
        return merged_name

    if use_cat:
        for fn in fnames:
            if not os.path.isfile('sofar.bam'):
                # The first file seeds the running result.
                os.rename(fn, 'sofar.bam')
                continue
            logger.info("* Merging...")
            # NOTE: keeps the first header
            # The output handle is now closed deterministically; it is
            # opened fresh ('wb') because merging.bam is renamed away after
            # every round and must never carry stale content.
            with open('merging.bam', 'wb') as merged_out:
                subprocess.check_call(['samtools', 'cat', 'sofar.bam', fn],
                                      stdout=merged_out)
            os.rename('merging.bam', 'sofar.bam')

        # At this point there is a 'sofar.bam' with one or more input bams
        logger.info("* Files merged into %s (via cat) *" % merged_name)
    else:
        # use samtools merge
        # UNTESTED
        merge_cmd = 'samtools merge sofar.bam ' + " ".join(fnames)
        logger.info("Merging via merge: %s " % merge_cmd)
        # An argv list keeps file names containing whitespace intact.
        mergeout = subprocess.check_output(
            ['samtools', 'merge', 'sofar.bam'] + fnames)
        # sofar.bam gets renamed below
        logger.info(mergeout)

    if use_sort:
        # sorting needed due to samtools cat
        # UNTESTED
        sort_cmd = 'samtools sort -@ %s -m 6G -f sofar.bam sorted.bam' % nthreads
        logger.info("* Sorting merged bam: %s" % sort_cmd)
        sortout = subprocess.check_output(shlex.split(sort_cmd))
        logger.info(sortout)
        os.rename('sorted.bam', merged_name)
    else:
        os.rename('sofar.bam', merged_name)

    return merged_name
Example #21
0
def main(quants_a, quants_b, annotations):
    """Run srna_mad_qc.sh on two quantification files and upload the plot.

    Args:
        quants_a: DNAnexus link/ID of the first quantification file.
        quants_b: DNAnexus link/ID of the second quantification file.
        annotations: DNAnexus link/ID of the annotation file.

    Returns:
        dict with the QC metrics as a JSON string under "metadata" and the
        uploaded plot file object under "mad_plot".
    """
    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(
        ['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    first = dxpy.DXFile(quants_a)
    second = dxpy.DXFile(quants_b)
    anno = dxpy.DXFile(annotations)

    print("* Downloading files...")
    dxpy.download_dxfile(first.get_id(), "quants_a.tsv")
    dxpy.download_dxfile(second.get_id(), "quants_b.tsv")
    dxpy.download_dxfile(anno.get_id(), "annotations.gtf.gz")

    # Derive an appropriate root name for the output files
    out_root = root_name_from_pair(first.name.split('.')[0],
                                   second.name.split('.')[0])
    # NOTE(review): the name announced here ("_srna_mad_plot.png") differs
    # from the file uploaded below ("_mad_plot.png") - confirm which is meant.
    print("* Expecting output: '" + out_root + "_srna_mad_plot.png'")

    # Must move sub-scripts into current dir so srna-mad-qc.sh finds them
    for helper in ("extract_gene_ids.awk", "sum_srna_expression.awk",
                   "MAD.R"):
        subprocess.check_call(['mv', "/usr/bin/" + helper, '.'])

    # DX/ENCODE independent script is found in resources/usr/bin
    print("* ===== Calling DNAnexus and ENCODE independent script... =====")
    subprocess.check_call(['srna_mad_qc.sh', 'annotations.gtf.gz',
                           'quants_a.tsv', 'quants_b.tsv', out_root])
    print("* ===== Returned from dnanexus and encodeD independent script =====")
    mad_plot_file = out_root + '_mad_plot.png'
    mad_qc_file = out_root + '_mad_qc.txt'

    print("* package properties...")
    with open(mad_qc_file, 'r') as f_qc:
        mad_output = f_qc.read()
    # Every "NA" is replaced by "-1" before the text is parsed as JSON.
    qc_metrics = {"MAD.R": json.loads(mad_output.replace("NA", "-1"))}
    meta_string = json.dumps(qc_metrics)
    print(json.dumps(qc_metrics, indent=4))
    props = {"SW": sw_versions}

    print("* Upload Plot...")
    plot_dxfile = dxpy.upload_local_file(mad_plot_file,
                                         properties=props,
                                         details=qc_metrics)

    return {"metadata": meta_string, "mad_plot": plot_dxfile}
def main(quants_a, quants_b):
    """Run rampage_mad_qc.sh on two quantification files and upload the plot.

    Args:
        quants_a: DNAnexus link/ID of the first quantification file.
        quants_b: DNAnexus link/ID of the second quantification file.

    Returns:
        dict with the QC metrics as a JSON string under "metadata" and the
        uploaded plot file object under "mad_plot".
    """
    # tool_versions.py --applet $script_name --appver $script_ver
    sw_versions = subprocess.check_output(
        ['tool_versions.py', '--dxjson', 'dnanexus-executable.json'])

    first = dxpy.DXFile(quants_a)
    second = dxpy.DXFile(quants_b)

    print("* Downloading files...")
    dxpy.download_dxfile(first.get_id(), "quants_a.tsv")
    dxpy.download_dxfile(second.get_id(), "quants_b.tsv")

    # Derive an appropriate root name for the output files
    out_root = root_name_from_pair(first.name.split('.')[0],
                                   second.name.split('.')[0]) + '_mad'
    mad_plot_file = out_root + '_plot.png'
    mad_json_file = out_root + '.json'

    # DX/ENCODE independent script is found in resources/usr/bin
    print("* Runnning MAD.R...")
    subprocess.check_call(["ls", "-l"])
    subprocess.check_call(
        ['rampage_mad_qc.sh', 'quants_a.tsv', 'quants_b.tsv', out_root])

    print("* package properties...")
    with open(mad_json_file, 'r') as json_handle:
        qc_metrics = {"MAD.R": json.load(json_handle)}
    meta_string = json.dumps(qc_metrics)
    print(json.dumps(qc_metrics, indent=4))
    props = {"SW": sw_versions}

    print("* Upload Plot...")
    plot_dxfile = dxpy.upload_local_file(mad_plot_file,
                                         properties=props,
                                         details=qc_metrics)

    return {"metadata": meta_string, "mad_plot": plot_dxfile}
Example #23
0
def file_get_details(fid, dxfile=None, proj_id=None):
    '''Returns dx file's details as json.

    Args:
        fid: ID of the file; only used when no handler is supplied.
        dxfile: Optional ready-made file handler; used as is when given.
        proj_id: Optional project used to build the handler from fid.

    Returns:
        Whatever the handler's get_details() returns.
    '''
    # Identity checks: a handler object must never be compared to None by
    # equality, which would go through its __eq__.
    if dxfile is None:
        if proj_id is not None:
            dxfile = dxpy.DXFile(fid, project=proj_id)
        else:
            dxfile = file_handler_from_fid(fid)
    return dxfile.get_details()
Example #24
0
def main(bam_file,
         ref_vcf_file,
         eval_vcf_file,
         qual_cutoff,
         depth_cutoff,
         bed_file=None):
    """Applet entry point: download the inputs and assemble the outputs.

    Args:
        bam_file: DNAnexus link/ID of the BAM file (saved as "bam_file").
        ref_vcf_file: DNAnexus link/ID of the reference VCF (saved as
            "vcf_file").
        eval_vcf_file: DNAnexus link/ID of the VCF to evaluate.
        qual_cutoff: Not used by the visible code.
        depth_cutoff: Not used by the visible code.
        bed_file: Optional DNAnexus link/ID of a BED file (saved as
            "bed_file").

    Returns:
        dict with "sites_for_manual_review" (link) and the metric fields
        "number_of_missed_sites", "found_sites", "Sensitivity" and
        "specificity".
    """
    # Wrap the inputs in dxpy handlers.
    bam_file = dxpy.DXFile(bam_file)
    if bed_file is not None:
        bed_file = dxpy.DXFile(bed_file)
    # Fixed: this used the undefined name ``vcf_file`` (NameError).
    ref_vcf_file = dxpy.DXFile(ref_vcf_file)
    eval_vcf_file = dxpy.DXFile(eval_vcf_file)

    # Download the inputs to the local file system.
    dxpy.download_dxfile(bam_file.get_id(), "bam_file")
    dxpy.download_dxfile(ref_vcf_file.get_id(), "vcf_file")
    if bed_file is not None:
        dxpy.download_dxfile(bed_file.get_id(), "bed_file")
    # NOTE(review): eval_vcf_file is wrapped but never downloaded - confirm
    # whether the application code is expected to fetch it.

    # Fill in your application code here.
    # NOTE(review): the application code must create the local file
    # "sites_for_manual_review" and define number_of_missed_sites,
    # found_sites, Sensitivity and specificity; none of them is set in this
    # function yet, so the lines below fail until that code is added.

    sites_for_manual_review = dxpy.upload_local_file("sites_for_manual_review")

    output = {}
    output["sites_for_manual_review"] = dxpy.dxlink(sites_for_manual_review)
    output["number_of_missed_sites"] = number_of_missed_sites
    output["found_sites"] = found_sites
    output["Sensitivity"] = Sensitivity
    output["specificity"] = specificity

    return output
Example #25
0
def main(childbam, fatherbam, motherbam, reference, outputbase=None, targetbed=None):
    """Run samtools/bcftools trio calling on three BAMs and upload the VCF.

    Args:
        childbam, fatherbam, motherbam: References to the three BAM files.
            Each is wrapped in `dxpy.DXFile` and downloaded as
            "inputbams-<i>.bam" in child, father, mother order.
        reference: Reference to the gzip-compressed reference FASTA; it is
            streamed through `zcat` into "reference.fa" and indexed with
            `samtools faidx`.
        outputbase: Base name of the output VCF ("<outputbase>.vcf"). When
            None, the child's SAMPLE_NAME property plus ".trio" is used.
        targetbed: Optional reference to a BED file; when given it is
            downloaded to "target.bed" and passed to mpileup via `-l`.

    Returns:
        dict with the single key "vcfout": a dxlink to the uploaded VCF.
    """

    # Order matters: trioconfig must list child, father, mother.
    inputbams = [dxpy.DXFile(item) for item in (childbam, fatherbam, motherbam)]

    scc("dx download '%s' -o - --no-progress | zcat > reference.fa"%dxpy.DXFile(reference).get_id(), shell=True)
    scc(["samtools", "faidx", "reference.fa"])

    targetopt = ''
    if targetbed is not None:
        dxpy.download_dxfile(targetbed, "target.bed")
        targetopt = '-l target.bed'

    # The parameter is `outputbase`; the former check on an undefined
    # `outbase` raised NameError and never honoured the caller's value.
    if outputbase is None:
        outputbase = inputbams[0].get_properties()['SAMPLE_NAME'] + '.trio'

    # order in trioconfig must be child, father, mother
    inputfiles = []
    with open('trioconfig', 'w') as trioconfig:
        for i, bam in enumerate(inputbams):
            local_name = "inputbams-%d.bam"%i
            dxpy.download_dxfile(bam.get_id(), local_name)
            inputfiles.append(local_name)
            trioconfig.write(str(bam.get_properties()['SAMPLE_NAME']) + "\n")

    # could tee to an output bcf file if desired
    # FIXME - need to specify child's gender for non-PAR on X
    #        trioxd for female child, trioxs for male child
    command = """samtools mpileup -uf reference.fa -D -V -C 50 %s %s | \
        bcftools view -s trioconfig -T trioauto -vg - > %s.vcf \
    """%(targetopt, ' '.join(inputfiles), outputbase)

    # Single parenthesised argument: prints identically under Python 2 and 3.
    print("::: command is:\n\t" + command)
    scc(command, shell=True)

    vcfout = dxpy.upload_local_file("%s.vcf"%outputbase)

    output = {}
    output["vcfout"] = dxpy.dxlink(vcfout)

    return output
Example #26
0
def file_get_properties(fid, dxfile=None, proj_id=None):
    '''Returns dx file's properties.

    Args:
        fid: File id. Only consulted when `dxfile` is not supplied.
        dxfile: Optional, already-built file handler. When given it is used
            as-is and `fid` / `proj_id` are ignored.
        proj_id: Optional project id. When given (and `dxfile` is not), the
            handler is built with `dxpy.DXFile(fid, project=proj_id)`;
            otherwise `file_handler_from_fid(fid)` is used.

    Returns:
        The value returned by the handler's `get_properties()`.
    '''
    # Identity checks: `== None` would be routed through the handler's own
    # __eq__, which is not a reliable way to detect "argument omitted".
    if dxfile is None:
        if proj_id is not None:
            dxfile = dxpy.DXFile(fid, project=proj_id)
        else:
            dxfile = file_handler_from_fid(fid)
    return dxfile.get_properties()
Example #27
0
 def test_AAA_DownloadResultResults(self):
     # Downloads the file referenced by the job's "rds" output and checks
     # that it exists locally under the file's own name.
     job_hash = self.job.describe()
     output_hash = job_hash["output"]["rds"]
     # assertEqual reports the actual length on failure, unlike
     # assertTrue(len(...) == 1).
     self.assertEqual(len(output_hash), 1)
     # list(...) keeps the indexing valid whether values() returns a list
     # (Python 2) or a view (Python 3).
     f = dxpy.DXFile(list(output_hash.values())[0], project=job_hash["project"])
     print("TestCase: Downloading %s" % f.name)
     dxpy.download_dxfile(f.id, f.name, project=job_hash["project"])
     self.assertTrue(os.path.isfile(f.name))
Example #28
0
def main(**kwargs):
    """Run ``python3 checkfiles.py`` with portal credentials and upload its output.

    Keyword arguments (all optional):
        key: Suffix appended to the current user's id (joined with "-") to
            form the key looked up in KEYFILE. Not forwarded to checkfiles.
        url: Overrides the server taken from the keyfile entry. Not
            forwarded to checkfiles.
        dx_file: File reference; the file is downloaded under its own name
            and passed to checkfiles as ``--local-file``.
        out, err: Local file names that are uploaded after the run and
            returned as dxlinks.
        Any other keyword is forwarded as ``--<name-with-dashes>``: bare for
        a true boolean, omitted for a false boolean, followed by ``str(v)``
        for string and integer values. Values of other types are skipped.

    Returns:
        dict that may contain "out" and/or "err" dxlinks.

    Raises:
        PortalCredentialsError: if the key is not found in KEYFILE.
        subprocess.CalledProcessError: if checkfiles exits non-zero.
    """

    dxpy.download_folder(DCC_CREDENTIALS_PROJECT,
                         '.',
                         folder=DCC_CREDENTIALS_FOLDER)
    key = dxpy.api.system_whoami()['id']
    if 'key' in kwargs:
        key = '-'.join([key, kwargs.pop('key')])
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')

    try:
        text_types = (str, unicode)  # Python 2
    except NameError:
        text_types = (str,)  # Python 3: str already covers unicode text

    # Build the argument vector directly. Joining everything into one string
    # and re-splitting it with shlex breaks any value containing whitespace
    # or quotes (e.g. a file name with a space) into several arguments.
    argv = ['python3', 'checkfiles.py']
    for k, v in kwargs.items():
        flag = "--" + k.replace('_', '-')
        # bool must be tested first: bool is a subclass of int.
        if isinstance(v, bool):
            if v:
                argv.append(flag)
        elif isinstance(v, text_types + (int,)):
            argv.extend([flag, str(v)])

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        dxpy.download_dxfile(dxfile, dxfile.name)
        argv.extend(["--local-file", dxfile.name])

    # this is just to get a command string to print that has no secrets
    argv_safe = argv + ["--username", "." * len(authid),
                        "--password", "." * len(authpw),
                        server]
    logger.info(' '.join(argv_safe))

    # NOTE(review): the credentials are still passed on the command line and
    # are therefore visible in the process list while checkfiles runs.
    argv.extend(["--username", authid, "--password", authpw])
    # this needs to be the last token
    argv.append(server)

    subprocess.check_call(argv)

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})

    return output
Example #29
0
def download_file(file_dxid):
    """
    Download a DNAnexus file under its own name, as a relative path.

    Args    : file_dxid - identifier of the file object, passed to dxpy.DXFile.
    Returns : str. The file's name as reported by describe(), which is also
              the relative path it was downloaded to.
    """
    handler = dxpy.DXFile(file_dxid)
    local_name = handler.describe()['name']
    dxpy.download_dxfile(dxid=handler.get_id(), filename=local_name)
    return local_name
Example #30
0
def main(fastq1_gz, fastq2_gz, bowtie_index):
    """Stage paired FASTQs and a bowtie2 index, run preprocessing.sh, upload BAMs.

    Args:
        fastq1_gz, fastq2_gz: References to the two FASTQ inputs. A file
            whose platform name ends in ".gz" is downloaded as
            "fastq<N>.gz" and gunzipped; otherwise it is downloaded
            directly as "fastq<N>".
        bowtie_index: Reference to the index tarball; downloaded as
            "bowtie_index.tgz" and unpacked in the working directory.

    Returns:
        dict with the uploaded files under "sorted_bam_pe", "split_bam1"
        and "split_bam2".
    """
    # Stage each mate as a plain file named fastq1 / fastq2.
    for local_name, fastq_ref in (("fastq1", fastq1_gz), ("fastq2", fastq2_gz)):
        remote_name = dxpy.DXFile(fastq_ref).describe()['name']
        if not remote_name.endswith('.gz'):
            dxpy.download_dxfile(fastq_ref, local_name)
            continue
        dxpy.download_dxfile(fastq_ref, local_name + ".gz")
        subprocess.call("gunzip -q " + local_name + ".gz", shell=True)

    dxpy.download_dxfile(bowtie_index, "bowtie_index.tgz")
    subprocess.call("tar -xzf bowtie_index.tgz", shell=True)

    # Record the index base name: first *bt2 file with its ".1.bt2" part
    # stripped by sed.
    subprocess.call(
        "ls -1 *bt2 | head -1 | sed 's/.1.bt2//g' > file_containing_bowtie_index_name",
        shell=True)

    # Diagnostic output only: directory listing and the detected index name.
    subprocess.call("ls", shell=True)
    subprocess.call("cat file_containing_bowtie_index_name", shell=True)

    prefix = 'out'

    # xargs substitutes the index name for "{}" in the preprocessing.sh call.
    run_cmd = ("cat file_containing_bowtie_index_name | xargs -i sh -c "
               "'preprocessing.sh {} fastq1 fastq2 . " + prefix + "'")
    subprocess.call(run_cmd, shell=True)
    print(run_cmd)

    # Presumably these files are written by preprocessing.sh -- confirm.
    uploads = {}
    for field, suffix in (("sorted_bam_pe", "_noDup.sort.bam"),
                          ("split_bam1", "_pair1.bam"),
                          ("split_bam2", "_pair2.bam")):
        uploads[field] = dxpy.upload_local_file(prefix + suffix)

    return uploads