Example #1
def makeInputsBwa():
    try:
        contigset_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "fasta_contigset_importer"}).next()['id'])
        reads_importer = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "Letter Space FASTQ importer"}).next()['id'])
    except StopIteration:
        raise Exception("fasta_contigset_importer or Letter Space FASTQ importer not found, please upload them")

    genome_archive = dxpy.upload_local_file(os.path.join(test_resources_dir, "hg19_chr22.fa.xz"), wait_on_close=True)
    contigset_importer_input = {"name": "hg19_chr22", "sequence_file": dxpy.dxlink(genome_archive)}
    print "Running fasta_contigset_importer with", contigset_importer_input
    job = contigset_importer.run(contigset_importer_input)
    job.wait_on_done()
    contig_set = job.describe()["output"]["contig_set"]

    left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_left.fq"), wait_on_close=True)
    right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "small_right.fq"), wait_on_close=True)
    #left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_1_1M.fastq.xz"), wait_on_close=True)
    #right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_2_1M.fastq.xz"), wait_on_close=True)

    reads_importer_input = {"left_file": dxpy.dxlink(left_reads), "right_file": dxpy.dxlink(right_reads)}
    print "Running LetterSpaceFileObjectToReadsTable with", reads_importer_input
    job = reads_importer.run(reads_importer_input)
    job.wait_on_done()
    reads = job.describe()["output"]["reads"]

    return {"reads": [reads] * 3, "reference": contig_set}
Example #2
    def get_fastq_dxfile_objects(self, barcode=None):
        """
        Retrieves all the FASTQ files in project self.dx_project_name as DXFile objects.
  
        Args:
            barcode: `str`. If set, then only FASTQ file properties for FASTQ files having the specified barcode are returned.
  
        Returns: 
            `list` of DXFile objects representing FASTQ files.
  
        Raises:
            `dnanexus_utils.FastqNotFound`: No FASTQ files were found.
        """
        bc_reg = re.compile("[ACGT]{6,}-[ACGT]{6,}", re.I)
        fq_ext_glob = "*{}".format(self.FQEXT)
        name = fq_ext_glob
        fastqs = list(
            dxpy.find_data_objects(project=self.dx_project_id,
                                   folder=self.DX_FASTQ_FOLDER,
                                   name=name,
                                   name_mode="glob"))
        if not fastqs:
            # Then look for them in all folders:
            fastqs = list(
                dxpy.find_data_objects(project=self.dx_project_id,
                                       name=name,
                                       name_mode="glob"))

        if not fastqs:
            debug_logger.info("No FASTQ files found for run {run} ".format(
                run=self.dx_project_name))
            return []
        fastqs = [
            dxpy.DXFile(project=x["project"], dxid=x["id"]) for x in fastqs
        ]
        if not barcode:
            return fastqs

        bc_fastqs = []  # Only those DXFiles that have the barcode of interest.
        for fq in fastqs:
            props = fq.get_properties()
            barcode_val = props.get(self.FQFILE_BARCODE_PROP_NAME)
            if not barcode_val:
                # Then try to get it from the file name:
                hit = bc_reg.search(fq.name)
                if hit:
                    barcode_val = hit.group()
            if barcode_val == barcode:
                bc_fastqs.append(fq)
        if not bc_fastqs:
            msg = "No FASTQ files found for run {run} and barcode {barcode}.".format(
                run=self.dx_project_name, barcode=barcode)
            debug_logger.error(msg)
            raise FastqNotFound(msg)
        return bc_fastqs
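When the barcode is reliably stored as a file property, the per-file get_properties() loop above can be pushed server-side with the properties filter of find_data_objects. A hedged sketch (it deliberately skips the filename-regex fallback, so it only covers files whose property is set; the parameter names are illustrative):

import dxpy

def find_barcoded_fastqs(project_id, fq_ext, prop_name, barcode):
    # properties={name: value} matches only objects whose property equals value.
    return list(
        dxpy.find_data_objects(project=project_id,
                               classname="file",
                               name="*" + fq_ext,
                               name_mode="glob",
                               properties={prop_name: barcode}))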
Example #3
def main():
    parser = argparse.ArgumentParser(description='Create a manifest file for a particular folder in a project')
    parser.add_argument('folder', help='a folder in the current DNAnexus project')
    parser.add_argument('-o', '--output_file', help='Name of the output file', default='manifest.json.bz2')
    parser.add_argument('-r', '--recursive', help='Recursively traverse folders and append to manifest', action='store_true', default=False)

    args = parser.parse_args()

    project, folder, _ = resolve_existing_path(args.folder)

    ids = dxpy.find_data_objects(classname='file', first_page_size=1000, state='closed', describe={'fields': {'id': True, 'name': True, 'folder': True, 'parts': True, 'state': True, 'archivalState': True }}, project=project, folder=folder, recurse=args.recursive)
    manifest = { project: [] }

    for i,f in enumerate(ids):
        manifest[project].append(fileID2manifest(f['describe'], project))
        if i%1000 == 0 and i != 0:
            print("Processed {} files".format(i))

    # Dedup
    # Duplicate filenames are converted to filename_fileid
    dups = [item for item, count in collections.Counter([x['name'] for x in manifest[project]]).items() if count > 1]
    for x in manifest[project]:
        if x['name'] in dups:
            fname, fext = os.path.splitext(x['name'])
            x['name'] = fname + "_" + x['id'] + fext

    write_manifest_to_file(args.output_file, manifest)
    print("Manifest file written to {}".format(args.output_file))
    print("Total {} objects".format(len(manifest[project])))
Example #4
def get_tas(exp_id, default_project, ta_folders):
	possible_files = []
	for base_folder in ta_folders:
		if ':' in base_folder:
			project_name, path = base_folder.split(':')
			project = resolve_project(project_name)
			project = project.get_id()
			project_name += ":"
		else:
			project = default_project
			project_name = ""
			path = base_folder
		if not path.startswith('/'):
			path = '/' + path
		print project, project_name, path
		for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project):
			desc = dxfile.get('describe')
			if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
				possible_files.append(desc)
	print "%s %i possible files" %(exp_id, len(possible_files))
	rep1_files = [f for f in possible_files if 'rep1' in f.get('folder')]
	rep2_files = [f for f in possible_files if 'rep2' in f.get('folder')]
	if len(rep1_files) != 1:
		print "Tried to find one rep1 ta, found %d" %(len(rep1_files))
		rep1 = None
	else:
		rep1 = rep1_files[0].get('project') + ':' + rep1_files[0].get('folder') + '/' + rep1_files[0].get('name')
	if len(rep2_files) != 1:
		print "Tried to find one rep2 ta, found %d" %(len(rep2_files))
		rep2 = None
	else:
		rep2 = rep2_files[0].get('project') + ':' + rep2_files[0].get('folder') + '/' + rep2_files[0].get('name')
	
	return rep1, rep2
Example #5
def main():
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    pid =  project.get_id()

    counts = {}
    n = 0
    summaries = dxpy.find_data_objects(classname='file', folder='/runs', name='*_summary.txt', recurse=True, name_mode='glob', project=pid, return_handler=False)
    while summaries:
        try:
            flink = dxpy.dxlink(summaries.next())
            n = n+1
        except StopIteration:
            break
        fd = dxpy.describe(flink)
        fn = "fastqc/%s" % fd['name']
        if not os.path.isfile(fn):
            print 'Downloading: %s from %s' % (fn, fd['folder'])
            try:
                dxpy.download_dxfile(flink, fn)
            except Exception, e:
                print "Error %s" % e

        parse_summary(fn, counts)
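The while/summaries.next() loop above is Python 2 style; since find_data_objects returns an ordinary generator, the same traversal in Python 3 is a plain for-loop. A sketch, reusing pid from above:

for result in dxpy.find_data_objects(classname='file', folder='/runs',
                                     name='*_summary.txt', name_mode='glob',
                                     recurse=True, project=pid):
    fd = dxpy.describe(dxpy.dxlink(result))
    print('Found %s in %s' % (fd['name'], fd['folder']))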
Example #6
def get_repns(exp_id, ta_folders):
    possible_files = []  # was referenced uninitialized in the original snippet
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project  # assumed to be a module-level global
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(classname='file',
                                             state='closed',
                                             folder=path,
                                             describe=True,
                                             recurse=True,
                                             project=project):
            desc = dxfile.get('describe')
            if exp_id in desc.get('folder') and '/bams' in desc.get(
                    'folder') and desc.get('name').endswith(
                        ('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    print "%s %i possible files" % (exp_id, len(possible_files))
    folders = [f.get('folder') for f in possible_files]
    print "%s folders %s" % (exp_id, folders)
Example #7
    def _format_data_file(self, df: DataFile) -> dict:
        if isinstance(df.localizer, UrlLocalizer):
            ul = cast(UrlLocalizer, df.localizer)
            if ul.url.startswith("dx://"):
                return dxpy.dxlink(*ul.url[5:].split(":"))

        file_name = df.local_path.name

        existing_files = list(dxpy.find_data_objects(
            classname="file",
            state="closed",
            name=file_name,
            project=self._project_id,
            folder=self._folder,
            recurse=False
        ))

        if not existing_files:
            # TODO: batch uploads and use dxpy.sugar.transfers.Uploader for
            #  parallelization
            return dxpy.dxlink(dxpy.upload_local_file(
                str(df.path),
                name=file_name,
                project=self._project_id,
                folder=self._folder,
                parents=True,
                wait_on_close=True
            ))
        elif len(existing_files) == 1:
            return dxpy.dxlink(existing_files[0]["id"], self._project_id)
        else:
            raise RuntimeError(
                f"Multiple files with name {file_name} found in "
                f"{self._project_id}:{self._folder}"
            )
Example #8
    def create_dxrecord(self, develop):
        details = self._get_record_details()
        self.record_properties = self._set_record_properties()

        if develop:
            record_name = 'dev_%s_L%d' % (self.run_name, self.lane_index)
            self.record_properties['production'] = 'false'
            #self.record_properties['status'] = 'uploading'
            details['email'] = '*****@*****.**'
        else:
            record_name = '%s_L%d' % (self.run_name, self.lane_index)
            self.record_properties['production'] = 'true'

        record_generator = dxpy.find_data_objects(classname='record',
                                                  name=record_name,
                                                  name_mode='exact',
                                                  project=self.dashboard_project_id,
                                                  folder='/')
        records = list(record_generator)
        if len(records) > 0:
            self.record_id = records[0]['id']
        else:
            input_params = {
                "project": self.dashboard_project_id,
                "name": record_name,
                "types": ["SCGPMRun"],
                "properties": self.record_properties,
                "details": details
            }
            print input_params
            self.record_id = dxpy.api.record_new(input_params)['id']
            dxpy.api.record_close(self.record_id)
Example #9
def get_all_tas(experiment, default_project, ta_folders):
    logging.debug(
        'get_all_tas: enter with experiment %s default_project %s and ta_folders %s'
        % (experiment.get('accession'), default_project, ta_folders))
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, base_path = base_folder.split(':')
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            base_path = base_folder
        if not base_path.startswith('/'):
            base_path = '/' + base_path
        if not base_path.endswith('/'):
            base_path = base_path + '/'
        path = base_path + 'bams/' + exp_id + '/'
        logging.debug(
            "get_all_tas: find_data objects in project %s project_name %s path %s"
            % (project, project_name, path))
        for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project):
            desc = dxfile.get('describe')
            logging.debug(
                "get_all_tas: checking object for match: folder %s name %s"
                % (desc.get('folder'), desc.get('name')))
            if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug(
        "get_all_tas: exit with possible_files %s" % (possible_files))
    return possible_files
Example #11
    def test_full_pipeline(self):
        if not mappingsId:
            input = self.base_input
            print "Running program with", input
            try:
                bwa = dxpy.DXApplet(dxpy.find_data_objects(classname="applet", properties={"name": "BWA"}).next()['id'])
            except StopIteration:
                raise Exception("BWA not found, please upload it")
            job = bwa.run(input)
            job.wait_on_done()
            print "Bwa output:"
            print json.dumps(job.describe()["output"])
            mappings = job.describe()["output"]["mappings"][0]
        else:
            mappings = {"$dnanexus_link":mappingsId}
            print mappings
        #print {'mappings':mappings, "output_mode":"EMIT_VARIANTS_ONLY"}
        #hg19_chr22 reference:
        job = self.gatk.run({'mappings':mappings, 'reference':{"$dnanexus_link":"record-9ykz7KQ00006B3PXk1b00005"}, "output_mode":"EMIT_VARIANTS_ONLY"})

        #yeast
        #job = self.gatk.run({'mappings':mappings, 'reference':{"$dnanexus_link":"record-9zPp07j000035P6yJ9kQ0006"}, "output_mode":"EMIT_ALL_CONFIDENT_SITES"})

        #ce
        #job = self.gatk.run({'mappings':mappings, 'reference':{"$dnanexus_link":"record-9zV2FBQ0000293088JZ00005"}, "output_mode":"EMIT_ALL_CONFIDENT_SITES"})

        job.wait_on_done()
        print "GATK output:"
        print json.dumps(job.describe()["output"])
Example #12
def main():
    cmnd = get_args()

    ## resolve projects
    project = dxencode.resolve_project(PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    pid = project.get_id()

    counts = {}
    n = 0
    summaries = dxpy.find_data_objects(classname='file',
                                       folder='/runs',
                                       name='*_summary.txt',
                                       recurse=True,
                                       name_mode='glob',
                                       project=pid,
                                       return_handler=False)
    while summaries:
        try:
            flink = dxpy.dxlink(summaries.next())
            n = n + 1
        except StopIteration:
            break
        fd = dxpy.describe(flink)
        fn = "fastqc/%s" % fd['name']
        if not os.path.isfile(fn):
            print 'Downloading: %s from %s' % (fn, fd['folder'])
            try:
                dxpy.download_dxfile(flink, fn)
            except Exception, e:
                print "Error %s" % e

        parse_summary(fn, counts)
Example #13
def get_all_tas(experiment, default_project, ta_folders):
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(classname='file',
                                             state='closed',
                                             folder=path,
                                             describe=True,
                                             recurse=True,
                                             project=project):
            desc = dxfile.get('describe')
            if exp_id in desc.get('folder') and '/bams' in desc.get(
                    'folder') and desc.get('name').endswith(
                        ('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    return possible_files
Example #14
def ccle_fetch_existing(info):
    analysis_id = str(info['analysis_id'])
    expected_files = ccle_expected_files(info)
    print '\n\nLooking for existing data for {} in the project, consisting of files: {}'.format(analysis_id,json.dumps(expected_files))

    # for each expected file, see if it's already in the project
    existing = []
    for md5 in expected_files:
        for candidate in dxpy.find_data_objects(project=dxpy.PROJECT_CONTEXT_ID,
                                                classname='file',
                                                state='closed',
                                                name=expected_files[md5],
                                                name_mode='exact',
                                                properties={'md5': md5},
                                                return_handler=True):
            deets = candidate.get_details()
            if 'cghub_metadata' in deets and 'md5' in deets and deets['md5'] == md5:
                existing.append(candidate)
                break

    # if the project already has all of them, we can quit early
    if len(existing) == len(expected_files):
        print 'The files are already in the project!'
        dxpy.DXProject(dxpy.PROJECT_CONTEXT_ID).clone(dxpy.WORKSPACE_ID,objects=[dxfile.get_id() for dxfile in existing])
        return existing
    elif len(existing) > 0:
        print 'Only some of the files are already in the project!'
    else:
        print 'No existing data found in the project.'

    return None
Example #15
def determineStepsToDo(pairedEnd, priors, deprecate, projectId, force=False):
    '''Determine what steps need to be done, base upon prior results.'''
    willCreate = []
    stepsToDo = []
    steps = []
    if pairedEnd:
        steps = STEP_ORDER['pe']
    else:
        steps = STEP_ORDER['se']
    for step in steps:
        # Force will include the first step with all its inputs
        # This should avoid forcing concat if it isn't needed
        #
        if force:
            inputs = STEPS[step]['inputs'].keys()
            count = 0
            for input in inputs:
                if input in priors:
                    count += 1
            if count == len(inputs):
                stepsToDo += [ step ]
        if step not in stepsToDo:
            results = STEPS[step]['results'].keys()
            for result in results:
                if result not in priors:
                    #print "- Adding step '"+step+"' because prior '"+result+"' was not found."
                    stepsToDo += [ step ]
                    break
        # If results are there but inputs are being recreated, then step must be rerun
        if step not in stepsToDo:
            inputs = STEPS[step]['inputs'].keys()
            for inp in inputs:
                if inp in willCreate:
                    #print "- Adding step '"+step+"' due to prior step dependency."
                    stepsToDo += [ step ]
                    break
        # Any step that is rerun, will cause prior results to be deprecated
        # NOTE: It is necessary to remove from 'priors' so succeeding steps are rerun
        # NOTE: It is also important to move prior results out of target folder to avoid confusion!
        if step in stepsToDo:
            results = STEPS[step]['results'].keys()
            for result in results:
                willCreate += [ result ]
                if result in priors:
                    deprecate += [ priors[result] ]
                    del priors[result]
                    # if results are in folder, then duplicate files cause a problem!
                    # So add to 'deprecate' to move or remove before launching

    # Now make sure the steps can be found, and error out if not.
    for step in stepsToDo:
        app = STEPS[step]['app']
        # find_data_objects returns a generator, which is never None; materialize
        # one result to check that the app actually exists.
        dxApps = list(dxpy.find_data_objects(classname='file', name=app, name_mode='exact',
                                             project=projectId, return_handler=False, limit=1))
        if not dxApps:
            print "ERROR: failure to locate app '"+app+"'!"
            sys.exit(1)

    return stepsToDo
Example #16
    def walkfiles(self,
                  pattern=None,
                  canonicalize=False,
                  recurse=True,
                  starts_with=None,
                  limit=None,
                  classname=None):
        """Iterates over listed files that match an optional pattern.

        Args:
            pattern (str): glob pattern to match the filenames against.
            canonicalize (bool, default False): if True, return canonical paths
            recurse (bool, default True): if True, look in subfolders of folder as well
            starts_with (str): Allows for an additional search path to
                be appended to the resource of the dx path. Note that this
                resource path is treated as a directory
            limit (int): Limit the amount of results returned
            classname (str): Restrict results to this class. One of 'record',
                'file', 'gtable', 'applet', 'workflow'

        Returns:
             Iter[DXPath]: Iterates over listed files that match an optional pattern.
        """
        proj_id = self.canonical_project
        proj_name = self.virtual_project
        kwargs = {
            'project': proj_id,
            'name': pattern,
            'name_mode': 'glob',
            # the query performance is similar w/wo describe field,
            # hence no need to customize query based on canonicalize flag
            'describe': {
                'fields': {
                    'name': True,
                    'folder': True
                }
            },
            'recurse': recurse,
            'classname': classname,
            'limit': limit,
            'folder': ('/' + (self.resource or '')) + (starts_with or '')
        }
        with _wrap_dx_calls():
            list_gen = dxpy.find_data_objects(**kwargs)
        for obj in list_gen:
            if canonicalize:
                yield DXCanonicalPath('dx://{}:/{}'.format(
                    obj['project'], obj['id']))
            else:
                yield DXVirtualPath(
                    '{drive}{proj_name}:{folder}/{name}'.format(
                        drive=self.drive,
                        proj_name=proj_name,
                        folder=obj['describe']['folder'].rstrip('/'),
                        name=obj['describe']['name']))
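A hedged usage sketch of the iterator above, assuming p is a DXPath that points at a project folder:

# Yield up to ten files matching the glob anywhere under the path.
for f in p.walkfiles(pattern='*.bam', classname='file', limit=10):
    print(f)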
Example #17
def get_data_matches(text,
                     delim_pos,
                     dxproj,
                     folderpath,
                     classname=None,
                     typespec=None,
                     visibility=None):
    '''
    :param text: String to be tab-completed; still in escaped form
    :type text: string
    :param delim_pos: index of last unescaped "/" or ":" in text
    :type delim_pos: int
    :param dxproj: DXProject handler to use
    :type dxproj: DXProject
    :param folderpath: Unescaped path in which to search for data object matches
    :type folderpath: string
    :param classname: Data object class by which to restrict the search (None for no restriction on class)
    :type classname: string
    :param visibility: Visibility to constrain the results to; default is "visible" for empty strings, "either" for nonempty
    :type visibility: string
    :returns: List of matches
    :rtype: list of strings

    Members of the returned list are guaranteed to start with *text*
    and be in escaped form for consumption by the command-line.
    '''

    #unescaped_text = unescape_completion_name_str(text[delim_pos + 1:])
    unescaped_text = text[delim_pos + 1:]

    if visibility is None:
        if text != '' and delim_pos != len(text) - 1:
            visibility = "either"
        else:
            visibility = "visible"

    try:
        results = list(
            dxpy.find_data_objects(project=dxproj.get_id(),
                                   folder=folderpath,
                                   name=unescaped_text + "*",
                                   name_mode="glob",
                                   recurse=False,
                                   visibility=visibility,
                                   classname=classname,
                                   limit=100,
                                   describe=True,
                                   typename=typespec))
        prefix = '' if text == '' else text[:delim_pos + 1]
        return [
            prefix + escape_name(result['describe']['name'])
            for result in results
        ]
    except:
        return []
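A hedged illustration of the contract described in the docstring, assuming dxproj is a DXProject handle and the user has typed "my_folder/rea" (the last "/" sits at index 9):

matches = get_data_matches("my_folder/rea", 9, dxproj, "/my_folder",
                           classname="file")
# Every returned string starts with the typed text, e.g.
# "my_folder/reads.fastq.gz"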
Example #18
def main():
    args = get_args()

    authid, authpw, server = common.processkey(args.key, args.keyfile)
    keypair = (authid, authpw)
    project = resolve_project(args.project)
    SRR_files = dxpy.find_data_objects(
                    name="SRR???????_?.fastq.gz", name_mode='glob',
                    classname='file', recurse=True, return_handler=True,
                    folder=args.folder, project=args.project)
    for srr_dxfile in SRR_files:
        m = re.search(r'(SRR.{7})_(\d)', srr_dxfile.name)
        if m:
            srr_basename = m.group(1)
            end_num = m.group(2)
        else:
            assert m
        srr_encfiles = common.encoded_get('/'.join([server,'search/?type=File&external_accession=%s&status!=deleted&status!=replaced&status!=revoked' % (srr_basename)]), keypair)['@graph']
        if not srr_encfiles:
            logging.error('%s object not found at ENCODE.  Skipping.' % (srr_basename))
            continue
        elif len(srr_encfiles) > 1:
            logging.error('%s multiple matching objects found at ENCODE.  Skipping.' % (srr_basename))
            continue
        else:
            srr_encfile = srr_encfiles[0]
        # experiment = common.encoded_get('/'.join([server, srr_encfile.get('dataset')]), keypair)
        # replicate = common.encoded_get('/'.join([server, srr_encfile.get('replicate')]), keypair)
        # biorep_n = replicate.get('biological_replicate_number')
        all_fastqs = common.encoded_get('/'.join([
            server,
            'search/?type=File&file_format=fastq&derived_from=/files/%s/&status!=deleted&status!=revoked&status!=replaced' % (srr_basename)
        ]), keypair)['@graph']
        if not all_fastqs:
            print("%s: no fastq(s) found.  Skipping." % (srr_dxfile.name))
            continue
        fastqs = []  # guard: stays empty if end_num is something unexpected
        if end_num == '1':
            fastqs = [f for f in all_fastqs if f.get('run_type') == 'single-ended' or f.get('paired_end') == end_num]
        elif end_num in ['2', '3']:
            fastqs = [f for f in all_fastqs if f.get('run_type') == 'paired-ended' and f.get('paired_end') == '2']
        if not fastqs:
            print("%s: no fastq(s) found for paired_end %s.  Skipping" % (srr_basename, end_num))
            continue
        elif len(fastqs) > 1:
            print("%s: ambiguous matches to %s.  Skipping" % (srr_basename, [f.get('accession') for f in fastqs]))
            continue
        else:
            fastq = fastqs[0]
            newname = '%s.fastq.gz' % (fastq.get('accession'))
            if args.dry_run:
                print('dry_run: Could rename %s to %s' % (srr_dxfile.name, newname))
            else:
                srr_dxfile.set_properties({'srr_filename': srr_dxfile.name})
                srr_dxfile.rename(newname)
                print('%s renamed to %s' % (srr_dxfile.name, newname))
Example #19
def process(filename, bucket_url, project, folder, skipvalidate=False):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    logger.debug(filename)

    test = list( dxpy.find_data_objects(classname='file',
                           folder=folder, project=project, name_mode='exact',
                           name=filename, return_handler=False) )

    if not test or len(test) == 0:
        #cp the file from the bucket
        subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' %(bucket_url)), stderr=subprocess.STDOUT)
        subprocess.check_call(shlex.split('ls -l %s' %(filename)))
        dx_file = dxpy.upload_local_file(filename, project=project, folder=folder)

    else:
        dxpy.download_dxfile(test[0]['id'], filename)
        dx_file=dxpy.dxfile.DXFile(test[0]['id'])
    # str.rstrip() strips a *character set*, not a suffix, so the original
    # chained rstrip calls could eat trailing letters of the basename;
    # strip the extensions as suffixes instead.
    reads_basename = filename
    for ext in ('.gz', '.fastq', '.fq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]

    if skipvalidate:
        return {
            "file": dx_file,
            "report": None,
            "summary": None,
            "zip": None
        }

    subprocess.check_call(['mkdir', 'output'])
    logger.info("Run QC")
    fqc_command = "/usr/bin/FastQC/fastqc " + filename + " -o output"
    logger.debug(fqc_command)
    stdio = subprocess.check_output(shlex.split(fqc_command))
    logger.debug(stdio)
    logger.debug(subprocess.check_output(['ls','-l', 'output']))
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])
    logger.info("Upload results")

    subprocess.check_call(['mv', "%s_fastqc/fastqc_data.txt" % reads_basename, "%s_data.txt" % reads_basename ])
    subprocess.check_call(['mv', "%s_fastqc/summary.txt" % reads_basename, "%s_summary.txt" % reads_basename ])

    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename, folder=folder, project=project)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename, folder=folder, project=project)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename, folder=folder, project=project)
    logger.debug(report_dxfile)
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
Example #20
def _resolve_global_entity(project_or_job_id, folderpath, entity_name, describe=True, visibility="either"):
    """
    :param project_or_job_id: The project ID to which the entity belongs
                              (then the entity is an existing data object),
                              or the job ID to which the entity belongs
                              (then the entity is a job-based object
                              reference to an object that may not exist yet)
    :type project_or_job_id: string
    :param folderpath: Full path to the object (parsed from command line)
    :type folderpath: string
    :param entity_name: Name of the object
    :type entity_name: string
    :param describe: Input mapping used to describe the job's project if
                     project_or_job_id is a job ID, or True if the input
                     mapping is to be empty
    :type describe: dict or True
    :param visibility: The expected visibility of the entity ("either",
                       "hidden", or "visible"); to be used in resolution
    :type visibility: string
    :returns: The results obtained from attempting to resolve the entity;
              the expected format of the return value is described below
    :rtype: list
    :raises: ResolutionError if dxpy.find_data_objects throws an error

    If project_or_job_id is a job ID, then return value will be like:
        [{"id": ..., "describe": {...}}, ...]

    Otherwise, the return value will be like:
        [{"id": ..., "project": ..., "describe": {...}}, ...]
    Note that if the entity is successfully resolved, then the "describe"
    key will be in the dictionary if and only if a nonempty describe
    mapping was provided.

    TODO: Inspect entity_name and conditionally treat it as a "glob" pattern.
    """
    if is_job_id(project_or_job_id):
        if describe is True:
            describe = {}
        # The following function call will raise a ResolutionError if no results
        # could be found.
        # If the call is successful, then the project will be incorporated into the
        # "describe" mapping of the returned dictionaries.
        return resolve_job_ref(project_or_job_id, entity_name, describe=describe)
    else:
        try:
            return list(dxpy.find_data_objects(project=project_or_job_id,
                                               folder=folderpath,
                                               name=entity_name,
                                               name_mode='glob',
                                               recurse=False,
                                               describe=describe,
                                               visibility=visibility))
        except Exception as details:
            raise ResolutionError(str(details))
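A hedged usage sketch; the project ID is a placeholder, and the glob resolves against the given folder without recursing:

results = _resolve_global_entity("project-xxxx", "/reads", "*.fastq.gz")
for r in results:
    print(r["id"])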
Example #21
def find_replicates(reps, source_id, project, experiment, test=False):
    replicates = []
    for rep in reps:
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='glob', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)

    if not test:
        replicates = copy_files(replicates, project.get_id(), "/"+experiment)

    return replicates
Example #22
    def count_logfiles(self):
        """Count logfiles in the DNAnexus project (self.id). Logfiles are in an expected location.
        Returns:
            logfile_count (int): A count of logfiles"""
        # Set uploaded runfolder name. Runfolder is renamed upon upload to the DNAnexus project
        # without the first four characters
        uploaded_runfolder = dxpy.describe(self.id)['name'][4:]
        # Set logfile location in the DNAnexus project. This is expected in 'Logfiles/',
        # a subdirectory of the uploaded runfolder
        logfile_dir = str(Path('/', uploaded_runfolder, 'Logfiles'))
        logfile_list = dxpy.find_data_objects(project=self.id,
                                              folder=logfile_dir,
                                              classname='file')
        logfile_count = len(list(logfile_list))
        return logfile_count
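Since find_data_objects yields results lazily, the list() call above exists only to count them. An equivalent sketch in the same method context that counts without materializing the list:

logfile_count = sum(1 for _ in dxpy.find_data_objects(project=self.id,
                                                      folder=logfile_dir,
                                                      classname='file'))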
Example #23
def get_dxwdl_applet():
    """Build or find the applet to run dxWDL."""

    found_applets = list(
        dxpy.find_data_objects(name=APPLET_NAME,
                               properties={"version": APPLET_VERSION},
                               classname="applet",
                               state="closed",
                               return_handler=True))

    if found_applets:
        return found_applets[0]
    else:
        return build_applet()
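The lookup keys on a "version" property, so the build step must attach that property for the next run to find the applet. A hedged sketch of that step via the generic data-object handler API; applet_id stands in for whatever build_applet returns:

applet = dxpy.DXApplet(applet_id)
applet.set_properties({"version": APPLET_VERSION})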
Example #24
def find_replicates(reps, source_id, project, experiment, test=False):
    replicates = []
    for rep in reps:
        dx_rep = dxpy.find_data_objects(classname='file',
                                        name=rep,
                                        name_mode='glob',
                                        project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)

    if not test:
        replicates = copy_files(replicates, project.get_id(), "/" + experiment)

    return replicates
Example #25
def find_file(filePath,project=None,verbose=False,multiple=False, recurse=True):
    '''Using a DX style file path, find the file.'''
    proj = project
    path = filePath
    fileName = filePath
    if filePath.find(':') != -1:
        proj, path = filePath.split(':', 1)
    if path.rfind('/') != -1:
        path, fileName = path.rsplit('/', 1)
    else:
        fileName = path
        path = '/'
    if proj is None:
        if verbose:
            print "ERROR: Don't know what project to use for '" + path + "'."
        return None
    if proj.find('project-') == 0:
        projId = proj
    else:
        projId = get_project(proj, level='VIEW').get_id()
    mode = 'exact'
    # str.find() returns -1 (truthy) when absent; test membership instead.
    if '*' in filePath or '?' in filePath:
        mode = 'glob'
    fileDicts = list(dxpy.find_data_objects(classname='file', folder=path, name=fileName, recurse=recurse,
                                            name_mode=mode, project=projId, return_handler=False))

    if not fileDicts:
        #print "- Found 0 files from '" + proj + ":" + filePath + "'."
        if verbose:
            print "ERROR: Failed to find '" + proj + ":" + filePath + "'."
        return None
    elif len(fileDicts) > 1 or multiple:
        #print "- Found "+str(len(fileDict))+" files from '" + proj + ":" + filePath + "'."
        if not multiple:
            if verbose:
                print "ERROR: Found "+str(len(fileDicts))+" files when expecting 1 '" + proj + ":" + filePath + "'."
            return None
        else:
            if verbose:
                print " Found "+str(len(fileDicts))+" files for '" + proj + ":" + filePath + "'."
        fids = []
        for fileDict in fileDicts:
            FILES[fileDict['id']] = dxpy.dxlink(fileDict)
            fids.append( fileDict['id'] )
        return fids
    else:
        #print "- FOUND '" + proj + ":" + filePath + "'."
        FILES[fileDicts[0]['id']] = dxpy.dxlink(fileDicts[0])
        return fileDicts[0]['id']
Example #26
def get_data_matches(text, delim_pos, dxproj, folderpath, classname=None, typespec=None, visibility=None):
    """
    :param text: String to be tab-completed; still in escaped form
    :type text: string
    :param delim_pos: index of last unescaped "/" or ":" in text
    :type delim_pos: int
    :param dxproj: DXProject handler to use
    :type dxproj: DXProject
    :param folderpath: Unescaped path in which to search for data object matches
    :type folderpath: string
    :param classname: Data object class by which to restrict the search (None for no restriction on class)
    :type classname: string
    :param visibility: Visibility to constrain the results to; default is "visible" for empty strings, "either" for nonempty
    :type visibility: string
    :returns: List of matches
    :rtype: list of strings

    Members of the returned list are guaranteed to start with *text*
    and be in escaped form for consumption by the command-line.
    """

    # unescaped_text = unescape_completion_name_str(text[delim_pos + 1:])
    unescaped_text = text[delim_pos + 1 :]

    if visibility is None:
        if text != "" and delim_pos != len(text) - 1:
            visibility = "either"
        else:
            visibility = "visible"

    try:
        results = list(
            dxpy.find_data_objects(
                project=dxproj.get_id(),
                folder=folderpath,
                name=unescaped_text + "*",
                name_mode="glob",
                recurse=False,
                visibility=visibility,
                classname=classname,
                limit=100,
                describe=True,
                typename=typespec,
            )
        )
        prefix = "" if text == "" else text[: delim_pos + 1]
        return [prefix + escape_name(result["describe"]["name"]) for result in results]
    except:
        return []
Example #27
def get_localizer_applet():
    """Return a dxpy.DXApplet object for the localizer applet."""

    # First try to find an existing applet.
    found_applets = list(
        dxpy.find_data_objects(name=APPLET_NAME,
                               properties={"version": APPLET_VERSION},
                               classname="applet",
                               state="closed",
                               return_handler=True))

    if found_applets:
        return found_applets[0]
    else:
        return build_applet()
Example #28
def main():
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')

    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    replicates = []
    for rep in args.replicates:
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='glob', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)

    if not args.test:
        replicates = copy_files(replicates, project.get_id(), "/"+args.experiment)

    if not replicates:
        print "No replicates found in project: " + project.name
        print "Looking for " + ", ".join(args.replicates)
        sys.exit(1)


    paired = args.paired
    gender = args.gender
    organism = args.organism
    #TODO determine paired or gender from ENCSR metadata
    # Now create a new workflow ()
    spec_name = args.experiment+'-'+'-'.join([ r.split('.')[0] for r in args.replicates])
    wf = dxpy.new_dxworkflow(title='dx_dna_me_'+spec_name,
                             name='ENCODE Bismark DNA-ME pipeline: '+spec_name,
                             description='The ENCODE Bismark pipeline for WGBS shotgun methylation analysis for experiment' + args.experiment,
                             folder='/'+args.experiment,
                             project=project.get_id())

    populate_workflow(wf, replicates, args.experiment, paired, gender, organism, project.id)
Example #29
def get_ta_from_accessions(accessions, default_project, ta_folders):
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug(
            "Looking for TA's in %s %s %s" % (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
            classname='file',
            state='closed',
            folder=path + 'bams/',
            project=project_id,
            describe=True,
            recurse=True,
            name='*tagAlign.gz',
            name_mode='glob'
        ):
            possible_files.append(dxfile.get('describe'))
    matched_files = \
        [f for f in possible_files if all([acc in f['name'] for acc in accessions])]
    if not matched_files:
        logging.error(
            'Could not find tagAlign with accessions %s' % (accessions))
        return None
    elif len(matched_files) > 1:
        logging.warning(
            'Found multiple tagAligns that matched accessions %s'
            % (accessions))
        logging.warning(
            'Matched files %s'
            % ([(f['folder'], f['name']) for f in matched_files]))
        logging.warning('Using first one found')
        return matched_files[0]
    else:
        return matched_files[0]
Example #31
    def addLevel(self, node, folder):
        """
        Recurse into folders, and find all IGV-compatible files to be added to registry
        :param node: an Element, or SubElement to add items to
        :param folder: a folder to find files within
        :param debug: boolean. If True, then stop finding files after the first one.
        :return: nothing.
        """
        assert node is not None
        assert folder is not None

        print("Adding {}:{}".format(self.project.name, folder))
        subfolders = dxpy.api.project_list_folder(self.project.id, input_params={"folder": folder, "describe": {
            "fields": {"id": True, "name": True, "class": True}}, "only": "folders", "includeHidden": False},
                                                  always_retry=True)["folders"]
        subfolders = [os.path.basename(subfolder) for subfolder in subfolders]
        subfolders = list(set(subfolders) - set(("metrics", "inputFastq", "reports")))

        for subfolder in subfolders:
            subnode = SubElement(node, "Category", name=subfolder)
            subnodepath = str(folder + "/" + subfolder).replace("//", "/")
            self.addLevel(subnode, subnodepath)

        dxfiles = list(dxpy.find_data_objects(
            recurse=False, folder=folder, return_handler=True, project=self.project.get_id())
        )
        dxfiles.sort(key=lambda x: x.name)

        for dxfile in dxfiles:
            if isinstance(dxfile, dxpy.DXFile):
                # n, ext = os.path.splitext(dxfile.name)
                if str(dxfile.name).endswith("bam"):
                    self.__addIndexedFile(dxfile, folder, node, ["bai"])
                elif str(dxfile.name).endswith("vcf.gz"):
                    self.__addIndexedFile(dxfile, folder, node, ["tbi", "idx"])
                elif str(dxfile.name).endswith("bw"):
                    self.__addNonIndexedFile(dxfile, folder, node)
                elif str(dxfile.name).endswith("bed.gz"):
                    self.__addNonIndexedFile(dxfile, folder, node)
                elif str(dxfile.name).endswith("seg"):
                    self.__addNonIndexedFile(dxfile, folder, node)
                elif str(dxfile.name).endswith("cn"):
                    self.__addNonIndexedFile(dxfile, folder, node)
Esempio n. 32
0
    def find_fastq_files(self):
        '''
        Description: Returns a list of dxids, one for each fastq file in the
        lane project.

        DEV: Same approach applies to interop and bam files.
        '''
        fastq_dxids = []
        fastq_files_generator = dxpy.find_data_objects(classname='file',
                                                       name='*.fastq.gz',
                                                       name_mode='glob',
                                                       project=self.project_id,
                                                       folder='/')
        for fastq_dict in fastq_files_generator:
            fastq_dxid = fastq_dict['id']
            fastq_dxids.append(fastq_dxid)
        return fastq_dxids
Example #33
def main():
    parser = argparse.ArgumentParser(
        description=
        'Create a manifest file for a particular folder in a project')
    parser.add_argument('folder',
                        help='a folder in the current DNAnexus project')
    parser.add_argument('--outfile',
                        help='Name of the output file',
                        default='manifest.json.bz2')
    parser.add_argument(
        '-r',
        '--recursive',
        help='Recursively traverse folders and append to manifest',
        action='store_true')

    args = parser.parse_args()

    project, folder, _ = resolve_existing_path(args.folder)

    ids = dxpy.find_data_objects(classname='file',
                                 first_page_size=1000,
                                 describe={
                                     'fields': {
                                         'id': True,
                                         'name': True,
                                         'folder': True,
                                         'parts': True
                                     }
                                 },
                                 project=project,
                                 folder=folder,
                                 recurse=args.recursive)
    manifest = {project: []}

    for i, f in enumerate(ids):
        manifest[project].append(fileID2manifest(f['describe'], project))
        if i % 1000 == 0 and i != 0:
            print("Processed {} files".format(i))

    # bz2.compress expects and returns bytes, so encode the JSON and open the
    # output file in binary mode.
    with open(args.outfile, "wb") as f:
        f.write(bz2.compress(json.dumps(manifest, indent=2, sort_keys=True).encode()))

    print("Manifest file written to {}".format(args.outfile))
    print("Total {} objects".format(len(manifest[project])))
Example #34
    def find_bam_files(self):
        ''' DEV: DEPRECATED
                 add functionality to also find BAI files
        '''

        bam_dxids = []
        bam_files_generator = dxpy.find_data_objects(classname='file',
                                                     name='*.bam',
                                                     name_mode='glob',
                                                     project=self.project_id,
                                                     folder='/')
        bam_files = list(bam_files_generator)

        if len(bam_files) < 1:
            print 'Info: No bam files found.'
            return bam_dxids
        for bam_dict in bam_files:
            bam_dxids.append(bam_dict['id'])
        return bam_dxids
Example #35
def get_repns(exp_id, ta_folders):
    possible_files = []  # was referenced uninitialized in the original snippet
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project  # assumed to be a module-level global
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project):
            desc = dxfile.get('describe')
            if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    print "%s %i possible files" %(exp_id, len(possible_files))
    folders = [f.get('folder') for f in possible_files]
    print "%s folders %s" %(exp_id, folders)
def get_all_tas(experiment, default_project, ta_folders):
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(classname='file', state='closed', folder=path, describe=True, recurse=True, project=project):
            desc = dxfile.get('describe')
            if exp_id in desc.get('folder') and '/bams' in desc.get('folder') and desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    return possible_files
Example #37
def get_tas(exp_id, default_project, ta_folders):
    possible_files = []
    for base_folder in ta_folders:
        if ":" in base_folder:
            project_name, path = base_folder.split(":")
            project = resolve_project(project_name)
            project = project.get_id()
            project_name += ":"
        else:
            project = default_project
            project_name = ""
            path = base_folder
        if not path.startswith("/"):
            path = "/" + path
        print project, project_name, path
        for dxfile in dxpy.find_data_objects(
            classname="file", state="closed", folder=path, describe=True, recurse=True, project=project
        ):
            desc = dxfile.get("describe")
            if (
                exp_id in desc.get("folder")
                and "/bams" in desc.get("folder")
                and desc.get("name").endswith(("tagAlign", "tagAlign.gz"))
            ):
                possible_files.append(desc)
    print "%s %i possible files" % (exp_id, len(possible_files))
    rep1_files = [f for f in possible_files if "rep1" in f.get("folder")]
    rep2_files = [f for f in possible_files if "rep2" in f.get("folder")]
    if len(rep1_files) != 1:
        print "Tried to find one rep1 ta, found %d" % (len(rep1_files))
        rep1 = None
    else:
        rep1 = rep1_files[0].get("project") + ":" + rep1_files[0].get("folder") + "/" + rep1_files[0].get("name")
    if len(rep2_files) != 1:
        print "Tried to find one rep2 ta, found %d" % (len(rep2_files))
        rep2 = None
    else:
        rep2 = rep2_files[0].get("project") + ":" + rep2_files[0].get("folder") + "/" + rep2_files[0].get("name")

    return rep1, rep2
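
A minimal usage sketch for the function above (the accession, project ID, and
folder names here are placeholders, not values from the original source):

rep1_ta, rep2_ta = get_tas(exp_id='ENCSR000XYZ',
                           default_project='project-xxxx',
                           ta_folders=['scratch:/mapping', '/pipeline-output'])
if rep1_ta is None or rep2_ta is None:
    print('Could not uniquely resolve a tagAlign for each replicate')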
Example n. 38
def get_data_matches(text, delim_pos, dxproj, folderpath, classname=None,
                     typespec=None):
    '''
    :param text: String to be tab-completed; still in escaped form
    :type text: string
    :param delim_pos: index of last unescaped "/" or ":" in text
    :type delim_pos: int
    :param dxproj: DXProject handler to use
    :type dxproj: DXProject
    :param folderpath: Unescaped path in which to search for data object matches
    :type folderpath: string
    :param classname: Data object class by which to restrict the search (None for no restriction on class)
    :type classname: string
    :param typespec: Data object type by which to restrict the search (None for no restriction on type)
    :type typespec: string or dict
    :returns: List of matches
    :rtype: list of strings

    Members of the returned list are guaranteed to start with *text*
    and be in escaped form for consumption by the command-line.
    '''

    unescaped_text = unescape_completion_name_str(text[delim_pos + 1:])

    try:
        results = list(dxpy.find_data_objects(project=dxproj.get_id(),
                                              folder=folderpath,
                                              name=unescaped_text + "*",
                                              name_mode="glob",
                                              recurse=False,
                                              visibility='either' if text != '' and delim_pos != len(text) - 1 else 'visible',
                                              classname=classname,
                                              limit=100,
                                              describe=True,
                                              typename=typespec))
        names = map(lambda result: result['describe']['name'], results)
        return filter(startswith(text),
                      map(lambda name:
                              ('' if text == '' else text[:delim_pos + 1]) + escape_completion_name_str(name),
                          names))
    except:
        return []
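
The completion helper above ultimately issues a single server-side glob
search; a standalone sketch of that core query (project and folder values
are placeholders) might look like:

import dxpy

results = dxpy.find_data_objects(project='project-xxxx',
                                 folder='/reads',
                                 name='SRR18*',      # partial name plus glob
                                 name_mode='glob',
                                 recurse=False,
                                 limit=100,
                                 describe=True)
names = [r['describe']['name'] for r in results]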
Example n. 39
    def find_fastqs(self):
        """Returns a list of files in the DNAnexus project (self.id) with the fastq.gz extension"""
        # Search dnanexus for files with the fastq.gz extension.
        # name_mode='regexp' tells dxpy to look for any occurrence of 'fastq.gz' in the filename
        search_response = dxpy.find_data_objects(project=self.id,
                                                 classname='file',
                                                 name='fastq.gz',
                                                 name_mode='regexp')
        file_ids = [result['id'] for result in search_response]

        # Gather a list of uploaded fastq files with the state 'closed', indicating a completed upload.
        fastq_filenames_unsorted = []
        for dx_file in file_ids:
            file_description = dxpy.describe(dx_file)
            if file_description['state'] == 'closed':
                fastq_filenames_unsorted.append(file_description['name'])
        # Sort fastq filenames for cleaner logfile output
        fastq_filenames = sorted(fastq_filenames_unsorted)
        self.logger.debug(
            f'{self.id} contains {len(fastq_filenames)} "closed" fastq files: {fastq_filenames}'
        )
        return fastq_filenames
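
Note that the method above describes every match a second time just to check
its state. Since dxpy.find_data_objects() also accepts state and describe
arguments (both used in other examples on this page), the same result can
plausibly be obtained in one query; a sketch, with project_id standing in
for self.id:

results = dxpy.find_data_objects(project=project_id,
                                 classname='file',
                                 name='fastq.gz',
                                 name_mode='regexp',
                                 state='closed',
                                 describe=True)
fastq_filenames = sorted(r['describe']['name'] for r in results)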
Example n. 40
def get_vg_bundle(project, applets_folder, existing_dxid=None):
    if existing_dxid is not None:
        return dxpy.DXFile(existing_dxid)
    
    # determine desired git revision of vg
    vg_git_revision = subprocess.check_output(["git", "describe", "--long", "--always", "--tags"],
                                              cwd=os.path.join(here,"vg")).strip()
    # is the exe available already?
    existing = dxpy.find_data_objects(classname="file", typename="vg_bundle",
                                      project=project.get_id(), folder="/vg-bundle",
                                      properties={"git_revision": vg_git_revision},
                                      return_handler=True)
    existing = list(existing)
    if len(existing) > 0:
        if len(existing) > 1:
            print("Warning: found multiple vg bundles with git_revision={}, picking one".format(vg_git_revision))
        existing = existing[0]
        print("Using vg bundle {} ({})".format(vg_git_revision,existing.get_id()))
        return existing
    
    # no - build one for this git revision
    project.new_folder("/vg-bundle", parents=True)
    print("Building new vg bundle for {}".format(vg_git_revision))
    build_cmd = ["dx","build","-f","--destination",project.get_id()+":/vg-bundle/",os.path.join(here,"vg_bundle_builder")]
    print(" ".join(build_cmd))
    build_applet = dxpy.DXApplet(json.loads(subprocess.check_output(build_cmd))["id"])
    build_job = build_applet.run({"git_commit": vg_git_revision},
                                 project=project.get_id(), folder="/vg-bundle",
                                 name="vg_bundle_builder " + vg_git_revision)
    print("Launched {} to build vg bundle, waiting...".format(build_job.get_id()))
    noise = subprocess.Popen(["/bin/bash", "-c", "while true; do sleep 60; date; done"])  # keep-alive output while waiting on the job
    try:
        build_job.wait_on_done()
    finally:
        noise.kill()
    vg_bundle = dxpy.DXFile(build_job.describe()["output"]["vg_bundle"])
    print("Using vg bundle {} ({})".format(vg_git_revision,vg_bundle.get_id()))
    return vg_bundle
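
The lookup at the top of get_vg_bundle() is a small build-cache pattern:
artifacts are keyed by a property holding the git revision, so rebuilds are
skipped when a matching file already exists. The pattern in isolation
(function and parameter names here are placeholders following the example
above):

import dxpy

def find_cached_artifact(project_id, folder, revision):
    hits = list(dxpy.find_data_objects(classname="file",
                                       project=project_id,
                                       folder=folder,
                                       properties={"git_revision": revision},
                                       return_handler=True))
    return hits[0] if hits else None   # None means: build it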
Example n. 41
def _clone_to_all_regions(region2projid, regions, asset_file_name, folder,
                          url):
    jobs = []
    for region in regions:
        dest_proj_id = region2projid[region]
        results = list(
            dxpy.find_data_objects(classname="file",
                                   visibility="hidden",
                                   name=asset_file_name,
                                   project=dest_proj_id,
                                   folder=folder))
        file_ids = [p["id"] for p in results]
        nfiles = len(file_ids)
        if nfiles == 1:
            continue
        if nfiles > 1:
            print("cleanup in {}, found {} files instead of 0/1".format(
                dest_proj_id, nfiles))
            dxpy.DXProject(dest_proj_id).remove_objects(file_ids)
        dxjob = _clone_asset_into_region(region, dest_proj_id, asset_file_name,
                                         folder, url)
        jobs.append(dxjob)
    return jobs
Example n. 42
def main():
    args = get_args()
    if len(args.replicates) < 1:
        sys.exit('Need to have at least 1 replicate file.')

    project = resolve_project(ENCODE_DNA_ME_PROJECT_NAME)
    print 'Project: ' + project.describe()['name']
    print 'Experiment to analyze: ' + args.experiment
    if not project_has_folder(project, '/'+args.experiment):
        project.new_folder('/'+args.experiment)

    #TODO get all replicate ids from encoded DB from ENCSR (args.experiment)
    #TODO error out if ENCSR not found, status not complete etc.
    if args.test:
        source_id = project.get_id()
    else:
        source_id = resolve_project(ENCODE_SNAPSHOT_PROJECT, level='VIEW').get_id()

    replicates = []
    for rep in args.replicates:
        dx_rep = dxpy.find_data_objects(classname='file', name=rep,
                                        name_mode='exact', project=source_id,
                                        return_handler=False)
        replicates.extend(dx_rep)

    if not args.test:
        replicates = copy_files(replicates, project.get_id(), "/"+args.experiment)

    if not replicates:
        print "No replicates found in project: " + project.name
        print "Looking for " + ", ".join(args.replicates)
        sys.exit(1)

    inputs = {
        'rnd_seed': 12345
    }
    inputs['paired'] = args.paired
    inputs['gender'] = args.gender
    inputs['organism'] = args.organism
    inputs['library_id'] = args.library
    inputs['nthreads'] = args.nthreads
    #TODO determine paired or gender from ENCSR metadata
    # Now create a new workflow
    inputs['spec_name'] = args.experiment+'-'+'-'.join([ r.split('.')[0] for r in args.replicates])
    title_root = 'dx_long_rna_seq_'
    name_root = 'ENCODE Long RNA Seq: '
    desc = 'The ENCODE RNA Seq pipeline for long RNAs'
    if args.paired:
        title_root = title_root + '_paired_end '
        name_root = name_root + '(paired-end) '
        inputs['stranded'] = True
    else:
        title_root = title_root + '_single_end '
        name_root = name_root + '(single-end) '
        inputs['stranded'] = False


    if args.export:
        project_id = dxpy.find_one_project(name=ENCODE_PUBLIC_PROJECT, name_mode='exact', return_handler=False)['id']
        wf = dxpy.new_dxworkflow(title=title_root,
                                 name=name_root,
                                 description=desc,
                                 folder=PUBLIC_FOLDER,
                                 project=project_id)
    else:
        project_id = project.get_id()
        wf = dxpy.new_dxworkflow(title=title_root+inputs['spec_name'],
                                 name=name_root+inputs['spec_name'],
                                 description=desc+' for experiment:' + args.experiment,
                                 folder='/'+args.experiment,
                                 project=project.get_id())

    populate_workflow(wf, replicates, args.experiment, inputs, project.id, args.export)
Example n. 43
def get_tas(experiment, server, keypair, default_project, ta_folders):
    # tas = {
    #   'rep1_ta': {
    #       'file_id': "",
    #       'project_id': "",
    #       'folder': "",
    #       'name': "",
    #       'paired_end': False,
    #       'control_path': "",
    #       'enc_repn': 0
    #.for each ta_folder get list of TA's in /ta_folder/bams/ENCSR...
    #.from this list infer repns from the paths ../bams/ENCSR.../repn*
    #.from this list infer the ENCFF's for the fastqs that were used
    #for each repn go to the experiment and find all the fastqs for that rep
    #if there are different fastq's in the experiment, or different reps, warn
    #for each fastq found in the TA filename, find its controlled_by
    #if any have controlled_by, all must have controlled_by else error
    #   gather the list of controlled by and find a TA (anywhere in ta_folders) with those ENCFF's, else error
    #else get possible_controls and try to match the repn, else pick one (remember it)
    #   gather the list of fastqs in the possible_controls and find (one) TA with those ENCFF's, else error
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug("Looking for TA's in %s %s %s" %
                      (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
                classname='file',
                state='closed',
                folder=path + 'bams/%s/' % (exp_id),
                project=project_id,
                describe=True,
                recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' % (len(possible_files)))
    logging.debug('%s' % ([(f.get('folder'), f.get('name'))
                           for f in possible_files]))
    repns = []
    files_to_ignore = []
    for f in possible_files:
        m = re.search(r'/rep(\d+)$', f['folder'])
        if m:
            repn = int(m.group(1))
            logging.debug("Matched rep%d" % (repn))
            if repn in repns:
                logging.warning(
                    "Ignoring additional rep%d bam, using first found" %
                    (repn))
                files_to_ignore.append(f)
            else:
                logging.debug("First time finding rep%d" % (repn))
                repns.append(repn)
        else:
            logging.error("Cannot parse rep number from %s" % (f['folder']))
            return None
    for f in files_to_ignore:
        possible_files.remove(f)
    logging.debug('Discovered repns %s' % (repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s" %
                      (exp_id, len(repns), repns))
        return None

    tas = {}
    used_controls = []
    for i, repn in enumerate(repns):
        encode_files = [
            common.encoded_get(server + '/files/%s/' % (f), keypair)
            for f in get_encffs(possible_files[i].get('name'))
        ]
        controlled_by = common.flat(
            [f.get('controlled_by') for f in encode_files])
        if any(controlled_by):
            controlled_by_accessions = list(
                set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(controlled_by_accessions,
                                                      default_project,
                                                      ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error(
                    "%s: Could not find controlled_by_ta for accessions %s" %
                    (experiment.get('accession'), controlled_by_accessions))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
        else:
            #evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning(
                '%s: No controlled_by for rep%d, attempting to infer from possible_controls %s'
                % (experiment.get('accession'), repn, possible_controls))
            if not possible_controls or not any(possible_controls):
                logging.error(
                    '%s: Could not find controlled_by or resolve possible_controls for rep%d'
                    % (experiment.get('accession'), repn))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server,
                                                 keypair, default_project,
                                                 ta_folders, used_controls)
                controlled_by_ta_name = control_ta.get('name')
                controlled_by_ta_id = control_ta.get('id')
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps' %
                            (controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        #if encode repns are 1,2 then let the pipeline input rep numbers (1 or 2) be the same.
        #Otherwise the mapping is arbitrary, but at least do it with smaller rep number first.
        if repn == min(repns):
            ta_index = 1
        else:
            ta_index = 2
        tas.update({
            'rep%d_ta' % (ta_index): {
                'file_id': possible_files[i].get('id'),
                'project_id': possible_files[i].get('project'),
                'folder': possible_files[i].get('folder'),
                'file_name': possible_files[i].get('name'),
                'enc_fqs': get_encffs(possible_files[i].get('name')),
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(possible_files[i])
            }
        })

    return tas
Example n. 44
def resolve_existing_path(path,
                          expected=None,
                          ask_to_resolve=True,
                          expected_classes=None,
                          allow_mult=False,
                          describe={},
                          all_mult=False,
                          allow_empty_string=True):
    '''
    :param ask_to_resolve: Whether picking may be necessary (if true, a list is returned; if false, only one result is returned)
    :type ask_to_resolve: boolean
    :param allow_mult: Whether to allow the user to select multiple results from the same path
    :type allow_mult: boolean
    :param describe: Input hash to describe call for the results
    :type describe: dict
    :param all_mult: Whether to return all matching results without prompting (only applicable if allow_mult == True)
    :type all_mult: boolean
    :returns: A LIST of results when ask_to_resolve is False or allow_mult is True
    :raises: :exc:`ResolutionError` if the request path was invalid, or a single result was requested and input is not a TTY
    :param allow_empty_string: If false, a ResolutionError will be raised if *path* is an empty string. Use this when resolving the empty string could result in unexpected behavior.
    :type allow_empty_string: boolean

    Returns either a list of results or a single result (depending on
    how many is expected; if only one, then an interactive picking of
    a choice will be initiated if input is a tty, or else throw an error).

    TODO: Always treats the path as a glob pattern.

    Output is of the form {"id": id, "describe": describe hash} or a list
    of those.

    TODO: Allow arbitrary flags for the describe hash.

    NOTE: if expected_classes is provided and conflicts with the class
    of the hash ID, it will return None for all fields.
    '''

    project, folderpath, entity_name = resolve_path(
        path, expected, allow_empty_string=allow_empty_string)

    if entity_name is None:
        # Definitely a folder (or project)
        # FIXME? Should I check that the folder exists if expected="folder"?
        return project, folderpath, entity_name
    elif is_hashid(entity_name):
        found_valid_class = True
        if expected_classes is not None:
            found_valid_class = False
            for klass in expected_classes:
                if entity_name.startswith(klass):
                    found_valid_class = True
        if not found_valid_class:
            return None, None, None

        if 'project' not in describe:
            if project != dxpy.WORKSPACE_ID:
                describe['project'] = project
            elif dxpy.WORKSPACE_ID is not None:
                describe['project'] = dxpy.WORKSPACE_ID
        try:
            desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe',
                                      describe)
        except Exception as details:
            if 'project' in describe:
                # Now try it without the hint
                del describe['project']
                try:
                    desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe',
                                              describe)
                except Exception as details:
                    raise ResolutionError(str(details))
            else:
                raise ResolutionError(str(details))
        result = {"id": entity_name, "describe": desc}
        if ask_to_resolve and not allow_mult:
            return project, folderpath, result
        else:
            return project, folderpath, [result]
    elif project is None:
        raise ResolutionError(
            'Could not resolve \"' + path +
            '\" to a project context.  Please either set a default project using dx select or cd, or add a colon (":") after your project ID or name'
        )
    else:
        msg = 'Object of name ' + unicode(
            entity_name) + ' could not be resolved in folder ' + unicode(
                folderpath) + ' of project ID ' + str(project)
        # Probably an object
        if is_job_id(project):
            # The following will raise if no results could be found
            results = resolve_job_ref(project, entity_name, describe=describe)
        else:
            try:
                results = list(
                    dxpy.find_data_objects(project=project,
                                           folder=folderpath,
                                           name=entity_name,
                                           name_mode='glob',
                                           recurse=False,
                                           describe=describe,
                                           visibility='either'))
            except BaseException as details:
                raise ResolutionError(str(details))
        if len(results) == 0:
            # Could not find it as a data object.  If anything, it's a
            # folder.

            if '/' in entity_name:
                # Then there's no way it's supposed to be a folder
                raise ResolutionError(msg)

            # This is the only possibility left.  Leave the
            # error-checking for later.  Note that folderpath does
            # not end with a trailing '/'.
            possible_folder = folderpath + '/' + entity_name
            possible_folder, skip = clean_folder_path(possible_folder,
                                                      'folder')
            return project, possible_folder, None

        # Caller wants ALL results; just return the whole thing
        if not ask_to_resolve:
            return project, None, results

        if len(results) > 1:
            if allow_mult and (all_mult or is_glob_pattern(entity_name)):
                return project, None, results
            if sys.stdout.isatty():
                print 'The given path \"' + path + '\" resolves to the following data objects:'
                choice = pick(map(
                    lambda result: get_ls_l_desc(result['describe']), results),
                              allow_mult=allow_mult)
                if allow_mult and choice == '*':
                    return project, None, results
                else:
                    return project, None, ([results[choice]]
                                           if allow_mult else results[choice])
            else:
                raise ResolutionError('The given path \"' + path +
                                      '\" resolves to ' + str(len(results)) +
                                      ' data objects')
        elif len(results) == 1:
            return project, None, ([results[0]] if allow_mult else results[0])
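
A hypothetical call to the resolver above, asking for every object matching
a glob path (the path string itself is a placeholder):

project, folder, results = resolve_existing_path('my-project:/reads/SRR*',
                                                 allow_mult=True, all_mult=True,
                                                 describe=True)
for r in results or []:   # results is None when the path resolved to a folder
    print(r['id'], r['describe']['name'])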
Example n. 45
def interactive_help(in_class, param_desc, prompt):
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'
    if in_class in dx_data_classes:
        # Class is some sort of data object
        if dxpy.WORKSPACE_ID is not None:
            proj_name = None
            try:
                proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
            except:
                pass
            if proj_name is not None:
                print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))
        while True:
            print('Pick an option to find input data:')
            try:
                opt_num = pick(['List and choose from available data in the current project',
                                'List and choose from available data in the DNAnexus Reference Genomes Files project',
                                'Select another project to list and choose available data',
                                'Select an output from a previously-run job (current project only)',
                                'Return to original prompt (specify an ID or path directly)'])
            except KeyboardInterrupt:
                opt_num = 4
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                region = None
                if dxpy.WORKSPACE_ID:
                    region = dxpy.describe(dxpy.WORKSPACE_ID).get("region")
                query_project = dxpy.find_one_project(name="Reference Genome Files:*", public=True, billed_to="org-dnanexus_apps", level="VIEW", name_mode="glob", region=region)['id']
            elif opt_num == 2:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                query_project = paginate_and_pick(project_generator, (lambda result: result['describe']['name']))['id']
            if opt_num in range(3):
                result_generator = dxpy.find_data_objects(classname=in_class,
                                                          typename=param_desc.get('type'),
                                                          describe=dict(fields=get_ls_l_desc_fields()),
                                                          project=query_project)
                print('\nAvailable data:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_ls_l_desc(result['describe'])))
                if result_choice == 'none found':
                    print('No compatible data found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    return [result_choice['project'] + ':' + result_choice['id']]
            elif opt_num == 3:
                # Select from previous jobs in current project
                result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                                  describe=True,
                                                  parent_job="none")
                print()
                print('Previously-run jobs to choose from:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_find_executions_string(result['describe'],
                                                                                             has_children=False,
                                                                                             single_result=True)),
                                                  filter_fn=(lambda result: result['describe']['state'] not in ['unresponsive', 'terminating', 'terminated', 'failed']))
                if result_choice == 'none found':
                    print('No jobs found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    if 'output' in result_choice['describe'] and result_choice['describe']['output'] is not None:
                        keys = result_choice['describe']['output'].keys()
                    else:
                        exec_handler = dxpy.get_handler(result_choice.get('app', result_choice['applet']))
                        exec_desc = exec_handler.describe()
                        if 'outputSpec' not in exec_desc:
                            # This if block will either continue, return, or raise
                            print('No output spec found for the executable')
                            try:
                                field = input('Output field to use (^C or <ENTER> to cancel): ')
                                if field == '':
                                    continue
                                else:
                                    return [result_choice['id'] + ':' + field]
                            except KeyboardInterrupt:
                                continue
                        else:
                            keys = exec_desc['outputSpec'].keys()
                    if len(keys) > 1:
                        print('\nOutput fields to choose from:')
                        field_choice = pick(keys)
                        return [result_choice['id'] + ':' + keys[field_choice]]
                    elif len(keys) == 1:
                        print('Using the only output field: ' + keys[0])
                        return [result_choice['id'] + ':' + keys[0]]
                    else:
                        print('No available output fields')
            else:
                print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
                return shlex.split(input(prompt))
    else:
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
                print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            return [result]
        else:
            return shlex.split(result)
Example n. 46
def interactive_help(in_class, param_desc, prompt):
    is_array = param_desc['class'].startswith("array:")
    print_param_help(param_desc)
    print()
    array_help_str = ', or <ENTER> to finish the list of inputs'
    if in_class in dx_data_classes:
        # Class is some sort of data object
        if dxpy.WORKSPACE_ID is not None:
            proj_name = None
            try:
                proj_name = dxpy.api.project_describe(dxpy.WORKSPACE_ID)['name']
            except:
                pass
            if proj_name is not None:
                print('Your current working directory is ' + proj_name + ':' + dxpy.config.get('DX_CLI_WD', '/'))
        while True:
            print('Pick an option to find input data:')
            try:
                opt_num = pick(['List and choose from available data in the current project',
                                'List and choose from available data in the DNAnexus Reference Genomes project',
                                'Select another project to list and choose available data',
                                'Select an output from a previously-run job (current project only)',
                                'Return to original prompt (specify an ID or path directly)'])
            except KeyboardInterrupt:
                opt_num = 4
            if opt_num == 0:
                query_project = dxpy.WORKSPACE_ID
            elif opt_num == 1:
                query_project = dxpy.find_one_project(name="Reference Genome Files", public=True, billed_to="org-dnanexus", level="VIEW")['id']
            elif opt_num == 2:
                project_generator = dxpy.find_projects(level='VIEW', describe=True, explicit_perms=True)
                print('\nProjects to choose from:')
                query_project = paginate_and_pick(project_generator, (lambda result: result['describe']['name']))['id']
            if opt_num in range(3):
                result_generator = dxpy.find_data_objects(classname=in_class,
                                                          typename=param_desc.get('type'),
                                                          describe=True,
                                                          project=query_project)
                print('\nAvailable data:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_ls_l_desc(result['describe'])))
                if result_choice == 'none found':
                    print('No compatible data found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    return [result_choice['project'] + ':' + result_choice['id']]
            elif opt_num == 3:
                # Select from previous jobs in current project
                result_generator = dxpy.find_jobs(project=dxpy.WORKSPACE_ID,
                                                  describe=True,
                                                  parent_job="none")
                print()
                print('Previously-run jobs to choose from:')
                result_choice = paginate_and_pick(result_generator,
                                                  (lambda result: get_find_executions_string(result['describe'],
                                                                                             has_children=False,
                                                                                             single_result=True)),
                                                  filter_fn=(lambda result: result['describe']['state'] not in ['unresponsive', 'terminating', 'terminated', 'failed']))
                if result_choice == 'none found':
                    print('No jobs found')
                    continue
                elif result_choice == 'none picked':
                    continue
                else:
                    if 'output' in result_choice['describe'] and result_choice['describe']['output'] is not None:
                        keys = result_choice['describe']['output'].keys()
                    else:
                        exec_handler = dxpy.get_handler(result_choice.get('app', result_choice['applet']))
                        exec_desc = exec_handler.describe()
                        if 'outputSpec' not in exec_desc:
                            # This if block will either continue, return, or raise
                            print('No output spec found for the executable')
                            try:
                                field = input('Output field to use (^C or <ENTER> to cancel): ')
                                if field == '':
                                    continue
                                else:
                                    return [result_choice['id'] + ':' + field]
                            except KeyboardInterrupt:
                                continue
                        else:
                            keys = exec_desc['outputSpec'].keys()
                    if len(keys) > 1:
                        print('\nOutput fields to choose from:')
                        field_choice = pick(keys)
                        return [result_choice['id'] + ':' + keys[field_choice]]
                    elif len(keys) == 1:
                        print('Using the only output field: ' + keys[0])
                        return [result_choice['id'] + ':' + keys[0]]
                    else:
                        print('No available output fields')
            else:
                print(fill('Enter an ID or path (<TAB> twice for compatible ' + in_class + 's in current directory)' + (array_help_str if is_array else '')))
                return shlex.split(input(prompt))
    else:
        if in_class == 'boolean':
            if is_array:
                print(fill('Enter "true", "false"' + array_help_str))
            else:
                print(fill('Enter "true" or "false"'))
        elif in_class == 'string' and is_array:
                print(fill('Enter a nonempty string' + array_help_str))
        elif (in_class == 'float' or in_class == 'int') and is_array:
            print(fill('Enter a number' + array_help_str))
        elif in_class == 'hash':
            print(fill('Enter a quoted JSON hash'))
        result = input(prompt)
        if in_class == 'string':
            return [result]
        else:
            return shlex.split(result)
Example n. 47
def upload_applet(src_dir, uploaded_resources, check_name_collisions=True, overwrite=False, archive=False, project=None, override_folder=None, override_name=None, dx_toolkit_autodep="stable", dry_run=False, **kwargs):
    """
    Creates a new applet object.

    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to inject if none is present. "stable" for the APT package; "git" for HEAD of dx-toolkit master branch; or False for no dependency.
    :type dx_toolkit_autodep: boolean or string
    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException("Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)" % (src_dir,))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + ('/' if not applet_spec['folder'].endswith('/') else '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet", name=applet_spec["name"], folder=applet_spec['folder'], project=dest_project, recurse=False):
            if overwrite:
                logger.info("Deleting applet %s" % (result['id']))
                # TODO: test me
                dxpy.DXProject(dest_project).remove_objects([result['id']])
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'], project=dest_project)
                now = datetime.datetime.fromtimestamp(archived_applet.created/1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info("Archived applet %s to %s:\"%s/%s\"" % (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException("An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given" % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Inline Readme.md and Readme.developer.md
    _inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "runSpec" in applet_spec and "file" in applet_spec["runSpec"]:
        # Avoid using runSpec.file for now, it's not fully implemented
        #code_filename = os.path.join(src_dir, applet_spec["runSpec"]["file"])
        #f = dxpy.upload_local_file(code_filename, wait_on_close=True)
        #applet_spec["runSpec"]["file"] = f.get_id()
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir, applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"].setdefault("bundledDepends", [])
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {"name": "dx-toolkit",
                          "package_manager": "git",
                          "url": "git://github.com/dnanexus/dx-toolkit.git",
                          "tag": "master",
                          "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"}
    # TODO: reject "beta" and "unstable" eventually
    elif dx_toolkit_autodep in ("stable", "beta", "unstable"):
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException("dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead" % (dx_toolkit_autodep,))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException("Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(dep.get('name') in DX_TOOLKIT_PKGS or dep.get('url') in DX_TOOLKIT_GIT_URLS for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if "categories" in applet_spec:
        dxpy.DXApplet(applet_id, project=dest_project).add_tags(applet_spec["categories"])

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})  # point the archived copy at its replacement, not at itself

    return applet_id, applet_spec
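
A hypothetical invocation of the builder above, rebuilding an applet in
place and deleting whatever already sits at the destination path (the
project ID and source directory are placeholders):

applet_id, applet_spec = upload_applet(
    src_dir='./my_applet',        # directory containing dxapp.json
    uploaded_resources=None,      # no bundled resources in this sketch
    overwrite=True,
    project='project-xxxx')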
Example n. 48
def resolve_existing_path(path, expected=None, ask_to_resolve=True, expected_classes=None, allow_mult=False, describe={}, all_mult=False):
    '''
    :param ask_to_resolve: Whether picking may be necessary (if true, a list is returned; if false, only one result is returned)
    :type ask_to_resolve: boolean
    :param allow_mult: Whether to allow the user to select multiple results from the same path
    :type allow_mult: boolean
    :param describe: Input hash to describe call for the results
    :type describe: dict
    :param all_mult: Whether to return all matching results without prompting (only applicable if allow_mult == True)
    :type all_mult: boolean
    :returns: A LIST of results when ask_to_resolve is False or allow_mult is True

    Returns either a list of results or a single result (depending on
    how many is expected; if only one, then an interactive picking of
    a choice will be initiated if input is a tty, or else throw an error).

    TODO: Always treats the path as a glob pattern.

    Output is of the form {"id": id, "describe": describe hash} or a list
    of those.

    TODO: Allow arbitrary flags for the describe hash.

    NOTE: if expected_classes is provided and conflicts with the class
    of the hash ID, it will return None for all fields.
    '''

    project, folderpath, entity_name = resolve_path(path, expected)

    if entity_name is None:
        # Definitely a folder (or project)
        # FIXME? Should I check that the folder exists if expected="folder"?
        return project, folderpath, entity_name
    elif is_hashid(entity_name):
        found_valid_class = True
        if expected_classes is not None:
            found_valid_class = False
            for klass in expected_classes:
                if entity_name.startswith(klass):
                    found_valid_class = True
        if not found_valid_class:
            return None, None, None
        try:
            if 'project' not in describe:
                if project != dxpy.WORKSPACE_ID:
                    describe['project'] = project
                elif dxpy.WORKSPACE_ID is not None:
                    describe['project'] = dxpy.WORKSPACE_ID
            desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe', describe)
        except BaseException as details:
            if 'project' in describe:
                # Now try it without the hint
                del describe['project']
                try:
                    desc = dxpy.DXHTTPRequest('/' + entity_name + '/describe', describe)
                except BaseException as details:
                    raise ResolutionError(str(details))
            else:
                # without re-raising here, desc would be unbound below
                raise ResolutionError(str(details))
        result = {"id": entity_name, "describe": desc}
        if ask_to_resolve and not allow_mult:
            return project, folderpath, result
        else:
            return project, folderpath, [result]
    elif project is None:
        raise ResolutionError('Error: Could not resolve \"' + path + '\" to a project context.  Please either set a default project using dx select or cd, or add a colon (":") after your project ID or name')
    else:
        msg = 'Object of name ' + unicode(entity_name) + ' could not be resolved in folder ' + unicode(folderpath) + ' of project ID ' + str(project)
        # Probably an object
        if is_job_id(project):
            # The following will raise if no results could be found
            results = resolve_job_ref(project, entity_name, describe=describe)
        else:
            results = list(dxpy.find_data_objects(project=project,
                                                  folder=folderpath,
                                                  name=entity_name,
                                                  name_mode='glob',
                                                  recurse=False,
                                                  describe=describe,
                                                  visibility='either'))
        if len(results) == 0:
            # Could not find it as a data object.  If anything, it's a
            # folder.

            if '/' in entity_name:
                # Then there's no way it's supposed to be a folder
                raise ResolutionError(msg)

            # This is the only possibility left.  Leave the
            # error-checking for later.  Note that folderpath does
            possible_folder = folderpath + '/' + entity_name
            possible_folder, skip = clean_folder_path(possible_folder, 'folder')
            return project, possible_folder, None

        # Caller wants ALL results; just return the whole thing
        if not ask_to_resolve:
            return project, None, results

        if len(results) > 1:
            if allow_mult and (all_mult or is_glob_pattern(entity_name)):
                return project, None, results
            if sys.stdout.isatty():
                print 'The given path \"' + path + '\" resolves to the following data objects:'
                choice = pick(map(lambda result:
                                      get_ls_l_desc(result['describe']),
                                  results),
                              allow_mult=allow_mult)
                if allow_mult and choice == '*':
                    return project, None, results
                else:
                    return project, None, ([results[choice]] if allow_mult else results[choice])
            else:
                raise ResolutionError('Error: The given path \"' + path + '\" resolves to ' + str(len(results)) + ' data objects')
        elif len(results) == 1:
            return project, None, ([results[0]] if allow_mult else results[0])
def process(filename, bucket_url, project, folder, skipvalidate=False):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    logger.debug(filename)

    test = list(
        dxpy.find_data_objects(classname='file',
                               folder=folder,
                               project=project,
                               name_mode='exact',
                               name=filename,
                               return_handler=False))

    if not test:
        #cp the file from the bucket
        subprocess.check_call(shlex.split('aws s3 cp %s . --quiet' %
                                          (bucket_url)),
                              stderr=subprocess.STDOUT)
        subprocess.check_call(shlex.split('ls -l %s' % (filename)))
        dx_file = dxpy.upload_local_file(filename,
                                         project=project,
                                         folder=folder)

    else:
        dxpy.download_dxfile(test[0]['id'], filename)
        dx_file = dxpy.dxfile.DXFile(test[0]['id'])
    # str.rstrip() strips a character set, not a suffix, so the original
    # chained rstrip('.gz').rstrip('.fq').rstrip('.fastq') could eat trailing
    # letters of the basename; strip the extensions explicitly instead.
    reads_basename = filename
    for ext in ('.gz', '.fastq', '.fq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]

    if skipvalidate:
        return {"file": dx_file, "report": None, "summary": None, "zip": None}

    subprocess.check_call(['mkdir', 'output'])
    logger.info("Run QC")
    fqc_command = "/usr/bin/FastQC/fastqc " + filename + " -o output"
    logger.debug(fqc_command)
    stdio = subprocess.check_output(shlex.split(fqc_command))
    logger.debug(stdio)
    logger.debug(subprocess.check_output(['ls', '-l', 'output']))
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])
    logger.info("Upload results")

    subprocess.check_call([
        'mv',
        "%s_fastqc/fastqc_data.txt" % reads_basename,
        "%s_data.txt" % reads_basename
    ])
    subprocess.check_call([
        'mv',
        "%s_fastqc/summary.txt" % reads_basename,
        "%s_summary.txt" % reads_basename
    ])

    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename,
                                           folder=folder,
                                           project=project)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename,
                                            folder=folder,
                                            project=project)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" %
                                        reads_basename,
                                        folder=folder,
                                        project=project)
    logger.debug(report_dxfile)
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
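
The find-or-upload idiom at the top of process() is reusable on its own:
look for an exact-name match in the target folder and only upload when
nothing is found. A minimal sketch (the function name and arguments are
placeholders):

import dxpy

def get_or_upload(filename, project, folder):
    hits = list(dxpy.find_data_objects(classname='file',
                                       project=project,
                                       folder=folder,
                                       name=filename,
                                       name_mode='exact'))
    if hits:
        return dxpy.DXFile(hits[0]['id'])   # reuse the existing file
    return dxpy.upload_local_file(filename, project=project, folder=folder)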
def process(enc_file_name, bucket_url, proj_id, dx_folder, file_acc, dx_file_name, skipvalidate=False):
    # Change the following to process whatever input this stage
    # receives.  You may also want to copy and paste the logic to download
    # and upload files here as well if this stage receives file input
    # and/or makes file output.

    print "* "+enc_file_name+" to "+dx_folder

    test = list( dxpy.find_data_objects(classname='file',
                           folder=dx_folder, project=proj_id, name_mode='exact',
                           name=dx_file_name, properties={ "accession": file_acc }, return_handler=False) )

    start = datetime.now()
    if not test:
        try:
            #subprocess.check_call(shlex.split('aws s3 cp %s ./%s --quiet' %(bucket_url,dx_file_name)), stderr=subprocess.STDOUT)
            subprocess.check_call(shlex.split('aws s3 cp %s ./%s' % (bucket_url,dx_file_name)), stderr=subprocess.STDOUT)
        except:
            try:
                print "* s3 cp failed.  Reverting to 'wget'"
                web_url = "https://www.encodeproject.org/files/%s/@@download/%s" % (file_acc,enc_file_name)
                subprocess.check_call(shlex.split('wget %s -O %s --quiet' % (web_url,dx_file_name) ), stderr=subprocess.STDOUT)
            except:
                print "* ERROR: Upload failed"
                sys.exit(1)  # Better to fail than to return empty handed.
                #return {
                #    "file": None,
                #    "report": None,
                #    "summary": None,
                #    "zip": None
                #}
        end = datetime.now()
        duration = end - start
        start = end
        print "* copied to dx local in %.2f seconds" % duration.seconds

        subprocess.check_call(shlex.split('ls -l %s' %(dx_file_name)))

        # Make sure folder exists before copying!
        project = dxpy.DXProject(proj_id)  ## should be default

        dx_file = dxpy.upload_local_file(dx_file_name, project=proj_id, folder=dx_folder, properties={ "accession": file_acc })
        end = datetime.now()
        duration = end - start
        print "* Uploaded to dx project in %.2f seconds" % duration.seconds

    else:
        dxpy.download_dxfile(test[0]['id'], dx_file_name)
        dx_file=dxpy.dxfile.DXFile(test[0]['id'])
        end = datetime.now()
        duration = end - start
        print "* Downloaded already existing file from in %.2f seconds" % duration.seconds

    if skipvalidate or not (dx_file_name.endswith(".fastq.gz") or dx_file_name.endswith(".fq.gz")):
        return {
            "file": dx_file,
            "report": None,
            "summary": None,
            "zip": None
        }

    subprocess.check_call(['mkdir', 'output'])
    print "* Run QC"
    fqc_command = "/usr/bin/FastQC/fastqc " + dx_file_name + " -o output"
    print "* " + fqc_command
    subprocess.check_output(shlex.split(fqc_command))
    subprocess.check_output(['ls','-l', 'output'])
    # str.rstrip() strips a character set, not a suffix; strip the extensions
    # explicitly so trailing letters of the basename survive.
    reads_basename = dx_file_name
    for ext in ('.gz', '.fastq', '.fq'):
        if reads_basename.endswith(ext):
            reads_basename = reads_basename[:-len(ext)]
    subprocess.check_call(['unzip', "output/%s_fastqc.zip" % reads_basename])
    print "* Upload results"

    subprocess.check_call(['mv', "%s_fastqc/fastqc_data.txt" % reads_basename, "%s_data.txt" % reads_basename ])
    subprocess.check_call(['mv', "%s_fastqc/summary.txt" % reads_basename, "%s_summary.txt" % reads_basename ])

    report_dxfile = dxpy.upload_local_file("%s_data.txt" % reads_basename, folder=dx_folder, project=proj_id)
    summary_dxfile = dxpy.upload_local_file("%s_summary.txt" % reads_basename, folder=dx_folder, project=proj_id)
    zip_dxfile = dxpy.upload_local_file("output/%s_fastqc.zip" % reads_basename, folder=dx_folder, project=proj_id)
    print report_dxfile
    return {
        "file": dx_file,
        "report": report_dxfile,
        "summary": summary_dxfile,
        "zip": zip_dxfile
    }
def main(folder_name, key_name, assembly, noupload, force, debug):

    # accessions bams contained within the folder named folder_name/bams

    # Requires
    # . directory structure folder_name/bams/ENCSRxxxabc/ ... /basename[.anything].bam
    # . basename contains one or more ENCFF numbers from which the bam is derived
    # . bam_filename.flagstat.qc exists
    # . raw bam flagstat file exists in folder_name/raw_bams/ENCSRxxxabc/ ... /basename[.anything].flagstat.qc

    # if a bam file's tags on DNAnexus already contain an ENCFF number, assume it's already accessioned and skip
    # create a fully qualified project:filename for submitted_file_name and calculate the file size
    # if an ENCFF objects exists with the same submitted_file_name, AND it has the same size, skip

    # **INFER the experiment accession number from the bam's containing folder
    # calculate the md5
    # find the raw bam's .flagstat.qc file and parse
    # find the bam's .flagstat.qc file and parse
    # **ASSUME all derived_from ENCFF's appear in the bam's filename
    # POST file object
    # Upload to AWS
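    #
    # For example, a bam at
    #   folder_name/bams/ENCSR123ABC/rep1/sample.ENCFF000AAA.bam
    # would yield experiment accession ENCSR123ABC and derived_from
    # ["ENCFF000AAA"] (both accessions here are hypothetical).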

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    if not folder_name.startswith("/"):
        folder_name = "/" + folder_name
    if not folder_name.endswith("/"):
        folder_name += "/"

    try:
        project = dxpy.DXProject(dxpy.PROJECT_CONTEXT_ID)
        project_name = project.describe().get("name")
    except:
        logger.error("Failed to resolve proejct")
        project_name = ""

    bam_folder = folder_name + "bams/"
    bams = dxpy.find_data_objects(
        classname="file",
        state="closed",
        name="*.bam",
        name_mode="glob",
        project=dxpy.PROJECT_CONTEXT_ID,
        folder=bam_folder,
        recurse=True,
        return_handler=True,
    )

    authid, authpw, server = processkey(key_name)
    if not subprocess.call("which md5", shell=True):
        md5_command = "md5 -q"
    elif not subprocess.call("which md5sum", shell=True):
        md5_command = "md5sum"
    else:
        logger.error("Cannot find md5 or md5sum command")
        md5_command = ""

    file_mapping = []
    for bam in bams:
        already_accessioned = False
        for tag in bam.tags:
            m = re.search(r"(ENCFF\d{3}\D{3})|(TSTFF\D{6})", tag)
            if m:
                logger.info(
                    "%s appears to contain ENCODE accession number in tag %s ... skipping" % (bam.name, m.group(0))
                )
                already_accessioned = True
                break
        if already_accessioned:
            continue
        bam_description = bam.describe()
        submitted_file_name = project_name + ":" + "/".join([bam.folder, bam.name])
        submitted_file_size = bam_description.get("size")
        url = urlparse.urljoin(
            server, "search/?type=file&submitted_file_name=%s&format=json&frame=object" % (submitted_file_name)
        )
        r = encoded_get(url, authid, authpw)
        try:
            r.raise_for_status()
            if r.json()["@graph"]:
                for duplicate_item in r.json()["@graph"]:
                    if duplicate_item.get("status") == "deleted":
                        logger.info("A potential duplicate file was found but its status=deleted ... proceeding")
                        duplicate_found = False
                    else:
                        logger.info("Found potential duplicate: %s" % (duplicate_item.get("accession")))
                        if submitted_file_size == duplicate_item.get("file_size"):
                            logger.info(
                                "%s %s: File sizes match, assuming duplicate."
                                % (str(submitted_file_size), duplicate_item.get("file_size"))
                            )
                            duplicate_found = True
                            break
                        else:
                            logger.info(
                                "%s %s: File sizes differ, assuming new file."
                                % (str(submitted_file_size), duplicate_item.get("file_size"))
                            )
                            duplicate_found = False
            else:
                logger.info("No duplicate ... proceeding")
                duplicate_found = False
        except:
            logger.warning("Duplicate accession check failed: %s %s" % (r.status_code, r.reason))
            logger.debug(r.text)
            duplicate_found = False

        if duplicate_found:
            if force:
                logger.info("Duplicate detected, but force=true, so continuing")
            else:
                logger.info("Duplicate detected, skipping")
                continue

        try:
            bamqc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.flagstat.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("Flagstat file not found ... skipping")
            continue

        raw_bams_folder = str(bam.folder).replace("%sbams/" % (folder_name), "%sraw_bams/" % (folder_name), 1)
        try:
            raw_bamqc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.flagstat.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=raw_bams_folder,
                return_handler=True,
            )
        except:
            logger.warning("Raw flagstat file not found ... skipping")
            continue

        try:
            dup_qc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.dup.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("Picard duplicates QC file not found ... skipping")
            continue

        try:
            xcor_qc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.cc.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("Cross-correlation QC file not found ... skipping")
            continue

        try:
            pbc_qc_fh = dxpy.find_one_data_object(
                classname="file",
                name="*.pbc.qc",
                name_mode="glob",
                project=dxpy.PROJECT_CONTEXT_ID,
                folder=bam.folder,
                return_handler=True,
            )
        except:
            logger.warning("PBC QC file not found ... skipping")
            continue

        experiment_accession = re.match(r"\S*(ENC\S{8})", bam.folder).group(1)
        logger.info("Downloading %s" % (bam.name))
        dxpy.download_dxfile(bam.get_id(), bam.name)
        md5_output = subprocess.check_output(" ".join([md5_command, bam.name]), shell=True)
        calculated_md5 = md5_output.partition(" ")[0].rstrip()
        encode_object = dict(FILE_OBJ_TEMPLATE)  # copy, so the shared template is not mutated across iterations
        encode_object.update({"assembly": assembly})

        notes = {
            "filtered_qc": flagstat_parse(bamqc_fh),
            "qc": flagstat_parse(raw_bamqc_fh),
            "dup_qc": dup_parse(dup_qc_fh),
            "xcor_qc": xcor_parse(xcor_qc_fh),
            "pbc_qc": pbc_parse(pbc_qc_fh),
            "dx-id": bam_description.get("id"),
            "dx-createdBy": bam_description.get("createdBy"),
        }
        encode_object.update(
            {
                "dataset": experiment_accession,
                "notes": json.dumps(notes),
                "submitted_file_name": submitted_file_name,
                "derived_from": re.findall("(ENCFF\S{6})", bam.name),
                "file_size": submitted_file_size,
                "md5sum": calculated_md5,
            }
        )
        logger.info("Experiment accession: %s" % (experiment_accession))
        logger.debug("File metadata: %s" % (encode_object))

        url = urlparse.urljoin(server, "files")
        r = encoded_post(url, authid, authpw, encode_object)
        try:
            r.raise_for_status()
            new_file_object = r.json()["@graph"][0]
            logger.info("New accession: %s" % (new_file_object.get("accession")))
        except:
            logger.warning("POST file object failed: %s %s" % (r.status_code, r.reason))
            logger.debug(r.text)
            new_file_object = {}
            if r.status_code == 409:
                try:  # cautiously add a tag with the existing accession number
                    if calculated_md5 in r.json().get("detail"):
                        url = urlparse.urljoin(server, "/search/?type=file&md5sum=%s" % (calculated_md5))
                        r = encoded_get(url, authid, authpw)
                        r.raise_for_status()
                        accessioned_file = r.json()["@graph"][0]
                        existing_accession = accessioned_file["accession"]
                        bam.add_tags([existing_accession])
                        logger.info("Already accessioned.  Added %s to dxfile tags" % (existing_accession))
                except:
                    logger.info("Conflict does not appear to be md5 ... continuing")
        if noupload:
            logger.info("--noupload so skipping upload")
            upload_returncode = -1
        else:
            if new_file_object:
                creds = new_file_object["upload_credentials"]
                env = os.environ.copy()
                env.update(
                    {
                        "AWS_ACCESS_KEY_ID": creds["access_key"],
                        "AWS_SECRET_ACCESS_KEY": creds["secret_key"],
                        "AWS_SECURITY_TOKEN": creds["session_token"],
                    }
                )

                logger.info("Uploading file.")
                start = time.time()
                try:
                    subprocess.check_call(["aws", "s3", "cp", bam.name, creds["upload_url"], "--quiet"], env=env)
                except subprocess.CalledProcessError as e:
                    # The aws command returns a non-zero exit code on error.
                    logger.error("Upload failed with exit code %d" % e.returncode)
                    upload_returncode = e.returncode
                else:
                    upload_returncode = 0
                    end = time.time()
                    duration = end - start
                    logger.info("Uploaded in %.2f seconds" % duration)
                    bam.add_tags([new_file_object.get("accession")])
            else:
                upload_returncode = -1

        out_string = "\t".join(
            [
                experiment_accession,
                encode_object.get("submitted_file_name"),
                new_file_object.get("accession") or "",
                str(upload_returncode),
                encode_object.get("notes"),
            ]
        )
        print out_string
        file_mapping.append(out_string)

        os.remove(bam.name)

    output_log_filename = time.strftime("%m%d%y%H%M") + "-accession_log.csv"
    out_fh = dxpy.upload_string("\n".join(file_mapping), name=output_log_filename, media_type="text/csv")
    out_fh.close()

    output = {"file_mapping": file_mapping, "outfile": dxpy.dxlink(out_fh)}

    return output
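
The helpers flagstat_parse(), dup_parse(), xcor_parse() and pbc_parse() are called above but not shown in this example. A minimal sketch of flagstat_parse(), assuming it takes a DXFile handle to a samtools flagstat report (hypothetical, not the original helper):

def flagstat_parse(dxfile):
    # Sketch only: download the flagstat report and map each metric line to
    # its QC-passed count, e.g. "1234 + 0 in total (...)" -> {"in total": 1234}.
    import re
    local_path = "flagstat.qc.tmp"
    dxpy.download_dxfile(dxfile.get_id(), local_path)
    qc = {}
    with open(local_path) as fh:
        for line in fh:
            m = re.match(r"(\d+) \+ (\d+) (.+)", line.strip())
            if m:
                qc[m.group(3).split("(")[0].strip()] = int(m.group(1))
    return qc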
Example n. 52
0
def _clone_asset(record, folder, regions, project_dict):
    """
    This function will attempt to clone the given record into all of the given regions.
    It will return a dictionary with the regions as keys and the record-ids of the
    corresponding asset as the values.  If an asset is not able to be created in a given
    region, the value will be set to None.
    """
    # Get the asset record
    fid = record.get_details()['archiveFileId']['$dnanexus_link']
    curr_region = dxpy.describe(record.project)['region']

    # Only run once per region
    regions = set(regions) - set([curr_region])
    if len(regions) == 0:
        # there is nothing to do
        return {}

    app_supported_regions = set(
        COPY_FILE_APP.describe()['regionalOptions'].keys())
    if len(regions - app_supported_regions) > 0:
        print('Currently no support for the following region(s): [{regions}]'.
              format(regions=', '.join(regions - app_supported_regions)),
              file=sys.stderr)
        sys.exit(1)

    # Get information about the asset
    asset_properties = record.get_properties()
    asset_properties['cloned_from'] = record.get_id()
    asset_file_name = dxpy.describe(fid)['name']
    url = dxpy.DXFile(fid).get_download_url(
        preauthenticated=True,
        project=dxpy.DXFile.NO_PROJECT_HINT,
        duration=URL_DURATION)[0]

    # setup target folders
    region2projid = {}
    for region in regions:
        dest_proj = util.get_project(project_dict[region])
        dest_proj.new_folder(folder, parents=True)
        region2projid[region] = dest_proj.get_id()
    print(region2projid)

    # Fire off a clone process for each region
    # Wait for the cloning to complete
    for i in [1, 2, 3]:
        jobs = _clone_to_all_regions(region2projid, regions, asset_file_name,
                                     folder, url)
        retval = _wait_for_completion(jobs)
        if retval:
            break

    # make a record for each file and collect the new record ids by region,
    # as promised in the docstring
    region2recordid = {}
    for region in regions:
        dest_proj_id = region2projid[region]
        results = list(
            dxpy.find_data_objects(classname="file",
                                   visibility="hidden",
                                   name=asset_file_name,
                                   project=dest_proj_id,
                                   folder=folder))
        file_ids = [p["id"] for p in results]
        if len(file_ids) == 0:
            raise RuntimeError("Found no files {}:{}/{}".format(
                dest_proj_id, folder, asset_file_name))
        if len(file_ids) > 1:
            raise RuntimeError(
                "Found {} files {}:{}/{}, instead of just one".format(
                    len(file_ids), dest_proj_id, folder, asset_file_name))
        dest_asset = dxpy.new_dxrecord(
            name=record.name,
            types=['AssetBundle'],
            details={'archiveFileId': dxpy.dxlink(file_ids[0])},
            properties=record.get_properties(),
            project=dest_proj_id,
            folder=folder,
            close=True)
        region2recordid[region] = dest_asset.get_id()
    return region2recordid
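
A hypothetical call (the record id, folder, regions and per-region project mapping are all made up for illustration):

record = dxpy.DXRecord("record-xxxx")
asset_ids = _clone_asset(record,
                         folder="/assets",
                         regions=["aws:us-east-1", "azure:westus"],
                         project_dict={"aws:us-east-1": "project-aaaa",
                                       "azure:westus": "project-bbbb"})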
def get_tas(experiment, server, keypair, default_project, ta_folders):
    # tas = {
    #   'rep1_ta': {
    #       'file_id': "",
    #       'project_id': "",
    #       'folder': "",
    #       'name': "",
    #       'paired_end': False,
    #       'control_path': "",
    #       'enc_repn': 0
    #   }, ...
    # }
    # for each ta_folder, get the list of TA's in /ta_folder/bams/ENCSR...
    # from this list, infer repns from the paths ../bams/ENCSR.../repn*
    # from this list, infer the ENCFF's for the fastqs that were used
    # for each repn, go to the experiment and find all the fastqs for that rep
    # if there are different fastqs in the experiment, or different reps, warn
    # for each fastq found in the TA filename, find its controlled_by
    # if any have controlled_by, all must have controlled_by, else error
    #   gather the list of controlled_by and find a TA (anywhere in ta_folders) with those ENCFF's, else error
    # else get possible_controls and try to match the repn, else pick one (remember it)
    #   gather the list of fastqs in the possible_controls and find (one) TA with those ENCFF's, else error
    exp_id = experiment['accession']
    possible_files = []
    for base_folder in ta_folders:
        if ':' in base_folder:
            project_name, path = base_folder.split(':')
            project = resolve_project(project_name)
            project_id = project.get_id()
            project_name += ":"
        else:
            project_id = default_project
            project_name = ""
            path = base_folder
        if not path.startswith('/'):
            path = '/' + path
        if not path.endswith('/'):
            path += '/'
        logging.debug(
            "Looking for TA's in %s %s %s" % (project_id, project_name, path))
        for dxfile in dxpy.find_data_objects(
            classname='file',
            state='closed',
            folder=path + 'bams/%s/' %(exp_id),
            project=project_id,
            describe=True,
            recurse=True,
        ):
            desc = dxfile.get('describe')
            if desc.get('name').endswith(('tagAlign', 'tagAlign.gz')):
                possible_files.append(desc)
    logging.debug('Found %s possible files' %(len(possible_files)))
    logging.debug('%s' %([(f.get('folder'),f.get('name')) for f in possible_files]))
    repns = []
    files_to_ignore = []
    for f in possible_files:
        m = re.search(r'/rep(\d+)$', f['folder'])
        if m:
            repn = int(m.group(1))
            logging.debug("Matched rep%d" %(repn))
            if repn in repns:
                logging.warning("Ignoring additional rep%d bam, using first found" %(repn))
                files_to_ignore.append(f)
            else:
                logging.debug("First time finding rep%d" %(repn))
                repns.append(repn)
        else:
            logging.error("Cannot parse rep number from %s" %(f['folder']))
            return None
    for f in files_to_ignore:
        possible_files.remove(f)
    logging.debug('Discovered repns %s' %(repns))
    if len(repns) != 2:
        logging.error("Required to have exactly 2 reps for %s.  Found %d: %s" %(exp_id, len(repns), repns))
        return None

    tas = {}
    used_controls = []
    for i, repn in enumerate(repns):
        encode_files = [common.encoded_get(server+'/files/%s/' %(f), keypair) for f in get_encffs(possible_files[i].get('name'))]
        controlled_by = common.flat([f.get('controlled_by') for f in encode_files])
        if any(controlled_by):
            controlled_by_accessions = list(set([uri.split('/')[2] for uri in controlled_by if uri]))
            controlled_by_ta = get_ta_from_accessions(controlled_by_accessions, default_project, ta_folders)
            if controlled_by_ta:
                controlled_by_ta_name = controlled_by_ta.get('name')
                controlled_by_ta_id = controlled_by_ta.get('id')
            else:
                logging.error("%s: Could not find controlled_by_ta for accessions %s" %(experiment.get('accession'), controlled_by_accessions))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
        else:
            #evaluate possible controls
            controlled_by_accessions = None
            possible_controls = experiment.get('possible_controls')
            logging.warning('%s: No controlled_by for rep%d, attempting to infer from possible_controls %s' %(experiment.get('accession'), repn, possible_controls))
            if not possible_controls or not any(possible_controls):
                logging.error('%s: Could not find controlled_by or resolve possible_controls for rep%d' %(experiment.get('accession'), repn))
                controlled_by_ta_name = None
                controlled_by_ta_id = None
            else:
                control_ta = get_possible_ctl_ta(experiment, repn, server, keypair, default_project, ta_folders, used_controls)
                controlled_by_ta_name = control_ta.get('name')
                controlled_by_ta_id = control_ta.get('id')
        if controlled_by_ta_id and controlled_by_ta_id in used_controls:
            logging.warning('%s: Using same control %s for multiple reps' %(controlled_by_ta_id, controlled_by_ta_name))
        used_controls.append(controlled_by_ta_id)
        #if encode repns are 1,2 then let the pipeline input rep numbers (1 or 2) be the same.
        #Otherwise the mapping is arbitrary, but at least do it with the smaller rep number first.
        if repn == min(repns):
            ta_index = 1
        else:
            ta_index = 2
        tas.update(
            {'rep%d_ta' %(ta_index): {
                'file_id': possible_files[i].get('id'),
                'project_id': possible_files[i].get('project'),
                'folder': possible_files[i].get('folder'),
                'file_name': possible_files[i].get('name'),
                'enc_fqs': get_encffs(possible_files[i].get('name')),
                'controlled_by': controlled_by_accessions,
                'controlled_by_name': controlled_by_ta_name,
                'control_id': controlled_by_ta_id,
                'enc_repn': repn,
                'paired_end': is_paired_end(possible_files[i])
                }
            }
        )

    return tas
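
get_encffs() is used above but not defined in this example. Given how ENCFF accessions are matched elsewhere in this code, a plausible sketch (hypothetical, not the original helper):

def get_encffs(file_name):
    # Pull every ENCFF accession embedded in a file name.
    import re
    return re.findall(r"ENCFF\d{3}[A-Z]{3}", file_name)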
Example n. 54
0
def main(pop1, pop2, skip=25, recals=2):
    # Split your work into parallel tasks.  As an example, the
    # following generates 10 subjobs running with the same dummy
    # input.
    psmc20_id = (
        "project-B53fX06gYqYbb6B87kgQ0007"
    )  # dxpy.find_one_project(zero_ok=True, more_ok=False, name="PSMC_20")['id']
    #    print psmc20_id, dxpy.WORKSPACE_ID
    pipeline = dxpy.find_one_data_object(
        name="PSMC-pipeline", name_mode="regexp", project=psmc20_id, return_handler=True
    )
    files1 = {}
    for result in dxpy.find_data_objects(
        name=pop1, name_mode="regexp", classname="file", folder="/ConsensusSequences", project=psmc20_id
    ):
        id = result["id"]
        name = dxpy.describe(id)["name"]
        files1[name] = id
    files2 = {}
    if pop1 != pop2:
        for result in dxpy.find_data_objects(
            name=pop2, name_mode="regexp", classname="file", folder="/ConsensusSequences", project=psmc20_id
        ):
            id = result["id"]
            name = dxpy.describe(id)["name"]
            files2[name] = id
    if len(files2) == 0 and pop1 != pop2:
        return {}
    appjobs = []
    if len(files2) == 0:
        # Single population processing
        subjobs = []
        fn1sort = files1.keys()
        fn1sort.sort()
        for i in range(len(fn1sort)):
            for j in range(i + 1, len(fn1sort)):
                outroot = pop1 + "." + str(i + 1) + "." + pop1 + "." + str(j + 1)
                applet_in = {
                    "cons1": dxpy.dxlink(files1[fn1sort[i]]),
                    "cons2": dxpy.dxlink(files1[fn1sort[j]]),
                    "outroot": outroot,
                    "skip": skip,
                    "recalnums": recals,
                }
                # appjobs.append(pipeline.run(applet_input=applet_in))
                print "dx run -y --folder /psmcfa -icons1=/ConsensusSequences/" + fn1sort[
                    i
                ] + " -icons2=/ConsensusSequences/" + fn1sort[j] + " -ioutroot=" + outroot + " -iskip=" + str(
                    skip
                ) + " -irecalnums=" + str(
                    recals
                ) + " PSMC-pipeline"
    elif len(files2) > 0:
        subjobs = []
        fn1sort = files1.keys()
        fn2sort = files2.keys()
        fn1sort.sort()
        fn2sort.sort()
        for i in range(len(fn1sort)):
            for j in range(len(fn2sort)):
                outroot = pop1 + "." + str(i + 1) + "." + pop2 + "." + str(j + 1)
                applet_in = {
                    "cons1": dxpy.dxlink(files1[fn1sort[i]]),
                    "cons2": dxpy.dxlink(files2[fn2sort[j]]),
                    "outroot": outroot,
                    "skip": skip,
                    "recalnums": recals,
                }
                # appjobs.append(pipeline.run(applet_input=applet_in))
                print "dx run -y --folder /psmcfa -icons1=/ConsensusSequences/" + fn1sort[
                    i
                ] + " -icons2=/ConsensusSequences/" + fn2sort[j] + " -ioutroot=" + outroot + " -iskip=" + str(
                    skip
                ) + " -irecalnums=" + str(
                    recals
                ) + " PSMC-pipeline"

    #    for job in app1jobs.keys():
    #        print job
    #        print app1jobs[job]
    #        print(app1jobs[job].describe())
    #        print app1jobs[job].get_output_ref("psmcfa")
    #        print app1jobs[job].get_output_ref("psmcfa").describe()

    # The following line creates the job that will perform the
    # "postprocess" step of your app.  We've given it an input field
    # that is a list of job-based object references created from the
    # "process" jobs we just created.  Assuming those jobs have an
    # output field called "output", these values will be passed to the
    # "postprocess" job.  Because these values are not ready until the
    # "process" jobs finish, the "postprocess" job WILL NOT RUN until
    # all job-based object references have been resolved (i.e. the
    # jobs they reference have finished running).
    #
    # If you do not plan to have the "process" jobs create output that
    # the "postprocess" job will require, then you can explicitly list
    # the dependencies to wait for those jobs to finish by setting the
    # "depends_on" field to the list of subjobs to wait for (it
    # accepts either dxpy handlers or string IDs in the list).  We've
    # included this parameter in the line below as well for
    # completeness, though it is unnecessary if you are providing
    # job-based object references in the input that refer to the same
    # set of jobs.
    #    of1 = {}
    #    for j in app1jobs:
    #        of1[j] = app1jobs[j].get_output_ref("psmcfa")
    #    postprocess_job = dxpy.new_dxjob(fn_input={"files1":of1, "files2":[]}, fn_name="postprocess")

    # If you would like to include any of the output fields from the
    # postprocess_job as the output of your app, you should return it
    # here using a job-based object reference.  If the output field in
    # the postprocess function is called "answer", you can pass that
    # on here as follows:
    #
    # return { "app_output_field": postprocess_job.get_output_ref("answer"), ...}
    #
    # Tip: you can include in your output at this point any open
    # objects (such as gtables) which will be closed by a job that
    # finishes later.  The system will check to make sure that the
    # output object is closed and will attempt to clone it out as
    # output into the parent container only after all subjobs have
    # finished.
    psmcfaFiles = []
    psmcFiles = []
    for job in appjobs:
        psmcfaFiles.append(job.get_output_ref("outfile1"))
        psmcFiles.append(job.get_output_ref("outfile2"))

    output = {"psmcfaFiles": psmcfaFiles, "psmcFiles": psmcFiles}

    return output
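
Note that the pipeline.run() calls are commented out, so appjobs stays empty and the dx run commands are only printed. If the jobs were actually launched, the commented-out line would expand to something like this sketch (assuming the applet declares outfile1/outfile2 outputs):

job = pipeline.run(applet_in, folder="/psmcfa")
appjobs.append(job)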
Example n. 55
0
def build_and_upload_locally(
    src_dir,
    mode,
    overwrite=False,
    archive=False,
    publish=False,
    destination_override=None,
    version_override=None,
    bill_to_override=None,
    use_temp_build_project=True,
    do_parallel_build=True,
    do_version_autonumbering=True,
    do_try_update=True,
    dx_toolkit_autodep="stable",
    do_check_syntax=True,
    dry_run=False,
    return_object_dump=False,
    confirm=True,
    ensure_upload=False,
    region=None,
    **kwargs
):

    dxpy.app_builder.build(src_dir, parallel_build=do_parallel_build)
    app_json = _parse_app_spec(src_dir)

    _verify_app_source_dir(src_dir, mode, enforce=do_check_syntax)
    if mode == "app" and not dry_run:
        _verify_app_writable(app_json["name"])

    working_project = None
    using_temp_project = False
    override_folder = None
    override_applet_name = None

    if mode == "applet" and destination_override:
        working_project, override_folder, override_applet_name = parse_destination(destination_override)
    elif mode == "app" and use_temp_build_project and not dry_run:
        # Create a temp project
        try:
            if region:
                working_project = dxpy.api.project_new(
                    {"name": "Temporary build project for dx-build-app", "region": region}
                )["id"]
            else:
                working_project = dxpy.api.project_new({"name": "Temporary build project for dx-build-app"})["id"]
        except:
            err_exit()
        logger.debug("Created temporary project %s to build in" % (working_project,))
        using_temp_project = True

    try:
        if mode == "applet" and working_project is None and dxpy.WORKSPACE_ID is None:
            parser.error(
                "Can't create an applet without specifying a destination project; please use the -d/--destination flag to explicitly specify a project"
            )

        if "buildOptions" in app_json:
            if app_json["buildOptions"].get("dx_toolkit_autodep") == False:
                dx_toolkit_autodep = False

        # Perform check for existence of applet with same name in
        # destination for case in which neither "-f" nor "-a" is
        # given BEFORE uploading resources.
        if mode == "applet" and not overwrite and not archive:
            try:
                dest_name = override_applet_name or app_json.get("name") or os.path.basename(os.path.abspath(src_dir))
            except:
                raise dxpy.app_builder.AppBuilderException(
                    "Could not determine applet name from specification "
                    "(dxapp.json) or from working directory (%r)" % (src_dir,)
                )
            dest_folder = override_folder or app_json.get("folder") or "/"
            if not dest_folder.endswith("/"):
                dest_folder = dest_folder + "/"
            dest_project = working_project if working_project else dxpy.WORKSPACE_ID
            for result in dxpy.find_data_objects(
                classname="applet", name=dest_name, folder=dest_folder, project=dest_project, recurse=False
            ):
                dest_path = dest_folder + dest_name
                msg = "An applet already exists at {} (id {}) and neither".format(dest_path, result["id"])
                msg += " -f/--overwrite nor -a/--archive were given."
                raise dxpy.app_builder.AppBuilderException(msg)

        bundled_resources = (
            dxpy.app_builder.upload_resources(
                src_dir, project=working_project, folder=override_folder, ensure_upload=ensure_upload
            )
            if not dry_run
            else []
        )

        try:
            # TODO: the "auto" setting is vestigial and should be removed.
            if dx_toolkit_autodep == "auto":
                dx_toolkit_autodep = "stable"
            applet_id, applet_spec = dxpy.app_builder.upload_applet(
                src_dir,
                bundled_resources,
                check_name_collisions=(mode == "applet"),
                overwrite=overwrite and mode == "applet",
                archive=archive and mode == "applet",
                project=working_project,
                override_folder=override_folder,
                override_name=override_applet_name,
                dx_toolkit_autodep=dx_toolkit_autodep,
                dry_run=dry_run,
                **kwargs
            )
        except:
            # Avoid leaking any bundled_resources files we may have
            # created, if applet creation fails. Note that if
            # using_temp_project, the entire project gets destroyed at
            # the end, so we don't bother.
            if not using_temp_project:
                objects_to_delete = [
                    dxpy.get_dxlink_ids(bundled_resource_obj["id"])[0] for bundled_resource_obj in bundled_resources
                ]
                if objects_to_delete:
                    dxpy.api.project_remove_objects(
                        dxpy.app_builder.get_destination_project(src_dir, project=working_project),
                        input_params={"objects": objects_to_delete},
                    )
            raise

        if dry_run:
            return

        applet_name = applet_spec["name"]

        logger.debug("Created applet " + applet_id + " successfully")

        if mode == "app":
            if "version" not in app_json:
                parser.error('dxapp.json contains no "version" field, but it is required to build an app')
            version = app_json["version"]
            try_versions = [version_override or version]
            if not version_override and do_version_autonumbering:
                try_versions.append(version + _get_version_suffix(src_dir, version))

            app_id = dxpy.app_builder.create_app(
                applet_id,
                applet_name,
                src_dir,
                publish=publish,
                set_default=publish,
                billTo=bill_to_override,
                try_versions=try_versions,
                try_update=do_try_update,
                confirm=confirm,
            )

            app_describe = dxpy.api.app_describe(app_id)

            if publish:
                print(
                    "Uploaded and published app %s/%s (%s) successfully"
                    % (app_describe["name"], app_describe["version"], app_id),
                    file=sys.stderr,
                )
            else:
                print(
                    "Uploaded app %s/%s (%s) successfully" % (app_describe["name"], app_describe["version"], app_id),
                    file=sys.stderr,
                )
                print("You can publish this app with:", file=sys.stderr)
                print(
                    '  dx api app-%s/%s publish "{\\"makeDefault\\": true}"'
                    % (app_describe["name"], app_describe["version"]),
                    file=sys.stderr,
                )

            return app_describe if return_object_dump else {"id": app_id}

        elif mode == "applet":
            return dxpy.api.applet_describe(applet_id) if return_object_dump else {"id": applet_id}
        else:
            raise dxpy.app_builder.AppBuilderException("Unrecognized mode %r" % (mode,))

    finally:
        # Clean up after ourselves.
        if using_temp_project:
            dxpy.api.project_destroy(working_project)
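
A hypothetical invocation building an applet from a local source directory (the project id and paths are made up):

result = build_and_upload_locally(
    "./my_applet",                  # directory containing dxapp.json
    "applet",
    overwrite=True,
    destination_override="project-xxxx:/apps/my_applet")
print(result["id"])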
Example n. 56
0
def upload_applet(src_dir,
                  uploaded_resources,
                  check_name_collisions=True,
                  overwrite=False,
                  archive=False,
                  project=None,
                  override_folder=None,
                  override_name=None,
                  dx_toolkit_autodep="stable",
                  dry_run=False,
                  **kwargs):
    """
    Creates a new applet object.

    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to inject if none is present. "stable" for the APT package; "git" for HEAD of dx-toolkit master branch; or False for no dependency.
    :type dx_toolkit_autodep: boolean or string
    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException(
                "Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)"
                % (src_dir, ))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    applet_to_overwrite = None
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + (
            '/' if not applet_spec['folder'].endswith('/') else
            '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet",
                                             name=applet_spec["name"],
                                             folder=applet_spec['folder'],
                                             project=dest_project,
                                             recurse=False):
            if overwrite:
                # Don't remove the old applet until after the new one
                # has been created. This avoids a race condition where
                # we remove the old applet, but that causes garbage
                # collection of the bundled resources that will be
                # shared with the new applet
                applet_to_overwrite = result['id']
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'],
                                                project=dest_project)
                now = datetime.datetime.fromtimestamp(archived_applet.created /
                                                      1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info(
                    "Archived applet %s to %s:\"%s/%s\"" %
                    (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException(
                    "An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given"
                    % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Inline Readme.md and Readme.developer.md
    _inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "runSpec" in applet_spec and "file" in applet_spec["runSpec"]:
        # Avoid using runSpec.file for now, it's not fully implemented
        #code_filename = os.path.join(src_dir, applet_spec["runSpec"]["file"])
        #f = dxpy.upload_local_file(code_filename, wait_on_close=True)
        #applet_spec["runSpec"]["file"] = f.get_id()
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir,
                               applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"].setdefault("bundledDepends", [])
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {
            "name": "dx-toolkit",
            "package_manager": "git",
            "url": "git://github.com/dnanexus/dx-toolkit.git",
            "tag": "master",
            "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"
        }
    # TODO: reject "beta" and "unstable" eventually
    elif dx_toolkit_autodep in ("stable", "beta", "unstable"):
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException(
            "dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead"
            % (dx_toolkit_autodep, ))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(
                type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException(
                "Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(
            dep.get('name') in DX_TOOLKIT_PKGS
            or dep.get('url') in DX_TOOLKIT_GIT_URLS for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    if applet_spec.get("categories", []):
        if "tags" not in applet_spec:
            applet_spec["tags"] = []
        applet_spec["tags"] = list(
            set(applet_spec["tags"]) | set(applet_spec["categories"]))

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})

    # Now it is permissible to delete the old applet, if any
    if applet_to_overwrite:
        logger.info("Deleting applet %s" % (applet_to_overwrite, ))
        # TODO: test me
        dxpy.DXProject(dest_project).remove_objects([applet_to_overwrite])

    return applet_id, applet_spec
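A hypothetical call to the upload_applet() above (source directory and project id are made up):

applet_id, applet_spec = upload_applet(
    "./my_applet",
    uploaded_resources=None,
    project="project-xxxx",
    overwrite=True)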


if __name__ == '__main__':
    args = get_args()
    if args.sort_filter_and_remove_dups:
        args.duplicates_removed = True

    applets_project_id = resolve_applets_project()
    project = get_project(args.project_name)
    print 'Project: ' + project.describe()['name']
    if project_has_controls_and_replicates_folders(project):
        replicates = dxpy.find_data_objects(classname='file',
                                            name='*.bam',
                                            name_mode='glob',
                                            project=project.get_id(),
                                            folder=REPLICATES_FOLDER,
                                            return_handler=False)
        replicates = [dxpy.dxlink(r) for r in replicates]
        controls = dxpy.find_data_objects(classname='file',
                                          name='*.bam',
                                          name_mode='glob',
                                          project=project.get_id(),
                                          folder=CONTROLS_FOLDER,
                                          return_handler=False)
        controls = [dxpy.dxlink(c) for c in controls]
    else:
        if (len(args.replicates) < 1) or (len(args.controls) < 1):
            sys.exit(
                'Need to have at least 1 replicate file and 1 control file.')
        project.new_folder(REPLICATES_FOLDER, True)
Example n. 58
0
def upload_applet(src_dir,
                  uploaded_resources,
                  check_name_collisions=True,
                  overwrite=False,
                  archive=False,
                  project=None,
                  override_folder=None,
                  override_name=None,
                  dx_toolkit_autodep="stable",
                  dry_run=False,
                  **kwargs):
    """
    Creates a new applet object.

    :param project: ID of container in which to create the applet.
    :type project: str, or None to use whatever is specified in dxapp.json
    :param override_folder: folder name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_folder: str
    :param override_name: name for the resulting applet which, if specified, overrides that given in dxapp.json
    :type override_name: str
    :param dx_toolkit_autodep: What type of dx-toolkit dependency to
        inject if none is present. "stable" for the APT package; "git"
        for HEAD of dx-toolkit master branch; or False for no
        dependency.
    :type dx_toolkit_autodep: boolean or string

    """
    applet_spec = _get_applet_spec(src_dir)

    if project is None:
        dest_project = applet_spec['project']
    else:
        dest_project = project
        applet_spec['project'] = project

    if 'name' not in applet_spec:
        try:
            applet_spec['name'] = os.path.basename(os.path.abspath(src_dir))
        except:
            raise AppBuilderException(
                "Could not determine applet name from the specification (dxapp.json) or from the name of the working directory (%r)"
                % (src_dir, ))

    if override_folder:
        applet_spec['folder'] = override_folder
    if 'folder' not in applet_spec:
        applet_spec['folder'] = '/'

    if override_name:
        applet_spec['name'] = override_name

    if 'dxapi' not in applet_spec:
        applet_spec['dxapi'] = dxpy.API_VERSION

    applets_to_overwrite = []
    archived_applet = None
    if check_name_collisions and not dry_run:
        destination_path = applet_spec['folder'] + (
            '/' if not applet_spec['folder'].endswith('/') else
            '') + applet_spec['name']
        logger.debug("Checking for existing applet at " + destination_path)
        for result in dxpy.find_data_objects(classname="applet",
                                             name=applet_spec["name"],
                                             folder=applet_spec['folder'],
                                             project=dest_project,
                                             recurse=False):
            if overwrite:
                # Don't remove the old applet until after the new one
                # has been created. This avoids a race condition where
                # we remove the old applet, but that causes garbage
                # collection of the bundled resources that will be
                # shared with the new applet
                applets_to_overwrite.append(result['id'])
            elif archive:
                logger.debug("Archiving applet %s" % (result['id']))
                proj = dxpy.DXProject(dest_project)
                archive_folder = '/.Applet_archive'
                try:
                    proj.list_folder(archive_folder)
                except dxpy.DXAPIError:
                    proj.new_folder(archive_folder)

                proj.move(objects=[result['id']], destination=archive_folder)
                archived_applet = dxpy.DXApplet(result['id'],
                                                project=dest_project)
                now = datetime.datetime.fromtimestamp(archived_applet.created /
                                                      1000).ctime()
                new_name = archived_applet.name + " ({d})".format(d=now)
                archived_applet.rename(new_name)
                logger.info(
                    "Archived applet %s to %s:\"%s/%s\"" %
                    (result['id'], dest_project, archive_folder, new_name))
            else:
                raise AppBuilderException(
                    "An applet already exists at %s (id %s) and the --overwrite (-f) or --archive (-a) options were not given"
                    % (destination_path, result['id']))

    # -----
    # Override various fields from the pristine dxapp.json

    # Carry region-specific values from regionalOptions into the main
    # runSpec
    applet_spec["runSpec"].setdefault("bundledDepends", [])
    applet_spec["runSpec"].setdefault("assetDepends", [])
    if not dry_run:
        region = dxpy.api.project_describe(
            dest_project, input_params={"fields": {
                "region": True
            }})["region"]

        # if regionalOptions contain at least one region, they must include
        # the region of the target project
        if len(applet_spec.get('regionalOptions',
                               {})) != 0 and region not in applet_spec.get(
                                   'regionalOptions', {}):
            err_mesg = "destination project is in region {} but \"regionalOptions\" do not contain this region. ".format(
                region)
            err_mesg += "Please, update your \"regionalOptions\" specification"
            raise AppBuilderException(err_mesg)

        regional_options = applet_spec.get('regionalOptions',
                                           {}).get(region, {})

        # We checked earlier that if region-specific values for the
        # fields below are given, the same fields are not also specified
        # in the top-level runSpec. So the operations below should not
        # result in any user-supplied settings being clobbered.

        if 'systemRequirements' in regional_options:
            applet_spec["runSpec"]["systemRequirements"] = regional_options[
                'systemRequirements']

        if 'bundledDepends' in regional_options:
            applet_spec["runSpec"]["bundledDepends"].extend(
                regional_options["bundledDepends"])
        if 'assetDepends' in regional_options:
            applet_spec["runSpec"]["assetDepends"].extend(
                regional_options["assetDepends"])

    # Inline Readme.md and Readme.developer.md
    dxpy.executable_builder.inline_documentation_files(applet_spec, src_dir)

    # Inline the code of the program
    if "file" in applet_spec["runSpec"]:
        # Put it into runSpec.code instead
        with open(os.path.join(src_dir,
                               applet_spec["runSpec"]["file"])) as code_fh:
            applet_spec["runSpec"]["code"] = code_fh.read()
            del applet_spec["runSpec"]["file"]

    # If this applet requires a cluster, inline any bootstrapScript code that may be provided.
    # bootstrapScript is an *optional* clusterSpec parameter.
    # NOTE: assumes bootstrapScript is always provided as a filename
    if "systemRequirements" in applet_spec["runSpec"]:
        sys_reqs = applet_spec["runSpec"]["systemRequirements"]
        for entry_point in sys_reqs:
            try:
                bootstrap_script = os.path.join(
                    src_dir,
                    sys_reqs[entry_point]["clusterSpec"]["bootstrapScript"])
                with open(bootstrap_script) as code_fh:
                    sys_reqs[entry_point]["clusterSpec"][
                        "bootstrapScript"] = code_fh.read()
            except KeyError:
                # either no "clusterSpec" or no "bootstrapScript" within "clusterSpec"
                continue
            except IOError:
                raise AppBuilderException(
                    "The clusterSpec \"bootstrapScript\" could not be read.")

    # Attach bundled resources to the app
    if uploaded_resources is not None:
        applet_spec["runSpec"]["bundledDepends"].extend(uploaded_resources)

    # Validate and process assetDepends
    asset_depends = applet_spec["runSpec"]["assetDepends"]
    if type(asset_depends) is not list or any(
            type(dep) is not dict for dep in asset_depends):
        raise AppBuilderException(
            "Expected runSpec.assetDepends to be an array of objects")
    for asset in asset_depends:
        asset_project = asset.get("project", None)
        asset_folder = asset.get("folder", '/')
        asset_stages = asset.get("stages", None)
        if "id" in asset:
            asset_record = dxpy.DXRecord(asset["id"]).describe(
                fields={'details'}, default_fields=True)
        elif "name" in asset and asset_project is not None and "version" in asset:
            try:
                asset_record = dxpy.find_one_data_object(
                    zero_ok=True,
                    classname="record",
                    typename="AssetBundle",
                    name=asset["name"],
                    properties=dict(version=asset["version"]),
                    project=asset_project,
                    folder=asset_folder,
                    recurse=False,
                    describe={
                        "defaultFields": True,
                        "fields": {
                            "details": True
                        }
                    },
                    state="closed",
                    more_ok=False)
            except dxpy.exceptions.DXSearchError:
                msg = "Found more than one asset record that matches: name={0}, folder={1} in project={2}."
                raise AppBuilderException(
                    msg.format(asset["name"], asset_folder, asset_project))
        else:
            raise AppBuilderException(
                "Each runSpec.assetDepends element must have either {'id'} or "
                "{'name', 'project' and 'version'} field(s).")

        if asset_record:
            if "id" in asset:
                asset_details = asset_record["details"]
            else:
                asset_details = asset_record["describe"]["details"]
            if "archiveFileId" in asset_details:
                archive_file_id = asset_details["archiveFileId"]
            else:
                raise AppBuilderException(
                    "The required field 'archiveFileId' was not found in "
                    "the details of the asset bundle %s " % asset_record["id"])
            archive_file_name = dxpy.DXFile(archive_file_id).describe()["name"]
            bundle_depends = {"name": archive_file_name, "id": archive_file_id}
            if asset_stages:
                bundle_depends["stages"] = asset_stages
            applet_spec["runSpec"]["bundledDepends"].append(bundle_depends)
            # If the file is not found in the applet destination project, clone it from the asset project
            if (not dry_run and
                    dxpy.DXRecord(dxid=asset_record["id"],
                                  project=dest_project).describe()["project"]
                    != dest_project):
                dxpy.DXRecord(
                    asset_record["id"],
                    project=asset_record["project"]).clone(dest_project)
        else:
            raise AppBuilderException(
                "No asset bundle was found that matched the specification %s" %
                (json.dumps(asset)))

    # Include the DNAnexus client libraries as an execution dependency, if they are not already
    # there
    if dx_toolkit_autodep == "git":
        dx_toolkit_dep = {
            "name": "dx-toolkit",
            "package_manager": "git",
            "url": "git://github.com/dnanexus/dx-toolkit.git",
            "tag": "master",
            "build_commands": "make install DESTDIR=/ PREFIX=/opt/dnanexus"
        }
    elif dx_toolkit_autodep == "stable":
        dx_toolkit_dep = {"name": "dx-toolkit", "package_manager": "apt"}
    elif dx_toolkit_autodep:
        raise AppBuilderException(
            "dx_toolkit_autodep must be one of 'stable', 'git', or False; got %r instead"
            % (dx_toolkit_autodep, ))

    if dx_toolkit_autodep:
        applet_spec["runSpec"].setdefault("execDepends", [])
        exec_depends = applet_spec["runSpec"]["execDepends"]
        if type(exec_depends) is not list or any(
                type(dep) is not dict for dep in exec_depends):
            raise AppBuilderException(
                "Expected runSpec.execDepends to be an array of objects")
        dx_toolkit_dep_found = any(
            dep.get('name') in DX_TOOLKIT_PKGS
            or dep.get('url') in DX_TOOLKIT_GIT_URLS for dep in exec_depends)
        if not dx_toolkit_dep_found:
            exec_depends.append(dx_toolkit_dep)
            if dx_toolkit_autodep == "git":
                applet_spec.setdefault("access", {})
                applet_spec["access"].setdefault("network", [])
                # Note: this can be set to "github.com" instead of "*" if the build doesn't download any deps
                if "*" not in applet_spec["access"]["network"]:
                    applet_spec["access"]["network"].append("*")

    merge(applet_spec, kwargs)

    # -----
    # Now actually create the applet

    if dry_run:
        print("Would create the following applet:")
        print(json.dumps(applet_spec, indent=2))
        print("*** DRY-RUN-- no applet was created ***")
        return None, None

    if applet_spec.get("categories", []):
        if "tags" not in applet_spec:
            applet_spec["tags"] = []
        applet_spec["tags"] = list(
            set(applet_spec["tags"]) | set(applet_spec["categories"]))

    applet_id = dxpy.api.applet_new(applet_spec)["id"]

    if archived_applet:
        archived_applet.set_properties({'replacedWith': applet_id})

    # Now it is permissible to delete the old applet(s), if any
    if applets_to_overwrite:
        logger.info("Deleting applet(s) %s" % (','.join(applets_to_overwrite)))
        dxpy.DXProject(dest_project).remove_objects(applets_to_overwrite)

    return applet_id, applet_spec
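
This newer version additionally folds region-specific settings from dxapp.json's regionalOptions into runSpec. A sketch of the relevant dxapp.json fragment, written as a Python dict (all values hypothetical):

regional_options_fragment = {
    "regionalOptions": {
        "aws:us-east-1": {
            "systemRequirements": {"*": {"instanceType": "mem1_ssd1_x4"}},
            "assetDepends": [{"name": "my-asset",
                              "project": "project-xxxx",
                              "folder": "/assets",
                              "version": "0.0.1"}],
        }
    }
}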
Example n. 59
0
def find_file(filePath,
              project=None,
              verbose=False,
              multiple=False,
              recurse=True):
    '''Using a DX style file path, find the file.'''
    proj = project
    path = filePath
    fileName = filePath
    if filePath.find(':') != -1:
        proj, path = filePath.split(':', 1)
    if path.rfind('/') != -1:
        path, fileName = path.rsplit('/', 1)
    else:
        fileName = path
        path = '/'
    if proj is None:
        if verbose:
            print "ERROR: Don't know what project to use for '" + path + "'."
        return None
    if proj.find('project-') == 0:
        projId = proj
    else:
        projId = get_project(proj, level='VIEW').get_id()
    mode = 'exact'
    # Note: str.find() returns -1 (truthy) when the pattern is absent, so test
    # membership rather than the truthiness of find().
    if '*' in filePath or '?' in filePath:
        mode = 'glob'
    fileDicts = list(
        dxpy.find_data_objects(classname='file',
                               folder=path,
                               name=fileName,
                               recurse=recurse,
                               name_mode=mode,
                               project=projId,
                               return_handler=False))

    if not fileDicts:
        #print "- Found 0 files from '" + proj + ":" + filePath + "'."
        if verbose:
            print "ERROR: Failed to find '" + proj + ":" + filePath + "'."
        return None
    elif len(fileDicts) > 1 or multiple:
        #print "- Found "+str(len(fileDict))+" files from '" + proj + ":" + filePath + "'."
        if not multiple:
            if verbose:
                print "ERROR: Found " + str(
                    len(fileDicts)
                ) + " files when expecting 1 '" + proj + ":" + filePath + "'."
            return None
        else:
            if verbose:
                print " Found " + str(len(
                    fileDicts)) + " files for '" + proj + ":" + filePath + "'."
        fids = []
        for fileDict in fileDicts:
            FILES[fileDict['id']] = dxpy.dxlink(fileDict)
            fids.append(fileDict['id'])
        return fids
    else:
        #print "- FOUND '" + proj + ":" + filePath + "'."
        FILES[fileDicts[0]['id']] = dxpy.dxlink(fileDicts[0])
        return fileDicts[0]['id']
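
Hypothetical calls (project and paths are made up); a single match returns one file id, while multiple=True returns a list of ids:

fid = find_file("MyProject:/reads/sample1.fastq.gz")
fids = find_file("MyProject:/reads/*.fastq.gz", multiple=True)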