def validate_compressed(compressed_file, testing_base=None, clean_up=True):
    '''validate_compressed will first decompress a file to a temporary
    location, and then test if the folder is valid given the WordFish
    standard.
    :param compressed_file: the file to first extract (.tar.gz or .zip)
    :param testing_base: If not given, a temporary location will be created.
    Otherwise, a folder will be made in testing_base.
    :param clean_up: clean up (remove) extracted files/folders after test.
    Default True
    :returns valid: True only if every collection folder validates
    '''
    if testing_base is None:
        testing_base = tempfile.mkdtemp()

    valid = True
    dest_dir = tempfile.mkdtemp(prefix="%s/" % testing_base)
    if compressed_file.endswith('.tar.gz'):
        test_folder = untar_dir(compressed_file, dest_dir)
    elif compressed_file.endswith('.zip'):
        test_folder = unzip_dir(compressed_file, dest_dir)
    else:
        bot.error("Invalid compressed file type: %s, exiting." % compressed_file)
        sys.exit(1)

    # Each object in the folder (a collection)
    collections = os.listdir(test_folder)
    bot.info("collections found: %s" % len(collections))
    for collection in collections:
        collection_path = "%s/%s" % (test_folder, collection)
        if validate_folder(collection_path) == False:
            bot.error("collection %s is invalid." % collection)
            # BUGFIX: an invalid collection must flip the overall result;
            # previously `valid` was never set False, so the function
            # always returned True
            valid = False

    if clean_up:
        shutil.rmtree(dest_dir)
    return valid
def validate_metadata(full_path, metadata_type=None):
    '''validate_metadata checks to see if a name (either a collection name,
    folder for an image or text) has associated metadata, indicated by a
    file of the same name (ending with json) in the parent directory of the
    named file.
    :param full_path: full path to a file or folder
    :param metadata_type: either one of collection, image, or text.
    Default collection
    :returns True if a matching file exists and is valid json, False if it
    exists but is invalid json, None if no matching file is found (the user
    is alerted in each case).
    '''
    if metadata_type is None:
        metadata_type = "collection"

    parent_dir = os.path.dirname(full_path)
    base_name = os.path.basename(full_path).split('.')[0]
    metadata = "%s/%s.json" % (parent_dir, base_name)

    if not os.path.exists(metadata):
        bot.info('%s %s does not have metadata file %s.json' % (metadata_type, base_name, base_name))
        return None

    bot.debug('found %s metadata: %s' % (metadata_type, base_name))
    try:
        # We only need the parse to succeed; the loaded content is unused
        read_json(metadata)
        bot.info('%s %s metadata is valid' % (metadata_type, base_name))
    except Exception:
        # BUGFIX: narrowed from a bare `except:`, which also swallowed
        # SystemExit and KeyboardInterrupt
        bot.error('%s %s has invalid json metadata %s' % (metadata_type, base_name, metadata))
        return False
    return True
def search_collections(project, uid=None):
    '''Search Google Datastore for collections under a project.

    Parameters
    ==========
    project: the project name to search Datastore for.
    uid: if specified, return a specific collection

    Returns
    =======
    collections: list of Google datastore Entity
    '''
    # Restrict the query to a single uid when one is provided
    fields = {'uid': uid} if uid is not None else None

    requester = RetryRequester(project=project)
    collections = requester.get_collection(filters=fields)

    # Show each collection and all of its fields
    for collection in collections:
        bot.info('Collection: %s' % collection['uid'])
        for key, val in collection.items():
            bot.custom(prefix=key, message=val)
        bot.newline()

    print('Found %s collections' % len(collections))
    return collections
def validate_entities(full_path):
    '''Check every subdirectory (an entity in a collection) for a set of
    valid images and text objects, alerting the user about extraneous files.
    :param full_path: the full path to the collection folder with entities
    :returns None when the collection is empty, otherwise True/False
    '''
    valid = True
    entities = os.listdir(full_path)
    bot.info("Found %s entities in collection." % len(entities))
    if len(entities) == 0:
        return None

    for entity in entities:
        entity_path = "%s/%s" % (full_path, entity)

        # Does the entity have metadata?
        if validate_metadata(entity_path, "entity") == False:
            valid = False

        texts = validate_texts(entity_path)
        images = validate_images(entity_path)

        # Neither images nor text --> the entity is invalid
        if texts == None and images == None:
            bot.error("found invalid entity: does not have images or text.")
            valid = False

        # Either set failing validation --> the entity is invalid
        if texts == False or images == False:
            bot.error("entity %s does not have valid images or text." % (entity))
            valid = False

    return valid
def main():
    '''Entry point: parse command line arguments and dispatch to the
    requested subcommand (list or get).'''
    parser = get_parser()
    subparsers = get_subparsers(parser)

    try:
        args = parser.parse_args()
    except:
        sys.exit(0)

    # If the environment logging variable is not set, make output silent
    if args.debug is False:
        os.environ['MESSAGELEVEL'] = "5"

    if args.version is True:
        from som.version import __version__
        print(__version__)
        sys.exit(0)

    if args.command == "list":

        # Convert "key,operator,value" strings into filter tuples
        filters = args.filters
        if filters is not None:
            parsed = []
            for triple in filters:
                key, oper, val = triple.split(',')
                bot.info("Adding filter %s%s%s" % (key, oper, val))
                parsed.append((key, oper, val))
            filters = parsed

        if args.entity is True:
            from .search import search_entity
            search_entity(project=args.project, filters=filters)
            sys.exit(0)

        if args.collections is True:
            from .search import search_collections
            search_collections(project=args.project)
            sys.exit(0)

        if args.images is True:
            from .search import search_image
            search_image(project=args.project, filters=filters)
            sys.exit(0)

        # Default listing: a summary of the project
        from .search import summary
        summary(project=args.project)
        sys.exit(0)

    if args.command == "get":
        from .get import download_collection
        output_folder = download_collection(output_folder=args.outfolder,
                                            collection=args.collection,
                                            project=args.project,
                                            suid=args.suid,
                                            query_entity=not args.query_images,
                                            bucket=args.bucket)
        sys.exit(0)

    # No recognized command: show usage
    parser.print_help()
def get_datasets(self, quiet=False):
    '''Return the list of datasets for the project, printing each dataset
    name to the screen unless quiet is set.'''
    bot.info("Datasets for project %s:" % self.project)
    found = list(self.bigquery.list_datasets())
    if not quiet:
        for dataset in found:
            print(dataset.name)
    return found
def __init__(self, token=None, study=None):
    '''Initialize the client, defaulting to the "test" study when none
    is provided.'''
    if study is None:
        study = "test"
    bot.info("Client: <study:%s>" % study)
    # Record the API endpoints and credentials for this session
    self.token = token
    self.study = study
    self.base = api_base
    self.version = api_version
    self.spec = spec
    super(Client, self).__init__()
def structure_folder(folder, relative_path=False):
    '''structure_folder will return a json data structure to describe a
    collection folder. The collection is named according to the input data
    file, and so if additional metadata is to be added (a unique id, name,
    etc.) it should be done by the calling function using the name as a
    lookup.
    :param folder: the folder to generate a structure for
    :param relative_path: if True, will return relative paths (for web server)
    :returns collection: a dictionary of entity and other objects.
    A collection is a dictionary with the following:
    { "collection": { "name": "collection1",
                      "metadata" ... ,
                      "entities": [ ... ] } }
    A collection should be put into a collections data structure like:
    { "collections": [ ... ] }
    '''
    collection = {'name': folder}
    full_path = os.path.abspath(folder)
    # Truthiness check (was `== True`) so any truthy flag selects relative paths
    if relative_path:
        full_path = os.path.relpath(folder, os.getcwd())

    # Add any collection metadata
    metadata = structure_metadata(full_path)
    if metadata is not None:
        collection['metadata'] = metadata

    # Structure images, text, and metadata of the entities
    entities = structure_entities(full_path)
    if entities is None:
        bot.info("no entities found for collection %s." % (folder))
    else:
        bot.info("adding %s valid entities to collection %s." % (len(entities), folder))
        collection['entities'] = entities

    return {"collection": collection}
def sniff_extension(file_path, verbose=True):
    '''sniff_extension will attempt to determine the file type based on the
    extension, and return the proper mimetype
    :param file_path: the full path to the file to sniff
    :param verbose: print stuff out
    :returns mime_type: the guessed mimetype string; unknown extensions
    fall back to 'text/plain' (preserving the original behavior, even
    though a 'default' octet-stream entry exists in the table)
    '''
    mime_types = {
        "xls": 'application/vnd.ms-excel',
        "xlsx": 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        "xml": 'text/xml',
        "ods": 'application/vnd.oasis.opendocument.spreadsheet',
        "csv": 'text/plain',
        "tmpl": 'text/plain',
        "pdf": 'application/pdf',
        "php": 'application/x-httpd-php',
        "jpg": 'image/jpeg',
        "png": 'image/png',
        "gif": 'image/gif',
        "bmp": 'image/bmp',
        "txt": 'text/plain',
        "doc": 'application/msword',
        "js": 'text/js',
        "swf": 'application/x-shockwave-flash',
        "mp3": 'audio/mpeg',
        "zip": 'application/zip',
        "rar": 'application/rar',
        "tar": 'application/tar',
        "arj": 'application/arj',
        "cab": 'application/cab',
        "html": 'text/html',
        "htm": 'text/html',
        "dcm": 'application/dicom',
        "dicom": 'application/dicom',
        "default": 'application/octet-stream',
        "folder": 'application/vnd.google-apps.folder',
        "img": "application/octet-stream"
    }

    # Last dot-separated piece of the basename; a file with no extension
    # yields its own name here and falls through to the text/plain default
    ext = os.path.basename(file_path).split('.')[-1]
    mime_type = mime_types.get(ext, mime_types['txt'])
    if verbose:
        bot.info("%s --> %s" % (file_path, mime_type))
    return mime_type
def get_dataset(dataset=None):
    '''Return the path of a dataset shipped with the application, looked up
    by a user-provided label. In the future, we can add https endpoints to
    retrieve online datasets. When the label is unknown (or None), the
    valid choices are printed and None is returned implicitly.
    '''
    base = get_installdir()
    valid_datasets = {
        'developers_uid': '%s/api/identifiers/data/developers_uid.json' % base
    }

    if dataset is not None:
        # In case the user gave an extension
        dataset = os.path.splitext(dataset)[0].lower()
        if dataset in valid_datasets:
            return valid_datasets[dataset]

    bot.info("Valid datasets include: %s" % (','.join(list(valid_datasets.keys()))))
def structure_template(entity_path, template_type, acceptable_types):
    '''structure_template will check an entity directory for a folder of a
    particular type, for files and metadata that meet a particular criteria.
    If needed, additional parsing functions can be passed to this function.
    :param entity_path: the path to the top level (entity) folder
    :param template_type: should be one of images or text
    :param acceptable_types: the valid extensions to allow
    :returns valids: a list of {'original': path, 'metadata': path} dicts,
    or None when the template folder is missing or holds no valid files
    '''
    template_path = "%s/%s" % (entity_path, template_type)
    entity_name = os.path.basename(entity_path)

    if not os.path.exists(template_path):
        bot.info("entity %s does not have %s." % (entity_name, template_type))
        return None

    entity_folders = os.listdir(template_path)
    valids = []  # valid files, loaded or not

    # Find all valid files in a single pass.
    # BUGFIX: the original wrapped this scan in a redundant outer loop over
    # the same folders, repeating the whole scan once per folder (O(n^2))
    # and resetting `valids` each pass; one pass yields the same result.
    for folder in entity_folders:
        folder_path = "%s/%s" % (template_path, folder)
        for single_file in os.listdir(folder_path):
            parts = single_file.split('.')
            full_path = "%s/%s" % (folder_path, single_file)
            ext = '.'.join(parts[1:])
            if ext in acceptable_types:
                valid = {'original': full_path}
                # Pair the file with its sibling .json metadata, if present
                metadata_file = "%s/%s.json" % (folder_path, parts[0])
                if os.path.exists(metadata_file):
                    valid['metadata'] = metadata_file
                valids.append(valid)

    # Warn the user about missing valid files, not logical given folder
    if len(valids) == 0:
        bot.warning("entity %s does not have %s." % (entity_name, template_type))
        return None

    bot.info("entity %s has %s %s" % (entity_name, len(valids), template_type))
    return valids
def structure_entities(full_path):
    '''structure_entities will return a data structure with a list of
    images and text for each entity found.
    :param full_path: the full path to the collection folder with entities
    :returns entities: a list of entity dictionaries, or None when the
    collection folder is empty. An entity should look like the following:
    { "entity": { "id": "12345-6",
                  "images": [ ... ],
                  "text": [ ... ] } }
    '''
    entities = []
    contenders = os.listdir(full_path)
    bot.info("Found %s entity folders in collection." % len(contenders))
    if len(contenders) == 0:
        return None

    for contender in contenders:
        entity_path = "%s/%s" % (full_path, contender)
        entity = {'id': entity_path}

        # Does the entity have metadata?
        metadata = structure_metadata(entity_path, "entity")
        if metadata is not None:
            entity['metadata'] = metadata

        entity_texts = structure_texts(entity_path)
        entity_images = structure_images(entity_path)

        # If images and text are both empty, the entity is invalid: skip it
        if entity_texts is None and entity_images is None:
            bot.error("found invalid entity: does not have images or text.")
            continue

        # Only attach the sections that actually have content
        if entity_texts is not None:
            entity['texts'] = entity_texts
        if entity_images is not None:
            entity['images'] = entity_images

        entities.append({"entity": entity})

    return entities
def search_image(project, filters=None):
    '''Look for all (or a filtered subset of) images in Google Datastore.

    Parameters
    ==========
    project: the google project to use
    filters: fields to filter the entity
    '''
    requester = RetryRequester(project=project)
    results = requester.get_image(filters=filters)

    # Print each image uid followed by all of its fields
    for image in results:
        bot.info('Image: %s' % image['uid'])
        for field, value in image.items():
            bot.custom(prefix=field, message=value)
        bot.newline()

    bot.info("Found %s images" % len(results))
def search_entity(project, filters=None):
    '''Look for all (or a filtered subset of) entities under one or more
    collections in Google Datastore.

    Parameters
    ==========
    project: the google project to use
    filters: fields to filter the entity
    '''
    # Show a spinner while the (potentially slow) query runs
    bot.spinner.start()
    requester = RetryRequester(project=project)
    results = requester.get_entity(filters=filters)
    bot.spinner.stop()
    bot.newline()

    # Print each entity uid followed by all of its fields
    for entity in results:
        bot.info('Entity: %s' % entity['uid'])
        for field, value in entity.items():
            bot.custom(prefix=field, message=value)
        bot.newline()

    bot.info("Found %s entities" % len(results))
def structure_compressed(compressed_file, testing_base=None, clean_up=False):
    '''structure_compressed will first decompress a file to a temporary
    location, and then return the file structure in the WordFish standard.
    :param compressed_file: the file to first extract (.tar.gz or .zip)
    :param testing_base: If not given, a temporary location will be created.
    Otherwise, a folder will be made in testing_base.
    :param clean_up: clean up (remove) extracted files/folders after test.
    Default False, so the user can access the extracted files.
    :returns collections: a list of collection structures
    '''
    if testing_base is None:
        testing_base = tempfile.mkdtemp()

    dest_dir = tempfile.mkdtemp(prefix="%s/" % testing_base)
    if compressed_file.endswith('.tar.gz'):
        test_folder = untar_dir(compressed_file, dest_dir)
    elif compressed_file.endswith('.zip'):
        test_folder = unzip_dir(compressed_file, dest_dir)
    else:
        bot.error("Invalid compressed file type: %s, exiting." % compressed_file)
        sys.exit(1)

    # Each object in the folder (a collection)
    collection_paths = os.listdir(test_folder)
    bot.info("collections found: %s" % len(collection_paths))

    # We will return a list of structures
    collections = []
    for col in collection_paths:
        collection_path = "%s/%s" % (test_folder, col)
        collections.append(structure_folder(collection_path))

    if clean_up:
        shutil.rmtree(dest_dir)
    return collections
def dicom2nifti(folders, outdir=None, extension=None):
    '''dicom2nifti will take a list of folders and produce nifti files in an
    output directory. If not defined, they will be output in their original
    directory.
    :param folders: a folder, a list of folders, or a dict whose keys are
    folders holding dicom series
    :param outdir: output directory; if None, each nifti is written next to
    its source folder
    :param extension: optional extension filter passed on to find_dicoms
    :returns outfiles: list of paths to the nifti files written
    '''
    # Normalize the input to a list of folder paths
    if isinstance(folders, dict):
        folders = list(folders.keys())
    if not isinstance(folders, list):
        folders = [folders]

    outfiles = []
    for folder in folders:
        lookup = find_dicoms(folder, extension)
        for base, dicomlist in lookup.items():
            nii = read_series(dicomlist)
            # Write alongside the source data unless an outdir was given
            if outdir is not None:
                outfile = "%s/%s.nii.gz" % (outdir, os.path.basename(base))
            else:
                outfile = "%s/%s.nii.gz" % (base, os.path.basename(base))
            bot.info("Saving %s" % outfile)
            nibabel.save(nii, outfile)
            outfiles.append(outfile)
    return outfiles
def summary(project):
    '''Summarize counts of collections, images, and entities for a project.'''
    # Show a spinner while the three queries run
    bot.spinner.start()
    requester = RetryRequester(project=project)
    counts = [
        ('Collections', requester.get_collection()),
        ('Images', requester.get_image()),
        ('Entity', requester.get_entity()),
    ]
    bot.spinner.stop()
    bot.newline()
    for label, items in counts:
        bot.info('%s: %s' % (label, len(items)))
def progress_download(collection_name, output_folder, suid, project,
                      bucket_name, query_entity=True, filters=None):
    '''show progress while downloading images for a Collection/[c]/Entity/study

    Parameters
    ==========
    collection_name: the name of the collection, typically an IRB number
    output_folder: the base directory to create a study folder in
    project: Google Cloud project name
    suid: an suid of interest to query (eg, if querying an Entity, you
    would use the suid of the patient, an Image would be an suid of the
    study SUID --> (coded accession#)
    query_entity: by default, we query the entity first, and then get
    images. To query the images (studies) set this to False.
    bucket_name: the name for the Google Storage Bucket (usually provided)
    filters: a list of tuples to apply to filter the query. Default is
    [("entity_id","=", study)] to retrieve all Image items that are equal
    to the study name

    Returns
    =======
    files: paths to the newly created image and metadata files
    '''
    # Build the default filter from the suid, keyed by what we query first
    if filters is None:
        if query_entity is True:
            filters = [("uid", "=", suid)]
        else:
            filters = [("AccessionNumber", "=", suid)]

    bot.info("Collecting available images...")

    try:
        storage_client = storage.Client()
    except DefaultCredentialsError:
        # BUGFIX: this message previously contained a raw line break inside
        # the string literal, which is a syntax error
        bot.error("We didn't detect your GOOGLE_APPLICATION_CREDENTIALS in the environment! Did you export the path?")
        sys.exit(1)
    except Forbidden:
        bot.error("The service account specified by GOOGLE_APPLICATION_CREDENTIALS does not have permission to use this resource.")
        sys.exit(1)

    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    bucket = storage_client.get_bucket(bucket_name)

    # Retrieve bucket, datastore client, images
    requester = RetryRequester(bucket_name=bucket_name, project=project)
    collection = requester.create_collection(collection_name)

    if query_entity is True:
        # NOTE(review): filters is passed positionally here but as a keyword
        # (filters=...) elsewhere in this module — confirm the first
        # positional parameter of get_entity is indeed the filter list
        entity_set = requester.get_entity(filters)
        images = []
        for entity in entity_set:
            entity_images = requester.client.get_images(entity=entity)
            # Keep only images not already collected (de-duplicate)
            images = [x for x in entity_images if x not in images]
    else:
        images = requester.get_images(filters)

    bot.info("Found %s images for suid %s in collection %s" % (len(images), suid, collection_name))

    progress = 0
    total = len(images)

    files = []
    if len(images) > 0:
        bot.debug("Saving images and metadata...")
        for image in images:
            # Download the image blob and its sidecar metadata file
            file_name = prepare_folders(output_folder=output_folder,
                                        image_name=image.key.name)
            blob = bucket.blob(image['storage_name'])
            bot.show_progress(progress, total, length=35)
            requester.download(blob, file_name)
            files.append(file_name)
            files.append(save_metadata(image, file_name))
            progress += 1
            bot.show_progress(progress, total, length=35)

        # Newline to finish progress bar
        sys.stdout.write('\n')

    return files
def validate_template(entity_path, template_type, acceptable_types):
    '''validate_template will check an entity directory for a folder of a
    particular type, for files and metadata that meet a particular criteria.
    If needed, additional parsing functions can be passed to this function.
    :param entity_path: the path to the top level (entity) folder
    :param template_type: should be one of images or text
    :param acceptable_types: the valid extensions to allow
    :returns None when the template folder is missing or has no valid
    files, otherwise True/False for whether all metadata validated
    '''
    valid = True
    template_path = "%s/%s" % (entity_path, template_type)
    entity_name = os.path.basename(entity_path)

    if not os.path.exists(template_path):
        bot.info("entity %s does not have %s." % (entity_name, template_type))
        return None

    # Let's keep track of each file
    all_folders = os.listdir(template_path)
    valids = []  # valid files
    others = []  # Not valid as metadata or accepted

    # Find all valid files
    for folder in all_folders:
        folder_path = "%s/%s" % (template_path, folder)
        for single_file in os.listdir(folder_path):
            file_path = "%s/%s" % (folder_path, single_file)
            parts = single_file.split('.')
            ext = '.'.join(parts[1:])
            if ext in acceptable_types:
                valids.append(file_path)
            else:
                others.append(file_path)

    # Warn the user about missing valid files, not logical given folder
    if len(valids) == 0:
        bot.warning("entity %s does not have %s." % (entity_name, template_type))
        return None
    bot.info("entity %s has %s %s" % (entity_name, len(valids), template_type))

    valid_metadata = 0
    invalid_metadata = 0
    skipped_files = 0

    # Assess each valid file for metadata
    for contender in valids:
        if validate_metadata(contender, template_type) == False:
            bot.error("metadata %s for entity %s is invalid" % (contender, entity_name))
            invalid_metadata += 1
            valid = False
        else:
            valid_metadata += 1

    # BUGFIX: skipped files come from the non-acceptable "others" list;
    # the original attached this branch as a dangling else on the metadata
    # loop, so `others` was collected but never warned about or counted
    for contender in others:
        skipped_files += 1
        bot.warning("%s for %s/%s is not valid for import and is ignored"
                    % (contender, entity_name, template_type))

    bot.info("found %s valid metadata, %s invalid metadata, and %s skipped files for %s"
             % (valid_metadata, invalid_metadata, skipped_files, entity_name))
    return valid