Example #1
def validate_compressed(compressed_file, testing_base=None, clean_up=True):
    '''validate_compressed will first decompress a file to a temporary location,
    and then test if the folder is valid given the WordFish standard.
    :param compressed_file: the file to first extract.
    :param testing_base: If not given, a temporary location will be created. Otherwise,
    a folder will be made in testing_base.
    :param clean_up: clean up (remove) extracted files/folders after test. Default True
    '''
    if testing_base is None:
        testing_base = tempfile.mkdtemp()

    valid = True
    dest_dir = tempfile.mkdtemp(prefix="%s/" % testing_base)
    if compressed_file.endswith('.tar.gz'):
        test_folder = untar_dir(compressed_file, dest_dir)

    elif compressed_file.endswith('.zip'):
        test_folder = unzip_dir(compressed_file, dest_dir)

    else:
        bot.error("Invalid compressed file type: %s, exiting." %
                  compressed_file)
        sys.exit(1)

    # Each object in the folder (a collection)
    collections = os.listdir(test_folder)
    bot.info("collections found: %s" % len(collections))
    for collection in collections:
        collection_path = "%s/%s" % (test_folder, collection)
        if validate_folder(collection_path) is False:
            bot.error("collection %s is invalid." % collection)
            valid = False

    if clean_up:
        shutil.rmtree(dest_dir)
    return valid
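
A minimal usage sketch; the archive name below is hypothetical:

# Hypothetical WordFish-style archive; extraction happens under a
# temporary directory that is removed afterwards (clean_up=True).
if validate_compressed('collection.tar.gz'):
    print('archive follows the WordFish standard')
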
Example #2
def validate_metadata(full_path, metadata_type=None):
    '''validate_metadata checks to see if a name (either a collection
    name, folder for an image or text) has associated metadata, indicated by
    a file of the same name (ending with json) in the parent directory of the
    named file. If no matching files are found, None is returned, and the user
    is alerted. If a matching file is found, it is checked to be valid json.
    :param full_path: full path to a file or folder
    :param metadata_type: either one of collection, image, or text. Default collection
    '''
    if metadata_type is None:
        metadata_type = "collection"

    parent_dir = os.path.dirname(full_path)
    base_name = os.path.basename(full_path).split('.')[0]
    metadata = "%s/%s.json" % (parent_dir, base_name)

    if os.path.exists(metadata):
        bot.debug('found %s metadata: %s' % (metadata_type, base_name))
        try:
            read_json(metadata)
            bot.info('%s %s metadata is valid' % (metadata_type, base_name))
        except Exception:
            bot.error('%s %s has invalid json metadata %s' %
                      (metadata_type, base_name, metadata))
            return False

    else:
        bot.info('%s %s does not have metadata file %s.json' %
                 (metadata_type, base_name, base_name))
        return None

    return True
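
The return value is tri-state, so callers should test False and None
separately. A sketch with a hypothetical entity folder:

# 'collection/entity1' is hypothetical; a sidecar entity1.json in the
# parent directory would be read and checked for valid json.
result = validate_metadata('collection/entity1', metadata_type='entity')
if result is None:
    print('no metadata file found')
elif result is False:
    print('metadata file exists but is not valid json')
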
Example #3
def search_collections(project, uid=None):
    '''search collections will search for a particular collection,
    or if no uid specified, return a complete list of collections.
    
    Parameters
    ==========
    project: the project name to search Datastore for. 
    uid: if specified, return a specific collection

    Returns
    =======
    collections: list of Google datastore Entity
    '''

    fields = None
    if uid is not None:
        fields = {'uid': uid}

    requester = RetryRequester(project=project)
    collections = requester.get_collection(filters=fields)

    for collection in collections:
        bot.info('Collection: %s' % collection['uid'])
        for key, val in collection.items():
            bot.custom(prefix=key, message=val)
        bot.newline()

    bot.info('Found %s collections' % len(collections))
    return collections
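
A usage sketch with a hypothetical project and uid:

# Returns the matching Google Datastore entities, printing each field
collections = search_collections('my-gcp-project', uid='collection-001')
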
Example #4
def validate_entities(full_path):
    '''validate_entities will check to see if each subdirectory (an entity
    in a collection) has a set of valid images and text objects. The user
    is alerted about extraneous files.
    :param full_path: the full path to the collection folder with entities
    '''
    valid = True
    entities = os.listdir(full_path)
    bot.info("Found %s entities in collection." % len(entities))
    if len(entities) == 0:
        return None

    for entity in entities:
        entity_path = "%s/%s" % (full_path, entity)

        # Does the entity have metadata?
        if validate_metadata(entity_path, "entity") is False:
            valid = False

        entity_texts = validate_texts(entity_path)
        entity_images = validate_images(entity_path)

        # If images and text are empty for a collection, invalid
        if entity_texts is None and entity_images is None:
            bot.error("found invalid entity: does not have images or text.")
            valid = False

        # if either text or images are not valid, entities considered invalid
        if entity_texts is False or entity_images is False:
            bot.error("entity %s does not have valid images or text." %
                      (entity))
            valid = False

    return valid
Example #5
def main():

    parser = get_parser()
    subparsers = get_subparsers(parser)

    try:
        args = parser.parse_args()
    except SystemExit:
        sys.exit(0)

    # if environment logging variable not set, make silent
    if args.debug is False:
        os.environ['MESSAGELEVEL'] = "5"

    if args.version is True:
        from som.version import __version__
        print(__version__)
        sys.exit(0)


    if args.command == "list":
        filters = args.filters
        if filters is not None:
            fields = []
            for filterset in filters:
                key, oper, val = filterset.split(',')
                bot.info("Adding filter %s%s%s" % (key, oper, val))
                fields.append((key, oper, val))
            filters = fields

        if args.entity is True:
            from .search import search_entity
            search_entity(project=args.project,
                          filters=filters)
            sys.exit(0)
        if args.collections is True:
            from .search import search_collections
            search_collections(project=args.project)
            sys.exit(0)
        if args.images is True:
            from .search import search_image
            search_image(project=args.project,
                         filters=filters)
            sys.exit(0)
        from .search import summary
        summary(project=args.project)
        sys.exit(0)

    if args.command == "get":
        from .get import download_collection
        download_collection(output_folder=args.outfolder,
                            collection=args.collection,
                            project=args.project,
                            suid=args.suid,
                            query_entity=not args.query_images,
                            bucket=args.bucket)
        sys.exit(0)

    parser.print_help()
Example #6
    def get_datasets(self, quiet=False):
        '''get a list of datasets, also prints to screen if not quiet'''
        bot.info("Datasets for project %s:" % self.project)
        datasets = list(self.bigquery.list_datasets())

        if not quiet:
            for dataset in datasets:
                print(dataset.name)
        return datasets
Example #7
    def __init__(self, token=None, study=None):

        if study is None:
            study = "test"
        bot.info("Client: <study:%s>" % (study))
        self.base = api_base
        self.version = api_version
        self.study = study
        self.spec = spec
        self.token = token
        super(Client, self).__init__()
Example #8
def structure_folder(folder, relative_path=False):
    '''structure_folder will return a json data structure to describe a collection folder.
    The collection is named according to the input data file, and so if additional metadata
    is to be added (a unique id, name, etc.) it should be done by the calling function using
    the name as a lookup.
    :param folder: the folder to generate a structure for
    :param relative_path: if True, will return relative paths (for web server)
    :returns collection: a dictionary of entity and other objects.

    A collection is a dictionary with the following structure:

    { "collection":
        {
          "name": "collection1",
          "metadata": { ... },
          "entities": [ ... ]
        }
    }

    A collection should be put into a collections data structure like:

    { "collections": [ ... ] }

    '''

    collection = {'name': folder}
    full_path = os.path.abspath(folder)
    if relative_path:
        full_path = os.path.relpath(folder, os.getcwd())

    # Add any collection metadata
    metadata = structure_metadata(full_path)
    if metadata is not None:
        collection['metadata'] = metadata

    # validate images, text, and metadata of the entities
    entities = structure_entities(full_path)
    if entities is None:
        bot.info("no entities found for collection %s." % (folder))
    else:
        bot.info("adding %s valid entities to collection %s." %
                 (len(entities), folder))
        collection['entities'] = entities

    return {"collection": collection}
Example #9
def sniff_extension(file_path, verbose=True):
    '''sniff_extension will attempt to determine the file type based on the extension,
    and return the proper mimetype
    :param file_path: the full path to the file to sniff
    :param verbose: print stuff out
    '''
    mime_types = {
        "xls": 'application/vnd.ms-excel',
        "xlsx":
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        "xml": 'text/xml',
        "ods": 'application/vnd.oasis.opendocument.spreadsheet',
        "csv": 'text/plain',
        "tmpl": 'text/plain',
        "pdf": 'application/pdf',
        "php": 'application/x-httpd-php',
        "jpg": 'image/jpeg',
        "png": 'image/png',
        "gif": 'image/gif',
        "bmp": 'image/bmp',
        "txt": 'text/plain',
        "doc": 'application/msword',
        "js": 'text/js',
        "swf": 'application/x-shockwave-flash',
        "mp3": 'audio/mpeg',
        "zip": 'application/zip',
        "rar": 'application/rar',
        "tar": 'application/tar',
        "arj": 'application/arj',
        "cab": 'application/cab',
        "html": 'text/html',
        "htm": 'text/html',
        "dcm": 'application/dicom',
        "dicom": 'application/dicom',
        "default": 'application/octet-stream',
        "folder": 'application/vnd.google-apps.folder',
        "img": "application/octet-stream"
    }

    ext = os.path.basename(file_path).split('.')[-1]

    # Fall back to the generic default type for unknown extensions
    mime_type = mime_types.get(ext, mime_types['default'])

    if verbose:
        bot.info("%s --> %s" % (file_path, mime_type))

    return mime_type
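
For example, with hypothetical paths:

sniff_extension('/tmp/scan.dcm', verbose=False)    # 'application/dicom'
sniff_extension('/tmp/data.xyz', verbose=False)    # falls back to the default
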
Example #10
def get_dataset(dataset=None):
    '''get_dataset will return some data provided by the application,
    based on a user-provided label. In the future, we can add https endpoints
    to retrieve online datasets.
    '''
    here = get_installdir()
    valid_datasets = {'developers_uid': '%s/api/identifiers/data/developers_uid.json' % here}
    if dataset is not None:

        # In case the user gave an extension
        dataset = os.path.splitext(dataset)[0].lower()
        if dataset in valid_datasets:
            return valid_datasets[dataset]

    bot.info("Valid datasets include: %s" %(','.join(list(valid_datasets.keys()))))
Example #11
def structure_template(entity_path, template_type, acceptable_types):
    '''structure_template will check an entity directory
    for a folder of a particular type, for files and metadata that
    meet particular criteria. If needed, additional parsing
    functions can be passed to this function.
    :param entity_path: the path to the top level (entity) folder
    :param template_type: should be one of images or text
    :param acceptable_types: the valid extensions to allow
    '''
    template_path = "%s/%s" % (entity_path, template_type)
    entity_name = os.path.basename(entity_path)
    if not os.path.exists(template_path):
        bot.info("entity %s does not have %s." % (entity_name, template_type))
        return None

    # Let's keep track of each file
    entity_folders = os.listdir(template_path)
    valids = []  # valid files, loaded or not

    # Find all valid files
    for folder in entity_folders:
        folder_path = "%s/%s" % (template_path, folder)
        all_files = os.listdir(folder_path)
        for single_file in all_files:
            parts = single_file.split('.')
            full_path = "%s/%s" % (folder_path, single_file)
            ext = '.'.join(parts[1:])
            if ext in acceptable_types:
                valid = {'original': full_path}
                metadata_file = "%s/%s.json" % (folder_path, parts[0])
                if os.path.exists(metadata_file):
                    valid['metadata'] = metadata_file
                valids.append(valid)

    # Warn the user about missing valid files, not logical given folder
    if len(valids) == 0:
        bot.warning("entity %s does not have %s." %
                    (entity_name, template_type))
        return None
    else:
        bot.info("entity %s has %s %s" %
                 (entity_name, len(valids), template_type))

    return valids
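
A sketch with a hypothetical entity folder that contains an images/ subfolder:

# Each valid entry maps 'original' to the file path, plus 'metadata'
# when a sidecar json with the same base name exists.
images = structure_template('collection/entity1', 'images', ['png', 'jpg'])
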
Example #12
def structure_entities(full_path):
    '''structure_entities will return a data structure with a list of
    images and text for each entity found. 
    :param full_path: the full path to the collection folder with entities

    An entity should look like the following:

        { "entity": {
            "id": "12345-6",
            "images": [ ... ],
            "text": [ ... ]
          }
        }
    '''
    entities = []
    contenders = os.listdir(full_path)
    bot.info("Found %s entity folders in collection." % len(contenders))
    if len(contenders) == 0:
        return None

    for contender in contenders:
        entity_path = "%s/%s" % (full_path, contender)
        entity = {'id': entity_path}

        # Does the entity have metadata?
        metadata = structure_metadata(entity_path, "entity")
        if metadata is not None:
            entity['metadata'] = metadata

        entity_texts = structure_texts(entity_path)
        entity_images = structure_images(entity_path)

        # If images and text are empty for a collection, invalid
        if entity_texts is None and entity_images is None:
            bot.error("found invalid entity: does not have images or text.")
            continue

        # Add whichever of texts and images were found
        if entity_texts is not None:
            entity['texts'] = entity_texts
        if entity_images is not None:
            entity['images'] = entity_images
        entities.append({"entity": entity})

    return entities
Example #13
def search_image(project, filters=None):
    '''search image will look for all or a subset of images in Google Datastore
 
    Parameters
    ==========
    project: the google project to use
    filters: fields to filter the image query
    '''

    requester = RetryRequester(project=project)
    images = requester.get_image(filters=filters)

    for image in images:
        bot.info('Image: %s' % image['uid'])
        for key, val in image.items():
            bot.custom(prefix=key, message=val)
        bot.newline()

    bot.info("Found %s images" % len(images))
Example #14
def search_entity(project, filters=None):
    '''search entity will look for all or a subset of entities under one
    or more collections in Google Datastore
 
    Parameters
    ==========
    project: the google project to use
    filters: fields to filter the entity
    '''

    bot.spinner.start()
    requester = RetryRequester(project=project)
    entities = requester.get_entity(filters=filters)
    bot.spinner.stop()
    bot.newline()
    for entity in entities:
        bot.info('Entity: %s' % entity['uid'])
        for key, val in entity.items():
            bot.custom(prefix=key, message=val)
        bot.newline()
    bot.info("Found %s entities" % len(entities))
Example #15
def structure_compressed(compressed_file, testing_base=None, clean_up=False):
    '''structure_compressed will first decompress a file to a temporary location,
    and then return the file structure in the WordFish standard. 
    :param compressed_file: the file to first extract.
    :param testing_base: If not given, a temporary location will be created. Otherwise,
    a folder will be made in testing_base.
    :param clean_up: clean up (remove) extracted files/folders after test. Default False,
    so the user can access the extracted files.
    '''
    if testing_base is None:
        testing_base = tempfile.mkdtemp()

    dest_dir = tempfile.mkdtemp(prefix="%s/" % testing_base)
    if compressed_file.endswith('.tar.gz'):
        test_folder = untar_dir(compressed_file, dest_dir)

    elif compressed_file.endswith('.zip'):
        test_folder = unzip_dir(compressed_file, dest_dir)

    else:
        bot.error("Invalid compressed file type: %s, exiting." %
                  compressed_file)
        sys.exit(1)

    # Each object in the folder (a collection)
    collection_paths = os.listdir(test_folder)
    bot.info("collections found: %s" % len(collection_paths))

    # We will return a list of collection structures
    collections = []

    for col in collection_paths:
        collection_path = "%s/%s" % (test_folder, col)
        collection = structure_folder(collection_path)
        collections.append(collection)

    if clean_up:
        shutil.rmtree(dest_dir)
    return collections
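
A usage sketch with a hypothetical archive; clean_up defaults to False here,
so the extracted files remain available afterwards:

collections = structure_compressed('collection.zip')
print('%s collections structured' % len(collections))
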
Example #16
def dicom2nifti(folders, outdir=None, extension=None):
    '''dicom2nifti will take a list of folders and produce nifti files
    in an output directory. If not defined, they will be output in their
    original directory.
    '''
    if isinstance(folders, dict):
        folders = list(folders.keys())

    if not isinstance(folders, list):
        folders = [folders]

    outfiles = []
    for folder in folders:
        lookup = find_dicoms(folder, extension)
        for base, dicomlist in lookup.items():
            nii = read_series(dicomlist)
            if outdir is not None:
                outfile = "%s/%s.nii.gz" % (outdir, os.path.basename(base))
            else:
                outfile = "%s/%s.nii.gz" % (base, os.path.basename(base))
            bot.info("Saving %s" % outfile)
            nibabel.save(nii, outfile)
            outfiles.append(outfile)
    return outfiles
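
A sketch with hypothetical folders; the function accepts a single folder,
a list, or a dict keyed by folder:

# One .nii.gz per dicom series is written to outdir (or next to the dicoms)
nifti_files = dicom2nifti(['/data/study1', '/data/study2'], outdir='/tmp/nii')
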
Example #17
def summary(project):
    '''summarize counts of collections, images, entities
    for a project'''

    bot.spinner.start()
    requester = RetryRequester(project=project)
    collections = requester.get_collection()
    images = requester.get_image()
    entities = requester.get_entity()
    bot.spinner.stop()
    bot.newline()

    bot.info('Collections: %s' % len(collections))
    bot.info('Images: %s' % len(images))
    bot.info('Entities: %s' % len(entities))
Example #18
def progress_download(collection_name,
                      output_folder,
                      suid,
                      project,
                      bucket_name,
                      query_entity=True,
                      filters=None):

    '''
    show progress while downloading images for a Collection/[c]/Entity/study

    Parameters
    ==========
    collection_name: the name of the collection, typically an IRB number
    output_folder: the base directory to create a study folder in
    suid: the suid of interest to query (if querying an Entity, the suid
          of the patient; if querying an Image, the suid of the study,
          i.e. the coded accession number)
    project: Google Cloud project name
    bucket_name: the name of the Google Storage Bucket (usually provided)
    query_entity: by default, we query the entity first, and then get images.
                  To query the images (studies) directly, set this to False.
    filters: a list of tuples used to filter the query. If not given, the
             default is [ ("uid", "=", suid) ] when querying entities, or
             [ ("AccessionNumber", "=", suid) ] when querying images directly.

    Returns
    =======
    files: a list of paths to the downloaded images and metadata files
    '''

    if filters is None:
        if query_entity is True:
            filters = [ ("uid","=", suid) ]
        else:
            filters = [ ("AccessionNumber","=", suid) ]

    bot.info("Collecting available images...")

    try:
        storage_client = storage.Client()

    except DefaultCredentialsError:
        bot.error("We didn't detect your GOOGLE_APPLICATION_CREDENTIALS in the environment! Did you export the path?")
        sys.exit(1)
    except Forbidden:
        bot.error("The service account specified by GOOGLE_APPLICATION_CREDENTIALS does not have permission to use this resource.")
        sys.exit(1)

    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    bucket = storage_client.get_bucket(bucket_name)

    # Retrieve bucket, datastore client, images
    requester = RetryRequester(bucket_name=bucket_name,
                               project=project)

    collection = requester.create_collection(collection_name)

    if query_entity is True:
        entity_set = requester.get_entity(filters)
        images = []
        for entity in entity_set:
            entity_images = requester.client.get_images(entity=entity)
            # extend rather than overwrite so every entity's images are kept
            images += [x for x in entity_images if x not in images]
    else:
        images = requester.get_images(filters)
    
    bot.info("Found %s images for suid %s in collection %s" %(len(images),
                                                             suid,
                                                             collection_name))
    
    progress = 0
    total = len(images)

    files = []
    if len(images) > 0:
        bot.debug("Saving images and metadata...")
        for image in images:

            # Download image
            file_name = prepare_folders(output_folder=output_folder,
                                        image_name=image.key.name)
            
            blob = bucket.blob(image['storage_name'])
            bot.show_progress(progress, total, length=35)
            requester.download(blob, file_name)
            files.append(file_name)
            files.append(save_metadata(image, file_name))
            progress += 1
            bot.show_progress(progress, total, length=35)

        # Newline to finish
        sys.stdout.write('\n')

    return files
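
A call sketch, all names hypothetical; the explicit filters shown match the
default entity query the function would build on its own:

files = progress_download(collection_name='IRB-12345',
                          output_folder='downloads',
                          suid='patient-001',
                          project='my-gcp-project',
                          bucket_name='my-bucket',
                          filters=[('uid', '=', 'patient-001')])
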
Example #19
def validate_template(entity_path, template_type, acceptable_types):
    '''validate_template will check an entity directory
    for a folder of a particular type, for files and metadata that
    meet particular criteria. If needed, additional parsing
    functions can be passed to this function.
    :param entity_path: the path to the top level (entity) folder
    :param template_type: should be one of images or text
    :param acceptable_types: the valid extensions to allow
    '''
    valid = True
    template_path = "%s/%s" % (entity_path, template_type)
    entity_name = os.path.basename(entity_path)
    if not os.path.exists(template_path):
        bot.info("entity %s does not have %s." % (entity_name, template_type))
        return None

    # Let's keep track of each file
    all_folders = os.listdir(template_path)
    valids = []  # valid files
    others = []  # Not valid as metadata or accepted

    # Find all valid images
    for folder in all_folders:
        folder_path = "%s/%s" % (template_path, folder)
        all_files = os.listdir(folder_path)
        for single_file in all_files:
            file_path = "%s/%s" % (folder_path, single_file)
            parts = single_file.split('.')
            ext = '.'.join(parts[1:])
            if ext in acceptable_types:
                valids.append(file_path)
            else:
                others.append(file_path)

    # Warn the user about missing valid files, not logical given folder
    if len(valids) == 0:
        bot.warning("entity %s does not have %s." %
                    (entity_name, template_type))
        return None
    else:
        bot.info("entity %s has %s %s" %
                 (entity_name, len(valids), template_type))

    # Parse through the "others" and alert user about invalid file
    valid_metadata = 0
    invalid_metadata = 0
    skipped_files = 0

    # Assess each valid file for metadata
    for contender in valids:
        if validate_metadata(contender, template_type) is False:
            bot.error("metadata %s for entity %s is invalid" %
                      (contender, entity_name))
            invalid_metadata += 1
            valid = False
        else:
            valid_metadata += 1

    # Alert the user about skipped files that are not valid for import
    for other in others:
        skipped_files += 1
        bot.warning("%s for %s/%s is not valid for import and is ignored" %
                    (other, entity_name, template_type))

    bot.info(
        "found %s valid metadata, %s invalid metadata, and %s skipped files for %s"
        % (valid_metadata, invalid_metadata, skipped_files, entity_name))
    return valid
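
A validation sketch for a hypothetical entity, mirroring the
structure_template example above:

# Checks collection/entity1/images for png/jpg files and their metadata
ok = validate_template('collection/entity1', 'images', ['png', 'jpg'])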