Exemple #1
0
def validate_entities(full_path):
    '''validate_entities will check to see if each subdirectory (an entity
    in a collection) has a set of valid images and text objects. The user
    is alerted about extraneous files.
    :param full_path: the full path to the collection folder with entities
    :returns: True if all entities are valid, False if any is invalid,
    or None if the collection contains no entities at all
    '''
    valid = True
    entities = os.listdir(full_path)
    bot.info("Found %s entities in collection." % len(entities))
    if not entities:
        return None

    for entity in entities:
        entity_path = "%s/%s" % (full_path, entity)

        # Entity-level metadata (json alongside the folder), if present,
        # must be valid; validate_metadata returns None when there is none
        if validate_metadata(entity_path, "entity") is False:
            valid = False

        entity_texts = validate_texts(entity_path)
        entity_images = validate_images(entity_path)

        # If images and text are both missing, the entity is invalid
        if entity_texts is None and entity_images is None:
            bot.error("found invalid entity: does not have images or text.")
            valid = False

        # If either text or images are invalid, the entity is invalid
        if entity_texts is False or entity_images is False:
            bot.error("entity %s does not have valid images or text." %
                      (entity))
            valid = False

    return valid
Exemple #2
0
def validate_folder(folder):
    '''validate_folder will ensure the folder provided is reflective of WordFish standard
    :param folder: the folder to validate, corresponding to a collection base
    :returns valid: True/False
    '''

    # The first level should have folders with collections
    valid = True
    full_path = os.path.abspath(folder)

    # Collection-level metadata (if present) must be valid json
    if validate_metadata(full_path) is False:
        valid = False

    # validate images, text, and metadata of the entities
    valid_entities = validate_entities(full_path)
    if valid_entities is None:
        bot.error("found invalid collection: does not have entities.")
        valid = False
    elif valid_entities is False:
        bot.error("found invalid collection: invalid entities.")
        valid = False
    # Bug fix: only report success when the metadata check also passed;
    # previously the success message printed even when valid was False
    elif valid:
        print("collection %s is valid." % folder)

    return valid
Exemple #3
0
def validate_metadata(full_path, metadata_type=None):
    '''validate_metadata checks to see if a name (either a collection
    name, folder for an image or text) has associated metadata, indicated by
    a file of the same name (ending with json) in the parent directory of the
    named file. If no matching files are found, None is returned, and the user
    is alerted. If a matching file is found, it is checked to be valid json.
    :param full_path: full path to a file or folder
    :param metadata_type: either one of collection, image, or text. Default collection
    :returns: True if valid metadata is found, False if the metadata file
    is not valid json, None if no metadata file exists
    '''
    if metadata_type is None:
        metadata_type = "collection"

    # Metadata lives next to the named file/folder: <parent>/<base>.json
    parent_dir = os.path.dirname(full_path)
    base_name = os.path.basename(full_path).split('.')[0]
    metadata = "%s/%s.json" % (parent_dir, base_name)

    if os.path.exists(metadata):
        bot.debug('found %s metadata: %s' % (metadata_type, base_name))
        try:
            # We only care that the file parses; the content is unused here
            read_json(metadata)
            bot.info('%s %s metadata is valid' % (metadata_type, base_name))
        # Narrowed from a bare except, which would also swallow
        # KeyboardInterrupt and SystemExit
        except Exception:
            bot.error('%s %s has invalid json metadata %s' %
                      (metadata_type, base_name, metadata))
            return False

    else:
        bot.info('%s %s does not have metadata file %s.json' %
                 (metadata_type, base_name, base_name))
        return None

    return True
Exemple #4
0
def validate_compressed(compressed_file, testing_base=None, clean_up=True):
    '''validate_compressed will first decompress a file to a temporary location,
    and then test if the folder is valid given the WordFish standard.
    :param compressed_file: the file to first extract.
    :param testing_base: If not given, a temporary location will be created. Otherwise,
    a folder will be made in testing_base.
    :param clean_up: clean up (remove) extracted files/folders after test. Default True
    :returns: True if every extracted collection is valid, False otherwise
    '''
    if testing_base is None:
        testing_base = tempfile.mkdtemp()

    valid = True
    dest_dir = tempfile.mkdtemp(prefix="%s/" % testing_base)
    if compressed_file.endswith('.tar.gz'):
        test_folder = untar_dir(compressed_file, dest_dir)

    elif compressed_file.endswith('.zip'):
        test_folder = unzip_dir(compressed_file, dest_dir)

    else:
        bot.error("Invalid compressed file type: %s, exiting." %
                  compressed_file)
        sys.exit(1)

    # Each object in the folder (a collection)
    collections = os.listdir(test_folder)
    bot.info("collections found: %s" % len(collections))
    for collection in collections:
        collection_path = "%s/%s" % (test_folder, collection)
        if validate_folder(collection_path) is False:
            bot.error("collection %s is invalid." % collection)
            # Bug fix: an invalid collection previously did not flip the
            # result, so this function always returned True
            valid = False

    if clean_up:
        shutil.rmtree(dest_dir)
    return valid
Exemple #5
0
def download_collection(collection,
                        project,
                        suid,
                        bucket,
                        query_entity=True,
                        output_folder=None,
                        filters=None):

    '''
    client function to download a collection in entirety, intended for
    command line application som get

    See parameters in progress_download
    '''
    # Default to downloading into the current working directory
    if output_folder is None:
        output_folder = os.getcwd()

    # The base output folder must already exist
    if not os.path.exists(output_folder):
        bot.error("Output folder %s not found. Exiting." %output_folder)
        sys.exit(1)

    # Images land in a subfolder named after the collection
    destination = "%s/%s" %(output_folder,collection)

    return progress_download(output_folder=destination,
                             suid=suid,
                             collection_name=collection,
                             project=project,
                             query_entity=query_entity,
                             bucket_name=bucket,
                             filters=filters)
Exemple #6
0
def run_command(cmd, error_message=None, sudopw=None, suppress=False):
    '''run_command uses subprocess to send a command to the terminal.
    :param cmd: the command to send, should be a list for subprocess
    :param error_message: the error message to give to user if fails,
    if none specified, will alert that command failed.
    :param sudopw: if specified (not None) command will be run asking for sudo
    :param suppress: if True (sudo path only), run via os.system and return
    the command string instead of its output
    :returns: command output (bytes on the subprocess path, str on the sudo
    path), or None if the command could not be executed
    '''
    # Local import: os.errno was removed in Python 3.10; errno is the
    # supported home of ENOENT
    import errno

    if sudopw is None:
        sudopw = os.environ.get('pancakes', None)

    if sudopw is not None:
        # Pipe the password to sudo -S; cmd is joined into a shell string
        cmd = ' '.join(["echo", sudopw, "|", "sudo", "-S"] + cmd)
        if suppress is False:
            output = os.popen(cmd).read().strip('\n')
        else:
            output = cmd
            os.system(cmd)
    else:
        try:
            process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            output, err = process.communicate()
        except OSError as error:
            if error.errno == errno.ENOENT:
                bot.error(error_message)
            else:
                # Bug fix: 'err' was unbound here (Popen raised before
                # communicate ran); report the OSError itself instead
                bot.error(str(error))
            return None

    return output
Exemple #7
0
    def call(self, url, func, data=None, return_json=True):
        '''call overrides post for the som api to add the option
        to refresh a token given a 401 response.
        :param func: the function (eg, post, get) to call
        :param url: the url to send file to
        :param data: additional data to add to the request
        :param return_json: return json if successful
        :returns: parsed json when return_json is True and the request
        succeeded (status 200), otherwise the raw response object
        '''
        # Serialize dict payloads for the request body
        if isinstance(data, dict):
            data = json.dumps(data)

        response = func(url=url, headers=self.headers, data=data)

        # Errored response, try again with refresh
        if response.status_code == 401:
            bot.warning("Expired token, refreshing...")
            self.token = refresh_access_token()
            self.update_headers()
            response = func(url, headers=self.headers, data=data)

        if response.status_code == 200:

            if return_json:
                try:
                    response = response.json()

                # Narrowed from a bare except, which would also swallow
                # KeyboardInterrupt/SystemExit; json parse failures land here
                except Exception:
                    bot.error(
                        "The server returned a malformed response. Are you on VPN?"
                    )
                    sys.exit(1)

        return response
Exemple #8
0
def validate_fields(acceptable_fields, actual_fields):
    '''Check that every entry of actual_fields is allowed.

    :param acceptable_fields: the whitelist of permitted field names
    :param actual_fields: the field names actually present
    :returns: True when no unexpected fields are found, False otherwise
    (the offending fields are reported via bot.error)
    '''
    unexpected = [field for field in actual_fields
                  if field not in acceptable_fields]
    if not unexpected:
        return True
    bot.error("Fields not allowed: %s" % ", ".join(unexpected))
    return False
Exemple #9
0
def validate_model(fields):
    '''validate_model filters a list of field entries down to those with
    a value set, and alerts the user about required fields left empty.
    :param fields: a list of dicts, each with 'key', 'value' and 'required'
    :returns: dict mapping each key to its (non-None) value
    '''
    keepers = dict()
    for entry in fields:
        if entry['value'] is not None:
            keepers[entry['key']] = entry['value']
        # A required field without a value is reported but not kept
        elif entry['required']:
            bot.error("Field %s is required for this entity." %
                      entry['key'])
    return keepers
Exemple #10
0
def read_client_secrets():
    '''Load client secrets from the json file named by the
    STANFORD_CLIENT_SECRETS environment variable.

    Exits with an error if the variable is unset or the file is missing.
    :returns: the parsed secrets dictionary
    '''
    token_file = os.environ.get("STANFORD_CLIENT_SECRETS", None)

    secrets = None
    if token_file is not None and os.path.exists(token_file):
        secrets = read_json(token_file)

    if secrets is None:
        bot.error(
            'Cannot find STANFORD_CLIENT_SECRETS credential file path in environment.'
        )
        sys.exit(1)
    return secrets
Exemple #11
0
def get_structures(inputs, build_dir=None, clean_up=True, fail_exit=True):
    '''get structures will parse one or more compressed files and/ or folder paths
    and return a data structure that has full file paths for images/text documents,
    and the loaded json for metadata.
    :param inputs: a single, or list of inputs, meaning folders and compressed files 
    for validation
    :param build_dir: a directory to use to extract and run things. If not specified,
    one is created.
    :param clean_up: boolean to determine if test_dir and subcontents should be
    removed after tests. Default is True.
    :param fail_exit: Given failure of validation, fail the process. Otherwise, return
    False to the calling function. Default fail_exit is True
    :returns: dict mapping each valid input to its structure
    '''
    if not isinstance(inputs, list):
        inputs = [inputs]

    bot.debug("Found %s inputs to structure using som-validator." %
              len(inputs))

    # Where are we testing?
    if build_dir is None:
        build_dir = tempfile.mkdtemp()

    # We will return a dict of structures, each a collection
    structures = dict()

    # Tell the user about testing folder
    message = "Building folder will be %s"
    if clean_up:
        message = "%s, and will be removed upon completion." % (message)
    bot.debug(message % build_dir)

    for testing in inputs:
        valid = validate_dataset(dataset=testing,
                                 testing_base=build_dir,
                                 clean_up=clean_up)

        # We only structure input that is valid
        if valid is False:
            if fail_exit:
                bot.error("Input %s is not valid, please fix. Exiting." %
                          testing)
                sys.exit(1)
            # Bug fix: the skip message previously left its %s unfilled
            bot.error("Input %s is not valid, skipping." % testing)
        else:
            # Bug fix: 'dataset' was an undefined name here; key results
            # on the input being structured
            structures[testing] = structure_dataset(dataset=testing,
                                                    testing_base=build_dir,
                                                    clean_up=clean_up)

    return structures
Exemple #12
0
def run_validation(inputs, test_dir=None, clean_up=True, fail_exit=True):
    '''run validation will run one or more inputs through the validation procedure,
    meaning checking that the folder (and other data structures) fit the WordFish
    standard: http://www.github.com/radinformatics/wordfish-standard
    :param inputs: a single, or list of inputs, meaning folders and compressed files 
    for validation
    :param test_dir: a directory to use to extract and run things. If not specified,
    one is created.
    :param clean_up: boolean to determine if test_dir and subcontents should be
    removed after tests. Default is True.
    :param fail_exit: Given failure of validation, fail the process. Otherwise, return
    False to the calling function. Default fail_exit is True
    :returns: True if all inputs validate, False otherwise (when fail_exit
    is False)
    '''
    if not isinstance(inputs, list):
        inputs = [inputs]

    bot.debug("Found %s inputs to test using som-validator." % len(inputs))

    # Where are we testing?
    if test_dir is None:
        test_dir = tempfile.mkdtemp()

    # Tell the user about testing folder
    message = "Testing folder will be %s"
    if clean_up:
        message = "%s, and will be removed upon completion." % (message)
    bot.debug(message % test_dir)

    # Bug fix: initialize so 'valid' cannot be unbound when inputs is empty
    # or an input matches neither branch below
    valid = True
    for testing in inputs:
        if os.path.isdir(testing):
            valid = validate_folder(folder=testing)
        elif re.search("[.]zip$|[.]tar[.]gz$", testing):
            valid = validate_compressed(compressed_file=testing,
                                        testing_base=test_dir,
                                        clean_up=clean_up)
        else:
            # Bug fix: unrecognized inputs previously fell through with a
            # stale (or unbound) 'valid'
            bot.error("Input %s is not a folder or compressed file." %
                      testing)
            valid = False

        # Always exit or return False if input is not valid
        if valid is False:
            if fail_exit:
                bot.error(
                    "Input %s is not valid, please fix and retest. Exiting." %
                    testing)
                sys.exit(1)
            bot.error(
                "Input %s is not valid, please fix and retest. Returning False."
                % testing)
            return valid

    return valid
Exemple #13
0
def get_metadata(key):
    '''get_metadata will return metadata about an instance from within it.
    :param key: the key to look up
    :returns: the metadata value text, or None on a non-200 response
    '''
    # The metadata server requires this header on every request
    lookup_url = ("http://metadata.google.internal/computeMetadata"
                  "/v1/instance/attributes/%s" % key)
    response = api_get(url=lookup_url, headers={"Metadata-Flavor": "Google"})

    # A successful query returns the result text
    if response.status_code == 200:
        return response.text

    bot.error("Error retrieving metadata %s, returned response %s" %
              (key, response.status_code))
    return None
Exemple #14
0
    def query_key(self, kind, keys, **kwargs):
        '''query_key is an entry to query that adds a key filter to the query
        :param kind: the datastore kind to query
        :param keys: one or more keys (parsed with parse_keys) to filter on
        Remaining kwargs are forwarded to self.query. An 'operator' kwarg
        (default '>') chooses the key filter comparison.
        '''
        # Bug fix: 'operator' was referenced but never defined (NameError);
        # read it from kwargs so it is also not forwarded to self.query
        operator = kwargs.pop('operator', None)
        if operator is None:
            operator = '>'

        if operator not in get_key_filters():
            bot.error("%s is not a valid operator." % operator)
            sys.exit(1)

        keys = parse_keys(keys)
        query = self.client.query(kind=kind)
        for key in keys:
            query.key_filter(key, operator)

        # NOTE(review): the filtered 'query' built above is never handed to
        # self.query -- confirm whether self.query should receive it
        return self.query(**kwargs)
Exemple #15
0
def validate_item(item, sources=None, verbose=True):
    '''validate_item will validate a single item object, intended to go in as a field to
    a POST. For more than one item, use validate_items wrapper
    :param item: the item object. Must include the following:
    :param sources: a list of valid item sources (eg ["pacs"])
    :param verbose: if True, prints out valid True messages. False (errors) always printed

    :: notes

    {
        # generic attribute/values just to store, not interpreted
        "id":"123123123123123", // mandatory

        # the issuer for the above id
        # mandatory, or maybe optional with default of "stanford" or "unspecified"
        "id_source":"pacs",

        # when the id was looked up, to help with changed/merged ids
        # optional with default of current timestamp?
        "id_timestamp":"2016-01-30T17:15:23.123Z",

        # optional key/value attributes, will be stored as-is if provided, but not used or interpreted
        # values will be updated/replaced if sent multiple times (last received wins)
        # any existing values will be preserved if not explicitly set here; set empty string to remove
        "custom_fields":{
          "image_type":"x-ray",
          "resolution":"high"
    }
    '''
    if sources is None:
        sources = item_sources

    # These are the rules for an item
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # pattern
        "id_source": [Required, In(sources)],       # must be in item sources
        "id_timestamp": [Required, Pattern(timestamp)],
    }

    valid, message = validate(rules, item)
    if verbose:
        bot.debug("identifier %s data structure valid: %s" %
                  (item['id'], valid))
    if not valid:
        bot.error(message)
        if verbose:
            print(item)
    return valid
Exemple #16
0
def structure_entities(full_path):
    '''structure_entities will return a data structure with a list of
    images and text for each entity found. 
    :param full_path: the full path to the collection folder with entities
    :returns: a list of {"entity": {...}} records, or None if the
    collection folder is empty

    An entity should look like the following:    
        { "entity": {
         
            "id": "12345-6",
            "images": [ ... ],
            "text": [ ... ] 
          }

        },
    '''
    entities = []
    contenders = os.listdir(full_path)
    bot.info("Found %s entity folders in collection." % len(contenders))
    if not contenders:
        return None

    for contender in contenders:
        entity_path = "%s/%s" % (full_path, contender)
        entity = {'id': entity_path}

        # Does the entity have metadata?
        metadata = structure_metadata(entity_path, "entity")
        if metadata is not None:
            entity['metadata'] = metadata

        entity_texts = structure_texts(entity_path)
        entity_images = structure_images(entity_path)

        # An entity with neither images nor text is invalid, skip it
        if entity_texts is None and entity_images is None:
            bot.error("found invalid entity: does not have images or text.")
            continue

        # Keep whichever of texts/images the entity actually has
        if entity_texts is not None:
            entity['texts'] = entity_texts
        if entity_images is not None:
            entity['images'] = entity_images
        entities.append({"entity": entity})

    return entities
Exemple #17
0
def receive_items(items):
    '''receive items will validate reception of an items list.
    :param items: the items list from a response
    :returns: True if all items validate, False otherwise

     HTTP 200

        'items': [
                   {
                    'id_source': 'GE PACS', 
                    'jittered_timestamp': '2016-01-15T17:15:23.123Z', 
                    'id': 'A654321', 
                    'suid': '103e'
                    }
         ]

    '''
    expected_fields = [
        'id_source', 'jittered_timestamp', 'suid', 'id', 'custom_fields'
    ]

    if not isinstance(items, list):
        bot.error("Items must be a list")
        return False

    # These are the rules for each uidEntity
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # pattern
        "suid": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # the suid
        "id_source": [Required, In(item_sources)],  # must be in person sources
        "jittered_timestamp": [Required, Pattern(timestamp)]
    }

    # Bug fix: initialize so an empty items list returns True instead of
    # raising NameError at the final return
    valid = True
    for item in items:

        # Validate required fields
        valid, message = validate(rules, item)
        if valid is False:
            bot.error(message)
            return valid

        # Validate fields returned in response
        if not validate_fields(expected_fields, item.keys()):
            return False

    return valid
Exemple #18
0
def get_universal_source(source, comparator):
    '''get universal source will, given a source item (a list or single item), check to
    see if the length of a comparator is equivalent. The comparator is expected to be a list
    the same length as the source.
    :param source: the source item (single item or list)
    :param comparator: the thing to compare to
    :returns: EITHER a universal source (meaning one item to use for all items in source) OR None,
    meaning the comparator should be treated as a list, and source[i] matched to comparator[i]
    '''
    universal_source = None
    if isinstance(source, list):
        if len(source) != len(comparator):
            bot.error("Mismatch in length of source (%s) and comparator (%s). Exiting"
                      % (len(source), len(comparator)))
            sys.exit(1)
    else:
        # Otherwise, we assume a common source: the single item itself.
        # Bug fix: this previously returned the module global 'id_sources',
        # contradicting the documented contract above
        universal_source = source
    return universal_source
Exemple #19
0
def add_tag(dicom,name,value):
    '''add tag will add a tag only if it's in the (active) DicomDictionary
    :param dicom: the pydicom.dataset Dataset (pydicom.read_file)
    :param name: the name of the field to add
    :param value: the value to set, if name is a valid tag
    :returns: the dicom Dataset (modified in place when the tag is valid)
    '''
    # Basename of the source file, used only in the log messages below
    dicom_file = os.path.basename(dicom.filename)
    tag = get_tag(name)

    # NOTE(review): assumes get_tag returns a dict that contains the field
    # name as a key AND carries 'tag'/'VR' entries -- the membership test
    # and the lookups below both rely on that shape; confirm with get_tag
    if name in tag:
        dicom.add_new(tag['tag'], tag['VR'], value)

        # dicom.data_element("PatientIdentityRemoved")
        # (0012, 0062) Patient Identity Removed            CS: 'Yes'

        bot.debug("ADDITION %s to %s." %(dicom.data_element(name),dicom_file))
    else:
        bot.error("%s is not a valid field to add. Skipping." %(name))

    return dicom
Exemple #20
0
def structure_compressed(compressed_file, testing_base=None, clean_up=False):
    '''structure_compressed will first decompress a file to a temporary location,
    and then return the file structure in the WordFish standard. 
    :param compressed_file: the file to first extract.
    :param testing_base: If not given, a temporary location will be created. Otherwise,
    a folder will be made in testing_base.
    :param clean_up: clean up (remove) extracted files/folders after test. Default False,
    so the user can access the extracted files.
    :returns: a list of collection structures, one per extracted folder
    '''
    if testing_base is None:
        testing_base = tempfile.mkdtemp()

    dest_dir = tempfile.mkdtemp(prefix="%s/" % testing_base)
    if compressed_file.endswith('.tar.gz'):
        test_folder = untar_dir(compressed_file, dest_dir)

    elif compressed_file.endswith('.zip'):
        test_folder = unzip_dir(compressed_file, dest_dir)

    else:
        bot.error("Invalid compressed file type: %s, exiting." %
                  compressed_file)
        sys.exit(1)

    # Each object in the folder (a collection)
    collection_paths = os.listdir(test_folder)
    bot.info("collections found: %s" % len(collection_paths))

    # We will return a list of structures, only of valid
    collections = []

    for col in collection_paths:
        collection_path = "%s/%s" % (test_folder, col)
        collection = structure_folder(collection_path)
        collections.append(collection)

    if clean_up:
        shutil.rmtree(dest_dir)
    return collections
Exemple #21
0
    def _validate_table(self, table, schema_required=True):
        ''' ensure that the user has provided a table, with the following
            order of preference:

            1. First preference goes to table provided at runtime
            2. Second preference goes to BatchManager table
            3. If not found, fail

         Parameters
         ==========
         table (required) to validate
         schema_required: Does the table, if found, require a schema?

         Returns
         =======
         If valid, the valid table. If not, returns None

        '''

        # First preference to table provided at runtime
        active_table = table
        if active_table is None:
            active_table = self.table

        # Second (above) to BatchManagers, then error)
        if active_table is None:
            bot.error("Please provide a table to the Manager or function.") 
            return False

        # The table must have a schema
        if schema_required is True:
            if table.schema in ["",[]]: 
                bot.error("Table must be defined with a schema before batch insert.") 
                active_table = None

        return active_table
Exemple #22
0
def receive_identifiers(response):
    '''receive identifiers will validate reception of an identifiers response.
    This should be a list
    :param response: the response list of identifiers
    :returns: True if the whole response validates, False otherwise

    :: notes
     successful response:

     HTTP 200

    [
       {'jittered_timestamp': '2016-01-30T17:15:23.123Z', 
        'id': '12345678', 
        'suid': '103e', 
        'id_source': 'Stanford MRN', 
        'custom_fields': [
             {'key': 'studySiteID', 'value': '78329'}], 
        'items': [
                   {
                    'id_source': 'GE PACS', 
                    'jittered_timestamp': '2016-01-15T17:15:23.123Z', 
                    'id': 'A654321', 
                    'suid': '103e'}
                   ]}

    ]

    '''
    # These fields are expected, but not required. We will error
    # if any fields are present outside this scope
    expected_fields = [
        'items', 'id_source', 'jittered_timestamp', 'suid', 'id',
        'custom_fields'
    ]

    if not isinstance(response, list):
        bot.error("Response must be a list")
        return False

    # These are the rules for each uidEntity
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # pattern
        "suid": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # the suid
        "id_source": [Required,
                      In(identifier_sources)],  # must be in identifer sources
        "jittered_timestamp": [Required, Pattern(timestamp)]
    }

    # Bug fix: initialize so an empty response does not raise NameError
    # at the debug/return below
    valid = True
    for item in response:

        # Validate required fields
        valid, message = validate(rules, item)
        if valid is False:
            bot.error(message)
            return valid

        # Validate fields returned in response
        if not validate_fields(expected_fields, item.keys()):
            return False

        # Validate items
        if "items" in item:
            if not receive_items(item['items']):
                return False

    bot.debug("Identifiers data structure valid: %s" % valid)
    return valid
Exemple #23
0
def validate_identifiers(identifiers, id_sources=None, item_sources=None, verbose=True):
    '''validate_identifiers will validate one or more identifier objects, 
    intended to go in as a field to a POST
    :param identifiers: the identifiers object.
    :param verbose: verbose output for items
    :param id_sources: a list of one or more identifier sources. 
    :param item_sources: a list of one or more item sources
    If either not defined, default to use standards
    :returns: True if valid, False otherwise

    :: notes
       {
        # mandatory key for uniquely identifying the person
        "id":"1234567-8",
       
        # the issuer for the above id
        # mandatory, or maybe optional with default of "stanford" or "unspecified"
        
        "id_source":"stanford",
        # when the id was looked up, to help with changed/merged ids
        # optional with default of current timestamp?
        
        "id_timestamp":"2016-01-30T17:15:23.123Z",
        # optional key/value attributes, will be stored as-is if provided, but not used or interpreted
        # values will be updated/replaced if sent multiple times (last received wins)
        # any existing values will be preserved if not explicitly set here; set empty string to remove
        
        "custom_fields":{
          "first_name":"Joe",
          "last_name":"Smith",
          "dob":"1970-02-28"
        }
    '''
    if id_sources is None:
        id_sources = identifier_sources

    # These are the rules for a person
    rules = {
        "id": [Required, Pattern("^[A-Za-z0-9_-]*$")],  # pattern
        "id_source": [Required, In(id_sources)],    # must be in person sources
        "id_timestamp": [Required, Pattern(timestamp)],
    }

    if not isinstance(identifiers, dict):
        bot.error("Identifiers data structure must be dictionary.")
        return False

    if "identifiers" not in identifiers:
        bot.error("identifiers key not found in data structure.")
        # Bug fix: previously fell through to a guaranteed KeyError below
        return False

    items = identifiers['identifiers']

    if not isinstance(items, list):
        bot.error("Items in identifiers data structure must be list.")
        return False

    # Bug fix: initialize so an empty identifiers list cannot leave
    # 'valid' unbound at the return below
    valid = True
    for item in items:

        valid, message = validate(rules, item)
        if valid is False:
            bot.error(message)
            return valid

        if "items" in item:
            validate_items(item['items'], sources=item_sources)

    bot.debug("Identifiers data structure valid: %s" % valid)
    return valid
Exemple #24
0
def progress_download(collection_name,
                      output_folder,
                      suid,
                      project,
                      bucket_name,
                      query_entity=True,
                      filters=None):

    '''
    show progress while downloading images for a Collection/[c]/Entity/study 
    
    Parameters
    ==========

    collection_name: the name of the collection, typically an IRB number
    output_folder: the base directory to create a study folder in
    project: Google Cloud project name
    suid: an suid of interest to query (eg, if querying an Entity, you would
          use the suid of the patient, an Image would be an suid of the
          study SUID --> (coded accession#)
    query_entity: by default, we query the entity first, and then get images.
                  to query the images (studies) set this to False.
    bucket_name: the name for the Google Storage Bucket (usually provided)
    filters: a list of tuples to apply to filter the query. Default is:

         [ ("entity_id","=", study) ]

    to retrieve all Image items that are equal to the study name

    Returns
    =======
    path to newly created image file

    '''

    # Default filter depends on whether we query by entity or by study
    if filters is None:
        if query_entity is True:
            filters = [ ("uid","=", suid) ]
        else:
            filters = [ ("AccessionNumber","=", suid) ]

    bot.info("Collecting available images...")

    # Credentials come from GOOGLE_APPLICATION_CREDENTIALS in the environment
    try:
        storage_client = storage.Client()

    except DefaultCredentialsError:
        bot.error("We didn't detect your GOOGLE_APPLICATION_CREDENTIALS in the environment! Did you export the path?")
        sys.exit(1)
    except Forbidden:
        bot.error("The service account specified by GOOGLE_APPLICATION_CREDENTIALS does not have permission to use this resource.")
        sys.exit(1)

    if not os.path.exists(output_folder):
        os.mkdir(output_folder)

    bucket = storage_client.get_bucket(bucket_name)

    # Retrieve bucket, datastore client, images
    requester = RetryRequester(bucket_name=bucket_name,
                               project=project)

    collection = requester.create_collection(collection_name)

    # Entity-first: gather images across all matching entities, de-duplicated;
    # otherwise query images (studies) directly with the filters
    if query_entity is True:
        entity_set = requester.get_entity(filters)
        images = []
        for entity in entity_set:
            entity_images = requester.client.get_images(entity=entity)
            images = [x for x in entity_images if x not in images]
    else:
        images = requester.get_images(filters)
    
    bot.info("Found %s images for suid %s in collection %s" %(len(images),
                                                             suid,
                                                             collection_name))
    
    # Progress bar state: one tick per downloaded image
    progress = 0
    total = len(images)

    files = []
    if len(images) > 0:
        bot.debug("Saving images and metadata...")
        for image in images:

            # Download image
            file_name = prepare_folders(output_folder=output_folder,
                                        image_name=image.key.name)
            
            # Each image record carries the blob name under 'storage_name'
            blob = bucket.blob(image['storage_name'])
            bot.show_progress(progress, total, length=35)
            requester.download(blob,file_name)
            # Track both the image file and its sidecar metadata file
            files.append(file_name)
            files.append(save_metadata(image,file_name))
            progress+=1
            bot.show_progress(progress,total,length=35)

        # Newline to finish
        sys.stdout.write('\n')

    return files
Exemple #25
0
def validate_template(entity_path, template_type, acceptable_types):
    '''validate_template will check an entity directory
    for a folder of a particular type, for files and metadata that
    meet a particular criteria. If needed, additional parsing
    functions can be passed to this function.
    :param entity_path: the path to the top level (entity) folder
    :param template_type: should be one of images or text
    :param acceptable_types: the valid extensions to allow
    :returns: True if all discovered metadata is valid, False otherwise,
              or None when the template folder / valid files are missing
    '''
    valid = True
    template_path = "%s/%s" % (entity_path, template_type)
    entity_name = os.path.basename(entity_path)
    if not os.path.exists(template_path):
        bot.info("entity %s does not have %s." % (entity_name, template_type))
        return None

    # Let's keep track of each file
    all_folders = os.listdir(template_path)
    valids = []  # valid files
    others = []  # Not valid as metadata or accepted

    # Find all valid images: extension is everything after the first '.'
    # (so "x.nii.gz" has ext "nii.gz")
    for folder in all_folders:
        folder_path = "%s/%s" % (template_path, folder)
        all_files = os.listdir(folder_path)
        for single_file in all_files:
            file_path = "%s/%s" % (folder_path, single_file)
            parts = single_file.split('.')
            ext = '.'.join(parts[1:])
            if ext in acceptable_types:
                valids.append(file_path)
            else:
                others.append(file_path)

    # Warn the user about missing valid files, not logical given folder
    if len(valids) == 0:
        bot.warning("entity %s does not have %s." %
                    (entity_name, template_type))
        return None
    else:
        bot.info("entity %s has %s %s" %
                 (entity_name, len(valids), template_type))

    # Parse through the "others" and alert user about invalid file
    valid_metadata = 0
    invalid_metadata = 0
    skipped_files = 0

    # Assess each valid for metadata
    for contender in valids:
        if validate_metadata(contender, template_type) == False:
            bot.error("metadata %s for entity %s is invalid" %
                      (contender, entity_name))
            invalid_metadata += 1
            valid = False
        else:
            valid_metadata += 1

    # BUG FIX: this was previously a for/else clause, which (with no break
    # in the loop) always ran exactly once and warned about the last valid
    # file. The skipped count must come from the "others" list instead.
    for skipped in others:
        skipped_files += 1
        bot.warning("%s for %s/%s is not valid for import and is ignored" %
                    (skipped, entity_name, template_type))

    bot.info(
        "found %s valid metadata, %s invalid metadata, and %s skipped files for %s"
        % (valid_metadata, invalid_metadata, skipped_files, entity_name))
    return valid
Exemple #26
0
    def query(self,
              kind=None,
              filters=None,
              order=None,
              projections=None,
              run=True,
              limit=None,
              keys_only=False,
              distinct_on=None,
              query=None,
              ancestor=None):
        '''query will run a query for some kind of model (eg 'Collection')
        :param kind: the kind of model to query
        :param filters: a list of lists of tuples, each length 3, with

             FIELD OPERATOR VALUE
             "description" "=" "this is a description"

        :param order: optional one or more orderings -descending / ascending
        :param projections: if defined, return a dictionary of lists of each
        :param run: Default true to execute query. False returns query object
        :param limit: an integer limit of results to return
        :param keys_only: if True, fetch only entity keys, not full entities
        :param distinct_on: one or more fields to make distinct
        :param query: if provided, will not start with basic query
        :param ancestor: if provided, query for the provided ancestor

             ANCESTOR EXAMPLE
             ancestor = client.key('TaskList', 'default')

        :returns: a list of results, the query object (run=False), a
                  projection dict (projections set), or None on query error
        '''
        if ancestor is not None:
            # Allow passing key path components instead of a Key object.
            # NOTE(review): a plain str here is splatted into single
            # characters by key(*ancestor) — presumably callers pass a
            # list/tuple path; confirm str handling against callers.
            if isinstance(ancestor, (list, str)):
                ancestor = self.client.key(*ancestor)

        if query is None:
            if kind is None:
                bot.error("You must define 'kind' to run a query.")
                sys.exit(1)
            query = self.client.query(
                kind=kind.capitalize(),
                ancestor=ancestor)  # uppercase first letter

        if distinct_on is not None:
            if not isinstance(distinct_on, list):
                distinct_on = [distinct_on]
            query.distinct_on = distinct_on

        # Add filters to the query
        if filters is not None:
            for f in filters:
                query.add_filter(f[0], f[1], f[2])

        # Apply order to the query (eg, ['-priority'])
        if order is not None:
            if not isinstance(order, list):
                order = [order]
            order = [x.lower() for x in order]
            query.order = order

        # Return a projection, an extraction of fields from a set
        if projections is not None:
            return self.projection(query, projections)

        if keys_only is True:
            query.keys_only()

        # Do we not want to run the query?
        if not run:
            return query

        result = None

        try:
            result = [x for x in query.fetch(limit=limit)]

        except (BadRequest, GrpcRendezvous):
            # Query errors are non-fatal: signal failure by returning None
            bot.error("Error with query.")

        return result