def get_listing(callback):
    logger.info('Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    callback()
    resp = r.json()
    if 'object' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.info('Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head),
                     params={'recursive': 1})
    callback()
    resp = r.json()
    if 'tree' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    models = {}
    for el in resp['tree']:
        components = el['path'].split('/')
        # a two-component path ('type/name') marks a new model
        if len(components) == 2:
            models[components[1]] = {'type': components[0]}
        if len(components) > 2 and components[2] == 'DESCRIPTION':
            logger.info('Retrieving description for {}'.format(components[1]))
            r = requests.get(el['url'])
            if not r.ok:
                raise KrakenRepoException('{}: {}'.format(r.status_code, r.json()['message']))
            # reuse the response instead of fetching the blob a second time
            raw = base64.b64decode(r.json()['content']).decode('utf-8')
            callback()
            try:
                models[components[1]].update(json.loads(raw))
            except ValueError:
                # drop models with malformed DESCRIPTION files
                del models[components[1]]
        elif len(components) > 2 and components[1] in models:
            models[components[1]]['model'] = el['url']
    return models

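
# Usage sketch (hypothetical helper, not part of kraken): the legacy
# GitHub-backed functions above only require a zero-argument callable that is
# invoked once per HTTP request or download chunk, e.g. to advance a spinner.
def _example_legacy_listing() -> dict:
    # print a dot for every request made while walking the repository tree
    return get_listing(lambda: print('.', end='', flush=True))
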
def get_description(model_id):
    logger.info(u'Retrieving metadata for {}'.format(model_id))
    logger.debug(u'Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    resp = r.json()
    if 'object' not in resp:
        logger.error(u'No \'object\' field in repo head API response.')
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.debug(u'Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head),
                     params={'recursive': 1})
    resp = r.json()
    if 'tree' not in resp:
        logger.error(u'No \'tree\' field in repo API response.')
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    for el in resp['tree']:
        components = el['path'].split('/')
        if len(components) > 2 and components[1] == model_id and components[2] == 'DESCRIPTION':
            logger.debug(u'Retrieving description for {}'.format(components[1]))
            raw = base64.b64decode(requests.get(el['url']).json()['content']).decode('utf-8')
            return defaultdict(str, json.loads(raw))
    # no DESCRIPTION entry found for this model identifier
    return None

def get_model(model_id, path, callback):
    logger.info('Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    callback()
    resp = r.json()
    if 'object' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.info('Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head),
                     params={'recursive': 1})
    callback()
    resp = r.json()
    if 'tree' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    url = None
    for el in resp['tree']:
        components = el['path'].split('/')
        if len(components) > 2 and components[1] == model_id and components[2] == 'DESCRIPTION':
            logger.info('Retrieving description for {}'.format(components[1]))
            raw = base64.b64decode(requests.get(el['url']).json()['content']).decode('utf-8')
            desc = json.loads(raw)
            # git trees are sorted, so the upper-case DESCRIPTION entry is seen
            # before the model blob and spath is defined before the download
            spath = os.path.join(path, desc['name'])
        elif len(components) > 2 and components[1] == model_id:
            url = el['url']
            break
    if not url:
        raise KrakenRepoException('No such model known')
    with closing(requests.get(url,
                              headers={'Accept': 'application/vnd.github.v3.raw'},
                              stream=True)) as r:
        with open(spath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                callback()
                f.write(chunk)

def get_listing(callback: Callable[..., Any] = lambda: None) -> dict:
    """
    Fetches a listing of all kraken models from the zenodo repository.

    Args:
        callback (Callable): Function called after each HTTP request.

    Returns:
        Dict mapping model DOIs to their metadata.
    """
    logger.info('Retrieving model list')
    records = []
    r = requests.get('{}{}'.format(MODEL_REPO, 'records'),
                     params={'communities': 'ocr_models'})
    r.raise_for_status()
    callback()
    resp = r.json()
    if not resp['hits']['total']:
        logger.error('No models found in community \'ocr_models\'')
        raise KrakenRepoException('No models found in repository \'ocr_models\'')
    logger.debug('Total of {} records in repository'.format(resp['hits']['total']))
    records.extend(resp['hits']['hits'])
    while 'next' in resp['links']:
        logger.debug('Fetching next page')
        r = requests.get(resp['links']['next'])
        r.raise_for_status()
        callback()
        resp = r.json()
        logger.debug('Found {} new records'.format(len(resp['hits']['hits'])))
        records.extend(resp['hits']['hits'])
    logger.debug('Retrieving model metadata')
    models = {}
    # fetch metadata.json for each model
    for record in records:
        if 'keywords' not in record['metadata']:
            continue
        model_type = SUPPORTED_MODELS.intersection(record['metadata']['keywords'])
        if not model_type:
            continue
        for file in record['files']:
            if file['key'] == 'metadata.json':
                callback()
                r = requests.get(file['links']['self'])
                r.raise_for_status()
                try:
                    metadata = r.json()
                except ValueError:
                    msg = 'Metadata for \'{}\' ({}) not in JSON format'.format(record['metadata']['title'],
                                                                               record['metadata']['doi'])
                    logger.error(msg)
                    raise KrakenRepoException(msg)
                # merge metadata.json into DataCite
                key = record['metadata']['doi']
                models[key] = record['metadata']
                models[key].update({'graphemes': metadata['graphemes'],
                                    'summary': metadata['summary'],
                                    'script': metadata['script'],
                                    'link': record['links']['latest'],
                                    'type': [x.split('_')[1] for x in model_type]})
    return models

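
# Usage sketch (hypothetical helper, not part of kraken): list every model in
# the 'ocr_models' community keyed by DOI, printing a dot per HTTP request.
def _example_list_models() -> None:
    models = get_listing(callback=lambda: print('.', end='', flush=True))
    for doi, meta in models.items():
        # 'summary' is merged in from each model's metadata.json above
        print(doi, meta['summary'])
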
def get_description(model_id: str, callback: Callable[..., Any] = lambda: None) -> dict:
    """
    Fetches the metadata for a single model from the zenodo repository.

    Args:
        model_id (str): DOI of the model.
        callback (Callable): Optional function called once per HTTP request.

    Returns:
        Dict
    """
    logger.info('Retrieving metadata for {}'.format(model_id))
    r = requests.get('{}{}'.format(MODEL_REPO, 'records'),
                     params={'q': 'doi:"{}"'.format(model_id)})
    r.raise_for_status()
    callback()
    resp = r.json()
    if resp['hits']['total'] != 1:
        logger.error('Found {} models when querying for id \'{}\''.format(resp['hits']['total'], model_id))
        raise KrakenRepoException('Found {} models when querying for id \'{}\''.format(resp['hits']['total'], model_id))
    record = resp['hits']['hits'][0]
    metadata = record['metadata']
    if 'keywords' not in metadata:
        logger.error('No keywords included on deposit')
        raise KrakenRepoException('No keywords included on deposit.')
    model_type = SUPPORTED_MODELS.intersection(metadata['keywords'])
    if not model_type:
        msg = 'Unsupported model type(s): {}'.format(', '.join(metadata['keywords']))
        logger.error(msg)
        raise KrakenRepoException(msg)
    meta_json = None
    for file in record['files']:
        if file['key'] == 'metadata.json':
            callback()
            r = requests.get(file['links']['self'])
            r.raise_for_status()
            try:
                meta_json = r.json()
            except ValueError:
                msg = 'Metadata for \'{}\' ({}) not in JSON format'.format(record['metadata']['title'],
                                                                           record['metadata']['doi'])
                logger.error(msg)
                raise KrakenRepoException(msg)
    if not meta_json:
        msg = 'No metadata.json found for \'{}\' ({})'.format(record['metadata']['title'],
                                                              record['metadata']['doi'])
        logger.error(msg)
        raise KrakenRepoException(msg)
    # merge metadata.json into DataCite
    metadata.update({'graphemes': meta_json['graphemes'],
                     'summary': meta_json['summary'],
                     'script': meta_json['script'],
                     'link': record['links']['latest'],
                     'type': [x.split('_')[1] for x in model_type],
                     'accuracy': meta_json['accuracy']})
    return metadata

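
# Usage sketch (hypothetical helper, not part of kraken): fetch and print the
# 'summary' and 'accuracy' fields merged in above. The default DOI is a
# placeholder, not a real deposit.
def _example_describe(doi: str = '10.5281/zenodo.NNNNNNN') -> None:
    desc = get_description(doi)
    print(desc['summary'], desc['accuracy'])
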
def get_model(model_id: str, path: str, callback: Callable[..., Any] = lambda: None) -> str:
    """
    Retrieves a model and saves it to a path.

    Args:
        model_id (str): DOI of the model
        path (str): Destination to write model to.
        callback (Callable): Function called for every 1024 octet chunk received.

    Returns:
        The identifier the model can be called through on the command line.
        Will usually be the file name of the model.
    """
    logger.info('Saving model {} to {}'.format(model_id, path))
    r = requests.get('{}{}'.format(MODEL_REPO, 'records'),
                     params={'q': 'doi:"{}"'.format(model_id)})
    r.raise_for_status()
    callback()
    resp = r.json()
    if resp['hits']['total'] != 1:
        logger.error('Found {} models when querying for id \'{}\''.format(resp['hits']['total'], model_id))
        raise KrakenRepoException('Found {} models when querying for id \'{}\''.format(resp['hits']['total'], model_id))
    metadata = resp['hits']['hits'][0]
    model_url = [x['links']['self'] for x in metadata['files'] if x['type'] == 'mlmodel'][0]
    # callable model identifier
    nat_id = os.path.basename(urllib.parse.urlparse(model_url).path)
    spath = os.path.join(path, nat_id)
    logger.debug('downloading model file {} to {}'.format(model_url, spath))
    with closing(requests.get(model_url, stream=True)) as r:
        with open(spath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                callback()
                f.write(chunk)
    return nat_id

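
# Hedged usage note: the identifier returned above is simply the downloaded
# file's name, which kraken's command line tools can then reference as the
# model argument (exact invocation depends on the kraken version, roughly
# `kraken ... ocr -m <nat_id>`).
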
def get_model(model_id: str, path: str, callback: Callable[..., Any]) -> None:
    """
    Retrieves a model and saves it to a path.

    Args:
        model_id (str): Identifier of the model
        path (str): Destination to write model to.
        callback (Callable): Function called for every 1024 octet chunk received.
    """
    logger.info(u'Saving model {} to {}'.format(model_id, path))
    logger.debug(u'Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    callback()
    resp = r.json()
    if 'object' not in resp:
        logger.error(u'No \'object\' field in repo head API response.')
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.debug(u'Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head),
                     params={'recursive': 1})
    callback()
    resp = r.json()
    if 'tree' not in resp:
        logger.error(u'No \'tree\' field in repo API response.')
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    url = None
    for el in resp['tree']:
        components = el['path'].split('/')
        if len(components) > 2 and components[1] == model_id and components[2] == 'DESCRIPTION':
            logger.debug(u'Retrieving description for {}'.format(components[1]))
            raw = base64.b64decode(requests.get(el['url']).json()['content']).decode('utf-8')
            desc = json.loads(raw)
            spath = os.path.join(path, desc['name'])
        elif len(components) > 2 and components[1] == model_id:
            url = el['url']
            break
    if not url:
        logger.error(u'Model {} not in repository.'.format(model_id))
        raise KrakenRepoException('Model {} not in repository'.format(model_id))
    with closing(requests.get(url,
                              headers={'Accept': 'application/vnd.github.v3.raw'},
                              stream=True)) as r:
        with open(spath, 'wb') as f:
            logger.debug(u'Downloading model')
            for chunk in r.iter_content(chunk_size=1024):
                callback()
                f.write(chunk)

def get_model(model_id: str,
              path: str,
              callback: Callable[[int, int], Any] = lambda total, advance: None) -> str:
    """
    Retrieves a model and saves it to a path.

    Args:
        model_id (str): DOI of the model
        path (str): Destination to write model to.
        callback (Callable): Function called with the total file size and the
            number of octets received for every 1024 octet chunk.

    Returns:
        The identifier the model can be called through on the command line.
        Will usually be the file name of the model.
    """
    logger.info(f'Saving model {model_id} to {path}')
    r = requests.get(f'{MODEL_REPO}records', params={'q': f'doi:"{model_id}"'})
    r.raise_for_status()
    callback(0, 0)
    resp = r.json()
    if resp['hits']['total'] != 1:
        logger.error(f'Found {resp["hits"]["total"]} models when querying for id \'{model_id}\'')
        raise KrakenRepoException(f'Found {resp["hits"]["total"]} models when querying for id \'{model_id}\'')
    metadata = resp['hits']['hits'][0]
    model_url = [x['links']['self'] for x in metadata['files'] if x['type'] == 'mlmodel'][0]
    # callable model identifier
    nat_id = os.path.basename(urllib.parse.urlparse(model_url).path)
    spath = os.path.join(path, nat_id)
    logger.debug(f'downloading model file {model_url} to {spath}')
    with closing(requests.get(model_url, stream=True)) as r:
        file_size = int(r.headers['Content-length'])
        with open(spath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                callback(file_size, len(chunk))
                f.write(chunk)
    return nat_id

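
# Usage sketch (hypothetical helper, not part of kraken): a byte-count
# progress callback matching the (total, advance) signature expected by the
# final get_model() variant above. The DOI below is a placeholder.
def _example_progress(total: int, advance: int) -> None:
    # accumulate received octets on the function object itself
    _example_progress.done = getattr(_example_progress, 'done', 0) + advance
    if total:
        print(f'\r{_example_progress.done}/{total} bytes', end='', flush=True)

# nat_id = get_model('10.5281/zenodo.NNNNNNN', '.', _example_progress)
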