Beispiel #1
0
def get_listing(callback):
    """
    Fetches a listing of all models from the git tree of the model repository.

    Args:
        callback (callable): Function called without arguments after each
                             successful HTTP request.

    Returns:
        Dict keyed by the second path component of each tree entry. Each value
        holds 'type' (the first path component), the fields of the parsed
        DESCRIPTION file, and 'model' (the URL of a non-DESCRIPTION entry
        below the model directory) when such an entry follows in the tree.

    Raises:
        KrakenRepoException: if the head or tree response lacks the expected
                             field or a DESCRIPTION request fails.
    """
    logger.info('Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    callback()
    resp = r.json()
    if 'object' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.info('Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head), params={'recursive': 1})
    callback()
    resp = r.json()
    if 'tree' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    models = {}
    for el in resp['tree']:
        components = el['path'].split('/')
        # new model
        if len(components) == 2:
            models[components[1]] = {'type': components[0]}
            continue
        if len(components) < 3:
            continue
        name = components[1]
        if components[2] == 'DESCRIPTION':
            logger.info('Retrieving description for {}'.format(name))
            r = requests.get(el['url'])
            if not r.ok:
                raise KrakenRepoException('{}: {}'.format(r.status_code, r.json()['message']))
            # reuse the response fetched above instead of requesting the blob
            # a second time
            raw = base64.b64decode(r.json()['content']).decode('utf-8')
            callback()
            try:
                models[name].update(json.loads(raw))
            except (KeyError, TypeError, ValueError):
                # unparseable description or unknown model directory: drop the
                # entry from the listing instead of aborting the whole run
                logger.warning('Invalid description for {}, skipping model'.format(name))
                models.pop(name, None)
        elif name in models:
            models[name]['model'] = el['url']
    return models
Beispiel #2
0
def get_description(model_id):
    """
    Looks up the DESCRIPTION file of a single model in the model repository.

    The head of the master branch is resolved first, then the recursive tree
    of that commit is scanned for `<type>/<model_id>/DESCRIPTION`.

    Args:
        model_id (str): Name of the model directory to look for.

    Returns:
        A defaultdict(str) built from the parsed DESCRIPTION JSON, or None
        when no matching entry exists in the tree.

    Raises:
        KrakenRepoException: if the head or tree response lacks the expected
                             field.
    """
    logger.info(u'Retrieving metadata for {}'.format(model_id))

    logger.debug(u'Retrieving head of model repository')
    head_response = requests.get('{}git/refs/heads/master'.format(MODEL_REPO))
    head_payload = head_response.json()
    if 'object' not in head_payload:
        logger.error(u'No \'object\' field in repo head API response.')
        raise KrakenRepoException('{}: {}'.format(head_response.status_code,
                                                  head_payload['message']))
    commit_sha = head_payload['object']['sha']

    logger.debug(u'Retrieving tree of model repository')
    tree_response = requests.get('{}git/trees/{}'.format(MODEL_REPO, commit_sha),
                                 params={'recursive': 1})
    tree_payload = tree_response.json()
    if 'tree' not in tree_payload:
        logger.error(u'No \'tree\' field in repo API response.')
        raise KrakenRepoException('{}: {}'.format(tree_response.status_code,
                                                  tree_payload['message']))

    for entry in tree_payload['tree']:
        parts = entry['path'].split('/')
        # only entries at least three levels deep can be a model's DESCRIPTION
        if len(parts) <= 2:
            continue
        if parts[1] != model_id or parts[2] != 'DESCRIPTION':
            continue
        logger.debug(u'Retrieving description for {}'.format(parts[1]))
        blob = requests.get(entry['url']).json()
        text = base64.b64decode(blob['content']).decode('utf-8')
        return defaultdict(str, json.loads(text))
    return None
Beispiel #3
0
def get_model(model_id, path, callback):
    """
    Retrieves a model from the model repository and saves it below a path.

    The file is written to `path` joined with the 'name' field of the model's
    DESCRIPTION file.

    Args:
        model_id (str): Name of the model directory in the repository.
        path (str): Directory to write the model file into.
        callback (callable): Function called without arguments after the head
                             and tree requests and for every chunk received.

    Raises:
        KrakenRepoException: if the repository responses lack the expected
                             fields, or no model file or no DESCRIPTION file
                             exists for `model_id`.
    """
    logger.info('Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    callback()
    resp = r.json()
    if 'object' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.info('Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head), params={'recursive': 1})
    callback()
    resp = r.json()
    if 'tree' not in resp:
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    url = None
    desc_url = None
    # Collect both URLs independently of their order in the tree; stopping at
    # the first model file could leave the destination path undefined.
    for el in resp['tree']:
        components = el['path'].split('/')
        if len(components) < 3 or components[1] != model_id:
            continue
        if components[2] == 'DESCRIPTION':
            desc_url = el['url']
        elif url is None:
            url = el['url']
        if url and desc_url:
            break
    if not url:
        raise KrakenRepoException('No such model known')
    if not desc_url:
        raise KrakenRepoException('No description found for model {}'.format(model_id))
    logger.info('Retrieving description for {}'.format(model_id))
    raw = base64.b64decode(requests.get(desc_url).json()['content']).decode('utf-8')
    desc = json.loads(raw)
    spath = os.path.join(path, desc['name'])
    with closing(requests.get(url, headers={'Accept': 'application/vnd.github.v3.raw'},
                 stream=True)) as r:
        with open(spath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                callback()
                f.write(chunk)
Beispiel #4
0
def get_listing(callback: Callable[..., Any] = lambda: None) -> dict:
    """
    Fetches a listing of all kraken models from the zenodo repository.

    Records without keywords, without a supported model type, or without a
    metadata.json file are left out of the listing.

    Args:
        callback (Callable): Function called once per HTTP request.

    Returns:
        Dict mapping each model's DOI to its record metadata, extended with
        the 'graphemes', 'summary' and 'script' fields of its metadata.json,
        a 'link' to the latest version and the list of model 'type's.

    Raises:
        KrakenRepoException: if the community contains no records or a
                             metadata.json file is not valid JSON.
    """
    logger.info('Retrieving model list')
    records = []
    r = requests.get('{}{}'.format(MODEL_REPO, 'records'), params={'communities': 'ocr_models'})
    r.raise_for_status()
    callback()
    resp = r.json()
    if not resp['hits']['total']:
        logger.error('No models found in community \'ocr_models\'')
        raise KrakenRepoException('No models found in repository \'ocr_models\'')
    logger.debug('Total of {} records in repository'.format(resp['hits']['total']))
    records.extend(resp['hits']['hits'])
    while 'next' in resp['links']:
        logger.debug('Fetching next page')
        r = requests.get(resp['links']['next'])
        r.raise_for_status()
        callback()
        resp = r.json()
        logger.debug('Found {} new records'.format(len(resp['hits']['hits'])))
        records.extend(resp['hits']['hits'])
    logger.debug('Retrieving model metadata')
    models = {}
    # fetch metadata.json for each model
    for record in records:
        if 'keywords' not in record['metadata']:
            continue
        model_type = SUPPORTED_MODELS.intersection(record['metadata']['keywords'])
        if not model_type:
            continue
        # reset per record so a deposit lacking metadata.json never inherits
        # the metadata of the previously processed record
        metadata = None
        for file in record['files']:
            if file['key'] == 'metadata.json':
                callback()
                r = requests.get(file['links']['self'])
                r.raise_for_status()
                try:
                    metadata = r.json()
                except ValueError as err:
                    msg = 'Metadata for \'{}\' ({}) not in JSON format'.format(record['metadata']['title'], record['metadata']['doi'])
                    logger.error(msg)
                    raise KrakenRepoException(msg) from err
        if not metadata:
            logger.warning('No metadata.json found for record {}, skipping'.format(record['metadata']['doi']))
            continue
        # merge metadata.json into DataCite
        key = record['metadata']['doi']
        models[key] = record['metadata']
        models[key].update({'graphemes': metadata['graphemes'],
                            'summary': metadata['summary'],
                            'script': metadata['script'],
                            'link': record['links']['latest'],
                            'type': [x.split('_')[1] for x in model_type]})
    return models
Beispiel #5
0
def get_description(model_id: str, callback: Callable[..., Any] = lambda: None) -> dict:
    """
    Fetches the metadata for a single model from the zenodo repository.

    Args:
        model_id (str): DOI of the model.
        callback (callable): Optional function called once per HTTP request.

    Returns:
        Dict with the record's metadata, extended with the 'graphemes',
        'summary', 'script' and 'accuracy' fields of its metadata.json, a
        'link' to the latest version and the list of model 'type's.

    Raises:
        KrakenRepoException: if the query does not match exactly one record,
                             the deposit has no keywords or no supported model
                             type, or metadata.json is missing or not valid
                             JSON.
    """
    logger.info('Retrieving metadata for {}'.format(model_id))
    r = requests.get('{}{}'.format(MODEL_REPO, 'records'), params={'q': 'doi:"{}"'.format(model_id)})
    r.raise_for_status()
    callback()
    resp = r.json()
    if resp['hits']['total'] != 1:
        # both placeholders need a value; formatting with the id alone raised
        # an IndexError instead of the intended exception
        msg = 'Found {} models when querying for id \'{}\''.format(resp['hits']['total'], model_id)
        logger.error(msg)
        raise KrakenRepoException(msg)
    record = resp['hits']['hits'][0]
    metadata = record['metadata']
    if 'keywords' not in metadata:
        logger.error('No keywords included on deposit')
        raise KrakenRepoException('No keywords included on deposit.')
    model_type = SUPPORTED_MODELS.intersection(metadata['keywords'])
    if not model_type:
        msg = 'Unsupported model type(s): {}'.format(', '.join(str(kw) for kw in metadata['keywords']))
        logger.error(msg)
        raise KrakenRepoException(msg)
    meta_json = None
    for file in record['files']:
        if file['key'] == 'metadata.json':
            r = requests.get(file['links']['self'])
            r.raise_for_status()
            callback()
            try:
                meta_json = r.json()
            except ValueError as err:
                msg = 'Metadata for \'{}\' ({}) not in JSON format'.format(record['metadata']['title'], record['metadata']['doi'])
                logger.error(msg)
                raise KrakenRepoException(msg) from err
    if not meta_json:
        msg = 'No metadata.json found for \'{}\' ({})'.format(record['metadata']['title'], record['metadata']['doi'])
        logger.error(msg)
        raise KrakenRepoException(msg)
    # merge metadata.json into DataCite
    metadata.update({'graphemes': meta_json['graphemes'],
                     'summary': meta_json['summary'],
                     'script': meta_json['script'],
                     'link': record['links']['latest'],
                     'type': [x.split('_')[1] for x in model_type],
                     'accuracy': meta_json['accuracy']})
    return metadata
Beispiel #6
0
def get_model(model_id: str, path: str, callback: Callable[..., Any] = lambda: None) -> str:
    """
    Retrieves a model and saves it to a path.

    Args:
        model_id (str): DOI of the model
        path (str): Destination directory to write the model file into.
        callback (func): Function called once after the record query and for
                         every chunk (up to 1024 octets) received.

    Returns:
        The identifier the model can be called through on the command line.
        This is the file name taken from the model's download URL.

    Raises:
        KrakenRepoException: if the query does not match exactly one record or
                             the record contains no file of type 'mlmodel'.
    """
    logger.info('Saving model {} to {}'.format(model_id, path))
    r = requests.get('{}{}'.format(MODEL_REPO, 'records'), params={'q': 'doi:"{}"'.format(model_id)})
    r.raise_for_status()
    callback()
    resp = r.json()
    if resp['hits']['total'] != 1:
        msg = 'Found {} models when querying for id \'{}\''.format(resp['hits']['total'], model_id)
        logger.error(msg)
        raise KrakenRepoException(msg)

    metadata = resp['hits']['hits'][0]
    model_urls = [x['links']['self'] for x in metadata['files'] if x['type'] == 'mlmodel']
    if not model_urls:
        msg = 'No model file found in record for id \'{}\''.format(model_id)
        logger.error(msg)
        raise KrakenRepoException(msg)
    model_url = model_urls[0]
    # callable model identifier
    nat_id = os.path.basename(urllib.parse.urlparse(model_url).path)
    spath = os.path.join(path, nat_id)
    logger.debug('downloading model file {} to {}'.format(model_url, spath))
    with closing(requests.get(model_url, stream=True)) as r:
        # fail before opening the destination so an HTTP error body is never
        # written to disk as a model file
        r.raise_for_status()
        with open(spath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                callback()
                f.write(chunk)
    return nat_id
Beispiel #7
0
def get_model(model_id: str, path: str, callback: Callable[..., Any]) -> None:
    """
    Retrieves a model and saves it to a path.

    The file is written to `path` joined with the 'name' field of the model's
    DESCRIPTION file.

    Args:
        model_id (str): Identifier of the model
        path (str): Destination directory to write the model file into.
        callback (func): Function called after the head and tree requests and
                         for every chunk (up to 1024 octets) received.

    Raises:
        KrakenRepoException: if the repository responses lack the expected
                             fields, or no model file or no DESCRIPTION file
                             exists for `model_id`.
    """
    logger.info(u'Saving model {} to {}'.format(model_id, path))
    logger.debug(u'Retrieving head of model repository')
    r = requests.get('{}{}'.format(MODEL_REPO, 'git/refs/heads/master'))
    callback()
    resp = r.json()
    if 'object' not in resp:
        logger.error(u'No \'object\' field in repo head API response.')
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    head = resp['object']['sha']
    logger.debug(u'Retrieving tree of model repository')
    r = requests.get('{}{}{}'.format(MODEL_REPO, 'git/trees/', head), params={'recursive': 1})
    callback()
    resp = r.json()
    if 'tree' not in resp:
        logger.error(u'No \'tree\' field in repo API response.')
        raise KrakenRepoException('{}: {}'.format(r.status_code, resp['message']))
    url = None
    desc_url = None
    # Collect both URLs independently of their order in the tree; stopping at
    # the first model file could leave the destination path undefined.
    for el in resp['tree']:
        components = el['path'].split('/')
        if len(components) < 3 or components[1] != model_id:
            continue
        if components[2] == 'DESCRIPTION':
            desc_url = el['url']
        elif url is None:
            url = el['url']
        if url and desc_url:
            break
    if not url:
        logger.error(u'Model {} not in repository.'.format(model_id))
        raise KrakenRepoException('Model {} not in repository'.format(model_id))
    if not desc_url:
        logger.error(u'No description for model {} in repository.'.format(model_id))
        raise KrakenRepoException('No description for model {} in repository'.format(model_id))
    logger.debug(u'Retrieving description for {}'.format(model_id))
    raw = base64.b64decode(requests.get(desc_url).json()['content']).decode('utf-8')
    desc = json.loads(raw)
    spath = os.path.join(path, desc['name'])
    with closing(requests.get(url, headers={'Accept': 'application/vnd.github.v3.raw'},
                 stream=True)) as r:
        with open(spath, 'wb') as f:
            logger.debug(u'Downloading model')
            for chunk in r.iter_content(chunk_size=1024):
                callback()
                f.write(chunk)
Beispiel #8
0
def get_model(
        model_id: str,
        path: str,
        callback: Callable[[int, int],
                           Any] = lambda total, advance: None) -> str:
    """
    Retrieves a model and saves it to a path.

    Args:
        model_id (str): DOI of the model
        path (str): Destination directory to write the model file into.
        callback (func): Progress function taking `(total, advance)`. It is
                         called with `(0, 0)` after the record query and with
                         the file size and the chunk length for every chunk
                         (up to 1024 octets) received. The file size is 0 when
                         the server sends no Content-Length header.

    Returns:
        The identifier the model can be called through on the command line.
        This is the file name taken from the model's download URL.

    Raises:
        KrakenRepoException: if the query does not match exactly one record or
                             the record contains no file of type 'mlmodel'.
    """
    logger.info(f'Saving model {model_id} to {path}')
    r = requests.get(f'{MODEL_REPO}records', params={'q': f'doi:"{model_id}"'})
    r.raise_for_status()
    callback(0, 0)
    resp = r.json()
    if resp['hits']['total'] != 1:
        msg = f'Found {resp["hits"]["total"]} models when querying for id \'{model_id}\''
        logger.error(msg)
        raise KrakenRepoException(msg)

    metadata = resp['hits']['hits'][0]
    model_urls = [
        x['links']['self'] for x in metadata['files'] if x['type'] == 'mlmodel'
    ]
    if not model_urls:
        msg = f'No model file found in record for id \'{model_id}\''
        logger.error(msg)
        raise KrakenRepoException(msg)
    model_url = model_urls[0]
    # callable model identifier
    nat_id = os.path.basename(urllib.parse.urlparse(model_url).path)
    spath = os.path.join(path, nat_id)
    logger.debug(f'downloading model file {model_url} to {spath}')
    with closing(requests.get(model_url, stream=True)) as r:
        # fail before opening the destination so an HTTP error body is never
        # written to disk as a model file
        r.raise_for_status()
        # the header is absent on e.g. chunked responses; report 0 as total
        # instead of failing the download
        file_size = int(r.headers.get('Content-length', 0))
        with open(spath, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                callback(file_size, len(chunk))
                f.write(chunk)
    return nat_id