Example 1
def ingest_series(self, naId=None, dest=None, offset=0):
    """Ingests a series into Drastic."""
    if naId is None or dest is None:
        raise Exception("URL and destination path are required")
    app.check_traversal_okay(self)

    # Get series description
    series_json = requests.get(SERIES_URL.format(naId)).json()
    series_descr = series_json['opaResponse']['results']['result'][0]['description']

    # Create folder
    dirname = series_descr['series']['title']
    new_folder_path = dest + dirname + '/'

    # Check if folder exists
    exists_res = get_client().get_cdmi(new_folder_path)
    if exists_res.code() == 404:
        logger.info("Creating base folder in Drastic: "+new_folder_path)
        res = get_client().put_cdmi(new_folder_path, series_descr)
        if not res.ok():
            raise IOError(str(res))
        logger.info("Base folder created: "+new_folder_path)

    # Schedule page 0
    schedule_page.s([],
                    naId=naId,
                    dest=new_folder_path,
                    offset=offset).apply_async()
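
A minimal invocation sketch, assuming ingest_series is registered as a bound Celery task (the decorator is not shown in these excerpts); the naId is the series identifier that also appears later in this listing, and the destination path is hypothetical:

# Hypothetical kickoff; kwargs mirror the signature above.
ingest_series.apply_async(kwargs={'naId': '3725265', 'dest': '/NARA/'})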
Example 2
def incr_batch_progress(batch_dir,
                        file_cnt=0,
                        file_bytes_cnt=0,
                        folder_cnt=0,
                        done=False):
    # Get existing metadata in Drastic
    res = get_client().get_cdmi(batch_dir)
    if not res.ok():
        raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
    metadata = res.json()['metadata']
    progress_file_old = metadata.get('batch_file_progress', 0)
    progress_file = file_cnt + int(progress_file_old)
    progress_file_bytes_old = metadata.get('batch_file_bytes_progress', 0)
    progress_file_bytes = file_bytes_cnt + int(progress_file_bytes_old)
    progress_folder_old = metadata.get('batch_folder_progress', 0)
    progress_folder = folder_cnt + int(progress_folder_old)
    metadata['batch_file_progress'] = progress_file
    metadata['batch_file_bytes_progress'] = progress_file_bytes
    metadata['batch_folder_progress'] = progress_folder
    if done:
        metadata['batch_state'] = 'done'
        metadata['batch_epoch_end'] = int(time.time())
    r = get_client().put(batch_dir, metadata=metadata)
    if not r.ok():
        raise IOError(str(r))
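
Note that the read-modify-write above is not atomic, so concurrent workers can drop increments. Illustrative calls, with a hypothetical batch path:

# Add 20 files (~5 MB) to the running tallies, then close the batch out.
incr_batch_progress('/NARA/RG-64/batch-001/', file_cnt=20, file_bytes_cnt=5242880)
incr_batch_progress('/NARA/RG-64/batch-001/', done=True)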
Example 3
def pollForTextConversion(self, path, link):
    """Tries to download text when available."""
    headers = {
        'Accept': 'text/plain',
        'Authorization': "Basic {0}".format(dap_auth_encoded)
    }
    try:
        r = requests.get(link, headers=headers)
        r.raise_for_status()

        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            logger.warn(
                "Dropping task for object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
        metadata = cdmi_info['metadata']
        metadata['fulltext'] = r.text
        res = get_client().put(path, metadata=metadata)
        if res.code() in [404, 403]:  # object probably deleted
            logger.warn(
                "Dropping task for an object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError(str(res))
    except IOError as e:
        raise self.retry(exc=e)
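
dap_auth_encoded is assumed to be a pre-computed HTTP Basic credential; a minimal sketch of how it might be built (the real account comes from configuration not shown here):

import base64

# Hypothetical credentials; substitute the configured DAP account.
dap_auth_encoded = base64.b64encode('user:password'.encode('utf-8')).decode('ascii')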
Example 4
def folders_complete(folder_cnt, batch_dir):
    logger.info(u"Folders created for batch: {0}".format(batch_dir))
    # Get existing metadata in Drastic
    res = get_client().get_cdmi(batch_dir)
    if not res.ok():
        raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
    metadata = res.json()['metadata']
    metadata['batch_folder_progress'] = folder_cnt
    r = get_client().put(batch_dir, metadata=metadata)
    if not r.ok():
        raise IOError(str(r))
Example 5
def ingest_httpfile(self, url, destPath, name=None):
    """Ingests the file at the given URL into Drastic."""
    parsed = urlparse(url)
    if name is None:
        name = basename(parsed.path)
    name = name.replace('&', '_')
    tempfilename = None
    try:
        tempfilename = download_tempfile(url)
    except IOError as e:
        if tempfilename is not None:
            os.remove(tempfilename)
        raise self.retry(exc=e)
    try:
        logger.debug(u"Downloaded file to: " + tempfilename)
        with open(tempfilename, 'rb') as f:
            res = get_client().put(destPath + '/' + name, f)
            if res.code() in [406, 999]:
                return
            if not res.ok():
                raise IOError('Failed to put {} to {}. Got {} {}'.format(
                    tempfilename, destPath + '/' + name, res.code(),
                    res.msg()))
            logger.debug(u"put success for {0}".format(destPath + name))
    finally:
        os.remove(tempfilename)
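
download_tempfile is referenced but not defined in these excerpts; a plausible sketch, assuming it streams the response to a named temporary file and returns the path:

import tempfile
import requests

def download_tempfile(url):
    # Hypothetical helper: stream the body to disk so large files are
    # never held in memory, and hand back the temp file's path.
    r = requests.get(url, stream=True)
    r.raise_for_status()
    with tempfile.NamedTemporaryFile(delete=False) as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
        return f.name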
Example 6
def batch_ingest_httpdir(self, url=None, dest=None):
    """Batches the folders and files under the path given, using the NGINX JSON directory
    autoindex."""

    epoch_start = int(time.time())
    # Create top folder in Drastic
    res = requests.get(url)
    res.raise_for_status()
    dirname = urlparse(url).path.split('/')[-2]
    batch_dir = os.path.join(dest, dirname) + '/'
    res = get_client().mkdir(batch_dir)
    if not res.ok():
        raise IOError('Cannot make folder {0}: {1}'.format(
            batch_dir, str(res)))
    logger.info(u"Batch ingest starting: " + batch_dir)

    # Schedule a recursive count, then record it in Drastic metadata
    (file_cnt, file_byte_cnt, folder_cnt) = count_httpdir(url=url)
    logger.info(u"Batch count complete, {0} files, {1} bytes.".format(
        file_cnt, file_byte_cnt))
    record_batch_count(file_cnt, file_byte_cnt, folder_cnt, epoch_start,
                       batch_dir)

    mkdirs = mkdirs_httpdir.si(url, batch_dir)  # batch_dir /NARA/RG .....
    fc = folders_complete.si(folder_cnt, batch_dir)
    ingest = ingest_files.si(url, batch_dir)
    (mkdirs | fc | ingest).apply_async()
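
The pipe on the last line builds a Celery chain of immutable signatures (.si), so each step runs only after the previous one finishes and ignores its return value. An equivalent explicit form:

from celery import chain

# Same pipeline as (mkdirs | fc | ingest).apply_async()
chain(mkdirs_httpdir.si(url, batch_dir),
      folders_complete.si(folder_cnt, batch_dir),
      ingest_files.si(url, batch_dir)).apply_async()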
Example 7
def record_batch_count(file_cnt, file_bytes_cnt, folder_cnt, epoch_start,
                       batch_dir):
    # Get existing metadata in Drastic
    res = get_client().ls(batch_dir)
    if not res.ok():
        raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
    metadata = res.json()['metadata']
    metadata['batch_file_count'] = file_cnt
    metadata['batch_file_bytes_count'] = file_bytes_cnt
    metadata['batch_folder_count'] = folder_cnt
    metadata['batch_epoch_start'] = epoch_start
    metadata['batch_state'] = 'ingesting'
    metadata['batch_file_progress'] = 0
    metadata['batch_file_bytes_progress'] = 0
    metadata['batch_folder_progress'] = 0
    r = get_client().put(batch_dir, metadata=metadata)
    if not r.ok():
        raise IOError(str(r))
Example 8
def put_graph_metadata(self, path):
    """Replaces existing user triples for a single subject."""
    logger.debug(u'PUT RDF metadata for {0}'.format(path))
    path = path[:-1] if path.endswith('?') else path
    is_folder = str(path).endswith('/')

    try:
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            logger.warn(
                "Dropping task for object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
    except IOError as e:
        raise self.retry(exc=e)

    # Drastic fields:
    # FIXME objectName can be null
    name = cdmi_info.get('objectName')
    name = name[:-1] if name and name.endswith('?') else name
    object_UUID = cdmi_info.get('objectID')
    container_UUID = cdmi_info.get('parentID')
    #  parent_URI = cdmi_info.get('parentURI')
    mimetype = 'text/directory'
    if not is_folder:
        mimetype = cdmi_info.get('mimetype')
    metadata = cdmi_info.get('metadata')

    uri = "uuid:{0}".format(object_UUID)
    get_g().V().has('resource', 'URI', uri).drop().count().next()
    t = get_g().addV('resource')
    t = t.property('URI', uri)
    t = t.property('graph', uri)
    t = t.property('name', name)
    t = t.property('mimetype', mimetype)
    for key, value in metadata.iteritems():
        # Don't store metadata without value
        if value is None:  # numeric zero is a valid value
            continue
        t = t.property(key, value)  # key/values as properties
        # TODO add default namespace for keys that are plain tokens
        # t = add_literal_edge(t, uri, key, value)

    # Add contains Edge
    if container_UUID is not None:
        container_uri = "uuid:{0}".format(container_UUID)
        c = get_g().V().has('resource', 'URI', container_uri)
        # TODO fully qualify URIs
        t = t.addE('contains').from_(c)

    t.next()
    logger.debug(u'Created resource vertex for {0}'.format(object_UUID))
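
A quick way to read the vertex back, assuming get_g() returns a gremlinpython traversal source; the UUID is a placeholder:

# Hypothetical read-back of a resource vertex and all of its properties.
props = get_g().V().has('resource', 'URI', 'uuid:0000-0000').valueMap(True).next()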
Example 9
def traversal(self, path, task_name, only_files, include_pattern=None):
    """Traverses the file tree under the path given, within the CDMI service.
       Applies the named task to every path."""

    app.check_traversal_okay(self)

    path = path[:-1] if path.endswith('?') else path

    try:
        res = get_client().ls(path)
        if res.code() in [404, 403]:  # object probably deleted
            logger.warn(
                "Dropping task for an object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError(str(res))
    except IOError as e:
        raise self.retry(exc=e)

    cdmi_info = res.json()
    # logger.debug('got CDMI content: {0}'.format(json.dumps(cdmi_info)))
    if cdmi_info[u'objectType'] != u'application/cdmi-container':
        logger.error("Cannot traverse a file path: {0}".format(path))
        return

    regex_compiled = None
    if include_pattern is not None:
        regex_compiled = re.compile(include_pattern)

    if only_files:
        for f in cdmi_info[u'children']:
            f = f[:-1] if f.endswith('?') else f
            if not f.endswith('/'):
                # filter matches with regex
                if include_pattern is None or regex_compiled.match(
                        f) is not None:
                    app.send_task(task_name, args=[str(path) + f], kwargs={})
    else:
        for o in cdmi_info[u'children']:
            o = o[:-1] if o.endswith('?') else o
            # filter matches with regex
            if include_pattern is None or regex_compiled.match(o) is not None:
                app.send_task(task_name, args=[str(path) + o], kwargs={})

    for x in cdmi_info[u'children']:
        x = x[:-1] if x.endswith('?') else x
        if x.endswith('/'):
            traversal.s(str(path) + x,
                        task_name,
                        only_files,
                        include_pattern=include_pattern).apply_async()
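
An illustrative kickoff for this traversal, assuming 'tasks.index' is the registered name of the index task shown later in this listing; the path and pattern are hypothetical:

# Apply the index task to every .pdf file under the folder.
traversal.apply_async(args=['/NARA/RG-64/', 'tasks.index', True],
                      kwargs={'include_pattern': r'.*\.pdf$'})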
Example 10
def mkdirs_httpdir(url, batch_dir):
    """Counts the folders and files under the path given, using the NGINX JSON directory
    autoindex."""
    count = 0
    notifyCount = 20
    for (f, parentPath, furl) in iter_httpdir(url, files=False):
        name = str(f['name'])
        new_folder_path = os.path.join(batch_dir, parentPath, name) + '/'
        logger.debug(u'new_folder_path: {0}'.format(new_folder_path))
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            logger.error(u'Cannot make directory: {0}'.format(new_folder_path))
            continue
        count += 1
        if count >= notifyCount:
            incr_batch_progress.s(batch_dir, folder_cnt=count).apply_async()
            count = 0
    incr_batch_progress.s(batch_dir, folder_cnt=count).apply_async()
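
iter_httpdir is not shown in these excerpts; a rough sketch of the walk it implies, yielding (entry, parent_path, entry_url) tuples from the NGINX JSON autoindex and descending into subdirectories (structure assumed from how the callers use it):

import os
import requests

def iter_httpdir(url, files=True, parent=''):
    # Hypothetical walker over an NGINX autoindex_format json listing.
    for entry in requests.get(url).json():
        entry_url = url + entry['name']
        if entry['type'] == 'directory':
            yield (entry, parent, entry_url + '/')
            child = os.path.join(parent, entry['name'])
            for item in iter_httpdir(entry_url + '/', files=files, parent=child):
                yield item
        elif files:
            yield (entry, parent, entry_url)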
Example 11
def ingest_httpdir(self, url=None, dest=None):
    """Ingests the file tree under the path given, using the NGINX JSON directory autoindex."""

    if url is None or dest is None:
        raise Exception("URL and destination path are required")

    app.check_traversal_okay(self)

    # Get directory
    try:
        res = requests.get(url)
        res.raise_for_status()
        dir_info = res.json()

        parsed = urlparse(url)
        dirname = parsed.path.split('/')[-2]
        new_folder_path = dest + dirname + '/'
        logger.debug(u"DIRNAME " + new_folder_path)
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            raise IOError(str(res))
        logger.debug(u"DIRECTORY INGESTED: " + new_folder_path)

        file_ingests = []
        folder_ingests = []
        for f in dir_info:
            if 'file' == f['type']:
                s = ingest_httpfile.s(str(url) + f['name'], new_folder_path)
                file_ingests.append(s)
            elif 'directory' == f['type']:
                s = ingest_httpdir.s(url=str(url) + f['name'] + '/',
                                     dest=new_folder_path)
                folder_ingests.append(s)
        file_job = group(file_ingests)
        file_job.apply_async()
        # result.join()  # wait for files to ingest in parallel
        # file_count += result.completed_count()
        group(folder_ingests).apply_async()
        # for file_c, folder_c in folder_res.get():
        #     file_count += file_c
        #     folder_count += folder_c
        # return (file_count, folder_count)
    except IOError as e:
        raise self.retry(exc=e)
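
group() fans the collected signatures out in parallel and returns immediately; the join() logic stays commented out above because blocking on subtask results inside a task can deadlock the worker pool. A minimal illustration with hypothetical inputs:

from celery import group

# One ingest per URL; apply_async() returns a GroupResult without waiting.
job = group(ingest_httpfile.s(u, '/NARA/dest/') for u in ['http://example.org/a.jpg'])
result = job.apply_async()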
Example 12
def ingest_property_cards(self, dest=None):
    """Ingests a series into Drastic."""
    if dest is None:
        raise Exception("Destination path is required")
    app.check_traversal_okay(self)

    url = ("https://catalog.archives.gov/api/v1?q=title:\"property card\""
           "&description.fileUnit.parentSeries.naId=3725265"
           "&type=description"
           "&resultFields=naId,description,objects"
           "&rows=200")

    # FIXME Add the login for NARA CATALOG API

    # Get series description
    series_json = requests.get(url).json()
    for result in series_json['opaResponse']['results']['result']:
        ingest_tasks = []
        # naId = result['naId']
        title = result['description']['fileUnit']['title']
        new_folder_path = dest + title + '/'
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            logger.error('Got an error ({0}) creating folder {1}'
                         .format(str(res), new_folder_path))
            raise IOError(str(res))
        # si: create folder
        for obj in result['objects']['object']:
            file_stuff = obj['file']
            idnum = obj['@id']
            url = file_stuff['@url']
            mime = file_stuff['@mime']
            name = str(file_stuff['@name'])
            s = ingest_httpfile.s(url, new_folder_path, name=name, mimetype=mime, metadata=obj)
            ingest_tasks.append(s)
        group(ingest_tasks).apply_async()
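
For reference, the catalog response shape this loop assumes, reconstructed from the keys the code reads (values abridged and illustrative):

series_json = {
    'opaResponse': {'results': {'result': [{
        'naId': '...',
        'description': {'fileUnit': {'title': 'Property card ...'}},
        'objects': {'object': [{
            '@id': '...',
            'file': {'@url': '...', '@mime': 'image/jpeg', '@name': '....jpg'}
        }]}
    }]}}
}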
Example 13
def index(self, path):
    """Reindexes the metadata for a data object"""
    from index.util import add_BD_fields_legacy, readMaxText
    path = path[:-1] if path.endswith('?') else path
    mytype = 'folder' if str(path).endswith('/') else 'file'

    esdoc = {}
    esdoc['path'] = str(path)
    esdoc['pathtext'] = str(path)
    try:
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            logger.warn(
                "Dropping task for object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
    except IOError as e:
        raise self.retry(exc=e)

    # Drastic fields:
    # FIXME objectName can be null
    name = cdmi_info.get('objectName')
    esdoc['objectName'] = name[:-1] if name and name.endswith('?') else name
    esdoc['objectID'] = cdmi_info.get('objectID')
    esdoc['parentID'] = cdmi_info.get('parentID')
    esdoc['parentURI'] = cdmi_info.get('parentURI')

    esdoc['mimetype'] = cdmi_info.get('mimetype')
    # TODO esdoc['size'] = cdmi_info.get('size')

    # If we have extracted metadata from Brown Dog, add any mapped fields
    if 'dts_metadata.jsonld' in cdmi_info.get('metadata'):
        add_BD_fields_legacy(
            cdmi_info['metadata'].get('dts_metadata.jsonld', '[]'), esdoc)

    if 'dts_tags.json' in cdmi_info.get('metadata'):
        esdoc['dts_tags'] = cdmi_info['metadata'].get('dts_tags.json')

    # if file mimetype is already text/plain, index it as fulltext
    if 'text/plain' == cdmi_info.get('mimetype'):
        try:
            with closing(stream_from_drastic_proxy(path)) as stream:
                esdoc['fulltext'] = readMaxText(stream,
                                                fulltext_max_index_size)
        except IOError as e:
            logger.warn(
                "Cannot get original object text for indexing: {0}".format(
                    str(e)))
    elif 'fulltext' in cdmi_info['metadata']:
        esdoc['fulltext'] = cdmi_info['metadata'].get('fulltext')

    logger.debug('ESDOC:\n{0}'.format(json.dumps(esdoc)))
    url = elasticsearch_url + '/drastic/' + mytype
    try:
        r = requests.post(url, data=json.dumps(esdoc))
        if r.status_code != requests.codes.created:
            logger.error('ES status: {0} {1}'.format(r.status_code, r.text))
    except IOError as e:
        raise self.retry(exc=e)
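
An illustrative query against the index populated above, assuming the same elasticsearch_url and the legacy index/type layout used by the POST:

import json
import requests

# Hypothetical fulltext search over the 'drastic' index.
r = requests.post(elasticsearch_url + '/drastic/_search',
                  data=json.dumps({'query': {'match': {'fulltext': 'property card'}}}))
print(r.json()['hits']['total'])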
Example 14
def mkdir(self, path):
    res = get_client().mkdir(path)
    if not res.ok():
        raise IOError(str(res))
Example 15
def pollForExtract(self, path, fileid, retries):
    """Poll the feature extraction service for the results of an extraction.
       Re-enqueue this task if still waiting."""
    url = '{0}/api/extractions/{1}/status?commkey={2}'.format(
        clowder_url, fileid, clowder_commkey)
    parsed = None
    try:
        r = requests.get(url)
        r.raise_for_status()
        parsed = r.json()
    except IOError as e:
        raise self.retry(exc=e)

    extractionStatus = parsed['Status']
    doneStatus = ['Done']
    failStatus = ['No Extractor Available. Request is not queued.']
    waitStatus = [
        'Processing', 'Required Extractor is either busy or' +
        ' is not currently running. Try after some time.'
    ]
    if extractionStatus in waitStatus:
        raise self.retry()
    elif extractionStatus in failStatus:
        msg = 'Extract failed for {0} {1} with {2}'.format(
            path, fileid, extractionStatus)
        logger.warn(msg)
        return
    elif extractionStatus not in doneStatus:
        logger.error(
            'Unrecognized extraction status for {0} {1} with {2}'.format(
                path, fileid, extractionStatus))
        return

    try:
        # Get existing metadata in Drastic
        res = get_client().get_cdmi(str(path))
        if res.code() in [404, 403]:
            logger.warn(
                "Dropping task for object that gives a 403/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError("Drastic get_cdmi failed: {0}".format(res.msg()))
        cdmi_info = res.json()
        metadata = cdmi_info['metadata']
    except IOError as e:
        raise self.retry(exc=e)

    try:
        # GET new metadata
        url = '{0}/api/files/{1}/metadata.jsonld?commkey={2}'.format(
            clowder_url, fileid, clowder_commkey)
        r = requests.get(url)
        r.raise_for_status()
        parsed = r.json()
        logger.debug("fetched metadata: {0}".format(json.dumps(parsed)))
    except IOError as e:
        raise self.retry(exc=e)

    # GET new tags
    try:
        url2 = '{0}/api/files/{1}/tags?commkey={2}'.format(
            clowder_url, fileid, clowder_commkey)
        r2 = requests.get(url2)
        r2.raise_for_status()
        tags = r2.json()['tags']
        if len(tags) > 0:
            metadata['dts_tags'] = tags
            logger.debug("fetched tags: {0}".format(json.dumps(tags)))

        # Modify existing metadata
        # Create Clowder ID and link field
        metadata['dts_clowder_link'] = '{0}/files/{1}/'.format(
            clowder_url, fileid)
        metadata['dts_clowder_id'] = fileid
        metadata['dts_metadata'] = parsed

        r = get_client().put(path, metadata=metadata)
        if not r.ok():
            raise IOError(str(r))
    except IOError as e:
        raise self.retry(exc=e)
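
The task decorators are stripped from these excerpts; a plausible registration for this poller, since self.retry() requires a bound task (the retry limits here are assumptions):

from celery import Celery

app = Celery('tasks')

@app.task(bind=True, max_retries=48, default_retry_delay=300)
def pollForExtract(self, path, fileid, retries):
    pass  # body as shown above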