Code example #1
0
def ingest_series(self, naId=None, dest=None, offset=0):
    """Ingests a series into Drastic."""
    if naId is None or dest is None:
        raise Exception("URL and destination path are required")
    app.check_traversal_okay(self)

    # Fetch the series description from the catalog API.
    response = requests.get(SERIES_URL.format(naId)).json()
    description = response['opaResponse']['results']['result'][0]['description']

    # The destination folder is named after the series title.
    folder_path = dest + description['series']['title'] + '/'

    # Create the base folder only if it is not already present (404).
    probe = get_client().get_cdmi(folder_path)
    if probe.code() == 404:
        logger.info("Creating base folder in Drastic: " + folder_path)
        created = get_client().put_cdmi(folder_path, description)
        if not created.ok():
            raise IOError(str(created))
        logger.info("Base folder created: " + folder_path)

    # Kick off ingest of the first page of objects for this series.
    schedule_page.s(
        [], naId=naId, dest=folder_path, offset=offset).apply_async()
Code example #2
0
File: workflow.py  Project: UMD-DRASTIC/drastic-jobs
def traversal(self, path, task_name, only_files, include_pattern=None):
    """Traverses the file tree under the path given, within the CDMI service.
       Applies the named task to every path.

    :param path: CDMI container path to start from.
    :param task_name: name of the Celery task to send for each matching child.
    :param only_files: restrict task dispatch to a subset of children
        (see NOTE below on the branch condition).
    :param include_pattern: optional regex; only children whose name matches
        are dispatched.
    """

    app.check_traversal_okay(self)

    # The CDMI listing appends '?' to some names; strip it.
    path = path[:-1] if path.endswith('?') else path

    try:
        res = get_client().ls(path)
        if res.code() in [404, 403]:  # object probably deleted
            # BUG FIX: message previously said "403/403"; the check is 404/403.
            logger.warn(
                "Dropping task for an object that gives a 404/403: {0}".format(
                    path))
            return
        if not res.ok():
            raise IOError(str(res))
    except IOError as e:
        # Transient CDMI failure: let Celery retry this task.
        raise self.retry(exc=e)

    cdmi_info = res.json()
    # Only containers can be traversed; a plain file path is an error.
    if not cdmi_info[u'objectType'] == u'application/cdmi-container':
        logger.error("Cannot traverse a file path: {0}".format(path))
        return

    # Compile the filter once, outside the loops below.
    regex_compiled = None
    if include_pattern is not None:
        regex_compiled = re.compile(include_pattern)

    if only_files:
        # NOTE(review): children ending with '/' are sub-containers, so this
        # branch dispatches for folders despite the flag's name — confirm
        # whether the condition should be `not f.endswith('/')`.
        for f in cdmi_info[u'children']:
            f = f[:-1] if f.endswith('?') else f
            if f.endswith('/'):
                # filter matches with regex
                if include_pattern is None or regex_compiled.match(
                        f) is not None:
                    app.send_task(task_name, args=[str(path) + f], kwargs={})
    else:
        for o in cdmi_info[u'children']:
            o = o[:-1] if o.endswith('?') else o
            # filter matches with regex
            # BUG FIX: the original matched against `f`, a stale variable from
            # the other branch (NameError on first use, wrong filtering after);
            # match the current child `o` instead.
            if include_pattern is None or regex_compiled.match(o) is not None:
                app.send_task(task_name, args=[str(path) + o], kwargs={})

    # Recurse into every sub-container, regardless of the filters above.
    for x in cdmi_info[u'children']:
        x = x[:-1] if x.endswith('?') else x
        if x.endswith('/'):
            traversal.s(str(path) + x,
                        task_name,
                        only_files,
                        include_pattern=include_pattern).apply_async()
Code example #3
0
def ingest_httpdir(self, url=None, dest=None):
    """Ingests the file tree under the path given, using the NGINX JSON directory autoindex."""

    if url is None or dest is None:
        raise Exception("URL and destination path are required")

    app.check_traversal_okay(self)

    try:
        # Fetch the autoindex JSON listing for this directory.
        response = requests.get(url)
        response.raise_for_status()
        listing = response.json()

        # Mirror this directory under the destination path; the directory
        # name is the second-to-last path segment of a trailing-slash URL.
        dirname = urlparse(url).path.split('/')[-2]
        new_folder_path = dest + dirname + '/'
        logger.debug(u"DIRNAME " + new_folder_path)
        mkdir_res = get_client().mkdir(new_folder_path)
        if not mkdir_res.ok():
            raise IOError(str(mkdir_res))
        logger.debug(u"DIRECTORY INGESTED: " + new_folder_path)

        # Fan out: one subtask per file, one recursive subtask per directory.
        file_tasks = []
        dir_tasks = []
        for entry in listing:
            if entry['type'] == 'file':
                file_tasks.append(
                    ingest_httpfile.s(str(url) + entry['name'],
                                      new_folder_path))
            elif entry['type'] == 'directory':
                dir_tasks.append(
                    ingest_httpdir.s(url=str(url) + entry['name'] + '/',
                                     dest=new_folder_path))
        group(file_tasks).apply_async()
        group(dir_tasks).apply_async()
    except IOError as e:
        # Transient HTTP/CDMI failure: let Celery retry this task.
        raise self.retry(exc=e)
Code example #4
0
def schedule_page(self, newresults, oldresults=0, naId=None, dest=None, offset=0):
    """Ingests a series into Drastic."""
    app.check_traversal_okay(self)

    OBJECTS_URL = ('https://catalog.archives.gov/api/v1?description.fileUnit.parentSeries.naId={0}'
                   '&offset={1}&rows={2}&type=object')

    # newresults holds the outcomes of the previous page's ingests.
    newcount = len(newresults)
    logger.warn('{0} nara objects just ingested'.format(newcount))

    # Fetch one page of object descriptions from the NARA catalog.
    try:
        payload = requests.get(
            OBJECTS_URL.format(naId, offset, NARA_PAGE_SIZE)).json()
        results_node = payload['opaResponse']['results']
        object_descrs = results_node['result']
        total_objects = results_node['total']
        row_count = len(object_descrs)
    except IOError as e:
        raise self.retry(exc=e)

    # Build one ingest subtask per object on this page.
    page_tasks = []
    for obj in object_descrs:
        info = obj['objects']['object']
        file_stuff = info['file']
        page_tasks.append(
            ingest_httpfile.s(file_stuff['@url'],
                              dest,
                              name=str(info['@id']) + '_' + file_stuff['@name'],
                              mimetype=file_stuff['@mime'],
                              metadata=obj))
    page_job = group(page_tasks)
    logger.warn('scheduling {0} ingests at nara offset {1}'.format(row_count, offset))

    if total_objects > offset + row_count:
        # More pages remain: run this page, then schedule the next one via a
        # chord so its results are fed into the next schedule_page call.
        follow_up = schedule_page.s(naId=naId,
                                    dest=dest,
                                    offset=offset + row_count,
                                    oldresults=oldresults + newcount)
        chord(page_job)(follow_up)
        return oldresults + newcount
    else:
        # Last page: just run it and report the final scheduled count.
        page_job.apply_async()
        return oldresults + newcount + len(page_tasks)
Code example #5
0
def ingest_property_cards(self, dest=None):
    """Ingests a series into Drastic.

    Fetches "property card" file-unit descriptions from the NARA catalog,
    creates one Drastic folder per file unit, and schedules an ingest
    subtask for every digitized object in each unit.

    :param dest: destination CDMI folder path (required).
    :raises Exception: when dest is missing.
    :raises IOError: when a folder cannot be created.
    """
    if dest is None:
        raise Exception("Destination path is required")
    app.check_traversal_okay(self)

    url = ("https://catalog.archives.gov/api/v1?q=title:\"property card\""
           "&description.fileUnit.parentSeries.naId=3725265"
           "&type=description"
           "&resultFields=naId,description,objects"
           "&rows=200")

    # FIXME Add the login for NARA CATALOG API


    # Get series description
    series_json = requests.get(url).json()
    for result in series_json['opaResponse']['results']['result']:
        ingest_tasks = []
        # naId = result['naId']
        title = result['description']['fileUnit']['title']
        new_folder_path = dest + title + '/'
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            # BUG FIX: message typo corrected ("Got and error" -> "Got an error").
            logger.error('Got an error ({0}) creating folder {1}'
                         .format(str(res), new_folder_path))
            raise IOError(str(res))
        # One ingest subtask per digitized object in this file unit.
        # NOTE(review): assumes result['objects']['object'] is a list — confirm
        # the API never returns a bare dict when a unit has a single object.
        for obj in result['objects']['object']:
            file_stuff = obj['file']
            # Use a distinct name so we don't shadow the catalog query `url`.
            obj_url = file_stuff['@url']
            mime = file_stuff['@mime']
            name = str(file_stuff['@name'])
            s = ingest_httpfile.s(obj_url, new_folder_path, name=name,
                                  mimetype=mime, metadata=obj)
            ingest_tasks.append(s)
        group(ingest_tasks).apply_async()