# Imports needed by the tasks below. `app`, `logger`, `get_client`,
# `SERIES_URL`, `NARA_PAGE_SIZE`, and `ingest_httpfile` are assumed to be
# defined elsewhere in this module; the `@app.task(bind=True)` decorators are
# inferred from the bound `self` / `self.retry` usage.
import re
from urllib.parse import urlparse  # Python 2: `from urlparse import urlparse`

import requests
from celery import chord, group


@app.task(bind=True)
def ingest_series(self, naId=None, dest=None, offset=0):
    """Ingests a NARA series into Drastic and schedules ingest of its pages."""
    if naId is None or dest is None:
        raise Exception("naId and destination path are required")
    app.check_traversal_okay(self)

    # Get the series description from the NARA catalog
    series_json = requests.get(SERIES_URL.format(naId)).json()
    series_descr = series_json['opaResponse']['results']['result'][0]['description']

    # The base folder is named after the series title
    dirname = series_descr['series']['title']
    new_folder_path = dest + dirname + '/'

    # Create the base folder only if it does not already exist
    exists_res = get_client().get_cdmi(new_folder_path)
    if exists_res.code() == 404:
        logger.info("Creating base folder in Drastic: " + new_folder_path)
        res = get_client().put_cdmi(new_folder_path, series_descr)
        if not res.ok():
            raise IOError(str(res))
        logger.info("Base folder created: " + new_folder_path)

    # Schedule page 0; there is no prior chord result yet, so newresults is []
    schedule_page.s([], naId=naId, dest=new_folder_path, offset=offset).apply_async()
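# Usage sketch (hedged): kicking off a full series ingest, assuming a running
# Celery worker and a reachable Drastic endpoint. The naId below is a
# placeholder, not a real series identifier:
#
#     ingest_series.apply_async(kwargs={'naId': '<series-naId>',
#                                       'dest': '/archives/nara/'})
#
# The task creates the series folder, then schedule_page chains itself page by
# page until every object in the series has been scheduled for ingest.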
@app.task(bind=True)
def traversal(self, path, task_name, only_files, include_pattern=None):
    """Traverses the file tree under the given path within the CDMI service
    and applies the named task to every matching path."""
    app.check_traversal_okay(self)
    path = path[:-1] if path.endswith('?') else path
    try:
        res = get_client().ls(path)
        if res.code() in [404, 403]:  # object probably deleted or forbidden
            logger.warn(
                "Dropping task for an object that gives a 403/404: {0}".format(path))
            return
        if not res.ok():
            raise IOError(str(res))
    except IOError as e:
        raise self.retry(exc=e)

    cdmi_info = res.json()
    # logger.debug('got CDMI content: {0}'.format(json.dumps(cdmi_info)))
    if not cdmi_info[u'objectType'] == u'application/cdmi-container':
        logger.error("Cannot traverse a file path: {0}".format(path))
        return

    regex_compiled = None
    if include_pattern is not None:
        regex_compiled = re.compile(include_pattern)

    if only_files:
        for f in cdmi_info[u'children']:
            f = f[:-1] if f.endswith('?') else f
            if not f.endswith('/'):  # container children end in '/'; skip them
                # filter matches with regex
                if include_pattern is None or regex_compiled.match(f) is not None:
                    app.send_task(task_name, args=[str(path) + f], kwargs={})
    else:
        for o in cdmi_info[u'children']:
            o = o[:-1] if o.endswith('?') else o
            # filter matches with regex
            if include_pattern is None or regex_compiled.match(o) is not None:
                app.send_task(task_name, args=[str(path) + o], kwargs={})

    # Recurse into child containers as separate tasks
    for x in cdmi_info[u'children']:
        x = x[:-1] if x.endswith('?') else x
        if x.endswith('/'):
            traversal.s(str(path) + x, task_name, only_files,
                        include_pattern=include_pattern).apply_async()
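# Usage sketch (hedged; the task name and pattern are illustrative, not names
# from this codebase): apply a registered task to every TIFF file under a
# CDMI container, skipping sub-containers themselves:
#
#     traversal.s('/archives/nara/some-series/', 'workers.tasks.postprocess',
#                 True, include_pattern=r'.*\.tif$').apply_async()
#
# Note that include_pattern is applied with re.match, so it is anchored at the
# start of the child name rather than searched anywhere within it.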
@app.task(bind=True)
def ingest_httpdir(self, url=None, dest=None):
    """Ingests the file tree under the given URL, using the NGINX JSON
    directory autoindex."""
    if url is None or dest is None:
        raise Exception("URL and destination path are required")
    app.check_traversal_okay(self)

    # Get the directory listing
    try:
        res = requests.get(url)
        res.raise_for_status()
        dir_info = res.json()
        parsed = urlparse(url)
        dirname = parsed.path.split('/')[-2]
        new_folder_path = dest + dirname + '/'
        logger.debug(u"DIRNAME " + new_folder_path)
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            raise IOError(str(res))
        logger.debug(u"DIRECTORY INGESTED: " + new_folder_path)

        # Separate file entries from directory entries
        file_ingests = []
        folder_ingests = []
        for f in dir_info:
            if 'file' == f['type']:
                s = ingest_httpfile.s(str(url) + f['name'], new_folder_path)
                file_ingests.append(s)
            elif 'directory' == f['type']:
                s = ingest_httpdir.s(url=str(url) + f['name'] + '/',
                                     dest=new_folder_path)
                folder_ingests.append(s)

        # Ingest files in parallel, then recurse into subdirectories; both
        # groups are fire-and-forget rather than joined for counts
        group(file_ingests).apply_async()
        group(folder_ingests).apply_async()
    except IOError as e:
        raise self.retry(exc=e)
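# For reference, NGINX's `autoindex_format json` emits entries of the shape
# that ingest_httpdir iterates over (values illustrative):
#
#     [{"name": "scan-0001.tif", "type": "file", "size": 512000,
#       "mtime": "Tue, 01 Jan 2019 00:00:00 GMT"},
#      {"name": "subfolder", "type": "directory",
#       "mtime": "Tue, 01 Jan 2019 00:00:00 GMT"}]
#
# Only "name" and "type" are consulted above; subdirectories are handled by
# scheduling further ingest_httpdir tasks instead of blocking on results.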
@app.task(bind=True)
def schedule_page(self, newresults, oldresults=0, naId=None, dest=None, offset=0):
    """Schedules ingest of one page of a NARA series' objects, then chains
    itself to schedule the next page until the series is exhausted."""
    app.check_traversal_okay(self)
    OBJECTS_URL = ('https://catalog.archives.gov/api/v1'
                   '?description.fileUnit.parentSeries.naId={0}'
                   '&offset={1}&rows={2}&type=object')
    # As a chord callback, this task receives the previous page's results as
    # its first argument (newresults)
    newcount = len(newresults)
    logger.warn('{0} NARA objects just ingested'.format(newcount))

    # Get the object descriptions for this page
    try:
        objects_json = requests.get(
            OBJECTS_URL.format(naId, offset, NARA_PAGE_SIZE)).json()
        object_descrs = objects_json['opaResponse']['results']['result']
        total_objects = objects_json['opaResponse']['results']['total']
        row_count = len(object_descrs)
    except IOError as e:
        raise self.retry(exc=e)

    # Schedule object ingests for this page
    page_tasks = []
    for obj in object_descrs:
        # logger.warn(json.dumps(obj))
        file_stuff = obj['objects']['object']['file']
        idnum = obj['objects']['object']['@id']
        url = file_stuff['@url']
        mime = file_stuff['@mime']
        name = str(idnum) + '_' + file_stuff['@name']
        s = ingest_httpfile.s(url, dest, name=name, mimetype=mime, metadata=obj)
        page_tasks.append(s)
    page_job = group(page_tasks)
    logger.warn('scheduling {0} ingests at NARA offset {1}'.format(row_count, offset))

    if total_objects > offset + row_count:
        # More pages remain: run this page as a chord whose callback schedules
        # the next page once these ingests complete
        next_page = schedule_page.s(naId=naId, dest=dest,
                                    offset=offset + row_count,
                                    oldresults=oldresults + newcount)
        chord(page_job)(next_page)
        return oldresults + newcount
    else:
        # Last page: just fire the remaining ingests
        page_job.apply_async()
        return oldresults + newcount + len(page_tasks)
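# The paging above uses Celery's chord primitive: a group of header tasks
# whose collected results are handed to a callback once they all finish. A
# minimal sketch of the same shape (names hypothetical):
#
#     header = group(ingest_httpfile.s(u, dest) for u in page_urls)
#     callback = schedule_page.s(naId=naId, dest=dest, offset=next_offset)
#     chord(header)(callback)
#
# Celery prepends the header's result list to the callback's arguments, which
# is why newresults is schedule_page's first positional parameter.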
@app.task(bind=True)
def ingest_property_cards(self, dest=None):
    """Ingests the "property card" file units of NARA series 3725265 into
    Drastic, creating one folder per file unit."""
    if dest is None:
        raise Exception("Destination path is required")
    app.check_traversal_okay(self)
    url = ("https://catalog.archives.gov/api/v1?q=title:\"property card\""
           "&description.fileUnit.parentSeries.naId=3725265"
           "&type=description"
           "&resultFields=naId,description,objects"
           "&rows=200")
    # FIXME Add the login for the NARA catalog API

    # Get the file unit descriptions
    series_json = requests.get(url).json()
    for result in series_json['opaResponse']['results']['result']:
        ingest_tasks = []
        # naId = result['naId']
        title = result['description']['fileUnit']['title']
        new_folder_path = dest + title + '/'

        # Create a folder for this file unit
        res = get_client().mkdir(new_folder_path)
        if not res.ok():
            logger.error('Got an error ({0}) creating folder {1}'
                         .format(str(res), new_folder_path))
            raise IOError(str(res))

        # Schedule an ingest for each object in the file unit
        for obj in result['objects']['object']:
            file_stuff = obj['file']
            idnum = obj['@id']
            url = file_stuff['@url']
            mime = file_stuff['@mime']
            name = str(file_stuff['@name'])
            s = ingest_httpfile.s(url, new_folder_path, name=name,
                                  mimetype=mime, metadata=obj)
            ingest_tasks.append(s)
        group(ingest_tasks).apply_async()
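# Usage sketch (hedged): the catalog query is baked in, so a one-off run only
# needs a destination collection:
#
#     ingest_property_cards.apply_async(kwargs={'dest': '/archives/property-cards/'})
#
# Each matching file unit becomes a folder named after its title, with one
# ingest_httpfile task scheduled per object inside it.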