def main(argv=None): parser = argparse.ArgumentParser( description='Print count of objects for a given collection.') parser.add_argument('path', help="Nuxeo path to collection") parser.add_argument('--pynuxrc', default='~/.pynuxrc-prod', help="rcfile for use with pynux utils") parser.add_argument('--components', action='store_true', help="show counts for object components") if argv is None: argv = parser.parse_args() dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) if not argv.components: return print "about to iterate through objects and get components" component_count = 0 for obj in objects: components = dh.fetch_components(obj) component_count = component_count + len(components) print "finished fetching components. {} found".format(component_count) print "Grand Total: {}".format(object_count + component_count)
def main(argv=None): parser = argparse.ArgumentParser( description='list objects for a given collection.') parser.add_argument('registry_id', help='UCLDC Registry ID') parser.add_argument('--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.registry_id # get nuxeo path nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) print "about to iterate through objects and get components" component_count = 0 all_components = [] for obj in objects: components = dh.fetch_components(obj) all_components.extend(components) print "{} components for {}".format(len(components), obj['uid']) print "finished fetching components. {} found".format(len(all_components)) objects.extend(all_components) total_obj = len(objects) print "Grand Total: {}".format(total_obj) # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks chunks = [ objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE) ] count = 0 for c in chunks: count = count + 1 filepath = 'chunks/{}_{}.txt'.format(registry_id, count) print "Writing file: {}".format(filepath) with open(filepath, 'w') as f: json.dump(c, f, indent=4)
def main(argv=None): parser = argparse.ArgumentParser( description='list objects for a given collection.') parser.add_argument('registry_id', help='UCLDC Registry ID') parser.add_argument( '--pynuxrc', default='~/.pynuxrc-basic', help="rcfile for use with pynux utils") if argv is None: argv = parser.parse_args() registry_id = argv.registry_id # get nuxeo path nxpath = s3stash.s3tools.get_nuxeo_path(registry_id) if nxpath is None: print "No record found for registry_id: {}".format(registry_id) sys.exit() dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc) print "about to fetch objects for path {}".format(dh.path) objects = dh.fetch_objects() object_count = len(objects) print "finished fetching objects. {} found".format(object_count) print "about to iterate through objects and get components" component_count = 0 all_components = [] for obj in objects: components = dh.fetch_components(obj) all_components.extend(components) print "{} components for {}".format(len(components), obj['uid']) print "finished fetching components. {} found".format(len(all_components)) objects.extend(all_components) total_obj = len(objects) print "Grand Total: {}".format(total_obj) # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks chunks = [objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE)] count = 0 for c in chunks: count = count + 1 filepath = 'chunks/{}_{}.txt'.format(registry_id, count) print "Writing file: {}".format(filepath) with open(filepath, 'w') as f: json.dump(c, f, indent=4)
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    collection_ids is a ';'-separated string of registry collection ids;
    one rq job is queued per parent object and per component. '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')
    # (dropped a pointless identity comprehension around split(';'))
    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)
        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            queue_deep_harvest_path(
                config['redis_host'],
                config['redis_port'],
                config['redis_password'],
                config['redis_connect_timeout'],
                rq_queue=rq_queue,
                path=obj['path'],
                replace=replace,
                timeout=timeout)
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                queue_deep_harvest_path(
                    config['redis_host'],
                    config['redis_port'],
                    config['redis_password'],
                    config['redis_connect_timeout'],
                    rq_queue=rq_queue,
                    path=c['path'],
                    replace=replace,
                    timeout=timeout)
    log_handler.pop_application()
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    collection_ids is a ';'-separated string of registry collection ids;
    one rq job is queued per parent object and per component. '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')
    # (dropped a pointless identity comprehension around split(';'))
    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)
        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            queue_deep_harvest_path(
                config['redis_host'],
                config['redis_port'],
                config['redis_password'],
                config['redis_connect_timeout'],
                rq_queue=rq_queue,
                path=obj['path'],
                replace=replace,
                timeout=timeout)
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                queue_deep_harvest_path(
                    config['redis_host'],
                    config['redis_port'],
                    config['redis_password'],
                    config['redis_connect_timeout'],
                    rq_queue=rq_queue,
                    path=c['path'],
                    replace=replace,
                    timeout=timeout)
    log_handler.pop_application()
def main(argv=None): ''' stash Nuxeo image files on s3 ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, create jp2 versions of image ' 'files and stash in S3.') parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument('--bucket', default='ucldc-private-files/jp2000', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help='AWS region') parser.add_argument('--replace', action="store_true", help="replace file on s3 if it already exists") parser.add_argument('--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_image = len([ key for key, value in report.iteritems() if not value['is_image']['is_image'] ]) print "not image:\t{}".format(not_image) unrecognized = len([ key for key, value in report.iteritems() if not value['precheck']['pass'] ]) print "not 
convertible:\t{}".format(unrecognized) converted = len( [key for key, value in report.iteritems() if value['converted']]) already_stashed = len([ key for key, value in report.iteritems() if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "converted:\t{}".format(converted) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "stashed:\t{}".format(stashed) print "\nDone."
def main(argv=None): ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a collection ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, stash files (pdf, txt, etc) in S3.') parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument( '--bucket', default='ucldc-nuxeo-ref-media', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help="aws region") parser.add_argument( '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") parser.add_argument( '--replace', action="store_true", help="replace file on s3 if it already exists") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashFile(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashFile(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_file = len([ key for key, value in report.iteritems() if not value['calisphere_type'] in VALID_CALISPHERE_TYPES ]) print "not type `file`, `audio` or `video`:\t{}".format(not_file) already_stashed = len([ key for key, value in report.iteritems() 
if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "already stashed:\t{}".format(already_stashed) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "(re)stashed:\t{}".format(stashed) print "\nDone."
def main(argv=None): parser = argparse.ArgumentParser( description= 'Create ATOM feed for a given Nuxeo folder for Merritt harvesting') parser.add_argument("collection", help="UCLDC Registry Collection ID") parser.add_argument("--pynuxrc", help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection_id = argv.collection if argv.pynuxrc: ma = MerrittAtom(collection_id, argv.pynuxrc) else: ma = MerrittAtom(collection_id) print "atom_file: {}".format(ma.atom_file) print "ma.path: {}".format(ma.path) if argv.pynuxrc: dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc) else: dh = DeepHarvestNuxeo(ma.path, '') print "Nuxeo path: {}".format(ma.path) print "Fetching Nuxeo docs. This could take a while if collection is large..." documents = dh.fetch_objects() # create root root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP) # add entries for document in documents: nxid = document['uid'] print "working on document: {} {}".format(nxid, document['path']) # parent entry = ma._construct_entry(nxid, True) print "inserting entry for parent object {} {}".format( nxid, document['path']) root.insert(0, entry) # children component_entries = [ ma._construct_entry(c['uid'], False) for c in dh.fetch_components(document) ] for ce in component_entries: print "inserting entry for component: {} {}".format( nxid, document['path']) root.insert(0, ce) # add header info print "Adding header info to xml tree" ma._add_merritt_id(root, ma.merritt_id) ma._add_paging_info(root) ma._add_collection_alt_link(root, ma.path) ma._add_atom_elements(root) ma._add_feed_updated(root, ma.last_update) ma._write_feed(root) print "Feed written to file: {}".format(ma.atom_file) ma._s3_stash() print "Feed stashed on s3: {}".format(ma.s3_url)
class Stash(object):
    ''' stash various files on s3 for a Nuxeo collection
    in preparation for harvesting into Calisphere

    Components for each parent object are fetched once in __init__ and
    cached so the per-filetype stash passes can reuse them. '''

    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        self.logger = logging.getLogger(__name__)
        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace
        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)
        self.objects = self.fetch_objects()
        # cache of parent uid -> list of component docs
        self.components = {}
        for obj in self.objects:
            self.components[obj['uid']] = self.dh.fetch_components(obj)

    def fetch_objects(self):
        ''' fetch objects to process '''
        return self.dh.fetch_objects()

    def _stash_collection(self, stash_class, bucket, region, label):
        ''' stash every parent object and its components with
        `stash_class`; returns a report dict keyed by nuxeo uid.
        (shared body of images()/files()/thumbnails(), which were three
        copies of the same loop) '''
        report = {}
        for obj in self.objects:
            nxstash = stash_class(obj['path'], bucket, region, self.pynuxrc,
                                  self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing {} {}'.format(label, c['path']))
                nxstash = stash_class(c['path'], bucket, region,
                                      self.pynuxrc, self.replace)
                report[nxstash.uid] = nxstash.nxstashref()
        return report

    def images(self):
        ''' stash Nuxeo image files on s3 '''
        return self._stash_collection(NuxeoStashImage, IMAGE_BUCKET,
                                      IMAGE_REGION, 'image')

    def files(self):
        ''' stash Nuxeo files of type 'file', 'audio', or 'video'
        for a collection '''
        return self._stash_collection(NuxeoStashFile, FILE_BUCKET,
                                      FILE_REGION, 'file')

    def thumbnails(self):
        ''' stash thumbnail images for Nuxeo files of type 'file',
        'audio', or 'video' for a collection '''
        return self._stash_collection(NuxeoStashThumb, THUMB_BUCKET,
                                      THUMB_REGION, 'thumb')

    def media_json(self):
        ''' create and stash media.json files for a nuxeo collection
        (parent objects only -- components are folded into the parent's
        media.json by NuxeoStashMediaJson) '''
        report = {}
        for obj in self.objects:
            self.logger.info('Stashing media json {}'.format(obj['path']))
            nxstash = NuxeoStashMediaJson(obj['path'], MEDIAJSON_BUCKET,
                                          MEDIAJSON_REGION, self.pynuxrc,
                                          self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
        return report
class MerrittAtom():
    ''' build an ATOM feed of a Nuxeo collection for Merritt
    preservation harvesting, write it to disk, and (optionally) stash
    the feed file on s3 '''

    def __init__(self, collection_id, **kwargs):
        '''
        kwargs: bucket, pynuxrc, dir, nostash, nuxeo_path, merritt_id
        Raises ValueError if the collection has no Merritt ID or no
        feed filename can be derived.
        '''
        self.logger = logging.getLogger(__name__)
        self.collection_id = collection_id

        if 'bucket' in kwargs:
            self.bucket = kwargs['bucket']
        else:
            self.bucket = BUCKET

        if 'pynuxrc' in kwargs:
            pynuxrc = kwargs['pynuxrc']
        else:
            pynuxrc = None

        if 'dir' in kwargs:
            self.dir = kwargs['dir']
        else:
            self.dir = '.'

        if 'nostash' in kwargs:
            self.nostash = kwargs['nostash']
        else:
            self.nostash = False

        self.logger.info("collection_id: {}".format(self.collection_id))

        if 'nuxeo_path' in kwargs:
            self.path = kwargs['nuxeo_path']
        else:
            self.path = self._get_nuxeo_path()

        if 'merritt_id' in kwargs:
            self.merritt_id = kwargs['merritt_id']
        else:
            self.merritt_id = self._get_merritt_id()

        if not self.merritt_id:
            raise ValueError("No Merritt ID for this collection")

        self.feed_base_url = 'https://s3.amazonaws.com/{}/'.format(
            self.bucket)

        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc), 'r'))
            self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=pynuxrc)
        elif not (pynuxrc) and os.path.isfile(expanduser('~/.pynuxrc')):
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'), 'r'))
            self.dh = DeepHarvestNuxeo(self.path, '')
        # NOTE(review): if neither branch above is taken, self.nx/self.dh
        # are never set and later calls raise AttributeError -- confirm
        # callers always provide a pynuxrc or a ~/.pynuxrc file

        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError(
                "Could not create filename for ATOM feed based on collection id: {}".
                format(self.collection_id))

        self.s3_url = "{}{}".format(self.feed_base_url, self.atom_file)
        self.atom_filepath = os.path.join(self.dir, self.atom_file)

    def _get_merritt_id(self):
        ''' given collection registry ID, get corresponding Merritt
        collection ID '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE,
                                                    self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        merritt_id = md['merritt_id']
        return merritt_id

    def _get_nuxeo_path(self):
        ''' given ucldc registry collection ID, get Nuxeo path for
        collection '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE,
                                                    self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        nuxeo_path = md['harvest_extra_data']
        return nuxeo_path

    def _get_filename(self, collection_id):
        ''' given Collection ID, get a friendly filename for the ATOM
        feed '''
        filename = 'ucldc_collection_{}.atom'.format(collection_id)
        return filename

    def _extract_nx_metadata(self, raw_metadata):
        ''' extract Nuxeo metadata we want to post to the ATOM feed '''
        metadata = {}

        # last modified (of the whole bundle, parent + components)
        metadata['lastModified'] = raw_metadata['bundle_lastModified']

        # creator
        creators = raw_metadata['properties']['ucldc_schema:creator']
        metadata['creator'] = [creator['name'] for creator in creators]

        # title
        metadata['title'] = raw_metadata['title']

        # date -- first date only, None if the doc has none
        dates = raw_metadata['properties']['ucldc_schema:date']
        dates = [date['date'] for date in dates]
        metadata['date'] = dates[0] if dates else None

        # nuxeo id
        metadata['id'] = raw_metadata['properties']['ucldc_schema:identifier']

        # nuxeo collection -- first entry only, None if empty
        metadata['collection'] = raw_metadata['properties'][
            'ucldc_schema:collection'][0] if raw_metadata['properties'][
                'ucldc_schema:collection'] else None

        return metadata

    def _construct_entry_bundled(self, doc):
        ''' construct ATOM feed entry element for a given nuxeo doc,
        including files for any component objects '''
        uid = doc['uid']

        # parent
        nx_metadata = self._extract_nx_metadata(doc)
        entry = etree.Element(etree.QName(ATOM_NS, "entry"))
        entry = self._populate_entry(entry, nx_metadata, uid, True)

        # insert component md
        for c in self.dh.fetch_components(doc):
            self._insert_full_md_link(entry, c['uid'])
            self._insert_main_content_link(entry, c['uid'])
            self._insert_aux_links(entry, c['uid'])

        return entry

    def _add_atom_elements(self, doc):
        ''' add atom feed elements to document '''
        # recommended ATOM feed elements
        feed_author = etree.Element(etree.QName(ATOM_NS, "author"))
        feed_author.text = "UC Libraries Digital Collection"
        doc.insert(0, feed_author)

        # required ATOM feed elements
        feed_title = etree.Element(etree.QName(ATOM_NS, "title"))
        feed_title.text = "UCLDC Metadata Feed"  # FIXME get campus name from registry API?
        doc.insert(0, feed_title)

        feed_id = etree.Element(etree.QName(ATOM_NS, "id"))
        feed_id.text = self.s3_url
        doc.insert(0, feed_id)

        return doc

    def _add_feed_updated(self, doc, updated):
        ''' add feed updated '''
        feed_updated = etree.Element(etree.QName(ATOM_NS, "updated"))
        feed_updated.text = updated
        doc.insert(0, feed_updated)

    def _add_collection_alt_link(self, doc, path):
        ''' add elements related to Nuxeo collection info to document '''
        collection_metadata = self.nx.get_metadata(path=path)
        collection_title = collection_metadata['title']
        collection_uid = collection_metadata['uid']
        collection_uri = self.get_object_view_url(collection_uid)

        feed_link_alt = etree.Element(
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=collection_uri,
            title=collection_title)
        doc.insert(0, feed_link_alt)

        return doc

    def _add_paging_info(self, doc):
        ''' add rel links for paging '''
        # this is just dumb for now -- single-page feed, so self/first/
        # last all point at the same url
        last_link = etree.Element(
            etree.QName(ATOM_NS, "link"), rel="last", href=self.s3_url)
        doc.insert(0, last_link)

        first_link = etree.Element(
            etree.QName(ATOM_NS, "link"), rel="first", href=self.s3_url)
        doc.insert(0, first_link)

        self_link = etree.Element(
            etree.QName(ATOM_NS, "link"), rel="self", href=self.s3_url)
        doc.insert(0, self_link)

    def _add_merritt_id(self, doc, merritt_collection_id):
        ''' add Merritt ID '''
        merritt_id = etree.Element(
            etree.QName(ATOM_NS, "merritt_collection_id"))
        merritt_id.text = merritt_collection_id
        doc.insert(0, merritt_id)

    def _populate_entry(self, entry, metadata, nxid, is_parent):
        ''' get <entry> element for a given set of object metadata '''
        # atom id (URI)
        nuxeo_object_view_url = self.get_object_view_url(nxid)
        atom_id = etree.SubElement(entry, etree.QName(ATOM_NS, "id"))
        atom_id.text = nuxeo_object_view_url

        # atom title
        atom_title = etree.SubElement(entry, etree.QName(ATOM_NS, "title"))
        atom_title.text = metadata["title"]

        # atom updated
        atom_updated = etree.SubElement(entry,
                                        etree.QName(ATOM_NS, "updated"))
        atom_updated.text = metadata['lastModified'].isoformat()

        # atom author
        atom_author = etree.SubElement(entry, etree.QName(ATOM_NS, "author"))
        atom_author.text = "UC Libraries Digital Collection"

        # metadata file link
        self._insert_full_md_link(entry, nxid)

        # media json link (parents only)
        if is_parent:
            self._insert_media_json_link(entry, nxid)

        # main content file link
        self._insert_main_content_link(entry, nxid)

        # auxiliary file link(s)
        self._insert_aux_links(entry, nxid)

        # dc creator
        for creator_name in metadata['creator']:
            dc_creator = etree.SubElement(entry, etree.QName(DC_NS, "creator"))
            dc_creator.text = creator_name

        # dc title
        dc_title = etree.SubElement(entry, etree.QName(DC_NS, "title"))
        dc_title.text = metadata['title']

        # dc date
        dc_date = etree.SubElement(entry, etree.QName(DC_NS, "date"))
        dc_date.text = metadata['date']

        # dc identifier (a.k.a. local identifier) - Nuxeo ID
        nuxeo_identifier = etree.SubElement(entry,
                                            etree.QName(DC_NS, "identifier"))
        nuxeo_identifier.text = nxid

        # UCLDC identifier (a.k.a. local identifier) -
        # ucldc_schema:identifier -- this will be the ARK if we have it
        if metadata['id']:
            ucldc_identifier = etree.SubElement(entry,
                                                etree.QName(NX_NS,
                                                            "identifier"))
            ucldc_identifier.text = metadata['id']

        # UCLDC collection identifier
        ucldc_collection_id = etree.SubElement(entry,
                                               etree.QName(NX_NS,
                                                           "collection"))
        ucldc_collection_id.text = metadata['collection']

        return entry

    def _insert_media_json_link(self, entry, uid):
        ''' add <link> to the object's media.json (deep harvest md) '''
        media_json_url = self.get_media_json_url(uid)
        link_media_json = etree.SubElement(
            entry,
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=media_json_url,
            type="application/json",
            title="Deep Harvest metadata for this object")

    def _insert_main_content_link(self, entry, uid):
        ''' add <link> (plus MD5 checksum) for the object's main
        content file, if it has one '''
        nx_metadata = self.nx.get_metadata(uid=uid)
        nuxeo_file_download_url = self.get_object_download_url(nx_metadata)
        checksum = self.get_nuxeo_file_checksum(nx_metadata)
        if nuxeo_file_download_url:
            main_content_link = etree.SubElement(
                entry,
                etree.QName(ATOM_NS, "link"),
                rel="alternate",
                href=nuxeo_file_download_url,
                title="Main content file")  # FIXME add content_type
            if checksum:
                checksum_element = etree.SubElement(
                    main_content_link,
                    etree.QName(OPENSEARCH_NS, "checksum"),
                    algorithm="MD5")
                checksum_element.text = checksum

    def _insert_aux_links(self, entry, uid):
        ''' add <link> elements (plus MD5 checksums) for any auxiliary
        files attached to the object '''
        nx_metadata = self.nx.get_metadata(uid=uid)
        aux_files = self.get_aux_files(nx_metadata)
        for af in aux_files:
            link_aux_file = etree.SubElement(
                entry,
                etree.QName(ATOM_NS, "link"),
                rel="alternate",
                href=af['url'],
                title="Auxiliary file")
            if af['checksum']:
                checksum_element = etree.SubElement(
                    link_aux_file,
                    etree.QName(OPENSEARCH_NS, "checksum"),
                    algorithm="MD5")
                checksum_element.text = af['checksum']

    def _insert_full_md_link(self, entry, uid):
        ''' add <link> to the full Nuxeo metadata for the object '''
        full_metadata_url = self.get_full_metadata(uid)
        link_md = etree.SubElement(
            entry,
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=full_metadata_url,
            type="application/xml",
            title="Full metadata for this object from Nuxeo")

    def _write_feed(self, doc):
        ''' publish feed '''
        feed = etree.ElementTree(doc)
        feed_string = etree.tostring(
            feed, pretty_print=True, encoding='utf-8', xml_declaration=True)
        with open(self.atom_filepath, "w") as f:
            f.write(feed_string)

    def _s3_get_feed(self):
        """ Retrieve ATOM feed file from S3. Return as ElementTree object """
        bucketpath = self.bucket.strip("/")
        bucketbase = self.bucket.split("/")[0]
        keyparts = bucketpath.split("/")[1:]
        keyparts.append(self.atom_file)
        keypath = '/'.join(keyparts)
        s3 = boto3.client('s3')
        response = s3.get_object(Bucket=bucketbase, Key=keypath)
        contents = response['Body'].read()
        return etree.fromstring(contents)

    def _s3_stash(self):
        """ Stash file in S3 bucket. """
        bucketpath = self.bucket.strip("/")
        bucketbase = self.bucket.split("/")[0]
        keyparts = bucketpath.split("/")[1:]
        keyparts.append(self.atom_file)
        keypath = '/'.join(keyparts)
        s3 = boto3.client('s3')
        with open(self.atom_filepath, 'r') as f:
            s3.upload_fileobj(f, bucketbase, keypath)

    def get_object_view_url(self, nuxeo_id):
        """ Get object view URL """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = "{}://{}/Nuxeo/nxdoc/default/{}/view_documents".format(
            parts.scheme, parts.netloc, nuxeo_id)
        return url

    def get_full_metadata(self, nuxeo_id):
        """ Get full metadata via Nuxeo API """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = '{}://{}/Merritt/{}.xml'.format(parts.scheme, parts.netloc,
                                              nuxeo_id)
        return url

    def get_object_download_url(self, metadata):
        ''' given the full metadata for an object, get file download
        url (None if the object has no attached file) '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'"
            )

        if file_content is None:
            return None
        else:
            url = file_content['data']

        # make available via basic auth
        url = url.replace('/nuxeo/', '/Nuxeo/')

        return url

    def get_media_json_url(self, nuxeo_id):
        """ Get media.json (deep harvest) url """
        # https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/002130a5-e171-461b-a41b-28ab46af9652-media.json
        url = "https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/{}-media.json".format(
            nuxeo_id)
        return url

    def get_nuxeo_file_checksum(self, metadata):
        ''' get md5 checksum for nuxeo file (None if no attached file) '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'"
            )

        if file_content is None:
            return None
        else:
            checksum = file_content['digest']

        return checksum

    def get_aux_files(self, metadata):
        ''' get auxiliary file urls (attachments and extra files), each
        as a dict with 'url' and optional 'checksum' '''
        all_md = []

        # get any "attachment" files
        if metadata['properties']['files:files']:
            attachments = metadata['properties']['files:files']
            for attachment in attachments:
                md = {}
                if attachment['file'] and attachment['file']['data']:
                    url = attachment['file']['data']
                    url = url.replace('/nuxeo/', '/Nuxeo/')
                    md['url'] = url
                if attachment['file'] and attachment['file']['digest']:
                    md['checksum'] = attachment['file']['digest']
                if md:
                    all_md.append(md)

        # get any "extra_file" files
        if metadata['properties']['extra_files:file']:
            for extra_file in metadata['properties']['extra_files:file']:
                md = {}
                if extra_file['blob'] and extra_file['blob']['data']:
                    url = extra_file['blob']['data']
                    url = url.replace('/nuxeo/', '/Nuxeo/')
                    md['url'] = url
                if extra_file['blob'] and extra_file['blob']['digest']:
                    md['checksum'] = extra_file['blob']['digest']
                if md:
                    all_md.append(md)

        return all_md

    def _bundle_docs(self, docs):
        ''' given a dict of parent level nuxeo docs, fetch any
        components and also figure out when any part of the object was
        most recently modified/added '''
        for doc in docs:
            last_mod_str = doc['lastModified']
            overall_mod_datetime = parse(last_mod_str)

            # FIX: components were previously reset to [] and the loop
            # below iterated that empty list, so components were never
            # fetched and their modification times were ignored -- now
            # actually fetch them as the docstring promises
            doc['components'] = self.dh.fetch_components(doc)

            for c in doc['components']:
                mod_str = c['lastModified']
                mod_datetime = parse(mod_str)
                if mod_datetime > overall_mod_datetime:
                    overall_mod_datetime = mod_datetime

            doc['bundle_lastModified'] = overall_mod_datetime

        return docs

    def process_feed(self):
        ''' create feed for collection and stash on s3 '''
        self.logger.info("atom_file: {}".format(self.atom_file))
        self.logger.info("Nuxeo path: {}".format(self.path))
        self.logger.info(
            "Fetching Nuxeo docs. This could take a while if collection is large..."
        )

        parent_docs = self.dh.fetch_objects()
        bundled_docs = self._bundle_docs(parent_docs)
        bundled_docs.sort(key=itemgetter('bundle_lastModified'))

        # create root
        root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

        # add entries
        for document in bundled_docs:
            nxid = document['uid']
            self.logger.info("working on document: {} {}".format(
                nxid, document['path']))

            # object, bundled into one <entry> if complex
            entry = self._construct_entry_bundled(document)
            self.logger.info("inserting entry for object {} {}".format(
                nxid, document['path']))
            root.insert(0, entry)

        # add header info
        logging.info("Adding header info to xml tree")
        self._add_merritt_id(root, self.merritt_id)
        self._add_paging_info(root)
        self._add_collection_alt_link(root, self.path)
        self._add_atom_elements(root)
        self._add_feed_updated(root,
                               datetime.now(dateutil.tz.tzutc()).isoformat())

        self._write_feed(root)
        logging.info("Feed written to file: {}".format(self.atom_filepath))

        if not self.nostash:
            self._s3_stash()
            self.logger.info("Feed stashed on s3: {}".format(self.s3_url))
class NuxeoStashMediaJson(NuxeoStashRef):
    ''' create and stash media.json file for a nuxeo object '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=True,
                 **kwargs):
        super(NuxeoStashMediaJson, self).__init__(path, bucket, region,
                                                  pynuxrc, replace, **kwargs)

        self.dh = DeepHarvestNuxeo(
            self.path, self.bucket, pynuxrc=self.pynuxrc)
        self.mj = MediaJson()

        self.filename = FILENAME_FORMAT.format(self.uid)
        self.filepath = os.path.join(self.tmp_dir, self.filename)
        self._update_report('filename', self.filename)
        self._update_report('filepath', self.filepath)

    def nxstashref(self):
        ''' entry point used by the stash drivers '''
        return self.nxstash_mediajson()

    def nxstash_mediajson(self):
        ''' create media.json file for object and stash on s3 '''
        self._update_report('stashed', False)

        # extract and transform metadata for parent obj and any components
        parent_md = self._get_parent_metadata(self.metadata)
        component_md = [
            self._get_component_metadata(c)
            for c in self.dh.fetch_components(self.metadata)
        ]

        # create media.json file
        media_json = self.mj.create_media_json(parent_md, component_md)
        self._write_file(media_json, self.filepath)

        # stash media.json file on s3
        stashed, s3_report = s3stash.s3tools.s3stash(
            self.filepath, self.bucket, self.filename, self.region,
            'application/json', self.replace)
        self._update_report('s3_stash', s3_report)
        self._update_report('stashed', stashed)

        self._remove_tmp()
        return self.report

    def _get_parent_metadata(self, obj):
        ''' assemble top-level (parent) object metadata '''
        metadata = {}
        metadata['label'] = obj['title']

        # only provide id, href, format if Nuxeo Document has file attached
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        if self.dh.has_file(full_metadata):
            metadata['id'] = obj['uid']
            metadata['href'] = self.dh.get_object_download_url(full_metadata)
            metadata['format'] = self.dh.get_calisphere_object_type(
                obj['type'])
            if metadata['format'] == 'video':
                metadata['dimensions'] = self.dh.get_video_dimensions(
                    full_metadata)

        return metadata

    def _get_component_metadata(self, obj):
        ''' assemble component object metadata '''
        metadata = {}
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        metadata['label'] = obj['title']
        metadata['id'] = obj['uid']
        metadata['href'] = self.dh.get_object_download_url(full_metadata)

        # extract additional ucldc metadata from 'properties' element
        ucldc_md = self._get_ucldc_schema_properties(full_metadata)
        for key, value in ucldc_md.iteritems():
            metadata[key] = value

        # map 'type'
        metadata['format'] = self.dh.get_calisphere_object_type(obj['type'])

        return metadata

    def _get_ucldc_schema_properties(self, metadata):
        ''' get additional metadata as mapped by harvester
        (sourceResource fields, overlaid with originalRecord fields) '''
        # (dropped a dead `properties = {}` init that was immediately
        # overwritten by the mapper output)
        mapper = UCLDCNuxeoMapper(metadata)
        mapper.map_original_record()
        mapper.map_source_resource()
        properties = mapper.mapped_data['sourceResource']
        properties.update(mapper.mapped_data['originalRecord'])
        return properties

    def _write_file(self, content_dict, filepath):
        """ convert dict to json and write to file """
        content_json = json.dumps(
            content_dict, indent=4, separators=(',', ': '), sort_keys=False)
        with open(filepath, 'wb') as f:
            f.write(content_json)
            f.flush()
def main(argv=None): ''' stash Nuxeo image files on s3 ''' parser = argparse.ArgumentParser( description='For Nuxeo collection, create jp2 versions of image ' 'files and stash in S3.' ) parser.add_argument('path', help="Nuxeo document path to collection") parser.add_argument( '--bucket', default='ucldc-private-files/jp2000', help="S3 bucket name") parser.add_argument('--region', default='us-west-2', help='AWS region') parser.add_argument( '--replace', action="store_true", help="replace file on s3 if it already exists") parser.add_argument( '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux") if argv is None: argv = parser.parse_args() collection = argv.path.split('/')[-1] # logging logfile = 'logs/{}.log'.format(collection) print "LOG:\t{}".format(logfile) logging.basicConfig( filename=logfile, level=logging.INFO, format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') logger = logging.getLogger(__name__) dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc) report = {} objects = dh.fetch_objects() for obj in objects: nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() for c in dh.fetch_components(obj): nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region, argv.pynuxrc, argv.replace) report[nxstash.uid] = nxstash.nxstashref() # output report to json file reportfile = "reports/{}.json".format(collection) with open(reportfile, 'w') as f: json.dump(report, f, sort_keys=True, indent=4) # parse report to give basic stats report = json.load(open(reportfile)) print "REPORT:\t{}".format(reportfile) print "SUMMARY:" print "processed:\t{}".format(len(report)) not_image = len([ key for key, value in report.iteritems() if not value['is_image']['is_image'] ]) print "not image:\t{}".format(not_image) unrecognized = len([ key for key, value in report.iteritems() if not value['precheck']['pass'] ]) print "not 
convertible:\t{}".format(unrecognized) converted = len( [key for key, value in report.iteritems() if value['converted']]) already_stashed = len([ key for key, value in report.iteritems() if 'already_s3_stashed' in value.keys() and value['already_s3_stashed'] ]) print "converted:\t{}".format(converted) stashed = len( [key for key, value in report.iteritems() if value['stashed']]) print "stashed:\t{}".format(stashed) print "\nDone."