def main(argv=None):
    parser = argparse.ArgumentParser(
        description='print differences between Nuxeo and CouchDB for a '
        'given collection')
    parser.add_argument('regid', help="Collection Registry ID")
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-basic',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.regid

    couch = get_couch_objects(registry_id)
    print "couch has {} objects".format(len(couch))

    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    for obj in dh.fetch_objects():
        if obj['uid'] not in couch:
            print obj['uid']
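# `get_couch_objects` is not defined in this excerpt. A minimal sketch is
# given below, assuming it returns the set of Nuxeo uids already recorded in
# CouchDB for the collection. The view URL and the '<registry_id>--<uid>'
# row-id convention are borrowed from the missing-objects report script in
# this same repo; treat this as an illustration, not the canonical helper.
def get_couch_objects(registry_id):
    ''' return set of Nuxeo uids recorded in couchdb for a collection '''
    view = ("https://harvest-stg.cdlib.org/couchdb/ucldc/_design/"
            "all_provider_docs/_view/by_provider_name?key=%22{}%22").format(
                registry_id)
    res = requests.get(view, verify=False)  # FIXME we want to verify
    res.raise_for_status()
    rows = json.loads(res.content)['rows']
    delimiter = "{}--".format(registry_id)
    return set(row['id'].split(delimiter)[1] for row in rows)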
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='List docs updated since a given date for a collection.')
    parser.add_argument('path', help="Nuxeo path to collection")
    parser.add_argument(
        'since_date',
        help="Script will list docs updated since midnight on this date, "
        "GMT. Format YYYY-MM-DD",
        type=valid_date)
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc)
    print "about to fetch docs for path {}".format(dh.path)
    objects = dh.fetch_objects()
    for obj in objects:
        last_mod_str = obj['lastModified'][:10]
        last_mod_date = parse(last_mod_str)
        if last_mod_date > argv.since_date:
            print last_mod_str, obj['path']
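# `valid_date` is referenced as an argparse type above but not defined in
# this excerpt. A plausible sketch, assuming `from datetime import datetime`
# and that it parses YYYY-MM-DD into a datetime (so it compares cleanly
# against the parsed lastModified dates):
def valid_date(s):
    ''' argparse type: parse YYYY-MM-DD into a datetime, or complain '''
    try:
        return datetime.strptime(s, '%Y-%m-%d')
    except ValueError:
        raise argparse.ArgumentTypeError(
            "Not a valid date: '{}'. Expected format YYYY-MM-DD".format(s))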
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Print count of objects for a given collection.')
    parser.add_argument('path', help="Nuxeo path to collection")
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-prod',
        help="rcfile for use with pynux utils")
    parser.add_argument(
        '--components',
        action='store_true',
        help="show counts for object components")
    if argv is None:
        argv = parser.parse_args()

    dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    if not argv.components:
        return

    print "about to iterate through objects and get components"
    component_count = 0
    for obj in objects:
        components = dh.fetch_components(obj)
        component_count += len(components)
    print "finished fetching components. {} found".format(component_count)
    print "Grand Total: {}".format(object_count + component_count)
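# These reporting scripts are all written as main(argv=None) functions; the
# usual entry-point boilerplate (assumed here, since the excerpts omit it)
# would be:
if __name__ == '__main__':
    sys.exit(main())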
def main(argv=None):
    ''' create and stash media.json files for a nuxeo collection '''
    parser = argparse.ArgumentParser(
        description='Create and stash media.json '
        'files for a nuxeo collection')
    parser.add_argument("path", help="Nuxeo document path")
    parser.add_argument(
        "--bucket",
        default="static.ucldc.cdlib.org/media_json",
        help="S3 bucket where media.json files will be stashed")
    parser.add_argument('--region', default='us-east-1', help="aws region")
    parser.add_argument(
        "--pynuxrc", default='~/.pynuxrc', help="rc file for use by pynux")
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/mediajson-{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashMediaJson(obj['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/mediajson-{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='list objects for a given collection.')
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-basic',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    print "about to iterate through objects and get components"
    all_components = []
    for obj in objects:
        components = dh.fetch_components(obj)
        all_components.extend(components)
        print "{} components for {}".format(len(components), obj['uid'])
    print "finished fetching components. {} found".format(len(all_components))

    objects.extend(all_components)
    print "Grand Total: {}".format(len(objects))

    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    chunks = [
        objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE)
    ]

    for count, c in enumerate(chunks, start=1):
        filepath = 'chunks/{}_{}.txt'.format(registry_id, count)
        print "Writing file: {}".format(filepath)
        with open(filepath, 'w') as f:
            json.dump(c, f, indent=4)
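# Each chunk file written above is just a JSON list of object records, so a
# downstream worker can pick one up with plain json.load. A minimal consumer
# sketch (the chunk filename below is hypothetical):
def read_chunk(filepath):
    ''' load one chunk file and return the list of object records '''
    with open(filepath) as f:
        return json.load(f)

# e.g. for obj in read_chunk('chunks/26098_1.txt'): print obj['uid']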
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Create ATOM feed for a given Nuxeo folder for Merritt '
        'harvesting')
    parser.add_argument("collection", help="UCLDC Registry Collection ID")
    parser.add_argument("--pynuxrc", help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection_id = argv.collection

    if argv.pynuxrc:
        ma = MerrittAtom(collection_id, pynuxrc=argv.pynuxrc)
    else:
        ma = MerrittAtom(collection_id)
    print "atom_file: {}".format(ma.atom_file)

    if argv.pynuxrc:
        dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc)
    else:
        dh = DeepHarvestNuxeo(ma.path, '')

    print "Fetching Nuxeo docs. This could take a while if collection is large..."
    documents = dh.fetch_objects()
    # TODO: fetch components also

    # create root
    root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

    # add entries
    for document in documents:
        nxid = document['uid']
        print "constructing entry for {} {}".format(nxid, document['path'])
        nx_metadata = ma._extract_nx_metadata(nxid)
        entry = etree.Element(etree.QName(ATOM_NS, "entry"))
        entry = ma._populate_entry(entry, nx_metadata, nxid)
        root.insert(0, entry)

    # add header info
    print "Adding header info to xml tree"
    ma._add_merritt_id(root, ma.merritt_id)
    ma._add_paging_info(root)
    ma._add_collection_alt_link(root, ma.path)
    ma._add_atom_elements(root)
    ma._add_feed_updated(root, ma.last_update)

    ma._publish_feed(root)
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker'''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')
    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)
        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            queue_deep_harvest_path(
                config['redis_host'],
                config['redis_port'],
                config['redis_password'],
                config['redis_connect_timeout'],
                rq_queue=rq_queue,
                path=obj['path'],
                replace=replace,
                timeout=timeout)
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                queue_deep_harvest_path(
                    config['redis_host'],
                    config['redis_port'],
                    config['redis_password'],
                    config['redis_connect_timeout'],
                    rq_queue=rq_queue,
                    path=c['path'],
                    replace=replace,
                    timeout=timeout)
    log_handler.pop_application()
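# `queue_deep_harvest_path` is defined elsewhere in this repo. As an
# illustration only, a minimal sketch of how such a helper might enqueue work
# with python-rq; the worker function's dotted name below is hypothetical,
# and the redis/rq kwargs reflect the redis-py and pre-1.0 rq APIs:
def queue_deep_harvest_path(redis_host, redis_port, redis_password,
                            redis_connect_timeout, rq_queue, path, replace,
                            timeout):
    ''' enqueue one deep-harvest job on the named rq queue '''
    from redis import Redis
    from rq import Queue
    conn = Redis(
        host=redis_host,
        port=redis_port,
        password=redis_password,
        socket_connect_timeout=redis_connect_timeout)
    q = Queue(rq_queue, connection=conn)
    q.enqueue(
        'deep_harvester.jobs.deep_harvest_path',  # hypothetical dotted name
        path,
        replace,
        timeout=timeout)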
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Print info on objects missing from couchdb for Nuxeo '
        'collection')
    parser.add_argument('id', help='Collection registry ID')
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.id
    print "Registry ID: {}".format(registry_id)
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    print "Nuxeo path: {}".format(nxpath)

    # get couchdb data
    view = ("https://harvest-stg.cdlib.org/couchdb/ucldc/_design/"
            "all_provider_docs/_view/by_provider_name?key=%22{}%22").format(
                registry_id)
    print view
    res = requests.get(view, verify=False)  # FIXME we want to verify
    res.raise_for_status()
    couchdata = json.loads(res.content)
    rows = couchdata['rows']
    delimiter = "{}--".format(registry_id)
    couch_uids = [row['id'].split(delimiter)[1] for row in rows]
    couch_count = len(couch_uids)
    print "Total rows in couchdb: {}".format(couch_count)

    # get nuxeo data
    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    nx_count = len(objects)
    print "Total objects in Nuxeo: {}".format(nx_count)

    for obj in objects:
        if obj['uid'] not in couch_uids:
            print obj['uid'], obj['path']
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='list objects for a given collection where nuxeo doc '
        'type is image but file type is pdf')
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-basic',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    convert = Convert()
    counter = 0
    for obj in objects:
        if (dh.has_file(obj) and obj['type'] == u'SampleCustomPicture'
                and obj['properties']['file:content']['mime-type'] ==
                u'application/pdf'):
            print obj['uid'], obj['path'], obj['type'], \
                obj['properties']['file:content']['name']
            counter += 1

    print counter
class NuxeoFetcher(Fetcher):
    '''Harvest a Nuxeo FILE. Can be local or at a URL'''

    def __init__(self, url_harvest, extra_data, conf_pynux={}, **kwargs):
        '''
        uses pynux (https://github.com/ucldc/pynux) to grab objects from
        the Nuxeo API

        api url is set from url_harvest, overriding pynuxrc config and
        passed in conf.

        the pynux config file should have user & password
        and X-NXDocumentProperties values filled in.
        '''
        super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        self._url = url_harvest
        self._path = extra_data
        self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
        self._nx.conf['api'] = self._url
        self._structmap_bucket = STRUCTMAP_S3_BUCKET

        # get harvestable child objects
        conf_pynux['api'] = self._url
        self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)
        self._children = iter(self._dh.fetch_objects())

    def _get_structmap_url(self, bucket, obj_key):
        '''Get structmap_url property for object'''
        structmap_url = "s3://{0}/{1}{2}".format(bucket, obj_key,
                                                 '-media.json')
        return structmap_url

    def _get_structmap_text(self, structmap_url):
        '''
        Get structmap_text for object. This is all the words from 'label'
        in the json. See https://github.com/ucldc/ucldc-docs/wiki/media.json
        '''
        structmap_text = ""

        bucketpath = self._structmap_bucket.strip("/")
        bucketbase = bucketpath.split("/")[0]
        parts = urlparse.urlsplit(structmap_url)

        # get contents of <nuxeo_id>-media.json file
        conn = boto.connect_s3()
        bucket = conn.get_bucket(bucketbase)
        key = bucket.get_key(parts.path)
        if not key:  # media_json hasn't been harvested yet for this record
            self.logger.error('Media json at: {} missing.'.format(parts.path))
            return structmap_text
        mediajson = key.get_contents_as_string()
        mediajson_dict = json.loads(mediajson)

        # concatenate all of the words from 'label' in the json
        labels = []
        labels.append(mediajson_dict['label'])
        if 'structMap' in mediajson_dict:
            labels.extend([sm['label'] for sm in mediajson_dict['structMap']])
        structmap_text = ' '.join(labels)
        return structmap_text

    def _get_isShownBy(self, nuxeo_metadata):
        '''
        Get isShownBy value for object
        1) if object has image at parent level, use this
        2) if component(s) have image, use first one we can find
        3) if object has PDF or video at parent level, use image stashed
           on S3
        4) if component(s) have PDF or video, use first component image
           stashed on S3 we can find
        5) return None
        '''
        is_shown_by = None
        uid = nuxeo_metadata['uid']
        self.logger.info("About to get isShownBy for uid {}".format(uid))

        # 1) if object has image at parent level, use this
        if self._has_image(nuxeo_metadata):
            self.logger.info("Nuxeo doc with uid {} has an image at the "
                             "parent level".format(uid))
            is_shown_by = NUXEO_MEDIUM_IMAGE_URL_FORMAT.format(
                nuxeo_metadata['uid'])
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 2) if component(s) have image, use first one we can find
        first_image_component_uid = self._get_first_image_component(
            nuxeo_metadata)
        self.logger.info("first_image_component_uid: {}".format(
            first_image_component_uid))
        if first_image_component_uid:
            self.logger.info("Nuxeo doc with uid {} has an image at the "
                             "component level".format(uid))
            is_shown_by = NUXEO_MEDIUM_IMAGE_URL_FORMAT.format(
                first_image_component_uid)
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 3) if object has PDF at parent level, use image stashed on S3
        if self._has_s3_thumbnail(nuxeo_metadata):
            self.logger.info("Nuxeo doc with uid {} has a thumbnail for "
                             "parent file (probably PDF) stashed on S3"
                             .format(uid))
            is_shown_by = NUXEO_S3_THUMB_URL_FORMAT.format(
                nuxeo_metadata['uid'])
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 4) if component(s) have PDF or video, use first component image
        #    stashed on S3 we can find
        first_thumb_component_uid = self._get_first_thumb_component(
            nuxeo_metadata)
        self.logger.info("first_thumb_component_uid: {}".format(
            first_thumb_component_uid))
        if first_thumb_component_uid:
            self.logger.info("Nuxeo doc with uid {} has thumbnail at the "
                             "component level".format(uid))
            is_shown_by = NUXEO_S3_THUMB_URL_FORMAT.format(
                first_thumb_component_uid)
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 5) return None
        self.logger.info("Could not find any image for Nuxeo doc with uid "
                         "{}! Returning None".format(uid))
        return is_shown_by

    def _has_image(self, metadata):
        ''' based on json metadata, determine whether or not this Nuxeo doc
        has an image file associated '''
        if metadata['type'] != "SampleCustomPicture":
            return False

        properties = metadata['properties']
        file_content = properties.get('file:content')
        if (file_content and 'name' in file_content
                and file_content['name'] == 'empty_picture.png'):
            return False
        elif file_content and 'data' in file_content:
            return True
        else:
            return False

    def _has_s3_thumbnail(self, metadata):
        ''' based on json metadata, determine whether or not this Nuxeo doc
        is PDF (or other non-image) that will have thumb image stashed on S3
        for it '''
        if metadata['type'] not in ("CustomFile", "CustomVideo"):
            return False

        properties = metadata['properties']
        file_content = properties.get('file:content')
        if file_content and 'data' in file_content:
            return True
        else:
            return False

    def _get_first_image_component(self, parent_metadata):
        ''' get first image component we can find '''
        component_uid = None
        query = "SELECT * FROM Document WHERE ecm:parentId = '{}' AND " \
                "ecm:currentLifeCycleState != 'deleted' ORDER BY " \
                "ecm:pos".format(parent_metadata['uid'])
        for child in self._nx.nxql(query):
            child_metadata = self._nx.get_metadata(uid=child['uid'])
            if self._has_image(child_metadata):
                component_uid = child_metadata['uid']
                break
        return component_uid

    def _get_first_thumb_component(self, parent_metadata):
        ''' get first non-image component with thumbnail we can find '''
        component_uid = None
        query = "SELECT * FROM Document WHERE ecm:parentId = '{}' AND " \
                "ecm:currentLifeCycleState != 'deleted' ORDER BY " \
                "ecm:pos".format(parent_metadata['uid'])
        for child in self._nx.nxql(query):
            child_metadata = self._nx.get_metadata(uid=child['uid'])
            if self._has_s3_thumbnail(child_metadata):
                component_uid = child_metadata['uid']
                break
        return component_uid

    def next(self):
        '''Return Nuxeo record by record to the controller'''
        doc = self._children.next()
        self.metadata = self._nx.get_metadata(uid=doc['uid'])
        self.structmap_url = self._get_structmap_url(self._structmap_bucket,
                                                     doc['uid'])
        self.metadata['structmap_url'] = self.structmap_url
        self.metadata['structmap_text'] = self._get_structmap_text(
            self.structmap_url)
        self.metadata['isShownBy'] = self._get_isShownBy(self.metadata)
        return self.metadata
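# A minimal sketch of how a harvest controller might drive this fetcher.
# The api URL and Nuxeo path below are hypothetical; the protocol assumed
# here is simply "call next() until StopIteration":
def fetch_all(url_api, nuxeo_path):
    ''' drain a NuxeoFetcher and return the enriched metadata records '''
    fetcher = NuxeoFetcher(url_api, nuxeo_path)
    records = []
    while True:
        try:
            records.append(fetcher.next())
        except StopIteration:
            break
    return records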
def main(argv=None):
    ''' stash Nuxeo image files on s3 '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, create jp2 versions of image '
        'files and stash in S3.')
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument(
        '--bucket',
        default='ucldc-private-files/jp2000',
        help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help='AWS region')
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    parser.add_argument(
        '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region,
                                  argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_image = len([
        key for key, value in report.iteritems()
        if not value['is_image']['is_image']
    ])
    print "not image:\t{}".format(not_image)
    unrecognized = len([
        key for key, value in report.iteritems()
        if not value['precheck']['pass']
    ])
    print "not convertible:\t{}".format(unrecognized)
    converted = len(
        [key for key, value in report.iteritems() if value['converted']])
    print "converted:\t{}".format(converted)
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "already stashed:\t{}".format(already_stashed)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "stashed:\t{}".format(stashed)

    print "\nDone."
class Stash(object):
    ''' stash various files on s3 for a Nuxeo collection in preparation
    for harvesting into Calisphere '''

    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        self.logger = logging.getLogger(__name__)
        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace
        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)
        self.objects = self.fetch_objects()
        self.components = {}
        for obj in self.objects:
            self.components[obj['uid']] = self.dh.fetch_components(obj)

    def fetch_objects(self):
        ''' fetch objects to process '''
        return self.dh.fetch_objects()

    def images(self):
        ''' stash Nuxeo image files on s3 '''
        report = {}
        for obj in self.objects:
            nxstash = NuxeoStashImage(obj['path'], IMAGE_BUCKET,
                                      IMAGE_REGION, self.pynuxrc,
                                      self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing image {}'.format(c['path']))
                nxstash = NuxeoStashImage(c['path'], IMAGE_BUCKET,
                                          IMAGE_REGION, self.pynuxrc,
                                          self.replace)
                report[nxstash.uid] = nxstash.nxstashref()
        return report

    def files(self):
        ''' stash Nuxeo files of type 'file', 'audio', or 'video'
        for a collection '''
        report = {}
        for obj in self.objects:
            nxstash = NuxeoStashFile(obj['path'], FILE_BUCKET, FILE_REGION,
                                     self.pynuxrc, self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing file {}'.format(c['path']))
                nxstash = NuxeoStashFile(c['path'], FILE_BUCKET,
                                         FILE_REGION, self.pynuxrc,
                                         self.replace)
                report[nxstash.uid] = nxstash.nxstashref()
        return report

    def thumbnails(self):
        ''' stash thumbnail images for Nuxeo files of type 'file', 'audio',
        or 'video' for a collection '''
        report = {}
        for obj in self.objects:
            nxstash = NuxeoStashThumb(obj['path'], THUMB_BUCKET,
                                      THUMB_REGION, self.pynuxrc,
                                      self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing thumb {}'.format(c['path']))
                nxstash = NuxeoStashThumb(c['path'], THUMB_BUCKET,
                                          THUMB_REGION, self.pynuxrc,
                                          self.replace)
                report[nxstash.uid] = nxstash.nxstashref()
        return report

    def media_json(self):
        ''' create and stash media.json files for a nuxeo collection '''
        report = {}
        for obj in self.objects:
            self.logger.info('Stashing media json {}'.format(obj['path']))
            nxstash = NuxeoStashMediaJson(obj['path'], MEDIAJSON_BUCKET,
                                          MEDIAJSON_REGION, self.pynuxrc,
                                          self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
        return report
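# A minimal usage sketch for the Stash class, assuming the module-level
# bucket/region constants are configured; the collection path below is
# hypothetical:
stash = Stash('/asset-library/UCOP/example_collection', '~/.pynuxrc')
reports = {
    'images': stash.images(),
    'files': stash.files(),
    'thumbnails': stash.thumbnails(),
    'media_json': stash.media_json(),
}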
''' write nuxeo uid, identifier to file for UCSF Berne collections '''
collections = [
    #'/asset-library/UCSF/Berne_Eric_Collection',
    '/asset-library/UCSF/MSS 2003-12 Eric Berne papers',
    '/asset-library/UCSF/MSS 2005-08 Eric Berne papers',
    '/asset-library/UCSF/MSS 2013-18 Eric Berne papers',
    '/asset-library/UCSF/MSS 2013-19 Eric Berne papers',
    '/asset-library/UCSF/MSS 82-0 Eric Berne papers',
    '/asset-library/UCSF/MSS 89-12 Eric Berne papers'
]

id_to_uid = {}
for collection in collections:
    dh = DeepHarvestNuxeo(collection, '')
    objects = dh.fetch_objects()
    for obj in objects:
        uid = obj['uid']
        filename = obj['path'].split('/')[-1]
        identifier = filename.split('.')[0]
        id_to_uid[identifier] = uid

# Additions
id_to_uid['mss2005-08_1_7_CTmedapplication_1937-08-02'] = \
    'bef32337-6ca6-43c9-9eaa-e0553f26f3dc'
id_to_uid['mss2013-19_5_17_difficulties-comparative-psychiatry_ca1959'] = \
    'ad8c13d2-a89d-4346-809b-03cd612b9c80'
id_to_uid['mss2013-19_18_CAboardmedexaminers-cert_1945-07-19'] = \
    'ad8c13d2-a89d-4346-809b-03cd612b9c80'
id_to_uid['mss82-0_cover_gamespeopleplay-Israeli-ed'] = \
    '13d32c79-0e59-4b25-b590-08c3249ca420'
id_to_uid['mss2005-08_1_13_AUS-certofservice_1946-09-23'] = \
    '74a79291-43b4-4ae0-9d56-634ee1a5953c'
id_to_uid['mss2013-19_1_2_statement-interests-activities_ca1937'] = \
    'a52a3ab2-8c2b-48d0-b082-969e6cb6dcc3'
id_to_uid['mss82-0_cover_juegos-en-que-participamos001'] = \
    'ec8a4de2-e65d-4963-9af2-6693fef19763'
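# The module docstring says the uid/identifier pairs are written to file,
# but the excerpt stops after building the mapping. A plausible completion,
# assuming tab-separated output (the filename below is hypothetical):
with open('berne_identifier_to_uid.txt', 'w') as f:
    for identifier, uid in sorted(id_to_uid.items()):
        f.write('{}\t{}\n'.format(uid, identifier))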
class MerrittAtom():
    def __init__(self, collection_id, **kwargs):
        self.logger = logging.getLogger(__name__)
        self.collection_id = collection_id

        if 'bucket' in kwargs:
            self.bucket = kwargs['bucket']
        else:
            self.bucket = BUCKET

        if 'pynuxrc' in kwargs:
            pynuxrc = kwargs['pynuxrc']
        else:
            pynuxrc = None

        if 'dir' in kwargs:
            self.dir = kwargs['dir']
        else:
            self.dir = '.'

        if 'nostash' in kwargs:
            self.nostash = kwargs['nostash']
        else:
            self.nostash = False

        self.logger.info("collection_id: {}".format(self.collection_id))

        if 'nuxeo_path' in kwargs:
            self.path = kwargs['nuxeo_path']
        else:
            self.path = self._get_nuxeo_path()

        if 'merritt_id' in kwargs:
            self.merritt_id = kwargs['merritt_id']
        else:
            self.merritt_id = self._get_merritt_id()

        if not self.merritt_id:
            raise ValueError("No Merritt ID for this collection")

        self.feed_base_url = 'https://s3.amazonaws.com/{}/'.format(
            self.bucket)

        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc), 'r'))
            self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=pynuxrc)
        elif os.path.isfile(expanduser('~/.pynuxrc')):
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'), 'r'))
            self.dh = DeepHarvestNuxeo(self.path, '')

        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError("Could not create filename for ATOM feed "
                             "based on collection id: {}".format(
                                 self.collection_id))

        self.s3_url = "{}{}".format(self.feed_base_url, self.atom_file)
        self.atom_filepath = os.path.join(self.dir, self.atom_file)

    def _get_merritt_id(self):
        ''' given collection registry ID, get corresponding Merritt
        collection ID '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE,
                                                    self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        merritt_id = md['merritt_id']
        return merritt_id

    def _get_nuxeo_path(self):
        ''' given ucldc registry collection ID, get Nuxeo path for
        collection '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE,
                                                    self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        nuxeo_path = md['harvest_extra_data']
        return nuxeo_path

    def _get_filename(self, collection_id):
        ''' given Collection ID, get a friendly filename for the ATOM
        feed '''
        filename = 'ucldc_collection_{}.atom'.format(collection_id)
        return filename

    def _extract_nx_metadata(self, raw_metadata):
        ''' extract Nuxeo metadata we want to post to the ATOM feed '''
        metadata = {}

        # last modified
        metadata['lastModified'] = raw_metadata['bundle_lastModified']

        # creator
        creators = raw_metadata['properties']['ucldc_schema:creator']
        metadata['creator'] = [creator['name'] for creator in creators]

        # title
        metadata['title'] = raw_metadata['title']

        # date
        dates = raw_metadata['properties']['ucldc_schema:date']
        dates = [date['date'] for date in dates]
        metadata['date'] = dates[0] if dates else None

        # nuxeo id
        metadata['id'] = raw_metadata['properties']['ucldc_schema:identifier']

        # nuxeo collection
        metadata['collection'] = raw_metadata['properties'][
            'ucldc_schema:collection'][0] if raw_metadata['properties'][
                'ucldc_schema:collection'] else None

        return metadata

    def _construct_entry_bundled(self, doc):
        ''' construct ATOM feed entry element for a given nuxeo doc,
        including files for any component objects '''
        uid = doc['uid']

        # parent
        nx_metadata = self._extract_nx_metadata(doc)
        entry = etree.Element(etree.QName(ATOM_NS, "entry"))
        entry = self._populate_entry(entry, nx_metadata, uid, True)

        # insert component md
        for c in self.dh.fetch_components(doc):
            self._insert_full_md_link(entry, c['uid'])
            self._insert_main_content_link(entry, c['uid'])
            self._insert_aux_links(entry, c['uid'])

        return entry

    def _add_atom_elements(self, doc):
        ''' add atom feed elements to document '''

        # recommended ATOM feed elements
        feed_author = etree.Element(etree.QName(ATOM_NS, "author"))
        feed_author.text = "UC Libraries Digital Collection"
        doc.insert(0, feed_author)

        # required ATOM feed elements
        feed_title = etree.Element(etree.QName(ATOM_NS, "title"))
        feed_title.text = "UCLDC Metadata Feed"
        # FIXME get campus name from registry API?
        doc.insert(0, feed_title)

        feed_id = etree.Element(etree.QName(ATOM_NS, "id"))
        feed_id.text = self.s3_url
        doc.insert(0, feed_id)

        return doc

    def _add_feed_updated(self, doc, updated):
        ''' add feed updated '''
        feed_updated = etree.Element(etree.QName(ATOM_NS, "updated"))
        feed_updated.text = updated
        doc.insert(0, feed_updated)

    def _add_collection_alt_link(self, doc, path):
        ''' add elements related to Nuxeo collection info to document '''
        collection_metadata = self.nx.get_metadata(path=path)
        collection_title = collection_metadata['title']
        collection_uid = collection_metadata['uid']
        collection_uri = self.get_object_view_url(collection_uid)

        feed_link_alt = etree.Element(
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=collection_uri,
            title=collection_title)
        doc.insert(0, feed_link_alt)

        return doc

    def _add_paging_info(self, doc):
        ''' add rel links for paging '''
        # this is just dumb for now
        last_link = etree.Element(
            etree.QName(ATOM_NS, "link"), rel="last", href=self.s3_url)
        doc.insert(0, last_link)

        first_link = etree.Element(
            etree.QName(ATOM_NS, "link"), rel="first", href=self.s3_url)
        doc.insert(0, first_link)

        self_link = etree.Element(
            etree.QName(ATOM_NS, "link"), rel="self", href=self.s3_url)
        doc.insert(0, self_link)

    def _add_merritt_id(self, doc, merritt_collection_id):
        ''' add Merritt ID '''
        merritt_id = etree.Element(
            etree.QName(ATOM_NS, "merritt_collection_id"))
        merritt_id.text = merritt_collection_id
        doc.insert(0, merritt_id)

    def _populate_entry(self, entry, metadata, nxid, is_parent):
        ''' get <entry> element for a given set of object metadata '''
        # atom id (URI)
        nuxeo_object_view_url = self.get_object_view_url(nxid)
        atom_id = etree.SubElement(entry, etree.QName(ATOM_NS, "id"))
        atom_id.text = nuxeo_object_view_url

        # atom title
        atom_title = etree.SubElement(entry, etree.QName(ATOM_NS, "title"))
        atom_title.text = metadata["title"]

        # atom updated
        atom_updated = etree.SubElement(entry,
                                        etree.QName(ATOM_NS, "updated"))
        atom_updated.text = metadata['lastModified'].isoformat()

        # atom author
        atom_author = etree.SubElement(entry, etree.QName(ATOM_NS, "author"))
        atom_author.text = "UC Libraries Digital Collection"

        # metadata file link
        self._insert_full_md_link(entry, nxid)

        # media json link
        if is_parent:
            self._insert_media_json_link(entry, nxid)

        # main content file link
        self._insert_main_content_link(entry, nxid)

        # auxiliary file link(s)
        self._insert_aux_links(entry, nxid)

        # dc creator
        for creator_name in metadata['creator']:
            dc_creator = etree.SubElement(entry,
                                          etree.QName(DC_NS, "creator"))
            dc_creator.text = creator_name

        # dc title
        dc_title = etree.SubElement(entry, etree.QName(DC_NS, "title"))
        dc_title.text = metadata['title']

        # dc date
        dc_date = etree.SubElement(entry, etree.QName(DC_NS, "date"))
        dc_date.text = metadata['date']

        # dc identifier (a.k.a. local identifier) - Nuxeo ID
        nuxeo_identifier = etree.SubElement(entry,
                                            etree.QName(DC_NS, "identifier"))
        nuxeo_identifier.text = nxid

        # UCLDC identifier (a.k.a. local identifier) -
        # ucldc_schema:identifier -- this will be the ARK if we have it
        if metadata['id']:
            ucldc_identifier = etree.SubElement(
                entry, etree.QName(NX_NS, "identifier"))
            ucldc_identifier.text = metadata['id']

        # UCLDC collection identifier
        ucldc_collection_id = etree.SubElement(
            entry, etree.QName(NX_NS, "collection"))
        ucldc_collection_id.text = metadata['collection']

        return entry

    def _insert_media_json_link(self, entry, uid):
        media_json_url = self.get_media_json_url(uid)
        etree.SubElement(
            entry,
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=media_json_url,
            type="application/json",
            title="Deep Harvest metadata for this object")

    def _insert_main_content_link(self, entry, uid):
        nx_metadata = self.nx.get_metadata(uid=uid)
        nuxeo_file_download_url = self.get_object_download_url(nx_metadata)
        checksum = self.get_nuxeo_file_checksum(nx_metadata)
        if nuxeo_file_download_url:
            main_content_link = etree.SubElement(
                entry,
                etree.QName(ATOM_NS, "link"),
                rel="alternate",
                href=nuxeo_file_download_url,
                title="Main content file")  # FIXME add content_type
            if checksum:
                checksum_element = etree.SubElement(
                    main_content_link,
                    etree.QName(OPENSEARCH_NS, "checksum"),
                    algorithm="MD5")
                checksum_element.text = checksum

    def _insert_aux_links(self, entry, uid):
        nx_metadata = self.nx.get_metadata(uid=uid)
        aux_files = self.get_aux_files(nx_metadata)
        for af in aux_files:
            link_aux_file = etree.SubElement(
                entry,
                etree.QName(ATOM_NS, "link"),
                rel="alternate",
                href=af['url'],
                title="Auxiliary file")
            if af['checksum']:
                checksum_element = etree.SubElement(
                    link_aux_file,
                    etree.QName(OPENSEARCH_NS, "checksum"),
                    algorithm="MD5")
                checksum_element.text = af['checksum']

    def _insert_full_md_link(self, entry, uid):
        full_metadata_url = self.get_full_metadata(uid)
        etree.SubElement(
            entry,
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=full_metadata_url,
            type="application/xml",
            title="Full metadata for this object from Nuxeo")

    def _write_feed(self, doc):
        ''' publish feed '''
        feed = etree.ElementTree(doc)
        feed_string = etree.tostring(
            feed, pretty_print=True, encoding='utf-8', xml_declaration=True)
        with open(self.atom_filepath, "w") as f:
            f.write(feed_string)

    def _s3_get_feed(self):
        """ Retrieve ATOM feed file from S3. Return as ElementTree object """
        bucketpath = self.bucket.strip("/")
        bucketbase = self.bucket.split("/")[0]
        keyparts = bucketpath.split("/")[1:]
        keyparts.append(self.atom_file)
        keypath = '/'.join(keyparts)

        s3 = boto3.client('s3')
        response = s3.get_object(Bucket=bucketbase, Key=keypath)
        contents = response['Body'].read()
        return etree.fromstring(contents)

    def _s3_stash(self):
        """ Stash file in S3 bucket. """
        bucketpath = self.bucket.strip("/")
        bucketbase = self.bucket.split("/")[0]
        keyparts = bucketpath.split("/")[1:]
        keyparts.append(self.atom_file)
        keypath = '/'.join(keyparts)

        s3 = boto3.client('s3')
        with open(self.atom_filepath, 'rb') as f:
            s3.upload_fileobj(f, bucketbase, keypath)

    def get_object_view_url(self, nuxeo_id):
        """ Get object view URL """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = "{}://{}/Nuxeo/nxdoc/default/{}/view_documents".format(
            parts.scheme, parts.netloc, nuxeo_id)
        return url

    def get_full_metadata(self, nuxeo_id):
        """ Get full metadata via Nuxeo API """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = '{}://{}/Merritt/{}.xml'.format(parts.scheme, parts.netloc,
                                              nuxeo_id)
        return url

    def get_object_download_url(self, metadata):
        ''' given the full metadata for an object, get file download url '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain "
                "'properties/file:content' element. Make sure "
                "'X-NXDocumentProperties' provided in pynux conf "
                "includes 'file'")

        if file_content is None:
            return None
        else:
            url = file_content['data']

        # make available via basic auth
        url = url.replace('/nuxeo/', '/Nuxeo/')

        return url

    def get_media_json_url(self, nuxeo_id):
        """ Get media.json (deep harvest) url """
        # https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/002130a5-e171-461b-a41b-28ab46af9652-media.json
        url = "https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/" \
              "{}-media.json".format(nuxeo_id)
        return url

    def get_nuxeo_file_checksum(self, metadata):
        ''' get md5 checksum for nuxeo file '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain "
                "'properties/file:content' element. Make sure "
                "'X-NXDocumentProperties' provided in pynux conf "
                "includes 'file'")

        if file_content is None:
            return None
        else:
            checksum = file_content['digest']

        return checksum

    def get_aux_files(self, metadata):
        ''' get auxiliary file urls '''
        all_md = []

        # get any "attachment" files
        if metadata['properties']['files:files']:
            attachments = metadata['properties']['files:files']
            for attachment in attachments:
                md = {}
                if attachment['file'] and attachment['file']['data']:
                    url = attachment['file']['data']
                    url = url.replace('/nuxeo/', '/Nuxeo/')
                    md['url'] = url
                if attachment['file'] and attachment['file']['digest']:
                    md['checksum'] = attachment['file']['digest']
                if md:
                    all_md.append(md)

        # get any "extra_file" files
        if metadata['properties']['extra_files:file']:
            for extra_file in metadata['properties']['extra_files:file']:
                md = {}
                if extra_file['blob'] and extra_file['blob']['data']:
                    url = extra_file['blob']['data']
                    url = url.replace('/nuxeo/', '/Nuxeo/')
                    md['url'] = url
                if extra_file['blob'] and extra_file['blob']['digest']:
                    md['checksum'] = extra_file['blob']['digest']
                if md:
                    all_md.append(md)

        return all_md

    def _bundle_docs(self, docs):
        ''' given a list of parent level nuxeo docs, fetch any components
        and also figure out when any part of the object was most recently
        modified/added '''
        for doc in docs:
            last_mod_str = doc['lastModified']
            overall_mod_datetime = parse(last_mod_str)

            doc['components'] = self.dh.fetch_components(doc)

            for c in doc['components']:
                mod_str = c['lastModified']
                mod_datetime = parse(mod_str)
                if mod_datetime > overall_mod_datetime:
                    overall_mod_datetime = mod_datetime

            doc['bundle_lastModified'] = overall_mod_datetime

        return docs

    def process_feed(self):
        ''' create feed for collection and stash on s3 '''
        self.logger.info("atom_file: {}".format(self.atom_file))
        self.logger.info("Nuxeo path: {}".format(self.path))
        self.logger.info("Fetching Nuxeo docs. This could take a while if "
                         "collection is large...")

        parent_docs = self.dh.fetch_objects()
        bundled_docs = self._bundle_docs(parent_docs)
        bundled_docs.sort(key=itemgetter('bundle_lastModified'))

        # create root
        root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

        # add entries
        for document in bundled_docs:
            nxid = document['uid']
            self.logger.info("working on document: {} {}".format(
                nxid, document['path']))

            # object, bundled into one <entry> if complex
            entry = self._construct_entry_bundled(document)
            self.logger.info("inserting entry for object {} {}".format(
                nxid, document['path']))
            root.insert(0, entry)

        # add header info
        logging.info("Adding header info to xml tree")
        self._add_merritt_id(root, self.merritt_id)
        self._add_paging_info(root)
        self._add_collection_alt_link(root, self.path)
        self._add_atom_elements(root)
        self._add_feed_updated(root,
                               datetime.now(dateutil.tz.tzutc()).isoformat())

        self._write_feed(root)
        logging.info("Feed written to file: {}".format(self.atom_filepath))

        if not self.nostash:
            self._s3_stash()
            self.logger.info("Feed stashed on s3: {}".format(self.s3_url))
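# A minimal usage sketch for MerrittAtom (the collection id below is
# hypothetical); process_feed() runs the fetch/bundle/write/stash sequence
# end to end:
ma = MerrittAtom('26098', pynuxrc='~/.pynuxrc')
ma.process_feed()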
def main(argv=None):
    ''' stash Nuxeo files of type 'file', 'audio', or 'video'
    for a collection '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, stash files (pdf, txt, etc) '
        'in S3.')
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument(
        '--bucket', default='ucldc-nuxeo-ref-media', help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help="aws region")
    parser.add_argument(
        '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux")
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashFile(obj['path'], argv.bucket, argv.region,
                                 argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashFile(c['path'], argv.bucket, argv.region,
                                     argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_file = len([
        key for key, value in report.iteritems()
        if value['calisphere_type'] not in VALID_CALISPHERE_TYPES
    ])
    print "not type `file`, `audio` or `video`:\t{}".format(not_file)
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "already stashed:\t{}".format(already_stashed)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "(re)stashed:\t{}".format(stashed)

    print "\nDone."
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Create ATOM feed for a given Nuxeo folder for Merritt '
        'harvesting')
    parser.add_argument("collection", help="UCLDC Registry Collection ID")
    parser.add_argument("--pynuxrc", help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection_id = argv.collection

    if argv.pynuxrc:
        ma = MerrittAtom(collection_id, pynuxrc=argv.pynuxrc)
    else:
        ma = MerrittAtom(collection_id)
    print "atom_file: {}".format(ma.atom_file)
    print "ma.path: {}".format(ma.path)

    if argv.pynuxrc:
        dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc)
    else:
        dh = DeepHarvestNuxeo(ma.path, '')

    print "Nuxeo path: {}".format(ma.path)
    print "Fetching Nuxeo docs. This could take a while if collection is large..."
    documents = dh.fetch_objects()

    # create root
    root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

    # add entries
    for document in documents:
        nxid = document['uid']
        print "working on document: {} {}".format(nxid, document['path'])

        # parent
        entry = ma._construct_entry(nxid, True)
        print "inserting entry for parent object {} {}".format(
            nxid, document['path'])
        root.insert(0, entry)

        # children
        for c in dh.fetch_components(document):
            ce = ma._construct_entry(c['uid'], False)
            print "inserting entry for component: {} {}".format(
                c['uid'], c['path'])
            root.insert(0, ce)

    # add header info
    print "Adding header info to xml tree"
    ma._add_merritt_id(root, ma.merritt_id)
    ma._add_paging_info(root)
    ma._add_collection_alt_link(root, ma.path)
    ma._add_atom_elements(root)
    ma._add_feed_updated(root, ma.last_update)

    ma._write_feed(root)
    print "Feed written to file: {}".format(ma.atom_file)

    ma._s3_stash()
    print "Feed stashed on s3: {}".format(ma.s3_url)