def deepharvest(self, metadata):
    ''' given a set of nuxeo metadata for a doc, deep harvest it

    Dispatches on the calisphere object type: 'image' objects are stashed
    via NuxeoStashImage; 'file'/'audio'/'video' objects get the file plus
    a thumbnail stashed. The per-UID report is printed as it is built.

    Args:
        metadata: Nuxeo metadata dict for the document; must contain
            'uid', 'type' and 'path' keys.
    '''
    self.logger.info("Processing {}".format(metadata['uid']))
    # NOTE(review): DeepHarvestNuxeo is constructed with an empty path --
    # apparently only get_calisphere_object_type() is used here; confirm
    dh = DeepHarvestNuxeo('')
    # renamed from `type`, which shadowed the builtin
    object_type = dh.get_calisphere_object_type(metadata['type'])
    self.logger.info("Type: {}".format(object_type))
    report = {}
    # the type branches are mutually exclusive, so use elif
    if object_type == 'image':
        # stash image
        nxstash = NuxeoStashImage(metadata['path'], IMAGE_BUCKET,
                                  IMAGE_REGION, self.pynuxrc, self.replace,
                                  metadata=metadata)
        report[nxstash.uid] = nxstash.nxstashref()
        print(report)
    elif object_type in ('file', 'audio', 'video'):
        # stash file
        nxstash = NuxeoStashFile(metadata['path'], FILE_BUCKET, FILE_REGION,
                                 self.pynuxrc, self.replace,
                                 metadata=metadata)
        report[nxstash.uid] = nxstash.nxstashref()
        # stash thumbnail
        nxstash = NuxeoStashThumb(metadata['path'], THUMB_BUCKET,
                                  THUMB_REGION, self.pynuxrc, self.replace,
                                  metadata=metadata)
        report[nxstash.uid] = nxstash.nxstashref()
        print(report)
    # TODO: stash media.json
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3.

    Subclasses implement nxstashref() to download/prepare a file and
    push it to the configured S3 bucket, recording what happened in
    self.report.
    '''

    def __init__(self, path, bucket, region, pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        '''
        Args:
            path: Nuxeo document path
            bucket: destination S3 bucket name
            region: AWS region for the bucket
            pynuxrc: path to pynux config file
            replace: if True, overwrite an existing S3 object
            metadata (kwarg): pre-fetched Nuxeo metadata dict; if absent,
                metadata is fetched from Nuxeo via pynux
        '''
        self.logger = logging.getLogger(__name__)
        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace
        # NOTE(review): the rcfile handle is never explicitly closed;
        # left as-is because utils.Nuxeo may read from it lazily -- confirm
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        # callers that already hold the doc metadata can pass it in to
        # avoid a redundant round trip to Nuxeo
        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']
        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))
        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # summary of what happened during stashing, for reporting upstream
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file -- implemented by subclasses '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' stream the source file from Nuxeo to self.source_filepath '''
        # FIX: stream=True so large files are not buffered entirely in
        # memory before the chunked copy below (matches the iter_content
        # usage, and the behavior of the other versions of this class)
        res = requests.get(self.source_download_url,
                           headers=self.nx.document_property_headers,
                           auth=self.nx.auth,
                           stream=True)
        res.raise_for_status()
        with open(self.source_filepath, 'wb') as f:
            for block in res.iter_content(1024):
                if block:  # filter out keep-alive chunks
                    f.write(block)
            # per-chunk flush removed; the context manager flushes on close
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url

        Returns:
            dict with 'url', 'mimetype' and 'filename' keys, or None if
            the document has no file attached.

        Raises:
            KeyError: if the expected file properties are missing (pynux
                conf must include the 'file' schema).
        '''
        info = {}
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'")
        if file_content is None:
            return None
        else:
            url = file_content['data'].strip()
            # presumably the download endpoint is case-sensitive -- confirm
            url = url.replace('/nuxeo/', '/Nuxeo/')
            info['url'] = url  # FIX: dropped redundant second .strip()
            info['mimetype'] = file_content['mime-type'].strip()
            info['filename'] = file_content['name'].strip()
            if not info['filename']:
                # fall back to the file:filename property
                try:
                    info['filename'] = metadata['properties']['file:filename']
                except KeyError:
                    raise KeyError(
                        "Nuxeo object metadata does not contain 'properties/file:"
                        "filename' element. Make sure 'X-NXDocumentProperties' "
                        "provided in pynux conf includes 'file'")
            return info

    def _is_s3_stashed(self):
        """ Check for existence of key on S3. """
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        """ Stash file in S3 bucket. """
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3.

    Subclasses implement nxstashref(); this base handles metadata lookup,
    temp-dir management, resilient download, and the stash report.
    '''

    def __init__(self, path, bucket, region, pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        # path: Nuxeo document path; bucket/region: S3 destination;
        # pynuxrc: pynux config file path; replace: overwrite existing key.
        # Optional kwarg 'metadata': pre-fetched Nuxeo metadata dict.
        self.logger = logging.getLogger(__name__)
        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace
        # NOTE(review): rcfile handle is never explicitly closed -- confirm
        # whether utils.Nuxeo reads it lazily before changing this
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))
        # callers may supply metadata to avoid a second round trip to Nuxeo
        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")
        self.uid = self.metadata['uid']
        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))
        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf
        # report: summary dict of what happened, returned by nxstashref()
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file -- implemented by subclasses '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' stream the source file from Nuxeo to self.source_filepath,
        retrying transient HTTP failures '''
        # retry on common transient statuses, per
        # https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure
        retry_strategy = Retry(
            total=3,
            status_forcelist=[413, 429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        http = requests.Session()
        # mount the retrying adapter for both schemes
        http.mount("https://", adapter)
        http.mount("http://", adapter)
        # timeouts based on those used by nuxeo-python-client
        # see: https://github.com/nuxeo/nuxeo-python-client/blob/master/nuxeo/constants.py
        # but tweaked to be slightly larger than a multiple of 3, which is
        # recommended in the requests documentation.
        # see: https://docs.python-requests.org/en/master/user/advanced/#timeouts
        timeout_connect = 12.05
        timeout_read = (60 * 10) + 0.05
        res = http.get(self.source_download_url,
                       headers=self.nx.document_property_headers,
                       auth=self.nx.auth,
                       stream=True,
                       timeout=(timeout_connect, timeout_read))
        res.raise_for_status()
        # chunk_size=None: write chunks as they arrive from the server
        with open(self.source_filepath, 'wb') as f:
            for block in res.iter_content(chunk_size=None):
                f.write(block)
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url,
        mimetype and filename; returns None if no file is attached.

        Raises KeyError if the expected file properties are missing
        (pynux conf must include the 'file' schema).
        '''
        info = {}
        # for videos, try to get nuxeo transcoded video file url first
        if metadata['type'] == 'CustomVideo':
            try:
                transcoded_video = metadata['properties']['vid:transcodedVideos']
                for tv in transcoded_video:
                    # take the first mp4 transcoding, if any
                    if tv['content']['mime-type'] == 'video/mp4':
                        url = tv['content']['data']
                        url = url.replace('/nuxeo/', '/Nuxeo/')
                        info['url'] = url.strip()
                        info['mimetype'] = tv['content']['mime-type'].strip()
                        info['filename'] = tv['content']['name'].strip()
                        return info
            except KeyError:
                # no transcodings present; fall through to file:content
                pass
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'"
            )
        if file_content is None:
            return None
        else:
            url = file_content['data'].strip()
            # presumably the download endpoint is case-sensitive -- confirm
            url = url.replace('/nuxeo/', '/Nuxeo/')
            info['url'] = url.strip()
            info['mimetype'] = file_content['mime-type'].strip()
            info['filename'] = file_content['name'].strip()
            if not info['filename']:
                # fall back to the file:filename property
                try:
                    info['filename'] = metadata['properties']['file:filename']
                except KeyError:
                    raise KeyError(
                        "Nuxeo object metadata does not contain 'properties/file:"
                        "filename' element. Make sure 'X-NXDocumentProperties' "
                        "provided in pynux conf includes 'file'"
                    )
            return info

    def _is_s3_stashed(self):
        """ Check for existence of key on S3. """
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        """ Stash file in S3 bucket. """
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
class NuxeoStashMediaJson(NuxeoStashRef):
    ''' create and stash media.json file for a nuxeo object '''

    def __init__(self, path, bucket, region, pynuxrc='~/.pynuxrc',
                 replace=True, **kwargs):
        '''
        Args mirror NuxeoStashRef; note `replace` defaults to True here,
        so an existing media.json is regenerated by default.
        '''
        super(NuxeoStashMediaJson, self).__init__(path, bucket, region,
                                                  pynuxrc, replace, **kwargs)

        self.dh = DeepHarvestNuxeo(
            self.path, self.bucket, pynuxrc=self.pynuxrc)
        self.mj = MediaJson()

        self.filename = FILENAME_FORMAT.format(self.uid)
        self.filepath = os.path.join(self.tmp_dir, self.filename)
        self._update_report('filename', self.filename)
        self._update_report('filepath', self.filepath)

    def nxstashref(self):
        ''' entry point used by the deep-harvest driver '''
        return self.nxstash_mediajson()

    def nxstash_mediajson(self):
        ''' create media.json file for object and stash on s3

        Returns:
            dict: self.report describing what was stashed
        '''
        self._update_report('stashed', False)

        # extract and transform metadata for parent obj and any components
        parent_md = self._get_parent_metadata(self.metadata)
        component_md = [
            self._get_component_metadata(c)
            for c in self.dh.fetch_components(self.metadata)
        ]

        # create media.json file
        media_json = self.mj.create_media_json(parent_md, component_md)
        self._write_file(media_json, self.filepath)

        # stash media.json file on s3
        stashed, s3_report = s3stash.s3tools.s3stash(
            self.filepath, self.bucket, self.filename, self.region,
            'application/json', self.replace)
        self._update_report('s3_stash', s3_report)
        self._update_report('stashed', stashed)

        self._remove_tmp()
        return self.report

    def _get_parent_metadata(self, obj):
        ''' assemble top-level (parent) object metadata '''
        metadata = {}
        metadata['label'] = obj['title']

        # only provide id, href, format if Nuxeo Document has file attached
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        if self.dh.has_file(full_metadata):
            metadata['id'] = obj['uid']
            metadata['href'] = self.dh.get_object_download_url(full_metadata)
            metadata['format'] = self.dh.get_calisphere_object_type(
                obj['type'])
            if metadata['format'] == 'video':
                metadata['dimensions'] = self.dh.get_video_dimensions(
                    full_metadata)
        return metadata

    def _get_component_metadata(self, obj):
        ''' assemble component object metadata '''
        metadata = {}
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        metadata['label'] = obj['title']
        metadata['id'] = obj['uid']
        metadata['href'] = self.dh.get_object_download_url(full_metadata)

        # extract additional ucldc metadata from 'properties' element
        # FIX: items() instead of the py2-only iteritems(); identical
        # iteration on py2, and works on py3
        ucldc_md = self._get_ucldc_schema_properties(full_metadata)
        for key, value in ucldc_md.items():
            metadata[key] = value

        # map 'type'
        metadata['format'] = self.dh.get_calisphere_object_type(obj['type'])
        return metadata

    def _get_ucldc_schema_properties(self, metadata):
        ''' get additional metadata as mapped by harvester '''
        # FIX: removed dead `properties = {}` initializer -- it was
        # immediately overwritten by the mapper output below
        mapper = UCLDCNuxeoMapper(metadata)
        mapper.map_original_record()
        mapper.map_source_resource()
        properties = mapper.mapped_data['sourceResource']
        properties.update(mapper.mapped_data['originalRecord'])
        return properties

    def _write_file(self, content_dict, filepath):
        """ convert dict to json and write to file

        FIX: open in text mode ('w') -- json.dumps returns text; binary
        mode only worked under py2 where str is bytes. The explicit
        flush was redundant: the context manager flushes on close.
        """
        content_json = json.dumps(
            content_dict, indent=4, separators=(',', ': '), sort_keys=False)
        with open(filepath, 'w') as f:
            f.write(content_json)
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3 '''

    def __init__(self, path, bucket, region, pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        self.logger = logging.getLogger(__name__)

        # Nuxeo source and S3 destination configuration
        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        # prefer caller-supplied metadata; otherwise fetch it from Nuxeo
        try:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        except KeyError:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']
        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # seed the stash report with the configuration we were given
        self.report = {}
        for report_key, report_value in (
                ('uid', self.uid),
                ('path', self.path),
                ('bucket', self.bucket),
                ('replace', self.replace),
                ('pynuxrc', self.pynuxrc),
                ('calisphere_type', self.calisphere_type)):
            self._update_report(report_key, report_value)

    def nxstashref(self):
        ''' download, prep and stash file (subclass responsibility) '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' record a key/value pair in the stash report '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' delete the temporary working directory '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' stream the source file from Nuxeo to self.source_filepath '''
        # streamed download, per
        # https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
        response = requests.get(self.source_download_url,
                                headers=self.nx.document_property_headers,
                                auth=self.nx.auth,
                                stream=True)
        response.raise_for_status()
        with open(self.source_filepath, 'wb') as outfile:
            for chunk in response.iter_content(chunk_size=None):
                outfile.write(chunk)
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'"
            )
        # no file attached to this document
        if file_content is None:
            return None

        raw_url = file_content['data'].strip()
        info = {
            'url': raw_url.replace('/nuxeo/', '/Nuxeo/').strip(),
            'mimetype': file_content['mime-type'].strip(),
            'filename': file_content['name'].strip(),
        }
        if not info['filename']:
            # fall back to the file:filename property
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'"
                )
        return info

    def _is_s3_stashed(self):
        """ Check for existence of key on S3. """
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        """ Stash file in S3 bucket. """
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)