コード例 #1
0
    def deepharvest(self, metadata):
        ''' given a set of nuxeo metadata for a doc, deep harvest it '''

        self.logger.info("Processing {}".format(metadata['uid']))
     
        dh = DeepHarvestNuxeo('')
        type = dh.get_calisphere_object_type(metadata['type'])
        self.logger.info("Type: {}".format(type))


        report = {}
        if type == 'image':
            ''' stash image '''
            nxstash = NuxeoStashImage(metadata['path'], IMAGE_BUCKET, IMAGE_REGION,
                                      self.pynuxrc, self.replace, metadata=metadata)
            report[nxstash.uid] = nxstash.nxstashref()
        
        print report
 
        if type in ['file', 'audio', 'video']:
            # stash file
            nxstash = NuxeoStashFile(metadata['path'], FILE_BUCKET, FILE_REGION,
                                     self.pynuxrc, self.replace, metadata=metadata)
            report[nxstash.uid] = nxstash.nxstashref()

            # stash thumbnail
            nxstash = NuxeoStashThumb(metadata['path'], THUMB_BUCKET, THUMB_REGION,
                                      self.pynuxrc, self.replace, metadata=metadata)
            report[nxstash.uid] = nxstash.nxstashref()

        print report

        # stash media.json
        '''
コード例 #2
0
ファイル: nxstashref.py プロジェクト: mredar/nuxeo-calisphere
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3.

    This class does not set self.source_download_url or
    self.source_filepath; both must be assigned before
    _download_nuxeo_file() is called.
    '''
    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False,
                 **kwargs):
        ''' Build the Nuxeo client, resolve object metadata, start the report.

        path -- Nuxeo path of the object
        bucket -- S3 bucket name handed to s3stash.s3tools
        region -- S3 region handed to s3stash.s3tools
        pynuxrc -- path of the pynux rc file ('~' is expanded)
        replace -- forwarded to s3stash.s3tools.s3stash
        kwargs -- when 'metadata' is present it is used as the object's
                  metadata; otherwise the metadata is fetched by path
        '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # Close the rc file handle once the client has been built instead
        # of leaking it for the lifetime of the process.
        # NOTE(review): assumes utils.Nuxeo reads rcfile inside its
        # constructor -- confirm against pynux.
        with open(expanduser(self.pynuxrc), 'r') as rcfile:
            self.nx = utils.Nuxeo(rcfile=rcfile)

        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file (must be overridden) '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves: delete the temp dir made in __init__ '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' Stream self.source_download_url into self.source_filepath.

        Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx
        response and a requests timeout exception when the server stalls.
        '''
        # (connect, read) timeouts so a stalled server cannot hang the
        # harvest forever; the read timeout applies per read, not in total.
        timeout_connect = 12.05
        timeout_read = (60 * 10) + 0.05

        # stream=True keeps large files out of memory: the body is written
        # to disk as it arrives instead of being buffered in full first.
        res = requests.get(self.source_download_url,
                           headers=self.nx.document_property_headers,
                           auth=self.nx.auth,
                           stream=True,
                           timeout=(timeout_connect, timeout_read))
        try:
            res.raise_for_status()
            with open(self.source_filepath, 'wb') as f:
                for block in res.iter_content(chunk_size=None):
                    if block:
                        f.write(block)
        finally:
            # a streamed response holds its connection until it is closed
            res.close()
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url

        Returns a dict with 'url', 'mimetype' and 'filename' keys, or None
        when the object's 'file:content' property is None.
        Raises KeyError when 'file:content' is missing from the properties,
        or when the content has no name and 'file:filename' is missing too.
        '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'")

        if file_content is None:
            return None

        info = {}
        url = file_content['data'].strip()
        info['url'] = url.replace('/nuxeo/', '/Nuxeo/')
        info['mimetype'] = file_content['mime-type'].strip()
        # 'name' may be None or absent; treat that like an empty name so the
        # file:filename fallback below runs instead of crashing on .strip()
        info['filename'] = (file_content.get('name') or '').strip()

        if not info['filename']:
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'")

        return info

    def _is_s3_stashed(self):
        ''' Check for existence of key (the object uid) on S3. '''
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        ''' Stash file in S3 bucket, keyed by the object uid.

        Returns whatever s3stash.s3tools.s3stash returns.
        '''
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
コード例 #3
0
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3.

    This class does not set self.source_download_url or
    self.source_filepath; both must be assigned before
    _download_nuxeo_file() is called.
    '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        ''' Build the Nuxeo client, resolve object metadata, start the report.

        path -- Nuxeo path of the object
        bucket -- S3 bucket name handed to s3stash.s3tools
        region -- S3 region handed to s3stash.s3tools
        pynuxrc -- path of the pynux rc file ('~' is expanded)
        replace -- forwarded to s3stash.s3tools.s3stash
        kwargs -- when 'metadata' is present it is used as the object's
                  metadata; otherwise the metadata is fetched by path
        '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # Close the rc file handle once the client has been built instead
        # of leaking it for the lifetime of the process.
        # NOTE(review): assumes utils.Nuxeo reads rcfile inside its
        # constructor -- confirm against pynux.
        with open(expanduser(self.pynuxrc), 'r') as rcfile:
            self.nx = utils.Nuxeo(rcfile=rcfile)

        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file (must be overridden) '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves: delete the temp dir made in __init__ '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' Stream self.source_download_url into self.source_filepath.

        Requests are retried up to 3 times on the status codes listed
        below. Raises requests.HTTPError (via raise_for_status) on a
        4xx/5xx response and a requests timeout exception on a stall.
        '''
        # https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure
        retry_strategy = Retry(
            total=3,
            status_forcelist=[413, 429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)

        # timeouts based on those used by nuxeo-python-client
        # see: https://github.com/nuxeo/nuxeo-python-client/blob/master/nuxeo/constants.py
        # but tweaked to be slightly larger than a multiple of 3, which is
        # recommended in the requests documentation.
        # see: https://docs.python-requests.org/en/master/user/advanced/#timeouts
        timeout_connect = 12.05
        timeout_read = (60 * 10) + 0.05

        # The session owns a connection pool; use it as a context manager so
        # the pool is released even when the download fails.
        with requests.Session() as http:
            http.mount("https://", adapter)
            http.mount("http://", adapter)

            res = http.get(self.source_download_url,
                           headers=self.nx.document_property_headers,
                           auth=self.nx.auth,
                           stream=True,
                           timeout=(timeout_connect, timeout_read))
            try:
                res.raise_for_status()
                with open(self.source_filepath, 'wb') as f:
                    for block in res.iter_content(chunk_size=None):
                        f.write(block)
            finally:
                # a streamed response holds its connection until closed
                res.close()

        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url

        For 'CustomVideo' objects the first 'video/mp4' entry of
        'vid:transcodedVideos' is preferred; otherwise (or when no usable
        transcoded entry exists) 'file:content' is used.

        Returns a dict with 'url', 'mimetype' and 'filename' keys, or None
        when the object's 'file:content' property is None.
        Raises KeyError when 'file:content' is missing from the properties,
        or when the content has no name and 'file:filename' is missing too.
        '''
        info = {}

        # for videos, try to get nuxeo transcoded video file url first
        if metadata['type'] == 'CustomVideo':
            try:
                transcoded_video = metadata['properties'][
                    'vid:transcodedVideos']
                # the property itself, or an entry's content, may be None;
                # skip those instead of failing with a TypeError
                for tv in transcoded_video or []:
                    content = tv['content']
                    if content is None:
                        continue
                    if content['mime-type'] == 'video/mp4':
                        url = content['data']
                        url = url.replace('/nuxeo/', '/Nuxeo/')
                        info['url'] = url.strip()
                        info['mimetype'] = content['mime-type'].strip()
                        info['filename'] = content['name'].strip()
                        return info
            except KeyError:
                # no usable transcoded video: fall back to file:content
                pass

        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'"
            )

        if file_content is None:
            return None

        url = file_content['data'].strip()
        info['url'] = url.replace('/nuxeo/', '/Nuxeo/')
        info['mimetype'] = file_content['mime-type'].strip()
        # 'name' may be None or absent; treat that like an empty name so the
        # file:filename fallback below runs instead of crashing on .strip()
        info['filename'] = (file_content.get('name') or '').strip()

        if not info['filename']:
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'"
                )

        return info

    def _is_s3_stashed(self):
        ''' Check for existence of key (the object uid) on S3. '''
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        ''' Stash file in S3 bucket, keyed by the object uid.

        Returns whatever s3stash.s3tools.s3stash returns.
        '''
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
コード例 #4
0
class NuxeoStashMediaJson(NuxeoStashRef):
    ''' create and stash media.json file for a nuxeo object '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=True,
                 **kwargs):
        ''' Initialise the base stasher and the media.json target paths.

        Arguments are passed straight through to NuxeoStashRef.__init__;
        note that replace defaults to True here.
        '''
        super(NuxeoStashMediaJson, self).__init__(path, bucket, region,
                                                  pynuxrc, replace, **kwargs)

        # replaces the DeepHarvestNuxeo instance created by the base class
        self.dh = DeepHarvestNuxeo(
            self.path, self.bucket, pynuxrc=self.pynuxrc)
        self.mj = MediaJson()

        self.filename = FILENAME_FORMAT.format(self.uid)
        self.filepath = os.path.join(self.tmp_dir, self.filename)
        self._update_report('filename', self.filename)
        self._update_report('filepath', self.filepath)

    def nxstashref(self):
        ''' entry point shared with the other stashers; returns the report '''
        return self.nxstash_mediajson()

    def nxstash_mediajson(self):
        ''' create media.json file for object and stash on s3

        Returns the report dict. The temp dir is removed whether or not
        the harvest succeeds; any exception raised while assembling or
        stashing the file propagates to the caller.
        '''
        self._update_report('stashed', False)

        try:
            # extract and transform metadata for parent obj and components
            parent_md = self._get_parent_metadata(self.metadata)
            component_md = [
                self._get_component_metadata(c)
                for c in self.dh.fetch_components(self.metadata)
            ]

            # create media.json file
            media_json = self.mj.create_media_json(parent_md, component_md)
            self._write_file(media_json, self.filepath)

            # stash media.json file on s3
            stashed, s3_report = s3stash.s3tools.s3stash(
                self.filepath, self.bucket, self.filename, self.region,
                'application/json', self.replace)
            self._update_report('s3_stash', s3_report)
            self._update_report('stashed', stashed)
        finally:
            # previously skipped on any failure above, leaking the temp dir
            self._remove_tmp()

        return self.report

    def _get_parent_metadata(self, obj):
        ''' assemble top-level (parent) object metadata

        Always returns 'label'; 'id', 'href' and 'format' (plus
        'dimensions' for video) are added only when the document has a
        file attached.
        '''
        metadata = {'label': obj['title']}

        # only provide id, href, format if Nuxeo Document has file attached
        full_metadata = self.nx.get_metadata(uid=obj['uid'])

        if self.dh.has_file(full_metadata):
            metadata['id'] = obj['uid']
            metadata['href'] = self.dh.get_object_download_url(full_metadata)
            metadata['format'] = self.dh.get_calisphere_object_type(
                obj['type'])
            if metadata['format'] == 'video':
                metadata['dimensions'] = self.dh.get_video_dimensions(
                    full_metadata)

        return metadata

    def _get_component_metadata(self, obj):
        ''' assemble component object metadata

        Returns a dict with 'label', 'id', 'href', the mapped ucldc
        properties and 'format'. Mapped properties override the first
        three keys on a name clash; 'format' is set last and always wins.
        '''
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        metadata = {
            'label': obj['title'],
            'id': obj['uid'],
            'href': self.dh.get_object_download_url(full_metadata),
        }

        # extract additional ucldc metadata from 'properties' element
        metadata.update(self._get_ucldc_schema_properties(full_metadata))

        # map 'type'
        metadata['format'] = self.dh.get_calisphere_object_type(obj['type'])

        return metadata

    def _get_ucldc_schema_properties(self, metadata):
        ''' get additional metadata as mapped by harvester

        Returns the mapper's 'sourceResource' dict updated in place with
        its 'originalRecord' entries.
        '''
        mapper = UCLDCNuxeoMapper(metadata)
        mapper.map_original_record()
        mapper.map_source_resource()

        properties = mapper.mapped_data['sourceResource']
        properties.update(mapper.mapped_data['originalRecord'])

        return properties

    def _write_file(self, content_dict, filepath):
        ''' convert dict to json and write to file '''
        content_json = json.dumps(
            content_dict, indent=4, separators=(',', ': '), sort_keys=False)
        # json.dumps escapes non-ASCII by default, so encoding is lossless;
        # writing bytes keeps the binary-mode file valid on Python 2 and 3
        with open(filepath, 'wb') as f:
            f.write(content_json.encode('utf-8'))
コード例 #5
0
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3.

    This class does not set self.source_download_url or
    self.source_filepath; both must be assigned before
    _download_nuxeo_file() is called.
    '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        ''' Build the Nuxeo client, resolve object metadata, start the report.

        path -- Nuxeo path of the object
        bucket -- S3 bucket name handed to s3stash.s3tools
        region -- S3 region handed to s3stash.s3tools
        pynuxrc -- path of the pynux rc file ('~' is expanded)
        replace -- forwarded to s3stash.s3tools.s3stash
        kwargs -- when 'metadata' is present it is used as the object's
                  metadata; otherwise the metadata is fetched by path
        '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # Close the rc file handle once the client has been built instead
        # of leaking it for the lifetime of the process.
        # NOTE(review): assumes utils.Nuxeo reads rcfile inside its
        # constructor -- confirm against pynux.
        with open(expanduser(self.pynuxrc), 'r') as rcfile:
            self.nx = utils.Nuxeo(rcfile=rcfile)

        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file (must be overridden) '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves: delete the temp dir made in __init__ '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' Stream self.source_download_url into self.source_filepath.

        Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx
        response and a requests timeout exception when the server stalls.
        '''
        # (connect, read) timeouts so a stalled server cannot hang the
        # harvest forever; the read timeout applies per read, not in total.
        timeout_connect = 12.05
        timeout_read = (60 * 10) + 0.05

        # https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
        res = requests.get(self.source_download_url,
                           headers=self.nx.document_property_headers,
                           auth=self.nx.auth,
                           stream=True,
                           timeout=(timeout_connect, timeout_read))
        try:
            res.raise_for_status()
            with open(self.source_filepath, 'wb') as f:
                for block in res.iter_content(chunk_size=None):
                    f.write(block)
        finally:
            # a streamed response holds its connection until it is closed
            res.close()
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url

        Returns a dict with 'url', 'mimetype' and 'filename' keys, or None
        when the object's 'file:content' property is None.
        Raises KeyError when 'file:content' is missing from the properties,
        or when the content has no name and 'file:filename' is missing too.
        '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'"
            )

        if file_content is None:
            return None

        info = {}
        url = file_content['data'].strip()
        info['url'] = url.replace('/nuxeo/', '/Nuxeo/')
        info['mimetype'] = file_content['mime-type'].strip()
        # 'name' may be None or absent; treat that like an empty name so the
        # file:filename fallback below runs instead of crashing on .strip()
        info['filename'] = (file_content.get('name') or '').strip()

        if not info['filename']:
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'"
                )

        return info

    def _is_s3_stashed(self):
        ''' Check for existence of key (the object uid) on S3. '''
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        ''' Stash file in S3 bucket, keyed by the object uid.

        Returns whatever s3stash.s3tools.s3stash returns.
        '''
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)