コード例 #1
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='Print count of objects for a given collection.')
    parser.add_argument('path', help="Nuxeo path to collection")
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc-prod',
                        help="rcfile for use with pynux utils")
    parser.add_argument('--components',
                        action='store_true',
                        help="show counts for object components")
    if argv is None:
        argv = parser.parse_args()

    dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    if not argv.components:
        return

    print "about to iterate through objects and get components"
    component_count = 0
    for obj in objects:
        components = dh.fetch_components(obj)
        component_count = component_count + len(components)
    print "finished fetching components. {} found".format(component_count)
    print "Grand Total: {}".format(object_count + component_count)
コード例 #2
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='list objects for a given collection.')
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc-basic',
                        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    print "about to iterate through objects and get components"
    component_count = 0
    all_components = []
    for obj in objects:
        components = dh.fetch_components(obj)
        all_components.extend(components)
        print "{} components for {}".format(len(components), obj['uid'])
    print "finished fetching components. {} found".format(len(all_components))

    objects.extend(all_components)
    total_obj = len(objects)
    print "Grand Total: {}".format(total_obj)

    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    chunks = [
        objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE)
    ]

    count = 0
    for c in chunks:
        count = count + 1
        filepath = 'chunks/{}_{}.txt'.format(registry_id, count)
        print "Writing file: {}".format(filepath)
        with open(filepath, 'w') as f:
            json.dump(c, f, indent=4)
コード例 #3
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='list objects for a given collection.')
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-basic',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    print "about to iterate through objects and get components"
    component_count = 0
    all_components = []
    for obj in objects:
        components = dh.fetch_components(obj) 
        all_components.extend(components)
        print "{} components for {}".format(len(components), obj['uid'])
    print "finished fetching components. {} found".format(len(all_components))

    objects.extend(all_components)
    total_obj = len(objects)
    print "Grand Total: {}".format(total_obj)

    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    chunks = [objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE)]

    count = 0
    for c in chunks:
        count = count + 1        
        filepath = 'chunks/{}_{}.txt'.format(registry_id, count)
        print "Writing file: {}".format(filepath)
        with open(filepath, 'w') as f:
            json.dump(c, f, indent=4)
コード例 #4
0
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    collection_ids is a semicolon-delimited string of registry IDs;
    every top-level object and each of its components is queued as its
    own rq job.
    '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')
    # (was `[x for x in collection_ids.split(';')]` -- the wrapping list
    # comprehension was a no-op)
    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)

        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)

        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            queue_deep_harvest_path(config['redis_host'],
                                    config['redis_port'],
                                    config['redis_password'],
                                    config['redis_connect_timeout'],
                                    rq_queue=rq_queue,
                                    path=obj['path'],
                                    replace=replace,
                                    timeout=timeout)
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                queue_deep_harvest_path(config['redis_host'],
                                        config['redis_port'],
                                        config['redis_password'],
                                        config['redis_connect_timeout'],
                                        rq_queue=rq_queue,
                                        path=c['path'],
                                        replace=replace,
                                        timeout=timeout)

    log_handler.pop_application()
コード例 #5
0
def main(collection_ids, rq_queue='dh-q', config=None, pynuxrc=None,
        replace=False, timeout=JOB_TIMEOUT, log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    collection_ids is a semicolon-delimited string of registry IDs;
    every top-level object and each of its components is queued as its
    own rq job.
    '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')
    # (was `[x for x in collection_ids.split(';')]` -- the wrapping list
    # comprehension was a no-op)
    for cid in collection_ids.split(';'):
        url_api = ''.join(('https://registry.cdlib.org/api/v1/collection/',
                    cid, '/'))
        coll = Collection(url_api)

        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)

        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'],
                obj['path']))
            # deep harvest top level object
            queue_deep_harvest_path(
                config['redis_host'],
                config['redis_port'],
                config['redis_password'],
                config['redis_connect_timeout'],
                rq_queue=rq_queue,
                path=obj['path'],
                replace=replace,
                timeout=timeout)
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(
                    c['uid'],
                    c['path']))
                queue_deep_harvest_path(
                    config['redis_host'],
                    config['redis_port'],
                    config['redis_password'],
                    config['redis_connect_timeout'],
                    rq_queue=rq_queue,
                    path=c['path'],
                    replace=replace,
                    timeout=timeout)

    log_handler.pop_application()
コード例 #6
0
def main(argv=None):
    ''' stash Nuxeo image files on s3 '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, create jp2 versions of image '
        'files and stash in S3.')
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument('--bucket',
                        default='ucldc-private-files/jp2000',
                        help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help='AWS region')
    parser.add_argument('--replace',
                        action="store_true",
                        help="replace file on s3 if it already exists")
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc',
                        help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region,
                                  argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_image = len([
        key for key, value in report.iteritems()
        if not value['is_image']['is_image']
    ])
    print "not image:\t{}".format(not_image)
    unrecognized = len([
        key for key, value in report.iteritems()
        if not value['precheck']['pass']
    ])
    print "not convertible:\t{}".format(unrecognized)
    converted = len(
        [key for key, value in report.iteritems() if value['converted']])
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "converted:\t{}".format(converted)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "stashed:\t{}".format(stashed)

    print "\nDone."
コード例 #7
0
def main(argv=None):
    ''' stash Nuxeo files of type 'file', 'audio', or 'video'
    for a collection '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, stash files (pdf, txt, etc) in S3.')
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument(
        '--bucket', default='ucldc-nuxeo-ref-media', help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help="aws region")
    parser.add_argument(
        '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux")
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}

    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashFile(obj['path'], argv.bucket, argv.region,
                                 argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashFile(c['path'], argv.bucket, argv.region,
                                     argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_file = len([
        key for key, value in report.iteritems()
        if not value['calisphere_type'] in VALID_CALISPHERE_TYPES
    ])
    print "not type `file`, `audio` or `video`:\t{}".format(not_file)
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "already stashed:\t{}".format(already_stashed)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "(re)stashed:\t{}".format(stashed)

    print "\nDone."
コード例 #8
0
def main(argv=None):
    parser = argparse.ArgumentParser(
        description=
        'Create ATOM feed for a given Nuxeo folder for Merritt harvesting')
    parser.add_argument("collection", help="UCLDC Registry Collection ID")
    parser.add_argument("--pynuxrc", help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()
    collection_id = argv.collection

    if argv.pynuxrc:
        ma = MerrittAtom(collection_id, argv.pynuxrc)
    else:
        ma = MerrittAtom(collection_id)

    print "atom_file: {}".format(ma.atom_file)
    print "ma.path: {}".format(ma.path)

    if argv.pynuxrc:
        dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc)
    else:
        dh = DeepHarvestNuxeo(ma.path, '')

    print "Nuxeo path: {}".format(ma.path)
    print "Fetching Nuxeo docs. This could take a while if collection is large..."
    documents = dh.fetch_objects()

    # create root
    root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

    # add entries
    for document in documents:
        nxid = document['uid']
        print "working on document: {} {}".format(nxid, document['path'])

        # parent
        entry = ma._construct_entry(nxid, True)
        print "inserting entry for parent object {} {}".format(
            nxid, document['path'])
        root.insert(0, entry)

        # children
        component_entries = [
            ma._construct_entry(c['uid'], False)
            for c in dh.fetch_components(document)
        ]
        for ce in component_entries:
            print "inserting entry for component: {} {}".format(
                nxid, document['path'])
            root.insert(0, ce)

    # add header info
    print "Adding header info to xml tree"
    ma._add_merritt_id(root, ma.merritt_id)
    ma._add_paging_info(root)
    ma._add_collection_alt_link(root, ma.path)
    ma._add_atom_elements(root)
    ma._add_feed_updated(root, ma.last_update)

    ma._write_feed(root)
    print "Feed written to file: {}".format(ma.atom_file)

    ma._s3_stash()
    print "Feed stashed on s3: {}".format(ma.s3_url)
コード例 #9
0
class Stash(object):
    '''
        stash various files on s3 for a Nuxeo collection
        in preparation for harvesting into Calisphere
    '''

    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace

        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)

        self.objects = self.fetch_objects()

        # cache components per parent uid so each stash pass below can
        # reuse them without re-querying Nuxeo
        self.components = {}
        for obj in self.objects:
            self.components[obj['uid']] = self.dh.fetch_components(obj)

    def fetch_objects(self):
        ''' fetch objects to process '''
        return self.dh.fetch_objects()

    def _stash_with(self, stash_cls, bucket, region, label):
        ''' stash every object and component with `stash_cls`; return a
        report dict keyed by nuxeo uid.  (extracted from the previously
        triplicated images/files/thumbnails bodies) '''
        report = {}
        for obj in self.objects:
            nxstash = stash_cls(obj['path'], bucket, region, self.pynuxrc,
                                self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing {} {}'.format(label, c['path']))
                nxstash = stash_cls(c['path'], bucket, region, self.pynuxrc,
                                    self.replace)
                report[nxstash.uid] = nxstash.nxstashref()

        return report

    def images(self):
        ''' stash Nuxeo image files on s3 '''
        return self._stash_with(NuxeoStashImage, IMAGE_BUCKET, IMAGE_REGION,
                                'image')

    def files(self):
        ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a
        collection
        '''
        return self._stash_with(NuxeoStashFile, FILE_BUCKET, FILE_REGION,
                                'file')

    def thumbnails(self):
        ''' stash thumbnail images for Nuxeo files of type 'file', 'audio',
        or 'video' for a collection
        '''
        return self._stash_with(NuxeoStashThumb, THUMB_BUCKET, THUMB_REGION,
                                'thumb')

    def media_json(self):
        ''' create and stash media.json files for a nuxeo collection '''
        report = {}
        for obj in self.objects:
            self.logger.info('Stashing media json {}'.format(obj['path']))
            nxstash = NuxeoStashMediaJson(obj['path'], MEDIAJSON_BUCKET,
                                          MEDIAJSON_REGION, self.pynuxrc,
                                          self.replace)
            report[nxstash.uid] = nxstash.nxstashref()

        return report
コード例 #10
0
ファイル: merritt_atom.py プロジェクト: ucldc/ucldc-merritt
class MerrittAtom():

    def __init__(self, collection_id, **kwargs):
        ''' Set up a Merritt ATOM feed builder for one registry collection.

        Recognized kwargs (all optional):
            bucket     -- S3 bucket (may include a key prefix) for the feed
            pynuxrc    -- pynux rc file path for Nuxeo auth
            dir        -- local directory for the feed file (default '.')
            nostash    -- flag stored on self for callers to consult
            nuxeo_path -- Nuxeo path (otherwise looked up via registry API)
            merritt_id -- Merritt collection ID (otherwise looked up)

        Raises ValueError when no Merritt ID is found, or no feed filename
        can be derived from the collection id.
        '''

        self.logger = logging.getLogger(__name__)

        self.collection_id = collection_id

        if 'bucket' in kwargs:
            self.bucket = kwargs['bucket']
        else:
            self.bucket = BUCKET

        if 'pynuxrc' in kwargs:
            pynuxrc = kwargs['pynuxrc']
        else:
            pynuxrc = None

        if 'dir' in kwargs:
            self.dir = kwargs['dir']
        else:
            self.dir = '.'

        if 'nostash' in kwargs:
            self.nostash = kwargs['nostash']
        else:
            self.nostash = False

        self.logger.info("collection_id: {}".format(self.collection_id))

        # registry lookups happen only when the caller didn't supply values
        if 'nuxeo_path' in kwargs:
            self.path = kwargs['nuxeo_path']
        else:
            self.path = self._get_nuxeo_path()

        if 'merritt_id' in kwargs:
            self.merritt_id = kwargs['merritt_id']
        else:
            self.merritt_id = self._get_merritt_id()

        if not self.merritt_id:
            raise ValueError("No Merritt ID for this collection")

        self.feed_base_url = 'https://s3.amazonaws.com/{}/'.format(self.bucket)

        # Nuxeo client from the given rcfile, else fall back to ~/.pynuxrc.
        # NOTE(review): if neither rcfile exists, self.nx / self.dh are never
        # set and later method calls raise AttributeError -- confirm intended.
        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc),'r'))
            self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=pynuxrc)
        elif not(pynuxrc) and os.path.isfile(expanduser('~/.pynuxrc')):
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'),'r'))
            self.dh = DeepHarvestNuxeo(self.path, '')

        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError("Could not create filename for ATOM feed based on collection id: {}".format(self.collection_id))

        self.s3_url = "{}{}".format(self.feed_base_url, self.atom_file)

        self.atom_filepath = os.path.join(self.dir, self.atom_file)

    def _get_merritt_id(self):
        ''' given collection registry ID, get corresponding Merritt collection ID '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE, self.collection_id)
        response = requests.get(url)
        response.raise_for_status()
        registry_record = json.loads(response.text)
        return registry_record['merritt_id']

    def _get_nuxeo_path(self):
        ''' given ucldc registry collection ID, get Nuxeo path for collection '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE, self.collection_id)
        response = requests.get(url)
        response.raise_for_status()
        registry_record = json.loads(response.text)
        return registry_record['harvest_extra_data']

    def _get_filename(self, collection_id):
        ''' given Collection ID, get a friendly filename for the ATOM feed '''
        filename = 'ucldc_collection_{}.atom'.format(collection_id)

        return filename 

    def _extract_nx_metadata(self, raw_metadata): 
        ''' extract Nuxeo metadata we want to post to the ATOM feed '''
        metadata = {}
        
        # last modified 
        metadata['lastModified'] = raw_metadata['bundle_lastModified']

        # creator
        creators = raw_metadata['properties']['ucldc_schema:creator']
        metadata['creator'] = [creator['name'] for creator in creators]

        # title
        metadata['title'] = raw_metadata['title']

        # date
        dates = raw_metadata['properties']['ucldc_schema:date']
        dates = [date['date'] for date in dates]
        metadata['date'] = dates[0] if dates else None

        # nuxeo id
        metadata['id'] = raw_metadata['properties']['ucldc_schema:identifier']

        # nuxeo collection
        metadata['collection'] = raw_metadata['properties']['ucldc_schema:collection'][0] if raw_metadata['properties']['ucldc_schema:collection'] else None

        return metadata

    def _construct_entry_bundled(self, doc):
        ''' construct ATOM feed entry element for a given nuxeo doc, including files for any component objects '''
        uid = doc['uid']

        # entry for the parent object itself
        metadata = self._extract_nx_metadata(doc)
        entry = self._populate_entry(
            etree.Element(etree.QName(ATOM_NS, "entry")), metadata, uid, True)

        # append file links for every component object
        for component in self.dh.fetch_components(doc):
            component_uid = component['uid']
            self._insert_full_md_link(entry, component_uid)
            self._insert_main_content_link(entry, component_uid)
            self._insert_aux_links(entry, component_uid)

        return entry

    def _add_atom_elements(self, doc):
        ''' add atom feed elements to document '''
        # each insert(0, ...) pushes earlier inserts down, so inserting
        # author, then title, then id leaves the order: id, title, author.
        # FIXME get campus name from registry API for the title?
        for tag, text in (("author", "UC Libraries Digital Collection"),
                          ("title", "UCLDC Metadata Feed"),
                          ("id", self.s3_url)):
            element = etree.Element(etree.QName(ATOM_NS, tag))
            element.text = text
            doc.insert(0, element)

        return doc

    def _add_feed_updated(self, doc, updated):
        ''' add feed updated '''
        updated_element = etree.Element(etree.QName(ATOM_NS, "updated"))
        updated_element.text = updated
        doc.insert(0, updated_element)

    def _add_collection_alt_link(self, doc, path):
        ''' add elements related to Nuxeo collection info to document '''
        collection_md = self.nx.get_metadata(path=path)
        alt_link = etree.Element(
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=self.get_object_view_url(collection_md['uid']),
            title=collection_md['title'])
        doc.insert(0, alt_link)

        return doc

    def _add_paging_info(self, doc):
        ''' add rel links for paging '''
        # single-page feed for now: self/first/last all point at the feed
        # itself.  Inserting last, first, self (each at index 0) leaves the
        # final order: self, first, last.
        for rel in ("last", "first", "self"):
            link = etree.Element(
                etree.QName(ATOM_NS, "link"), rel=rel, href=self.s3_url)
            doc.insert(0, link)

    def _add_merritt_id(self, doc, merritt_collection_id):
        ''' add Merritt ID '''
        id_element = etree.Element(
            etree.QName(ATOM_NS, "merritt_collection_id"))
        id_element.text = merritt_collection_id
        doc.insert(0, id_element)

    def _populate_entry(self, entry, metadata, nxid, is_parent):
        ''' get <entry> element for a given set of object metadata

        entry     -- etree <entry> element, populated in place and returned
        metadata  -- dict produced by _extract_nx_metadata
        nxid      -- Nuxeo document uid
        is_parent -- when True, also link the object's media.json
        '''

        # atom id (URI)
        nuxeo_object_view_url = self.get_object_view_url(nxid)
        atom_id = etree.SubElement(entry, etree.QName(ATOM_NS, "id"))
        atom_id.text = nuxeo_object_view_url

        # atom title
        atom_title = etree.SubElement(entry, etree.QName(ATOM_NS, "title"))
        atom_title.text = metadata["title"]

        # atom updated
        # NOTE(review): assumes lastModified is a datetime-like object with
        # .isoformat() -- confirm what bundle_lastModified holds upstream
        atom_updated = etree.SubElement(entry, etree.QName(ATOM_NS, "updated"))
        atom_updated.text = metadata['lastModified'].isoformat()

        # atom author
        atom_author = etree.SubElement(entry, etree.QName(ATOM_NS, "author"))
        atom_author.text = "UC Libraries Digital Collection"

        # metadata file link
        self._insert_full_md_link(entry, nxid)

        # media json link (parent objects only)
        if is_parent:
            self._insert_media_json_link(entry, nxid)

        # main content file link
        self._insert_main_content_link(entry, nxid)

        # auxiliary file link(s)
        self._insert_aux_links(entry, nxid)

        # dc creator
        for creator_name in metadata['creator']:
            dc_creator = etree.SubElement(entry, etree.QName(DC_NS, "creator"))
            dc_creator.text = creator_name 

        # dc title
        dc_title = etree.SubElement(entry, etree.QName(DC_NS, "title"))
        dc_title.text = metadata['title']

        # dc date
        dc_date = etree.SubElement(entry, etree.QName(DC_NS, "date"))
        dc_date.text = metadata['date']

        # dc identifier (a.k.a. local identifier) - Nuxeo ID
        nuxeo_identifier = etree.SubElement(entry, etree.QName(DC_NS, "identifier"))
        nuxeo_identifier.text = nxid

        # UCLDC identifier (a.k.a. local identifier) - ucldc_schema:identifier -- this will be the ARK if we have it
        if metadata['id']:
            ucldc_identifier = etree.SubElement(entry, etree.QName(NX_NS, "identifier"))
            ucldc_identifier.text = metadata['id']

        # UCLDC collection identifier
        ucldc_collection_id = etree.SubElement(entry, etree.QName(NX_NS, "collection"))
        ucldc_collection_id.text = metadata['collection']

        return entry

    def _insert_media_json_link(self, entry, uid):
        # link the deep-harvest media.json for this object
        etree.SubElement(
            entry,
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=self.get_media_json_url(uid),
            type="application/json",
            title="Deep Harvest metadata for this object")

    def _insert_main_content_link(self, entry, uid):
        ''' add a <link> for the object's main content file, carrying its
        MD5 checksum as a child element when known '''
        nx_metadata = self.nx.get_metadata(uid=uid)
        nuxeo_file_download_url = self.get_object_download_url(nx_metadata)
        checksum = self.get_nuxeo_file_checksum(nx_metadata)
        if nuxeo_file_download_url:
            main_content_link = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=nuxeo_file_download_url, title="Main content file") # FIXME add content_type
            # attach the checksum only when there is a link element to hang
            # it on; previously a truthy checksum with no download url
            # raised UnboundLocalError on `main_content_link`
            if checksum:
                checksum_element = etree.SubElement(main_content_link, etree.QName(OPENSEARCH_NS, "checksum"), algorithm="MD5")
                checksum_element.text = checksum

    def _insert_aux_links(self, entry, uid):
        # one <link> per auxiliary file, each with an MD5 checksum if known
        nx_metadata = self.nx.get_metadata(uid=uid)
        for aux in self.get_aux_files(nx_metadata):
            aux_link = etree.SubElement(
                entry,
                etree.QName(ATOM_NS, "link"),
                rel="alternate",
                href=aux['url'],
                title="Auxiliary file")
            if aux['checksum']:
                checksum_element = etree.SubElement(
                    aux_link,
                    etree.QName(OPENSEARCH_NS, "checksum"),
                    algorithm="MD5")
                checksum_element.text = aux['checksum']

    def _insert_full_md_link(self, entry, uid):
        # link the full Nuxeo metadata record (xml) for this object
        etree.SubElement(
            entry,
            etree.QName(ATOM_NS, "link"),
            rel="alternate",
            href=self.get_full_metadata(uid),
            type="application/xml",
            title="Full metadata for this object from Nuxeo")

    def _write_feed(self, doc):
        ''' publish feed '''
        serialized = etree.tostring(
            etree.ElementTree(doc),
            pretty_print=True,
            encoding='utf-8',
            xml_declaration=True)

        with open(self.atom_filepath, "w") as outfile:
            outfile.write(serialized)
      
    def _s3_get_feed(self):
       """ Retrieve ATOM feed file from S3. Return as ElementTree object """
       bucketpath = self.bucket.strip("/")
       bucketbase = self.bucket.split("/")[0]
       keyparts = bucketpath.split("/")[1:]
       keyparts.append(self.atom_file)
       keypath = '/'.join(keyparts)

       s3 = boto3.client('s3')
       response = s3.get_object(Bucket=bucketbase,Key=keypath)
       contents = response['Body'].read()

       return etree.fromstring(contents) 

    def _s3_stash(self):
       """ Stash file in S3 bucket.
       """
       bucketpath = self.bucket.strip("/")
       bucketbase = self.bucket.split("/")[0]
       keyparts = bucketpath.split("/")[1:]
       keyparts.append(self.atom_file)
       keypath = '/'.join(keyparts)

       s3 = boto3.client('s3')
       with open(self.atom_filepath, 'r') as f:
           s3.upload_fileobj(f, bucketbase, keypath)

    def get_object_view_url(self, nuxeo_id):
        """ Build the Nuxeo document-view URL for *nuxeo_id* """
        split = urlparse.urlsplit(self.nx.conf["api"])
        return "{}://{}/Nuxeo/nxdoc/default/{}/view_documents".format(
            split.scheme, split.netloc, nuxeo_id)

    def get_full_metadata(self, nuxeo_id):
        """ Build the URL for full object metadata via the Nuxeo API """
        split = urlparse.urlsplit(self.nx.conf["api"])
        return '{}://{}/Merritt/{}.xml'.format(
            split.scheme, split.netloc, nuxeo_id)

    def get_object_download_url(self, metadata):
        ''' given the full metadata for an object, get file download url

        Returns None when the document has no file attached.
        Raises KeyError when the 'file' schema is missing from the metadata.
        '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError("Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'")

        if file_content is None:
            return None

        # rewrite the path so the file is reachable via basic auth
        return file_content['data'].replace('/nuxeo/', '/Nuxeo/')

    def get_media_json_url(self, nuxeo_id):
        """ Return the S3 URL of the precomputed media.json (deep harvest) for *nuxeo_id* """
        # e.g. https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/002130a5-e171-461b-a41b-28ab46af9652-media.json
        template = "https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/{}-media.json"
        return template.format(nuxeo_id)

    def get_nuxeo_file_checksum(self, metadata):
        ''' get md5 checksum for nuxeo file

        Returns None when the document has no file attached.
        Raises KeyError when the 'file' schema is missing from the metadata.
        '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError("Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'")

        return None if file_content is None else file_content['digest']

    def get_aux_files(self, metadata):
        ''' get auxiliary file urls/checksums for an object

        Collects both "attachment" files (files:files) and "extra_file"
        files (extra_files:file). Each result dict may contain 'url'
        and/or 'checksum'.
        '''
        results = []
        props = metadata['properties']

        def describe(blob):
            ''' build a {url, checksum} dict from a Nuxeo blob dict '''
            info = {}
            if blob and blob['data']:
                # rewrite the path so the file is reachable via basic auth
                info['url'] = blob['data'].replace('/nuxeo/', '/Nuxeo/')
            if blob and blob['digest']:
                info['checksum'] = blob['digest']
            return info

        # "attachment" files
        for attachment in props['files:files'] or []:
            info = describe(attachment['file'])
            if info:
                results.append(info)

        # "extra_file" files
        for extra in props['extra_files:file'] or []:
            info = describe(extra['blob'])
            if info:
                results.append(info)

        return results

    def _bundle_docs(self, docs):
        ''' given a list of parent level nuxeo docs, fetch any components
            and also figure out when any part of the object was most
            recently modified/added

            Adds 'components' and 'bundle_lastModified' keys to each doc
            and returns the (mutated) list.
        '''

        for doc in docs:

            # start from the parent doc's own modification time
            overall_mod_datetime = parse(doc['lastModified'])

            # FIX: components was previously hard-coded to [], which made
            # the loop below dead code and left every bundle timestamp
            # equal to the parent's. Fetch the real components, as the
            # docstring describes.
            doc['components'] = self.dh.fetch_components(doc)

            for component in doc['components']:
                component_mod = parse(component['lastModified'])
                if component_mod > overall_mod_datetime:
                    overall_mod_datetime = component_mod

            doc['bundle_lastModified'] = overall_mod_datetime

        return docs

    def process_feed(self):
        ''' create ATOM feed for collection, write it locally, and
            (unless self.nostash) stash it on s3 '''
        self.logger.info("atom_file: {}".format(self.atom_file))
        self.logger.info("Nuxeo path: {}".format(self.path))
        self.logger.info("Fetching Nuxeo docs. This could take a while if collection is large...")

        parent_docs = self.dh.fetch_objects()

        # bundle components with their parents and sort oldest-first, so
        # that root.insert(0, ...) below leaves the newest entries on top
        bundled_docs = self._bundle_docs(parent_docs)
        bundled_docs.sort(key=itemgetter('bundle_lastModified'))

        # create root
        root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

        # add entries
        for document in bundled_docs:
            nxid = document['uid']
            self.logger.info("working on document: {} {}".format(
                nxid, document['path']))

            # object, bundled into one <entry> if complex
            entry = self._construct_entry_bundled(document)
            self.logger.info("inserting entry for object {} {}".format(
                nxid, document['path']))
            root.insert(0, entry)

        # add header info
        # FIX: use self.logger (not the module-level logging functions)
        # for consistency with the rest of this method
        self.logger.info("Adding header info to xml tree")
        self._add_merritt_id(root, self.merritt_id)
        self._add_paging_info(root)
        self._add_collection_alt_link(root, self.path)
        self._add_atom_elements(root)
        self._add_feed_updated(
            root, datetime.now(dateutil.tz.tzutc()).isoformat())

        self._write_feed(root)
        self.logger.info("Feed written to file: {}".format(self.atom_filepath))

        if not self.nostash:
            self._s3_stash()
            self.logger.info("Feed stashed on s3: {}".format(self.s3_url))
コード例 #11
0
class NuxeoStashMediaJson(NuxeoStashRef):
    ''' create and stash media.json file for a nuxeo object '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=True,
                 **kwargs):
        super(NuxeoStashMediaJson, self).__init__(path, bucket, region,
                                                  pynuxrc, replace, **kwargs)

        # deep-harvest helper and media.json builder
        self.dh = DeepHarvestNuxeo(
            self.path, self.bucket, pynuxrc=self.pynuxrc)
        self.mj = MediaJson()

        # local scratch location for the generated media.json
        self.filename = FILENAME_FORMAT.format(self.uid)
        self.filepath = os.path.join(self.tmp_dir, self.filename)
        self._update_report('filename', self.filename)
        self._update_report('filepath', self.filepath)

    def nxstashref(self):
        ''' entry point used by the generic stash machinery '''
        return self.nxstash_mediajson()

    def nxstash_mediajson(self):
        ''' create media.json file for object and stash on s3 '''
        self._update_report('stashed', False)

        # extract and transform metadata for parent obj and any components
        parent_md = self._get_parent_metadata(self.metadata)
        components = self.dh.fetch_components(self.metadata)
        component_md = [self._get_component_metadata(c) for c in components]

        # create media.json file locally
        media_json = self.mj.create_media_json(parent_md, component_md)
        self._write_file(media_json, self.filepath)

        # stash media.json file on s3 and record the outcome
        stashed, s3_report = s3stash.s3tools.s3stash(
            self.filepath, self.bucket, self.filename, self.region,
            'application/json', self.replace)
        self._update_report('s3_stash', s3_report)
        self._update_report('stashed', stashed)

        self._remove_tmp()

        return self.report

    def _get_parent_metadata(self, obj):
        ''' assemble top-level (parent) object metadata '''
        metadata = {'label': obj['title']}

        # only provide id, href, format if Nuxeo Document has file attached
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        if self.dh.has_file(full_metadata):
            metadata['id'] = obj['uid']
            metadata['href'] = self.dh.get_object_download_url(full_metadata)
            metadata['format'] = self.dh.get_calisphere_object_type(
                obj['type'])
            if metadata['format'] == 'video':
                metadata['dimensions'] = self.dh.get_video_dimensions(
                    full_metadata)

        return metadata

    def _get_component_metadata(self, obj):
        ''' assemble component object metadata '''
        full_metadata = self.nx.get_metadata(uid=obj['uid'])

        metadata = {
            'label': obj['title'],
            'id': obj['uid'],
            'href': self.dh.get_object_download_url(full_metadata),
        }

        # fold in additional ucldc metadata from the 'properties' element
        for key, value in self._get_ucldc_schema_properties(
                full_metadata).iteritems():
            metadata[key] = value

        # map 'type'
        metadata['format'] = self.dh.get_calisphere_object_type(obj['type'])

        return metadata

    def _get_ucldc_schema_properties(self, metadata):
        ''' get additional metadata as mapped by harvester '''
        mapper = UCLDCNuxeoMapper(metadata)
        mapper.map_original_record()
        mapper.map_source_resource()

        props = mapper.mapped_data['sourceResource']
        props.update(mapper.mapped_data['originalRecord'])
        return props

    def _write_file(self, content_dict, filepath):
        """ convert dict to json and write to file """
        serialized = json.dumps(
            content_dict, indent=4, separators=(',', ': '), sort_keys=False)
        with open(filepath, 'wb') as out:
            out.write(serialized)
            out.flush()
コード例 #12
0
def main(argv=None):
    ''' stash Nuxeo image files on s3 '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, create jp2 versions of image '
        'files and stash in S3.'
    )
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument(
        '--bucket',
        default='ucldc-private-files/jp2000',
        help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help='AWS region')
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    parser.add_argument(
        '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region,
                                  argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_image = len([
        key for key, value in report.iteritems()
        if not value['is_image']['is_image']
    ])
    print "not image:\t{}".format(not_image)
    unrecognized = len([
        key for key, value in report.iteritems()
        if not value['precheck']['pass']
    ])
    print "not convertible:\t{}".format(unrecognized)
    converted = len(
        [key for key, value in report.iteritems() if value['converted']])
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "converted:\t{}".format(converted)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "stashed:\t{}".format(stashed)

    print "\nDone."
コード例 #13
0
class Stash(object):
    '''
        stash various files on s3 for a Nuxeo collection
        in preparation for harvesting into Calisphere
    '''

    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        ''' set up deep-harvest helper and pre-fetch the collection's
            objects plus a per-object cache of their components '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace

        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)

        self.objects = self.fetch_objects()

        # cache components up front so each stash pass below does not
        # have to re-query Nuxeo
        self.components = {
            obj['uid']: self.dh.fetch_components(obj)
            for obj in self.objects
        }

    def fetch_objects(self):
        ''' retrieve the collection's parent-level objects from Nuxeo '''
        docs = self.dh.fetch_objects()
        return docs

    def images(self):
        ''' stash Nuxeo image files on s3; returns a report dict keyed by uid '''
        report = {}
        for obj in self.objects:
            stasher = NuxeoStashImage(obj['path'], IMAGE_BUCKET, IMAGE_REGION,
                                      self.pynuxrc, self.replace)
            report[stasher.uid] = stasher.nxstashref()

            for component in self.components[obj['uid']]:
                self.logger.info('Stashing image {}'.format(
                    component['path']))
                stasher = NuxeoStashImage(component['path'], IMAGE_BUCKET,
                                          IMAGE_REGION, self.pynuxrc,
                                          self.replace)
                report[stasher.uid] = stasher.nxstashref()

        return report

    def files(self):
        ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a
        collection; returns a report dict keyed by uid
        '''
        report = {}
        for obj in self.objects:
            stasher = NuxeoStashFile(obj['path'], FILE_BUCKET, FILE_REGION,
                                     self.pynuxrc, self.replace)
            report[stasher.uid] = stasher.nxstashref()

            for component in self.components[obj['uid']]:
                self.logger.info('Stashing file {}'.format(
                    component['path']))
                stasher = NuxeoStashFile(component['path'], FILE_BUCKET,
                                         FILE_REGION, self.pynuxrc,
                                         self.replace)
                report[stasher.uid] = stasher.nxstashref()

        return report

    def thumbnails(self):
        ''' stash thumbnail images for Nuxeo files of type 'file', 'audio',
        or 'video' for a collection; returns a report dict keyed by uid
        '''
        report = {}
        for obj in self.objects:
            stasher = NuxeoStashThumb(obj['path'], THUMB_BUCKET, THUMB_REGION,
                                      self.pynuxrc, self.replace)
            report[stasher.uid] = stasher.nxstashref()

            for component in self.components[obj['uid']]:
                self.logger.info('Stashing thumb {}'.format(
                    component['path']))
                stasher = NuxeoStashThumb(component['path'], THUMB_BUCKET,
                                          THUMB_REGION, self.pynuxrc,
                                          self.replace)
                report[stasher.uid] = stasher.nxstashref()

        return report

    def media_json(self):
        ''' create and stash media.json files for a nuxeo collection;
        returns a report dict keyed by uid
        '''
        report = {}
        for obj in self.objects:
            self.logger.info('Stashing media json {}'.format(obj['path']))
            stasher = NuxeoStashMediaJson(obj['path'], MEDIAJSON_BUCKET,
                                          MEDIAJSON_REGION, self.pynuxrc,
                                          self.replace)
            report[stasher.uid] = stasher.nxstashref()

        return report