def main(argv=None):

    parser = argparse.ArgumentParser(
        description='Print count of objects for a given collection.')
    parser.add_argument('path', help="Nuxeo path to collection")
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc-prod',
                        help="rcfile for use with pynux utils")
    parser.add_argument('--components',
                        action='store_true',
                        help="show counts for object components")
    if argv is None:
        argv = parser.parse_args()

    dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    if not argv.components:
        return

    print "about to iterate through objects and get components"
    component_count = 0
    for obj in objects:
        components = dh.fetch_components(obj)
        component_count = component_count + len(components)
    print "finished fetching components. {} found".format(component_count)
    print "Grand Total: {}".format(object_count + component_count)
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='Print count of objects for a given collection.')
    parser.add_argument('path', help="Nuxeo path to collection")
    parser.add_argument(
        'since_date',
        help=
        "Script will list docs updated since midnight on this date, GMT. Format YYYY-MM-DD",
        type=valid_date)
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc',
                        help="rcfile for use with pynux utils")
    parser.add_argument('--components',
                        action='store_true',
                        help="show counts for object components")
    if argv is None:
        argv = parser.parse_args()

    dh = DeepHarvestNuxeo(argv.path, '', pynuxrc=argv.pynuxrc)
    print "about to fetch docs for path {}".format(dh.path)
    objects = dh.fetch_objects()

    component_count = 0
    for obj in objects:
        last_mod_str = obj['lastModified'][:10]
        last_mod_date = parse(last_mod_str)
        if last_mod_date > argv.since_date:
            print last_mod_str, obj['path']
        '''
Example #3
0
def main(argv=None):
    parser = argparse.ArgumentParser(
        description='print differences between Nuxeo and CouchDB for a '
                    'given collection'
    )
    parser.add_argument('regid', help="Collection Registry ID")
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-basic',
        help="rcfile for use with pynux utils")

    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.regid
    couch = get_couch_objects(registry_id)
    print('couch has {} objects'.format(len(couch)))

    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    for obj in dh.fetch_objects():
        incouch = True if obj['uid'] in couch else False
        if not incouch:
            print(obj['uid'])
    def deepharvest(self, metadata):
        ''' given a set of nuxeo metadata for a doc, deep harvest it '''

        self.logger.info("Processing {}".format(metadata['uid']))

        # map the Nuxeo doc type onto a calisphere object type
        # NOTE(review): local name `type` shadows the builtin; rename when a
        # behavior-changing edit is in scope.
        dh = DeepHarvestNuxeo('')
        type = dh.get_calisphere_object_type(metadata['type'])
        self.logger.info("Type: {}".format(type))

        # report maps each stashed doc's uid -> its nxstashref() result
        report = {}
        if type == 'image':
            ''' stash image '''
            nxstash = NuxeoStashImage(metadata['path'], IMAGE_BUCKET, IMAGE_REGION,
                                      self.pynuxrc, self.replace, metadata=metadata)
            report[nxstash.uid] = nxstash.nxstashref()

        print report

        if type in ['file', 'audio', 'video']:
            # stash file
            nxstash = NuxeoStashFile(metadata['path'], FILE_BUCKET, FILE_REGION,
                                     self.pynuxrc, self.replace, metadata=metadata)
            report[nxstash.uid] = nxstash.nxstashref()

            # stash thumbnail
            nxstash = NuxeoStashThumb(metadata['path'], THUMB_BUCKET, THUMB_REGION,
                                      self.pynuxrc, self.replace, metadata=metadata)
            report[nxstash.uid] = nxstash.nxstashref()

        print report

        # stash media.json
        # NOTE(review): the method appears to end mid-thought here -- the
        # media.json stashing step named above may be truncated in this view.
        '''
Example #5
0
    def __init__(self, collection_id, **kwargs):
        '''Set up ATOM-feed state for one registry collection.

        Recognized kwargs: bucket, pynuxrc, dir, nostash, nuxeo_path,
        merritt_id.  Raises ValueError when no Merritt ID is found or no
        ATOM filename can be derived from the collection id.
        '''
        self.logger = logging.getLogger(__name__)

        self.collection_id = collection_id

        # optional settings with defaults; kwargs.get keeps the exact
        # semantics of the old `if key in kwargs:` chains
        self.bucket = kwargs.get('bucket', BUCKET)
        pynuxrc = kwargs.get('pynuxrc')
        self.dir = kwargs.get('dir', '.')
        self.nostash = kwargs.get('nostash', False)

        self.logger.info("collection_id: {}".format(self.collection_id))

        # these defaults require lookups, so only compute them when the
        # kwarg is absent (kwargs.get would evaluate them eagerly)
        if 'nuxeo_path' in kwargs:
            self.path = kwargs['nuxeo_path']
        else:
            self.path = self._get_nuxeo_path()

        if 'merritt_id' in kwargs:
            self.merritt_id = kwargs['merritt_id']
        else:
            self.merritt_id = self._get_merritt_id()

        if not self.merritt_id:
            raise ValueError("No Merritt ID for this collection")

        self.feed_base_url = 'https://s3.amazonaws.com/{}/'.format(self.bucket)

        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc), 'r'))
            self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=pynuxrc)
        elif os.path.isfile(expanduser('~/.pynuxrc')):
            # `not pynuxrc` is implied by reaching this elif branch
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'), 'r'))
            self.dh = DeepHarvestNuxeo(self.path, '')
        # NOTE(review): if neither branch runs, self.nx/self.dh are never
        # assigned and later use raises AttributeError -- confirm intended.

        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError("Could not create filename for ATOM feed based on collection id: {}".format(self.collection_id))

        self.s3_url = "{}{}".format(self.feed_base_url, self.atom_file)

        self.atom_filepath = os.path.join(self.dir, self.atom_file)
def main(argv=None):
    ''' create and stash media.json files for a nuxeo collection '''

    parser = argparse.ArgumentParser(description='Create and stash media.json'
                                     'files for a nuxeo collection')
    parser.add_argument("path", help="Nuxeo document path")
    parser.add_argument("--bucket",
                        default="static.ucldc.cdlib.org/media_json",
                        help="S3 bucket where media.json files will be stashed")
    parser.add_argument('--region', default='us-east-1', help="aws region")
    parser.add_argument("--pynuxrc", default='~/.pynuxrc',
                        help="rc file for use by pynux")
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")

    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/mediajson-{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}

    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashMediaJson(
            obj['path'],
            argv.bucket,
            argv.region,
            argv.pynuxrc,
            argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/mediajson-{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
def main(argv=None):
    ''' create and stash media.json files for a nuxeo collection '''

    parser = argparse.ArgumentParser(description='Create and stash media.json'
                                     'files for a nuxeo collection')
    parser.add_argument("path", help="Nuxeo document path")
    parser.add_argument(
        "--bucket",
        default="static.ucldc.cdlib.org/media_json",
        help="S3 bucket where media.json files will be stashed")
    parser.add_argument('--region', default='us-east-1', help="aws region")
    parser.add_argument("--pynuxrc",
                        default='~/.pynuxrc',
                        help="rc file for use by pynux")
    parser.add_argument('--replace',
                        action="store_true",
                        help="replace file on s3 if it already exists")

    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/mediajson-{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}

    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashMediaJson(obj['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/mediajson-{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
Example #8
0
    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        '''Fetch a collection's objects and, for each object, its components.'''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace

        # deep-harvest client scoped to this collection path
        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)

        self.objects = self.fetch_objects()

        # map: object uid -> list of that object's component docs
        self.components = {
            obj['uid']: self.dh.fetch_components(obj)
            for obj in self.objects
        }
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='list objects for a given collection.')
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc-basic',
                        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    print "about to iterate through objects and get components"
    component_count = 0
    all_components = []
    for obj in objects:
        components = dh.fetch_components(obj)
        all_components.extend(components)
        print "{} components for {}".format(len(components), obj['uid'])
    print "finished fetching components. {} found".format(len(all_components))

    objects.extend(all_components)
    total_obj = len(objects)
    print "Grand Total: {}".format(total_obj)

    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    chunks = [
        objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE)
    ]

    count = 0
    for c in chunks:
        count = count + 1
        filepath = 'chunks/{}_{}.txt'.format(registry_id, count)
        print "Writing file: {}".format(filepath)
        with open(filepath, 'w') as f:
            json.dump(c, f, indent=4)
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='list objects for a given collection.')
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc-basic',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    print "about to iterate through objects and get components"
    component_count = 0
    all_components = []
    for obj in objects:
        components = dh.fetch_components(obj) 
        all_components.extend(components)
        print "{} components for {}".format(len(components), obj['uid'])
    print "finished fetching components. {} found".format(len(all_components))

    objects.extend(all_components)
    total_obj = len(objects)
    print "Grand Total: {}".format(total_obj)

    # https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    chunks = [objects[i:i + PER_PAGE] for i in xrange(0, len(objects), PER_PAGE)]

    count = 0
    for c in chunks:
        count = count + 1        
        filepath = 'chunks/{}_{}.txt'.format(registry_id, count)
        print "Writing file: {}".format(filepath)
        with open(filepath, 'w') as f:
            json.dump(c, f, indent=4)
Example #11
0
def main(argv=None):
    parser = argparse.ArgumentParser(description='Create ATOM feed for a given Nuxeo folder for Merritt harvesting')
    parser.add_argument("collection", help="UCLDC Registry Collection ID")
    parser.add_argument("--pynuxrc", help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()
    collection_id = argv.collection

    if argv.pynuxrc:
        ma = MerrittAtom(collection_id, argv.pynuxrc)
    else:
        ma = MerrittAtom(collection_id)

    print "atom_file: {}".format(ma.atom_file)

    if argv.pynuxrc:
        dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc)
    else:
        dh = DeepHarvestNuxeo(ma.path, '')

    print "Fetching Nuxeo docs. This could take a while if collection is large..."
    documents = dh.fetch_objects()
    # TODO: fetch components also

    # create root
    root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

    # add entries
    for document in documents:
        nxid = document['uid']
        print "constructing entry for {} {}".format(nxid, document['path'])
        nx_metadata = ma._extract_nx_metadata(nxid)
        entry = etree.Element(etree.QName(ATOM_NS, "entry"))
        entry = ma._populate_entry(entry, nx_metadata, nxid)        
        root.insert(0, entry)

    # add header info
    print "Adding header info to xml tree"
    ma._add_merritt_id(root, ma.merritt_id)
    ma._add_paging_info(root)
    ma._add_collection_alt_link(root, ma.path)
    ma._add_atom_elements(root)
    ma._add_feed_updated(root, ma.last_update)

    ma._publish_feed(root)
    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=True,
                 **kwargs):
        '''Media.json stasher: base stash init, plus deep-harvest client,
        media.json builder, and the target file name/path.'''
        super(NuxeoStashMediaJson, self).__init__(path, bucket, region,
                                                  pynuxrc, replace, **kwargs)

        self.dh = DeepHarvestNuxeo(self.path, self.bucket,
                                   pynuxrc=self.pynuxrc)
        self.mj = MediaJson()

        # name and scratch-dir location of the media.json to generate
        self.filename = FILENAME_FORMAT.format(self.uid)
        self._update_report('filename', self.filename)
        self.filepath = os.path.join(self.tmp_dir, self.filename)
        self._update_report('filepath', self.filepath)
def main(collection_ids, rq_queue='dh-q', config=None, pynuxrc=None,
        replace=False, timeout=JOB_TIMEOUT, log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    :param collection_ids: ';'-delimited string of registry collection ids
    :param config: dict holding redis_* connection settings
    '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')

    def _enqueue(path):
        # shared redis/queue plumbing for top-level objects and components
        queue_deep_harvest_path(
            config['redis_host'],
            config['redis_port'],
            config['redis_password'],
            config['redis_connect_timeout'],
            rq_queue=rq_queue,
            path=path,
            replace=replace,
            timeout=timeout)

    # split(';') already yields a list; the identity comprehension that
    # wrapped it was redundant
    for cid in collection_ids.split(';'):
        url_api = ''.join(('https://registry.cdlib.org/api/v1/collection/',
                    cid, '/'))
        coll = Collection(url_api)

        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)

        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'],
                obj['path']))
            # deep harvest top level object
            _enqueue(obj['path'])
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(
                    c['uid'],
                    c['path']))
                _enqueue(c['path'])

    log_handler.pop_application()
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    :param collection_ids: ';'-delimited string of registry collection ids
    :param config: dict holding redis_* connection settings
    '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')

    def _enqueue(path):
        # single place for the redis/queue plumbing shared by top-level
        # objects and their components
        queue_deep_harvest_path(config['redis_host'],
                                config['redis_port'],
                                config['redis_password'],
                                config['redis_connect_timeout'],
                                rq_queue=rq_queue,
                                path=path,
                                replace=replace,
                                timeout=timeout)

    # split(';') already returns a list -- no comprehension needed
    for cid in collection_ids.split(';'):
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)

        dh = DeepHarvestNuxeo(coll.harvest_extra_data, '', pynuxrc=pynuxrc)

        for obj in dh.fetch_objects():
            log.info('Queueing TOPLEVEL {} :-: {}'.format(
                obj['uid'], obj['path']))
            # deep harvest top level object
            _enqueue(obj['path'])
            # deep harvest component sub-objects
            for c in dh.fetch_components(obj):
                log.info('Queueing {} :-: {}'.format(c['uid'], c['path']))
                _enqueue(c['path'])

    log_handler.pop_application()
Example #15
0
    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False,
                 **kwargs):
        '''Initialize stash state for one Nuxeo document.

        Reads metadata from kwargs['metadata'] when supplied, otherwise
        fetches it via pynux; records the uid, builds a DeepHarvestNuxeo
        helper, creates a temp working dir, and seeds the report dict.
        '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # pynux client built from the caller's rcfile
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        if 'metadata' in kwargs:
            # caller supplied metadata -- avoids a round trip to Nuxeo
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # running report of this stash run's settings and results
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)
Example #16
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description=
        'Print info on objects missing from couchdb for Nuxeo collection')
    parser.add_argument('id', help='Collection registry ID')
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc',
                        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.id
    print "Registry ID: {}".format(registry_id)

    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    print "Nuxeo path: {}".format(nxpath)

    # get couchdb data
    view = "https://harvest-stg.cdlib.org/couchdb/ucldc/_design/all_provider_docs/_view/by_provider_name?key=%22{}%22".format(
        registry_id)
    print view
    res = requests.get(view, verify=False)  # FIXME we want to verify
    res.raise_for_status()
    couchdata = json.loads(res.content)
    rows = couchdata['rows']
    delimiter = "{}--".format(registry_id)
    couch_uids = [row['id'].split(delimiter)[1] for row in rows]
    couch_count = len(couch_uids)
    print "Total rows in couchdb: {}".format(couch_count)

    # get nuxeo data
    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    nx_count = len(objects)
    print "Total objects in Nuxeo: {}".format(nx_count)

    for obj in objects:
        if obj['uid'] not in couch_uids:
            print obj['uid'], obj['path']
def main(argv=None):

    parser = argparse.ArgumentParser(
        description='Print info on objects missing from couchdb for Nuxeo collection')
    parser.add_argument('id', help='Collection registry ID')
    parser.add_argument(
        '--pynuxrc',
        default='~/.pynuxrc',
        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.id
    print "Registry ID: {}".format(registry_id)

    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    print "Nuxeo path: {}".format(nxpath)

    # get couchdb data
    view = "https://harvest-stg.cdlib.org/couchdb/ucldc/_design/all_provider_docs/_view/by_provider_name?key=%22{}%22".format(registry_id)
    print view
    res = requests.get(view, verify=False) # FIXME we want to verify
    res.raise_for_status()
    couchdata = json.loads(res.content)
    rows = couchdata['rows']
    delimiter = "{}--".format(registry_id)
    couch_uids = [row['id'].split(delimiter)[1] for row in rows]
    couch_count = len(couch_uids)
    print "Total rows in couchdb: {}".format(couch_count)

    # get nuxeo data
    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    nx_count = len(objects)
    print "Total objects in Nuxeo: {}".format(nx_count)

    for obj in objects:
        if obj['uid'] not in couch_uids:
            print obj['uid'], obj['path']
Example #18
0
def main(argv=None):

    parser = argparse.ArgumentParser(
        description=
        'list objects for a given collection where nuxeo doc type is image but file type is pdf'
    )
    parser.add_argument('registry_id', help='UCLDC Registry ID')
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc-basic',
                        help="rcfile for use with pynux utils")
    if argv is None:
        argv = parser.parse_args()

    registry_id = argv.registry_id

    # get nuxeo path
    nxpath = s3stash.s3tools.get_nuxeo_path(registry_id)
    if nxpath is None:
        print "No record found for registry_id: {}".format(registry_id)
        sys.exit()

    dh = DeepHarvestNuxeo(nxpath, '', pynuxrc=argv.pynuxrc)
    print "about to fetch objects for path {}".format(dh.path)
    objects = dh.fetch_objects()
    object_count = len(objects)
    print "finished fetching objects. {} found".format(object_count)

    convert = Convert()
    counter = 0
    for obj in objects:
        if dh.has_file(obj) and obj['type'] == u'SampleCustomPicture' and obj[
                'properties']['file:content'][
                    'mime-type'] == u'application/pdf':
            print obj['uid'], obj['path'], obj['type'], obj['properties'][
                'file:content']['name']
            counter = counter + 1

    print counter
Example #19
0
    def __init__(self, url_harvest, extra_data, conf_pynux=None, **kwargs):
        '''
        uses pynux (https://github.com/ucldc/pynux) to grab objects from
        the Nuxeo API

        api url is set from url_harvest, overriding pynuxrc config and
        passed in conf.

        the pynux config file should have user & password
        and X-NXDocumemtProperties values filled in.

        :param conf_pynux: optional pynux config dict.  A fresh dict is
            created per call when omitted -- the old mutable ``{}`` default
            was mutated below (``conf_pynux['api'] = ...``) and therefore
            leaked the api url across instances.
        '''
        super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        if conf_pynux is None:
            conf_pynux = {}
        self._url = url_harvest
        self._path = extra_data
        self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
        self._nx.conf['api'] = self._url
        self._structmap_bucket = STRUCTMAP_S3_BUCKET

        # get harvestable child objects
        conf_pynux['api'] = self._url
        self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)

        self._children = iter(self._dh.fetch_objects())
    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        '''Gather a collection's objects plus each object's components.'''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace

        # deep-harvest client for this collection path
        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)

        self.objects = self.fetch_objects()

        # object uid -> its component docs
        self.components = {
            obj['uid']: self.dh.fetch_components(obj)
            for obj in self.objects
        }
Example #21
0
    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        '''Initialize stash state for one Nuxeo document.

        Takes metadata from kwargs['metadata'] when supplied, otherwise
        fetches it via pynux; records the uid, builds a DeepHarvestNuxeo
        helper, creates a temp working dir, and seeds the report dict.
        '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # pynux client built from the caller's rcfile
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        if 'metadata' in kwargs:
            # caller-supplied metadata saves a round trip to Nuxeo
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # running report of this stash run's settings and results
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)
Example #22
0
    def __init__(self, url_harvest, extra_data, conf_pynux=None, **kwargs):
        '''
        uses pynux (https://github.com/ucldc/pynux) to grab objects from
        the Nuxeo API

        api url is set from url_harvest, overriding pynuxrc config and
        passed in conf.

        the pynux config file should have user & password
        and X-NXDocumemtProperties values filled in.

        :param conf_pynux: optional pynux config dict.  A fresh dict is
            created per call when omitted -- the previous mutable ``{}``
            default was mutated below (``conf_pynux['api'] = ...``), so the
            api url leaked between instances sharing the default.
        '''
        super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        if conf_pynux is None:
            conf_pynux = {}
        self._url = url_harvest
        self._path = extra_data
        self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
        self._nx.conf['api'] = self._url
        self._structmap_bucket = STRUCTMAP_S3_BUCKET

        # get harvestable child objects
        conf_pynux['api'] = self._url
        self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)

        self._children = iter(self._dh.fetch_objects())
def main(argv=None):
    ''' stash Nuxeo files of type 'file', 'audio', or 'video'
    for a collection '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, stash files (pdf, txt, etc) in S3.')
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument(
        '--bucket', default='ucldc-nuxeo-ref-media', help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help="aws region")
    parser.add_argument(
        '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux")
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}

    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashFile(obj['path'], argv.bucket, argv.region,
                                 argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashFile(c['path'], argv.bucket, argv.region,
                                     argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_file = len([
        key for key, value in report.iteritems()
        if not value['calisphere_type'] in VALID_CALISPHERE_TYPES
    ])
    print "not type `file`, `audio` or `video`:\t{}".format(not_file)
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "already stashed:\t{}".format(already_stashed)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "(re)stashed:\t{}".format(stashed)

    print "\nDone."
# Example #24
# 0
''' write nuxeo uid, identifier to file for UCSF Berne collections '''
collections = [
                  #'/asset-library/UCSF/Berne_Eric_Collection',
                  '/asset-library/UCSF/MSS 2003-12 Eric Berne papers',
                  '/asset-library/UCSF/MSS 2005-08 Eric Berne papers',
                  '/asset-library/UCSF/MSS 2013-18 Eric Berne papers',
                  '/asset-library/UCSF/MSS 2013-19 Eric Berne papers',
                  '/asset-library/UCSF/MSS 82-0 Eric Berne papers',
                  '/asset-library/UCSF/MSS 89-12 Eric Berne papers'
              ]


# local identifier (filename stem) -> Nuxeo uid.
# Named uid_map rather than `map` so the builtin is not shadowed.
uid_map = {}
for collection in collections:
    dh = DeepHarvestNuxeo(collection, '')
    for obj in dh.fetch_objects():
        # identifier is the object's filename without its extension
        filename = obj['path'].split('/')[-1]
        identifier = filename.split('.')[0]
        uid_map[identifier] = obj['uid']

# Additions: hand-corrected mappings not derivable from the Nuxeo paths
uid_map['mss2005-08_1_7_CTmedapplication_1937-08-02'] = 'bef32337-6ca6-43c9-9eaa-e0553f26f3dc'
uid_map['mss2013-19_5_17_difficulties-comparative-psychiatry_ca1959'] = 'ad8c13d2-a89d-4346-809b-03cd612b9c80'
uid_map['mss2013-19_18_CAboardmedexaminers-cert_1945-07-19'] = 'ad8c13d2-a89d-4346-809b-03cd612b9c80'
uid_map['mss82-0_cover_gamespeopleplay-Israeli-ed'] = '13d32c79-0e59-4b25-b590-08c3249ca420'
uid_map['mss2005-08_1_13_AUS-certofservice_1946-09-23'] = '74a79291-43b4-4ae0-9d56-634ee1a5953c'
uid_map['mss2013-19_1_2_statement-interests-activities_ca1937'] = 'a52a3ab2-8c2b-48d0-b082-969e6cb6dcc3'
uid_map['mss82-0_cover_juegos-en-que-participamos001'] = 'ec8a4de2-e65d-4963-9af2-6693fef19763'
# Example #25
# 0
class NuxeoFetcher(Fetcher):
    '''Harvest a Nuxeo FILE. Can be local or at a URL'''
    def __init__(self, url_harvest, extra_data, conf_pynux=None, **kwargs):
        '''
        uses pynux (https://github.com/ucldc/pynux) to grab objects from
        the Nuxeo API

        api url is set from url_harvest, overriding pynuxrc config and
        passed in conf.

        the pynux config file should have user & password
        and X-NXDocumemtProperties values filled in.
        '''
        # Default to None rather than {}: conf_pynux is mutated below
        # (conf_pynux['api'] = ...), so a mutable default dict would be
        # shared -- and polluted -- across every instance built with the
        # default.
        if conf_pynux is None:
            conf_pynux = {}
        super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        self._url = url_harvest
        self._path = extra_data
        self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
        self._nx.conf['api'] = self._url
        self._structmap_bucket = STRUCTMAP_S3_BUCKET

        # get harvestable child objects
        conf_pynux['api'] = self._url
        self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)

        self._children = iter(self._dh.fetch_objects())

    def _get_structmap_url(self, bucket, obj_key):
        '''Get structmap_url property for object'''
        structmap_url = "s3://{0}/{1}{2}".format(bucket, obj_key,
                                                 '-media.json')
        return structmap_url

    def _get_structmap_text(self, structmap_url):
        '''
           Get structmap_text for object. This is all the words from 'label'
           in the json.
           See https://github.com/ucldc/ucldc-docs/wiki/media.json
        '''
        structmap_text = ""

        bucketpath = self._structmap_bucket.strip("/")
        bucketbase = bucketpath.split("/")[0]
        parts = urlparse.urlsplit(structmap_url)

        # get contents of <nuxeo_id>-media.json file
        conn = boto.connect_s3()
        bucket = conn.get_bucket(bucketbase)
        key = bucket.get_key(parts.path)
        if not key:  # media_json hasn't been harvested yet for this record
            # best-effort: return empty text rather than failing the record
            self.logger.error('Media json at: {} missing.'.format(parts.path))
            return structmap_text
        mediajson = key.get_contents_as_string()
        mediajson_dict = json.loads(mediajson)

        # concatenate all of the words from 'label' in the json
        labels = []
        labels.append(mediajson_dict['label'])
        if 'structMap' in mediajson_dict:
            labels.extend([sm['label'] for sm in mediajson_dict['structMap']])
        structmap_text = ' '.join(labels)
        return structmap_text

    def _get_isShownBy(self, nuxeo_metadata):
        '''
            Get isShownBy value for object
            1) if object has image at parent level, use this
            2) if component(s) have image, use first one we can find
            3) if object has PDF or video at parent level,
                use image stashed on S3
            4) if component(s) have PDF or video, use first component image stashed on S3 we can find
            5) return None
        '''
        is_shown_by = None
        uid = nuxeo_metadata['uid']
        self.logger.info("About to get isShownBy for uid {}".format(uid))

        # 1) if object has image at parent level, use this
        if self._has_image(nuxeo_metadata):
            self.logger.info("Nuxeo doc with uid {} has an image at the "
                             "parent level".format(uid))
            is_shown_by = NUXEO_MEDIUM_IMAGE_URL_FORMAT.format(
                nuxeo_metadata['uid'])
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 2) if component(s) have image, use first one we can find
        first_image_component_uid = self._get_first_image_component(
            nuxeo_metadata)
        self.logger.info(
            "first_image_component_uid: {}".format(first_image_component_uid))
        if first_image_component_uid:
            self.logger.info("Nuxeo doc with uid {} has an image at the"
                             "component level".format(uid))
            is_shown_by = NUXEO_MEDIUM_IMAGE_URL_FORMAT.format(
                first_image_component_uid)
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 3) if object has PDF at parent level, use image stashed on S3
        if self._has_s3_thumbnail(nuxeo_metadata):
            self.logger.info(
                "Nuxeo doc with uid {} has a thumbnail for"
                "parent file (probably PDF) stashed on S3".format(uid))
            is_shown_by = NUXEO_S3_THUMB_URL_FORMAT.format(
                nuxeo_metadata['uid'])
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 4) if component(s) have PDF or video, use first component image stashed on S3 we can find
        first_thumb_component_uid = self._get_first_thumb_component(
            nuxeo_metadata)
        self.logger.info(
            "first_thumb_component_uid: {}".format(first_thumb_component_uid))
        if first_thumb_component_uid:
            self.logger.info("Nuxeo doc with uid {} has thumbnail at the"
                             "component level".format(uid))
            is_shown_by = NUXEO_S3_THUMB_URL_FORMAT.format(
                first_thumb_component_uid)
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 5) return None
        self.logger.info("Could not find any image for Nuxeo doc with uid "
                         "{}! Returning None".format(uid))
        return is_shown_by

    def _has_image(self, metadata):
        ''' based on json metadata, determine whether or not this Nuxeo doc
        has an image file associated
        '''

        if metadata['type'] != "SampleCustomPicture":
            return False

        properties = metadata['properties']
        file_content = properties.get('file:content')
        # 'empty_picture.png' is a placeholder, not a real image
        if file_content and 'name' in file_content and file_content[
                'name'] == 'empty_picture.png':
            return False
        elif file_content and 'data' in file_content:
            return True
        else:
            return False

    def _has_s3_thumbnail(self, metadata):
        ''' based on json metadata, determine whether or not this Nuxeo doc
        is PDF (or other non-image)
            that will have thumb image stashed on S3 for it '''
        if metadata['type'] not in ("CustomFile", "CustomVideo"):
            return False

        properties = metadata['properties']
        file_content = properties.get('file:content')
        if file_content and 'data' in file_content:
            return True
        else:
            return False

    def _get_first_image_component(self, parent_metadata):
        ''' get first image component we can find '''
        component_uid = None

        # children in display order, skipping soft-deleted docs
        query = "SELECT * FROM Document WHERE ecm:parentId = '{}' AND " \
                "ecm:currentLifeCycleState != 'deleted' ORDER BY " \
                "ecm:pos".format(parent_metadata['uid'])
        for child in self._nx.nxql(query):
            child_metadata = self._nx.get_metadata(uid=child['uid'])
            if self._has_image(child_metadata):
                component_uid = child_metadata['uid']
                break

        return component_uid

    def _get_first_thumb_component(self, parent_metadata):
        ''' get first non-image component with thumbnail we can find '''
        component_uid = None

        # children in display order, skipping soft-deleted docs
        query = "SELECT * FROM Document WHERE ecm:parentId = '{}' AND " \
                "ecm:currentLifeCycleState != 'deleted' ORDER BY " \
                "ecm:pos".format(parent_metadata['uid'])
        for child in self._nx.nxql(query):
            child_metadata = self._nx.get_metadata(uid=child['uid'])
            if self._has_s3_thumbnail(child_metadata):
                component_uid = child_metadata['uid']
                break

        return component_uid

    def next(self):
        '''Return Nuxeo record by record to the controller'''
        # raises StopIteration (from the child iterator) when exhausted
        doc = self._children.next()
        self.metadata = self._nx.get_metadata(uid=doc['uid'])
        self.structmap_url = self._get_structmap_url(self._structmap_bucket,
                                                     doc['uid'])
        self.metadata['structmap_url'] = self.structmap_url
        self.metadata['structmap_text'] = self._get_structmap_text(
            self.structmap_url)
        self.metadata['isShownBy'] = self._get_isShownBy(self.metadata)

        return self.metadata
class Stash(object):
    '''
        stash various files on s3 for a Nuxeo collection
        in preparation for harvesting into Calisphere
    '''

    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace

        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)

        self.objects = self.fetch_objects()

        # cache component docs per parent uid so each stash pass can
        # reuse them without re-querying Nuxeo
        self.components = {}
        for obj in self.objects:
            self.components[obj['uid']] = self.dh.fetch_components(obj)

    def fetch_objects(self):
        ''' fetch objects to process '''
        return self.dh.fetch_objects()

    def _stash_all(self, stash_cls, bucket, region, label):
        ''' stash each parent object and its components using stash_cls;
        return a dict mapping Nuxeo uid -> nxstashref() report.
        `label` is only used in the component log message. '''
        report = {}
        for obj in self.objects:
            nxstash = stash_cls(obj['path'], bucket, region, self.pynuxrc,
                                self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing {} {}'.format(label, c['path']))
                nxstash = stash_cls(c['path'], bucket, region, self.pynuxrc,
                                    self.replace)
                report[nxstash.uid] = nxstash.nxstashref()

        return report

    def images(self):
        ''' stash Nuxeo image files on s3 '''
        return self._stash_all(NuxeoStashImage, IMAGE_BUCKET, IMAGE_REGION,
                               'image')

    def files(self):
        ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a
        collection
        '''
        return self._stash_all(NuxeoStashFile, FILE_BUCKET, FILE_REGION,
                               'file')

    def thumbnails(self):
        ''' stash thumbnail images for Nuxeo files of type 'file', 'audio',
        or 'video' for a collection
        '''
        return self._stash_all(NuxeoStashThumb, THUMB_BUCKET, THUMB_REGION,
                               'thumb')

    def media_json(self):
        ''' create and stash media.json files for a nuxeo collection '''
        # media.json is built at the parent level only, so no component
        # loop here
        report = {}
        for obj in self.objects:
            self.logger.info('Stashing media json {}'.format(obj['path']))
            nxstash = NuxeoStashMediaJson(obj['path'], MEDIAJSON_BUCKET,
                                          MEDIAJSON_REGION, self.pynuxrc,
                                          self.replace)
            report[nxstash.uid] = nxstash.nxstashref()

        return report
# Example #27
# 0
class MerrittAtom():
    ''' Build an ATOM feed for a Nuxeo collection and stash it on S3.

    One <entry> is produced per parent-level Nuxeo object; component
    files are bundled into the parent's entry.  Entries are sorted by
    the most recent modification time anywhere in the object.
    '''

    def __init__(self, collection_id, **kwargs):
        # Optional kwargs: bucket, pynuxrc, dir, nostash, nuxeo_path,
        # merritt_id.  Anything not supplied falls back to a module
        # default or a Registry API lookup.

        self.logger = logging.getLogger(__name__)

        self.collection_id = collection_id

        if 'bucket' in kwargs:
            self.bucket = kwargs['bucket']
        else:
            self.bucket = BUCKET

        if 'pynuxrc' in kwargs:
            pynuxrc = kwargs['pynuxrc']
        else:
            pynuxrc = None

        if 'dir' in kwargs:
            self.dir = kwargs['dir']
        else:
            self.dir = '.'

        if 'nostash' in kwargs:
            self.nostash = kwargs['nostash']
        else:
            self.nostash = False

        self.logger.info("collection_id: {}".format(self.collection_id))

        if 'nuxeo_path' in kwargs:
            self.path = kwargs['nuxeo_path']
        else:
            self.path = self._get_nuxeo_path()

        if 'merritt_id' in kwargs:
            self.merritt_id = kwargs['merritt_id']
        else:
            self.merritt_id = self._get_merritt_id()

        if not self.merritt_id:
            raise ValueError("No Merritt ID for this collection")

        self.feed_base_url = 'https://s3.amazonaws.com/{}/'.format(self.bucket)

        # NOTE(review): if no pynuxrc kwarg is given AND ~/.pynuxrc does
        # not exist, self.nx / self.dh are never set and later method
        # calls will raise AttributeError -- confirm whether that is an
        # accepted failure mode.
        if pynuxrc:
            self.nx = utils.Nuxeo(rcfile=open(expanduser(pynuxrc),'r'))
            self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=pynuxrc)
        elif not(pynuxrc) and os.path.isfile(expanduser('~/.pynuxrc')):
            self.nx = utils.Nuxeo(rcfile=open(expanduser('~/.pynuxrc'),'r'))
            self.dh = DeepHarvestNuxeo(self.path, '')

        self.atom_file = self._get_filename(self.collection_id)
        if not self.atom_file:
            raise ValueError("Could not create filename for ATOM feed based on collection id: {}".format(self.collection_id))

        self.s3_url = "{}{}".format(self.feed_base_url, self.atom_file)

        self.atom_filepath = os.path.join(self.dir, self.atom_file)

    def _get_merritt_id(self):
        ''' given collection registry ID, get corresponding Merritt collection ID '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE, self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        merritt_id = md['merritt_id']

        return merritt_id

    def _get_nuxeo_path(self):
        ''' given ucldc registry collection ID, get Nuxeo path for collection '''
        url = "{}collection/{}/?format=json".format(REGISTRY_API_BASE, self.collection_id)
        res = requests.get(url)
        res.raise_for_status()
        md = json.loads(res.text)
        nuxeo_path = md['harvest_extra_data']

        return nuxeo_path

    def _get_filename(self, collection_id):
        ''' given Collection ID, get a friendly filename for the ATOM feed '''
        filename = 'ucldc_collection_{}.atom'.format(collection_id)

        return filename

    def _extract_nx_metadata(self, raw_metadata):
        ''' extract Nuxeo metadata we want to post to the ATOM feed '''
        metadata = {}

        # last modified -- 'bundle_lastModified' is set by _bundle_docs()
        metadata['lastModified'] = raw_metadata['bundle_lastModified']

        # creator
        creators = raw_metadata['properties']['ucldc_schema:creator']
        metadata['creator'] = [creator['name'] for creator in creators]

        # title
        metadata['title'] = raw_metadata['title']

        # date -- only the first date value is kept
        dates = raw_metadata['properties']['ucldc_schema:date']
        dates = [date['date'] for date in dates]
        metadata['date'] = dates[0] if dates else None

        # nuxeo id
        metadata['id'] = raw_metadata['properties']['ucldc_schema:identifier']

        # nuxeo collection
        metadata['collection'] = raw_metadata['properties']['ucldc_schema:collection'][0] if raw_metadata['properties']['ucldc_schema:collection'] else None

        return metadata

    def _construct_entry_bundled(self, doc):
        ''' construct ATOM feed entry element for a given nuxeo doc, including files for any component objects '''
        uid = doc['uid']

        # parent
        nx_metadata = self._extract_nx_metadata(doc)
        entry = etree.Element(etree.QName(ATOM_NS, "entry"))
        entry = self._populate_entry(entry, nx_metadata, uid, True)

        # insert component md
        for c in self.dh.fetch_components(doc):
            self._insert_full_md_link(entry, c['uid'])
            self._insert_main_content_link(entry, c['uid'])
            self._insert_aux_links(entry, c['uid'])

        return entry

    def _add_atom_elements(self, doc):
        ''' add atom feed elements to document '''

        # recommended ATOM feed elements
        feed_author = etree.Element(etree.QName(ATOM_NS, "author"))
        feed_author.text = "UC Libraries Digital Collection"
        doc.insert(0, feed_author)

        # required ATOM feed elements
        feed_title = etree.Element(etree.QName(ATOM_NS, "title"))
        feed_title.text = "UCLDC Metadata Feed" # FIXME get campus name from registry API?
        doc.insert(0, feed_title)

        feed_id = etree.Element(etree.QName(ATOM_NS, "id"))
        feed_id.text = self.s3_url
        doc.insert(0, feed_id)

        return doc

    def _add_feed_updated(self, doc, updated):
        ''' add feed updated '''
        feed_updated = etree.Element(etree.QName(ATOM_NS, "updated"))
        feed_updated.text = updated
        doc.insert(0, feed_updated)

    def _add_collection_alt_link(self, doc, path):
        ''' add elements related to Nuxeo collection info to document '''
        collection_metadata = self.nx.get_metadata(path=path)
        collection_title = collection_metadata['title']
        collection_uid = collection_metadata['uid']
        collection_uri = self.get_object_view_url(collection_uid)

        feed_link_alt = etree.Element(etree.QName(ATOM_NS, "link"), rel="alternate", href=collection_uri, title=collection_title)
        doc.insert(0, feed_link_alt)

        return doc

    def _add_paging_info(self, doc):
        ''' add rel links for paging '''
        # this is just dumb for now: self/first/last all point at the
        # single feed URL (no real pagination yet)
        last_link = etree.Element(etree.QName(ATOM_NS, "link"), rel="last", href=self.s3_url)
        doc.insert(0, last_link)

        first_link = etree.Element(etree.QName(ATOM_NS, "link"), rel="first", href=self.s3_url)
        doc.insert(0, first_link)

        self_link = etree.Element(etree.QName(ATOM_NS, "link"), rel="self", href=self.s3_url)
        doc.insert(0, self_link)

    def _add_merritt_id(self, doc, merritt_collection_id):
        ''' add Merritt ID '''
        merritt_id = etree.Element(etree.QName(ATOM_NS, "merritt_collection_id"))
        merritt_id.text = merritt_collection_id
        doc.insert(0, merritt_id)

    def _populate_entry(self, entry, metadata, nxid, is_parent):
        ''' get <entry> element for a given set of object metadata '''

        # atom id (URI)
        nuxeo_object_view_url = self.get_object_view_url(nxid)
        atom_id = etree.SubElement(entry, etree.QName(ATOM_NS, "id"))
        atom_id.text = nuxeo_object_view_url

        # atom title
        atom_title = etree.SubElement(entry, etree.QName(ATOM_NS, "title"))
        atom_title.text = metadata["title"]

        # atom updated
        atom_updated = etree.SubElement(entry, etree.QName(ATOM_NS, "updated"))
        atom_updated.text = metadata['lastModified'].isoformat()

        # atom author
        atom_author = etree.SubElement(entry, etree.QName(ATOM_NS, "author"))
        atom_author.text = "UC Libraries Digital Collection"

        # metadata file link
        self._insert_full_md_link(entry, nxid)

        # media json link -- only parent objects have a media.json
        if is_parent:
            self._insert_media_json_link(entry, nxid)

        # main content file link
        self._insert_main_content_link(entry, nxid)

        # auxiliary file link(s)
        self._insert_aux_links(entry, nxid)

        # dc creator
        for creator_name in metadata['creator']:
            dc_creator = etree.SubElement(entry, etree.QName(DC_NS, "creator"))
            dc_creator.text = creator_name

        # dc title
        dc_title = etree.SubElement(entry, etree.QName(DC_NS, "title"))
        dc_title.text = metadata['title']

        # dc date
        dc_date = etree.SubElement(entry, etree.QName(DC_NS, "date"))
        dc_date.text = metadata['date']

        # dc identifier (a.k.a. local identifier) - Nuxeo ID
        nuxeo_identifier = etree.SubElement(entry, etree.QName(DC_NS, "identifier"))
        nuxeo_identifier.text = nxid

        # UCLDC identifier (a.k.a. local identifier) - ucldc_schema:identifier -- this will be the ARK if we have it
        if metadata['id']:
            ucldc_identifier = etree.SubElement(entry, etree.QName(NX_NS, "identifier"))
            ucldc_identifier.text = metadata['id']

        # UCLDC collection identifier
        ucldc_collection_id = etree.SubElement(entry, etree.QName(NX_NS, "collection"))
        ucldc_collection_id.text = metadata['collection']

        return entry

    def _insert_media_json_link(self, entry, uid):
        ''' add <link> to the object's media.json (deep harvest metadata) '''
        media_json_url = self.get_media_json_url(uid)
        link_media_json = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=media_json_url, type="application/json", title="Deep Harvest metadata for this object")


    def _insert_main_content_link(self, entry, uid):
        ''' add <link> (plus MD5 checksum child) for the main content file '''
        nx_metadata = self.nx.get_metadata(uid=uid)
        nuxeo_file_download_url = self.get_object_download_url(nx_metadata)
        checksum = self.get_nuxeo_file_checksum(nx_metadata)
        if nuxeo_file_download_url:
            main_content_link = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=nuxeo_file_download_url, title="Main content file") # FIXME add content_type

        # NOTE(review): if checksum is set while nuxeo_file_download_url
        # is None, main_content_link is unbound here and this raises
        # NameError -- confirm a digest always implies a download url.
        if checksum:
            checksum_element = etree.SubElement(main_content_link, etree.QName(OPENSEARCH_NS, "checksum"), algorithm="MD5")
            checksum_element.text = checksum

    def _insert_aux_links(self, entry, uid):
        ''' add <link> elements (with optional checksums) for auxiliary files '''
        nx_metadata = self.nx.get_metadata(uid=uid)
        aux_files = self.get_aux_files(nx_metadata)
        for af in aux_files:
            link_aux_file = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=af['url'], title="Auxiliary file")
            if af['checksum']:
                checksum_element = etree.SubElement(link_aux_file, etree.QName(OPENSEARCH_NS, "checksum"), algorithm="MD5")
                checksum_element.text = af['checksum']

    def _insert_full_md_link(self, entry, uid):
        ''' add <link> to the full Nuxeo metadata XML for the object '''
        full_metadata_url = self.get_full_metadata(uid)
        link_md = etree.SubElement(entry, etree.QName(ATOM_NS, "link"), rel="alternate", href=full_metadata_url, type="application/xml", title="Full metadata for this object from Nuxeo")


    def _write_feed(self, doc):
        ''' publish feed '''
        feed = etree.ElementTree(doc)
        # tostring with encoding='utf-8' returns an encoded byte string
        feed_string = etree.tostring(feed, pretty_print=True, encoding='utf-8', xml_declaration=True)

        with open(self.atom_filepath, "w") as f:
            f.write(feed_string)

    def _s3_get_feed(self):
       """ Retrieve ATOM feed file from S3. Return as ElementTree object """
       bucketpath = self.bucket.strip("/")
       bucketbase = self.bucket.split("/")[0]
       # any path segments after the bucket name become the key prefix
       keyparts = bucketpath.split("/")[1:]
       keyparts.append(self.atom_file)
       keypath = '/'.join(keyparts)

       s3 = boto3.client('s3')
       response = s3.get_object(Bucket=bucketbase,Key=keypath)
       contents = response['Body'].read()

       return etree.fromstring(contents)

    def _s3_stash(self):
       """ Stash file in S3 bucket.
       """
       bucketpath = self.bucket.strip("/")
       bucketbase = self.bucket.split("/")[0]
       keyparts = bucketpath.split("/")[1:]
       keyparts.append(self.atom_file)
       keypath = '/'.join(keyparts)

       s3 = boto3.client('s3')
       # NOTE(review): file is opened in text mode 'r'; upload_fileobj
       # generally expects a binary-mode file -- confirm this is only
       # run under Python 2 where 'r' and 'rb' are equivalent.
       with open(self.atom_filepath, 'r') as f:
           s3.upload_fileobj(f, bucketbase, keypath)

    def get_object_view_url(self, nuxeo_id):
        """ Get object view URL """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = "{}://{}/Nuxeo/nxdoc/default/{}/view_documents".format(parts.scheme, parts.netloc, nuxeo_id)
        return url

    def get_full_metadata(self, nuxeo_id):
        """ Get full metadata via Nuxeo API """
        parts = urlparse.urlsplit(self.nx.conf["api"])
        url = '{}://{}/Merritt/{}.xml'.format(parts.scheme, parts.netloc, nuxeo_id)

        return url

    def get_object_download_url(self, metadata):
        ''' given the full metadata for an object, get file download url '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError("Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'")

        if file_content is None:
            return None
        else:
            url = file_content['data']

        # make available via basic auth
        url = url.replace('/nuxeo/', '/Nuxeo/')

        return url

    def get_media_json_url(self, nuxeo_id):
        """ Get media.json (deep harvest) url """
        # https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/002130a5-e171-461b-a41b-28ab46af9652-media.json
        url = "https://s3.amazonaws.com/static.ucldc.cdlib.org/media_json/{}-media.json".format(nuxeo_id)

        return url

    def get_nuxeo_file_checksum(self, metadata):
        ''' get md5 checksum for nuxeo file '''
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError("Nuxeo object metadata does not contain 'properties/file:content' element. Make sure 'X-NXDocumentProperties' provided in pynux conf includes 'file'")

        if file_content is None:
            return None
        else:
            checksum = file_content['digest']

        return checksum

    def get_aux_files(self, metadata):
        ''' get auxiliary file urls '''
        all_md = []

        # get any "attachment" files
        if metadata['properties']['files:files']:
            attachments = metadata['properties']['files:files']
            for attachment in attachments:
                md = {}
                if attachment['file'] and attachment['file']['data']:
                    url = attachment['file']['data']
                    url = url.replace('/nuxeo/', '/Nuxeo/')
                    md['url'] = url
                if attachment['file'] and attachment['file']['digest']:
                    md['checksum'] = attachment['file']['digest']
                if md:
                    all_md.append(md)

        # get any "extra_file" files
        if metadata['properties']['extra_files:file']:
            for extra_file in metadata['properties']['extra_files:file']:
                md = {}
                if extra_file['blob'] and extra_file['blob']['data']:
                    url = extra_file['blob']['data']
                    url = url.replace('/nuxeo/', '/Nuxeo/')
                    md['url'] = url
                if extra_file['blob'] and extra_file['blob']['digest']:
                    md['checksum'] = extra_file['blob']['digest']
                if md:
                    all_md.append(md)

        return all_md

    def _bundle_docs(self, docs):
        ''' given a dict of parent level nuxeo docs, fetch any components
            and also figure out when any part of the object was most
            recently modified/added '''

        for doc in docs:

            last_mod_str = doc['lastModified']
            overall_mod_datetime = parse(last_mod_str)

            # NOTE(review): 'components' is initialized to an empty list
            # and never populated, so the loop below never executes and
            # bundle_lastModified is always the parent's own timestamp.
            # Presumably this once called self.dh.fetch_components(doc)
            # -- confirm whether the stub is intentional.
            doc['components'] = []

            for c in doc['components']:
                mod_str = c['lastModified']
                mod_datetime = parse(mod_str)

                if mod_datetime > overall_mod_datetime:
                    overall_mod_datetime = mod_datetime

            doc['bundle_lastModified'] = overall_mod_datetime

        return docs

    def process_feed(self):
        ''' create feed for collection and stash on s3 '''
        self.logger.info("atom_file: {}".format(self.atom_file))
        self.logger.info("Nuxeo path: {}".format(self.path))
        self.logger.info("Fetching Nuxeo docs. This could take a while if collection is large...")

        parent_docs = self.dh.fetch_objects()

        bundled_docs = self._bundle_docs(parent_docs)
        bundled_docs.sort(key=itemgetter('bundle_lastModified'))

        # create root
        root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

        # add entries -- insert(0, ...) reverses order, so most recently
        # modified objects end up first in the feed
        for document in bundled_docs:
            nxid = document['uid']
            self.logger.info("working on document: {} {}".format(nxid, document['path']))

            # object, bundled into one <entry> if complex
            entry = self._construct_entry_bundled(document)
            self.logger.info("inserting entry for object {} {}".format(nxid, document['path']))
            root.insert(0, entry)

        # add header info
        logging.info("Adding header info to xml tree")
        self._add_merritt_id(root, self.merritt_id)
        self._add_paging_info(root)
        self._add_collection_alt_link(root, self.path)
        self._add_atom_elements(root)
        self._add_feed_updated(root, datetime.now(dateutil.tz.tzutc()).isoformat())

        self._write_feed(root)
        logging.info("Feed written to file: {}".format(self.atom_filepath))

        if not self.nostash:
            self._s3_stash()
            self.logger.info("Feed stashed on s3: {}".format(self.s3_url))
# Example #28
# 0
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3 '''
    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False,
                 **kwargs):
        ''' Set up the Nuxeo client, fetch object metadata (or take it
        from kwargs['metadata'] to avoid an extra API round trip) and
        initialize the stash report. '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # pynux client configured from the user's rc file
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # report dict accumulates provenance info about this stash run
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' download the source file from Nuxeo to self.source_filepath '''
        # stream=True so large files are written chunk by chunk instead
        # of being buffered entirely in memory by requests
        res = requests.get(self.source_download_url,
                           headers=self.nx.document_property_headers,
                           auth=self.nx.auth,
                           stream=True)
        res.raise_for_status()
        with open(self.source_filepath, 'wb') as f:
            for block in res.iter_content(1024):
                if block:  # skip keep-alive chunks
                    f.write(block)
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url,
        mimetype and filename. Returns None if no file is attached.
        Raises KeyError if the expected Nuxeo properties are missing. '''
        info = {}
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'")

        if file_content is None:
            return None
        else:
            url = file_content['data'].strip()
            # normalize the path component expected by the download host
            url = url.replace('/nuxeo/', '/Nuxeo/')
            info['url'] = url.strip()
            info['mimetype'] = file_content['mime-type'].strip()
            info['filename'] = file_content['name'].strip()

        if not info['filename']:
            # fall back to the file:filename property
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'")

        return info

    def _is_s3_stashed(self):
        """ Check for existence of key on S3.
       """
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        """ Stash file in S3 bucket.
       """
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
Example #29
0
def main(argv=None):
    parser = argparse.ArgumentParser(
        description=
        'Create ATOM feed for a given Nuxeo folder for Merritt harvesting')
    parser.add_argument("collection", help="UCLDC Registry Collection ID")
    parser.add_argument("--pynuxrc", help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()
    collection_id = argv.collection

    if argv.pynuxrc:
        ma = MerrittAtom(collection_id, argv.pynuxrc)
    else:
        ma = MerrittAtom(collection_id)

    print "atom_file: {}".format(ma.atom_file)
    print "ma.path: {}".format(ma.path)

    if argv.pynuxrc:
        dh = DeepHarvestNuxeo(ma.path, '', pynuxrc=argv.pynuxrc)
    else:
        dh = DeepHarvestNuxeo(ma.path, '')

    print "Nuxeo path: {}".format(ma.path)
    print "Fetching Nuxeo docs. This could take a while if collection is large..."
    documents = dh.fetch_objects()

    # create root
    root = etree.Element(etree.QName(ATOM_NS, "feed"), nsmap=NS_MAP)

    # add entries
    for document in documents:
        nxid = document['uid']
        print "working on document: {} {}".format(nxid, document['path'])

        # parent
        entry = ma._construct_entry(nxid, True)
        print "inserting entry for parent object {} {}".format(
            nxid, document['path'])
        root.insert(0, entry)

        # children
        component_entries = [
            ma._construct_entry(c['uid'], False)
            for c in dh.fetch_components(document)
        ]
        for ce in component_entries:
            print "inserting entry for component: {} {}".format(
                nxid, document['path'])
            root.insert(0, ce)

    # add header info
    print "Adding header info to xml tree"
    ma._add_merritt_id(root, ma.merritt_id)
    ma._add_paging_info(root)
    ma._add_collection_alt_link(root, ma.path)
    ma._add_atom_elements(root)
    ma._add_feed_updated(root, ma.last_update)

    ma._write_feed(root)
    print "Feed written to file: {}".format(ma.atom_file)

    ma._s3_stash()
    print "Feed stashed on s3: {}".format(ma.s3_url)
def main(argv=None):
    ''' stash Nuxeo image files on s3 '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, create jp2 versions of image '
        'files and stash in S3.'
    )
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument(
        '--bucket',
        default='ucldc-private-files/jp2000',
        help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help='AWS region')
    parser.add_argument(
        '--replace',
        action="store_true",
        help="replace file on s3 if it already exists")
    parser.add_argument(
        '--pynuxrc', default='~/.pynuxrc', help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region,
                                  argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_image = len([
        key for key, value in report.iteritems()
        if not value['is_image']['is_image']
    ])
    print "not image:\t{}".format(not_image)
    unrecognized = len([
        key for key, value in report.iteritems()
        if not value['precheck']['pass']
    ])
    print "not convertible:\t{}".format(unrecognized)
    converted = len(
        [key for key, value in report.iteritems() if value['converted']])
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "converted:\t{}".format(converted)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "stashed:\t{}".format(stashed)

    print "\nDone."
Example #31
0
class Stash(object):
    '''
        stash various files on s3 for a Nuxeo collection
        in preparation for harvesting into Calisphere
    '''
    def __init__(self, path, pynuxrc, replace=False, loglevel=_loglevel_):
        ''' Fetch the collection's parent objects and cache each parent's
        components, so every stash pass can reuse them without
        re-querying Nuxeo. '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.pynuxrc = pynuxrc
        self.replace = replace

        self.dh = DeepHarvestNuxeo(self.path, '', pynuxrc=self.pynuxrc)

        self.objects = self.fetch_objects()

        # component docs cached per parent uid
        self.components = {}
        for obj in self.objects:
            self.components[obj['uid']] = self.dh.fetch_components(obj)

    def fetch_objects(self):
        ''' fetch objects to process '''
        return self.dh.fetch_objects()

    def _stash_collection(self, stash_class, bucket, region, label):
        ''' stash every parent object and its components using
        stash_class; returns a report dict keyed by nuxeo uid '''
        report = {}
        for obj in self.objects:
            nxstash = stash_class(obj['path'], bucket, region,
                                  self.pynuxrc, self.replace)
            report[nxstash.uid] = nxstash.nxstashref()
            for c in self.components[obj['uid']]:
                self.logger.info('Stashing {} {}'.format(label, c['path']))
                nxstash = stash_class(c['path'], bucket, region,
                                      self.pynuxrc, self.replace)
                report[nxstash.uid] = nxstash.nxstashref()

        return report

    def images(self):
        ''' stash Nuxeo image files on s3 '''
        return self._stash_collection(NuxeoStashImage, IMAGE_BUCKET,
                                      IMAGE_REGION, 'image')

    def files(self):
        ''' stash Nuxeo files of type 'file', 'audio', or 'video' for a
        collection
        '''
        return self._stash_collection(NuxeoStashFile, FILE_BUCKET,
                                      FILE_REGION, 'file')

    def thumbnails(self):
        ''' stash thumbnail images for Nuxeo files of type 'file', 'audio',
        or 'video' for a collection
        '''
        return self._stash_collection(NuxeoStashThumb, THUMB_BUCKET,
                                      THUMB_REGION, 'thumb')

    def media_json(self):
        ''' create and stash media.json files for a nuxeo collection '''
        report = {}
        for obj in self.objects:
            self.logger.info('Stashing media json {}'.format(obj['path']))
            nxstash = NuxeoStashMediaJson(obj['path'], MEDIAJSON_BUCKET,
                                          MEDIAJSON_REGION, self.pynuxrc,
                                          self.replace)
            report[nxstash.uid] = nxstash.nxstashref()

        return report
Example #32
0
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3 '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        ''' Set up the Nuxeo client, fetch object metadata (or take it
        from kwargs['metadata'] to avoid an extra API round trip) and
        initialize the stash report. '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # pynux client configured from the user's rc file
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # report dict accumulates provenance info about this stash run
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' stream the source file from Nuxeo to self.source_filepath '''
        # https://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
        res = requests.get(self.source_download_url,
                           headers=self.nx.document_property_headers,
                           auth=self.nx.auth, stream=True)
        res.raise_for_status()
        with open(self.source_filepath, 'wb') as f:
            # chunk_size=None writes data as it arrives from the server
            for block in res.iter_content(chunk_size=None):
                f.write(block)
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url '''
        # returns dict with 'url', 'mimetype', 'filename'; None if no
        # file is attached; raises KeyError if expected properties missing
        info = {}
        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'"
            )

        if file_content is None:
            return None
        else:
            url = file_content['data'].strip()
            # normalize the path component expected by the download host
            url = url.replace('/nuxeo/', '/Nuxeo/')
            info['url'] = url.strip()
            info['mimetype'] = file_content['mime-type'].strip()
            info['filename'] = file_content['name'].strip()

        if not info['filename']:
            # fall back to the file:filename property
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'"
                )

        return info

    def _is_s3_stashed(self):
        """ Check for existence of key on S3.
       """
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        """ Stash file in S3 bucket.
       """
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
Example #33
0
class NuxeoStashRef(object):
    ''' Base class for fetching a Nuxeo file and stashing it in S3 '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=False, **kwargs):
        ''' Set up the Nuxeo client, fetch object metadata (or take it
        from kwargs['metadata'] to avoid an extra API round trip) and
        initialize the stash report. '''
        self.logger = logging.getLogger(__name__)

        self.path = path
        self.bucket = bucket
        self.pynuxrc = pynuxrc
        self.region = region
        self.replace = replace

        # pynux client configured from the user's rc file
        self.nx = utils.Nuxeo(rcfile=open(expanduser(self.pynuxrc), 'r'))

        if 'metadata' in kwargs:
            self.metadata = kwargs['metadata']
            self.logger.info("got metadata from kwargs")
        else:
            self.metadata = self.nx.get_metadata(path=self.path)
            self.logger.info("got metadata via pynux utils")

        self.uid = self.metadata['uid']

        self.logger.info("initialized NuxeoStashRef with path {}".format(
            self.path.encode('ascii', 'replace')))

        self.dh = DeepHarvestNuxeo(self.path, uid=self.uid)
        self.calisphere_type = self.dh.get_calisphere_object_type(
            self.metadata['type'])
        self.tmp_dir = tempfile.mkdtemp(dir='/tmp')  # FIXME put in conf

        # report dict accumulates provenance info about this stash run
        self.report = {}
        self._update_report('uid', self.uid)
        self._update_report('path', self.path)
        self._update_report('bucket', self.bucket)
        self._update_report('replace', self.replace)
        self._update_report('pynuxrc', self.pynuxrc)
        self._update_report('calisphere_type', self.calisphere_type)

    def nxstashref(self):
        ''' download, prep and stash file '''
        raise NotImplementedError

    def _update_report(self, key, value):
        ''' add a key/value pair to report dict '''
        self.report[key] = value

    def _remove_tmp(self):
        ''' clean up after ourselves '''
        shutil.rmtree(self.tmp_dir)

    def _download_nuxeo_file(self):
        ''' stream the source file from Nuxeo to self.source_filepath,
        retrying transient server errors '''
        # https://findwork.dev/blog/advanced-usage-python-requests-timeouts-retries-hooks/#retry-on-failure
        # backoff_factor adds an increasing delay between retries so we
        # don't immediately re-hit a server that just returned a 5xx
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[413, 429, 500, 502, 503, 504],
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        http = requests.Session()
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        # timeouts based on those used by nuxeo-python-client
        # see: https://github.com/nuxeo/nuxeo-python-client/blob/master/nuxeo/constants.py
        # but tweaked to be slightly larger than a multiple of 3, which is recommended
        # in the requests documentation.
        # see: https://docs.python-requests.org/en/master/user/advanced/#timeouts
        timeout_connect = 12.05
        timeout_read = (60 * 10) + 0.05
        res = http.get(self.source_download_url,
                       headers=self.nx.document_property_headers,
                       auth=self.nx.auth,
                       stream=True,
                       timeout=(timeout_connect, timeout_read))

        res.raise_for_status()
        with open(self.source_filepath, 'wb') as f:
            # chunk_size=None writes data as it arrives from the server
            for block in res.iter_content(chunk_size=None):
                f.write(block)
        self.logger.info("Downloaded file from {} to {}".format(
            self.source_download_url, self.source_filepath))

    def _get_file_info(self, metadata):
        ''' given the full metadata for an object, get file download url,
        mimetype and filename. Returns None if no file is attached. '''
        info = {}

        # for videos, try to get nuxeo transcoded video file url first
        if metadata['type'] == 'CustomVideo':
            try:
                transcoded_videos = metadata['properties']['vid:transcodedVideos']
                for tv in transcoded_videos:
                    if tv['content']['mime-type'] == 'video/mp4':
                        url = tv['content']['data']
                        url = url.replace('/nuxeo/', '/Nuxeo/')
                        info['url'] = url.strip()
                        info['mimetype'] = tv['content']['mime-type'].strip()
                        info['filename'] = tv['content']['name'].strip()
                        return info
            except KeyError:
                # no transcoded videos; fall through to file:content
                pass

        try:
            file_content = metadata['properties']['file:content']
        except KeyError:
            raise KeyError(
                "Nuxeo object metadata does not contain 'properties/file:"
                "content' element. Make sure 'X-NXDocumentProperties' "
                "provided in pynux conf includes 'file'"
            )

        if file_content is None:
            return None
        else:
            url = file_content['data'].strip()
            # normalize the path component expected by the download host
            url = url.replace('/nuxeo/', '/Nuxeo/')
            info['url'] = url.strip()
            info['mimetype'] = file_content['mime-type'].strip()
            info['filename'] = file_content['name'].strip()

        if not info['filename']:
            # fall back to the file:filename property
            try:
                info['filename'] = metadata['properties']['file:filename']
            except KeyError:
                raise KeyError(
                    "Nuxeo object metadata does not contain 'properties/file:"
                    "filename' element. Make sure 'X-NXDocumentProperties' "
                    "provided in pynux conf includes 'file'"
                )

        return info

    def _is_s3_stashed(self):
        """ Check for existence of key on S3.
       """
        return s3stash.s3tools.is_s3_stashed(self.bucket, self.uid,
                                             self.region)

    def _s3_stash(self, filepath, mimetype):
        """ Stash file in S3 bucket.
       """
        return s3stash.s3tools.s3stash(filepath, self.bucket, self.uid,
                                       self.region, mimetype, self.replace)
def main(argv=None):
    ''' stash Nuxeo image files on s3 '''
    parser = argparse.ArgumentParser(
        description='For Nuxeo collection, create jp2 versions of image '
        'files and stash in S3.')
    parser.add_argument('path', help="Nuxeo document path to collection")
    parser.add_argument('--bucket',
                        default='ucldc-private-files/jp2000',
                        help="S3 bucket name")
    parser.add_argument('--region', default='us-west-2', help='AWS region')
    parser.add_argument('--replace',
                        action="store_true",
                        help="replace file on s3 if it already exists")
    parser.add_argument('--pynuxrc',
                        default='~/.pynuxrc',
                        help="rc file for use by pynux")
    if argv is None:
        argv = parser.parse_args()

    collection = argv.path.split('/')[-1]

    # logging
    logfile = 'logs/{}.log'.format(collection)
    print "LOG:\t{}".format(logfile)
    logging.basicConfig(
        filename=logfile,
        level=logging.INFO,
        format='%(asctime)s (%(name)s) [%(levelname)s]: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    dh = DeepHarvestNuxeo(argv.path, argv.bucket, pynuxrc=argv.pynuxrc)

    report = {}
    objects = dh.fetch_objects()
    for obj in objects:
        nxstash = NuxeoStashImage(obj['path'], argv.bucket, argv.region,
                                  argv.pynuxrc, argv.replace)
        report[nxstash.uid] = nxstash.nxstashref()
        for c in dh.fetch_components(obj):
            nxstash = NuxeoStashImage(c['path'], argv.bucket, argv.region,
                                      argv.pynuxrc, argv.replace)
            report[nxstash.uid] = nxstash.nxstashref()

    # output report to json file
    reportfile = "reports/{}.json".format(collection)
    with open(reportfile, 'w') as f:
        json.dump(report, f, sort_keys=True, indent=4)

    # parse report to give basic stats
    report = json.load(open(reportfile))
    print "REPORT:\t{}".format(reportfile)
    print "SUMMARY:"
    print "processed:\t{}".format(len(report))
    not_image = len([
        key for key, value in report.iteritems()
        if not value['is_image']['is_image']
    ])
    print "not image:\t{}".format(not_image)
    unrecognized = len([
        key for key, value in report.iteritems()
        if not value['precheck']['pass']
    ])
    print "not convertible:\t{}".format(unrecognized)
    converted = len(
        [key for key, value in report.iteritems() if value['converted']])
    already_stashed = len([
        key for key, value in report.iteritems()
        if 'already_s3_stashed' in value.keys() and value['already_s3_stashed']
    ])
    print "converted:\t{}".format(converted)
    stashed = len(
        [key for key, value in report.iteritems() if value['stashed']])
    print "stashed:\t{}".format(stashed)

    print "\nDone."
Example #35
0
class NuxeoFetcher(Fetcher):
    '''Harvest a Nuxeo FILE. Can be local or at a URL'''

    def __init__(self, url_harvest, extra_data, conf_pynux=None, **kwargs):
        '''
        uses pynux (https://github.com/ucldc/pynux) to grab objects from
        the Nuxeo API

        api url is set from url_harvest, overriding pynuxrc config and
        passed in conf.

        the pynux config file should have user & password
        and X-NXDocumemtProperties values filled in.
        '''
        super(NuxeoFetcher, self).__init__(url_harvest, extra_data, **kwargs)
        # NOTE: conf_pynux used to default to a mutable {}; the dict was
        # then mutated below, leaking state across instances. Default to
        # None and copy so neither the default nor the caller's dict is
        # modified.
        conf_pynux = {} if conf_pynux is None else dict(conf_pynux)
        self._url = url_harvest
        self._path = extra_data
        self._nx = pynux.utils.Nuxeo(conf=conf_pynux)
        self._nx.conf['api'] = self._url
        self._structmap_bucket = STRUCTMAP_S3_BUCKET

        # get harvestable child objects
        conf_pynux['api'] = self._url
        self._dh = DeepHarvestNuxeo(self._path, '', conf_pynux=conf_pynux)

        self._children = iter(self._dh.fetch_objects())

    def _get_structmap_url(self, bucket, obj_key):
        '''Get structmap_url property for object'''
        structmap_url = "s3://{0}/{1}{2}".format(bucket, obj_key,
                                                 '-media.json')
        return structmap_url

    def _get_structmap_text(self, structmap_url):
        '''
           Get structmap_text for object. This is all the words from 'label'
           in the json.
           See https://github.com/ucldc/ucldc-docs/wiki/media.json
        '''
        structmap_text = ""

        bucketpath = self._structmap_bucket.strip("/")
        bucketbase = bucketpath.split("/")[0]
        parts = urlparse.urlsplit(structmap_url)

        # get contents of <nuxeo_id>-media.json file
        conn = boto.connect_s3()
        bucket = conn.get_bucket(bucketbase)
        key = bucket.get_key(parts.path)
        if not key:  # media_json hasn't been harvested yet for this record
            self.logger.error('Media json at: {} missing.'.format(parts.path))
            return structmap_text
        mediajson = key.get_contents_as_string()
        mediajson_dict = json.loads(mediajson)

        # concatenate all of the words from 'label' in the json
        labels = []
        labels.append(mediajson_dict['label'])
        if 'structMap' in mediajson_dict:
            labels.extend([sm['label'] for sm in mediajson_dict['structMap']])
        structmap_text = ' '.join(labels)
        return structmap_text

    def _get_isShownBy(self, nuxeo_metadata):
        '''
            Get isShownBy value for object
            1) if object has image at parent level, use this
            2) if component(s) have image, use first one we can find
            3) if object has PDF or video at parent level,
                use image stashed on S3
            4) if component(s) have PDF or video, use first component image stashed on S3 we can find
            5) return None
        '''
        is_shown_by = None
        uid = nuxeo_metadata['uid']
        self.logger.info("About to get isShownBy for uid {}".format(uid))

        # 1) if object has image at parent level, use this
        if self._has_image(nuxeo_metadata):
            self.logger.info("Nuxeo doc with uid {} has an image at the "
                             "parent level".format(uid))
            is_shown_by = NUXEO_MEDIUM_IMAGE_URL_FORMAT.format(nuxeo_metadata[
                'uid'])
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 2) if component(s) have image, use first one we can find
        first_image_component_uid = self._get_first_image_component(
            nuxeo_metadata)
        self.logger.info("first_image_component_uid: {}".format(
            first_image_component_uid))
        if first_image_component_uid:
            self.logger.info("Nuxeo doc with uid {} has an image at the"
                             "component level".format(uid))
            is_shown_by = NUXEO_MEDIUM_IMAGE_URL_FORMAT.format(
                first_image_component_uid)
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 3) if object has PDF at parent level, use image stashed on S3
        if self._has_s3_thumbnail(nuxeo_metadata):
            self.logger.info("Nuxeo doc with uid {} has a thumbnail for"
                             "parent file (probably PDF) stashed on S3".format(
                                 uid))
            is_shown_by = NUXEO_S3_THUMB_URL_FORMAT.format(nuxeo_metadata[
                'uid'])
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 4) if component(s) have PDF or video, use first component image stashed on S3 we can find
        first_thumb_component_uid = self._get_first_thumb_component(
            nuxeo_metadata)
        self.logger.info("first_thumb_component_uid: {}".format(
            first_thumb_component_uid))
        if first_thumb_component_uid:
            self.logger.info("Nuxeo doc with uid {} has thumbnail at the"
                             "component level".format(uid))
            is_shown_by = NUXEO_S3_THUMB_URL_FORMAT.format(
                first_thumb_component_uid)
            self.logger.info("is_shown_by: {}".format(is_shown_by))
            return is_shown_by

        # 5) return None
        self.logger.info("Could not find any image for Nuxeo doc with uid "
                         "{}! Returning None".format(uid))
        return is_shown_by

    def _has_image(self, metadata):
        ''' based on json metadata, determine whether or not this Nuxeo doc
        has an image file associated
        '''

        if metadata['type'] != "SampleCustomPicture":
            return False

        properties = metadata['properties']
        file_content = properties.get('file:content')
        # the placeholder 'empty_picture.png' does not count as an image
        if file_content and 'name' in file_content and file_content['name'] == 'empty_picture.png':
            return False
        elif file_content and 'data' in file_content:
            return True
        else:
            return False

    def _has_s3_thumbnail(self, metadata):
        ''' based on json metadata, determine whether or not this Nuxeo doc
        is PDF (or other non-image)
            that will have thumb image stashed on S3 for it '''
        if metadata['type'] not in ("CustomFile", "CustomVideo"):
            return False

        properties = metadata['properties']
        file_content = properties.get('file:content')
        if file_content and 'data' in file_content:
            return True
        else:
            return False

    def _get_first_image_component(self, parent_metadata):
        ''' get first image component we can find '''
        component_uid = None

        query = "SELECT * FROM Document WHERE ecm:parentId = '{}' AND " \
                "ecm:currentLifeCycleState != 'deleted' ORDER BY " \
                "ecm:pos".format(parent_metadata['uid'])
        for child in self._nx.nxql(query):
            child_metadata = self._nx.get_metadata(uid=child['uid'])
            if self._has_image(child_metadata):
                component_uid = child_metadata['uid']
                break

        return component_uid

    def _get_first_thumb_component(self, parent_metadata):
        ''' get first non-image component with thumbnail we can find '''
        component_uid = None

        query = "SELECT * FROM Document WHERE ecm:parentId = '{}' AND " \
                "ecm:currentLifeCycleState != 'deleted' ORDER BY " \
                "ecm:pos".format(parent_metadata['uid'])
        for child in self._nx.nxql(query):
            child_metadata = self._nx.get_metadata(uid=child['uid'])
            if self._has_s3_thumbnail(child_metadata):
                component_uid = child_metadata['uid']
                break

        return component_uid

    def next(self):
        '''Return Nuxeo record by record to the controller'''
        doc = self._children.next()
        self.metadata = self._nx.get_metadata(uid=doc['uid'])
        self.structmap_url = self._get_structmap_url(self._structmap_bucket,
                                                     doc['uid'])
        self.metadata['structmap_url'] = self.structmap_url
        self.metadata['structmap_text'] = self._get_structmap_text(
            self.structmap_url)
        self.metadata['isShownBy'] = self._get_isShownBy(self.metadata)

        return self.metadata
class NuxeoStashMediaJson(NuxeoStashRef):
    ''' create and stash media.json file for a nuxeo object '''

    def __init__(self,
                 path,
                 bucket,
                 region,
                 pynuxrc='~/.pynuxrc',
                 replace=True,
                 **kwargs):
        ''' set up deep-harvest helper, media.json builder, and the
        temp-file name/path for this object, recording both in the report '''
        super(NuxeoStashMediaJson, self).__init__(
            path, bucket, region, pynuxrc, replace, **kwargs)

        self.dh = DeepHarvestNuxeo(
            self.path, self.bucket, pynuxrc=self.pynuxrc)
        self.mj = MediaJson()

        # media.json is written locally under a uid-derived name
        self.filename = FILENAME_FORMAT.format(self.uid)
        self.filepath = os.path.join(self.tmp_dir, self.filename)
        for field in ('filename', 'filepath'):
            self._update_report(field, getattr(self, field))

    def nxstashref(self):
        ''' entry point used by the stash framework; delegates to
        nxstash_mediajson and returns its report dict '''
        return self.nxstash_mediajson()

    def nxstash_mediajson(self):
        ''' create media.json file for object and stash on s3 '''
        self._update_report('stashed', False)

        # assemble parent metadata plus one entry per component,
        # then serialize the combined structure to the temp file
        parent = self._get_parent_metadata(self.metadata)
        components = []
        for component in self.dh.fetch_components(self.metadata):
            components.append(self._get_component_metadata(component))

        media_json = self.mj.create_media_json(parent, components)
        self._write_file(media_json, self.filepath)

        # push the file to s3 and record the outcome in the report
        stashed, s3_report = s3stash.s3tools.s3stash(
            self.filepath, self.bucket, self.filename, self.region,
            'application/json', self.replace)
        self._update_report('s3_stash', s3_report)
        self._update_report('stashed', stashed)

        self._remove_tmp()

        return self.report

    def _get_parent_metadata(self, obj):
        ''' assemble top-level (parent) object metadata '''
        metadata = {'label': obj['title']}

        full_metadata = self.nx.get_metadata(uid=obj['uid'])

        # id/href/format only apply when the Nuxeo Document has a file
        # attached; video objects additionally get their dimensions
        if self.dh.has_file(full_metadata):
            metadata['id'] = obj['uid']
            metadata['href'] = self.dh.get_object_download_url(full_metadata)
            obj_format = self.dh.get_calisphere_object_type(obj['type'])
            metadata['format'] = obj_format
            if obj_format == 'video':
                metadata['dimensions'] = self.dh.get_video_dimensions(
                    full_metadata)

        return metadata

    def _get_component_metadata(self, obj):
        ''' assemble component object metadata

        obj -- component doc dict (must carry 'uid', 'title', 'type')
        returns a dict ready for inclusion in media.json
        '''
        full_metadata = self.nx.get_metadata(uid=obj['uid'])
        metadata = {
            'label': obj['title'],
            'id': obj['uid'],
            'href': self.dh.get_object_download_url(full_metadata),
        }

        # merge additional ucldc metadata mapped from 'properties' element.
        # dict.update replaces the old iteritems() copy loop — same
        # behavior, but portable between Python 2 and 3
        metadata.update(self._get_ucldc_schema_properties(full_metadata))

        # map Nuxeo doc 'type' onto the calisphere object format (set last,
        # as in the original, so it wins over any mapped 'format' key)
        metadata['format'] = self.dh.get_calisphere_object_type(obj['type'])

        return metadata

    def _get_ucldc_schema_properties(self, metadata):
        ''' get additional metadata as mapped by harvester

        Runs the UCLDC Nuxeo mapper over the raw metadata and returns
        the union of its sourceResource and originalRecord mappings
        (originalRecord values win on key collisions).
        '''
        mapper = UCLDCNuxeoMapper(metadata)
        mapper.map_original_record()
        mapper.map_source_resource()

        # dead `properties = {}` initializer removed — it was immediately
        # overwritten by the mapped sourceResource dict
        properties = mapper.mapped_data['sourceResource']
        properties.update(mapper.mapped_data['originalRecord'])

        return properties

    def _write_file(self, content_dict, filepath):
        """ convert dict to json and write to file """
        serialized = json.dumps(content_dict,
                                indent=4,
                                separators=(',', ': '),
                                sort_keys=False)
        # binary mode preserved from the original: keeps the bytes
        # written platform-independent under Python 2
        with open(filepath, 'wb') as outfile:
            outfile.write(serialized)
            outfile.flush()