def add_ejson_profile(data, fixup_obj_labels=True):
    '''
    Build an Exhibit JSON property profile for a collection of dict records

    data - re-iterable sequence of dicts (it is traversed more than once).
        When fixup_obj_labels is true the dicts are modified in place
    fixup_obj_labels - if true, rewrite every record key so that each match
        of UNSUPPORTED_IN_EXHIBITKEY becomes '_', and keys that end up empty
        or start with a digit get a '_' prefix

    Returns a dict of the form {"properties": [...]} with one entry per key:
    "property" is the (possibly rewritten) key, "label" is the key's string
    form before rewriting, "types" is ["text"], and "enabled" is false only
    for the "id" and "label" properties
    '''
    objkeys = dict((k, k) for obj in data for k in obj)
    #FIXME: reduce from 3 full passes through obj to 2 (don't think we can go lower than 2)
    #Walk a snapshot of the keys, because entries may be deleted in the loop
    for k in list(objkeys):
        kcount = sum(1 for obj in data if k in obj)
        logger.debug("Key usage count %s: %i" % (k, kcount))
        if not kcount:
            del objkeys[k]
    logger.debug("Modified data profile keys: " + repr(objkeys))
    if fixup_obj_labels:
        for obj in data:
            #Walk a snapshot of the keys, because obj is re-keyed in the loop
            for orig_k in list(obj):
                k = orig_k
                #Yes we could receive non-string "labels"
                if not isinstance(k, basestring):
                    k = str(k)
                new_k = UNSUPPORTED_IN_EXHIBITKEY.sub('_', k)
                if not new_k or new_k[0].isdigit():
                    new_k = '_' + new_k
                if k != new_k:
                    objkeys[new_k] = k
                    #The profile may hold the raw key, its string form, or
                    #(if an earlier record already renamed it) neither
                    objkeys.pop(orig_k, None)
                    objkeys.pop(k, None)
                    #Look the value up under the key actually stored in obj,
                    #which for non-string keys differs from its string form
                    obj[new_k] = obj.pop(orig_k)
    #print >> sys.stderr, objkeys

    return {"properties": [
                {"property": k, "enabled": (k not in ("id", "label")), "label": v, "types": ["text"]}
                for k, v in objkeys.items()
            ]}
 def fixup_newkeys():
     '''
     Rewrite the keys of newkeys (a mapping taken from the enclosing scope)
     in place: each match of UNSUPPORTED_IN_EXHIBITKEY becomes u'_', and a
     key that ends up empty or starts with a digit gets a u'_' prefix.
     A rewritten key is stored with the pre-rewrite key text as its value
     '''
     #Iterate over a snapshot of the keys: newkeys gains and loses entries
     #inside the loop, and changing a dict while iterating over it directly
     #can raise RuntimeError or skip/revisit keys
     for k in list(newkeys):
         if not isinstance(k, basestring):
             #Yes we could receive non-string "labels"
             k = unicode(k)
         new_k = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', k)
         if not new_k or new_k[0].isdigit():
             new_k = u'_' + new_k
         if k != new_k:
             newkeys[new_k] = k
         #NOTE(review): this removal runs for every key, including keys that
         #needed no rewriting, so those are dropped from newkeys entirely.
         #Kept as found; confirm against the caller whether the removal was
         #meant to happen only when the key was renamed
         if k in newkeys: del newkeys[k]
Esempio n. 3
0
def read_contentdm(site, collection=None, query=None, limit=None, logger=logging, proxy=None, cachedir='/tmp/.cache'):
    '''
    A generator of CDM records
    First generates header info

    >>> from zen.contentdm import read_contentdm
    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None)
    >>> results.next()
    {'basequeryurl': 'http://digital.library.louisville.edu/cdm4/results.php?CISOOP1=any&CISOROOT=%2Fjthom&CISOBOX1=&CISOFIELD1=CISOSEARCHALL'}
    >>> results.next()
    {u'Title': u'60 years in darkness.  ', u'Object_Type': u'Negatives, ', u'Source': u"4 x 5 in. b&w safety negative. Item no. 1979.33.1026 in the Jean Thomas, The Traipsin' Woman, Collection, University of Louisville Photographic Archives. ", u'Collection': u"Jean Thomas, The Traipsin' Woman, Collection, ",...}

    The first yielded value is global metadata; the  second is the record
    for the first item  in the collection/query, and so on until all the items
    are returned, or the limit reached.

    If you want to see the debug messages, just do (before calling read_contentdm for the first time):

    >>> import logging; logging.basicConfig(level=logging.DEBUG)

    for a nice-sized collection to try:
    >>> read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/maps')

    Auburn theater collection:

    >>> read_contentdm('http://content.lib.auburn.edu', collection='/theatre01')
    >>> read_contentdm('http://content.lib.auburn.edu', collection='/football')

    i.e.: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/maps

    See also:

    * /cdm4/browse.php?CISOROOT=/football (51 items)

    >>> results = read_contentdm('http://digital.library.louisville.edu/cdm4/', collection='/jthom', query=None, limit=None, proxy="http://*****:*****@name="searchResultsForm"]//a[starts-with(@href, "item_viewer.php")]')

    def follow_pagination(doc):
        #e.g. of page 1: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh
        #e.g. of page 2: http://digital.library.louisville.edu/cdm4/browse.php?CISOROOT=/afamoh&CISOSTART=1,21
        page_start = 1
        while True:
            items = doc.xml_select(u'//a[contains(@href, "item_viewer.php") or contains(@href, "document.php")]')
            #items = list(items)
            #for i in items: yield i
            for i in items:
                #logger.debug("item: {0}".format(i.title.encode('utf-8')))
                yield i
            next = [ l.href for l in doc.xml_select(u'//a[@class="res_submenu"]') if int(l.href.split(u',')[-1]) > page_start ]
            if not next:
                #e.g. http://vilda.alaska.edu/ uses yet another pattern with just @class=submenu links *sigh*
                next = [ l.href for l in doc.xml_select(u'//a[@class="submenu"]') if u'CISOSTART' in l.href and int(l.href.split(u',')[-1]) > page_start ]
                if not next:
                    break
            page_start = int(next[0].split(u',')[-1])
            url = absolutize(next[0], site)

            resp, doc = cdmsite.index_page(url, "Next page URL: {0}")
        return

    items = follow_pagination(resultsdoc)

    at_least_one = False
    count = 0
    for it in items:
        at_least_one = True
        pageuri = absolutize(it.href, site)
        if pageuri in seen_links:
            continue
        seen_links.add(pageuri)
        entry = {}
        logger.debug("Processing item URL: {0}".format(pageuri))
        (scheme, netloc, path, query, fragment) = split_uri_ref(pageuri)
        entry['domain'] = netloc
        params = parse_qs(query)
        entry['cdm-coll'] = params['CISOROOT'][0].strip('/').split('/')[0]
        entry['id'] = params['CISOPTR'][0]
        logger.debug("Item id: {0}".format(entry['id']))
        if entry['id'] in seen_ids:
            continue
        seen_ids.add(entry['id'])
        entry['link'] = unicode(pageuri)
        entry['local_link'] = '#' + entry['id']

        resp, page, cachekey, cached = cdmsite.item_page(pageuri)

        if cached:
            entry = cached
        else:
            image = first_item(page.xml_select(u'//td[@class="tdimage"]//img'))
            if image:
                imageuri = absolutize(image.src, site)
                entry['imageuri'] = imageuri
                try:
                    entry['thumbnail'] = absolutize(dict(it.xml_parent.a.img.xml_attributes.items())[None, u'src'], site)
                except AttributeError:
                    logger.debug("No thumbnail")
            #entry['thumbnail'] = DEFAULT_RESOLVER.normalize(it.xml_parent.a.img.src, root)
            #fields = page.xml_select(u'//tr[td[@class="tdtext"]]')
            #fields = page.xml_select(u'//table[@class="metatable"]/tr')
            fields = chain(page.xml_select(u'//tr[td[@class="tdtext"]]'), page.xml_select(u'//table[@class="metatable"]//tr'))
            for f in fields:
                #key = unicode(f.td[0].span.b).replace(' ', '_')
                key = UNSUPPORTED_IN_EXHIBITKEY.sub(u'_', U(f.xml_select(u'td[1]//b')))
                #logger.debug("{0}".format(key))
                value = u''.join(CONTENT.dispatch(f.td[1]))
                #value = u''.join(CONTENT.dispatch(f.xml_select(u'td[2]')))
                entry[key] = unicode(value)
            if u'Title' in entry:
                #logger.debug("{0}".format(entry['Title']))
                entry['label'] = entry['Title']
            else:
                entry['label'] = u'[NO LABEL AVAILABLE]'
            if u"Location_Depicted" in entry:
                locations = entry[u"Location_Depicted"].split(u', ')
                #locations = [ l.replace(' (', ', ').replace(')', '').replace(' ', '+') for l in locations if l.strip() ]
                locations = [ l.replace(' (', ', ').replace(')', '').replace('.', '') for l in locations if l.strip() ]
                #print >> sys.stderr, "LOCATIONS", repr(locations)
                entry[u"Locations_Depicted"] = locations
            if u"Date_Original" in entry:
                entry[u"Estimated_Original_Date"] = entry[u"Date_Original"].strip().replace('-', '5').replace('?', '') 
            entry[u"Subject"] = [ s for s in entry.get(u"Subject", u'').split(', ') if s.strip() ]
            if cachedir:
                try:
                    json_stream = open(os.path.join(cachedir, cachekey+'.extract.js'), 'w')
                    json.dump(entry, json_stream)
                except IOError, ValueError:
                    pass

        yield entry
        count += 1
        if limit and count >= limit:
            logger.debug("Limit reached")
            break
    #Keeping it to avoid breaking what's working for now
    objkeys = dict([ (k, k) for obj in data for k in obj ])
    #FIXME: reduce from 3 full passes through obj to 2 (don't think we can go lower than 2)
    for k in objkeys:
        kcount = reduce(lambda count, obj, k=k: count + int(k in obj), data, 0)
        logger.debug("Key usage count %s: %i" % (k, kcount))
        if not kcount:
            del objkeys[k]
    logger.debug("Modified data profile keys: " + repr(objkeys))
    if fixup_obj_labels:
        for obj in data:
            for k in obj:
                #Yes we could receive non-string "labels"
                if not isinstance(k, basestring):
                    k = str(k)
                new_k = UNSUPPORTED_IN_EXHIBITKEY.sub('_', k)
                if not new_k or new_k[0].isdigit():
                    new_k = '_' + new_k
                if k != new_k:
                    objkeys[new_k] = k
                    try:
                        del objkeys[k]
                    except KeyError:
                        pass
                    obj[new_k] = obj[k]
                    del obj[k]
    #print >> sys.stderr, objkeys

    profile = {
        "original_MIME_type": ctype,
        "Akara_MIME_type_magic_guess": imt_saved,