Example #1
0
    def test_coins_citation_info(self):
        """COinS info dict contains exactly the fields that have data."""
        # minimal record
        item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')

        info = item.coins_citation_info
        # assertIn/assertNotIn replace the deprecated assert_ alias
        # (removed in Python 3.12) and give better failure messages
        self.assertIn('rfr_id', info, 'referrer id should be set in COinS info')
        self.assertIn('rft_val_fmt', info, 'format is specified in COinS info')
        self.assertEqual(item.title, info['rft.title'])
        self.assertEqual(item.url, info['rft.identifier'])

        for key in ['rft.date', 'rft.place', 'rft.source', 'rft.format']:
            self.assertNotIn(key, info,
                             'unavailable data should not be set in COinS info')

        # add all fields to simulate a complete record
        item.date = '1887'
        item.format = 'Image'
        item.source = 'Smithsonian'
        item.location = 'USA'

        # every newly-populated field should now appear in the info dict
        info = item.coins_citation_info
        self.assertEqual(item.date, info['rft.date'])
        self.assertEqual(item.format, info['rft.format'])
        self.assertEqual(item.source, info['rft.source'])
        self.assertEqual(item.location, info['rft.place'])
Example #2
0
    def find_items(keywords):
        """Search the DPLA API for items matching any of *keywords*.

        :param keywords: list of search terms; combined with OR
        :returns: list of DisplayItem objects; results without a preview
            image url are skipped
        """
        # example use: DPLA.find_items(keywords=['term1', 'term2'])
        api = Bibs()
        qry = 'api_key->%s:q->%s' % (DPLA.API_KEY, ' OR '.join(keywords))

        # qry from unicode string to regular string
        # NOTE(review): on Python 3 this yields bytes; presumably this
        # targets Python 2 -- confirm before porting
        qry = qry.encode("utf8", "ignore")
        logger.debug('dpla query: %s' % qry)

        # TODO: restrict to image only, or at least things with preview image
        start = time.time()
        results = api.search(qry, 'dplav2', 'items')
        # TODO: error handling...
        logger.info('dpla query completed in %.2f sec' % (time.time() - start))

        items = []
        for doc in results['docs']:
            src_res = doc['sourceResource']

            # for now, just skip items without an image url
            if not doc.get('object'):
                continue

            i = DisplayItem(
                title=src_res.get('title'),
                format=src_res.get('type'),
                source=doc['provider'].get('name'),
                # collection or provider here? src_rec['collection']['title']
                # NOTE: collection apparently not set for all items

                # according to dpla docs, 'object' should be a url preview
                # for the item; docs reference a field for object mimetype,
                # but it is not seen in results
                thumbnail=doc.get('object'),

                # url on provider's website with context
                url=doc.get('isShownAt')
            )
            if 'date' in src_res:
                i.date = src_res['date'].get('displayDate')

            if src_res.get('spatial'):
                # sometimes a list but not always
                spatial = src_res['spatial']
                space = spatial[0] if isinstance(spatial, list) else spatial
                # country? state? coords?
                i.location = space.get('name')

            # Add the aggregator for reference
            i.aggregator = DPLA.name

            items.append(i)

        return items
Example #3
0
    def find_items(keywords):
        """Search the DPLA API for items matching any of *keywords*.

        Keywords are OR-ed together; results without a preview image url
        are skipped.  Returns a list of DisplayItem objects that link back
        to the item page on the DPLA site.
        """
        # example use:
        # keyword should be a list of terms
        # DPLA.find_items(keywords=['term1', 'term2'])

        api = Bibs()
        qry = 'api_key->%s:q->%s' % (DPLA.API_KEY, ' OR '.join(keywords))

        #qry from unicode string to regular string
        # NOTE(review): on Python 3 this yields bytes; presumably this
        # targets Python 2 -- confirm before porting
        qry = qry.encode("utf8", "ignore")
        logger.debug('dpla query: %s' % qry)

        # TODO: restrict to image only, or at least things with preview image
        start = time.time()
        results = api.search(qry, 'dplav2', 'items')
        # TODO: error handling...
        logger.info('dpla query completed in %.2f sec' % (time.time() - start))

        items = []
        for doc in results['docs']:
            src_res = doc['sourceResource']

            # for now, just skip items without an image url
            if not doc.get('object', None):
                continue

            # url on DPLA site
            # NOTE(review): 'url' is not defined anywhere in this function;
            # presumably a module- or class-level DPLA base url -- verify
            # it is actually in scope, otherwise this raises NameError
            item_url = '%sitem/%s' % (url, doc.get('id'))

            i = DisplayItem(
                title=src_res.get('title', None),
                format=src_res.get('type', None),
                source=doc['provider'].get('name', None),
                # collection or provider here? src_rec['collection']['title']
                # NOTE: collection apparently not set for all items
                thumbnail=doc.get('object', None),
                # according to dpla docs, should be url preview for item
                # docs reference a field for object mimetype, not seeing in results

                # url on DPLA site
                url=item_url)
            if 'date' in src_res:
                i.date = src_res['date'].get('displayDate', None)

            if 'spatial' in src_res and src_res['spatial']:
                # sometimes a list but not always
                if isinstance(src_res['spatial'], list):
                    space = src_res['spatial'][0]
                else:
                    space = src_res['spatial']
                # country? state? coords?
                i.location = space.get('name', None)

            # Add the aggregator for reference
            i.aggregator = DPLA.name

            items.append(i)

        return items
Example #4
0
    def find_items(keywords=None):
        """Search the Trove API for items matching any of *keywords*.

        Keywords are OR-ed together; results without a thumbnail url are
        skipped.  Returns a list of DisplayItem objects (empty on API error
        or when there are no results).
        """
        # None instead of a mutable [] default; treat None as "no keywords"
        qry = ' OR '.join(keywords or [])

        # qry from unicode string to regular string
        qry = qry.encode("utf8", "ignore")
        logger.debug('trove query: %s' % qry)
        qry_url = Trove.API_URL % (quote_plus(qry), Trove.API_KEY)
        items = []
        start = time.time()
        try:
            response = urlopen(qry_url)
        except (HTTPError, URLError) as e:
            # both urllib error types get identical handling
            logger.error('trove api error: %s' % e)
        else:
            logger.info('trove query completed in %.2f sec' % (time.time() - start))
            results = simplejson.load(response)
            try:
                for doc in results['response']['zone'][0]['records']['work']:

                    # skip items without a thumbnail url;
                    # have to dig around in identifier to find one
                    thumbnail = None
                    for link in doc.get('identifier', []):
                        if link['linktype'] == "thumbnail":
                            thumbnail = link['value']
                    if not thumbnail:
                        continue

                    i = DisplayItem(
                        title=doc.get('title'),
                        format='; '.join(doc.get('type', [])),
                        # no way to get contributor name without another API call
                        # so just set source to Trove for now
                        source='Trove',
                        url=doc.get('troveUrl'),
                        date=doc.get('issued'),
                        thumbnail=thumbnail
                    )

                    # Add the aggregator for reference
                    i.aggregator = Trove.name
                    items.append(i)
            except (KeyError, IndexError, TypeError):
                # Either no results or something was wrong with the JSON
                logger.debug('Trove returned no results')
        return items
Example #5
0
    def test_coins_citation(self):
        """COinS citation string has the expected version and title fields."""
        # minimal record
        item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')

        cit = item.coins_citation
        # just some basic sanity checks; assertTrue/assertIn replace the
        # deprecated assert_ alias (removed in Python 3.12)
        self.assertTrue(cit.startswith('ctx_ver=Z39.88-2004'))
        self.assertIn('rft.title=%s' % item.title, cit)
Example #6
0
    def test_coins_citation_info(self):
        """COinS info dict contains exactly the fields that have data."""
        # minimal record
        item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')

        info = item.coins_citation_info
        # assertIn/assertNotIn replace the deprecated assert_ alias
        # (removed in Python 3.12) and give better failure messages
        self.assertIn('rfr_id', info,
                      'referrer id should be set in COinS info')
        self.assertIn('rft_val_fmt', info,
                      'format is specified in COinS info')
        self.assertEqual(item.title, info['rft.title'])
        self.assertEqual(item.url, info['rft.identifier'])

        for key in ['rft.date', 'rft.place', 'rft.source', 'rft.format']:
            self.assertNotIn(key, info,
                             'unavailable data should not be set in COinS info')

        # add all fields to simulate a complete record
        item.date = '1887'
        item.format = 'Image'
        item.source = 'Smithsonian'
        item.location = 'USA'

        # every newly-populated field should now appear in the info dict
        info = item.coins_citation_info
        self.assertEqual(item.date, info['rft.date'])
        self.assertEqual(item.format, info['rft.format'])
        self.assertEqual(item.source, info['rft.source'])
        self.assertEqual(item.location, info['rft.place'])
Example #7
0
    def find_items(keywords=None):
        """Search the Europeana API for items matching any of *keywords*.

        Keywords are OR-ed together; results without a preview image are
        skipped.  Returns a list of DisplayItem objects.
        """
        # None instead of a mutable [] default; treat None as "no keywords"
        qry = 'wskey->%s:query->%s' % (
            Europeana.API_KEY,
            ' OR '.join(keywords or []))

        # qry from unicode string to regular string
        qry = qry.encode("utf8", "ignore")

        logger.debug('europeana query: %s' % qry)
        b = Bibs()
        results = b.search(qry, 'europeanav2', 'search')

        items = []
        # no results! log this error?
        if 'items' not in results:
            return items

        for doc in results['items']:
            # NOTE: result includes a 'completeness' score
            # which we could use for a first-pass filter to weed out junk records

            # for now, just skip items without an image url
            if not doc.get('edmPreview'):
                continue

            i = DisplayItem(
                format=doc.get('type'),
                source='; '.join(doc.get('dataProvider', [])),
                # NOTE: provider is aggregator (i.e., 'The European Library')
                # dataProvider is original source

                # url on provider's website with context
                url=doc.get('guid'),
                date=doc.get('edmTimespanLabel'))

            # NOTE: doc['link'] provides json with full record data
            # if we want more item details
            # should NOT be displayed to users (includes api key)

            # preview and title are both lists; for now, in both cases,
            # just grab the first one
            if 'edmTimespanLabel' in doc:
                i.date = doc['edmTimespanLabel'][0]['def']
            if 'title' in doc:
                i.title = doc['title'][0]
            # edmPreview is guaranteed present and non-empty here (see skip above)
            i.thumbnail = doc['edmPreview'][0]

            # Add the aggregator for reference
            i.aggregator = Europeana.name

            # NOTE: spatial/location information doesn't seem to be included
            # in this item result
            items.append(i)

        return items
Example #8
0
    def test_coins_citation(self):
        """COinS citation handles plain strings, lists, and non-string values."""
        # minimal record
        item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')

        cit = item.coins_citation
        # just some basic sanity checks; assertTrue/assertIn replace the
        # deprecated assert_ alias (removed in Python 3.12)
        self.assertTrue(cit.startswith('ctx_ver=Z39.88-2004'))
        self.assertIn('rft.title=%s' % item.title, cit)

        # variant content - lists
        item = DisplayItem(title=['Hippo'],
                           url='http://some.url/to/a/hippo/pic')
        # should not throw an exception
        cit = item.coins_citation
        self.assertIn('rft.title=%s' % item.title[0], cit)

        # variant content - integer
        item = DisplayItem(title='Hippo',
                           url='http://some.url/to/a/hippo/pic',
                           date=1936)
        # should not throw an exception
        cit = item.coins_citation
        self.assertIn('rft.date=%s' % item.date, cit)
Example #9
0
    def find_items(keywords):
        """Search Flickr Commons for photos matching any of *keywords*.

        Only the first 10 unique keywords are used (the flickr API rejects
        queries with too many terms).  Returns a list of DisplayItem objects.
        """
        flickr = flickrapi.FlickrAPI(Flickr.API_KEY)

        start = time.time()
        # NOTE: flickr does support or, but doesn't like too many terms at once
        # (15 terms is apparently too many)
        # de-duplicate while preserving order so the query (and therefore the
        # result ranking) is deterministic -- a bare set() gave arbitrary order
        seen = set()
        terms = []
        for kw in keywords[:10]:
            if kw not in seen:
                seen.add(kw)
                terms.append(kw)
        query = ' OR '.join(terms)
        logger.debug('flickr query: %s' % query)
        # extras is a comma-delimited list of extra fields;
        # need owner name for source
        # TODO: future enhancement: access to date, location info, etc
        #   extras='owner_name,date_upload,date_taken,geo'
        # restrict to first 15 items (only ~10 for other apis currently)
        results = flickr.photos_search(text=query, format='json',
                                       is_commons='true',
                                       extras='owner_name',
                                       sort='relevance',
                                       per_page=15)

        logger.info('flickr query completed in %.2f sec' % (time.time() - start))

        # the response is wrapped as 'jsonFlickrApi( ... )'; strip the wrapper
        # so the json parses.  NOTE: the previous str.lstrip call removed a
        # *character set*, not the literal prefix, and only worked by accident
        prefix = 'jsonFlickrApi('
        if results.startswith(prefix):
            results = results[len(prefix):]
        if results.endswith(')'):
            results = results[:-1]

        results = simplejson.loads(results)

        items = []
        # no results! log this error?
        # NOTE: could be bad api key; check code/stat in response
        if 'photos' not in results or 'photo' not in results['photos']:
            return items

        for doc in results['photos']['photo']:
            i = DisplayItem(
                format=doc.get('type'),
                source=doc.get('ownername'),
                # url on provider's website with context:
                # http://www.flickr.com/photos/{user-id}/{photo-id}
                url='http://www.flickr.com/photos/%(owner)s/%(id)s/' % doc
                # TODO get date data
            )

            # flickr title not a list
            if 'title' in doc:
                i.title = doc['title']
            # build the url back to the image:
            # http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
            i.thumbnail = ('http://farm%(farm)s.staticflickr.com/'
                           '%(server)s/%(id)s_%(secret)s_m.jpg' % doc)

            # Add the aggregator for reference
            i.aggregator = 'Flickr Commons'

            items.append(i)

        return items
Example #10
0
    def find_items(keywords=None):
        """Search the Europeana API for items matching any of *keywords*.

        Keywords are OR-ed together; results without a preview image are
        skipped.  Returns a list of DisplayItem objects.
        """
        # None instead of a mutable [] default; treat None as "no keywords"
        qry = 'wskey->%s:query->%s' % (
            Europeana.API_KEY,
            ' OR '.join(keywords or [])
        )

        # qry from unicode string to regular string
        qry = qry.encode("utf8", "ignore")

        logger.debug('europeana query: %s' % qry)
        b = Bibs()
        results = b.search(qry, 'europeanav2', 'search')

        items = []
        # no results! log this error?
        if 'items' not in results:
            return items

        for doc in results['items']:
            # NOTE: result includes a 'completeness' score
            # which we could use for a first-pass filter to weed out junk records

            # for now, just skip items without an image url
            if not doc.get('edmPreview'):
                continue

            i = DisplayItem(
                format=doc.get('type'),
                source='; '.join(doc.get('dataProvider', [])),
                # NOTE: provider is aggregator (i.e., 'The European Library')
                # dataProvider is original source

                # url on provider's website with context
                url=doc.get('guid'),
                date=doc.get('edmTimespanLabel')
            )

            # NOTE: doc['link'] provides json with full record data
            # if we want more item details
            # should NOT be displayed to users (includes api key)

            # preview and title are both lists; for now, in both cases,
            # just grab the first one
            if 'edmTimespanLabel' in doc:
                i.date = doc['edmTimespanLabel'][0]['def']
            if 'title' in doc:
                i.title = doc['title'][0]
            # edmPreview is guaranteed present and non-empty here (see skip above)
            i.thumbnail = doc['edmPreview'][0]

            # Add the aggregator for reference
            i.aggregator = Europeana.name

            # NOTE: spatial/location information doesn't seem to be included
            # in this item result
            items.append(i)

        return items