def testResourceIteratoreMultiPage(self):
    '''Test iterating a registry resource that spans two pages.

    Both pages of the repository listing are mocked. The iterator is
    advanced 38 times, after which it must be positioned on the second
    page with nothing left to fetch.
    '''
    # Read fixtures through context managers so the handles are closed
    # deterministically instead of being left to garbage collection.
    with open(DIR_FIXTURES +
              '/registry_api_repository-page-2.json') as fixture:
        page_two_body = fixture.read()
    with open(DIR_FIXTURES + '/registry_api_repository.json') as fixture:
        page_one_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'https://registry.cdlib.org/api/v1/repository/?limit=20&offset=20',
        body=page_two_body)
    httpretty.register_uri(
        httpretty.GET,
        'https://registry.cdlib.org/api/v1/repository/',
        body=page_one_body)
    riter = self.registry.resource_iter('repository')
    self.assertEqual(riter.url,
                     'https://registry.cdlib.org/api/v1/repository/')
    self.assertEqual(riter.path_next,
                     '/api/v1/repository/?limit=20&offset=20')
    r = ''
    for _ in range(38):
        r = riter.next()
        # Repository entries must not be wrapped as Collection objects.
        self.assertFalse(isinstance(r, Collection))
    self.assertEqual(r['resource_uri'], '/api/v1/repository/42/')
    self.assertEqual(
        riter.url,
        'https://registry.cdlib.org/api/v1/repository/?limit=20&offset=20')
    self.assertEqual(riter.path_next, None)
    # The iterator is exhausted after the 38th object.
    self.assertRaises(StopIteration, riter.next)
def testCollectionSlice(self):
    '''Test that results are correct for a known couchdb result.

    The couchdb view is mocked with a fixture, then a job is queued for
    every document in collection 5112 and the queued jobs are inspected.
    '''
    view_parts = COUCHDB_VIEW.split('/')
    url_to_pretty = os.path.join(self.url_couch_base, self.cdb, '_design',
                                 view_parts[0], '_view', view_parts[1])
    # Close the fixture handle explicitly rather than leaking it.
    with open(DIR_FIXTURES +
              '/couchdb_by_provider_name-5112.json') as fixture:
        view_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        re.compile(url_to_pretty + ".*$"),
        body=view_body,
        etag="2U5BW2TDDX9EHZJOO0DNE29D1",
        content_type='application/json',
        # transfer_encoding='chunked',  # NOTE: doesn't work with httpretty
    )
    results = self._cdbrunner.queue_collection(
        '5112', 6000, self.function, 'arg1', 'arg2', kwarg1='1', kwarg2=2)
    self.assertEqual(len(results), 3)
    first_job = results[0]
    # The document id is expected as the first positional argument.
    self.assertEqual(
        first_job.args,
        ('5112--http://ark.cdlib.org/ark:/13030/kt7580382j', 'arg1', 'arg2'))
    self.assertEqual(first_job.kwargs, {'kwarg1': '1', 'kwarg2': 2})
    self.assertEqual(first_job.func_name,
                     'test.test_couchdb_runner.func_for_test')
def testInit(self):
    '''Basic tdd start: fetch a mocked XML feed and check the parsed docs.'''
    url = 'https://s3.amazonaws.com/pastperfectonline/xmlfiles/museum_231'
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/xml-fetch.xml') as fixture:
        feed_body = fixture.read()
    httpretty.register_uri(httpretty.GET, url, body=feed_body)
    h = fetcher.XML_Fetcher(url, None)
    self.assertEqual(h.url_base, url)
    docs = []
    # The first batch is pulled by hand so its size can be checked.
    d = h.next()
    self.assertEqual(len(d), 999)
    docs.extend(d)
    for d in h:
        docs.extend(d)
    self.assertEqual(len(docs), 2320)
    test1 = docs[0]
    test2 = docs[2]
    self.assertIn('title', test1['metadata'])
    self.assertEqual(test1['metadata']['title'], [
        'California desperadoes : stories of early California outlaws in their own word'
    ])
    # test that attributes are captured, even from empty elements
    self.assertEqual(test1['metadata']['q'], ['taken'])
    self.assertEqual(test1['metadata']['d'], ['Kodak'])
    self.assertEqual(test2['metadata']['q'], ['scanned'])
    self.assertEqual(test2['metadata']['d'], ['Epson'])
def testCreateProfile(self):
    '''Test the creation of a DPLA style profile file.

    Checks both the serialized JSON profile (``dpla_profile``) and the
    dict form (``dpla_profile_obj``) of a Collection built from a mocked
    registry response.
    '''
    # Close the fixture handle explicitly rather than leaking it.
    with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'https://registry.cdlib.org/api/v1/collection/178',
        body=collection_body)
    c = Collection('https://registry.cdlib.org/api/v1/collection/178')
    self.assertTrue(hasattr(c, 'dpla_profile'))
    self.assertIsInstance(c.dpla_profile, str)
    j = json.loads(c.dpla_profile)
    self.assertEqual(j['name'], '178')
    self.assertEqual(j['enrichments_coll'], ['/compare_with_schema'])
    # assertIn reports the missing key on failure, unlike assertTrue.
    self.assertIn('enrichments_item', j)
    self.assertIsInstance(j['enrichments_item'], list)
    self.assertEqual(len(j['enrichments_item']), 30)
    self.assertIn('contributor', j)
    self.assertIsInstance(j['contributor'], list)
    self.assertEqual(len(j['contributor']), 4)
    self.assertEqual(j['contributor'][1], {
        u'@id': u'/api/v1/campus/1/',
        u'name': u'UCB'
    })
    self.assertTrue(hasattr(c, 'dpla_profile_obj'))
    self.assertIsInstance(c.dpla_profile_obj, dict)
    self.assertIsInstance(c.dpla_profile_obj['enrichments_item'], list)
    e = c.dpla_profile_obj['enrichments_item']
    self.assertEqual(e[0], '/oai-to-dpla')
    self.assertEqual(
        e[1],
        '/shred?prop=sourceResource/contributor%2CsourceResource/creator%2CsourceResource/date'
    )
def testSolrHarvest(self, mock_boto3):
    '''Test the function of the Solr harvest with <date> objects in stream.

    Five consecutive Solr result pages are mocked for the same POST
    endpoint; the harvest is run and its log output is checked.
    '''
    responses = []
    for page in range(5):
        fixture_path = (
            DIR_FIXTURES +
            '/ucsd-new-feed-missions-bb3038949s-{0}.xml'.format(page))
        # Each fixture is closed as soon as it has been read.
        with open(fixture_path) as fixture:
            responses.append(httpretty.Response(body=fixture.read()))
    httpretty.register_uri(
        httpretty.POST,
        'http://example.edu/solr/blacklight/select',
        responses=responses)
    self.assertTrue(hasattr(self.controller, 'harvest'))
    self.controller.harvest()
    # Parenthesized single-argument form prints identically on Python 2
    # and is also valid Python 3.
    print("LOGS:{}".format(self.test_log_handler.formatted_records))
    self.assertEqual(len(self.test_log_handler.records), 2)
    self.assertIn('UC San Diego',
                  self.test_log_handler.formatted_records[0])
    self.assertEqual(self.test_log_handler.formatted_records[1],
                     '[INFO] HarvestController: 13 records harvested')
def testInit(self):
    '''Basic tdd start: check Flickr_Fetcher state right after construction.'''
    url = 'https://example.edu'
    user_id = 'testuser'
    page_size = 10
    url_first = fetcher.Flickr_Fetcher.url_get_photos_template.format(
        api_key='boguskey', user_id=user_id, per_page=page_size, page=1)
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
        first_page_body = fixture.read()
    httpretty.register_uri(httpretty.GET, url_first, body=first_page_body)
    h = fetcher.Flickr_Fetcher(url, user_id, page_size=page_size)
    self.assertEqual(h.url_base, url)
    self.assertEqual(h.user_id, user_id)
    self.assertEqual(h.page_size, 10)
    self.assertEqual(h.page_current, 1)
    self.assertEqual(h.doc_current, 0)
    self.assertEqual(h.docs_fetched, 0)
    self.assertEqual(h.url_get_photos_template,
                     'https://api.flickr.com/services/rest/'
                     '?api_key={api_key}&user_id={user_id}&per_page'
                     '={per_page}&method='
                     'flickr.people.getPublicPhotos&page={page}')
    self.assertEqual(h.api_key, 'boguskey')
    self.assertEqual(h.url_current, url_first)
    self.assertEqual(h.docs_total, 10)
    self.assertEqual(h.url_get_photo_info_template,
                     'https://api.flickr.com/services/rest/'
                     '?api_key={api_key}&method='
                     'flickr.photos.getInfo&photo_id={photo_id}')
def testLoggingMoreThan1000(self):
    '''Check the progress log lines emitted while harvesting 2400 records.

    The harvest's save directory is removed even when an assertion fails,
    so a failing run does not leave harvested files behind.
    '''
    with open(DIR_FIXTURES + '/collection_api_big_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-2400-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/198/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/198/')
    controller = fetcher.HarvestController(
        '*****@*****.**',
        collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    controller.harvest()
    formatted = self.test_log_handler.formatted_records
    try:
        self.assertEqual(len(self.test_log_handler.records), 13)
        self.assertEqual(formatted[1],
                         '[INFO] HarvestController: 100 records harvested')
        self.assertEqual(formatted[10],
                         '[INFO] HarvestController: 1000 records harvested')
        self.assertEqual(formatted[11],
                         '[INFO] HarvestController: 2000 records harvested')
        self.assertEqual(formatted[12],
                         '[INFO] HarvestController: 2400 records harvested')
    finally:
        # Always clean up the harvested output directory.
        shutil.rmtree(controller.dir_save)
def testFetch(self):
    '''Fetch three mocked eMuseum pages and check the parsed documents.'''
    responses = []
    for page in (1, 2, 3):
        # Each fixture is closed as soon as it has been read.
        with open(DIR_FIXTURES +
                  '/eMuseum-page-{0}.xml'.format(page)) as fixture:
            responses.append(httpretty.Response(body=fixture.read()))
    httpretty.register_uri(
        httpretty.GET,
        'http://digitalcollections.hoover.org/search/*/objects/xml?filter=approved:true&page=1',
        responses=responses)
    url = 'http://digitalcollections.hoover.org'
    h = fetcher.eMuseum_Fetcher(url, None)
    self.assertEqual(h.url_base, url)
    docs = []
    d = h.next()
    docs.extend(d)
    for d in h:
        docs.extend(d)
    self.assertEqual(len(docs), 24)
    test1 = docs[12]
    self.assertIn('title', test1)
    self.assertEqual(
        test1['title']['text'],
        'Money is power. A war savings certificate in every Canadian home. Get yours now at post offices or banks.'
    )
    self.assertIn('unknown2', test1)
    self.assertIn('text2', test1['primaryMaker'])
    self.assertNotIn('attrib', test1['unknown1'])
def test_get_isShownBy_pdf(self, mock_deepharvest, mock_boto):
    '''
    test getting correct isShownBy value for Nuxeo doc with no images and
    PDF at parent level
    '''
    deepharvest_mocker(mock_deepharvest)
    with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
        no_children_body = fixture.read()
    # The query string previously contained the mojibake "pos¤tPageIndex"
    # (an HTML-entity-decoded "&curren"); the intended parameter separator
    # and name are "&currentPageIndex".
    httpretty.register_uri(
        httpretty.GET,
        'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
        'Document+WHERE+ecm%3AparentId+%3D+'
        '%2700d55837-01b6-4211-80d8-b966a15c257e%27+ORDER+BY+'
        'ecm%3Apos&currentPageIndex=0&pageSize=100',
        responses=[
            httpretty.Response(body=no_children_body, status=200),
        ])
    h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                             'path-to-asset/here')
    with open(DIR_FIXTURES + '/nuxeo_doc_pdf_parent.json') as fixture:
        nuxeo_metadata = json.loads(fixture.read())
    isShownBy = h._get_isShownBy(nuxeo_metadata)
    self.assertEqual(
        isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
        'ucldc-nuxeo-thumb-media/00d55837-01b6-4211-80d8-b966a15c257e')
def testMainHarvestController__init__Error(self, mock_method):
    '''Test the try-except block in main when HarvestController not created
    correctly.

    ``sys.argv`` is replaced for the duration of the call and restored
    afterwards so the change does not leak into other tests.
    '''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    original_argv = sys.argv
    sys.argv = [
        'thisexe', '*****@*****.**',
        'https://registry.cdlib.org/api/v1/collection/197/'
    ]
    try:
        self.assertRaises(
            Exception,
            fetcher.main,
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile)
    finally:
        sys.argv = original_argv
    self.assertEqual(len(self.test_log_handler.records), 4)
    error_record = self.test_log_handler.formatted_records[3]
    self.assertIn("[ERROR] HarvestMain: Exception in harvester init",
                  error_record)
    self.assertIn("Boom!", error_record)
    # Remove the profile file written for this collection.
    c = Collection('https://registry.cdlib.org/api/v1/collection/197/')
    os.remove(
        os.path.abspath(os.path.join(self.dir_test_profile, c.id + '.pjs')))
def test_ignore_content_type(self, mock_stash, mock_couch):
    '''Test that content type check is not called if --ignore_content_type
    parameter given.

    Both the HEAD and the GET response advertise a non-image content type;
    with ``ignore_content_type=True`` the harvester is still expected to
    return a stash report for the document.
    '''
    image_url = 'http://getthisimage/image'
    # Same empty body for both verbs; only the content type differs.
    mocked_verbs = (
        (httpretty.HEAD, 'text/plain; charset=utf-8'),
        (httpretty.GET, 'text/html; charset=utf-8'),
    )
    for verb, mime in mocked_verbs:
        httpretty.register_uri(
            verb,
            image_url,
            body='',
            content_length='0',
            content_type=mime,
            connection='close',
        )
    harvester = image_harvest.ImageHarvester(
        url_cache={},
        hash_cache={},
        bucket_bases=['region:x'],
        ignore_content_type=True)
    expected = [
        StashReport('test url', 'md5 test value', 's3 url object',
                    'mime_type', 'dimensions')
    ]
    stashed = harvester.stash_image({
        '_id': 'IGNORE_CONTENT',
        'isShownBy': image_url
    })
    self.assertEqual(stashed, expected)
def testFetch(self):
    '''Test the httpretty mocked fetching of documents'''
    url = 'https://example.edu/action/search/xml?q=ddu%3A20*&' \
          'asf=ddu&asd=&fd=1&_hd=&hd=on&sf=&_rs=&_ef=&ef=on&sd=&ed=&c=ga'
    # Page 1 is served twice, exactly as in the original response list.
    # NOTE(review): presumably the fetcher requests the first page once
    # while initializing and again when iterating -- confirm.
    page_files = ('/ucsf-page-1.xml', '/ucsf-page-1.xml',
                  '/ucsf-page-2.xml', '/ucsf-page-3.xml')
    responses = []
    for page_file in page_files:
        # Each fixture is closed as soon as it has been read.
        with open(DIR_FIXTURES + page_file) as fixture:
            responses.append(httpretty.Response(fixture.read(), status=200))
    httpretty.register_uri(httpretty.GET, url, responses=responses)
    h = fetcher.UCSF_XML_Fetcher(url, None, page_size=3)
    docs = []
    for d in h:
        docs.extend(d)
    self.assertEqual(len(docs), 7)
    testy = docs[0]
    self.assertIn('tid', testy)
    self.assertEqual(testy['tid'], "nga13j00")
    self.assertEqual(testy['uri'],
                     'http://legacy.library.ucsf.edu/tid/nga13j00')
    self.assertIn('aup', testy['metadata'])
    self.assertEqual(testy['metadata']['aup'], ['Whent, Peter'])
def test_single_fetching(self):
    '''Iterate a YouTube_Fetcher over a mocked single-video response.'''
    url = 'http://single.edu'
    playlist_id = 'PLwtrWl_IBMJtjP5zMk6dVR-BRjzKqCPOM'
    url_vids = fetcher.YouTube_Fetcher.url_video
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/youtube_single_video.json') as fixture:
        video_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        url_vids,
        responses=[httpretty.Response(body=video_body, status=200)])
    h = fetcher.YouTube_Fetcher(url, playlist_id)
    vids = []
    for v in h:
        vids.extend(v)
    self.assertEqual(len(vids), 1)
    self.assertEqual(vids[0], {
        u'contentDetails': {
            u'definition': u'sd',
            u'projection': u'rectangular',
            u'caption': u'false',
            u'duration': u'PT19M35S',
            u'licensedContent': True,
            u'dimension': u'2d'
        },
        u'kind': u'youtube#video',
        u'etag':
        u'"m2yskBQFythfE4irbTIeOgYYfBU/-3AtVAYcRLEynWZprpf0OGaY8zo"',
        u'id': u'0Yx8zrbsUu8'
    })
def testMainCreatesCollectionProfile(self, mock_boto3):
    '''Test that the main function produces a collection profile file
    for DPLA. The path to this file is needed when creating a DPLA
    ingestion document.
    '''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    # Constructed for its side effects only; the instance is not used.
    Collection("https://registry.cdlib.org/api/v1/collection/197/")
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(ingest_doc_id, 'test-id')
    self.assertEqual(num, 128)
    # os.path.join with a single argument was a no-op; test the path itself.
    self.assertTrue(os.path.exists(self.profile_path))
def testRunIngestProductionNotReady(self, mock_couch, mock_dash_clean,
                                    mock_check, mock_remove, mock_save,
                                    mock_enrich, mock_couchdb, mock_redis,
                                    mock_boto3):
    '''run_ingest.main must raise when DATA_BRANCH is "production" for the
    mocked collection, and nine log records are expected.
    '''
    mock_couch.return_value._create_ingestion_document.return_value = \
        'test-id'
    # this next is because the redis client unpickles....
    mock_redis.return_value.hget.return_value = pickle.dumps('RQ-result!')
    mail_handler = MagicMock()
    url_api_collection = 'https://registry.cdlib.org/api/v1/' \
        'collection/178/'
    httpretty.httpretty.enable()
    # Fixtures are read through context managers so handles are closed.
    with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
        oac_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET, url_api_collection, body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/tf2v19n928',
        body=oac_body)
    # NOTE(review): this environment change is not undone here; confirm
    # that tearDown resets DATA_BRANCH so it cannot leak into other tests.
    os.environ['DATA_BRANCH'] = 'production'
    self.assertRaises(
        Exception,
        run_ingest.main,
        '*****@*****.**',
        url_api_collection,
        log_handler=self.test_log_handler,
        mail_handler=mail_handler)
    # Parenthesized form prints identically on Python 2 and is valid
    # Python 3 as well.
    print(self.test_log_handler.records)
    self.assertEqual(len(self.test_log_handler.records), 9)
def testMainFnWithException(self, mock_method):
    '''fetcher.main must raise and log an error when harvesting fails.'''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        self.assertRaises(
            Exception,
            fetcher.main,
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(len(self.test_log_handler.records), 7)
    error_record = self.test_log_handler.formatted_records[6]
    # assertIn shows both operands on failure, unlike assertTrue(x in y).
    self.assertIn("[ERROR] HarvestMain: Error while harvesting:",
                  error_record)
    self.assertIn("Boom!", error_record)
def test_get_isShownBy_pdf(self, mock_deepharvest, mock_boto):
    '''
    test getting correct isShownBy value for Nuxeo doc with no images and
    PDF at parent level
    '''
    deepharvest_mocker(mock_deepharvest)
    with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
        no_children_body = fixture.read()
    # The query string previously contained the mojibake "pos¤tPageIndex"
    # (an HTML-entity-decoded "&curren"); the intended parameter separator
    # and name are "&currentPageIndex".
    httpretty.register_uri(
        httpretty.GET,
        'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
        'Document+WHERE+ecm%3AparentId+%3D+'
        '%2700d55837-01b6-4211-80d8-b966a15c257e%27+ORDER+BY+'
        'ecm%3Apos&currentPageIndex=0&pageSize=100',
        responses=[
            httpretty.Response(body=no_children_body, status=200),
        ])
    h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                             'path-to-asset/here')
    with open(DIR_FIXTURES + '/nuxeo_doc_pdf_parent.json') as fixture:
        nuxeo_metadata = json.loads(fixture.read())
    isShownBy = h._get_isShownBy(nuxeo_metadata)
    self.assertEqual(
        isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
        'ucldc-nuxeo-thumb-media/00d55837-01b6-4211-80d8-b966a15c257e')
def testRunIngestProductionNotReady(self, mock_couch, mock_dash_clean,
                                    mock_check, mock_remove, mock_save,
                                    mock_enrich, mock_couchdb, mock_redis,
                                    mock_boto3):
    '''run_ingest.main must raise when DATA_BRANCH is "production" for the
    mocked collection, and nine log records are expected.
    '''
    mock_couch.return_value._create_ingestion_document.return_value = \
        'test-id'
    # this next is because the redis client unpickles....
    mock_redis.return_value.hget.return_value = pickle.dumps('RQ-result!')
    mail_handler = MagicMock()
    url_api_collection = 'https://registry.cdlib.org/api/v1/' \
        'collection/178/'
    httpretty.httpretty.enable()
    # Fixtures are read through context managers so handles are closed.
    with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
        oac_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET, url_api_collection, body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/tf2v19n928',
        body=oac_body)
    # NOTE(review): this environment change is not undone here; confirm
    # that tearDown resets DATA_BRANCH so it cannot leak into other tests.
    os.environ['DATA_BRANCH'] = 'production'
    self.assertRaises(
        Exception,
        run_ingest.main,
        '*****@*****.**',
        url_api_collection,
        log_handler=self.test_log_handler,
        mail_handler=mail_handler)
    # Parenthesized form prints identically on Python 2 and is valid
    # Python 3 as well.
    print(self.test_log_handler.records)
    self.assertEqual(len(self.test_log_handler.records), 9)
def setUp(self):
    '''Mock the registry and OAI endpoints, build an OAI harvest controller
    and freeze ``datetime.datetime.now``.

    The real ``datetime.datetime`` class is kept in ``self.old_dt``.
    NOTE(review): presumably tearDown restores it from there -- confirm.
    '''
    super(HarvestControllerTestCase, self).setUp()
    # Fixtures are read through context managers so handles are closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    self.collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/197/')
    config_file, profile_path = self.setUp_config(self.collection)
    self.controller_oai = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        profile_path=profile_path,
        config_file=config_file)
    with open(DIR_FIXTURES + '/objset_test_doc.json') as fixture:
        self.objset_test_doc = json.load(fixture)

    class myNow(datetime.datetime):
        # Subclass whose now() always returns the same fixed instant.
        @classmethod
        def now(cls):
            return cls(2017, 7, 14, 12, 1)

    self.old_dt = datetime.datetime
    datetime.datetime = myNow
def testMainCreatesCollectionProfile(self, mock_boto3):
    '''Test that the main function produces a collection profile file
    for DPLA. The path to this file is needed when creating a DPLA
    ingestion document.
    '''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    # Constructed for its side effects only; the instance is not used.
    Collection("https://registry.cdlib.org/api/v1/collection/197/")
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(ingest_doc_id, 'test-id')
    self.assertEqual(num, 128)
    # os.path.join with a single argument was a no-op; test the path itself.
    self.assertTrue(os.path.exists(self.profile_path))
def testLoggingMoreThan1000(self):
    '''Check the progress log lines emitted while harvesting 2400 records.

    The harvest's save directory is removed even when an assertion fails,
    so a failing run does not leave harvested files behind.
    '''
    with open(DIR_FIXTURES + '/collection_api_big_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-2400-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/198/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/198/')
    controller = fetcher.HarvestController(
        '*****@*****.**',
        collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    controller.harvest()
    formatted = self.test_log_handler.formatted_records
    try:
        self.assertEqual(len(self.test_log_handler.records), 13)
        self.assertEqual(formatted[1],
                         '[INFO] HarvestController: 100 records harvested')
        self.assertEqual(formatted[10],
                         '[INFO] HarvestController: 1000 records harvested')
        self.assertEqual(formatted[11],
                         '[INFO] HarvestController: 2000 records harvested')
        self.assertEqual(formatted[12],
                         '[INFO] HarvestController: 2400 records harvested')
    finally:
        # Always clean up the harvested output directory.
        shutil.rmtree(controller.dir_save)
def testInit(self):
    '''Basic tdd start: check YouTube_Fetcher state right after construction.'''
    url = 'https://example.edu'
    playlist_id = 'testplaylist'
    page_size = 3
    url_first = fetcher.YouTube_Fetcher.url_playlistitems.format(
        api_key='boguskey',
        page_size=page_size,
        playlist_id=playlist_id,
        page_token='')
    # NOTE(review): a Flickr fixture is served for the YouTube URL; the
    # assertions below do not read the response body -- confirm this is
    # intentional.
    with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
        first_page_body = fixture.read()
    httpretty.register_uri(httpretty.GET, url_first, body=first_page_body)
    h = fetcher.YouTube_Fetcher(url, playlist_id, page_size=page_size)
    self.assertEqual(h.url_base, url)
    self.assertEqual(h.playlist_id, playlist_id)
    self.assertEqual(h.api_key, 'boguskey')
    self.assertEqual(h.page_size, page_size)
    self.assertEqual(h.playlistitems, {'nextPageToken': ''})
    self.assertEqual(
        h.url_playlistitems,
        'https://www.googleapis.com/youtube/v3/playlistItems'
        '?key={api_key}&maxResults={page_size}&part=contentDetails&'
        'playlistId={playlist_id}&pageToken={page_token}')
    self.assertEqual(
        h.url_video,
        'https://www.googleapis.com/youtube/v3/videos?'
        'key={api_key}&part=snippet&id={video_ids}'
    )
def testInit(self):
    '''Basic tdd start: check Flickr_Fetcher state right after construction.'''
    url = 'https://example.edu'
    user_id = 'test@Nuser'
    page_size = 10
    url_first = fetcher.Flickr_Fetcher.url_get_user_photos_template.format(
        api_key='boguskey', user_id=user_id, per_page=page_size, page=1)
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
        first_page_body = fixture.read()
    httpretty.register_uri(httpretty.GET, url_first, body=first_page_body)
    h = fetcher.Flickr_Fetcher(url, user_id, page_size=page_size)
    self.assertEqual(h.url_base, url)
    self.assertEqual(h.user_id, user_id)
    self.assertEqual(h.page_size, 10)
    self.assertEqual(h.page_current, 1)
    self.assertEqual(h.doc_current, 0)
    self.assertEqual(h.docs_fetched, 0)
    self.assertEqual(h.url_get_user_photos_template,
                     'https://api.flickr.com/services/rest/'
                     '?api_key={api_key}&user_id={user_id}&per_page'
                     '={per_page}&method='
                     'flickr.people.getPublicPhotos&page={page}')
    self.assertEqual(h.api_key, 'boguskey')
    self.assertEqual(h.url_current, url_first)
    self.assertEqual(h.docs_total, 10)
    self.assertEqual(h.url_get_photo_info_template,
                     'https://api.flickr.com/services/rest/'
                     '?api_key={api_key}&method='
                     'flickr.photos.getInfo&photo_id={photo_id}')
def testInit(self):
    '''Basic tdd start: fetch a mocked XML feed and check the parsed docs.'''
    url = 'https://s3.amazonaws.com/pastperfectonline/xmlfiles/museum_231'
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/xml-fetch.xml') as fixture:
        feed_body = fixture.read()
    httpretty.register_uri(httpretty.GET, url, body=feed_body)
    h = fetcher.XML_Fetcher(url, None)
    self.assertEqual(h.url_base, url)
    docs = []
    # The first batch is pulled by hand so its size can be checked.
    d = h.next()
    self.assertEqual(len(d), 999)
    docs.extend(d)
    for d in h:
        docs.extend(d)
    self.assertEqual(len(docs), 2320)
    test1 = docs[0]
    test2 = docs[2]
    self.assertIn('title', test1['metadata'])
    self.assertEqual(test1['metadata']['title'], [
        'California desperadoes : stories of early California outlaws in their own word'
    ])
    # test that attributes are captured, even from empty elements
    self.assertEqual(test1['metadata']['q'], ['taken'])
    self.assertEqual(test1['metadata']['d'], ['Kodak'])
    self.assertEqual(test2['metadata']['q'], ['scanned'])
    self.assertEqual(test2['metadata']['d'], ['Epson'])
def test_get_isShownBy_video(self, mock_deepharvest, mock_boto):
    '''
    test getting correct isShownBy value for Nuxeo video object
    '''
    deepharvest_mocker(mock_deepharvest)
    with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
        no_children_body = fixture.read()
    # The query string previously contained the mojibake "pos¤tPageIndex"
    # (an HTML-entity-decoded "&curren"); the intended parameter separator
    # and name are "&currentPageIndex".
    httpretty.register_uri(
        httpretty.GET,
        'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
        'Document+WHERE+ecm%3AparentId+%3D+'
        '%274c80e254-6def-4230-9f28-bc48878568d4%27+'
        'AND+ecm%3AcurrentLifeCycleState+%21%3D+%27deleted%27+ORDER+BY+'
        'ecm%3Apos&currentPageIndex=0&pageSize=100',
        responses=[
            httpretty.Response(body=no_children_body, status=200),
        ])
    h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                             'path-to-asset/here')
    with open(DIR_FIXTURES + '/nuxeo_doc_video.json') as fixture:
        nuxeo_metadata = json.loads(fixture.read())
    isShownBy = h._get_isShownBy(nuxeo_metadata)
    self.assertEqual(
        isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
        'ucldc-nuxeo-thumb-media/4c80e254-6def-4230-9f28-bc48878568d4')
def testFetch(self):
    '''Test the httpretty mocked fetching of documents'''
    url = 'https://example.edu/action/search/xml?q=ddu%3A20*&' \
          'asf=ddu&asd=&fd=1&_hd=&hd=on&sf=&_rs=&_ef=&ef=on&sd=&ed=&c=ga'
    # Page 1 is served twice, exactly as in the original response list.
    # NOTE(review): presumably the fetcher requests the first page once
    # while initializing and again when iterating -- confirm.
    page_files = ('/ucsf-page-1.xml', '/ucsf-page-1.xml',
                  '/ucsf-page-2.xml', '/ucsf-page-3.xml')
    responses = []
    for page_file in page_files:
        # Each fixture is closed as soon as it has been read.
        with open(DIR_FIXTURES + page_file) as fixture:
            responses.append(httpretty.Response(fixture.read(), status=200))
    httpretty.register_uri(httpretty.GET, url, responses=responses)
    h = fetcher.UCSF_XML_Fetcher(url, None, page_size=3)
    docs = []
    for d in h:
        docs.extend(d)
    self.assertEqual(len(docs), 7)
    testy = docs[0]
    self.assertIn('tid', testy)
    self.assertEqual(testy['tid'], "nga13j00")
    self.assertEqual(testy['uri'],
                     'http://legacy.library.ucsf.edu/tid/nga13j00')
    self.assertIn('aup', testy['metadata'])
    self.assertEqual(testy['metadata']['aup'], ['Whent, Peter'])
def testAddRegistryData(self):
    '''Unittest the _add_registry_data function.

    After the call the object must carry a ``collection`` entry holding
    the collection, campus and repository ids, while ``campus`` and
    ``repository`` must not appear at the top level of the object.
    '''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/197/')
    self.tearDown_config()  # remove ones setup in setUp
    self.setUp_config(collection)
    controller = fetcher.HarvestController(
        '*****@*****.**',
        collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    obj = {'id': 'fakey', 'otherdata': 'test'}
    self.assertNotIn('collection', obj)
    controller._add_registry_data(obj)
    self.assertIn('collection', obj)
    registry_entry = obj['collection'][0]
    self.assertEqual(registry_entry['@id'],
                     'https://registry.cdlib.org/api/v1/collection/197/')
    self.assertNotIn('campus', obj)
    self.assertIn('campus', registry_entry)
    self.assertNotIn('repository', obj)
    self.assertIn('repository', registry_entry)
    # need to test one without campus
    self.assertEqual(registry_entry['campus'][0]['@id'],
                     'https://registry.cdlib.org/api/v1/campus/12/')
    self.assertEqual(registry_entry['repository'][0]['@id'],
                     'https://registry.cdlib.org/api/v1/repository/37/')
def test_single_fetching(self):
    '''Iterate a YouTube_Fetcher over a mocked single-video response.'''
    url = 'http://single.edu'
    playlist_id = 'PLwtrWl_IBMJtjP5zMk6dVR-BRjzKqCPOM'
    url_vids = fetcher.YouTube_Fetcher.url_video
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/youtube_single_video.json') as fixture:
        video_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        url_vids,
        responses=[httpretty.Response(body=video_body, status=200)])
    h = fetcher.YouTube_Fetcher(url, playlist_id)
    vids = []
    for v in h:
        vids.extend(v)
    self.assertEqual(len(vids), 1)
    self.assertEqual(
        vids[0], {
            u'contentDetails': {
                u'definition': u'sd',
                u'projection': u'rectangular',
                u'caption': u'false',
                u'duration': u'PT19M35S',
                u'licensedContent': True,
                u'dimension': u'2d'
            },
            u'kind': u'youtube#video',
            u'etag':
            u'"m2yskBQFythfE4irbTIeOgYYfBU/-3AtVAYcRLEynWZprpf0OGaY8zo"',
            u'id': u'0Yx8zrbsUu8'
        })
def testCollectionSlice(self):
    '''Test that results are correct for a known couchdb result.

    The couchdb view is mocked with a fixture, then the worker runs the
    test function over collection 5112 and the results are inspected.
    '''
    view_parts = COUCHDB_VIEW.split('/')
    url_to_pretty = os.path.join(self.url_couch_base, self.cdb, '_design',
                                 view_parts[0], '_view', view_parts[1])
    # Close the fixture handle explicitly rather than leaking it.
    with open(DIR_FIXTURES +
              '/couchdb_by_provider_name-5112.json') as fixture:
        view_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        re.compile(url_to_pretty + ".*$"),
        body=view_body,
        etag="2U5BW2TDDX9EHZJOO0DNE29D1",
        content_type='application/json',
    )
    results = self._cdbworker.run_by_collection(
        '5112', self.function, 'arg1', 'arg2', kwarg1='1', kwarg2=2)
    self.assertEqual(len(results), 3)
    # NOTE(review): each result appears to be (doc id, function return
    # value) -- confirm against run_by_collection.
    self.assertEqual(results[1][0],
                     '5112--http://ark.cdlib.org/ark:/13030/kt7779r8zj')
    self.assertEqual(results[1][1][1], ('arg1', 'arg2'))
    self.assertEqual(results[1][1][2], {'kwarg1': '1', 'kwarg2': 2})
    doc = results[0][1][0]
    self.assertEqual(doc['isShownAt'],
                     'http://www.coronado.ca.us/library/')
def setUp(self):
    '''Mock the OAI endpoint and create the OAIFetcher under test.'''
    super(OAIFetcherTestCase, self).setUp()
    # Context manager guarantees the fixture file is closed.
    with open(DIR_FIXTURES + '/testOAI.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET, 'http://content.cdlib.org/oai', body=oai_body)
    self.fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                      'oac:images')
def testMainHarvestController__init__Error(self, mock_method):
    '''Test the try-except block in main when HarvestController not created
    correctly.

    ``sys.argv`` is replaced for the duration of the call and restored
    afterwards so the change does not leak into other tests.
    '''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    original_argv = sys.argv
    sys.argv = [
        'thisexe', '*****@*****.**',
        'https://registry.cdlib.org/api/v1/collection/197/'
    ]
    try:
        self.assertRaises(
            Exception,
            fetcher.main,
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile)
    finally:
        sys.argv = original_argv
    self.assertEqual(len(self.test_log_handler.records), 4)
    error_record = self.test_log_handler.formatted_records[3]
    self.assertIn("[ERROR] HarvestMain: Exception in harvester init",
                  error_record)
    self.assertIn("Boom!", error_record)
    # Remove the profile file written for this collection.
    c = Collection('https://registry.cdlib.org/api/v1/collection/197/')
    os.remove(
        os.path.abspath(
            os.path.join(self.dir_test_profile, c.id + '.pjs')))
def testOverrideMetadataPrefix(self):
    '''Test that the metadataPrefix for an OAI feed can be overridden.

    The extra_data for OAI can be either just a set spec or an html query
    string of set= &metadataPrefix=
    '''
    with open(DIR_FIXTURES + '/testOAI.xml') as fixture:
        oai_dc_body = fixture.read()
    httpretty.register_uri(httpretty.GET,
                           'http://content.cdlib.org/oai',
                           body=oai_dc_body)
    set_fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                     'set=oac:images')
    self.assertEqual(set_fetcher._set, 'oac:images')
    rec = set_fetcher.next()
    self.assertIsInstance(rec, dict)
    self.assertIn('id', rec)
    self.assertEqual(rec['id'], '13030/hb796nb5mn')
    self.assertIn('datestamp', rec)
    # NOTE(review): with this argument order the check is "datestamp is a
    # substring of '2005-12-13'"; assertEqual may have been intended --
    # confirm against the fixture before tightening.
    self.assertIn(rec['datestamp'], '2005-12-13')
    self.assertEqual(
        httpretty.last_request().querystring, {
            u'verb': [u'ListRecords'],
            u'set': [u'oac:images'],
            u'metadataPrefix': [u'oai_dc']
        })

    # Second pass: same endpoint, now answering with a DIDL document.
    with open(DIR_FIXTURES + '/testOAI-didl.xml') as fixture:
        didl_body = fixture.read()
    httpretty.register_uri(httpretty.GET,
                           'http://content.cdlib.org/oai',
                           body=didl_body)
    didl_fetcher = fetcher.OAIFetcher(
        'http://content.cdlib.org/oai',
        'set=oac:images&metadataPrefix=didl')
    self.assertEqual(didl_fetcher._set, 'oac:images')
    self.assertEqual(didl_fetcher._metadataPrefix, 'didl')
    rec = didl_fetcher.next()
    self.assertIsInstance(rec, dict)
    self.assertIn('id', rec)
    self.assertEqual(rec['id'], 'oai:ucispace-prod.lib.uci.edu:10575/25')
    self.assertEqual(rec['title'], ['Schedule of lectures'])
    self.assertIn('datestamp', rec)
    self.assertEqual(rec['datestamp'], '2015-05-20T11:04:23Z')
    self.assertEqual(
        httpretty.last_request().querystring, {
            u'verb': [u'ListRecords'],
            u'set': [u'oac:images'],
            u'metadataPrefix': [u'didl']
        })
    self.assertEqual(
        rec['Resource']['@ref'],
        'http://ucispace-prod.lib.uci.edu/xmlui/bitstream/' +
        '10575/25/1/!COLLOQU.IA.pdf')
    self.assertEqual(rec['Item']['@id'],
                     'uuid-640925bd-9cdf-46be-babb-b2138c3fce9c')
    self.assertEqual(rec['Component']['@id'],
                     'uuid-897984d8-9392-4a68-912f-ffdf6fd7ce59')
    self.assertIn('Descriptor', rec)
    self.assertEqual(rec['Statement']['@mimeType'],
                     'application/xml; charset=utf-8')
    self.assertEqual(
        rec['DIDLInfo']['{urn:mpeg:mpeg21:2002:02-DIDL-NS}DIDLInfo'][0]
        ['text'], '2015-05-20T20:30:26Z')
    del didl_fetcher
def testIterateOverResults(self):
    '''Test the iteration over a mock set of data.

    Five canned Solr XML pages are served in order for POSTs to the
    select endpoint; iterating the fetcher must yield 10 records.
    '''
    pages = []
    for page_num in range(5):
        fixture_path = (
            DIR_FIXTURES +
            '/ucsd-new-feed-missions-bb3038949s-%d.xml' % page_num)
        # Close each fixture as soon as its content has been read.
        with open(fixture_path) as fixture:
            pages.append(httpretty.Response(body=fixture.read()))
    httpretty.register_uri(
        httpretty.POST,
        'http://example.edu/solr/select',
        responses=pages)
    h = fetcher.SolrFetcher('http://example.edu/solr', 'extra_data', rows=3)
    self.assertEqual(len(h.resp.results), 3)
    n = 0
    for r in h:
        n += 1
    # `r` is the last record produced by the loop above.
    self.assertEqual(['Mission at Santa Barbara'], r['title_tesim'])
    self.assertEqual(n, 10)
def testInit(self):
    '''Basic tdd start: check the attributes set by YouTube_Fetcher.'''
    url = 'https://example.edu'
    playlist_id = 'testplaylist'
    page_size = 3
    url_first = fetcher.YouTube_Fetcher.url_playlistitems.format(
        api_key='boguskey',
        page_size=page_size,
        playlist_id=playlist_id,
        page_token='')
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
        first_body = fixture.read()
    httpretty.register_uri(httpretty.GET, url_first, body=first_body)
    h = fetcher.YouTube_Fetcher(url, playlist_id, page_size=page_size)
    self.assertEqual(h.url_base, url)
    self.assertEqual(h.playlist_id, playlist_id)
    self.assertEqual(h.api_key, 'boguskey')
    self.assertEqual(h.page_size, page_size)
    self.assertEqual(h.playlistitems, {'nextPageToken': ''})
    self.assertEqual(
        h.url_playlistitems,
        'https://www.googleapis.com/youtube/v3/playlistItems'
        '?key={api_key}&maxResults={page_size}&part=contentDetails&'
        'playlistId={playlist_id}&pageToken={page_token}')
    self.assertEqual(
        h.url_video, 'https://www.googleapis.com/youtube/v3/videos?'
        'key={api_key}&part=snippet&id={video_ids}')
def test_get_isShownBy_video(self, mock_deepharvest, mock_boto):
    '''Test getting the correct isShownBy value for a Nuxeo video object.

    ``mock_boto`` is not used in the body.
    '''
    deepharvest_mocker(mock_deepharvest)
    with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
        no_children_body = fixture.read()
    # The query string previously contained a mis-decoded character
    # ("&curren" turned into a currency sign); restored to
    # "&currentPageIndex".
    httpretty.register_uri(
        httpretty.GET,
        'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
        'Document+WHERE+ecm%3AparentId+%3D+'
        '%274c80e254-6def-4230-9f28-bc48878568d4%27+'
        'AND+ecm%3AcurrentLifeCycleState+%21%3D+%27deleted%27+ORDER+BY+'
        'ecm%3Apos&currentPageIndex=0&pageSize=100',
        responses=[
            httpretty.Response(body=no_children_body, status=200),
        ])
    h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                             'path-to-asset/here')
    with open(DIR_FIXTURES + '/nuxeo_doc_video.json') as fixture:
        nuxeo_metadata = json.load(fixture)
    isShownBy = h._get_isShownBy(nuxeo_metadata)
    self.assertEqual(
        isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
        'ucldc-nuxeo-thumb-media/4c80e254-6def-4230-9f28-bc48878568d4')
def testHarvestControllerExists(self):
    '''Check that a HarvestController can be built for collection 197
    and exposes the expected attributes and s3path.'''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    # NOTE(review): the unescaped '?' makes the trailing 'i' optional
    # rather than matching a literal '?'; left untouched.
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=oai_body)
    collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/197/')
    controller = fetcher.HarvestController(
        '*****@*****.**',
        collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    try:
        self.assertTrue(hasattr(controller, 'fetcher'))
        self.assertIsInstance(controller.fetcher, fetcher.OAIFetcher)
        self.assertTrue(hasattr(controller, 'campus_valid'))
        self.assertTrue(hasattr(controller, 'dc_elements'))
        self.assertTrue(hasattr(controller, 'datetime_start'))
        self.assertEqual(controller.s3path,
                         'data-fetched/197/2017-07-14-1201/')
    finally:
        # Always remove the save directory, even when an assertion fails.
        shutil.rmtree(controller.dir_save)
def setUp(self):
    '''Build an OAI-backed HarvestController for collection 197, load the
    object-set fixture and pin ``datetime.datetime.now()``.'''
    super(HarvestControllerTestCase, self).setUp()
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    # NOTE(review): the unescaped '?' makes the trailing 'i' optional
    # rather than matching a literal '?'; left untouched.
    httpretty.register_uri(httpretty.GET,
                           re.compile("http://content.cdlib.org/oai?.*"),
                           body=oai_body)
    self.collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/197/')
    config_file, profile_path = self.setUp_config(self.collection)
    self.controller_oai = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        profile_path=profile_path,
        config_file=config_file)
    with open(DIR_FIXTURES + '/objset_test_doc.json') as fixture:
        self.objset_test_doc = json.load(fixture)

    class myNow(datetime.datetime):
        '''datetime subclass whose now() always returns 2017-07-14 12:01.'''

        @classmethod
        def now(cls):
            return cls(2017, 7, 14, 12, 1)

    # Keep the real class on self.old_dt (presumably restored in
    # tearDown -- not visible here) before swapping in the fixed clock.
    self.old_dt = datetime.datetime
    datetime.datetime = myNow
def testAddRegistryData(self):
    '''Unittest the _add_registry_data function.

    After the call the object must carry a 'collection' entry whose first
    element holds the collection @id plus 'campus' and 'repository'
    lists, while the object itself gains no top-level 'campus' or
    'repository' key.
    '''
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=collection_body)
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        oai_body = fixture.read()
    # NOTE(review): the unescaped '?' makes the trailing 'i' optional
    # rather than matching a literal '?'; left untouched.
    httpretty.register_uri(httpretty.GET,
                           re.compile("http://content.cdlib.org/oai?.*"),
                           body=oai_body)
    collection = Collection(
        'https://registry.cdlib.org/api/v1/collection/197/')
    self.tearDown_config()  # remove ones setup in setUp
    self.setUp_config(collection)
    controller = fetcher.HarvestController('*****@*****.**',
                                           collection,
                                           config_file=self.config_file,
                                           profile_path=self.profile_path)
    obj = {'id': 'fakey', 'otherdata': 'test'}
    self.assertNotIn('collection', obj)
    controller._add_registry_data(obj)
    self.assertIn('collection', obj)
    registry_entry = obj['collection'][0]
    self.assertEqual(registry_entry['@id'],
                     'https://registry.cdlib.org/api/v1/collection/197/')
    self.assertNotIn('campus', obj)
    self.assertIn('campus', registry_entry)
    self.assertNotIn('repository', obj)
    self.assertIn('repository', registry_entry)
    # need to test one without campus
    self.assertEqual(registry_entry['campus'][0]['@id'],
                     'https://registry.cdlib.org/api/v1/campus/12/')
    self.assertEqual(registry_entry['repository'][0]['@id'],
                     'https://registry.cdlib.org/api/v1/repository/37/')
def testIterateOverResults(self):
    '''Test the iteration over a mock set of data.

    Four canned Solr JSON pages are served in order for GETs to the query
    endpoint; iterating the fetcher must yield 10 records.
    '''
    pages = []
    for page_num in range(4):
        fixture_path = (
            DIR_FIXTURES +
            '/ucsd-new-feed-missions-bb3038949s-%d.json' % page_num)
        # Close each fixture as soon as its content has been read.
        with open(fixture_path) as fixture:
            pages.append(httpretty.Response(body=fixture.read()))
    httpretty.register_uri(
        httpretty.GET,
        'http://example.edu/solr/query',
        responses=pages)
    # Constructing the fetcher without arguments must fail.
    self.assertRaises(TypeError, fetcher.PySolrFetcher)
    h = fetcher.PySolrQueryFetcher('http://example.edu/solr', 'extra_data',
                                   rows=3)
    self.assertEqual(
        h._query_path,
        'query?q=extra_data&sort=id+asc&cursorMark=%2A&wt=json&rows=3')
    n = 0
    for r in h:
        n += 1
    self.assertEqual(n, 10)
    # `r` is the last record produced by the loop above.
    self.assertEqual(['Mission Santa Ynez'], r['title_tesim'])
def setUp(self):
    '''Serve the canned OAI document and instantiate the OAIFetcher.'''
    super(OAIFetcherTestCase, self).setUp()
    # Read via a context manager so the fixture handle gets closed.
    with open(DIR_FIXTURES + '/testOAI.xml') as fixture:
        oai_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://content.cdlib.org/oai',
        body=oai_body)
    self.fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                      'oac:images')
def setUp(self):
    '''Mock the registry API root document and create the Registry.'''
    # JSON listing the campus, collection and repository endpoints.
    api_root = (
        '{"campus": {"list_endpoint": "/api/v1/campus/", '
        '"schema": "/api/v1/campus/schema/"}, '
        '"collection": {"list_endpoint": "/api/v1/collection/", '
        '"schema": "/api/v1/collection/schema/"}, '
        '"repository": {"list_endpoint": "/api/v1/repository/", '
        '"schema": "/api/v1/repository/schema/"}}')
    httpretty.register_uri(httpretty.GET,
                           'https://registry.cdlib.org/api/v1/',
                           body=api_root)
    self.registry = Registry()
def testCollectionNoEnrichItems(self):
    '''Accessing dpla_profile_obj must raise ValueError for the
    "no enrich item" collection fixture.'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES +
              '/collection_api_no_enrich_item.json') as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/36/",
        body=collection_body)
    c = Collection("https://registry.cdlib.org/api/v1/collection/36/")
    with self.assertRaises(ValueError):
        c.dpla_profile_obj
def testCMISFetch(self):
    '''The CMIS atom feed fetcher must expose 42 objects for the canned
    descendants feed.'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/cmis-atom-descendants.xml') as fixture:
        feed_body = fixture.read()
    httpretty.register_uri(httpretty.GET,
                           'http://cmis-atom-endpoint/descendants',
                           body=feed_body)
    h = fetcher.CMISAtomFeedFetcher(
        'http://cmis-atom-endpoint/descendants', 'uname, pswd')
    self.assertTrue(hasattr(h, 'objects'))
    self.assertEqual(42, len(h.objects))
def testEnrichDoc(self):
    '''akara_enrich_doc must return the document served by the mocked
    enrich endpoint (added key and changed title).'''
    # Both fixtures are read through context managers so they get closed.
    with open(DIR_FIXTURES + '/akara_response.json') as fixture:
        akara_body = fixture.read()
    httpretty.register_uri(httpretty.POST,
                           'http://localhost:8889/enrich',
                           body=akara_body, )
    with open(DIR_FIXTURES + '/couchdb_doc.json') as fixture:
        indoc = json.load(fixture)
    doc = akara_enrich_doc(
        indoc, '/select-oac-id,/dpla_mapper?mapper_type=oac_dc')
    self.assertIn('added-key', doc['sourceResource'])
    self.assertEqual(doc['sourceResource']['title'], 'changed title')
def testCollectionNoEnrichItems(self):
    '''A collection fixture without enrichment items must make
    dpla_profile_obj raise ValueError.'''
    fixture_path = DIR_FIXTURES + '/collection_api_no_enrich_item.json'
    # Read via a context manager so the fixture handle gets closed.
    with open(fixture_path) as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/36/",
        body=collection_body)
    c = Collection("https://registry.cdlib.org/api/v1/collection/36/")
    with self.assertRaises(ValueError):
        c.dpla_profile_obj
def setUp(self):
    '''Register the first OAC JSON result page and create the fetcher.'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/testOAC-url_next-0.json') as fixture:
        first_page = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/hb5d5nb7dj',
        body=first_page)
    super(OAC_JSON_FetcherTestCase, self).setUp()
    self.fetcher = fetcher.OAC_JSON_Fetcher(
        'http://dsc.cdlib.org/search?rmode=json&facet=type-tab&'
        'style=cui&relation=ark:/13030/hb5d5nb7dj', 'extra_data')
def testHarvestIsIter(self):
    '''The fetcher must be its own iterator and support next_record()
    and next() once the second result page is registered.'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
        second_page = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/hb5d5nb7dj&startDoc=26',
        body=second_page)
    self.assertTrue(hasattr(self.fetcher, '__iter__'))
    self.assertEqual(self.fetcher, self.fetcher.__iter__())
    self.fetcher.next_record()
    self.fetcher.next()
def testBadOACSearch(self):
    '''Constructing an OAC_XML_Fetcher against the bad-search response
    must raise ValueError.'''
    bad_url = ('http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
               'relation=ark:/13030/hb5d5nb7dj--xxxx')
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/testOAC-badsearch.xml') as fixture:
        bad_body = fixture.read()
    httpretty.register_uri(httpretty.GET, bad_url, body=bad_body)
    self.assertRaises(ValueError, fetcher.OAC_XML_Fetcher, bad_url,
                      'extra_data')
def setUp(self):
    '''Register the first OAC XML result page and create the fetcher.'''
    search_url = ('http://dsc.cdlib.org/search?facet=type-tab&style=cui&'
                  'raw=1&relation=ark:/13030/tf0c600134')
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/testOAC-url_next-0.xml') as fixture:
        first_page = fixture.read()
    httpretty.register_uri(httpretty.GET, search_url, body=first_page)
    super(OAC_XML_FetcherTestCase, self).setUp()
    self.fetcher = fetcher.OAC_XML_Fetcher(search_url, 'extra_data')
def testBadOACSearch(self):
    '''A bad OAC search response must make OAC_XML_Fetcher raise
    ValueError on construction.'''
    search_url = ('http://dsc.cdlib.org/search?facet=type-tab&style=cui&'
                  'raw=1&relation=ark:/13030/hb5d5nb7dj--xxxx')
    # Read via a context manager so the fixture handle gets closed.
    with open(DIR_FIXTURES + '/testOAC-badsearch.xml') as fixture:
        response_body = fixture.read()
    httpretty.register_uri(httpretty.GET, search_url, body=response_body)
    self.assertRaises(ValueError, fetcher.OAC_XML_Fetcher, search_url,
                      'extra_data')
def setUp(self):
    '''Serve page 0 of the OAC JSON search and build the JSON fetcher.'''
    # Read via a context manager so the fixture handle gets closed.
    with open(DIR_FIXTURES + '/testOAC-url_next-0.json') as fixture:
        page_zero = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/hb5d5nb7dj',
        body=page_zero)
    super(OAC_JSON_FetcherTestCase, self).setUp()
    self.fetcher = fetcher.OAC_JSON_Fetcher(
        'http://dsc.cdlib.org/search?rmode=json&facet=type-tab&'
        'style=cui&relation=ark:/13030/hb5d5nb7dj', 'extra_data')
def testHarvestIsIter(self):
    '''Check the iterator protocol on the fetcher: it has __iter__,
    returns itself from it, and next_record()/next() can be called.'''
    # Read via a context manager so the fixture handle gets closed.
    with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
        page_one = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/hb5d5nb7dj&startDoc=26',
        body=page_one)
    self.assertTrue(hasattr(self.fetcher, '__iter__'))
    self.assertEqual(self.fetcher, self.fetcher.__iter__())
    self.fetcher.next_record()
    self.fetcher.next()
def testCMISFetch(self):
    '''Fetching the canned CMIS descendants feed must yield 42 objects.'''
    endpoint = 'http://cmis-atom-endpoint/descendants'
    # Read via a context manager so the fixture handle gets closed.
    with open(DIR_FIXTURES + '/cmis-atom-descendants.xml') as fixture:
        descendants_body = fixture.read()
    httpretty.register_uri(httpretty.GET, endpoint, body=descendants_body)
    h = fetcher.CMISAtomFeedFetcher(endpoint, 'uname, pswd')
    self.assertTrue(hasattr(h, 'objects'))
    self.assertEqual(42, len(h.objects))
def testResourceIteratorOnePage(self):
    '''Test when less than one page worth of objects fetched'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/registry_api_campus.json') as fixture:
        campus_body = fixture.read()
    httpretty.register_uri(httpretty.GET,
                           'https://registry.cdlib.org/api/v1/campus/',
                           body=campus_body)
    # Drain the iterator; list() replaces the manual append loop.
    campuses = list(self.registry.resource_iter('campus'))
    self.assertEqual(len(campuses), 10)
    self.assertEqual(campuses[0]['slug'], 'UCB')
def testNuxeoHarvest(self, mock_deepharvest, mock_boto, mock_boto3):
    '''Test the function of the Nuxeo harvest.

    Mocks the boto key contents, the deep-harvest object list and the
    registry/Nuxeo HTTP endpoints, runs ``harvest()`` and inspects the
    first saved record. ``mock_boto3`` is not used in the body.
    '''
    with open(DIR_FIXTURES + '/nuxeo_media_structmap.json') as fixture:
        media_json = fixture.read()
    key_mock = (mock_boto.return_value.get_bucket.return_value.
                get_key.return_value)
    key_mock.get_contents_as_string.return_value = media_json
    with open(DIR_FIXTURES + '/collection_api_test_nuxeo.json') as fixture:
        collection_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://registry.cdlib.org/api/v1/collection/19/',
        body=collection_body)
    with open(DIR_FIXTURES + '/nuxeo_object_list.json') as fixture:
        mock_deepharvest.return_value.fetch_objects.return_value = \
            json.load(fixture)
    with open(DIR_FIXTURES + '/nuxeo_doc.json') as fixture:
        nuxeo_doc_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        re.compile('https://example.edu/Nuxeo/site/api/v1/id/.*'),
        body=nuxeo_doc_body)
    self.collection = Collection(
        'http://registry.cdlib.org/api/v1/collection/19/')
    # NOTE(review): the extent of this patch block was reconstructed from
    # whitespace-collapsed source; confirm that only config setup and
    # controller construction need SafeConfigParser patched.
    with patch(
            'ConfigParser.SafeConfigParser',
            autospec=True) as mock_configparser:
        config_inst = mock_configparser.return_value
        config_inst.get.return_value = 'dublincore,ucldc_schema,picture'
        self.setUp_config(self.collection)
        self.controller = fetcher.HarvestController(
            '*****@*****.**',
            self.collection,
            config_file=self.config_file,
            profile_path=self.profile_path)
    self.assertTrue(hasattr(self.controller, 'harvest'))
    num = self.controller.harvest()
    self.assertEqual(num, 5)
    self.tearDown_config()
    # verify one record has collection and such filled in
    fname = os.listdir(self.controller.dir_save)[0]
    with open(os.path.join(self.controller.dir_save, fname)) as saved:
        saved_objset = json.load(saved)
    saved_obj = saved_objset[0]
    saved_collection = saved_obj['collection'][0]
    self.assertEqual(saved_collection['@id'],
                     u'http://registry.cdlib.org/api/v1/collection/19/')
    self.assertEqual(saved_collection['name'],
                     u'Cochems (Edward W.) Photographs')
    self.assertEqual(saved_collection['title'],
                     u'Cochems (Edward W.) Photographs')
    self.assertEqual(saved_collection['id'], u'19')
    self.assertEqual(saved_collection['dcmi_type'], 'I')
    self.assertEqual(saved_collection['rights_statement'],
                     'a sample rights statement')
    self.assertEqual(saved_collection['rights_status'], 'PD')
    self.assertEqual(saved_obj['state'], 'project')
    self.assertEqual(
        saved_obj['title'],
        'Adeline Cochems having her portrait taken by her father '
        'Edward W, Cochems in Santa Ana, California: Photograph')
def testResourceIteratorReturnsCollection(self):
    '''Test that the resource iterator returns a Collection object
    for library collection resources'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/registry_api_collection.json') as fixture:
        collection_list_body = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'https://registry.cdlib.org/api/v1/collection/',
        body=collection_list_body)
    riter = self.registry.resource_iter('collection')
    c = riter.next()
    self.assertIsInstance(c, Collection)
    self.assertTrue(hasattr(c, 'auth'))
    self.assertIsNone(c.auth)
def testOAC_JSON_FetcherReturnedData(self):
    '''Test that the data returned by the OAC Fetcher is a proper dc
    dictionary.'''
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
        second_page = fixture.read()
    httpretty.register_uri(
        httpretty.GET,
        'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
        'relation=ark:/13030/hb5d5nb7dj&startDoc=26',
        body=second_page)
    rec = self.fetcher.next()[0]
    self.assertIsInstance(rec, dict)
def test_fetching(self):
    '''Iterate a two-page playlist and check that six videos come back,
    the first one matching the canned video document.'''
    url = 'https://example.edu'
    playlist_id = 'testplaylist'
    page_size = 3
    url_first = fetcher.YouTube_Fetcher.url_playlistitems.format(
        api_key='boguskey',
        page_size=page_size,
        playlist_id=playlist_id,
        page_token='')
    url_vids = fetcher.YouTube_Fetcher.url_video
    # Fixtures are read up front through context managers so no file
    # handle is left open.
    with open(DIR_FIXTURES + '/youtube_playlist_with_next.json') as fixture:
        playlist_with_next = fixture.read()
    with open(DIR_FIXTURES + '/youtube_playlist_no_next.json') as fixture:
        playlist_no_next = fixture.read()
    with open(DIR_FIXTURES + '/youtube_video.json') as fixture:
        video_body = fixture.read()
    # Ugly but works
    httpretty.register_uri(
        httpretty.GET,
        url_first,
        responses=[
            httpretty.Response(body=playlist_with_next, status=200),
            httpretty.Response(body=playlist_no_next, status=200),
        ])
    # The same video document answers both video lookups.
    httpretty.register_uri(
        httpretty.GET,
        url_vids,
        responses=[
            httpretty.Response(body=video_body, status=200),
            httpretty.Response(body=video_body, status=200),
        ])
    h = fetcher.YouTube_Fetcher(url, playlist_id, page_size=page_size)
    vids = []
    for v in h:
        vids.extend(v)
    self.assertEqual(len(vids), 6)
    self.assertEqual(vids[0], {
        u'contentDetails': {
            u'definition': u'sd',
            u'projection': u'rectangular',
            u'caption': u'false',
            u'duration': u'PT19M35S',
            u'licensedContent': True,
            u'dimension': u'2d'
        },
        u'kind': u'youtube#video',
        u'etag':
        u'"m2yskBQFythfE4irbTIeOgYYfBU/-3AtVAYcRLEynWZprpf0OGaY8zo"',
        u'id': u'0Yx8zrbsUu8'
    })
def testOverrideMetadataPrefix(self):
    '''Test that the metadataPrefix for an OAI feed can be overridden.

    The extra_data for OAI can be either just a set spec or an html query
    string of set= &metadataPrefix=
    '''
    oai_url = 'http://content.cdlib.org/oai'

    # First pass: default prefix, only a set given in the query string.
    with open(DIR_FIXTURES + '/testOAI.xml') as fixture:
        default_body = fixture.read()
    httpretty.register_uri(httpretty.GET, oai_url, body=default_body)
    set_fetcher = fetcher.OAIFetcher(oai_url, 'set=oac:images')
    self.assertEqual(set_fetcher._set, 'oac:images')
    rec = set_fetcher.next()
    self.assertIsInstance(rec, dict)
    self.assertIn('id', rec)
    self.assertEqual(rec['id'], '13030/hb796nb5mn')
    self.assertIn('datestamp', rec)
    # NOTE(review): with this argument order the check is "datestamp is a
    # substring of '2005-12-13'"; assertEqual may have been intended --
    # confirm against the fixture before tightening.
    self.assertIn(rec['datestamp'], '2005-12-13')
    self.assertEqual(httpretty.last_request().querystring,
                     {u'verb': [u'ListRecords'],
                      u'set': [u'oac:images'],
                      u'metadataPrefix': [u'oai_dc']})

    # Second pass: metadataPrefix overridden to didl.
    with open(DIR_FIXTURES + '/testOAI-didl.xml') as fixture:
        didl_body = fixture.read()
    httpretty.register_uri(httpretty.GET, oai_url, body=didl_body)
    didl_fetcher = fetcher.OAIFetcher(
        oai_url, 'set=oac:images&metadataPrefix=didl')
    self.assertEqual(didl_fetcher._set, 'oac:images')
    self.assertEqual(didl_fetcher._metadataPrefix, 'didl')
    rec = didl_fetcher.next()
    self.assertIsInstance(rec, dict)
    self.assertIn('id', rec)
    self.assertEqual(rec['id'], 'oai:ucispace-prod.lib.uci.edu:10575/25')
    self.assertEqual(rec['title'], ['Schedule of lectures'])
    self.assertIn('datestamp', rec)
    self.assertEqual(rec['datestamp'], '2015-05-20T11:04:23Z')
    self.assertEqual(httpretty.last_request().querystring,
                     {u'verb': [u'ListRecords'],
                      u'set': [u'oac:images'],
                      u'metadataPrefix': [u'didl']})
    self.assertEqual(rec['Resource']['@ref'],
                     'http://ucispace-prod.lib.uci.edu/xmlui/bitstream/' +
                     '10575/25/1/!COLLOQU.IA.pdf')
    self.assertEqual(rec['Item']['@id'],
                     'uuid-640925bd-9cdf-46be-babb-b2138c3fce9c')
    self.assertEqual(rec['Component']['@id'],
                     'uuid-897984d8-9392-4a68-912f-ffdf6fd7ce59')
    self.assertIn('Descriptor', rec)
    self.assertEqual(rec['Statement']['@mimeType'],
                     'application/xml; charset=utf-8')
    self.assertEqual(
        rec['DIDLInfo']
        ['{urn:mpeg:mpeg21:2002:02-DIDL-NS}DIDLInfo'][0]['text'],
        '2015-05-20T20:30:26Z')
    del didl_fetcher
def testAmpersandInDoc(self):
    '''The XML fetcher must report 25 total docs, start at doc 0 and
    return a page for the utf8-content fixture without raising.'''
    search_url = ('http://dsc.cdlib.org/search?facet=type-tab&style=cui&'
                  'raw=1&relation=ark:/13030/hb5d5nb7dj')
    # Context manager closes the fixture instead of leaking the handle.
    with open(DIR_FIXTURES + '/testOAC-utf8-content.xml') as fixture:
        utf8_body = fixture.read()
    httpretty.register_uri(httpretty.GET, search_url, body=utf8_body)
    h = fetcher.OAC_XML_Fetcher(search_url, 'extra_data')
    self.assertEqual(h.totalDocs, 25)
    self.assertEqual(h.currentDoc, 0)
    h.next()