def testAddRegistryData(self):
    '''Unittest the _add_registry_data function.

    Registers canned registry and OAI responses, builds a controller for
    collection 197 and checks that ``_add_registry_data`` stores campus and
    repository data under the object's ``collection`` entry and not at the
    top level of the object.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    collection = Collection(url_collection)
    self.tearDown_config()  # remove ones setup in setUp
    self.setUp_config(collection)
    controller = fetcher.HarvestController('*****@*****.**',
                                           collection,
                                           config_file=self.config_file,
                                           profile_path=self.profile_path)
    obj = {'id': 'fakey', 'otherdata': 'test'}
    self.assertNotIn('collection', obj)
    controller._add_registry_data(obj)
    self.assertIn('collection', obj)
    registry_data = obj['collection'][0]
    self.assertEqual(registry_data['@id'], url_collection)
    # campus/repository must be nested in the collection, not on the object
    self.assertNotIn('campus', obj)
    self.assertIn('campus', registry_data)
    self.assertNotIn('repository', obj)
    self.assertIn('repository', registry_data)
    # TODO: need to test one without campus
    self.assertEqual(registry_data['campus'][0]['@id'],
                     'https://registry.cdlib.org/api/v1/campus/12/')
    self.assertEqual(registry_data['repository'][0]['@id'],
                     'https://registry.cdlib.org/api/v1/repository/37/')
def testLoggingMoreThan1000(self):
    '''Check the progress log lines emitted while harvesting 2400 records.

    The assertions expect 13 log records in total, with a message at 100
    records (index 1) and messages for 1000, 2000 and 2400 records at
    indexes 10, 11 and 12.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/198/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_big_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-2400-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    collection = Collection(url_collection)
    controller = fetcher.HarvestController('*****@*****.**',
                                           collection,
                                           config_file=self.config_file,
                                           profile_path=self.profile_path)
    try:
        controller.harvest()
        self.assertEqual(len(self.test_log_handler.records), 13)
        formatted = self.test_log_handler.formatted_records
        self.assertEqual(formatted[1],
                         '[INFO] HarvestController: 100 records harvested')
        self.assertEqual(formatted[10],
                         '[INFO] HarvestController: 1000 records harvested')
        self.assertEqual(formatted[11],
                         '[INFO] HarvestController: 2000 records harvested')
        self.assertEqual(formatted[12],
                         '[INFO] HarvestController: 2400 records harvested')
    finally:
        # Always remove the harvest output, even when an assertion fails.
        shutil.rmtree(controller.dir_save)
def testCreateProfile(self):
    '''Test the creation of a DPLA style profile file.

    Checks both the serialized ``dpla_profile`` string and the
    ``dpla_profile_obj`` dict built for collection 178.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/178'
    # Fixture is read inside ``with`` so the file handle gets closed.
    with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    c = Collection(url_collection)
    self.assertTrue(hasattr(c, 'dpla_profile'))
    self.assertIsInstance(c.dpla_profile, str)
    j = json.loads(c.dpla_profile)
    self.assertEqual(j['name'], '178')
    self.assertEqual(j['enrichments_coll'], ['/compare_with_schema'])
    self.assertIn('enrichments_item', j)
    self.assertIsInstance(j['enrichments_item'], list)
    self.assertEqual(len(j['enrichments_item']), 30)
    self.assertIn('contributor', j)
    self.assertIsInstance(j['contributor'], list)
    self.assertEqual(len(j['contributor']), 4)
    self.assertEqual(j['contributor'][1], {
        u'@id': u'/api/v1/campus/1/',
        u'name': u'UCB'
    })
    self.assertTrue(hasattr(c, 'dpla_profile_obj'))
    self.assertIsInstance(c.dpla_profile_obj, dict)
    self.assertIsInstance(c.dpla_profile_obj['enrichments_item'], list)
    e = c.dpla_profile_obj['enrichments_item']
    self.assertEqual(e[0], '/oai-to-dpla')
    self.assertEqual(
        e[1],
        '/shred?prop=sourceResource/contributor%2CsourceResource/creator%2CsourceResource/date'
    )
def testMainCreatesCollectionProfile(self, mock_boto3):
    '''Test that the main function produces a collection profile
    file for DPLA. The path to this file is needed when creating a
    DPLA ingestion document.
    '''
    url_collection = "https://registry.cdlib.org/api/v1/collection/197/"
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    # Result intentionally unused; the call itself is kept from the
    # original test.
    Collection(url_collection)
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(ingest_doc_id, 'test-id')
    self.assertEqual(num, 128)
    # os.path.join() with a single argument returns it unchanged, so the
    # path is checked directly.
    self.assertTrue(os.path.exists(self.profile_path))
def testMainHarvestController__init__Error(self, mock_method):
    '''Test the try-except block in main when HarvestController not
    created correctly.

    Expects ``fetcher.main`` to raise, and the fourth log record to be the
    harvester-init error carrying the "Boom!" text.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    sys.argv = ['thisexe', '*****@*****.**', url_collection]
    self.assertRaises(Exception,
                      fetcher.main,
                      self.user_email,
                      self.url_api_collection,
                      log_handler=self.test_log_handler,
                      mail_handler=self.test_log_handler,
                      dir_profile=self.dir_test_profile)
    self.assertEqual(len(self.test_log_handler.records), 4)
    error_record = self.test_log_handler.formatted_records[3]
    # assertIn reports both operands when the check fails
    self.assertIn("[ERROR] HarvestMain: Exception in harvester init",
                  error_record)
    self.assertIn("Boom!", error_record)
    # remove the profile file written for this collection
    c = Collection(url_collection)
    os.remove(
        os.path.abspath(os.path.join(self.dir_test_profile, c.id + '.pjs')))
def update_collection_from_remote(url_remote_couchdb,
                                  url_api_collection,
                                  delete_first=True):
    '''Update a collection from a remote couchdb.

    :param url_remote_couchdb: URL of the couchdb to copy documents from
    :param url_api_collection: registry API URL of the collection
    :param delete_first: when true, delete_collection() is called for the
        collection before any document is copied
    :returns: tuple of (number of doc ids, number updated, number created)
    :raises Exception: when DATA_BRANCH contains 'prod' and the collection
        is not ready_for_publication
    '''
    collection = Collection(url_api_collection)
    # guard against updating production for not ready_for_publication
    # collections. This runs BEFORE the delete so a rejected collection is
    # left untouched instead of being deleted and then refused.
    if 'prod' in environ.get('DATA_BRANCH', ''):
        if not collection.ready_for_publication:
            raise Exception(
                'In PRODUCTION ENV and collection {} not ready for '
                'publication'.format(collection.id))
    if delete_first:
        # Last path segment of the URL; stripping the slash first makes
        # this work with or without a trailing '/'.
        delete_collection(url_api_collection.rstrip('/').rsplit('/', 1)[1])
    doc_ids = get_collection_doc_ids(collection.id, url_remote_couchdb)
    couchdb_remote = get_couchdb(url_remote_couchdb)
    couchdb_env = get_couchdb()
    created = 0
    updated = 0
    for doc_id in doc_ids:
        msg = update_from_remote(doc_id,
                                 couchdb_remote=couchdb_remote,
                                 couchdb_env=couchdb_env)
        # any message without 'created' is counted as an update
        if 'created' in msg:
            created += 1
        else:
            updated += 1
    return len(doc_ids), updated, created
def setUp(self):
    '''Build an OAI harvest controller for collection 197 and freeze
    ``datetime.datetime.now``.

    The original ``datetime.datetime`` class is kept on ``self.old_dt``.
    '''
    super(HarvestControllerTestCase, self).setUp()
    url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    self.collection = Collection(url_collection)
    config_file, profile_path = self.setUp_config(self.collection)
    self.controller_oai = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        profile_path=profile_path,
        config_file=config_file)
    with open(DIR_FIXTURES + '/objset_test_doc.json') as fixture:
        self.objset_test_doc = json.load(fixture)

    class myNow(datetime.datetime):
        '''datetime subclass whose now() always returns 2017-07-14 12:01'''

        @classmethod
        def now(cls):
            return cls(2017, 7, 14, 12, 1)

    # Swap the class on the datetime module, keeping the real one around.
    self.old_dt = datetime.datetime
    datetime.datetime = myNow
def testCollectionNoEnrichItems(self):
    '''Accessing dpla_profile_obj must raise ValueError for the
    collection served from the no-enrich-item fixture.
    '''
    url_collection = "https://registry.cdlib.org/api/v1/collection/36/"
    # Fixture is read inside ``with`` so the file handle gets closed.
    with open(DIR_FIXTURES + '/collection_api_no_enrich_item.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    c = Collection(url_collection)
    with self.assertRaises(ValueError):
        c.dpla_profile_obj
def testNuxeoHarvest(self, mock_deepharvest, mock_boto, mock_boto3):
    '''Test the function of the Nuxeo harvest.

    Harvests the mocked Nuxeo collection 19, expects 5 records and checks
    the collection data written into the first saved object.
    '''
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/nuxeo_media_structmap.json') as fixture:
        media_json = fixture.read()
    bucket = mock_boto.return_value.get_bucket.return_value
    bucket.get_key.return_value.get_contents_as_string.return_value = \
        media_json
    url_collection = 'http://registry.cdlib.org/api/v1/collection/19/'
    with open(DIR_FIXTURES + '/collection_api_test_nuxeo.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/nuxeo_object_list.json') as fixture:
        mock_deepharvest.return_value.fetch_objects.return_value = \
            json.load(fixture)
    with open(DIR_FIXTURES + '/nuxeo_doc.json') as fixture:
        httpretty.register_uri(
            httpretty.GET,
            re.compile('https://example.edu/Nuxeo/site/api/v1/id/.*'),
            body=fixture.read())
    self.collection = Collection(url_collection)
    # NOTE(review): the extent of this ``with`` block was reconstructed
    # from a whitespace-mangled source; confirm that only the controller
    # set-up needs the patched SafeConfigParser.
    with patch(
            'ConfigParser.SafeConfigParser',
            autospec=True) as mock_configparser:
        config_inst = mock_configparser.return_value
        config_inst.get.return_value = 'dublincore,ucldc_schema,picture'
        self.setUp_config(self.collection)
        self.controller = fetcher.HarvestController(
            '*****@*****.**',
            self.collection,
            config_file=self.config_file,
            profile_path=self.profile_path)
    self.assertTrue(hasattr(self.controller, 'harvest'))
    num = self.controller.harvest()
    self.assertEqual(num, 5)
    self.tearDown_config()
    # verify one record has collection and such filled in
    fname = os.listdir(self.controller.dir_save)[0]
    with open(os.path.join(self.controller.dir_save, fname)) as saved:
        saved_objset = json.load(saved)
    saved_obj = saved_objset[0]
    saved_collection = saved_obj['collection'][0]
    self.assertEqual(saved_collection['@id'],
                     u'http://registry.cdlib.org/api/v1/collection/19/')
    self.assertEqual(saved_collection['name'],
                     u'Cochems (Edward W.) Photographs')
    self.assertEqual(saved_collection['title'],
                     u'Cochems (Edward W.) Photographs')
    self.assertEqual(saved_collection['id'], u'19')
    self.assertEqual(saved_collection['dcmi_type'], 'I')
    self.assertEqual(saved_collection['rights_statement'],
                     'a sample rights statement')
    self.assertEqual(saved_collection['rights_status'], 'PD')
    self.assertEqual(saved_obj['state'], 'project')
    self.assertEqual(
        saved_obj['title'],
        'Adeline Cochems having her portrait taken by her father '
        'Edward W, Cochems in Santa Ana, California: Photograph')
def main(url_remote_couchdb, url_api_collection):
    '''Sync a collection from a remote couchdb into the current
    environment's couchdb, then publish a summary of the counts.
    '''
    collection = Collection(url_api_collection)
    counts = update_collection_from_remote(url_remote_couchdb,
                                           url_api_collection)
    total, updated, created = counts
    summary_lines = (
        'Synced {} documents to production for CouchDB collection {}'.format(
            total, collection.id),
        'Updated {} documents, created {} documents.'.format(
            updated, created),
    )
    subject = 'Synced CouchDB Collection {}'.format(collection.id)
    publish_to_harvesting(subject, '\n'.join(summary_lines))
def get_id_on_queue_and_run(queue):
    '''Take collection ids off ``queue`` one at a time and run
    fix_registry_data over each collection's couchdb documents.

    The loop continues while get_nowait() returns a truthy id.
    NOTE(review): if ``queue`` is a standard library queue, get_nowait()
    raises when the queue is empty rather than returning a falsy value --
    confirm how callers expect this loop to terminate.
    '''
    worker = CouchDBWorker()
    collection_id = queue.get_nowait()
    while collection_id:
        registry_collection = Collection(url_api_base + collection_id)
        controller = HarvestController('*****@*****.**', registry_collection)
        # Only the registry 'collection' data is needed from the controller.
        couch_collection = controller._add_registry_data({})['collection']
        del controller
        print("STARTING COLLECTION: {}".format(collection_id))
        worker.run_by_collection(collection_id, fix_registry_data,
                                 couch_collection, worker._couchdb)
        print("FINISHED COLLECTION: {}".format(collection_id))
        collection_id = queue.get_nowait()
def update_collection_description(doc):
    '''Fill in a missing collection description on a couchdb document.

    When the first collection entry of ``originalRecord`` has no
    'description' key, the collection is looked up (through C_CACHE, keyed
    by the collection '@id') and its description -- or its name when the
    description is empty -- is written to both the originalRecord and the
    sourceResource collection entries.

    :param doc: document dict; modified in place
    :returns: the same ``doc`` object
    '''
    cjson = doc['originalRecord']['collection'][0]
    # get collection description
    if 'description' not in cjson:
        if cjson['@id'] in C_CACHE:
            c = C_CACHE[cjson['@id']]
        else:
            c = Collection(url_api=cjson['@id'])
            C_CACHE[cjson['@id']] = c
        description = c['description'] if c['description'] else c['name']
        # Print the value actually stored: c['description'] may be None,
        # which is exactly the case the name fallback above handles.
        print('DOC: {} DESCRIP: {}'.format(doc['_id'],
                                           description.encode('utf8')))
        doc['originalRecord']['collection'][0]['description'] = description
        source = doc['sourceResource']
        if 'collection' in source:
            source['collection'][0]['description'] = description
        else:
            # same fallback add_rights_and_type_to_collection uses
            source['collection'] = doc['originalRecord']['collection']
    return doc
def testOAICollectionAPI(self):
    '''Check item and attribute access on a Collection built from the OAI
    collection fixture.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/197'
    # Fixture is read inside ``with`` so the file handle gets closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    c = Collection(url_collection)
    name = 'Calisphere - Santa Clara University: Digital Objects'
    # each value is checked through both c[...] and attribute access
    self.assertEqual(c['harvest_type'], 'OAI')
    self.assertEqual(c.harvest_type, 'OAI')
    self.assertEqual(c['name'], name)
    self.assertEqual(c.name, name)
    self.assertEqual(c['url_oai'], 'fixtures/testOAI-128-records.xml')
    self.assertEqual(c.url_oai, 'fixtures/testOAI-128-records.xml')
    self.assertEqual(c.campus[0]['resource_uri'], '/api/v1/campus/12/')
    self.assertEqual(c.campus[0]['slug'], 'UCDL')
def testOACApiCollection(self):
    '''Check item and attribute access on a Collection built from the OAC
    collection fixture, including the rights and type fields.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/178'
    # Fixture is read inside ``with`` so the file handle gets closed.
    with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    c = Collection(url_collection)
    # each value is checked through both c[...] and attribute access
    self.assertEqual(c['harvest_type'], 'OAJ')
    self.assertEqual(c.harvest_type, 'OAJ')
    self.assertEqual(c['name'], 'Harry Crosby Collection')
    self.assertEqual(c.name, 'Harry Crosby Collection')
    self.assertEqual(c['url_oac'], 'fixtures/testOAC.json')
    self.assertEqual(c.url_oac, 'fixtures/testOAC.json')
    self.assertEqual(c.campus[0]['resource_uri'], '/api/v1/campus/6/')
    self.assertEqual(c.campus[0]['slug'], 'UCSD')
    self.assertEqual(c.dcmi_type, 'I')
    self.assertEqual(c.rights_statement, "a sample rights statement")
    self.assertEqual(c.rights_status, "PD")
def setUp(self):
    '''Prepare the profile directory, sys.argv, the collection 197
    fixture, the test config and a mail TestHandler for the main() tests.
    '''
    super(MainTestCase, self).setUp()
    self.dir_test_profile = '/tmp/profiles/test'
    self.dir_save = None
    if not os.path.isdir(self.dir_test_profile):
        os.makedirs(self.dir_test_profile)
    self.user_email = '*****@*****.**'
    self.url_api_collection = \
        "https://registry.cdlib.org/api/v1/collection/197/"
    # Fixture is read inside ``with`` so the file handle gets closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               self.url_api_collection,
                               body=fixture.read())
    sys.argv = ['thisexe', self.user_email, self.url_api_collection]
    self.collection = Collection(self.url_api_collection)
    self.setUp_config(self.collection)
    self.mail_handler = logbook.TestHandler(bubble=True)
    self.mail_handler.push_thread()
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker.

    :param collection_ids: ';'-separated string of registry collection ids
    :param rq_queue: name of the queue the jobs are put on
    :param config: mapping read for the 'redis_host', 'redis_port',
        'redis_password' and 'redis_connect_timeout' keys
    :param pynuxrc: passed through to DeepHarvestNuxeo
    :param replace: passed through to queue_deep_harvest_path
    :param timeout: passed through to queue_deep_harvest_path
    :param log_handler: logbook handler; a DEBUG StderrHandler is created
        when none is given
    '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    log = logbook.Logger('QDH')

    def _queue_path(path):
        '''Queue one Nuxeo path with the shared redis/queue settings.'''
        queue_deep_harvest_path(config['redis_host'],
                                config['redis_port'],
                                config['redis_password'],
                                config['redis_connect_timeout'],
                                rq_queue=rq_queue,
                                path=path,
                                replace=replace,
                                timeout=timeout)

    try:
        for cid in collection_ids.split(';'):
            url_api = ''.join(
                ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
            coll = Collection(url_api)
            dh = DeepHarvestNuxeo(coll.harvest_extra_data, '',
                                  pynuxrc=pynuxrc)
            for obj in dh.fetch_objects():
                log.info('Queueing TOPLEVEL {} :-: {}'.format(
                    obj['uid'], obj['path']))
                # deep harvest top level object
                _queue_path(obj['path'])
                # deep harvest component sub-objects
                for c in dh.fetch_components(obj):
                    log.info('Queueing {} :-: {}'.format(c['uid'],
                                                         c['path']))
                    _queue_path(c['path'])
    finally:
        # Pop the handler even when queueing fails part-way through.
        log_handler.pop_application()
def testMARCHarvest(self, mock_boto3):
    '''Test the function of the MARC harvest.

    Points the collection's url_harvest at the local marc-test fixture and
    expects 10 harvested records.
    '''
    url_collection = 'http://registry.cdlib.org/api/v1/collection/'
    # Fixture is read inside ``with`` so the file handle gets closed.
    with open(DIR_FIXTURES + '/collection_api_test_marc.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    self.collection = Collection(url_collection)
    self.collection.url_harvest = 'file:' + DIR_FIXTURES + '/marc-test'
    self.setUp_config(self.collection)
    self.controller = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    self.assertTrue(hasattr(self.controller, 'harvest'))
    num = self.controller.harvest()
    self.assertEqual(num, 10)
    self.tearDown_config()
def testFailsIfNoRecords(self):
    '''Test that the Controller throws an error if no records come back
    from fetcher
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/101/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-no-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    collection = Collection(url_collection)
    controller = fetcher.HarvestController('*****@*****.**',
                                           collection,
                                           config_file=self.config_file,
                                           profile_path=self.profile_path)
    self.assertRaises(fetcher.NoRecordsFetchedException, controller.harvest)
def setUp(self):
    '''Build a harvest controller for collection 178 backed by the OAC
    JSON fixtures.
    '''
    super(HarvestOAC_JSON_ControllerTestCase, self).setUp()
    url_collection = 'https://registry.cdlib.org/api/v1/collection/178/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAC.json') as fixture:
        httpretty.register_uri(
            httpretty.GET,
            'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
            'relation=ark:/13030/tf2v19n928',
            body=fixture.read())
    self.collection = Collection(url_collection)
    self.setUp_config(self.collection)
    self.controller = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
def testOAIHarvest(self):
    '''Test the function of the OAI harvest.

    Only checks that the controller built for the collection exposes a
    ``harvest`` attribute; the harvest itself is not run here.
    '''
    url_collection = 'http://registry.cdlib.org/api/v1/collection/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES+'/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES+'/testOAC-url_next-0.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               'http://content.cdlib.org/oai',
                               body=fixture.read())
    self.collection = Collection(url_collection)
    self.setUp_config(self.collection)
    self.controller = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    self.assertTrue(hasattr(self.controller, 'harvest'))
    # TODO: fix why logbook.TestHandler not working for previous logging
    # self.assertEqual(len(self.test_log_handler.records), 2)
    self.tearDown_config()
def testSaveToS3(self, mock_boto3):
    '''Check that save_objset_s3 writes the object set as one JSON line
    to the 'ucldc-ingest' bucket under the collection's data-fetched key.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    collection = Collection(url_collection)
    controller = fetcher.HarvestController('*****@*****.**',
                                           collection,
                                           config_file=self.config_file,
                                           profile_path=self.profile_path)
    controller.save_objset_s3({"xxxx": "yyyy"})
    mock_boto3.assert_called_with('s3')
    mock_boto3().Bucket.assert_called_with('ucldc-ingest')
    mock_boto3().Bucket().put_object.assert_called_with(
        Body='{"xxxx": "yyyy"}\n',
        Key='data-fetched/197/2017-07-14-1201/page-0.jsonl')
def main(user_email,
         url_api_collections,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file='akara.ini',
         rq_queue=None,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    Queues an image harvest for every collection URL in the
    ';'-separated ``url_api_collections`` string.

    :param user_email: address added to the error-mail recipients
    :param url_api_collections: ';'-separated registry collection URLs
    :param log_handler: logbook handler; a DEBUG StderrHandler is created
        when none is given
    :param mail_handler: logbook handler; an ERROR-level MailHandler is
        created when none is given
    :param config_file: passed to config_harvest()
    :param rq_queue: passed through to queue_image_harvest
    :param kwargs: passed through to queue_image_harvest
    :raises Exception: whatever Collection() raised, after logging it

    ``dir_profile`` and ``profile_path`` are accepted but not used in this
    function body.
    '''
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend(EMAIL_SYS_ADMIN.split(','))
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    mail_handler.push_application()
    config = config_harvest(config_file=config_file)
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    for url_api_collection in url_api_collections.split(';'):
        try:
            collection = Collection(url_api_collection)
        except Exception as e:
            msg = 'Exception in Collection {}, init {}'.format(
                url_api_collection, str(e))
            logbook.error(msg)
            # bare raise keeps the original traceback
            raise
        queue_image_harvest(config['redis_host'],
                            config['redis_port'],
                            config['redis_password'],
                            config['redis_connect_timeout'],
                            rq_queue=rq_queue,
                            collection_key=collection.id,
                            object_auth=collection.auth,
                            **kwargs)
def setUp(self):
    '''Build a harvest controller for collection 178 backed by the OAC
    XML fixtures.
    '''
    super(HarvestOAC_XML_ControllerTestCase, self).setUp()
    url_collection = 'https://registry.cdlib.org/api/v1/collection/178/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test_oac_xml.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAC-url_next-0.xml') as fixture:
        httpretty.register_uri(
            httpretty.GET,
            'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
            'relation=ark:/13030/tf0c600134',
            body=fixture.read())
    self.collection = Collection(url_collection)
    self.setUp_config(self.collection)
    self.controller = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    # Parenthesised single-argument print behaves the same on Python 2
    # and is also valid on Python 3.
    print("DIR SAVE::::: {}".format(self.controller.dir_save))
def setUp(self):
    '''Build a harvest controller for collection 183 backed by the Solr
    fixtures; the Solr select endpoint is registered for POST.
    '''
    super(HarvestSolr_ControllerTestCase, self).setUp()
    url_collection = 'https://registry.cdlib.org/api/v1/collection/183/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_solr_harvest.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES +
              '/ucsd-new-feed-missions-bb3038949s-0.xml') as fixture:
        httpretty.register_uri(httpretty.POST,
                               'http://example.edu/solr/blacklight/select',
                               body=fixture.read())
    self.collection = Collection(url_collection)
    self.setUp_config(self.collection)
    self.controller = fetcher.HarvestController(
        '*****@*****.**',
        self.collection,
        config_file=self.config_file,
        profile_path=self.profile_path)
    # Parenthesised single-argument print behaves the same on Python 2
    # and is also valid on Python 3.
    print("DIR SAVE::::: {}".format(self.controller.dir_save))
def add_rights_and_type_to_collection(doc):
    '''Copy rights_status, rights_statement and dcmi_type from the
    registry collection onto the document's collection entries.

    The collection is looked up through C_CACHE, keyed by the '@id' of the
    first originalRecord collection entry. When sourceResource has no
    'collection' key it is pointed at the originalRecord collection list.

    :param doc: document dict; modified in place
    :returns: the same ``doc`` object
    '''
    original_entry = doc['originalRecord']['collection'][0]
    collection_url = original_entry['@id']
    if collection_url not in C_CACHE:
        C_CACHE[collection_url] = Collection(url_api=collection_url)
    c = C_CACHE[collection_url]
    copied_fields = ('rights_status', 'rights_statement', 'dcmi_type')
    for field in copied_fields:
        doc['originalRecord']['collection'][0][field] = c[field]
    source = doc['sourceResource']
    if 'collection' not in source:
        source['collection'] = doc['originalRecord']['collection']
        return doc
    for field in copied_fields:
        source['collection'][0][field] = c[field]
    return doc
def main(args):
    '''Queue the stored enrichments of one or more collections.

    :param args: argument list; exactly one of ``--collection_id`` or
        ``--cid_file`` is required, ``--rq_queue`` optionally overrides
        the default 'normal-stage' queue
    '''
    parser = argparse.ArgumentParser(
        description='run the enrichments stored for a collection.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--collection_id',
                       help='Registry id for the collection')
    group.add_argument('--cid_file',
                       help='File with collection ids for running')
    parser.add_argument('--rq_queue',
                        help='Override queue for jobs, normal-stage is default')
    args = parser.parse_args(args)
    queue_name = args.rq_queue if args.rq_queue else 'normal-stage'
    enq = CouchDBJobEnqueue(queue_name)
    timeout = 10000
    if args.collection_id:
        cids = [args.collection_id]
    else:  # cid file
        with open(args.cid_file) as cid_file:
            # Skip blank lines (e.g. a trailing newline) so no job is
            # queued for an empty collection id.
            cids = [line.strip() for line in cid_file if line.strip()]
    print("CIDS:{}".format(cids))
    for cid in cids:
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        print(coll.id)
        enrichments = coll.enrichments_item
        enq.queue_collection(
            cid, timeout,
            harvester.post_processing.enrich_existing_couch_doc.main,
            enrichments)
def testNuxeoCollectionAuth(self):
    '''Test that a Nuxeo harvest collection returns an
    authentication tuple, not None
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/19'
    # Fixture is read (and closed) before open() gets patched below.
    with open(DIR_FIXTURES + '/registry_api_collection_nuxeo.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    c = Collection(url_collection)
    # assertTrue(value, 'NUX') treated 'NUX' as the failure message and
    # never compared it; assertEqual performs the intended check.
    self.assertEqual(c.harvest_type, 'NUX')
    defaultrc = """\
[nuxeo_account]
user = TestUser
password = TestPass
[platform_importer]
base = http://localhost:8080/nuxeo/site/fileImporter
"""
    # c.auth is read while open() is patched to return the rc text above
    with patch('__builtin__.open') as fakeopen:
        fakeopen.return_value = StringIO.StringIO(defaultrc)
        self.assertEqual(c.auth[0], 'TestUser')
        self.assertEqual(c.auth[1], 'TestPass')
def testHarvestControllerExists(self):
    '''Check the attributes of a freshly built controller: an OAI
    fetcher, the expected helper attributes and the S3 path.
    '''
    url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
    # Fixtures are read inside ``with`` so the file handles get closed.
    with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
        httpretty.register_uri(httpretty.GET,
                               url_collection,
                               body=fixture.read())
    with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=fixture.read())
    collection = Collection(url_collection)
    controller = fetcher.HarvestController('*****@*****.**',
                                           collection,
                                           config_file=self.config_file,
                                           profile_path=self.profile_path)
    try:
        self.assertTrue(hasattr(controller, 'fetcher'))
        self.assertIsInstance(controller.fetcher, fetcher.OAIFetcher)
        self.assertTrue(hasattr(controller, 'campus_valid'))
        self.assertTrue(hasattr(controller, 'dc_elements'))
        self.assertTrue(hasattr(controller, 'datetime_start'))
        print(controller.s3path)
        self.assertEqual(controller.s3path,
                         'data-fetched/197/2017-07-14-1201/')
    finally:
        # Always remove the save directory, even when an assertion fails.
        shutil.rmtree(controller.dir_save)
cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc') else: cdb = get_couchdb(dbname='ucldc') collections = get_indexed_collection_list(SOLR) date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M') fname = 'indexed_collections-{}.csv'.format(date_to_minute) with open(fname, 'wb') as csvfile: csvwriter = UnicodeWriter(csvfile) csvwriter.writerow( ('Collection Name', 'Collection URL', 'Number in index', 'Number in couchdb', 'Number in OAC', 'Couch missing in solr', 'OAC missing in couch', 'Repository Name', 'Repository URL', 'Campus')) for c_url, num in collections: try: c = Collection(c_url) except ValueError, e: print "NO COLLECTION FOR :{}".format(c_url) continue couch_count = get_couch_count(cdb, c.id) solr_equal_couch = False if couch_count == num: solr_equal_couch = True oac_num = None couch_equal_oac = None if c.harvest_type == 'OAC': fetcher = OAC_XML_Fetcher(c.url_harvest, c.harvest_extra_data) oac_num = fetcher.totalDocs if couch_count == oac_num: couch_equal_oac = True else:
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    Steps, in order: fetch (fetcher.main), production readiness check,
    enrich, save, remove deleted records, check ingestion counts, dashboard
    cleanup, then publish a summary message.

    :param user_email: address added to the error-mail recipients
    :param url_api_collection: registry API URL of the collection
    :param log_handler: logbook handler; a DEBUG StderrHandler is created
        when none is given
    :param mail_handler: logbook handler; an ERROR-level MailHandler is
        created when none is given
    :param config_file: config path; defaults to $DPLA_CONFIG_FILE or
        'akara.ini'
    :param kwargs: passed through to fetcher.main
    :raises Exception: when the collection cannot be created, when it is
        not ready for publication in a 'prod' DATA_BRANCH, or when any
        processing step reports a failure

    ``dir_profile``, ``profile_path``, ``redis_timeout``, ``rq_queue`` and
    ``run_image_harvest`` are accepted but not used in this function body.
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend(EMAIL_SYS_ADMIN.split(','))
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        # The returned config is not used below; the call is kept so any
        # error it raises still surfaces here.
        config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        # bare raise keeps the original traceback
        raise
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    # .get() so an unset DATA_BRANCH is treated as non-production instead
    # of raising KeyError after the harvest has already run.
    if 'prod' in os.environ.get('DATA_BRANCH', '').lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    # save_records.main returns the number saved; anything not >= 0 is
    # treated as a failure.
    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    # Remaining steps share one contract: main() returns 0 on success.
    follow_up_steps = (
        (remove_deleted_records, "Error deleting records {0}"),
        (check_ingestion_counts, "Error checking counts {0}"),
        (dashboard_cleanup, "Error cleaning up dashboard {0}"),
    )
    for step, error_template in follow_up_steps:
        resp = step.main([None, ingest_doc_id])
        if not resp == 0:
            logger.error(error_template.format(resp))
            raise Exception(error_template.format(resp))

    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))
    log_handler.pop_application()
    mail_handler.pop_application()