def main(user_email, cid, url_couchdb=None):
    enq = CouchDBJobEnqueue()
    timeout = 10000
    enq.queue_collection(cid, timeout, harvest_image_for_doc,
                         url_couchdb=url_couchdb)
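# Hedged usage sketch (not in the original source): the entry point above is
# typically driven with a registry collection id; the email address and
# CouchDB URL below are placeholder values.
#
#   main('archivist@example.edu', '26094',
#        url_couchdb='https://example.org/couchdb')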
def q_collection(collection_id, enrichment):
    timeout = 10000
    print "ENRICH {} with {}".format(collection_id, enrichment)
    ENQ = CouchDBJobEnqueue()
    ENQ.queue_collection(collection_id, timeout,
                         harvester.post_processing.enrich_existing_couch_doc.main,
                         enrichment)
def main(user_email, doc_id_list_file, rq_queue=None, url_couchdb=None):
    enq = CouchDBJobEnqueue(rq_queue=rq_queue)
    timeout = 10000000
    with open(doc_id_list_file) as foo:
        doc_id_list = [l.strip() for l in foo.readlines()]
    results = enq.queue_list_of_ids(doc_id_list, timeout,
                                    harvest_image_for_doc,
                                    url_couchdb=url_couchdb)
    print "Results:{}".format(results)
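# Hedged usage sketch (illustrative only): doc_id_list_file is read one
# document id per line, so a file such as the hypothetical doc_ids.txt below
# would queue an image-harvest job for each listed id.
#
#   $ cat doc_ids.txt
#   5112--http://ark.cdlib.org/ark:/13030/kt7580382j
#
#   main('archivist@example.edu', 'doc_ids.txt', rq_queue='normal-stage')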
def main(cid):
    worker = CouchDBWorker()
    enq = CouchDBJobEnqueue()
    timeout = 100000
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_and_queue_image_harvest,
                             'object', cdb, enq)
def main(doc_ids, **kwargs):
    # Pop rq_queue so it is not forwarded to the harvest function; using
    # .pop() with a default avoids a KeyError when no queue is given.
    enq = CouchDBJobEnqueue(rq_queue=kwargs.pop('rq_queue', None))
    timeout = 10000
    if 'timeout' in kwargs:
        if isinstance(kwargs['timeout'], int):
            timeout = kwargs['timeout']
        del kwargs['timeout']
    if 'object_auth' in kwargs:
        # object_auth arrives as a "username:password" string; downstream
        # code expects a (username, password) tuple.
        username, password = kwargs['object_auth'].split(':', 1)
        kwargs['object_auth'] = (username, password)
    enq.queue_list_of_ids(doc_ids, timeout, harvest_image_for_doc,
                          force=True, **kwargs)
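# Hedged usage sketch (assumption, not from the original script): keyword
# arguments other than rq_queue and timeout are passed straight through to
# harvest_image_for_doc, and object_auth is supplied as "user:password".
# The document id and credentials below are placeholders.
#
#   main(['5112--http://ark.cdlib.org/ark:/13030/kt7580382j'],
#        rq_queue='normal-stage',
#        timeout=20000,
#        object_auth='someuser:somepass')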
def main(args):
    parser = argparse.ArgumentParser(
        description='run an Akara enrichment chain on documents in a '
                    'collection.')
    # The positional argument must be named collection_id so that
    # args.collection_id below resolves correctly.
    parser.add_argument('collection_id', help='Registry id for the collection')
    parser.add_argument('enrichment', help='enrichment chain to run')
    args = parser.parse_args(args)
    print(args.collection_id)
    print(args.enrichment)
    enq = CouchDBJobEnqueue()
    timeout = 10000
    enq.queue_collection(args.collection_id, timeout,
                         harvester.post_processing.enrich_existing_couch_doc.main,
                         args.enrichment)
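# Hedged usage sketch (illustrative only): args is the argv-style list handed
# to parse_args, so the enrichment chain is passed as a single string of
# comma-separated Akara endpoints; the short chain below is a placeholder.
#
#   main(['26094', '/enrich-subject,/enrich_date,/dedupe-sourceresource'])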
class CouchDBJobEnqueueTestCase(TestCase):
    # @patch('redis.client.Redis', autospec=True)
    @patch('harvester.post_processing.couchdb_runner.Redis')
    @httpretty.activate
    def setUp(self, mock_redis):
        self.conf = config()
        self.url_couch_base = self.conf['couchdb_url']
        self.cdb = self.conf['couchdb_dbname']
        print "+++++++++++++config:{0}".format(self.conf)
        url_head = os.path.join(self.url_couch_base, self.cdb)
        httpretty.register_uri(httpretty.HEAD, url_head,
                               body='',
                               content_length='0',
                               content_type='text/plain; charset=utf-8',
                               connection='close',
                               server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
                               cache_control='must-revalidate',
                               date='Mon, 24 Nov 2014 21:30:38 GMT')
        self._cdbrunner = CouchDBJobEnqueue(rq_queue='test-delete')

        def func_for_test(doc, *args, **kwargs):
            return doc, args, kwargs

        self.function = func_for_test

    @httpretty.activate
    def testCollectionSlice(self):
        '''Test that results are correct for a known couchdb result'''
        url_to_pretty = os.path.join(self.url_couch_base, self.cdb,
                                     '_design', COUCHDB_VIEW.split('/')[0],
                                     '_view', COUCHDB_VIEW.split('/')[1])
        httpretty.register_uri(
            httpretty.GET,
            re.compile(url_to_pretty + ".*$"),
            body=open(DIR_FIXTURES + '/couchdb_by_provider_name-5112.json').read(),
            etag="2U5BW2TDDX9EHZJOO0DNE29D1",
            content_type='application/json',
        )
        # transfer_encoding='chunked'  # NOTE: doesn't work with httpretty
        results = self._cdbrunner.queue_collection('5112', 6000,
                                                   self.function,
                                                   'arg1', 'arg2',
                                                   kwarg1='1', kwarg2=2)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0].args,
                         ('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
                          'arg1', 'arg2'))
        self.assertEqual(results[0].kwargs, {'kwarg1': '1', 'kwarg2': 2})
        self.assertEqual(results[0].func_name,
                         'test.test_couchdb_runner.func_for_test')
def main(args):
    parser = argparse.ArgumentParser(
        description='run the enrichments stored for a collection.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--collection_id',
                       help='Registry id for the collection')
    group.add_argument('--cid_file',
                       help='File with collection ids for running')
    parser.add_argument('--rq_queue',
                        help='Override queue for jobs, normal-stage is default')
    args = parser.parse_args(args)
    Q = 'normal-stage'
    if args.rq_queue:
        Q = args.rq_queue
    enq = CouchDBJobEnqueue(Q)
    timeout = 10000
    cids = []
    if args.collection_id:
        cids = [args.collection_id]
    else:  # cid file
        with open(args.cid_file) as foo:
            lines = foo.readlines()
        cids = [l.strip() for l in lines]
    print "CIDS:{}".format(cids)
    for cid in cids:
        url_api = ''.join(('https://registry.cdlib.org/api/v1/collection/',
                           cid, '/'))
        coll = Collection(url_api)
        print coll.id
        enrichments = coll.enrichments_item
        enq.queue_collection(cid, timeout,
                             harvester.post_processing.enrich_existing_couch_doc.main,
                             enrichments)
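# Hedged usage sketch (not part of the original script): exactly one of
# --collection_id or --cid_file is required; the file name below is a
# placeholder and would contain one registry collection id per line.
#
#   main(['--collection_id', '26094'])
#   main(['--cid_file', 'collection_ids.txt', '--rq_queue', 'normal-stage'])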
def main(args):
    parser = argparse.ArgumentParser(
        description='run an Akara enrichment chain on documents in a '
                    'collection.')
    parser.add_argument('collection_id', help='Registry id for the collection')
    parser.add_argument('enrichment', help='File of enrichment chain to run')
    parser.add_argument('--rq_queue',
                        help='Override queue for jobs, normal-stage is default')
    args = parser.parse_args(args)
    print "CID:{}".format(args.collection_id)
    print "ENRICH FILE:{}".format(args.enrichment)
    with open(args.enrichment) as enrichfoo:
        enrichments = enrichfoo.read()
    Q = 'normal-stage'
    if args.rq_queue:
        Q = args.rq_queue
    enq = CouchDBJobEnqueue(Q)
    timeout = 10000
    enq.queue_collection(args.collection_id, timeout,
                         harvester.post_processing.enrich_existing_couch_doc.main,
                         enrichments)
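# Hedged usage sketch (illustrative only): unlike the variant above, the
# enrichment argument here is a path to a file holding the chain of Akara
# enrichment endpoints rather than the chain itself; the file name below is a
# placeholder.
#
#   main(['26094', 'enrichment_chain.txt', '--rq_queue', 'normal-stage'])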
from harvester.post_processing.couchdb_runner import CouchDBJobEnqueue
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_doc
from harvester.post_processing.fix_repeated_displayDate import fix_repeated_date
import harvester
import sys

fname = sys.argv[1]
cid_list = [x.strip() for x in open(fname).readlines()]
for cid in cid_list:
    results = CouchDBJobEnqueue().queue_collection(
        cid,
        300,
        run_on_couchdb_doc,
        'harvester.post_processing.fix_repeated_displayDate.fix_repeated_date')
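# Hedged usage sketch (assumption about how this script is launched): it reads
# the collection-id file named on the command line, one id per line; the
# script and file names below are placeholders.
#
#   $ python queue_fix_repeated_displayDate.py collection_ids.txt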
/cleanup_value,
/move_date_values?prop=sourceResource%2Fsubject,
/move_date_values?prop=sourceResource%2Fspatial,
/shred?prop=sourceResource%2Fspatial&delim=--,
/capitalize_value?exclude=sourceResource%2Frelation,
/enrich-subject,
/enrich_earliest_date,
/enrich_date,
/enrich-type,
/enrich-format,
/enrich_location,
/copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider,
/enrich_language,
/lookup?prop=sourceResource%2Flanguage%2Fname&target=sourceResource%2Flanguage%2Fname&substitution=iso639_3,
/lookup?prop=sourceResource%2Flanguage%2Fname&target=sourceResource%2Flanguage%2Fiso639_3&substitution=iso639_3&inverse=True,
/copy_prop?prop=provider%2Fname&to_prop=dataProvider&skip_if_exists=True,
/set_prop?prop=sourceResource%2FstateLocatedIn&value=California,
/enrich_location?prop=sourceResource%2FstateLocatedIn,
/dedupe-sourceresource,
/validate_mapv3'''
enrichments = enrichments.replace('\n', '').replace(' ', '')
print enrichments
results = CouchDBJobEnqueue().queue_collection("26094", 30000, reenrich,
                                               enrichments)
print results
# -*- coding: utf-8 -*-
from harvester.post_processing.couchdb_runner import CouchDBJobEnqueue
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_doc
from harvester.post_processing.fix_repeated_displayDate import fix_repeated_date
import harvester

results = CouchDBJobEnqueue().queue_collection(
    '26094',
    300,
    run_on_couchdb_doc,
    'harvester.post_processing.set_rights_lapl.set_rights_lapl')

# Copyright © 2016, Regents of the University of California
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# - Neither the name of the University of California nor the names of its
#   contributors may be used to endorse or promote products derived from this
#   software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.