def main(user_email, cid, url_couchdb=None):
    """Queue ``harvest_image_for_doc`` jobs for the collection ``cid``.

    Args:
        user_email: accepted for interface compatibility; not used here.
        cid: collection id handed to ``queue_collection``.
        url_couchdb: optional CouchDB URL, forwarded as a keyword argument
            to the queued function.
    """
    job_timeout = 10000
    CouchDBJobEnqueue().queue_collection(
        cid,
        job_timeout,
        harvest_image_for_doc,
        url_couchdb=url_couchdb,
    )
def q_collection(collection_id, enrichment):
    """Queue the enrichment job for one collection.

    Args:
        collection_id: collection id handed to ``queue_collection``.
        enrichment: passed through as the extra positional argument of
            ``enrich_existing_couch_doc.main``.
    """
    # Single-argument parenthesised print emits the same text on Python 2.
    print("ENRICH {} with {}".format(collection_id, enrichment))
    job_timeout = 10000
    enqueuer = CouchDBJobEnqueue()
    enqueuer.queue_collection(
        collection_id,
        job_timeout,
        harvester.post_processing.enrich_existing_couch_doc.main,
        enrichment,
    )
Example #3
0
def main(user_email, cid, url_couchdb=None):
    """Enqueue image harvesting for collection ``cid``.

    ``user_email`` is part of the signature but is not read by this
    function. ``url_couchdb`` is forwarded unchanged as a keyword argument
    for ``harvest_image_for_doc``.
    """
    enqueuer = CouchDBJobEnqueue()
    enqueuer.queue_collection(
        cid, 10000, harvest_image_for_doc, url_couchdb=url_couchdb)
def main(user_email, doc_id_list_file, rq_queue=None, url_couchdb=None):
    """Queue ``harvest_image_for_doc`` for every id listed in a file.

    Args:
        user_email: accepted for interface compatibility; not used here.
        doc_id_list_file: path of a text file with one document id per line.
        rq_queue: optional queue name handed to ``CouchDBJobEnqueue``.
        url_couchdb: optional CouchDB URL forwarded to the queued function.
    """
    enq = CouchDBJobEnqueue(rq_queue=rq_queue)
    timeout = 10000000
    with open(doc_id_list_file) as id_file:
        # Skip blank lines (e.g. a trailing newline at end of file) so an
        # empty string is never queued as a document id.
        doc_id_list = [line.strip() for line in id_file if line.strip()]
    results = enq.queue_list_of_ids(doc_id_list,
                                    timeout,
                                    harvest_image_for_doc,
                                    url_couchdb=url_couchdb,
                                    )
    print("Results:{}".format(results))
def main(user_email, doc_id_list_file, rq_queue=None, url_couchdb=None):
    """Read document ids from a file and queue an image harvest for each.

    Args:
        user_email: accepted for interface compatibility; not used here.
        doc_id_list_file: path of a text file with one document id per line.
        rq_queue: optional queue name handed to ``CouchDBJobEnqueue``.
        url_couchdb: optional CouchDB URL forwarded to the queued function.
    """
    enq = CouchDBJobEnqueue(rq_queue=rq_queue)
    timeout = 10000000
    doc_id_list = []
    with open(doc_id_list_file) as id_file:
        for raw_line in id_file:
            doc_id = raw_line.strip()
            # A blank line would otherwise be queued as the empty id ''.
            if doc_id:
                doc_id_list.append(doc_id)
    results = enq.queue_list_of_ids(
        doc_id_list,
        timeout,
        harvest_image_for_doc,
        url_couchdb=url_couchdb,
    )
    print("Results:{}".format(results))
Example #6
0
def main(cid):
    """Run ``delete_field_and_queue_image_harvest`` over collection ``cid``.

    The worker receives the field name ``'object'``, the CouchDB handle from
    ``get_couchdb()`` and a ``CouchDBJobEnqueue`` instance as extra
    arguments for the per-document function.
    """
    # The former local ``timeout`` was assigned but never read; removed.
    worker = CouchDBWorker()
    enq = CouchDBJobEnqueue()
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_and_queue_image_harvest,
                             'object', cdb, enq)
Example #7
0
def main(doc_ids, **kwargs):
    """Queue forced ``harvest_image_for_doc`` jobs for a list of doc ids.

    Args:
        doc_ids: document ids handed to ``queue_list_of_ids``.
        **kwargs: forwarded to ``queue_list_of_ids`` after these keys are
            consumed or rewritten:

            rq_queue: optional queue name for ``CouchDBJobEnqueue``.
            timeout: used as the job timeout only when it is exactly an
                ``int``; otherwise the default 10000 is kept. The key is
                always removed.
            object_auth: ``"user:password"`` string, replaced by a
                ``(user, password)`` tuple.

    Raises:
        IndexError: if ``object_auth`` contains no ``':'``.
    """
    # pop() with a default: a missing 'rq_queue' used to raise KeyError even
    # though the key was treated as optional immediately afterwards.
    enq = CouchDBJobEnqueue(rq_queue=kwargs.pop('rq_queue', None))
    timeout = 10000
    requested_timeout = kwargs.pop('timeout', None)
    # Exact type match on purpose: mirrors the original ``type(...) == int``.
    if type(requested_timeout) is int:
        timeout = requested_timeout
    if 'object_auth' in kwargs:
        # Split on the first ':' only so a password containing ':' survives.
        auth_parts = kwargs['object_auth'].split(':', 1)
        kwargs['object_auth'] = (auth_parts[0], auth_parts[1])
    enq.queue_list_of_ids(doc_ids,
                          timeout,
                          harvest_image_for_doc,
                          force=True,
                          **kwargs)
def main(args):
    """Parse CLI arguments and queue an enrichment chain for a collection.

    Args:
        args: list of command-line argument strings: the collection id
            followed by the enrichment chain to run.
    """
    parser = argparse.ArgumentParser(
        description='run an Akara enrichment chain on documents in a \
                collection.')
    # The positional was declared as 'document_id' while the code below read
    # ``args.collection_id``, so every run failed with AttributeError.
    parser.add_argument('collection_id',
                        help='Registry id for the collection')
    parser.add_argument('enrichment', help='enrichment chain to run')

    args = parser.parse_args(args)
    print(args.collection_id)
    print(args.enrichment)
    enq = CouchDBJobEnqueue()
    timeout = 10000
    enq.queue_collection(
        args.collection_id,
        timeout,
        harvester.post_processing.enrich_existing_couch_doc.main,
        args.enrichment,
    )
def main(doc_ids, **kwargs):
    """Queue forced image-harvest jobs for the given document ids.

    Args:
        doc_ids: document ids handed to ``queue_list_of_ids``.
        **kwargs: forwarded to ``queue_list_of_ids``, except that
            ``rq_queue`` (optional queue name) and ``timeout`` (honoured
            only when exactly an ``int``, default 10000) are removed, and
            ``object_auth`` (``"user:password"``) is replaced by a
            ``(user, password)`` tuple.

    Raises:
        IndexError: if ``object_auth`` contains no ``':'``.
    """
    # A missing 'rq_queue' used to raise KeyError on the constructor line,
    # although the key was then checked as optional; default to None.
    rq_queue = kwargs.pop('rq_queue', None)
    enq = CouchDBJobEnqueue(rq_queue=rq_queue)
    timeout = 10000
    if 'timeout' in kwargs:
        candidate = kwargs.pop('timeout')
        # Exact type match, same as the original ``type(...) == int``.
        if type(candidate) is int:
            timeout = candidate
    if 'object_auth' in kwargs:
        # maxsplit=1 keeps any ':' inside the password intact.
        auth_parts = kwargs['object_auth'].split(':', 1)
        kwargs['object_auth'] = (auth_parts[0], auth_parts[1])
    enq.queue_list_of_ids(doc_ids,
                          timeout,
                          harvest_image_for_doc,
                          force=True,
                          **kwargs
                          )
Example #10
0
class CouchDBJobEnqueueTestCase(TestCase):
    '''Exercise CouchDBJobEnqueue against HTTP responses faked by httpretty.'''

    #@patch('redis.client.Redis', autospec=True)
    @patch('harvester.post_processing.couchdb_runner.Redis')
    @httpretty.activate
    def setUp(self, mock_redis):
        '''Register a fake HEAD response for the database and build the
        enqueuer under test.

        ``mock_redis`` is the patched Redis object injected by ``@patch``;
        it is not used directly here.
        '''
        self.conf = config()
        self.url_couch_base = self.conf['couchdb_url']
        self.cdb = self.conf['couchdb_dbname']
        print("+++++++++++++confg:{0}".format(self.conf))
        url_head = os.path.join(self.url_couch_base, self.cdb)
        httpretty.register_uri(httpretty.HEAD,
                               url_head,
                               body='',
                               content_length='0',
                               content_type='text/plain; charset=utf-8',
                               connection='close',
                               server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
                               cache_control='must-revalidate',
                               date='Mon, 24 Nov 2014 21:30:38 GMT')

        self._cdbrunner = CouchDBJobEnqueue(rq_queue='test-delete')

        # The name must stay ``func_for_test``: testCollectionSlice asserts
        # on the queued job's func_name.
        def func_for_test(doc, *args, **kwargs):
            return doc, args, kwargs

        self.function = func_for_test

    @httpretty.activate
    def testCollectionSlice(self):
        '''Test that results are correct for a known couchdb result'''
        url_to_pretty = os.path.join(self.url_couch_base, self.cdb, '_design',
                                     COUCHDB_VIEW.split('/')[0], '_view',
                                     COUCHDB_VIEW.split('/')[1])
        # Read the fixture inside a context manager so the file handle is
        # closed instead of being leaked by a bare open().read().
        fixture_path = DIR_FIXTURES + '/couchdb_by_provider_name-5112.json'
        with open(fixture_path) as fixture:
            view_body = fixture.read()
        httpretty.register_uri(
            httpretty.GET,
            re.compile(url_to_pretty + ".*$"),
            body=view_body,
            etag="2U5BW2TDDX9EHZJOO0DNE29D1",
            content_type='application/json',
        )
        #transfer_encoding='chunked', #NOTE: doesn't work with httpretty
        results = self._cdbrunner.queue_collection('5112',
                                                   6000,
                                                   self.function,
                                                   'arg1',
                                                   'arg2',
                                                   kwarg1='1',
                                                   kwarg2=2)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[0].args,
                         ('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
                          'arg1', 'arg2'))
        self.assertEqual(results[0].kwargs, {'kwarg1': '1', 'kwarg2': 2})
        self.assertEqual(results[0].func_name,
                         'test.test_couchdb_runner.func_for_test')
Example #11
0
    def setUp(self, mock_redis):
        '''Prepare config, a fake CouchDB HEAD endpoint and the enqueuer.

        ``mock_redis`` is not used in the body.
        '''
        settings = config()
        self.conf = settings
        self.url_couch_base = settings['couchdb_url']
        self.cdb = settings['couchdb_dbname']
        print("+++++++++++++confg:{0}".format(self.conf))

        # Response attributes returned for the HEAD request on the database.
        head_response = dict(
            body='',
            content_length='0',
            content_type='text/plain; charset=utf-8',
            connection='close',
            server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
            cache_control='must-revalidate',
            date='Mon, 24 Nov 2014 21:30:38 GMT',
        )
        httpretty.register_uri(
            httpretty.HEAD,
            os.path.join(self.url_couch_base, self.cdb),
            **head_response)

        self._cdbrunner = CouchDBJobEnqueue(rq_queue='test-delete')

        # Echo function used as the job payload.
        def func_for_test(doc, *args, **kwargs):
            return doc, args, kwargs

        self.function = func_for_test
Example #12
0
def main(args):
    """Queue each collection's stored enrichments for re-running.

    Args:
        args: list of command-line argument strings. Exactly one of
            ``--collection_id`` or ``--cid_file`` (a file with one
            collection id per line) is required; ``--rq_queue`` overrides
            the default ``normal-stage`` queue.
    """
    parser = argparse.ArgumentParser(
        description='run the enrichments stored for a collection.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--collection_id',
                       help='Registry id for the collection')
    group.add_argument('--cid_file',
                       help='File with collection ids for running')
    parser.add_argument('--rq_queue',
                        help='Override queue for jobs, normal-stage is default')

    args = parser.parse_args(args)
    queue_name = args.rq_queue or 'normal-stage'
    enq = CouchDBJobEnqueue(queue_name)
    timeout = 10000

    if args.collection_id:
        cids = [args.collection_id]
    else:  # cid file
        with open(args.cid_file) as cid_file:
            # Skip blank lines: an empty id would build the malformed URL
            # '.../collection//' below.
            cids = [line.strip() for line in cid_file if line.strip()]
    print("CIDS:{}".format(cids))

    for cid in cids:
        url_api = ''.join(('https://registry.cdlib.org/api/v1/collection/',
                           cid, '/'))
        coll = Collection(url_api)
        print(coll.id)
        enrichments = coll.enrichments_item
        enq.queue_collection(
            cid,
            timeout,
            harvester.post_processing.enrich_existing_couch_doc.main,
            enrichments,
        )
def main(args):
    """Queue an enrichment chain, read from a file, for one collection.

    Args:
        args: list of command-line argument strings: the collection id, the
            path of the file holding the enrichment chain, and an optional
            ``--rq_queue`` override (``normal-stage`` when not given).
    """
    parser = argparse.ArgumentParser(
        description='run an Akara enrichment chain on documents in a \
                collection.')
    parser.add_argument('collection_id', help='Registry id for the collection')
    parser.add_argument('enrichment', help='File of enrichment chain to run')
    parser.add_argument(
        '--rq_queue',
        help='Override queue for jobs, normal-stage is default')

    opts = parser.parse_args(args)
    print("CID:{}".format(opts.collection_id))
    print("ENRICH FILE:{}".format(opts.enrichment))
    with open(opts.enrichment) as chain_file:
        chain = chain_file.read()

    queue_name = opts.rq_queue if opts.rq_queue else 'normal-stage'
    job_timeout = 10000
    CouchDBJobEnqueue(queue_name).queue_collection(
        opts.collection_id,
        job_timeout,
        harvester.post_processing.enrich_existing_couch_doc.main,
        chain,
    )
Example #14
0
class CouchDBJobEnqueueTestCase(TestCase):
    '''Check CouchDBJobEnqueue.queue_collection against canned CouchDB
    responses served by httpretty.'''

    #@patch('redis.client.Redis', autospec=True)
    @patch('harvester.post_processing.couchdb_runner.Redis')
    @httpretty.activate
    def setUp(self, mock_redis):
        '''Load config, fake the database HEAD request and create the
        enqueuer. ``mock_redis`` (from ``@patch``) is not used directly.
        '''
        self.conf = config()
        self.url_couch_base = self.conf['couchdb_url']
        self.cdb = self.conf['couchdb_dbname']
        # This line was tab-indented inside a space-indented body, which is
        # a TabError on Python 3; it now uses spaces like its neighbours.
        print("+++++++++++++confg:{0}".format(self.conf))
        url_head = os.path.join(self.url_couch_base, self.cdb)
        httpretty.register_uri(
            httpretty.HEAD,
            url_head,
            body='',
            content_length='0',
            content_type='text/plain; charset=utf-8',
            connection='close',
            server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
            cache_control='must-revalidate',
            date='Mon, 24 Nov 2014 21:30:38 GMT',
        )

        self._cdbrunner = CouchDBJobEnqueue(rq_queue='test-delete')

        # Name is asserted on in testCollectionSlice; do not rename.
        def func_for_test(doc, *args, **kwargs):
            return doc, args, kwargs

        self.function = func_for_test

    @httpretty.activate
    def testCollectionSlice(self):
        '''Test that results are correct for a known couchdb result'''
        url_to_pretty = os.path.join(
            self.url_couch_base, self.cdb,
            '_design', COUCHDB_VIEW.split('/')[0],
            '_view', COUCHDB_VIEW.split('/')[1])
        # Close the fixture file deterministically instead of leaking the
        # handle from a bare open().read().
        with open(DIR_FIXTURES +
                  '/couchdb_by_provider_name-5112.json') as fixture:
            view_body = fixture.read()
        httpretty.register_uri(
            httpretty.GET,
            re.compile(url_to_pretty + ".*$"),
            body=view_body,
            etag="2U5BW2TDDX9EHZJOO0DNE29D1",
            content_type='application/json',
        )
        #transfer_encoding='chunked', #NOTE: doesn't work with httpretty
        results = self._cdbrunner.queue_collection(
            '5112', 6000, self.function,
            'arg1', 'arg2', kwarg1='1', kwarg2=2)
        self.assertEqual(len(results), 3)
        self.assertEqual(
            results[0].args,
            ('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
             'arg1', 'arg2'))
        self.assertEqual(results[0].kwargs, {'kwarg1': '1', 'kwarg2': 2})
        self.assertEqual(results[0].func_name,
                         'test.test_couchdb_runner.func_for_test')
Example #15
0
    def setUp(self, mock_redis):
        '''Load config, register a fake HEAD response for the database URL
        and create the enqueuer plus an echo function for the tests.

        ``mock_redis`` is not used in the body.
        '''
        self.conf = config()
        self.url_couch_base = self.conf['couchdb_url']
        self.cdb = self.conf['couchdb_dbname']
        # Previously tab-indented among space-indented lines (TabError on
        # Python 3); re-indented with spaces.
        print("+++++++++++++confg:{0}".format(self.conf))
        url_head = os.path.join(self.url_couch_base, self.cdb)
        httpretty.register_uri(
            httpretty.HEAD,
            url_head,
            body='',
            content_length='0',
            content_type='text/plain; charset=utf-8',
            connection='close',
            server='CouchDB/1.5.0 (Erlang OTP/R16B03)',
            cache_control='must-revalidate',
            date='Mon, 24 Nov 2014 21:30:38 GMT',
        )

        self._cdbrunner = CouchDBJobEnqueue(rq_queue='test-delete')

        def func_for_test(doc, *args, **kwargs):
            return doc, args, kwargs

        self.function = func_for_test
Example #16
0
from harvester.post_processing.couchdb_runner import CouchDBJobEnqueue
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_doc
from harvester.post_processing.fix_repeated_displayDate import fix_repeated_date
import harvester

import sys

# First command-line argument: path of a text file with one collection id
# per line.
fname = sys.argv[1]

# Use a context manager so the file is closed (the bare open().readlines()
# leaked the handle), and drop blank lines so no empty id is queued.
with open(fname) as cid_file:
    cid_list = [x.strip() for x in cid_file if x.strip()]

# Queue the fix_repeated_date transform for each listed collection.
for cid in cid_list:
    results = CouchDBJobEnqueue().queue_collection(
        cid,
        300,
        run_on_couchdb_doc,
        'harvester.post_processing.fix_repeated_displayDate.fix_repeated_date')
Example #17
0
          /cleanup_value,
          /move_date_values?prop=sourceResource%2Fsubject,
          /move_date_values?prop=sourceResource%2Fspatial,
          /shred?prop=sourceResource%2Fspatial&delim=--,
          /capitalize_value?exclude=sourceResource%2Frelation,
          /enrich-subject,
          /enrich_earliest_date,
          /enrich_date,
          /enrich-type,
          /enrich-format,
          /enrich_location,
          /copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider,
          /enrich_language,
          /lookup?prop=sourceResource%2Flanguage%2Fname&target=sourceResource%2Flanguage%2Fname&substitution=iso639_3,
          /lookup?prop=sourceResource%2Flanguage%2Fname&target=sourceResource%2Flanguage%2Fiso639_3&substitution=iso639_3&inverse=True,
          /copy_prop?prop=provider%2Fname&to_prop=dataProvider&skip_if_exists=True,
          /set_prop?prop=sourceResource%2FstateLocatedIn&value=California,
          /enrich_location?prop=sourceResource%2FstateLocatedIn,
          /dedupe-sourceresource,
          /validate_mapv3'''

# Collapse the multi-line chain into one string: drop newlines and spaces.
enrichments = enrichments.replace('\n', '').replace(' ', '')
print(enrichments)

# Queue ``reenrich`` with the collapsed chain for collection "26094".
results = CouchDBJobEnqueue().queue_collection(
    "26094", 30000, reenrich, enrichments)

print(results)
Example #18
0
# -*- coding: utf-8 -*-
from harvester.post_processing.couchdb_runner import CouchDBJobEnqueue
from harvester.post_processing.run_transform_on_couchdb_docs import run_on_couchdb_doc
from harvester.post_processing.fix_repeated_displayDate import fix_repeated_date
import harvester

# Queue run_on_couchdb_doc for collection '26094' (timeout 300), giving it
# the dotted path of the set_rights_lapl transform as its extra argument.
results = CouchDBJobEnqueue().queue_collection(
    '26094', 300, run_on_couchdb_doc,
    'harvester.post_processing.set_rights_lapl.set_rights_lapl')


# Copyright © 2016, Regents of the University of California
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# - Neither the name of the University of California nor the names of its
#   contributors may be used to endorse or promote products derived from this
#   software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF