Beispiel #1
0
def update_from_remote(doc_id,
                       url_remote_couchdb=None,
                       couchdb_remote=None,
                       couchdb_env=None):
    '''Copy one document from a remote couchdb into the environment's
    couchdb, merging into the existing doc when present.

    Returns an "updated <id>" or "created <id>" message string.
    '''
    couchdb_remote = couchdb_remote or get_couchdb(url_remote_couchdb)
    couchdb_env = couchdb_env or get_couchdb()
    doc = couchdb_remote.get(doc_id)
    # revisions are database-specific, so drop the remote one
    del doc['_rev']
    existing = couchdb_env.get(doc_id)
    if existing:
        # merge into the local doc so its couch revision number is kept
        existing.update(doc)
        couchdb_env[doc_id] = existing
        msg = "updated {}".format(doc_id)
    else:
        couchdb_env[doc_id] = doc.copy()
        msg = "created {}".format(doc_id)
    print >> sys.stderr, msg
    return msg
def update_collection_from_remote(url_remote_couchdb,
                                  url_api_collection,
                                  delete_first=True):
    '''Pull every document of a collection from a remote couchdb into the
    environment couchdb.

    Returns (total doc count, updated count, created count).
    '''
    if delete_first:
        delete_collection(url_api_collection.rsplit('/', 2)[1])
    collection = Collection(url_api_collection)
    # refuse to touch production unless the collection is flagged ready
    if 'prod' in environ.get('DATA_BRANCH', ''):
        if not collection.ready_for_publication:
            raise Exception(
                'In PRODUCTION ENV and collection {} not ready for '
                'publication'.format(collection.id))
    doc_ids = get_collection_doc_ids(collection.id, url_remote_couchdb)
    remote_db = get_couchdb(url_remote_couchdb)
    env_db = get_couchdb()
    counts = {'created': 0, 'updated': 0}
    for doc_id in doc_ids:
        msg = update_from_remote(doc_id,
                                 couchdb_remote=remote_db,
                                 couchdb_env=env_db)
        counts['created' if 'created' in msg else 'updated'] += 1
    return len(doc_ids), counts['updated'], counts['created']
def update_from_remote(doc_id,
                       url_remote_couchdb=None,
                       couchdb_remote=None,
                       couchdb_env=None):
    '''Update the environment's couchdb from a remote couchdb document.

    Returns an "updated <id>" or "created <id>" message string.
    '''
    if not couchdb_remote:
        couchdb_remote = get_couchdb(url_remote_couchdb)
    if not couchdb_env:
        couchdb_env = get_couchdb()
    remote_doc = couchdb_remote.get(doc_id)
    # the revision differs between databases; remove it before copying
    del remote_doc['_rev']
    local_doc = couchdb_env.get(doc_id)
    if not local_doc:
        couchdb_env[doc_id] = remote_doc.copy()
        msg = "created {}".format(doc_id)
    else:
        # keep the local revision by updating the existing document
        local_doc.update(remote_doc)
        couchdb_env[doc_id] = local_doc
        msg = "updated {}".format(doc_id)
    print >> sys.stderr, msg
    return msg
Beispiel #4
0
def update_collection_from_remote(url_remote_couchdb,
                                  url_api_collection,
                                  delete_first=True):
    '''Update a collection from a remote couchdb.

    Returns (number of doc ids, updated count, created count).
    '''
    if delete_first:
        delete_collection(url_api_collection.rsplit('/', 2)[1])
    collection = Collection(url_api_collection)
    # guard: never overwrite production with a collection that is not
    # marked ready_for_publication
    in_prod = 'prod' in environ.get('DATA_BRANCH', '')
    if in_prod and not collection.ready_for_publication:
        raise Exception(
            'In PRODUCTION ENV and collection {} not ready for '
            'publication'.format(collection.id))
    doc_ids = get_collection_doc_ids(collection.id, url_remote_couchdb)
    remote_db = get_couchdb(url_remote_couchdb)
    env_db = get_couchdb()
    created = updated = 0
    for doc_id in doc_ids:
        msg = update_from_remote(
            doc_id, couchdb_remote=remote_db, couchdb_env=env_db)
        if 'created' in msg:
            created += 1
        else:
            updated += 1
    return len(doc_ids), updated, created
Beispiel #5
0
def main(user_email, cid, url_couchdb_src, field_list, url_couchdb_dest=None):
    '''Copy the fields in `field_list` for every doc in collection `cid`
    from a source couchdb to a destination couchdb (environment default
    when url_couchdb_dest is not given).

    user_email is accepted for interface compatibility; it is unused here.
    Fix: removed the unused local `timeout`.
    '''
    worker = CouchDBWorker()
    # source db is opened anonymously (no credentials)
    cdb_src = get_couchdb(url=url_couchdb_src, username=False, password=False)
    if url_couchdb_dest:
        cdb_dest = get_couchdb(url=url_couchdb_dest)
    else:
        cdb_dest = get_couchdb()
    worker.run_by_collection(cid, copy_fields_for_doc, cdb_src, field_list,
                             cdb_dest)
def main(user_email, cid, url_couchdb_src, field_list, url_couchdb_dest=None):
    '''Copy `field_list` fields for each doc in collection `cid` from a
    source couchdb into a destination couchdb.

    user_email is accepted for interface compatibility; it is unused here.
    Fixes: removed unused local `timeout`; normalized `cdb_dest =`
    assignment spacing and the hanging call indentation.
    '''
    worker = CouchDBWorker()
    # source db is opened without credentials
    cdb_src = get_couchdb(url=url_couchdb_src, username=False, password=False)
    # destination defaults to the environment couchdb
    if url_couchdb_dest:
        cdb_dest = get_couchdb(url=url_couchdb_dest)
    else:
        cdb_dest = get_couchdb()
    worker.run_by_collection(cid,
                             copy_fields_for_doc,
                             cdb_src,
                             field_list,
                             cdb_dest)
Beispiel #7
0
def main(cid):
    '''Delete the 'object' field for every doc in collection `cid` and
    queue an image re-harvest for each.

    Fix: removed the unused local `timeout`.
    '''
    worker = CouchDBWorker()
    enq = CouchDBJobEnqueue()
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_and_queue_image_harvest,
                             'object', cdb, enq)
Beispiel #8
0
def main(user_email, cid, field_list):
    '''Delete each field in `field_list` from every doc in collection
    `cid`.

    user_email is accepted for interface compatibility; it is unused here.
    Fixes: removed the unused local `timeout`; normalized call formatting.
    '''
    worker = CouchDBWorker()
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_list, field_list, cdb)
Beispiel #9
0
def delete_collection(cid):
    '''Delete every couchdb document belonging to collection `cid`.

    Publishes a summary message and returns (num_deleted, deleted_docs).
    '''
    print >> sys.stderr, "DELETING COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    view_rows = CouchDBCollectionFilter(collection_key=cid,
                                        couchdb_obj=_couchdb)
    doc_ids = [row['id'] for row in view_rows]
    num_deleted, deleted_docs = delete_id_list(doc_ids, _couchdb=_couchdb)
    publish_to_harvesting(
        'Deleted CouchDB Collection {}'.format(cid),
        'Deleted {} documents from CouchDB collection {}'.format(num_deleted,
                                                                 cid))
    return num_deleted, deleted_docs
Beispiel #10
0
def get_collection_doc_ids(collection_id, url_couchdb_source=None):
    '''Use the by_provider_name view to get doc ids for a given collection.
    '''
    _couchdb = get_couchdb(url=url_couchdb_source)
    view = CouchDBCollectionFilter(couchdb_obj=_couchdb,
                                   collection_key=str(collection_id),
                                   include_docs=False)
    # only the ids are needed; docs themselves were not requested
    return [row.id for row in view]
Beispiel #11
0
def get_collection_doc_ids(collection_id, url_couchdb_source=None):
    '''Return the document ids for a collection via the by_provider_name
    view (docs themselves are not fetched).
    '''
    source_db = get_couchdb(url=url_couchdb_source)
    row_filter = CouchDBCollectionFilter(couchdb_obj=source_db,
                                         collection_key=str(collection_id),
                                         include_docs=False)
    return [r.id for r in row_filter]
def main(cid):
    '''Delete the 'object' field for each doc in collection `cid` and
    queue an image harvest job for it.

    Fixes: removed the unused local `timeout`; normalized call formatting.
    '''
    worker = CouchDBWorker()
    enq = CouchDBJobEnqueue()
    cdb = get_couchdb()
    worker.run_by_collection(cid, delete_field_and_queue_image_harvest,
                             'object', cdb, enq)
def delete_collection(cid):
    '''Remove all couchdb documents in collection `cid` and publish a
    per-environment report.

    Returns (num_deleted, deleted_docs).
    '''
    print >> sys.stderr, "DELETING COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    num_deleted, deleted_docs = delete_id_list(
        [row['id'] for row in rows], _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Deleted documents from CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Deleted {} documents from CouchDB collection CID: {}'.format(
            num_deleted, cid))
    return num_deleted, deleted_docs
def run_on_couchdb_doc(docid, func):
    '''Run on a doc, by doc id.

    `func` is a dotted "module.path.function" string; the named function
    is imported, called with the doc, and a changed result is saved back.
    Returns True when the doc was modified and saved, False otherwise.
    '''
    _couchdb = get_couchdb()
    doc = _couchdb[docid]
    mod_name, func_name = func.rsplit('.', 1)
    # resolve the dotted name to a callable
    ffunc = getattr(importlib.import_module(mod_name), func_name)
    doc_new = ffunc(doc)
    if not doc_new or doc_new == doc:
        return False
    _couchdb.save(doc_new)
    return True
def run_on_couchdb_doc(docid, func):
    '''Run on a doc, by doc id'''
    couch = get_couchdb()
    doc = couch[docid]
    # `func` is a dotted module path; import and resolve it
    module_path, attr = func.rsplit('.', 1)
    target = getattr(importlib.import_module(module_path), attr)
    result = target(doc)
    changed = bool(result) and result != doc
    if changed:
        couch.save(result)
    return changed
Beispiel #16
0
def harvest_image_for_doc(doc_id, url_couchdb=None, object_auth=None, get_if_object=False):
    """Wrapper to call from rqworker.
    Creates ImageHarvester object & then calls harvest_image_for_doc
    """
    harvester = ImageHarvester(url_couchdb=url_couchdb,
                               object_auth=object_auth,
                               get_if_object=get_if_object)
    # fetch the doc from couchdb
    couchdb = get_couchdb(url=url_couchdb)
    doc = couchdb[doc_id]
    # skip docs that already carry an object field unless asked otherwise
    skip = not get_if_object and "object" in doc
    if skip:
        print >>sys.stderr, "Skipping {}, has object field".format(doc["_id"])
    else:
        harvester.harvest_image_for_doc(doc)
def update_couch_docs_by_collection(cid, fieldName, newValue):
    '''Set `fieldName` to `newValue` on every doc in collection `cid`.

    Publishes a summary message and returns (num_updated, updated_docs).
    '''
    print >> sys.stderr, "UPDATING DOCS FOR COLLECTION: {}".format(cid)
    _couchdb = get_couchdb()
    rows = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=_couchdb)
    id_list = [row['id'] for row in rows]
    num_updated, updated_docs = update_by_id_list(
        id_list, fieldName, newValue, _couchdb=_couchdb)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs
Beispiel #18
0
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    '''Normalize collection/subject fields on every couchdb doc and push
    each non-design doc to solr, committing once at the end.
    '''
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' not in doc_couch['_id']:
            try:
                # wrap a bare originalRecord.collection value in a list
                if not isinstance(doc_couch['originalRecord']['collection'],
                                  list):
                    doc_couch['originalRecord']['collection'] = [
                        doc_couch['originalRecord']['collection'],
                    ]
                    # NOTE(review): prints sourceResource.collection even
                    # though the originalRecord branch was just modified —
                    # confirm whether this is the intended field
                    print("orgRec.Collection: {}".format(
                        doc_couch['sourceResource']['collection']))
            except KeyError:
                pass
            try:
                # wrap a bare sourceResource.collection value in a list
                if not isinstance(doc_couch['sourceResource']['collection'],
                                  list):
                    doc_couch['sourceResource']['collection'] = [
                        doc_couch['sourceResource']['collection'],
                    ]
                    # NOTE(review): label says Collection but this prints
                    # the subject field — confirm intent
                    print("srcRes.Collection: {}".format(
                        doc_couch['sourceResource']['subject']))
            except KeyError:
                pass
            try:
                # normalize subject entries to dicts with a 'name' key
                subject = doc_couch['sourceResource'].get('subject', None)
                if not isinstance(subject, list):
                    subject = [subject]
                subjects_norm = []
                for sub in subject:
                    if not isinstance(sub, dict):
                        subjects_norm.append({'name': sub})
                    else:
                        subjects_norm.append(sub)
                doc_couch['sourceResource']['subject'] = subjects_norm
            except KeyError:
                pass
            # persist normalization back to couch before pushing to solr
            db.save(doc_couch)
            try:
                doc_solr = push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                                            solr_db=solr_db)
                print("PUSHED {} to solr".format(doc_couch['_id']))
            except TypeError:
                pass
    solr_db.commit()
def update_couch_docs_by_collection(cid, fieldName, newValue, substring):
    '''Update `fieldName` on every doc in collection `cid` through
    update_by_id_list (the `substring` argument is passed through —
    presumably it limits which values are replaced; verify in
    update_by_id_list).
    '''
    print >> sys.stderr, "UPDATING DOCS FOR COLLECTION: {}".format(cid)
    couch = get_couchdb()
    row_iter = CouchDBCollectionFilter(collection_key=cid, couchdb_obj=couch)
    num_updated, updated_docs = update_by_id_list(
        [row['id'] for row in row_iter],
        fieldName,
        newValue,
        substring,
        _couchdb=couch)
    subject = format_results_subject(cid,
                                     'Updated documents from CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Updated {} documents from CouchDB collection CID: {}'.format(
            num_updated, cid))
    return num_updated, updated_docs
Beispiel #20
0
 def __init__(self, rq_queue=None):
     '''Connect to couchdb and redis and set up the RQ queue.

     rq_queue overrides the configured 'rq_queue' name; if neither
     yields a name, ValueError is raised.
     '''
     self._config = config()
     self._couchdb = get_couchdb()
     conf = self._config
     self._redis = Redis(
         host=conf['redis_host'],
         port=conf['redis_port'],
         password=conf['redis_password'],
         socket_connect_timeout=conf['redis_connect_timeout'])
     self.rqname = conf['rq_queue']
     if rq_queue:
         self.rqname = rq_queue
     if not self.rqname:
         raise ValueError('Must set RQ_QUEUE env var'
                          ' or pass in rq_queue to '
                          'CouchDBJobEnqueue')
     self._rQ = Queue(self.rqname, connection=self._redis)
Beispiel #21
0
 def __init__(self, rq_queue=None):
     '''Set up couchdb + redis connections and the RQ queue to enqueue to.

     Raises ValueError when no queue name is configured or passed in.
     '''
     self._config = config()
     self._couchdb = get_couchdb()
     self._redis = Redis(
         host=self._config['redis_host'],
         port=self._config['redis_port'],
         password=self._config['redis_password'],
         socket_connect_timeout=self._config['redis_connect_timeout'])
     # explicit argument wins over the configured queue name
     self.rqname = self._config['rq_queue']
     if rq_queue:
         self.rqname = rq_queue
     if not self.rqname:
         raise ValueError(''.join(('Must set RQ_QUEUE env var',
                                   ' or pass in rq_queue to ',
                                   'CouchDBJobEnqueue')))
     self._rQ = Queue(self.rqname, connection=self._redis)
Beispiel #22
0
 def __init__(
     self,
     cdb=None,
     url_couchdb=None,
     couchdb_name=None,
     couch_view=COUCHDB_VIEW,
     bucket_bases=BUCKET_BASES,
     object_auth=None,
     get_if_object=False,
     url_cache=None,
     hash_cache=None,
     harvested_object_cache=None,
 ):
     """Set up couchdb plus redis-backed caches for image harvesting.

     cdb: existing couchdb object; when absent one is built from
         url_couchdb (falling back to the configured couchdb_url) and
         couchdb_name.
     object_auth: (username, password) tuple for fetching objects.
     get_if_object: when True, fetch even if doc already has 'object'.
     url_cache / hash_cache / harvested_object_cache: injectable caches;
         default to redis-backed dicts shared via the configured redis.
     """
     self._config = config()
     if cdb:
         self._couchdb = cdb
     else:
         if not url_couchdb:
             url_couchdb = self._config["couchdb_url"]
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._bucket_bases = bucket_bases
     self._view = couch_view
     # auth is a tuple of username, password
     self._auth = object_auth
     self.get_if_object = get_if_object  # if object field exists, get
     self._redis = Redis(
         host=self._config["redis_host"],
         port=self._config["redis_port"],
         password=self._config["redis_password"],
         socket_connect_timeout=self._config["redis_connect_timeout"],
     )
     # redis-backed dicts let separate workers share cache state
     self._url_cache = (
         url_cache
         if url_cache is not None
         else redis_collections.Dict(key="ucldc-image-url-cache", redis=self._redis)
     )
     self._hash_cache = (
         hash_cache
         if hash_cache is not None
         else redis_collections.Dict(key="ucldc-image-hash-cache", redis=self._redis)
     )
     # NOTE(review): truthiness test here (not `is not None`) — an empty
     # cache passed in would be replaced by the redis dict; confirm this
     # asymmetry with the two caches above is intended
     self._object_cache = (
         harvested_object_cache
         if harvested_object_cache
         else redis_collections.Dict(key="ucldc:harvester:harvested-images", redis=self._redis)
     )
def main(collection_key):
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        try:
            doc = fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError, e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
        dt_end = datetime.datetime.now()
def main(collection_key):
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        try:
            doc = fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError, e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
        dt_end = datetime.datetime.now()
Beispiel #25
0
def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    '''Normalize collection/subject fields on every couchdb doc and push
    each non-design doc to solr, committing once at the end.
    '''
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' not in doc_couch['_id']:
            try:
                # wrap a bare originalRecord.collection value in a list
                if not isinstance(doc_couch['originalRecord']['collection'], list):
                    doc_couch['originalRecord']['collection'] = [
                                    doc_couch['originalRecord']['collection'],
                                    ]
                    # NOTE(review): prints sourceResource.collection though
                    # the originalRecord branch was modified — confirm field
                    print("orgRec.Collection: {}".format(doc_couch['sourceResource']['collection']))
            except KeyError:
                pass
            try:
                # wrap a bare sourceResource.collection value in a list
                if not isinstance(doc_couch['sourceResource']['collection'], list):
                    doc_couch['sourceResource']['collection'] = [
                                    doc_couch['sourceResource']['collection'],
                                    ]
                    # NOTE(review): label says Collection but prints subject
                    print("srcRes.Collection: {}".format(doc_couch['sourceResource']['subject']))
            except KeyError:
                pass
            try:
                # normalize subject entries to dicts with a 'name' key
                subject = doc_couch['sourceResource'].get('subject', None)
                if not isinstance(subject, list):
                    subject = [subject]
                subjects_norm = []
                for sub in subject:
                    if not isinstance(sub, dict):
                        subjects_norm.append({'name': sub})
                    else:
                        subjects_norm.append(sub)
                doc_couch['sourceResource']['subject'] = subjects_norm
            except KeyError:
                pass
            # persist normalization back to couch before pushing to solr
            db.save(doc_couch)
            try:
                doc_solr = push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                                        solr_db=solr_db)
                print("PUSHED {} to solr".format(doc_couch['_id']))
            except TypeError:
                pass
    solr_db.commit()
Beispiel #26
0
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch need string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError, e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
Beispiel #27
0
def harvest_image_for_doc(doc_id,
                          url_couchdb=None,
                          object_auth=None,
                          get_if_object=False,
                          force=False):
    '''Wrapper to call from rqworker.
    Creates ImageHarvester object & then calls harvest_image_for_doc
    '''
    harvester = ImageHarvester(url_couchdb=url_couchdb,
                               object_auth=object_auth,
                               get_if_object=get_if_object)
    # fetch doc from couchdb
    doc = get_couchdb(url=url_couchdb)[doc_id]
    # skip docs that already have an object field, unless forced or
    # get_if_object is set
    already_has_object = 'object' in doc
    if already_has_object and not get_if_object and not force:
        print >> sys.stderr, 'Skipping {}, has object field'.format(doc['_id'])
    else:
        harvester.harvest_image_for_doc(doc, force=force)
Beispiel #28
0
def harvest_image_for_doc(doc_id,
                          url_couchdb=None,
                          object_auth=None,
                          get_if_object=False,
                          force=False,
                          ignore_content_type=False):
    '''Wrapper to call from rqworker.
    Creates ImageHarvester object & then calls harvest_image_for_doc

    Fix: `ignore_content_type` was referenced but never defined in this
    scope (NameError at runtime); it is now an explicit keyword argument
    defaulting to False, which is backward-compatible for all callers.
    '''
    harvester = ImageHarvester(url_couchdb=url_couchdb,
                               object_auth=object_auth,
                               get_if_object=get_if_object,
                               ignore_content_type=ignore_content_type)
    # get doc from couchdb
    couchdb = get_couchdb(url=url_couchdb)
    doc = couchdb[doc_id]
    if not get_if_object and 'object' in doc and not force:
        print >> sys.stderr, 'Skipping {}, has object field'.format(doc['_id'])
    else:
        harvester.harvest_image_for_doc(doc, force=force)
Beispiel #29
0
 def __init__(self,
              cdb=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              bucket_bases=BUCKET_BASES,
              object_auth=None,
              get_if_object=False,
              ignore_content_type=False,
              url_cache=None,
              hash_cache=None,
              harvested_object_cache=None):
     '''Set up couchdb plus redis-backed caches for image harvesting.

     cdb: pre-built couchdb object; otherwise one is created from
         url_couchdb (default: the configured couchdb_url) and
         couchdb_name.
     object_auth: (username, password) tuple used when fetching objects.
     ignore_content_type: skip the content-type check on responses.
     url_cache / hash_cache / harvested_object_cache: injectable caches;
         default to redis-backed shared dicts.
     '''
     self._config = config()
     if cdb:
         self._couchdb = cdb
     else:
         if not url_couchdb:
             url_couchdb = self._config['couchdb_url']
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._bucket_bases = bucket_bases
     self._view = couch_view
     # auth is a tuple of username, password
     self._auth = object_auth
     self.get_if_object = get_if_object  # if object field exists, get
     self.ignore_content_type = ignore_content_type  # Don't check content-type in headers
     self._redis = Redis(
         host=self._config['redis_host'],
         port=self._config['redis_port'],
         password=self._config['redis_password'],
         socket_connect_timeout=self._config['redis_connect_timeout'])
     # redis-backed dicts so cache state is shared across workers
     self._url_cache = url_cache if url_cache is not None else \
         redis_collections.Dict(key='ucldc-image-url-cache',
                                redis=self._redis)
     self._hash_cache = hash_cache if hash_cache is not None else \
         redis_collections.Dict(key='ucldc-image-hash-cache',
                                redis=self._redis)
     # NOTE(review): truthiness test (not `is not None`) — an empty cache
     # passed in is replaced by the redis dict; confirm intended
     self._object_cache = harvested_object_cache if harvested_object_cache \
         else \
         redis_collections.Dict(
             key='ucldc:harvester:harvested-images',
             redis=self._redis)
Beispiel #30
0
 def __init__(self,
              cdb=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              bucket_bases=BUCKET_BASES,
              object_auth=None,
              get_if_object=False,
              ignore_content_type=False,
              url_cache=None,
              hash_cache=None,
              harvested_object_cache=None):
     '''Initialize the harvester's couchdb connection and the three
     redis-backed caches (url, hash, harvested-object).

     cdb: pre-built couchdb object; otherwise one is created from
         url_couchdb (default: the configured couchdb_url) and
         couchdb_name.
     object_auth: (username, password) tuple used when fetching objects.
     ignore_content_type: skip the content-type check on responses.
     '''
     self._config = config()
     if cdb:
         self._couchdb = cdb
     else:
         if not url_couchdb:
             url_couchdb = self._config['couchdb_url']
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._bucket_bases = bucket_bases
     self._view = couch_view
     # auth is a tuple of username, password
     self._auth = object_auth
     self.get_if_object = get_if_object  # if object field exists, get
     self.ignore_content_type = ignore_content_type # Don't check content-type in headers
     self._redis = Redis(
         host=self._config['redis_host'],
         port=self._config['redis_port'],
         password=self._config['redis_password'],
         socket_connect_timeout=self._config['redis_connect_timeout'])
     # caches are redis-backed so multiple workers share state
     self._url_cache = url_cache if url_cache is not None else \
         redis_collections.Dict(key='ucldc-image-url-cache',
                                redis=self._redis)
     self._hash_cache = hash_cache if hash_cache is not None else \
         redis_collections.Dict(key='ucldc-image-hash-cache',
                                redis=self._redis)
     # NOTE(review): truthiness test (not `is not None`) — an empty cache
     # passed in is replaced by the redis dict; confirm intended
     self._object_cache = harvested_object_cache if harvested_object_cache \
         else \
         redis_collections.Dict(
             key='ucldc:harvester:harvested-images',
             redis=self._redis)
Beispiel #31
0
def sync_couch_collection_to_solr(collection_key):
    '''Sync a single couchdb collection into solr.

    Skips docs failing validation, tallies skip reasons by offending
    dict key, and publishes a harvesting report.
    Returns (updated_docs, report).
    '''
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch need string keys
    view = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for row in view:
        try:
            fill_in_title(row.doc)
            has_required_fields(row.doc)
        except (KeyError, ValueError) as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(row.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(
            collection_key,
            updated_docs,
            num_added,
            report))
    return updated_docs, report
Beispiel #32
0
 def __init__(self,
              collection_key=None,
              couchdb_obj=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              include_docs=True
              ):
     '''Build an iterator over a couchdb view filtered by collection key.

     Pass an existing couchdb object, or both url_couchdb and
     couchdb_name; otherwise ValueError is raised. An empty
     collection_key defaults to the '{}' wildcard key.
     '''
     if not collection_key:
         collection_key = '{}'
     if couchdb_obj is not None:
         self._couchdb = couchdb_obj
     else:
         # need enough information to open a connection ourselves
         if not url_couchdb or not couchdb_name:
             raise ValueError('Need url and name to couch database')
         self._couchdb = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._view = couch_view
     include = 'true' if include_docs else 'false'
     self._view_iter = couchdb_pager(
         self._couchdb, self._view,
         key=collection_key, include_docs=include)
def run_on_couchdb_by_collection(func, collection_key=None):
    '''If collection_key is none, trying to grab all of docs and modify
    func is a function that takes a couchdb doc in and returns it modified.
    (can take long time - not recommended)
    Function should return new document or None if no changes made
    '''
    _couchdb = get_couchdb()
    v = _couchdb.view(COUCHDB_VIEW, include_docs='true', key=collection_key) \
        if collection_key else _couchdb.view(COUCHDB_VIEW,
                                             include_docs='true')
    doc_ids = []
    n = 0
    for r in v:
        n += 1
        doc_new = func(r.doc)
        if doc_new and doc_new != doc:
            _couchdb.save(doc_new)
            doc_ids.append(r.doc['_id'])
        if n % 100 == 0:
            print '{} docs ran. Last doc:{}\n'.format(n, r.doc['_id'])
    return doc_ids
Beispiel #34
0
 def __init__(self,
              collection_key=None,
              couchdb_obj=None,
              url_couchdb=None,
              couchdb_name=None,
              couch_view=COUCHDB_VIEW,
              include_docs=True):
     '''Wrap couchdb_pager over `couch_view`, keyed by collection.

     Raises ValueError when no couchdb object is given and the
     url/name pair needed to build one is missing. An empty
     collection_key falls back to the '{}' wildcard key.
     '''
     key = collection_key if collection_key else '{}'
     if couchdb_obj is None:
         if not (url_couchdb and couchdb_name):
             raise ValueError('Need url and name to couch database')
         couchdb_obj = get_couchdb(url=url_couchdb, dbname=couchdb_name)
     self._couchdb = couchdb_obj
     self._view = couch_view
     self._view_iter = couchdb_pager(
         self._couchdb,
         self._view,
         key=key,
         include_docs='true' if include_docs else 'false')
def run_on_couchdb_by_collection(func, collection_key=None):
    '''If collection_key is none, trying to grab all of docs and modify
    func is a function that takes a couchdb doc in and returns it modified.
    (can take long time - not recommended)
    Function should return new document or None if no changes made
    '''
    _couchdb = get_couchdb()
    v = _couchdb.view(COUCHDB_VIEW, include_docs='true', key=collection_key) \
        if collection_key else _couchdb.view(COUCHDB_VIEW,
                                             include_docs='true')
    doc_ids = []
    n = 0
    for r in v:
        n += 1
        doc_new = func(r.doc)
        if doc_new and doc_new != doc:
            _couchdb.save(doc_new)
            doc_ids.append(r.doc['_id'])
        if n % 100 == 0:
            print '{} docs ran. Last doc:{}\n'.format(n, r.doc['_id'])
    return doc_ids
Beispiel #36
0
def sync_couch_collection_to_solr(collection_key):
    '''Re-sync a couchdb collection into solr.

    Deletes the collection from solr first, then re-pushes every doc
    that passes validation. Skip reasons are tallied by offending dict
    key and published in a harvesting report.
    Returns (updated_docs, report).
    '''
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch need string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)  # skip counts keyed by the bad/missing field
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
Beispiel #37
0
     description='Make csv report of indexed collections')
 parser.add_argument('auth_token', help='Authentication token')
 parser.add_argument('--solr_url', help='Solr index url')
 parser.add_argument('--couchdb_url', help='CouchDB url')
 args = parser.parse_args()
 solr_url = args.solr_url if args.solr_url else SOLR_URL
 print "SOLR_URL:{}".format(solr_url)
 SOLR = solr.SearchHandler(
     solr.Solr(
         solr_url,
         post_headers={
             'X-Authentication-Token': args.auth_token,
         },
     ), "/query")
 if args.couchdb_url:
     cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc')
 else:
     cdb = get_couchdb(dbname='ucldc')
 collections = get_indexed_collection_list(SOLR)
 date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M')
 fname = 'indexed_collections-{}.csv'.format(date_to_minute)
 with open(fname, 'wb') as csvfile:
     csvwriter = UnicodeWriter(csvfile)
     csvwriter.writerow(
         ('Collection Name', 'Collection URL', 'Number in index',
          'Number in couchdb', 'Number in OAC', 'Couch missing in solr',
          'OAC missing in couch', 'Repository Name', 'Repository URL',
          'Campus'))
     for c_url, num in collections:
         try:
             c = Collection(c_url)
Beispiel #38
0
def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Sync couchdb document changes into a solr index.

    Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr.

    :param url_couchdb: base url of the couchdb server
    :param dbname: name of the couchdb database
    :param url_solr: url of the solr index to update
    :param all_docs: when True, replay the whole db (since='0')
    :param since: explicit start sequence; defaults to the S3-cached
        last sequence when not supplied
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # flush so progress shows up promptly in logs
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()
    # single connection; the original opened the same db twice
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # checkpoint for the next incremental run
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            # design docs are couch-internal; never index them
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # doc was removed from couch; find and delete its solr twin
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except (KeyError, ValueError) as e:
                # BaseException.message is gone in py3; print the exception
                print(e)
                continue
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
        n_up += 1
        if n_up % 1000 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        # only advance the checkpoint on incremental runs
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
 # NOTE(review): truncated fragment — the argparse setup creating `parser`
 # and the `auth_token` argument are above this excerpt, and the trailing
 # `try` is cut off before its `except` clause.
 parser.add_argument('--solr_url', help='Solr index url')
 parser.add_argument('--couchdb_url', help='CouchDB url')
 args = parser.parse_args()
 # fall back to the module-level SOLR_URL default when not given on the CLI
 solr_url = args.solr_url if args.solr_url else SOLR_URL
 print "SOLR_URL:{}".format(solr_url)
 # solr query handler authenticated via the X-Authentication-Token header
 SOLR = solr.SearchHandler(
             solr.Solr(
                 solr_url,
                 post_headers = {
                     'X-Authentication-Token': args.auth_token,
                     },
             ),
         "/query"
 )
 if args.couchdb_url:
     # NOTE(review): bare `couchdb_url` is not defined in this excerpt —
     # this likely should be `args.couchdb_url`; confirm against full file.
     cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc')
 else:
     cdb = get_couchdb(dbname='ucldc')
 collections = get_indexed_collection_list(SOLR)
 # timestamped csv filename, e.g. indexed_collections-20160101-1200.csv
 date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M')
 fname = 'indexed_collections-{}.csv'.format(date_to_minute)
 with open(fname, 'wb') as csvfile:
     csvwriter = UnicodeWriter(csvfile)
     # header row for the per-collection report
     csvwriter.writerow(('Collection Name', 'Collection URL',
             'Number in index', 'Number in couchdb', 'Number in OAC',
             'Couch missing in solr', 'OAC missing in couch',
             'Repository Name', 'Repository URL',
             'Campus'))
     for c_url, num in collections:
         try:
             c = Collection(c_url)
Beispiel #40
0
 def __init__(self):
     '''Cache a handle to the default couchdb instance.'''
     connection = get_couchdb()
     self._couchdb = connection
Beispiel #41
0
 def __init__(self):
     '''Initialize with a connection to the default couchdb.'''
     db_handle = get_couchdb()
     self._couchdb = db_handle
def main(doc_id, enrichment, port=8889):
    '''Fetch one document, run the akara enrichment on it, save it back.'''
    store = get_couchdb()
    enriched = akara_enrich_doc(store.get(doc_id), enrichment, port)
    store[doc_id] = enriched
Beispiel #43
0
def main(doc_id, enrichment, port=8889):
    '''Run akara_enrich_doc for one document and persist the result.'''
    database = get_couchdb()
    source_doc = database.get(doc_id)
    database[doc_id] = akara_enrich_doc(source_doc, enrichment, port)
Beispiel #44
0
def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Sync couchdb document changes into a solr index.

    Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr.

    :param url_couchdb: base url of the couchdb server
    :param dbname: name of the couchdb database
    :param url_solr: url of the solr index to update
    :param all_docs: when True, replay the whole db (since='0')
    :param since: explicit start sequence; defaults to the S3-cached
        last sequence when not supplied
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # flush so progress shows up promptly in logs
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()
    # single connection; the original opened the same db twice
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # checkpoint for the next incremental run
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            # design docs are couch-internal; never index them
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # doc was removed from couch; find and delete its solr twin
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except (KeyError, ValueError) as e:
                # BaseException.message is gone in py3; print the exception
                print(e)
                continue
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
        n_up += 1
        if n_up % 1000 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        # only advance the checkpoint on incremental runs
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()

# redis connection configured from the harvester environment config
_redis = Redis(host=_config['redis_host'],
               port=_config['redis_port'],
               password=_config['redis_password'],
               socket_connect_timeout=_config['redis_connect_timeout'])

# redis-backed dict mapping couch doc _id -> [object, object_dimensions]
object_cache = redis_collections.Dict(key='ucldc:harvester:harvested-images',
                                      redis=_redis)

_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    if 'object' in doc:
        did = doc['_id']
        if 'object_dimensions' not in doc:
            # cannot cache without dimensions; skip and note it
            # (parenthesized print works under both python 2 and 3)
            print("NO DIMS for {} -- not caching".format(did))
        else:
            object_cache[did] = [doc['object'], doc['object_dimensions']]
            print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))
import sys
import argparse
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_sync_db_by_collection import delete_id_list

if __name__ == '__main__':
    # Delete every couchdb document whose id is listed in the given file.
    parser = argparse.ArgumentParser(
        description='Delete all documents in given collection')
    parser.add_argument('id_list', help='File with ids in it, one per line')
    args = parser.parse_args(sys.argv[1:])
    # one id per line; strip trailing newlines/whitespace
    with open(args.id_list) as id_file:
        ids = [l.strip() for l in id_file.readlines()]

    _couchdb = get_couchdb()
    num_deleted, delete_ids = delete_id_list(ids, _couchdb=_couchdb)
    # parenthesized print works under both python 2 and 3
    print('Deleted {} documents'.format(num_deleted))

# Copyright © 2016, Regents of the University of California
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# - Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# - Neither the name of the University of California nor the names of its
#   contributors may be used to endorse or promote products derived from this
#   software without specific prior written permission.
'''one time script to populate redis with harvested image object data'''
from harvester.config import config
from harvester.couchdb_init import get_couchdb
from harvester.couchdb_pager import couchdb_pager
from redis import Redis
import redis_collections

_config = config()

# redis connection configured from the harvester environment config
_redis = Redis(host=_config['redis_host'],
               port=_config['redis_port'],
               password=_config['redis_password'],
               socket_connect_timeout=_config['redis_connect_timeout'])

# redis-backed dict mapping couch doc _id -> [object, object_dimensions]
object_cache = redis_collections.Dict(key='ucldc:harvester:harvested-images',
                                      redis=_redis)


_couchdb = get_couchdb(url=_config['couchdb_url'], dbname='ucldc')
v = couchdb_pager(_couchdb, include_docs='true')
for r in v:
    doc = r.doc
    if 'object' in doc:
        did = doc['_id']
        if 'object_dimensions' not in doc:
            # cannot cache without dimensions; skip and note it
            # (parenthesized print works under both python 2 and 3)
            print("NO DIMS for {} -- not caching".format(did))
        else:
            object_cache[did] = [doc['object'], doc['object_dimensions']]
            print("OBJECT CACHE : {} === {}".format(did, object_cache[did]))