def _exception_text(exc):
    """Return the message of *exc* on both Python 2 and Python 3.

    ``exc.message`` only exists on Python 2; on Python 3 reading it raises
    AttributeError. For a single-argument exception this returns that
    argument (the value ``message`` carried); otherwise ``str(exc)``.
    """
    if len(exc.args) == 1:
        return exc.args[0]
    return str(exc)


def _delete_from_solr(solr_db, couch_id):
    """Delete the solr doc harvested from *couch_id*; return True if deleted.

    Deletion goes by the solr doc's own ``id``, so the doc is first looked up
    through its ``harvest_id_s`` field. Nothing is deleted unless exactly one
    solr doc matches.
    """
    resp = solr_db.select(q=''.join(('harvest_id_s:"', couch_id, '"')))
    if resp.numFound != 1:
        print("-----DELETION of {} - FOUND {} docs".format(
            couch_id, resp.numFound))
        return False
    sdoc = resp.results[0]
    print('====DELETING: {0} -- {1}'.format(couch_id, sdoc['id']))
    solr_db.delete(id=sdoc['id'])
    return True


def _update_in_solr(db, solr_db, couch_id):
    """Map couch doc *couch_id* to a solr doc and push it; True on success.

    A doc that fails validation, belongs to an old collection, fails the
    nuxeo media check, or triggers a TypeError while being mapped/pushed is
    reported on stdout and skipped (returns False). Any other exception
    propagates to the caller.
    """
    doc = db.get(couch_id)
    try:
        doc = fill_in_title(doc)
        has_required_fields(doc)
    except (KeyError, ValueError) as e:
        print(_exception_text(e))
        return False
    try:
        try:
            solr_doc = map_couch_to_solr_doc(doc)
        except OldCollectionException:
            print('---- ERROR: OLD COLLECTION FOR:{}'.format(couch_id))
            return False
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(_exception_text(e))
            return False
        push_doc_to_solr(solr_doc, solr_db=solr_db)
    except TypeError as e:
        # Covers TypeErrors from mapping, the media check and the push.
        print('TypeError for {0} : {1}'.format(couch_id, e))
        return False
    return True


def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Sync solr with couchdb using the couchdb _changes feed.

    Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr.

    Args:
        url_couchdb: passed to get_couchdb as ``url``.
        dbname: passed to get_couchdb as ``dbname``.
        url_solr: passed to the Solr constructor.
        all_docs: when true, "since" is forced to '0' and the stored
            last sequence is left untouched at the end of the run.
        since: sequence to start from; when falsy, the value stored in
            CouchdbLastSeq_S3 is used.

    Progress and a final summary are printed to stdout.
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()

    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    # Flush so the messages above are written out before connecting.
    sys.stdout.flush()

    # Single connection; the original connected twice and discarded the first.
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    # New checkpoint for the next incremental run.
    last_since = int(changes['last_seq'])
    results = changes['results']

    n_up = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            # Design documents are never indexed.
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            if _delete_from_solr(solr_db, cur_id):
                n_delete += 1
            continue
        if not _update_in_solr(db, solr_db, cur_id):
            continue
        # Only documents actually pushed to solr count as updates.
        n_up += 1
        if n_up % 1000 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print("Updated {} so far in {}".format(n_up, elapsed_time))

    solr_db.commit()
    if not all_docs:
        # Persist the checkpoint only for incremental runs.
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))