def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m'):
    """Reads docs from an es index according to a query and pushes them to the queue

        p_queue:       Queue where items are pushed to
        p_scroll_time: Time for scroll method
        p_timeout:     Timeout - After this period, scan context is closed
        p_index:       Index where items are picked from
        p_doctype:     DocType of the items
        p_query:       ElasticSearch query for scanning the index
    """
    param = [{'host': self.host, 'port': self.port}]
    try:
        es = Elasticsearch(param)
        logger.info('Connected to ES Server for reading: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server for reading: %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        if p_doctype is not None:
            documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
        else:
            documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, timeout=p_timeout)
        for doc in documents:
            logger.debug(doc)
            p_queue.put(doc)
    except Exception as e:
        logger.error("Error while scanning ES index %s with query %s", p_index, p_query)
        logger.error(e)
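# Hedged usage sketch, not part of the original module: the 'ESio' instance, its
# constructor and the index name are assumptions; only scan_and_queue and the
# poison-pill convention come from the code above. Assumes a reachable
# Elasticsearch on localhost:9200.
from multiprocessing import JoinableQueue, Process

def example_es_scan(esio):
    queue = JoinableQueue(maxsize=10000)
    reader = Process(target=esio.scan_and_queue,
                     args=(queue, 'my_index'),
                     kwargs={'p_query': {'query': {'match_all': {}}}})
    reader.start()
    reader.join()
    queue.put(None)  # poison pill: signals the storing side that the scan is over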
def __init__(self, p_host, p_port, p_base, p_user, p_password, p_connect_timeout=60000):
    """Class creation

        p_host:            Mongo Server address
        p_port:            Mongo Server port
        p_base:            Mongo base
        p_user:            Mongo user
        p_password:        Mongo password
        p_connect_timeout: Connection timeout in ms (default is 60000)
    """
    self.host = p_host
    self.port = p_port
    self.base = p_base
    self.user = p_user
    self.password = p_password
    self.connect_timeout = p_connect_timeout

    # uri for mongo connection (each value, timeout included, needs its own placeholder)
    uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % (self.user, self.password, self.host, self.port, self.base, self.connect_timeout)

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        self.mongo = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)
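# Standalone sanity check of the mongo URI format used above: each of the six
# values, including the connect timeout, gets its own %s placeholder. Runnable as-is.
uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % ('user', 'pwd', 'localhost', 27017, 'mydb', 60000)
assert uri == 'mongodb://user:pwd@localhost:27017/mydb?connectTimeoutMS=60000'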
def set_settings(self, p_index, p_conf):
    """Sets the index settings
    """
    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)
        index.set_settings(p_conf)
        logger.info('Index %s set', p_index)
    except Exception as e:
        logger.error('Error setting the index %s', p_index)
        logger.error(e)
def get_settings(self, p_index):
    """Gets the index settings
    """
    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)
        result = index.get_settings()
        logger.info('Settings of index %s retrieved', p_index)
        return result
    except Exception as e:
        logger.error('Error getting settings of %s', p_index)
        logger.error(e)
def remove_items(self, p_collection, p_query):
    """Executes a delete query on a collection using the p_query selection

        p_collection: mongo collection to delete the docs from
        p_query:      selection query
    """
    try:
        self.mongo[p_collection].remove(p_query)
        logger.info('Collection items removal done')
    except PyMongoError as e:
        logger.error('Failed to remove entries from %s', p_collection)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)
def count_items(self, p_collection, p_query):
    """Returns the item count using the p_query selection

        p_collection: mongo collection to query
        p_query:      selection query
    """
    try:
        return self.mongo[p_collection].find(p_query).count()
    except PyMongoError as e:
        logger.error('Failed to count entries from %s', p_collection)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)
def clear_index(self, p_index):
    """Deletes an index

        p_index: index to delete
        returns True if p_index has been deleted, False if not
    """
    delete_ok = True
    try:
        param = [{'host': self.host, 'port': self.port}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        return False

    try:
        es.indices.delete(index=p_index)
        logger.info('Index %s deleted', p_index)
    except Exception as e:
        logger.error('Error deleting the index %s', p_index)
        logger.error(e)
        delete_ok = False
    return delete_ok
def remove_items(self, p_collection, p_query):
    """Executes a delete query on a collection using the p_query selection

        p_collection: mongo collection to delete the docs from
        p_query:      selection query
    """
    # uri for mongo connection
    uri = self._get_mongo_uri()

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connect = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        return

    try:
        mongo_connect[p_collection].remove(p_query)
        logger.info('Collection items removal done')
    except PyMongoError as e:
        logger.error('Failed to remove entries from %s', p_collection)
        logger.error(e)
def count_items(self, p_collection, p_query):
    """Returns the item count using the p_query selection

        p_collection: mongo collection to query
        p_query:      selection query
    """
    # uri for mongo connection
    uri = self._get_mongo_uri()

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connect = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        return None

    try:
        return mongo_connect[p_collection].find(p_query).count()
    except PyMongoError as e:
        logger.error('Failed to count entries from %s', p_collection)
        logger.error(e)
def count(self, p_index, p_query={}):
    """Gets the number of docs for a query

        p_index: elasticsearch index where to query
        p_query: the query to process

        returns the number of docs matching p_query in the index p_index
    """
    try:
        param = [{'host': self.host, 'port': self.port}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        result = es.count(index=p_index, body=p_query)
        logger.info('Counted the number of items from %s for the query %s', p_index, p_query)
    except Exception as e:
        logger.error('Error querying the index %s with query %s', p_index, p_query)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)
    return result['count']
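# Hedged example (instance, index and field names are assumptions): the p_query
# body for count follows the standard Elasticsearch query envelope; an empty dict
# counts every doc in the index.
def example_count(esio):
    total = esio.count('my_index')
    published = esio.count('my_index', {'query': {'term': {'status': 'published'}}})
    return total, published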
def set_mapping(self, p_index, p_mapping):
    """Creates an index with a given mapping

        p_index:   index to create
        p_mapping: mapping forced on index creation
    """
    try:
        param = [{'host': self.host, 'port': self.port}]
        if self.proxy is None:
            es = Elasticsearch(param)
        else:
            es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        return

    try:
        es.indices.create(index=p_index, body=p_mapping)
        logger.info('Index %s created', p_index)
    except Exception as e:
        logger.error('Error creating the index %s', p_index)
        logger.error(e)
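# Hedged example of a p_mapping body (field and type names are illustrative): index
# settings plus one document type, in the pre-ES-5 mapping layout this module
# appears to target elsewhere (it still passes doc_type to helpers.scan).
EXAMPLE_MAPPING = {
    'settings': {'number_of_shards': 1, 'number_of_replicas': 0},
    'mappings': {
        'my_doctype': {
            'properties': {
                'title': {'type': 'string'},
                'created': {'type': 'date'},
            }
        }
    }
}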
def scan_and_queue(self, p_queue, p_index, p_query={}, p_connect_timeout=1, p_read_timeout=30):
    """Reads docs from an Algolia index according to a query and pushes them to the queue

        p_queue: Queue where items are pushed to
        p_index: Index where items are picked from
        p_query: query for scanning the index
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        client.timeout = (p_connect_timeout, p_read_timeout)
        index = client.init_index(p_index)
    except Exception as e:
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        documents = index.browse_all(p_query)
        start = time.time()
        for doc in documents:
            p_queue.put(doc)
            elapsed = time.time() - start
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                nb_items = self.counters['nb_items_scanned'].value
                self.counters['scan_time'].value += elapsed
                if nb_items % self.counters['log_every'] == 0:
                    logger.info("Scan : {0} items".format(nb_items))
                    logger.debug(" -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))
            # Timer reinit for the next item
            start = time.time()
    except Exception as e:
        logger.error("Error while scanning Algolia index %s with query %s", p_index, p_query)
        logger.error(e)
        with self.counters['nb_items_error'].get_lock():
            self.counters['nb_items_error'].value += 1
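# Hedged usage sketch (instance and index names are assumptions, not part of the
# original module): p_query is handed straight to Algolia's browse_all, so it takes
# the shape of a search params dict; an empty query browses everything.
def example_algolia_scan(algoliaio, queue):
    algoliaio.scan_and_queue(queue, 'my_index', p_query={'query': ''})
    queue.put(None)  # poison pill so the storing side can finish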
def dequeue_and_store(self, p_queue, p_collection):
    """Gets docs from p_queue and stores them in a mongo collection
        Stops dealing with the queue when receiving a "None" item

        p_queue:      queue which items are picked from. Elements have to be "list".
        p_collection: mongo collection where to store the docs
    """
    # uri for mongo connection (each value, timeout included, needs its own placeholder)
    uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % (self.user, self.password, self.host, self.port, self.base, self.connect_timeout)

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connection = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill
            if source_doc is None:
                logger.debug("Mongoio has received 'poison pill' and is now ending ...")
                poison_pill = True
                p_queue.task_done()
                break

            # Management of 'update/set' style request
            try:
                find = source_doc['_mongo_find']
            except KeyError:
                find = {'_id': source_doc['_id']}
            try:
                update = source_doc['_mongo_update']
            except KeyError:
                update = source_doc

            # Insert into the collection
            try:
                mongo_connection[p_collection].update(find, update, upsert=True)
            except Exception as e:
                logger.error("Document %s not inserted in Mongo Collection", source_doc['_id'])
                logger.error(e)

            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("Mongoio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)
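# Standalone illustration of the update-request convention handled above: a queued
# doc may carry '_mongo_find' and '_mongo_update' to describe an 'update/set' style
# request, otherwise the whole doc is upserted by its '_id'. The helper name is
# hypothetical; the resolution logic mirrors dequeue_and_store. Runnable as-is.
def resolve_mongo_request(source_doc):
    find = source_doc['_mongo_find'] if '_mongo_find' in source_doc else {'_id': source_doc['_id']}
    update = source_doc.get('_mongo_update', source_doc)
    return find, update

# Whole-document upsert keyed on _id
assert resolve_mongo_request({'_id': 1, 'name': 'a'}) == ({'_id': 1}, {'_id': 1, 'name': 'a'})
# 'update/set' style request
find, update = resolve_mongo_request({'_mongo_find': {'name': 'a'}, '_mongo_update': {'$set': {'name': 'b'}}})
assert find == {'name': 'a'} and update == {'$set': {'name': 'b'}}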
def scan_and_queue(self, p_queue, p_collection, p_query, p_batch_size=100):
    """Reads docs from a collection according to a query and pushes them to the queue

        p_queue:      Queue where items are pushed to
        p_collection: Collection where items are picked from
        p_query:      MongoDB query for scanning the collection
        p_batch_size: Number of docs read per iteration
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    # uri for mongo connection
    uri = self._get_mongo_uri()

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connect = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Scan the collection according to the query
    documents = mongo_connect[p_collection].find(p_query)
    nb_docs = documents.count()
    logger.info('Scanning %i items in %s', nb_docs, p_collection)

    # Each item is put into the queue
    documents.batch_size(p_batch_size)
    start_time = time.time()
    for doc in documents:
        p_queue.put(doc)
        with self.counters['nb_items_scanned'].get_lock():
            self.counters['nb_items_scanned'].value += 1
            if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))

    time_for_x_items = time.time()
    if nb_docs == 0:
        logger.info("No document to process")
    else:
        logger.info("Average reading time : %fs", (time_for_x_items - start_time) / nb_docs)
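# Standalone illustration of the shared-counter pattern used by the scan and store
# workers above: a multiprocessing.Value bumped under its own lock stays consistent
# across processes. Runnable as-is.
from multiprocessing import Process, Value

def bump(counter, n):
    for _ in range(n):
        with counter.get_lock():
            counter.value += 1

if __name__ == '__main__':
    nb_items_scanned = Value('i', 0)
    workers = [Process(target=bump, args=(nb_items_scanned, 1000)) for _ in range(4)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    assert nb_items_scanned.value == 4000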
def delete_document(self, p_index, p_id):
    """Deletes a doc from an index

        p_index: index where to delete the doc
        p_id:    id of the doc to delete
    """
    delete_ok = True
    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)
        index.delete_object(p_id)
        logger.info('%s deleted from %s', p_id, p_index)
    except Exception as e:
        logger.error('Error deleting the %s from index %s', p_id, p_index)
        logger.error(e)
        delete_ok = False
    return delete_ok
def clear_index(self, p_index):
    """Clears an index (removes all its objects)

        p_index: index to clear
        returns True if p_index has been cleared, False if not
    """
    delete_ok = True
    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)
        index.clear_index()
        logger.info('Index %s cleared', p_index)
    except Exception as e:
        logger.error('Error clearing the index %s', p_index)
        logger.error(e)
        delete_ok = False
    return delete_ok
def scan_and_queue(self, p_queue, p_file, p_xpath):
    """Reads an xml file and pushes document nodes to the queue

        p_queue: Queue where items are pushed to
        p_file:  XML file to scan
        p_xpath: XPATH used to split the document into multiple docs
    """
    logger.info('Scanning xml in %s', p_file)
    tree = ET.parse(p_file)
    root = tree.getroot()

    # Each matched node is put into the queue
    compteur = 0
    if p_xpath:
        node_list = root.findall(p_xpath)
    else:
        node_list = [root]
    for found_elem in node_list:
        compteur = compteur + 1
        try:
            p_queue.put(ET.tostring(found_elem, encoding="us-ascii", method="xml"))
        except Exception as e:
            logger.error(e)
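# Standalone illustration of the XPath split performed above: one parent document is
# turned into one queue item per matched node. Runnable as-is.
import xml.etree.ElementTree as ET
from queue import Queue

root = ET.fromstring('<catalog><book id="1"/><book id="2"/></catalog>')
q = Queue()
for node in root.findall('./book'):
    q.put(ET.tostring(node, encoding='us-ascii', method='xml'))
assert q.qsize() == 2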
def set_mapping(self, p_index, p_mapping):
    """Creates an index with a given mapping

        p_index:   index to create
        p_mapping: mapping forced on index creation
    """
    try:
        param = [{'host': self.host, 'port': self.port}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        return

    try:
        es.indices.create(index=p_index, body=p_mapping)
        logger.info('Index %s created', p_index)
    except Exception as e:
        logger.error('Error creating the index %s', p_index)
        logger.error(e)
def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in the algolia index
        Stops dealing with the queue when receiving a "None" item

        p_queue:       queue which items are picked from. Elements have to be "list".
        p_index:       algolia index where to store the docs
        p_nbmax_retry: number of tries when failing on a request (default is 3)
    """
    client = algoliasearch.Client(self.app_id, self.api_key)
    index = client.init_index(p_index)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    logger.debug("Algoliaio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    p_queue.task_done()
                    break

                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        logger.debug("Indexing %i documents", len(bulk))
                        index.add_objects(bulk)
                except Exception as e:
                    logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True

            if not is_indexed:
                logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
        except KeyboardInterrupt:
            logger.info("Algoliaio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)
def dequeue_and_store(self, p_queue, p_index, p_timeout=10, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in an elasticsearch index
        Stops dealing with the queue when receiving a "None" item

        p_queue:       queue which items are picked from. Elements have to be "list".
        p_index:       elasticsearch index where to store the docs
        p_timeout:     timeout for bulk (default is 10s)
        p_nbmax_retry: number of tries when failing on a request (default is 3)
    """
    try:
        param = [{'host': self.host, 'port': self.port, 'timeout': p_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    logger.debug("ESio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    p_queue.task_done()
                    break

                # Bulk element creation from the source_doc
                source_doc['_index'] = p_index
                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        logger.debug("Indexing %i documents", len(bulk))
                        helpers.bulk(es, bulk, raise_on_error=True)
                except Exception as e:
                    logger.error("Bulk not indexed in ES - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True

            if not is_indexed:
                logger.error("Bulk not indexed in elasticsearch : operation aborted after %i retries", try_counter - 1)
        except KeyboardInterrupt:
            logger.info("ESio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)
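# Standalone illustration of the bulk/retry pattern above: each queued doc becomes a
# bulk action once '_index' is set, and a bounded retry loop wraps the flush. The
# 'send' stub stands in for helpers.bulk and is an assumption, not part of the
# original module. Runnable as-is.
def flush_with_retry(bulk, send, p_nbmax_retry=3):
    try_counter = 1
    while try_counter <= p_nbmax_retry:
        try:
            if bulk:
                send(bulk)
            return True
        except Exception:
            try_counter += 1
    return False

attempts = {'n': 0}
def send(bulk):
    attempts['n'] += 1
    if attempts['n'] < 3:
        raise IOError('transient failure')  # fails twice, then succeeds

docs = [{'_id': i, 'value': i} for i in range(5)]
for doc in docs:
    doc['_index'] = 'my_index'  # same enrichment as dequeue_and_store above
assert flush_with_retry(docs, send) and attempts['n'] == 3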
def dequeue_and_store(self, p_queue, p_collection, p_upsert=True):
    """Gets docs from p_queue and stores them in a mongo collection
        Stops dealing with the queue when receiving a "None" item

        p_queue:      queue which items are picked from. Elements have to be "list".
        p_collection: mongo collection where to store the docs
        p_upsert:     if True, new documents are created; if False, they are ignored
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    # uri for mongo connection
    uri = self._get_mongo_uri()

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connect = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    main_loop_max_retry = 5
    main_loop_retry = 0
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill
            if source_doc is None:
                poison_pill = True
                p_queue.task_done()
                break

            # Management of 'update/set' style request
            try:
                find = source_doc['_mongo_find']
            except KeyError:
                find = {'_id': source_doc['_id']}
            try:
                update = source_doc['_mongo_update']
            except KeyError:
                update = source_doc

            # Insert into the collection
            try:
                mongo_connect[p_collection].update(find, update, upsert=p_upsert, multi=True if '_mongo_update' in source_doc else False)
            except Exception as e:
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += 1
                logger.error("Document %s not inserted in Mongo Collection", source_doc['_id'])
                logger.error(e)
            else:
                with self.counters['nb_items_stored'].get_lock():
                    self.counters['nb_items_stored'].value += 1
                    if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                        logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("Mongoio.dequeue_and_store : User interruption of the process")
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger.error("An error occurred while storing elements to Mongo : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                poison_pill = True
            p_queue.task_done()
def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in the algolia index
        Stops dealing with the queue when receiving a "None" item

        p_queue:       queue which items are picked from. Elements have to be "list".
        p_index:       algolia index where to store the docs
        p_nbmax_retry: number of tries when failing on a request (default is 3)
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    client = algoliasearch.Client(self.app_id, self.api_key)
    index = client.init_index(p_index)

    # Loop until receiving the "poison pill" item (meaning: no more element to read)
    main_loop_max_retry = 5
    main_loop_retry = 0
    start = time.time()
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                start_bulking = time.time()
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        index.add_objects(bulk)
                except Exception as e:
                    logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True
                    now = time.time()
                    elapsed_bulking = now - start_bulking
                    elapsed = now - start
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += len(bulk)
                        self.counters['whole_storage_time'].value += elapsed
                        self.counters['bulk_storage_time'].value += elapsed_bulking
                        nb_items = self.counters['nb_items_stored'].value
                        if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                            logger.info("Store : {0} items".format(nb_items))
                            logger.debug(" -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                            logger.debug(" -> Avg bulk time : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))
                    start = time.time()

            if not is_indexed:
                start = time.time()
                logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += len(bulk)
        except KeyboardInterrupt:
            logger.info("Algoliaio.dequeue_and_store : User interruption of the process")
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger.error("An error occurred while storing elements to Algolia : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                poison_pill = True
            p_queue.task_done()
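# Standalone illustration of the producer/consumer contract shared by every
# scan_and_queue / dequeue_and_store pair above: the reader fills a JoinableQueue
# and one 'None' poison pill per consumer shuts the storing side down cleanly.
# Runnable as-is.
from multiprocessing import JoinableQueue, Process, Value

def store(queue, stored):
    while True:
        item = queue.get()
        if item is None:  # poison pill: no more elements to read
            queue.task_done()
            break
        with stored.get_lock():
            stored.value += 1
        queue.task_done()

if __name__ == '__main__':
    queue = JoinableQueue()
    stored = Value('i', 0)
    consumer = Process(target=store, args=(queue, stored))
    consumer.start()
    for i in range(10):
        queue.put({'_id': i})
    queue.put(None)  # one pill per consumer process
    queue.join()     # returns once every item, pill included, is task_done()
    consumer.join()
    assert stored.value == 10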