def scan_and_queue(self, p_queue, p_file):
    """Reads a json file and pushes docs to the queue
        If the file contains a list, each doc is pushed into the queue
        If the file contains a single doc, the whole doc is pushed into the queue

        p_queue:    Queue where items are pushed to
        p_file:     Json file to scan
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
    logger.info('Scanning json in %s', p_file)

    # Each item is put into the queue
    try:
        with open(p_file) as json_file:
            documents = json.load(json_file)
    except Exception as e:
        logger.error("Can't read the file %s", p_file)
        logger.error(e)
        return

    if isinstance(documents, list):
        for doc in documents:
            p_queue.put(doc)
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                    logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))
    else:
        p_queue.put(documents)

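# Illustrative sketch (not part of the library): how the queue contract above is usually
# exercised. A reader pushes each doc onto a multiprocessing.JoinableQueue and a "None"
# poison pill tells the consumer to stop. The names fake_scan_and_queue, consume and
# docs.json below are made up; only the standard library is assumed.
import json
import multiprocessing


def fake_scan_and_queue(p_queue, p_file):
    # Same contract as above: a JSON list yields one queue item per doc,
    # a single JSON object yields one item.
    with open(p_file) as f:
        documents = json.load(f)
    if isinstance(documents, list):
        for doc in documents:
            p_queue.put(doc)
    else:
        p_queue.put(documents)


def consume(p_queue):
    # Drain the queue until the "None" poison pill arrives.
    while True:
        doc = p_queue.get()
        if doc is None:
            p_queue.task_done()
            break
        print(doc)
        p_queue.task_done()


if __name__ == '__main__':
    queue = multiprocessing.JoinableQueue()
    with open('docs.json', 'w') as f:
        json.dump([{"id": 1}, {"id": 2}], f)
    worker = multiprocessing.Process(target=consume, args=(queue,))
    worker.start()
    fake_scan_and_queue(queue, 'docs.json')
    queue.put(None)   # poison pill
    queue.join()
    worker.join()
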
def scan_and_queue(self, p_queue, p_file, p_xpath):
    """Reads an xml file and pushes its documents to the queue

        p_queue:    Queue where items are pushed to
        p_file:     XML file to scan
        p_xpath:    XPATH used to split the document into multiple docs
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
    logger.info('Scanning xml in %s', p_file)

    tree = ET.parse(p_file)
    root = tree.getroot()

    # Each item is put into the queue
    compteur = 0
    if p_xpath:
        nodeList = root.findall(p_xpath)
    else:
        nodeList = [root]

    for foundElem in nodeList:
        compteur = compteur + 1
        try:
            p_queue.put(ET.tostring(foundElem, encoding="us-ascii", method="xml"))
        except Exception as e:
            logger.error(e)

def scan_and_queue(self, p_queue, p_file, p_delimiter=',', p_skip_header=True):
    """Reads a csv file and pushes each line to the queue

        p_queue:        Queue where items are pushed to
        p_file:         CSV file to scan
        p_skip_header:  If True, the first line is skipped
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
    logger.info('Scanning csv in %s', p_file)

    with open(p_file, 'r') as filecursor:
        reader = csv.reader(filecursor, delimiter=p_delimiter)

        # Skip first line ?
        skipline = p_skip_header
        for row in reader:
            if not skipline:
                p_queue.put(row)
                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                        logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))
            else:
                skipline = False

def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m', p_size=100, p_overall_timeout=30, p_nbmax_retry=3):
    """Reads docs from an es index according to a query and pushes them to the queue

        p_queue:        Queue where items are pushed to
        p_scroll_time:  Time for scroll method
        p_timeout:      Timeout - after this period, the scan context is closed
        p_index:        Index where items are picked from
        p_doctype:      DocType of the items
        p_query:        ElasticSearch query for scanning the index
    """
    logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    param = [{'host': self.host, 'port': self.port, 'timeout': p_overall_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
    try:
        if self.proxy is None:
            es = Elasticsearch(param)
        else:
            es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
        es.ping()
        logger_mp.info('Connected to ES Server for reading: {0}'.format(json.dumps(param)))
    except Exception as e:
        logger_mp.error('Connection failed to ES Server for reading: {0}'.format(json.dumps(param)))
        logger_mp.error(e)

    try:
        if not self.scroll_docs:
            if p_doctype is not None:
                self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
            else:
                self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time, index=p_index, timeout=p_timeout)

        start = time.time()
        for doc in self.scroll_docs:
            p_queue.put(doc)

            elapsed = time.time() - start
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                nb_items = self.counters['nb_items_scanned'].value
                self.counters['scan_time'].value += elapsed

                if nb_items % self.counters['log_every'] == 0:
                    logger_mp.info("Scan : {0} items".format(nb_items))
                    logger_mp.debug("   -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))

            # Start timers reinit
            start = time.time()
    except Exception as e:
        logger_mp.error("Error while scanning ES index %s with query %s", p_index, p_query)
        logger_mp.error(e)
        with self.counters['nb_items_error'].get_lock():
            self.counters['nb_items_error'].value += 1

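# Illustrative only: p_query is handed straight to elasticsearch.helpers.scan as the
# request body, so it takes the regular ES query DSL. The index, field names and the
# "esio" / "read_queue" objects mentioned below are made up.
query_all = {"query": {"match_all": {}}}
query_recent = {
    "query": {
        "bool": {
            "filter": [
                {"term": {"status": "published"}},
                {"range": {"updated_at": {"gte": "now-7d/d"}}}
            ]
        }
    }
}
# e.g. esio.scan_and_queue(read_queue, p_index='my_index', p_query=query_recent, p_size=500)
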
def scan_and_queue(self, p_queue, p_query, p_bulksize=1000, p_start=0):
    """Reads docs according to a query and pushes them to the queue

        p_queue:    Queue where items are pushed to
        p_query:    MySQL query for scanning the table
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    connection = pymysql.connect(host=self.host, user=self.user, password=self.password, db=self.base, charset='utf8', cursorclass=pymysql.cursors.DictCursor)

    try:
        offset = p_start
        stop = False

        # delete ";" if set at the end of the query
        query = p_query
        if query.endswith(';'):
            query = query[:-1]

        with connection.cursor() as cursor:
            while not stop:
                paginated_query = "{0} limit {1},{2}".format(query, offset, p_bulksize)
                logger.debug("MySqlIo : Start dealing with records from {0} to {1}".format(offset, p_bulksize + offset))

                try:
                    cursor.execute(paginated_query)
                except pymysql.OperationalError as e:
                    logger.error("MySqlIo : Error while dealing with records from {0} to {1}".format(offset, p_bulksize + offset))
                    logger.error(e)
                    raise e

                if cursor.rowcount:
                    for row in cursor:
                        p_queue.put(row)
                    offset += p_bulksize
                else:
                    stop = True

                logger.debug("MySqlIo : All records from {0} to {1} have been put in the queue".format(offset, p_bulksize + offset))

            cursor.close()
    except Exception as e:
        logger.error("MySqlIo : Stop reading !")
        logger.error(e)
    finally:
        connection.close()

def scan_and_queue(self, p_queue, p_query, p_bulksize=1000, p_start=0):
    """Reads docs according to a query and pushes them to the queue

        p_queue:    Queue where items are pushed to
        p_query:    PostgreSQL query for scanning the table
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    connection_string = "host='{dbhost}' dbname='{dbname}' user='{dbuser}' password='{dbpass}'".format(dbhost=self.host, dbname=self.base, dbuser=self.user, dbpass=self.password)
    connection = psycopg2.connect(connection_string)
    cursor = connection.cursor(cursor_factory=RealDictCursor)

    try:
        offset = p_start
        stop = False

        # delete ";" if set at the end of the query
        query = p_query
        if query.strip().endswith(';'):
            query = query.strip()[:-1]

        while not stop:
            paginated_query = "{0} LIMIT {1} OFFSET {2};".format(query, p_bulksize, offset)
            logger.debug("PostgreSqlIo : Start dealing with records from {0} to {1}".format(offset, p_bulksize + offset))

            try:
                cursor.execute(paginated_query)
            except psycopg2.OperationalError as e:
                logger.error("PostgreSqlIo : Error while dealing with records from {0} to {1}".format(offset, p_bulksize + offset))
                logger.error(e)
                raise e

            if cursor.rowcount:
                for row in cursor:
                    p_queue.put(row)
                offset += p_bulksize
            else:
                stop = True

            logger.debug("PostgreSqlIo : All records from {0} to {1} have been put in the queue".format(offset, p_bulksize + offset))
    except Exception as e:
        logger.error("PostgreSqlIo : Stop reading !")
        logger.error(e)
    finally:
        cursor.close()
        connection.close()

def dequeue_and_store(self, p_queue, p_file, p_delimiter=',', p_quotechar='"', p_quoting=csv.QUOTE_NONNUMERIC):
    """Gets docs from p_queue and stores them in the csv file
        Stops dealing with the queue when receiving a "None" item

        p_queue:    queue which items are picked from. Elements have to be "list".
        p_file:     file to store in
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    # If it does not exist yet, create the file cursor and its csv writer
    if p_file not in self.csvfilecursor:
        self.csvfilecursor[p_file] = open(p_file, "w")
        self.out_csvfile[p_file] = csv.writer(self.csvfilecursor[p_file], delimiter=p_delimiter, quotechar=p_quotechar, quoting=p_quoting, lineterminator=os.linesep)

    # Loop until receiving the "poison pill" item (meaning : no more element to read)
    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill : stop trying to get elements
            if source_doc is None:
                logger.debug("CSVio has received 'poison pill' and is now ending ...")
                poison_pill = True
                self.csvfilecursor[p_file].close()
                p_queue.task_done()
                break

            self.out_csvfile[p_file].writerow(source_doc)

            with self.counters['nb_items_stored'].get_lock():
                self.counters['nb_items_stored'].value += 1
                if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                    logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("CSVio.dequeue_and_store : User interruption of the process")
            self.csvfilecursor[p_file].close()
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger.error("An error occurred while storing elements to CSV : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                poison_pill = True
            p_queue.task_done()

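# Illustrative only: the CSV writer above expects each queued element to be a flat list
# matching the column order, and a final "None" poison pill to flush and close the file.
# The column layout, 'out.csv' and the "csvio" instance named below are hypothetical.
import multiprocessing

out_queue = multiprocessing.JoinableQueue()
out_queue.put([1, "screwdriver", 9.90])
out_queue.put([2, "hammer", 14.50])
out_queue.put(None)   # poison pill : closes the csv file
# csvio.dequeue_and_store(out_queue, 'out.csv', p_delimiter=';')
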
def scan_and_queue(self, p_queue, p_index, p_query={}, p_connect_timeout=1, p_read_timeout=30):
    """Reads docs from an Algolia index according to a query and pushes them to the queue

        p_queue:    Queue where items are pushed to
        p_index:    Index where items are picked from
        p_query:    Query for scanning the index
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    try:
        client = algoliasearch.Client(self.app_id, self.api_key)
        client.timeout = (p_connect_timeout, p_read_timeout)
        index = client.init_index(p_index)
    except Exception as e:
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        documents = index.browse_all(p_query)

        start = time.time()
        for doc in documents:
            p_queue.put(doc)

            elapsed = time.time() - start
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                nb_items = self.counters['nb_items_scanned'].value
                self.counters['scan_time'].value += elapsed

                if nb_items % self.counters['log_every'] == 0:
                    logger.info("Scan : {0} items".format(nb_items))
                    logger.debug("   -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))

            # Start timers reinit
            start = time.time()
    except Exception as e:
        logger.error("Error while scanning Algolia index %s with query %s", p_index, p_query)
        logger.error(e)
        with self.counters['nb_items_error'].get_lock():
            self.counters['nb_items_error'].value += 1

def scan_and_queue(self, p_queue, p_collection, p_query, p_batch_size=100):
    """Reads docs from a collection according to a query and pushes them to the queue

        p_queue:        Queue where items are pushed to
        p_collection:   Collection where items are picked from
        p_query:        MongoDB query for scanning the collection
        p_batch_size:   Number of docs read by iteration
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    # uri for mongo connection
    uri = self._get_mongo_uri()

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connect = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Scan collection according to the query
    documents = mongo_connect[p_collection].find(p_query)
    nb_docs = documents.count()
    logger.info('Scanning %i items in %s', nb_docs, p_collection)

    # Each item is put into the queue
    documents.batch_size(p_batch_size)

    start_time = time.time()
    for doc in documents:
        p_queue.put(doc)
        with self.counters['nb_items_scanned'].get_lock():
            self.counters['nb_items_scanned'].value += 1
            if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))
                # logger.warn('In Queue size : %i', p_queue.qsize())

    time_for_x_items = time.time()
    if nb_docs == 0:
        logger.info("No document to process")
    else:
        logger.info("Average reading time : %fs", (time_for_x_items - start_time) / nb_docs)

def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
    """Gets docs from p_queue and stores them in an Algolia index
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue which items are picked from. Elements have to be "list".
        p_index:        algolia index where to store the docs
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    client = algoliasearch.Client(self.app_id, self.api_key)
    index = client.init_index(p_index)

    # Loop until receiving the "poison pill" item (meaning : no more element to read)
    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0

    start = time.time()
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                start_bulking = time.time()
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        index.add_objects(bulk)
                except Exception as e:
                    logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                    logger.error(e)
                    try_counter += 1
                else:
                    is_indexed = True
                    now = time.time()
                    elapsed_bulking = now - start_bulking
                    elapsed = now - start
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += len(bulk)
                        self.counters['whole_storage_time'].value += elapsed
                        self.counters['bulk_storage_time'].value += elapsed_bulking
                        nb_items = self.counters['nb_items_stored'].value
                        if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                            logger.info("Store : {0} items".format(nb_items))
                            logger.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                            logger.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))
                    start = time.time()

            if not is_indexed:
                start = time.time()
                logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += len(bulk)
        except KeyboardInterrupt:
            logger.info("Algoliaio.dequeue_and_store : User interruption of the process")
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger.error("An error occurred while storing elements to Algolia : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                poison_pill = True
            p_queue.task_done()

def dequeue_and_store(self, p_queue, p_collection, p_upsert=True):
    """Gets docs from p_queue and stores them in a mongo collection
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue which items are picked from. Elements have to be "dict".
        p_collection:   mongo collection where to store the docs
        p_upsert:       if True, new documents are created, if False they are ignored
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    # uri for mongo connection
    uri = self._get_mongo_uri()

    # Connect to mongo
    try:
        mongo_client = MongoClient(uri)
        mongo_connect = mongo_client[self.base]
        logger.info('Connection succeeded on %s', uri)
    except PyMongoError as e:
        logger.error('Failed to connect to %s', uri)
        logger.error(e)

    # Loop until receiving the "poison pill" item (meaning : no more element to read)
    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill
            if source_doc is None:
                poison_pill = True
                p_queue.task_done()
                break

            # management of 'update/set' style request
            try:
                find = source_doc['_mongo_find']
            except KeyError:
                find = {'_id': source_doc['_id']}

            try:
                update = source_doc['_mongo_update']
            except KeyError:
                update = source_doc

            # insert into collection
            try:
                mongo_connect[p_collection].update(find, update, upsert=p_upsert, multi=True if '_mongo_update' in source_doc else False)
            except Exception as e:
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += 1
                logger.error("Document not inserted in Mongo Collection %s", source_doc['_id'])
                logger.error(e)
            else:
                with self.counters['nb_items_stored'].get_lock():
                    self.counters['nb_items_stored'].value += 1
                    if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                        logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("Mongoio.dequeue_and_store : User interruption of the process")
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger.error("An error occurred while storing elements to Mongo : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                poison_pill = True
            p_queue.task_done()

def dequeue_and_store(self, p_queue, p_table, p_id_field="id", p_commit_on_each_document=False):
    """Gets docs from p_queue and stores them in a postgresql database
        Stops dealing with the queue when receiving a "None" item

        p_queue:    queue which items are picked from. Elements have to be "dict"
        p_table:    table to operate
        p_id_field: name of the table id field (default "id")
    """
    logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    try:
        connection_string = "host='{dbhost}' dbname='{dbname}' user='{dbuser}' password='{dbpass}'".format(dbhost=self.host, dbname=self.base, dbuser=self.user, dbpass=self.password)
        connection = psycopg2.connect(connection_string)
        cursor = connection.cursor(cursor_factory=RealDictCursor)
    except psycopg2.Error as e:
        logger.error('Failed to connect to {db}'.format(db=self.base))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    # Loop until receiving the "poison pill" item (meaning : no more element to read)
    poison_pill = False
    while not poison_pill:
        try:
            source_doc = p_queue.get()

            # Manage poison pill
            if source_doc is None:
                poison_pill = True
                p_queue.task_done()
                connection.commit()
                break

            # Manage SQL parameters
            sql_fields = "({0})".format(",".join(source_doc.keys()))
            # sql_values = "{0}".format(",".join(repr(e).strip().replace("'", "''") for e in source_doc.values()))
            # sql_update_fields_values_excluded = ",".join(["{field}=EXCLUDED.{field}".format(field=field) for field in source_doc.keys()])
            sql_update_fields_values = ",".join(["{field}=%s".format(field=field) for field in source_doc.keys() if field != p_id_field])

            try:
                cursor = connection.cursor(cursor_factory=RealDictCursor)

                # Only for V. psql > 9.5
                # sql_p95 = """INSERT INTO {table} {fields}
                #              VALUES ({values})
                #              ON CONFLICT ({id_field}) DO UPDATE SET {update_fields_values};""".format(
                #     table=p_table,
                #     fields=sql_fields,
                #     values=sql_values,
                #     id_field=p_id_field,
                #     update_fields_values=sql_update_fields_values_excluded)

                insert_sql = "INSERT INTO {table} {fields} SELECT {values}".format(table=p_table, fields=sql_fields, values=('%s,' * len(source_doc.values()))[:-1])
                update_sql = "UPDATE {table} SET {update_fields_values} WHERE {id_field} = {id_value}".format(table=p_table, update_fields_values=sql_update_fields_values, id_field=p_id_field, id_value=source_doc[p_id_field])
                sql = """
                    WITH upsert AS ({update_sql} RETURNING *)
                    {insert_sql} WHERE NOT EXISTS (SELECT * FROM upsert);
                """.format(update_sql=update_sql, insert_sql=insert_sql)

                parameters = [source_doc[key] for key in source_doc.keys() if key != p_id_field] + [source_doc[key] for key in source_doc.keys()]
                cursor.execute(sql, parameters)

                if p_commit_on_each_document:
                    connection.commit()
            except psycopg2.Error as e:
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += 1
                logger.error("Document not inserted in PostgreSQL Database %s", source_doc)
                logger.error(e)
            else:
                with self.counters['nb_items_stored'].get_lock():
                    self.counters['nb_items_stored'].value += 1
                    if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                        logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

            p_queue.task_done()
        except KeyboardInterrupt:
            logger.info("Postgresqlio.dequeue_and_store : User interruption of the process")
            sys.exit(EXIT_USER_INTERRUPT)

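# Illustrative sketch (not part of the class): rebuilds the same SQL strings as above for
# a hypothetical doc, to make the pre-9.5 "UPDATE ... RETURNING / INSERT ... WHERE NOT
# EXISTS" upsert visible. The table and field names are made up.
def build_upsert(p_table, p_id_field, source_doc):
    sql_fields = "({0})".format(",".join(source_doc.keys()))
    sql_update_fields_values = ",".join(["{field}=%s".format(field=field) for field in source_doc.keys() if field != p_id_field])
    insert_sql = "INSERT INTO {table} {fields} SELECT {values}".format(table=p_table, fields=sql_fields, values=('%s,' * len(source_doc.values()))[:-1])
    update_sql = "UPDATE {table} SET {update_fields_values} WHERE {id_field} = {id_value}".format(table=p_table, update_fields_values=sql_update_fields_values, id_field=p_id_field, id_value=source_doc[p_id_field])
    sql = "WITH upsert AS ({update_sql} RETURNING *) {insert_sql} WHERE NOT EXISTS (SELECT * FROM upsert);".format(update_sql=update_sql, insert_sql=insert_sql)
    parameters = [source_doc[key] for key in source_doc.keys() if key != p_id_field] + [source_doc[key] for key in source_doc.keys()]
    return sql, parameters


if __name__ == '__main__':
    sql, parameters = build_upsert('products', 'id', {"id": 42, "name": "hammer", "price": 14.5})
    print(sql)
    # WITH upsert AS (UPDATE products SET name=%s,price=%s WHERE id = 42 RETURNING *)
    # INSERT INTO products (id,name,price) SELECT %s,%s,%s WHERE NOT EXISTS (SELECT * FROM upsert);
    print(parameters)
    # ['hammer', 14.5, 42, 'hammer', 14.5]
    # The UPDATE runs first; the INSERT only fires when no row was updated.
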
def dequeue_and_store(self, p_queue, p_index, p_timeout=10, p_nbmax_retry=3, p_disable_indexing=False):
    """Gets docs from p_queue and stores them in an elasticsearch index
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue which items are picked from. Elements have to be "dict".
        p_index:        elasticsearch index where to store the docs
        p_timeout:      timeout for bulk (default is 10s)
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    es = None
    param = [{'host': self.host, 'port': self.port, 'timeout': p_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
    try:
        if self.proxy is None:
            es = Elasticsearch(param)
        else:
            es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
        es.ping()
        logger_mp.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger_mp.error('Connection failed to ES Server : %s', json.dumps(param))
        logger_mp.error(e)

    # We need to record the previous settings, so as to apply them again after bulk operations
    current_settings = {}
    try:
        current_settings = es.indices.get_settings(index=p_index)
    except Exception as e:
        logger_mp.error("Can't read current settings of index %s", p_index)
        logger_mp.error(e)

    if p_disable_indexing:
        try:
            self._disable_indexing_and_replicat(logger_mp, es, p_index)
        except Exception as e:
            logger_mp.error("Can't disable indexing and replicat on {}".format(p_index))
            logger_mp.error(e)

    # Loop until receiving the "poison pill" item (meaning : no more element to read)
    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0

    start = time.time()
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                # Bulk element creation from the source_doc
                source_doc['_index'] = p_index
                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                start_bulking = time.time()
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        helpers.bulk(es, bulk, raise_on_error=True)
                except Exception as e:
                    logger_mp.error("Bulk not indexed in ES - Retry n°{0}".format(try_counter))
                    logger_mp.error(e)
                    try_counter += 1
                else:
                    is_indexed = True
                    now = time.time()
                    elapsed_bulking = now - start_bulking
                    elapsed = now - start
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += len(bulk)
                        self.counters['whole_storage_time'].value += elapsed
                        self.counters['bulk_storage_time'].value += elapsed_bulking
                        nb_items = self.counters['nb_items_stored'].value
                        if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                            logger_mp.info("Store : {0} items".format(nb_items))
                            logger_mp.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                            logger_mp.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))
                    start = time.time()

            if not is_indexed:
                start = time.time()
                logger_mp.error("Bulk not indexed in elasticsearch : operation aborted after %i retries", try_counter - 1)
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += len(bulk)
        except KeyboardInterrupt:
            logger_mp.info("ESio.dequeue_and_store : User interruption of the process")
            # If indexing has been disabled, enable it again
            if p_disable_indexing:
                self._enable_indexing_and_replicat(logger_mp, es, p_index, current_settings)
            poison_pill = True
            p_queue.task_done()
        except Exception as e:
            logger_mp.error("An error occurred while storing elements to ES : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                poison_pill = True
            p_queue.task_done()

    # If indexing has been disabled, enable it again
    if p_disable_indexing:
        try:
            self._enable_indexing_and_replicat(logger_mp, es, p_index, current_settings)
        except Exception as e:
            logger_mp.error("Can't enable indexing and replicat again on {} from previous settings {}".format(p_index, current_settings))
            logger_mp.error(e)

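# The _disable_indexing_and_replicat / _enable_indexing_and_replicat helpers are not shown
# in this section. Below is a plausible sketch of what they could do, assuming the
# elasticsearch-py client used above (this is an assumption, not the library's actual
# code): bulk loads go faster with refresh disabled and replicas at 0, and the settings
# captured earlier with es.indices.get_settings() are restored afterwards.
def _disable_indexing_and_replicat(self, logger, es, p_index):
    # Assumed sketch : switch off refresh and replication before massive bulk writes
    es.indices.put_settings(index=p_index, body={"index": {"refresh_interval": "-1", "number_of_replicas": 0}})
    logger.info("Indexing refresh and replicas disabled on %s", p_index)


def _enable_indexing_and_replicat(self, logger, es, p_index, p_previous_settings):
    # Assumed sketch : restore the values saved before the bulk, defaulting to 1s / 1 replica
    index_settings = p_previous_settings.get(p_index, {}).get('settings', {}).get('index', {})
    refresh = index_settings.get('refresh_interval', '1s')
    replicas = index_settings.get('number_of_replicas', 1)
    es.indices.put_settings(index=p_index, body={"index": {"refresh_interval": refresh, "number_of_replicas": replicas}})
    logger.info("Indexing refresh and replicas restored on %s", p_index)
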
def dequeue_and_store(self, p_queue, p_index, p_timeout=10, p_nbmax_retry=3, p_disable_indexing=False):
    """Gets docs from p_queue and stores them in an elasticsearch index
        Stops dealing with the queue when receiving a "None" item

        p_queue:        queue which items are picked from. Elements have to be "dict".
        p_index:        elasticsearch index where to store the docs
        p_timeout:      timeout for bulk (default is 10s)
        p_nbmax_retry:  number of tries when failing on a request (default is 3)
    """
    logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

    try:
        param = [{'host': self.host, 'port': self.port, 'timeout': p_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
        es = Elasticsearch(param)
        logger_mp.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger_mp.error('Connection failed to ES Server : %s', json.dumps(param))
        logger_mp.error(e)
        sys.exit(EXIT_IO_ERROR)

    if p_disable_indexing:
        self._set_indexing_refresh(logger_mp, es, p_index, "-1")

    # Loop until receiving the "poison pill" item (meaning : no more element to read)
    start = time.time()
    poison_pill = False
    while not poison_pill:
        try:
            bulk = []
            while len(bulk) < self.bulk_size:
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                # Bulk element creation from the source_doc
                source_doc['_index'] = p_index
                bulk.append(source_doc)
                p_queue.task_done()

            try_counter = 1
            is_indexed = False
            while try_counter <= p_nbmax_retry and not is_indexed:
                start_bulking = time.time()
                try:
                    # Bulk indexation
                    if len(bulk) > 0:
                        helpers.bulk(es, bulk, raise_on_error=True)
                except Exception as e:
                    logger_mp.error("Bulk not indexed in ES - Retry n°{0}".format(try_counter))
                    logger_mp.error(e)
                    try_counter += 1
                else:
                    is_indexed = True
                    now = time.time()
                    elapsed_bulking = now - start_bulking
                    elapsed = now - start
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += len(bulk)
                        self.counters['whole_storage_time'].value += elapsed
                        self.counters['bulk_storage_time'].value += elapsed_bulking
                        nb_items = self.counters['nb_items_stored'].value
                        if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                            logger_mp.info("Store : {0} items".format(nb_items))
                            logger_mp.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                            logger_mp.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))
                    start = time.time()

            if not is_indexed:
                start = time.time()
                logger_mp.error("Bulk not indexed in elasticsearch : operation aborted after %i retries", try_counter - 1)
                with self.counters['nb_items_error'].get_lock():
                    self.counters['nb_items_error'].value += len(bulk)
        except KeyboardInterrupt:
            logger_mp.info("ESio.dequeue_and_store : User interruption of the process")
            # If indexing has been disabled, enable it again
            if p_disable_indexing:
                self._set_indexing_refresh(logger_mp, es, p_index, "1s")
            sys.exit(EXIT_USER_INTERRUPT)

    # If indexing has been disabled, enable it again
    if p_disable_indexing:
        self._set_indexing_refresh(logger_mp, es, p_index, "1s")

def get_and_parse(p_inqueue, p_outqueue, p_process, p_counters, p_log_queue, p_log_level, p_formatter, **kwargs):
    """Gets a doc from the input queue, applies the transformation defined by the p_process function,
        then pushes the produced docs into the output queue
        p_process must take a "doc" as its first parameter

        @param p_inqueue    In queue containing docs to process
        @param p_outqueue   Out queue where processed docs are pushed
        @param p_process    function taking a doc as input and returning a list of docs as a result
        @param p_counters   shared counters of the pipeline (nb_items_processed, timers, log_every, ...)
    """
    logger = get_logger_mp(__name__, p_log_queue, p_log_level, p_formatter)

    start = time.time()
    start_idle = None

    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0

    queue_get_timeout = 60
    while True:
        try:
            try:
                in_doc = p_inqueue.get(block=True, timeout=queue_get_timeout)
            except Empty:
                # Idle starts with the first exception (queue empty)
                logger.debug("No doc in queue in the last {}s".format(queue_get_timeout))
                if not start_idle:
                    start_idle = time.time()
            else:
                if start_idle:
                    elapsed_idle = time.time() - start_idle
                else:
                    elapsed_idle = 0

                # Manage poison pill
                if in_doc is None:
                    p_inqueue.task_done()
                    break

                # Call the proc with the arg list (keeping the * means : unwrap the list when calling the function)
                start_p_process = time.time()
                out_doc = p_process(in_doc, **kwargs)
                elapsed_p_process = time.time() - start_p_process

                for doc in out_doc:
                    p_outqueue.put(doc)

                p_inqueue.task_done()

                with p_counters['nb_items_processed'].get_lock():
                    p_counters['nb_items_processed'].value += 1
                    now = time.time()
                    elapsed = now - start
                    p_counters['whole_process_time'].value += elapsed
                    p_counters['real_process_time'].value += elapsed_p_process
                    p_counters['idle_process_time'].value += elapsed_idle

                    nb_items = p_counters['nb_items_processed'].value
                    if p_counters['nb_items_processed'].value % p_counters['log_every'] == 0:
                        logger.info("Process : {0} items".format(nb_items))
                        logger.debug("   -> Avg process time : {0}ms".format(1000 * p_counters['whole_process_time'].value / nb_items))
                        logger.debug("   -> Avg real time    : {0}ms".format(1000 * p_counters['real_process_time'].value / nb_items))
                        logger.debug("   -> Avg idle time    : {0}ms".format(1000 * p_counters['idle_process_time'].value / nb_items))
                        logger.debug("State of queues :")
                        logger.debug("   -> Read  : {0}".format(p_inqueue.qsize()))
                        logger.debug("   -> Write : {0}".format(p_outqueue.qsize()))

                # Start timers reinit
                start = time.time()
                start_idle = None
        except TimeoutError:
            logger.warn('Timeout exception while parsing with %s method', p_process)
            with p_counters['nb_items_error'].get_lock():
                p_counters['nb_items_error'].value += 1
        except KeyboardInterrupt:
            logger.info("user interruption")
            p_inqueue.task_done()
            break
        except Exception as e:
            logger.error("An error occurred while processing elements : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while processing. Process interrupted after {0} errors".format(main_loop_retry))
                p_inqueue.task_done()
                break

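# Illustrative sketch: a p_process callable only has to take a doc as its first argument
# (extra keyword args come from **kwargs) and return a LIST of docs, since get_and_parse
# iterates over the result before pushing to the output queue. Returning an empty list
# drops the doc; returning several docs splits it. The function name and fields below
# are made up.
def uppercase_titles(doc, fields=('title',)):
    # hypothetical transformation : uppercase some fields of the doc
    for field in fields:
        if field in doc:
            doc[field] = str(doc[field]).upper()
    return [doc]   # always a list


# Hedged wiring sketch (process arguments and counter layout assumed from the code above) :
#
#   p = multiprocessing.Process(
#       target=get_and_parse,
#       args=(read_queue, write_queue, uppercase_titles, counters,
#             log_queue, logging.INFO, formatter),
#       kwargs={'fields': ('title', 'summary')})
#   p.start()
#
# where "counters" holds the shared multiprocessing.Value objects used above
# ('nb_items_processed', 'whole_process_time', ...) plus the int 'log_every'.
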