Example #1
    def scan_and_queue(self, p_queue, p_file):
        """ Reads json file and pushes docs to the queue
            If the file contains a list, each doc is pushed in the queue
            If the file contains a doc, the whole doc is pushed in the queue

            p_queue:         Queue where items are pushed to
            p_file:            Json File to scan
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        logger.info('Scanning json in %s', p_file)

        # Each item is put into the queue
        try:
            with open(p_file) as json_file:
                documents = json.load(json_file)
        except Exception as e:
            logger.error("Can't read the file %s", p_file)
            logger.error(e)
            return

        if isinstance(documents, list):
            for doc in documents:
                p_queue.put(doc)
                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                        logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))
        else:
            p_queue.put(documents)
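
As a rough illustration of the two input shapes this method accepts (file names and contents below are invented, not taken from the project), a list yields one queue item per element while a single object yields one item:

    import json

    # A list: scan_and_queue would put {'id': 1} and {'id': 2} in the queue separately.
    with open('list.json', 'w') as f:
        json.dump([{'id': 1}, {'id': 2}], f)

    # A single object: scan_and_queue would put the whole document as one item.
    with open('doc.json', 'w') as f:
        json.dump({'id': 3}, f)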
Example #2
    def scan_and_queue(self, p_queue, p_file, p_xpath):
        """Reads an xml file and pushes docs to the queue

            p_queue:         Queue where items are pushed to
            p_file:            XML File to scan
            p_xpath:        XPATH used to split document into multiple docs
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        logger.info('Scanning xml in %s', p_file)

        tree = ET.parse(p_file)
        root = tree.getroot()

        # Each item is put into the queue
        compteur = 0

        if p_xpath:
            nodeList = root.findall(p_xpath)
        else:
            nodeList = [root]

        for foundElem in nodeList:
            compteur = compteur + 1
            try:
                p_queue.put(ET.tostring(foundElem, encoding="us-ascii", method="xml"))
            except Exception as e:
                logger.error(e)
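
Each queued item is the node serialized by ET.tostring, so a consumer would typically rebuild the Element before using it; a minimal standalone sketch (the sample payload is invented):

    import xml.etree.ElementTree as ET

    serialized = b'<item><id>1</id></item>'   # shape of an item taken from the queue
    elem = ET.fromstring(serialized)          # parse it back into an Element
    print(elem.findtext('id'))                # -> 1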
Example #3
    def scan_and_queue(self, p_queue, p_file, p_xpath):
        """Reads an xml file and pushes docs to the queue

            p_queue:         Queue where items are pushed to
            p_file:            XML File to scan
            p_xpath:        XPATH used to split document into multiple docs
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        logger.info('Scanning xml in %s', p_file)

        tree = ET.parse(p_file)
        root = tree.getroot()

        # Each item is put into the queue
        compteur = 0

        if p_xpath:
            nodeList = root.findall(p_xpath)
        else:
            nodeList = [root]

        for foundElem in nodeList:
            compteur = compteur + 1
            try:
                p_queue.put(
                    ET.tostring(foundElem, encoding="us-ascii", method="xml"))
            except Exception as e:
                logger.error(e)
Example #4
    def scan_and_queue(self,
                       p_queue,
                       p_file,
                       p_delimiter=',',
                       p_skip_header=True):
        """Reads csv file and pushes each line to the queue

            p_queue:    Queue where items are pushed to
            p_file:     CSV File to scan
            p_skip_header: if True, the first line is skipped
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        logger.info('Scanning csv in %s', p_file)

        filecursor = open(p_file, 'r')
        reader = csv.reader(filecursor, delimiter=p_delimiter)

        # Skip first line ?
        skipline = p_skip_header
        for row in reader:
            if not skipline:
                p_queue.put(row)
                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    if self.counters['nb_items_scanned'].value % self.counters[
                            'log_every'] == 0:
                        logger.info(
                            "Scan in progress : {0} items read from source".
                            format(self.counters['nb_items_scanned'].value))
            else:
                skipline = False
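
Each queued item is a plain list of strings, one per CSV column, as produced by csv.reader; a small standalone illustration (the sample data and delimiter are made up):

    import csv
    import io

    sample = io.StringIO('id;name\n1;alpha\n2;beta\n')
    reader = csv.reader(sample, delimiter=';')
    next(reader)              # header row, dropped when p_skip_header is True
    for row in reader:
        print(row)            # e.g. ['1', 'alpha'] -- this is what p_queue.put receives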
Example #5
    def scan_and_queue(self, p_queue, p_file):
        """ Reads json file and pushes docs to the queue
            If the file contains a list, each doc is pushed in the queue
            If the file contains a doc, the whole doc is pushed in the queue

            p_queue:         Queue where items are pushed to
            p_file:            Json File to scan
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        logger.info('Scanning json in %s', p_file)

        # Each item is put into the queue
        try:
            with open(p_file) as json_file:
                documents = json.load(json_file)
        except Exception as e:
            logger.error("Can't read the file %s", p_file)
            logger.error(e)
            return

        if isinstance(documents, list):
            for doc in documents:
                p_queue.put(doc)
                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    if self.counters['nb_items_scanned'].value % self.counters[
                            'log_every'] == 0:
                        logger.info(
                            "Scan in progress : {0} items read from source".
                            format(self.counters['nb_items_scanned'].value))
        else:
            p_queue.put(documents)
Example #6
    def dequeue_and_store(self,
                          p_queue,
                          p_file,
                          p_delimiter=',',
                          p_quotechar='"',
                          p_quoting=csv.QUOTE_NONNUMERIC):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            p_queue:    queue which items are picked from. Elements have to be lists.
            p_file:     file to store in
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        # Create the file cursor and csv writer if they do not exist yet
        if p_file not in self.csvfilecursor:
            self.csvfilecursor[p_file] = open(p_file, "w")
            self.out_csvfile[p_file] = csv.writer(self.csvfilecursor[p_file],
                                                  delimiter=p_delimiter,
                                                  quotechar=p_quotechar,
                                                  quoting=p_quoting,
                                                  lineterminator=os.linesep)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        poison_pill = False
        while not (poison_pill):
            try:
                source_doc = p_queue.get()

                # Manage poison pill : stop trying to get elements
                if source_doc is None:
                    logger.debug(
                        "CSVio has received 'poison pill' and is now ending ..."
                    )
                    poison_pill = True
                    self.csvfilecursor[p_file].close()
                    p_queue.task_done()
                    break

                self.out_csvfile[p_file].writerow(source_doc)
                with self.counters['nb_items_stored'].get_lock():
                    self.counters['nb_items_stored'].value += 1
                    if self.counters['nb_items_stored'].value % self.counters[
                            'log_every'] == 0:
                        logger.info(
                            "Storage in progress : {0} items written to target"
                            .format(self.counters['nb_items_stored'].value))

                p_queue.task_done()
            except KeyboardInterrupt:
                logger.info(
                    "CSVio.dequeue_and_store : User interruption of the process"
                )
                self.csvfilecursor[p_file].close()
                sys.exit(EXIT_USER_INTERRUPT)
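
The scan_and_queue and dequeue_and_store methods are meant to run on opposite ends of a multiprocessing queue, with a None poison pill marking the end of the stream. A minimal standalone sketch of that wiring, assuming nothing beyond the standard library (the print call stands in for the real storage step):

    import multiprocessing

    def consume(q):
        # Mirrors dequeue_and_store: loop until the None poison pill arrives.
        while True:
            item = q.get()
            if item is None:
                q.task_done()
                break
            print(item)       # stand-in for writing the item to the target
            q.task_done()

    if __name__ == '__main__':
        q = multiprocessing.JoinableQueue()
        worker = multiprocessing.Process(target=consume, args=(q,))
        worker.start()
        for doc in [['1', 'alpha'], ['2', 'beta']]:   # stand-in for scan_and_queue output
            q.put(doc)
        q.put(None)           # poison pill: tells the consumer to stop
        q.join()
        worker.join()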
Example #7
    def scan_and_queue(self, p_queue, p_index, p_query={}, p_doctype=None, p_scroll_time='5m', p_timeout='1m', p_size=100, p_overall_timeout=30, p_nbmax_retry=3):
        """Reads docs from an es index according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_scroll_time:    Time for scroll method
            p_timeout:        Timeout - After this period, scan context is closed
            p_index:        Index where items are picked from
            p_doctype:        DocType of the items
            p_query:        ElasticSearch query for scanning the index
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        try:
            param = [{'host': self.host, 'port': self.port, 'timeout': p_overall_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
            if self.proxy is None:
                es = Elasticsearch(param)
            else:
                es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
            es.ping()
            logger_mp.info('Connected to ES Server for reading: {0}'.format(json.dumps(param)))
        except Exception as e:
            logger_mp.error('Connection failed to ES Server for reading: {0}'.format(json.dumps(param)))
            logger_mp.error(e)

        try:
            if not self.scroll_docs:
                if p_doctype is not None:
                    self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
                else:
                    self.scroll_docs = helpers.scan(client=es, query=p_query, size=p_size, scroll=p_scroll_time, index=p_index, timeout=p_timeout)

            start = time.time()
            for doc in self.scroll_docs:
                p_queue.put(doc)

                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger_mp.info("Scan : {0} items".format(nb_items))
                        logger_mp.debug("   -> Avg scan time : {0}ms".format(1000 * self.counters['scan_time'].value / nb_items))

                    # Start timers reinit
                    start = time.time()

        except Exception as e:
            logger_mp.error("Error while scanning ES index %s with query %s", p_index, p_query)
            logger_mp.error(e)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
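
p_query takes a standard Elasticsearch query body; an illustrative value (field name invented) that would restrict the scan to published documents:

    p_query = {
        'query': {
            'bool': {
                'filter': [{'term': {'status': 'published'}}]
            }
        }
    }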
Example #8
    def scan_and_queue(self, p_queue, p_query, p_bulksize=1000, p_start=0):
        """Reads docs according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_query:        MySQL query used to scan the table
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        connection = pymysql.connect(host=self.host,
                                     user=self.user,
                                     password=self.password,
                                     db=self.base,
                                     charset='utf8',
                                     cursorclass=pymysql.cursors.DictCursor)

        try:
            offset = p_start
            stop = False
            # delete ";" if set at the end of the query
            query = p_query
            if query.endswith(';'):
                query = query[:-1]
            with connection.cursor() as cursor:
                while not stop:
                    paginated_query = "{0} limit {1},{2}".format(
                        query, offset, p_bulksize)
                    logger.debug(
                        "MySqlIo : Start dealing with records from {0} to {1}".
                        format(offset, p_bulksize + offset))
                    try:
                        cursor.execute(paginated_query)
                    except pymysql.OperationalError as e:
                        logger.error(
                            "MySqlIo : Error while dealing with records from {0} to {1}"
                            .format(offset, p_bulksize + offset))
                        logger.error(e)
                        raise e
                    if cursor.rowcount:
                        for row in cursor:
                            p_queue.put(row)
                        offset += p_bulksize
                    else:
                        stop = True
                    logger.debug(
                        "MySqlIo : All records from {0} to {1} have been put in the queue"
                        .format(offset, p_bulksize + offset))
                cursor.close()
        except Exception as e:
            logger.error("MySqlIo : Stop reading !")
            logger.error(e)
        finally:
            connection.close()
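
Pagination is done by appending a MySQL LIMIT offset,row_count clause to the user query; a quick illustration of the statements generated for a hypothetical query and the default bulk size:

    p_query, p_bulksize = 'SELECT id, name FROM users', 1000   # invented query
    for offset in (0, 1000, 2000):
        print('{0} limit {1},{2}'.format(p_query, offset, p_bulksize))
    # SELECT id, name FROM users limit 0,1000
    # SELECT id, name FROM users limit 1000,1000
    # SELECT id, name FROM users limit 2000,1000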
Example #9
    def scan_and_queue(self, p_queue, p_query, p_bulksize=1000, p_start=0):
        """Reads docs according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_query:         PostgreSQL query for scanning the collection
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        connection_string = "host='{dbhost}' dbname='{dbname}' user='{dbuser}' password='{dbpass}'".format(
            dbhost=self.host,
            dbname=self.base,
            dbuser=self.user,
            dbpass=self.password)
        connection = psycopg2.connect(connection_string)
        cursor = connection.cursor(cursor_factory=RealDictCursor)

        try:
            offset = p_start
            stop = False
            # delete ";" if set at the end of the query
            query = p_query
            if query.strip().endswith(';'):
                query = query.strip()[:-1]
            while not stop:
                paginated_query = "{0} LIMIT {1} OFFSET {2};".format(
                    query, p_bulksize, offset)
                logger.debug(
                    "PostgreSqlIo : Start dealing with records from {0} to {1}"
                    .format(offset, p_bulksize + offset))
                try:
                    cursor.execute(paginated_query)
                except psycopg2.OperationalError as e:
                    logger.error(
                        "PostgreSqlIo : Error while dealing with records from {0} to {1}"
                        .format(offset, p_bulksize + offset))
                    logger.error(e)
                    raise e
                if cursor.rowcount:
                    for row in cursor:
                        p_queue.put(row)
                    offset += p_bulksize
                else:
                    stop = True
                logger.debug(
                    "PostgreSqlIo : All records from {0} to {1} have been put in the queue"
                    .format(offset, p_bulksize + offset))
        except Exception as e:
            logger.error("PostgreSqlIo : Stop reading !")
            logger.error(e)
        finally:
            cursor.close()
            connection.close()
Example #10
    def dequeue_and_store(self, p_queue, p_file, p_delimiter=',', p_quotechar='"', p_quoting=csv.QUOTE_NONNUMERIC):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            p_queue:    queue which items are picked from. Elements have to be lists.
            p_file:     file to store in
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        # Create the file cursor and csv writer if they do not exist yet
        if p_file not in self.csvfilecursor:
            self.csvfilecursor[p_file] = open(p_file, "w")
            self.out_csvfile[p_file] = csv.writer(self.csvfilecursor[p_file], delimiter=p_delimiter, quotechar=p_quotechar, quoting=p_quoting, lineterminator=os.linesep)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        poison_pill = False
        while not(poison_pill):
            try:
                source_doc = p_queue.get()

                # Manage poison pill : stop trying to get elements
                if source_doc is None:
                    logger.debug("CSVio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    self.csvfilecursor[p_file].close()
                    p_queue.task_done()
                    break

                self.out_csvfile[p_file].writerow(source_doc)
                with self.counters['nb_items_stored'].get_lock():
                    self.counters['nb_items_stored'].value += 1
                    if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                        logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

                p_queue.task_done()
            except KeyboardInterrupt:
                logger.info("CSVio.dequeue_and_store : User interruption of the process")
                self.csvfilecursor[p_file].close()
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error("An error occurred while storing elements to CSV : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()
Example #11
    def scan_and_queue(self, p_queue, p_collection, p_query, p_batch_size=100):
        """Reads docs from a collection according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_collection:    Collection where items are picked from
            p_query:        MongoDB query for scanning the collection
            p_batch_size:   Number of read docs by iteration
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        # Scan collection according to the query
        documents = mongo_connect[p_collection].find(p_query)
        nb_docs = documents.count()
        logger.info('Scanning %i items in %s', nb_docs, p_collection)

        # Each item is put into the queue
        documents.batch_size(p_batch_size)

        start_time = time.time()
        for doc in documents:
            p_queue.put(doc)
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                if self.counters['nb_items_scanned'].value % self.counters[
                        'log_every'] == 0:
                    logger.info(
                        "Scan in progress : {0} items read from source".format(
                            self.counters['nb_items_scanned'].value))

            # logger.warn('In Queue size : %i',p_queue.qsize())
        time_for_x_items = time.time()

        if nb_docs == 0:
            logger.info("No document to process")
        else:
            logger.info("Average reading time : %fs",
                        (time_for_x_items - start_time) / nb_docs)
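
p_query is an ordinary MongoDB filter document; for instance (field name invented), scanning only the active documents of the collection:

    p_query = {'status': 'active'}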
Example #12
    def scan_and_queue(self,
                       p_queue,
                       p_index,
                       p_query={},
                       p_connect_timeout=1,
                       p_read_timeout=30):
        """Reads docs from an Algolia index according to a query and pushes them to the queue

            p_queue:        Queue where items are pushed to
            p_index:        Index where items are picked from
            p_query:        query for scanning the index
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            client.timeout = (p_connect_timeout, p_read_timeout)
            index = client.init_index(p_index)
        except Exception as e:
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            documents = index.browse_all(p_query)
            start = time.time()
            for doc in documents:
                p_queue.put(doc)
                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger.info("Scan : {0} items".format(nb_items))
                        logger.debug("   -> Avg scan time : {0}ms".format(
                            1000 * self.counters['scan_time'].value /
                            nb_items))

                    # Start timers reinit
                    start = time.time()
        except Exception as e:
            logger.error("Error while scanning Algolia index %s with query %s",
                         p_index, p_query)
            logger.error(e)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Example #13
    def scan_and_queue(self, p_queue, p_collection, p_query, p_batch_size=100):
        """Reads docs from a collection according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_collection:    Collection where items are picked from
            p_query:        MongoDB query for scanning the collection
            p_batch_size:   Number of read docs by iteration
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        # Scan collection according to the query
        documents = mongo_connect[p_collection].find(p_query)
        nb_docs = documents.count()
        logger.info('Scanning %i items in %s', nb_docs, p_collection)

        # Each item is put into the queue
        documents.batch_size(p_batch_size)

        start_time = time.time()
        for doc in documents:
            p_queue.put(doc)
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                    logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))

            # logger.warn('In Queue size : %i',p_queue.qsize())
        time_for_x_items = time.time()

        if nb_docs == 0:
            logger.info("No document to process")
        else:
            logger.info("Average reading time : %fs", (time_for_x_items - start_time)/nb_docs)
Example #14
    def scan_and_queue(self, p_queue, p_query, p_bulksize=1000, p_start=0):
        """Reads docs according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_query:        MySQL query used to scan the table
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        connection = pymysql.connect(host=self.host,
                                     user=self.user,
                                     password=self.password,
                                     db=self.base,
                                     charset='utf8',
                                     cursorclass=pymysql.cursors.DictCursor)

        try:
            offset = p_start
            stop = False
            # delete ";" if set at the end of the query
            query = p_query
            if query.endswith(';'):
                query = query[:-1]
            with connection.cursor() as cursor:
                while not stop:
                    paginated_query = "{0} limit {1},{2}".format(query, offset, p_bulksize)
                    logger.debug("MySqlIo : Start dealing with records from {0} to {1}".format(offset, p_bulksize + offset))
                    try:
                        cursor.execute(paginated_query)
                    except pymysql.OperationalError as e:
                        logger.error("MySqlIo : Error while dealing with records from {0} to {1}".format(offset, p_bulksize + offset))
                        logger.error(e)
                        raise e
                    if cursor.rowcount:
                        for row in cursor:
                            p_queue.put(row)
                        offset += p_bulksize
                    else:
                        stop = True
                    logger.debug("MySqlIo : All records from {0} to {1} have been put in the queue".format(offset, p_bulksize + offset))
                cursor.close()
        except Exception as e:
            logger.error("MySqlIo : Stop reading !")
            logger.error(e)
        finally:
            connection.close()
Example #15
    def scan_and_queue(self, p_queue, p_index, p_query={}, p_connect_timeout=1, p_read_timeout=30):
        """Reads docs from an Algolia index according to a query and pushes them to the queue

            p_queue:        Queue where items are pushed to
            p_index:        Index where items are picked from
            p_query:        query for scanning the index
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            client.timeout = (p_connect_timeout, p_read_timeout)
            index = client.init_index(p_index)
        except Exception as e:
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            documents = index.browse_all(p_query)
            start = time.time()
            for doc in documents:
                p_queue.put(doc)
                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger.info("Scan : {0} items".format(nb_items))
                        logger.debug("   -> Avg scan time : {0}ms".format(1000*self.counters['scan_time'].value / nb_items))

                    # Start timers reinit
                    start = time.time()
        except Exception as e:
            logger.error("Error while scanning Algolia index %s with query %s", p_index, p_query)
            logger.error(e)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Example #16
    def scan_and_queue(self, p_queue, p_file, p_delimiter=',', p_skip_header=True):
        """Reads csv file and pushes each line to the queue

            p_queue:    Queue where items are pushed to
            p_file:     CSV File to scan
            p_skip_header: if True, the first line is skipped
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        logger.info('Scanning csv in %s', p_file)

        filecursor = open(p_file, 'r')
        reader = csv.reader(filecursor, delimiter=p_delimiter)

        # Skip first line ?
        skipline = p_skip_header
        for row in reader:
            if not skipline:
                p_queue.put(row)
                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                        logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))
            else:
                skipline = False
Example #17
    def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in an Algolia index
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue which items are picked from. Elements have to be dicts.
            p_index:            algolia index where to store the docs
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        start = time.time()
        poison_pill = False
        while not(poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()
                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            index.add_objects(bulk)
                    except Exception as e:
                        logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters['whole_storage_time'].value += elapsed
                            self.counters['bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                                logger.info("Store : {0} items".format(nb_items))
                                logger.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                                logger.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error("An error occurred while storing elements to Algolia : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()
Example #18
    def dequeue_and_store(self, p_queue, p_collection, p_upsert=True):
        """Gets docs from p_queue and stores them in a mongo collection
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue which items are picked from. Elements have to be dicts.
            p_collection:        mongo collection where to store the docs;
            p_upsert:            if true, new documents are created, if false they are ignored
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        poison_pill = False

        while not (poison_pill):
            try:

                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                # management of 'update/set' style request
                try:
                    find = source_doc['_mongo_find']
                except KeyError:
                    find = {'_id': source_doc['_id']}

                try:
                    update = source_doc['_mongo_update']
                except KeyError:
                    update = source_doc

                # insert into collection
                try:
                    mongo_connect[p_collection].update(
                        find,
                        update,
                        upsert=p_upsert,
                        multi=True if '_mongo_update' in source_doc else False)
                except Exception as e:
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += 1
                    logger.error(
                        "Document not inserted in Mongo Collection %s",
                        source_doc['_id'])
                    logger.error(e)
                else:
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += 1
                        if self.counters[
                                'nb_items_stored'].value % self.counters[
                                    'log_every'] == 0:
                            logger.info(
                                "Storage in progress : {0} items written to target"
                                .format(
                                    self.counters['nb_items_stored'].value))

                p_queue.task_done()

            except KeyboardInterrupt:
                logger.info(
                    "Mongoio.dequeue_and_store : User interruption of the process"
                )
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error(
                    "An error occurred while storing elements to Mongo : {0}".
                    format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error(
                        "Too many errors while storing. Process interrupted after {0} errors"
                        .format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()
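
Two queued document shapes, matching the '_mongo_find' / '_mongo_update' convention handled above (field names are illustrative):

    # Plain document: matched on its _id and upserted as-is.
    doc_plain = {'_id': 42, 'name': 'alpha'}

    # Update-style document: the filter and the update are given explicitly;
    # because '_mongo_update' is present, the update is applied with multi=True.
    doc_update = {
        '_mongo_find': {'category': 'books'},
        '_mongo_update': {'$set': {'in_stock': False}},
    }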
Example #19
    def dequeue_and_store(self,
                          p_queue,
                          p_table,
                          p_id_field="id",
                          p_commit_on_each_document=False):
        """
            Gets docs from p_queue and stores them in a postgresql database
            Stops dealing with the queue when receiving a "None" item
            p_queue:             queue which items are picked from. Elements have to be dicts
            p_table:             table to operate
            p_id_field:          name of the table id field (default "id")
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        try:
            connection_string = "host='{dbhost}' dbname='{dbname}' user='{dbuser}' password='{dbpass}'".format(
                dbhost=self.host,
                dbname=self.base,
                dbuser=self.user,
                dbpass=self.password)
            connection = psycopg2.connect(connection_string)
            cursor = connection.cursor(cursor_factory=RealDictCursor)
        except psycopg2.Error as e:
            logger.error('Failed to connect to {db}'.format(db=self.base))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        poison_pill = False

        while not (poison_pill):
            try:

                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    connection.commit()
                    break

                # Manage SQL parameters
                sql_fields = "({0})".format(",".join(source_doc.keys()))
                # sql_values = "{0}".format(",".join(repr(e).strip().replace("'", "''") for e in source_doc.values()))
                # sql_update_fields_values_excluded = ",".join(["{field}=EXCLUDED.{field}".format(field=field) for field in source_doc.keys()])
                sql_update_fields_values = ",".join([
                    "{field}=%s".format(field=field)
                    for field in source_doc.keys() if field != p_id_field
                ])

                try:
                    cursor = connection.cursor(cursor_factory=RealDictCursor)

                    # Only for V. psql > 9.5
                    # sql_p95 = """INSERT INTO {table} {fields}
                    #          VALUES ({values})
                    #          ON CONFLICT ({id_field}) DO UPDATE SET {update_fields_values};""".format(
                    #             table=p_table,
                    #             fields=sql_fields,
                    #             values=sql_values,
                    #             id_field=p_id_field,
                    #             update_fields_values=sql_update_fields_values_excluded)

                    insert_sql = "INSERT INTO {table} {fields} SELECT {values}".format(
                        table=p_table,
                        fields=sql_fields,
                        values=('%s,' * len(source_doc.values()))[:-1])
                    update_sql = "UPDATE {table} SET {update_fields_values} WHERE {id_field} = {id_value}".format(
                        table=p_table,
                        update_fields_values=sql_update_fields_values,
                        id_field=p_id_field,
                        id_value=source_doc[p_id_field])

                    sql = """
                        WITH upsert AS ({update_sql} RETURNING *) {insert_sql} WHERE NOT EXISTS (SELECT * FROM upsert);
                    """.format(update_sql=update_sql, insert_sql=insert_sql)

                    parameters = [
                        source_doc[key]
                        for key in source_doc.keys() if key != p_id_field
                    ] + [source_doc[key] for key in source_doc.keys()]
                    cursor.execute(sql, parameters)
                    if p_commit_on_each_document:
                        connection.commit()
                except psycopg2.Error as e:
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += 1
                    logger.error(
                        "Document not inserted in PostgreSQL Database %s",
                        source_doc)
                    logger.error(e)
                else:
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += 1
                        if self.counters[
                                'nb_items_stored'].value % self.counters[
                                    'log_every'] == 0:
                            logger.info(
                                "Storage in progress : {0} items written to target"
                                .format(
                                    self.counters['nb_items_stored'].value))

                p_queue.task_done()

            except KeyboardInterrupt:
                logger.info(
                    "Postgresqlio.dequeue_and_store : User interruption of the process"
                )
                sys.exit(EXIT_USER_INTERRUPT)
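
To make the upsert construction above concrete, here is roughly what gets built for a hypothetical two-field document, with p_table='items' and p_id_field='id':

    source_doc = {'id': 1, 'name': 'alpha'}
    # update_sql : UPDATE items SET name=%s WHERE id = 1
    # insert_sql : INSERT INTO items (id,name) SELECT %s,%s
    # combined   : WITH upsert AS (UPDATE items SET name=%s WHERE id = 1 RETURNING *)
    #              INSERT INTO items (id,name) SELECT %s,%s WHERE NOT EXISTS (SELECT * FROM upsert);
    # parameters : ['alpha', 1, 'alpha']   (update values first, then all insert values)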
Example #20
    def dequeue_and_store(self, p_queue, p_collection, p_upsert=True):
        """Gets docs from p_queue and stores them in a mongo collection
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue which items are picked from. Elements have to be dicts.
            p_collection:        mongo collection where to store the docs;
            p_upsert:            if true, new documents are created, if false they are ignored
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        poison_pill = False

        while not(poison_pill):
            try:

                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                # management of 'update/set' style request
                try:
                    find = source_doc['_mongo_find']
                except KeyError:
                    find = {'_id': source_doc['_id']}

                try:
                    update = source_doc['_mongo_update']
                except KeyError:
                    update = source_doc

                # insert into collection
                try:
                    mongo_connect[p_collection].update(find, update, upsert=p_upsert, multi=True if '_mongo_update' in source_doc else False)
                except Exception as e:
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += 1
                    logger.error("Document not inserted in Mongo Collection %s", source_doc['_id'])
                    logger.error(e)
                else:
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += 1
                        if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                            logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

                p_queue.task_done()

            except KeyboardInterrupt:
                logger.info("Mongoio.dequeue_and_store : User interruption of the process")
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error("An error occurred while storing elements to Mongo : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()
Example #21
    def dequeue_and_store(self, p_queue, p_index, p_timeout=10, p_nbmax_retry=3, p_disable_indexing=False):
        """Gets docs from p_queue and stores them in an elasticsearch index
             Stops dealing with the queue when receiving a "None" item

            p_queue:            queue which items are picked from. Elements have to be dicts.
            p_index:            elasticsearch index where to store the docs
            p_timeout:          timeout for bulk (default is 10s)
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        es = None
        try:
            param = [{'host': self.host, 'port': self.port, 'timeout': p_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
            if self.proxy is None:
                es = Elasticsearch(param)
            else:
                es = Elasticsearch(param, connection_class=MyConnection, proxies={'http': self.proxy})
            es.ping()
            logger_mp.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger_mp.error('Connection failed to ES Server : %s', json.dumps(param))
            logger_mp.error(e)

        # We need to record the previous setting, so as to apply it again after bulk operations
        current_settings = {}
        try:
            current_settings = es.indices.get_settings(index=p_index)
            logger_mp.info('Fetched current settings for index %s', p_index)
        except Exception as e:
            logger_mp.error("Can't fetch current settings for index %s", p_index)
            logger_mp.error(e)

        if p_disable_indexing:
            try:
                self._disable_indexing_and_replicat(logger_mp, es, p_index)
            except Exception as e:
                logger_mp.error("Can't disable indexing and replicat on {}".format(p_index))
                logger_mp.error(e)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        start = time.time()
        poison_pill = False
        while not(poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    # Bulk element creation from the source_doc
                    source_doc['_index'] = p_index

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()

                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            helpers.bulk(es, bulk, raise_on_error=True)
                    except Exception as e:
                        logger_mp.error("Bulk not indexed in ES - Retry n°{0}".format(try_counter))
                        logger_mp.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters['whole_storage_time'].value += elapsed
                            self.counters['bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                                logger_mp.info("Store : {0} items".format(nb_items))
                                logger_mp.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                                logger_mp.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger_mp.error("Bulk not indexed in elasticsearch : operation aborted after %i retries", try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger_mp.info("ESio.dequeue_and_store : User interruption of the process")
                # If indexing has been disabled, enable it again
                if p_disable_indexing:
                    self._enable_indexing_and_replicat(logger_mp, es, p_index, current_settings)
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger_mp.error("An error occurred while storing elements to ES : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    poison_pill = True
                    p_queue.task_done()

        # If indexing has been disabled, enable it again
        if p_disable_indexing:
            try:
                self._enable_indexing_and_replicat(logger_mp, es, p_index, current_settings)
            except Exception as e:
                logger_mp.error("Can't enable indexing and replicat again on {} from previous settings {}".format(p_index, current_settings))
                logger_mp.error(e)
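
The items queued for this writer are typically the hits produced by the Elasticsearch scanner above: dicts whose underscore-prefixed keys ('_index', '_id', '_source', ...) are treated by helpers.bulk as action metadata and document body. An illustrative item (values invented):

    queued_doc = {
        '_id': '42',
        '_source': {'title': 'hello', 'status': 'published'},
    }
    # dequeue_and_store then sets queued_doc['_index'] = p_index before bulking.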
Example #22
    def scan_and_queue(self,
                       p_queue,
                       p_index,
                       p_query={},
                       p_doctype=None,
                       p_scroll_time='5m',
                       p_timeout='1m',
                       p_size=100,
                       p_overall_timeout=30,
                       p_nbmax_retry=3):
        """Reads docs from an es index according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_scroll_time:    Time for scroll method
            p_timeout:        Timeout - After this period, scan context is closed
            p_index:        Index where items are picked from
            p_doctype:        DocType of the items
            p_query:        ElasticSearch query for scanning the index
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level,
                                  self.formatter)

        try:
            param = [{
                'host': self.host,
                'port': self.port,
                'timeout': p_overall_timeout,
                'max_retries': p_nbmax_retry,
                'retry_on_timeout': True
            }]
            es = Elasticsearch(param)
            es.ping()
            logger_mp.info('Connected to ES Server for reading: {0}'.format(
                json.dumps(param)))
        except Exception as e:
            logger_mp.error(
                'Connection failed to ES Server for reading: {0}'.format(
                    json.dumps(param)))
            logger_mp.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            if not self.scroll_docs:
                if p_doctype is not None:
                    self.scroll_docs = helpers.scan(client=es,
                                                    query=p_query,
                                                    size=p_size,
                                                    scroll=p_scroll_time,
                                                    index=p_index,
                                                    doc_type=p_doctype,
                                                    timeout=p_timeout)
                else:
                    self.scroll_docs = helpers.scan(client=es,
                                                    query=p_query,
                                                    size=p_size,
                                                    scroll=p_scroll_time,
                                                    index=p_index,
                                                    timeout=p_timeout)

            start = time.time()
            for doc in self.scroll_docs:
                p_queue.put(doc)

                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger_mp.info("Scan : {0} items".format(nb_items))
                        logger_mp.debug("   -> Avg scan time : {0}ms".format(
                            1000 * self.counters['scan_time'].value /
                            nb_items))

                    # Start timers reinit
                    start = time.time()

        except Exception as e:
            logger_mp.error("Error while scanning ES index %s with query %s",
                            p_index, p_query)
            logger_mp.error(e)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Example #23
    def dequeue_and_store(self,
                          p_queue,
                          p_index,
                          p_timeout=10,
                          p_nbmax_retry=3,
                          p_disable_indexing=False):
        """Gets docs from p_queue and stores them in an elasticsearch index
             Stops dealing with the queue when receiving a "None" item

            p_queue:            queue wich items are picked from. Elements has to be "list".
            p_index:            elasticsearch index where to store the docs
            p_timeout:          timeout for bulk (default is 10s)
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level,
                                  self.formatter)

        try:
            param = [{
                'host': self.host,
                'port': self.port,
                'timeout': p_timeout,
                'max_retries': p_nbmax_retry,
                'retry_on_timeout': True
            }]
            es = Elasticsearch(param)
            logger_mp.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger_mp.error('Connection failed to ES Server : %s',
                            json.dumps(param))
            logger_mp.error(e)
            sys.exit(EXIT_IO_ERROR)

        if p_disable_indexing:
            self._set_indexing_refresh(logger_mp, es, p_index, "-1")

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        start = time.time()
        poison_pill = False
        while not (poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    # Bulk element creation from the source_doc
                    source_doc['_index'] = p_index

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()

                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            helpers.bulk(es, bulk, raise_on_error=True)
                    except Exception as e:
                        logger_mp.error(
                            "Bulk not indexed in ES - Retry n°{0}".format(
                                try_counter))
                        logger_mp.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters[
                                'whole_storage_time'].value += elapsed
                            self.counters[
                                'bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters[
                                    'log_every'] == 0 and nb_items != 0:
                                logger_mp.info(
                                    "Store : {0} items".format(nb_items))
                                logger_mp.debug(
                                    "   -> Avg store time : {0}ms".format(
                                        1000 * self.
                                        counters['whole_storage_time'].value /
                                        nb_items))
                                logger_mp.debug(
                                    "   -> Avg bulk time  : {0}ms".format(
                                        1000 * self.
                                        counters['bulk_storage_time'].value /
                                        nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger_mp.error(
                        "Bulk not indexed in elasticsearch : operation aborted after %i retries",
                        try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger_mp.info(
                    "ESio.dequeue_and_store : User interruption of the process"
                )
                # If indexing has been disabled, enable it again
                if p_disable_indexing:
                    self._set_indexing_refresh(logger_mp, es, p_index, "1s")

                sys.exit(EXIT_USER_INTERRUPT)

        # If indexing has been disabled, enable it again
        if p_disable_indexing:
            self._set_indexing_refresh(logger_mp, es, p_index, "1s")
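    # _set_indexing_refresh is not shown in the example above. Below is a minimal
    # sketch of what such a helper could look like (an assumption, not the original
    # code): toggling the index refresh_interval is the usual way to speed up large
    # bulk loads ("-1" disables refresh, "1s" restores the default behaviour).
    def _set_indexing_refresh(self, p_logger, p_es, p_index, p_refresh_interval):
        """Sets the refresh_interval of p_index to p_refresh_interval."""
        try:
            p_es.indices.put_settings(
                index=p_index,
                body={"index": {"refresh_interval": p_refresh_interval}})
            p_logger.info("refresh_interval of %s set to %s", p_index,
                          p_refresh_interval)
        except Exception as e:
            p_logger.error("Can't set refresh_interval on %s", p_index)
            p_logger.error(e)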
Example #24
0
def get_and_parse(p_inqueue, p_outqueue, p_process, p_counters, p_log_queue, p_log_level, p_formatter, **kwargs):
    """
        Gets doc from an input queue, applies transformation according to p_process function,
        then pushes the so produced new doc into an output queue

        p_process must take a "doc" as a first parameter

        @param p_inqueue    In queue containing docs to process
        @param p_outqueue   Out queue where processed docs are pushed
        @param p_process    function taking a doc as an input and returning a list of docs as a result
        @param p_nb_items_processed    Number of processed items
    """

    logger = get_logger_mp(__name__, p_log_queue, p_log_level, p_formatter)

    start = time.time()
    start_idle = None
    # Main loop max retry
    main_loop_max_retry = 5
    main_loop_retry = 0
    queue_get_timeout = 60

    while True:
        try:
            try:
                in_doc = p_inqueue.get(block=True, timeout=queue_get_timeout)
            except Empty:
                # Idle starts with the first exception (queue empty)
                logger.debug("No doc in queue in the last {}s".format(queue_get_timeout))
                if not start_idle:
                    start_idle = time.time()
            else:
                if start_idle:
                    elapsed_idle = time.time() - start_idle
                else:
                    elapsed_idle = 0

                # Manage poison pill
                if in_doc is None:
                    p_inqueue.task_done()
                    break

                # Apply the transformation to the doc, passing any extra keyword arguments through
                start_p_process = time.time()

                out_doc = p_process(in_doc, **kwargs)

                elapsed_p_process = time.time() - start_p_process

                for doc in out_doc:
                    p_outqueue.put(doc)

                p_inqueue.task_done()

                with p_counters['nb_items_processed'].get_lock():
                    p_counters['nb_items_processed'].value += 1
                    now = time.time()
                    elapsed = now - start

                    p_counters['whole_process_time'].value += elapsed
                    p_counters['real_process_time'].value += elapsed_p_process
                    p_counters['idle_process_time'].value += elapsed_idle

                    nb_items = p_counters['nb_items_processed'].value
                    if p_counters['nb_items_processed'].value % p_counters['log_every'] == 0:
                        logger.info("Process : {0} items".format(nb_items))
                        logger.debug("   -> Avg process time   : {0}ms".format(1000 * p_counters['whole_process_time'].value / nb_items))
                        logger.debug("   -> Avg real time      : {0}ms".format(1000 * p_counters['real_process_time'].value / nb_items))
                        logger.debug("   -> Avg idle time      : {0}ms".format(1000 * p_counters['idle_process_time'].value / nb_items))
                        logger.debug("State of queues :")
                        logger.debug("   -> Read  : {0}".format(p_inqueue.qsize()))
                        logger.debug("   -> Write : {0}".format(p_outqueue.qsize()))

                    # Start timers reinit
                    start = time.time()
                    start_idle = None

        except TimeoutError:
            logger.warning('Timeout exception while parsing with %s method', p_process)
            with p_counters['nb_items_error'].get_lock():
                p_counters['nb_items_error'].value += 1
        except KeyboardInterrupt:
            logger.info("user interruption")
            p_inqueue.task_done()
            break
        except Exception as e:
            logger.error("An error occured while processing elements : {0}".format(e))
            main_loop_retry += 1
            if main_loop_retry >= main_loop_max_retry:
                logger.error("Too many errors while processing. Process interrupted after {0} errors".format(main_loop_retry))
                p_inqueue.task_done()
                break
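# A minimal sketch of how get_and_parse is typically wired up, assuming the function
# above is importable in this scope. All names below (uppercase_titles, the queue and
# counter objects) are hypothetical and only illustrate the expected plumbing: one
# JoinableQueue feeds raw docs in, another collects transformed docs, and a None
# poison pill makes the worker leave its loop.
import logging
import multiprocessing as mp
from multiprocessing import JoinableQueue, Process, Value


def uppercase_titles(doc):
    # p_process must take the doc first and return a list of docs
    doc['title'] = doc.get('title', '').upper()
    return [doc]


if __name__ == '__main__':
    in_q, out_q = JoinableQueue(), JoinableQueue()
    counters = {
        'nb_items_processed': Value('i', 0),
        'whole_process_time': Value('d', 0),
        'real_process_time': Value('d', 0),
        'idle_process_time': Value('d', 0),
        'nb_items_error': Value('i', 0),
        'log_every': 1000,
    }
    log_queue = mp.Queue()
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    worker = Process(target=get_and_parse,
                     args=(in_q, out_q, uppercase_titles, counters,
                           log_queue, logging.INFO, formatter))
    worker.start()
    in_q.put({'title': 'hello'})
    in_q.put(None)          # poison pill: get_and_parse exits its main loop
    worker.join()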
Example #25
0
    def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in the algolia
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue wich items are picked from. Elements has to be "list".
            p_index:            algolia index where to store the docs
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        start = time.time()
        poison_pill = False
        while not (poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()
                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            index.add_objects(bulk)
                    except Exception as e:
                        logger.error(
                            "Bulk not indexed in algolia - Retry number %i",
                            try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters[
                                'whole_storage_time'].value += elapsed
                            self.counters[
                                'bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters[
                                    'log_every'] == 0 and nb_items != 0:
                                logger.info(
                                    "Store : {0} items".format(nb_items))
                                logger.debug(
                                    "   -> Avg store time : {0}ms".format(
                                        1000 * self.
                                        counters['whole_storage_time'].value /
                                        nb_items))
                                logger.debug(
                                    "   -> Avg bulk time  : {0}ms".format(
                                        1000 * self.
                                        counters['bulk_storage_time'].value /
                                        nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger.error(
                        "Bulk not indexed in algolia : operation aborted after %i retries",
                        try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger.info(
                    "dequeue_and_store : User interruption of the process"
                )
                sys.exit(1)
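# The dequeue_and_store loop above stops on a None "poison pill". A minimal
# producer-side sketch (hypothetical helper, not part of the original code): once
# the scan is finished, push one None per storage worker so that every consumer
# process leaves its while-loop cleanly, then wait for the queue to drain.
def finish_scan(p_queue, p_nb_workers):
    """Signals the end of the scan to every storage worker."""
    for _ in range(p_nb_workers):
        p_queue.put(None)   # one poison pill per dequeue_and_store process
    p_queue.join()          # returns once every item has been task_done()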
Example #26
0
    def dequeue_and_store(self, p_queue, p_index, p_timeout=10, p_nbmax_retry=3, p_disable_indexing=False):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            p_queue:            queue wich items are picked from. Elements has to be "list".
            p_index:            elasticsearch index where to store the docs
            p_timeout:          timeout for bulk (default is 10s)
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        try:
            param = [{'host': self.host, 'port': self.port, 'timeout': p_timeout, 'max_retries': p_nbmax_retry, 'retry_on_timeout': True}]
            es = Elasticsearch(param)
            logger_mp.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger_mp.error('Connection failed to ES Server : %s', json.dumps(param))
            logger_mp.error(e)
            sys.exit(EXIT_IO_ERROR)

        if p_disable_indexing:
            self._set_indexing_refresh(logger_mp, es, p_index, "-1")

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        start = time.time()
        poison_pill = False
        while not(poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    # Bulk element creation from the source_doc
                    source_doc['_index'] = p_index

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()

                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            helpers.bulk(es, bulk, raise_on_error=True)
                    except Exception as e:
                        logger_mp.error("Bulk not indexed in ES - Retry n°{0}".format(try_counter))
                        logger_mp.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters['whole_storage_time'].value += elapsed
                            self.counters['bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                                logger_mp.info("Store : {0} items".format(nb_items))
                                logger_mp.debug("   -> Avg store time : {0}ms".format(1000*self.counters['whole_storage_time'].value / nb_items))
                                logger_mp.debug("   -> Avg bulk time  : {0}ms".format(1000*self.counters['bulk_storage_time'].value / nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger_mp.error("Bulk not indexed in elasticsearch : operation aborted after %i retries", try_counter-1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger_mp.info("ESio.dequeue_and_store : User interruption of the process")
                # If indexing has been disabled, enable it again
                if p_disable_indexing:
                    self._set_indexing_refresh(logger_mp, es, p_index, "1s")

                sys.exit(EXIT_USER_INTERRUPT)

        # If indexing has been disabled, enable it again
        if p_disable_indexing:
            self._set_indexing_refresh(logger_mp, es, p_index, "1s")
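# A minimal sketch of the action format helpers.bulk consumes, assuming a local ES
# node and an index named "my_index" (both hypothetical). Metadata keys such as
# "_index" or "_id" are stripped from each action and the remaining keys become the
# document body, which is why the loop above only needs to set source_doc['_index']
# before appending the doc to the bulk.
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
actions = [
    {'_index': 'my_index', '_id': 1, 'title': 'first doc'},
    {'_index': 'my_index', '_id': 2, 'title': 'second doc'},
]
nb_ok, errors = helpers.bulk(es, actions, raise_on_error=True)
print("{0} docs indexed".format(nb_ok))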
Example #27
0
    def dequeue_and_store(self,
                          p_queue,
                          p_index,
                          p_timeout=10,
                          p_nbmax_retry=3,
                          p_disable_indexing=False):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            p_queue:            queue wich items are picked from. Elements has to be "list".
            p_index:            elasticsearch index where to store the docs
            p_timeout:          timeout for bulk (default is 10s)
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger_mp = get_logger_mp(__name__, self.log_queue, self.log_level,
                                  self.formatter)

        es = None
        try:
            param = [{
                'host': self.host,
                'port': self.port,
                'timeout': p_timeout,
                'max_retries': p_nbmax_retry,
                'retry_on_timeout': True
            }]
            if self.proxy is None:
                es = Elasticsearch(param)
            else:
                es = Elasticsearch(param,
                                   connection_class=MyConnection,
                                   proxies={'http': self.proxy})
            es.ping()
            logger_mp.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger_mp.error('Connection failed to ES Server : %s',
                            json.dumps(param))
            logger_mp.error(e)

        # We need to record the previous settings, so that we can apply them again after the bulk operations
        current_settings = {}
        try:
            current_settings = es.indices.get_settings(index=p_index)
            logger_mp.info('Current settings of index %s recorded', p_index)
        except Exception as e:
            logger_mp.error("Can't read the current settings of index %s",
                            p_index)
            logger_mp.error(e)

        if p_disable_indexing:
            try:
                self._disable_indexing_and_replicat(logger_mp, es, p_index)
            except Exception as e:
                logger_mp.error(
                    "Can't disable indexing and replication on {}".format(
                        p_index))
                logger_mp.error(e)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        start = time.time()
        poison_pill = False
        while not (poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    # Bulk element creation from the source_doc
                    source_doc['_index'] = p_index

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()

                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            helpers.bulk(es, bulk, raise_on_error=True)
                    except Exception as e:
                        logger_mp.error(
                            "Bulk not indexed in ES - Retry n°{0}".format(
                                try_counter))
                        logger_mp.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters[
                                'whole_storage_time'].value += elapsed
                            self.counters[
                                'bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters[
                                    'log_every'] == 0 and nb_items != 0:
                                logger_mp.info(
                                    "Store : {0} items".format(nb_items))
                                logger_mp.debug(
                                    "   -> Avg store time : {0}ms".format(
                                        1000 * self.
                                        counters['whole_storage_time'].value /
                                        nb_items))
                                logger_mp.debug(
                                    "   -> Avg bulk time  : {0}ms".format(
                                        1000 * self.
                                        counters['bulk_storage_time'].value /
                                        nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger_mp.error(
                        "Bulk not indexed in elasticsearch : operation aborted after %i retries",
                        try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger_mp.info(
                    "ESio.dequeue_and_store : User interruption of the process"
                )
                # If indexing has been disabled, enable it again
                if p_disable_indexing:
                    self._enable_indexing_and_replicat(logger_mp, es, p_index,
                                                       current_settings)
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger_mp.error(
                    "An error occurred while storing elements to ES : {0}".
                    format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    poison_pill = True
                    p_queue.task_done()

        # If indexing has been disabled, enable it again
        if p_disable_indexing:
            try:
                self._enable_indexing_and_replicat(logger_mp, es, p_index,
                                                   current_settings)
            except Exception as e:
                logger_mp.error(
                    "Can't enable indexing and replication again on {} from previous settings {}"
                    .format(p_index, current_settings))
                logger_mp.error(e)
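    # _disable_indexing_and_replicat and _enable_indexing_and_replicat are not shown
    # above. Below is a minimal sketch of what they could do (an assumption, not the
    # original code): drop replicas and refresh while bulk loading, then restore the
    # values recorded in current_settings once the load is over.
    def _disable_indexing_and_replicat(self, p_logger, p_es, p_index):
        p_es.indices.put_settings(
            index=p_index,
            body={"index": {"refresh_interval": "-1",
                            "number_of_replicas": 0}})
        p_logger.info("Indexing and replication disabled on %s", p_index)

    def _enable_indexing_and_replicat(self, p_logger, p_es, p_index,
                                      p_previous_settings):
        previous = p_previous_settings[p_index]['settings']['index']
        p_es.indices.put_settings(
            index=p_index,
            body={"index": {
                "refresh_interval": previous.get('refresh_interval', '1s'),
                "number_of_replicas": previous.get('number_of_replicas', 1),
            }})
        p_logger.info("Indexing and replication restored on %s", p_index)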