Example #1
    def scan_and_queue(self,p_queue,p_index,p_query={},p_doctype=None,p_scroll_time='5m',p_timeout='1m'):
        """Reads docs from an es index according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_scroll_time:    Time for scroll method
            p_timeout:        Timeout - After this period, scan context is closed
            p_index:        Index where items are picked from
            p_doctype:        DocType of the items
            p_query:        ElasticSearch query for scanning the index
        """
        try:
            param = [{'host':self.host,'port':self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server for reading: %s',json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server for reading: %s',json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            if p_doctype is not None:
                documents = helpers.scan(client=es, query=p_query, size=1000, scroll=p_scroll_time, index=p_index, doc_type=p_doctype, timeout=p_timeout)
            else:
                documents = helpers.scan(client=es, query=p_query, size=1000, scroll= p_scroll_time, index=p_index, timeout=p_timeout)
            for doc in documents:
                logger.debug(doc)
                p_queue.put(doc)
        except Exception as e:
            logger.info("Error while scanning ES index %s with query %s",p_index,p_query)
Example #2
    def set_mapping(self, p_index, p_mapping):
        """Create an index with a given p_mapping

            - p_index:     index to create
            - p_mapping:   mapping to apply
        """
        try:
            param = [{'host': self.host, 'port': self.port}]
            if self.proxy is None:
                es = Elasticsearch(param)
            else:
                es = Elasticsearch(param,
                                   connection_class=MyConnection,
                                   proxies={'http': self.proxy})
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s',
                         json.dumps(param))
            logger.error(e)

        try:
            es.indices.create(index=p_index, body=p_mapping)
            logger.info('Index %s created', p_index)
        except Exception as e:
            logger.error('Error creating the index %s', p_index)
            logger.error(e)
Example #3
    def remove_items(self, p_collection, p_query):
        """Execute a delete query on collection using p_query selection

            p_collection:        mongo collection to remove docs from;
            p_query:             selection query
        """

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        try:
            mongo_connect[p_collection].remove(p_query)
            logger.info('Collection items removal done')
        except PyMongoError as e:
            logger.error('Failed to remove entries from %s', p_collection)
            logger.error(e)
Example #4
    def clear_index(self, p_index):
        """Deletes and index

            - p_index:     index to delete
            - returns true if p_index has been deleted, false if not
        """
        delete_ok = True

        try:
            param = [{'host': self.host, 'port': self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s',
                         json.dumps(param))
            logger.error(e)
            delete_ok = False

        try:
            es.indices.delete(index=p_index)
            logger.info('Index %s deleted', p_index)
        except Exception as e:
            logger.error('Error deleting the index %s', p_index)
            logger.error(e)
            delete_ok = False

        return delete_ok
Example #5
    def count(self, p_index, p_query={}):
        """Gets the number of docs for a query

            p_index:    elasticsearch index where to query
            p_query:    the query to process

            return the number of docs from the index p_index and the query p_query
        """
        try:
            param = [{'host': self.host, 'port': self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s', json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            result = es.count(index=p_index, body=p_query)
            logger.info('Count the number of items from %s for the query %s', p_index, p_query)
        except Exception as e:
            logger.error('Error querying the index %s with query %s', p_index, p_query)
            logger.error(e)

        return result['count']
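A hedged usage sketch (the es_reader instance name, index and query values are assumptions); the query body follows the usual Elasticsearch count API shape:

# Hypothetical call: es_reader, 'my_index' and the term filter are assumptions.
nb_docs = es_reader.count('my_index', {'query': {'term': {'status': 'published'}}})
print('%i matching docs' % nb_docs)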
Example #6
    def count_items(self, p_collection, p_query):
        """Return item count using p_query selection

            p_collection:        mongo collection to query;
            p_query:             selection query
        """

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        try:
            return mongo_connect[p_collection].find(p_query).count()

        except PyMongoError as e:
            logger.error('Failed to count entries from %s', p_collection)
            logger.error(e)
Example #7
    def remove_items(self, p_collection, p_query):
        """Execute a delete query on collection using p_query selection

            p_collection:        mongo collection to remove docs from;
            p_query:             selection query
        """

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        try:
            mongo_connect[p_collection].remove(p_query)
            logger.info('Collection items removal done')
        except PyMongoError as e:
            logger.error('Failed to remove entries from %s', p_collection)
            logger.error(e)
Example #8
    def clear_index(self, p_index):
        """Deletes and index

            - p_index:     index to delete
            - returns true if p_index has been deleted, false if not
        """
        delete_ok = True

        try:
            param = [{'host': self.host, 'port': self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s', json.dumps(param))
            logger.error(e)
            delete_ok = False

        try:
            es.indices.delete(index=p_index)
            logger.info('Index %s deleted', p_index)
        except Exception as e:
            logger.error('Error deleting the index %s', p_index)
            logger.error(e)
            delete_ok = False

        return delete_ok
Example #9
    def scan_and_queue(self,p_queue,p_collection,p_query,p_batch_size=100):
        """Reads docs from a collection according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_collection:    Collection where items are picked from
            p_query:        MongoDB query for scanning the collection
            p_batch_size:   Number of read docs by iteration
        """

        # Scan collection according to the query
        documents = self.mongo[p_collection].find(p_query)
        nb_docs = documents.count()
        logger.info('Scanning %i items in %s',nb_docs, p_collection)

        # Each item is put into the queue
        documents.batch_size(p_batch_size)

        # time_for_x_items = 0
        # num_items_processed = 0
        # num_items_average = 1000
        start_time = time.time()
        for doc in documents:
            p_queue.put(doc)
            # logger.warn('In Queue size : %i',p_queue.qsize())
        time_for_x_items = time.time()
        # num_items_processed += 1
        # if (num_items_processed % num_items_average) == 0:
        #     logger.info("Average reading time : %fs (after %i items)", time_for_x_items/num_items_processed, num_items_processed)

        logger.info("Average reading time : %fs", (time_for_x_items - start_time)/nb_docs)
Example #10
    def count_items(self, p_collection, p_query):
        """Return item count using p_query selection

            p_collection:        mongo collection to query;
            p_query:             selection query
        """

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        try:
            return mongo_connect[p_collection].find(p_query).count()

        except PyMongoError as e:
            logger.error('Failed to count entries from %s', p_collection)
            logger.error(e)
Example #11
    def __init__(self,p_host,p_port,p_base,p_user,p_password,p_connect_timeout=60000):
        """Class creation

            p_host:     Mongo Server address
            p_port:        Mongo Server port
            p_base:        Mongo base
            p_user:        Mongo user
            p_password:    Mongo password
        """
        self.host = p_host
        self.port = p_port
        self.base = p_base
        self.user = p_user
        self.password = p_password
        self.connect_timeout = p_connect_timeout

        # uri for mongo connection
        uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % (self.user,self.password,self.host,self.port,self.base,self.connect_timeout)
        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            self.mongo = mongo_client[self.base]
            logger.info('Connection succeeded on %s',uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s',uri)
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)
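Since the constructor builds the Mongo URI and opens the connection itself, instantiation only needs the connection parameters. A sketch, assuming the class is called Mongoio (the class name and connection values are assumptions):

# Hypothetical instantiation: the Mongoio class name and values are assumptions.
mongo = Mongoio(
    p_host='localhost',
    p_port=27017,
    p_base='mydb',
    p_user='reader',
    p_password='secret',
    p_connect_timeout=60000)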
Example #12
    def dequeue_and_store(self,p_queue,p_file,p_delimiter=',',p_quotechar='"',p_quoting=csv.QUOTE_NONNUMERIC):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            p_queue:    queue from which items are picked. Elements have to be lists.
            p_file:     file to store in
        """

        # If not exists, creates the cursor
        if p_file not in self.csvfilecursor:
            self.csvfilecursor[p_file] = open(p_file, "w")
            self.out_csvfile[p_file] = csv.writer(self.csvfilecursor[p_file],delimiter=p_delimiter,quotechar=p_quotechar,quoting=p_quoting)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        poison_pill = False
        while not(poison_pill):
            try:
                source_doc = p_queue.get()

                # Manage poison pill : stop trying to get elements
                if source_doc is None:
                    logger.debug("CSVio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    self.csvfilecursor[p_file].close()
                    p_queue.task_done()
                    break

                self.out_csvfile[p_file].writerow(source_doc)

                p_queue.task_done()
            except KeyboardInterrupt:
                logger.info("CSVio.dequeue_and_store : User interruption of the process")
                self.csvfilecursor[p_file].close()
                sys.exit(EXIT_USER_INTERRUPT)
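A sketch of how one might feed this writer, assuming it runs in its own process and that each queued item is a list of CSV fields (the CSVio class name and its constructor are assumptions):

# Hypothetical usage sketch: CSVio and its constructor are assumptions.
from multiprocessing import JoinableQueue, Process

row_queue = JoinableQueue()
writer = CSVio()

p = Process(target=writer.dequeue_and_store, args=(row_queue, 'export.csv'))
p.start()

row_queue.put(['id', 'label'])      # rows are plain lists of fields
row_queue.put(['1', 'first item'])
row_queue.put(None)                 # poison pill: the writer closes the file and stops

row_queue.join()
p.join()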
Example #13
    def count(self, p_index, p_query={}):
        """Gets the number of docs for a query

            p_index:    elasticsearch index where to query
            p_query:    the query to process

            return the number of docs from the index p_index and the query p_query
        """
        try:
            param = [{'host': self.host, 'port': self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s',
                         json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            result = es.count(index=p_index, body=p_query)
            logger.info('Count the number of items from %s for the query %s',
                        p_index, p_query)
        except Exception as e:
            logger.error('Error querying the index %s with query %s', p_index,
                         p_query)
            logger.error(e)

        return result['count']
Example #14
    def dequeue_and_store(self,p_queue, p_collection):
        """Gets docs from p_queue and stores them in a mongo collection
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue from which items are picked. Elements have to be lists.
            p_collection:        mongo collection where to store the docs;            
        """
        # uri for mongo connection
        uri = 'mongodb://%s:%s@%s:%s/%s?connectTimeoutMS=%s' % (self.user,self.password,self.host,self.port,self.base,self.connect_timeout)
        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connection = mongo_client[self.base]
            logger.info('Connection succeeded on %s',uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s',uri)
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        poison_pill = False        

        while not(poison_pill):
            try:                
                
                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    logger.debug("Mongoio has received 'poison pill' and is now ending ...")
                    poison_pill = True
                    p_queue.task_done()
                    break

                #management of 'update/set' style request                 
                try:
                    find = source_doc['_mongo_find']
                except KeyError:
                    find = {'_id':source_doc['_id']}

                try:
                    update = source_doc['_mongo_update']
                except KeyError:
                    update = source_doc
            
                #insert into collection
                try:                                                                        
                    mongo_connection[p_collection].update(find,update,upsert=True)
                except Exception as e:
                    logger.error("Document not inserted in Mongo Collection %s", source_doc['_id'])
                    logger.error(e)                

                p_queue.task_done()

            except KeyboardInterrupt:
                logger.info("Mongoio.dequeue_and_store : User interruption of the process")
                sys.exit(EXIT_USER_INTERRUPT)
Example #15
    def set_settings(self, p_index, p_conf):
        """Sets the index settings
        """
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            index = client.init_index(p_index)
            index.set_settings(p_conf)
            logger.info('Index %s set', p_index)
        except Exception as e:
            logger.error('Error setting the index %s', p_index)
            logger.error(e)
Example #16
    def set_settings(self,p_index,p_conf):
        """Sets the index settings
        """
        try:
            client = algoliasearch.Client(self.app_id,self.api_key)
            index = client.init_index(p_index)
            index.set_settings(p_conf)
            logger.info('Index %s set',p_index)
        except Exception as e:
            logger.error('Error setting the index %s',p_index)
            logger.error(e)
Example #17
    def get_settings(self, p_index):
        """Gets the index settings
        """
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            index = client.init_index(p_index)
            result = index.get_settings()
            logger.info('Index %s get', p_index)
            return result
        except Exception as e:
            logger.error('Error getting settings of %s', p_index)
            logger.error(e)
Example #18
    def get_settings(self, p_index):
        """Gets the index settings
        """
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            index = client.init_index(p_index)
            result = index.get_settings()
            logger.info('Index %s get', p_index)
            return result
        except Exception as e:
            logger.error('Error getting settings of %s', p_index)
            logger.error(e)
Example #19
    def remove_items(self, p_collection, p_query):
        """Execute a delete query on collection using p_query selection              

            p_collection:        mongo collection to remove docs from;
            p_query:             selection query
        """
        try:            
            self.mongo[p_collection].remove(p_query)
            logger.info('Collection items removal done')
        except PyMongoError as e:
            logger.error('Failed to remove entries from %s',p_collection)
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)
Example #20
    def dequeue_and_store(self,p_queue,p_index,p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in the algolia
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue from which items are picked. Elements have to be lists.
            p_index:            algolia index where to store the docs
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """

        client = algoliasearch.Client(self.app_id,self.api_key)
        index = client.init_index(p_index)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        poison_pill = False
        while not(poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()
                    
                    # Manage poison pill
                    if source_doc is None:
                        logger.debug("ESio has received 'poison pill' and is now ending ...")
                        poison_pill = True
                        p_queue.task_done()
                        break

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            logger.debug("Indexing %i documents",len(bulk))
                            index.add_objects(bulk)
                    except Exception as e:
                        logger.error("Bulk not indexed in algolia - Retry number %i",try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True

                if not is_indexed:
                    logger.error("Bulk not indexed in algolia : operation aborted after %i retries",try_counter-1)

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                sys.exit(1)
Example #21
    def scan_and_queue(self, p_queue, p_collection, p_query, p_batch_size=100):
        """Reads docs from a collection according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_collection:    Collection where items are picked from
            p_query:        MongoDB query for scanning the collection
            p_batch_size:   Number of read docs by iteration
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        # Scan collection according to the query
        documents = mongo_connect[p_collection].find(p_query)
        nb_docs = documents.count()
        logger.info('Scanning %i items in %s', nb_docs, p_collection)

        # Each item is put into the queue
        documents.batch_size(p_batch_size)

        start_time = time.time()
        for doc in documents:
            p_queue.put(doc)
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                if self.counters['nb_items_scanned'].value % self.counters[
                        'log_every'] == 0:
                    logger.info(
                        "Scan in progress : {0} items read from source".format(
                            self.counters['nb_items_scanned'].value))

            # logger.warn('In Queue size : %i',p_queue.qsize())
        time_for_x_items = time.time()

        if nb_docs == 0:
            logger.info("No document to process")
        else:
            logger.info("Average reading time : %fs",
                        (time_for_x_items - start_time) / nb_docs)
Example #22
    def scan_and_queue(self,
                       p_queue,
                       p_index,
                       p_query={},
                       p_connect_timeout=1,
                       p_read_timeout=30):
        """Reads docs from an Algolia index according to a query and pushes them to the queue

            p_queue:        Queue where items are pushed to
            p_index:        Index where items are picked from
            p_query:        query for scanning the index
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            client.timeout = (p_connect_timeout, p_read_timeout)
            index = client.init_index(p_index)
        except Exception as e:
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            documents = index.browse_all(p_query)
            start = time.time()
            for doc in documents:
                p_queue.put(doc)
                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger.info("Scan : {0} items".format(nb_items))
                        logger.debug("   -> Avg scan time : {0}ms".format(
                            1000 * self.counters['scan_time'].value /
                            nb_items))

                    # Start timers reinit
                    start = time.time()
        except Exception as e:
            logger.info("Error while scanning Algolia index %s with query %s",
                        p_index, p_query)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Example #23
    def scan_and_queue(self, p_queue, p_collection, p_query, p_batch_size=100):
        """Reads docs from a collection according to a query and pushes them to the queue

            p_queue:         Queue where items are pushed to
            p_collection:    Collection where items are picked from
            p_query:        MongoDB query for scanning the collection
            p_batch_size:   Number of read docs by iteration
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        # Scan collection according to the query
        documents = mongo_connect[p_collection].find(p_query)
        nb_docs = documents.count()
        logger.info('Scanning %i items in %s', nb_docs, p_collection)

        # Each item is put into the queue
        documents.batch_size(p_batch_size)

        start_time = time.time()
        for doc in documents:
            p_queue.put(doc)
            with self.counters['nb_items_scanned'].get_lock():
                self.counters['nb_items_scanned'].value += 1
                if self.counters['nb_items_scanned'].value % self.counters['log_every'] == 0:
                    logger.info("Scan in progress : {0} items read from source".format(self.counters['nb_items_scanned'].value))

            # logger.warn('In Queue size : %i',p_queue.qsize())
        time_for_x_items = time.time()

        if nb_docs == 0:
            logger.info("No document to process")
        else:
            logger.info("Average reading time : %fs", (time_for_x_items - start_time)/nb_docs)
Example #24
    def delete_document(self,p_index,p_id):
        """Deletes a doc from an index
            - p_index:      index where to delete the doc
            - p_id:         id of the doc to delete
        """
        delete_ok = True

        try:
            client = algoliasearch.Client(self.app_id,self.api_key)
            index = client.init_index(p_index)
            index.delete_object(p_id)
            logger.info('%s deleted from %s',p_id, p_index)
        except Exception as e:
            logger.error('Error deleting the %s from index %s',p_id, p_index)
            logger.error(e)
            delete_ok = False

        return delete_ok
Example #25
    def delete_document(self, p_index, p_id):
        """Deletes a doc from an index
            - p_index:      index where to delete the doc
            - p_id:         id of the doc to delete
        """
        delete_ok = True

        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            index = client.init_index(p_index)
            index.delete_object(p_id)
            logger.info('%s deleted from %s', p_id, p_index)
        except Exception as e:
            logger.error('Error deleting the %s from index %s', p_id, p_index)
            logger.error(e)
            delete_ok = False

        return delete_ok
Beispiel #26
0
    def clear_index(self,p_index):
        """Deletes and index

            - p_index:     index to delete
            - returns true if p_index has been deleted, false if not
        """
        delete_ok = True

        try:
            client = algoliasearch.Client(self.app_id,self.api_key)
            index = client.init_index(p_index)
            index.clear_index()
            logger.info('Index %s deleted',p_index)
        except Exception as e:
            logger.error('Error deleting the index %s',p_index)
            logger.error(e)
            delete_ok = False

        return delete_ok
Example #27
    def clear_index(self, p_index):
        """Deletes and index

            - p_index:     index to delete
            - returns true if p_index has been deleted, false if not
        """
        delete_ok = True

        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            index = client.init_index(p_index)
            index.clear_index()
            logger.info('Index %s deleted', p_index)
        except Exception as e:
            logger.error('Error deleting the index %s', p_index)
            logger.error(e)
            delete_ok = False

        return delete_ok
Example #28
    def set_mapping(self, p_index, p_mapping):
        """Create an index with a given p_mapping

            - p_index:     index to create
            - p_mapping:   mapping to apply
        """
        try:
            param = [{'host': self.host, 'port': self.port}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s', json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s', json.dumps(param))
            logger.error(e)

        try:
            es.indices.create(index=p_index, body=p_mapping)
            logger.info('Index %s created', p_index)
        except Exception as e:
            logger.error('Error creating the index %s', p_index)
            logger.error(e)
Example #29
    def scan_and_queue(self,p_queue,p_file, p_xpath):        
        """Reads xml files in a directory and pushes them to the queue
            
            p_queue:         Queue where items are pushed to
            p_file:            XML File to scan            
            p_xpath:        XPATH used to split document into multiple docs
        """
        logger.info('Scanning xml in %s', p_file)
        start_time = datetime.datetime.now()

        tree = ET.parse(p_file)
        root = tree.getroot()

        # Each item is put into the queue
        compteur = 0
        
        if p_xpath:
             nodeList=root.findall(p_xpath)
        else:
             nodeList=[root]

        for foundElem in nodeList:            
            compteur = compteur + 1            
            #logger.debug("queue size=",p_queue.qsize())
            #logger.debug(ET.tostring(foundElem, encoding="us-ascii", method="xml"))
            try:                
                p_queue.put(ET.tostring(foundElem, encoding="us-ascii", method="xml"))
            except Exception as e:                
                logger.error(e)            
        
        # start_time = datetime.datetime.now()
        # for doc in documents.skip(p_skip).limit(p_skip+p_limit):
        #for doc in documents:
        #    compteur = compteur + 1
            # if compteur % 500 == 0:
            #     elsapsed_time = datetime.datetime.now() - start_time
            #     start_time = datetime.datetime.now()
            #     logger.info('Pushing item number %i in the queue in %s',compteur,elsapsed_time)
            # logger.info('Pushing item number %i in the queue',compteur)
        #    p_queue.put(doc)
Example #30
    def scan_and_queue(self, p_queue, p_index, p_query={}, p_connect_timeout=1, p_read_timeout=30):
        """Reads docs from an Algolia index according to a query and pushes them to the queue

            p_queue:        Queue where items are pushed to
            p_index:        Index where items are picked from
            p_query:        query for scanning the index
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)
        try:
            client = algoliasearch.Client(self.app_id, self.api_key)
            client.timeout = (p_connect_timeout, p_read_timeout)
            index = client.init_index(p_index)
        except Exception as e:
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        try:
            documents = index.browse_all(p_query)
            start = time.time()
            for doc in documents:
                p_queue.put(doc)
                elapsed = time.time() - start

                with self.counters['nb_items_scanned'].get_lock():
                    self.counters['nb_items_scanned'].value += 1
                    nb_items = self.counters['nb_items_scanned'].value
                    self.counters['scan_time'].value += elapsed

                    if nb_items % self.counters['log_every'] == 0:
                        logger.info("Scan : {0} items".format(nb_items))
                        logger.debug("   -> Avg scan time : {0}ms".format(1000*self.counters['scan_time'].value / nb_items))

                    # Start timers reinit
                    start = time.time()
        except Exception as e:
            logger.info("Error while scanning Algolia index %s with query %s", p_index, p_query)
            with self.counters['nb_items_error'].get_lock():
                self.counters['nb_items_error'].value += 1
Example #31
    def scan_and_queue(self,p_queue,p_file,p_delimiter=',',p_skip_header=True):        
        """Reads csv file and pushes each line to the queue
            
            p_queue:    Queue where items are pushed to
            p_file:     CSV File to scan
            p_skip_header: skip the first line (header) when True
        """
        logger.info('Scanning csv in %s', p_file)

        # cr = csv.reader(open(p_file,"r",delimiter=p_delimiter))
        # for row in cr:
        #     print (row)

        filecursor = open(p_file, 'r')
        reader = csv.reader(filecursor,delimiter=p_delimiter)

        # Skip first line ?
        skipline = p_skip_header
        for row in reader:
            if not skipline:
                p_queue.put(row)
            else:
                skipline = False
Example #32
def get_and_parse(p_inqueue,p_outqueue,p_process,**kwargs):
    """
        Gets doc from an input queue, applies transformation according to p_process function,
        then pushes the so produced new doc into an output queue

        p_process must take a "doc" as a first parameter
    """

    current = current_process()

    while True:
        try:
            logger.debug("(%s) Size of queues. in : %i / ou : %i",current.name,p_inqueue.qsize(),p_outqueue.qsize())
            
            try:
                in_doc = p_inqueue.get(False)
            except Exception:
                logger.info("Nothing to get in the Queue")
            else:
                # Manage poison pill
                if in_doc is None:
                    logger.info("(%s) => Parser has received 'poison pill' and is now ending ...",current.name)
                    p_inqueue.task_done()
                    break

                # Call the p_process function with the keyword arguments (** unpacks the dict in the call)

                out_doc = p_process(in_doc,**kwargs)

                for doc in out_doc:
                    p_outqueue.put(doc)

                p_inqueue.task_done()

        except TimeoutError:
            logger.warn('Timeout exception while parsing with %s method',p_process)
        except KeyboardInterrupt:
            logger.info("user interruption")
            sys.exit(0)
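The p_process callable receives one input doc plus the extra keyword arguments and must return an iterable of output docs. A minimal example of such a function (the field name is an assumption):

# Hypothetical p_process implementation: the 'title' field is an assumption.
def uppercase_title(doc, **kwargs):
    """Transforms one doc and returns it wrapped in a list."""
    doc['title'] = doc.get('title', '').upper()
    return [doc]

# get_and_parse(in_queue, out_queue, uppercase_title) would then push each
# transformed doc to out_queue until it reads the None poison pill.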
Example #33
    def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in the algolia
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue from which items are picked. Elements have to be lists.
            p_index:            algolia index where to store the docs
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        start = time.time()
        poison_pill = False
        while not(poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()
                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            index.add_objects(bulk)
                    except Exception as e:
                        logger.error("Bulk not indexed in algolia - Retry number %i", try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters['whole_storage_time'].value += elapsed
                            self.counters['bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters['log_every'] == 0 and nb_items != 0:
                                logger.info("Store : {0} items".format(nb_items))
                                logger.debug("   -> Avg store time : {0}ms".format(1000 * self.counters['whole_storage_time'].value / nb_items))
                                logger.debug("   -> Avg bulk time  : {0}ms".format(1000 * self.counters['bulk_storage_time'].value / nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger.error("Bulk not indexed in algolia : operation aborted after %i retries", try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error("An error occured while storing elements to Algolia : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()
Example #34
    def dequeue_and_store(self,p_queue,p_index,p_timeout=10,p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in the csv file
             Stops dealing with the queue when receiving a "None" item

            p_queue:            queue from which items are picked. Elements have to be lists.
            p_index:            elasticsearch index where to store the docs
            p_timeout:          timeout for bulk (default is 10s)
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        try:
            param = [{'host':self.host,'port':self.port,'timeout':p_timeout,'max_retries':p_nbmax_retry,'retry_on_timeout':True}]
            es = Elasticsearch(param)
            logger.info('Connected to ES Server: %s',json.dumps(param))
        except Exception as e:
            logger.error('Connection failed to ES Server : %s',json.dumps(param))
            logger.error(e)
            sys.exit(EXIT_IO_ERROR)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        poison_pill = False
        while not(poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()
                    
                    # Manage poison pill
                    if source_doc is None:
                        logger.debug("ESio has received 'poison pill' and is now ending ...")
                        poison_pill = True
                        p_queue.task_done()
                        break

                    # Bulk element creation from the source_doc
                    source_doc['_index'] = p_index

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            logger.debug("Indexing %i documents",len(bulk))
                            helpers.bulk(es, bulk, raise_on_error=True)
                            # es.index(index=self.index,doc_type=p_doctype,body=source_doc)
                    except Exception as e:
                        logger.error("Bulk not indexed in ES - Retry n°%i",try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True

                if not is_indexed:
                    logger.error("Bulk not indexed in elasticsearch : operation aborted after %i retries",try_counter-1)                  

            except KeyboardInterrupt:
                logger.info("ESio.dequeue_and_store : User interruption of the process")
                sys.exit(EXIT_USER_INTERRUPT)
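The scan_and_queue / dequeue_and_store pairs are designed to run on the two sides of a shared queue. A hedged end-to-end sketch, assuming Mongoio as the reader class and ESio as the writer class (class names, constructor signatures and connection values are all assumptions):

# Hypothetical pipeline sketch: Mongoio, ESio, their constructors and all
# connection values are assumptions.
from multiprocessing import JoinableQueue, Process

queue = JoinableQueue()
source = Mongoio('localhost', 27017, 'mydb', 'reader', 'secret')
target = ESio('localhost', 9200)

# Writer runs in its own process and bulk-indexes into 'target_index'.
writer = Process(target=target.dequeue_and_store, args=(queue, 'target_index'))
writer.start()

# Reader fills the queue, then sends the poison pill.
source.scan_and_queue(queue, 'source_collection', {})
queue.put(None)

queue.join()
writer.join()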
Example #35
    def dequeue_and_store(self, p_queue, p_collection, p_upsert=True):
        """Gets docs from p_queue and stores them in a mongo collection
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue from which items are picked. Elements have to be lists.
            p_collection:        mongo collection where to store the docs;
            p_upsert:            if true, new documents are created, if false they are ignored
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level, self.formatter)

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        poison_pill = False

        while not(poison_pill):
            try:

                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                # management of 'update/set' style request
                try:
                    find = source_doc['_mongo_find']
                except KeyError:
                    find = {'_id': source_doc['_id']}

                try:
                    update = source_doc['_mongo_update']
                except KeyError:
                    update = source_doc

                # insert into collection
                try:
                    mongo_connect[p_collection].update(find, update, upsert=p_upsert, multi=True if '_mongo_update' in source_doc else False)
                except Exception as e:
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += 1
                    logger.error("Document not inserted in Mongo Collection %s", source_doc['_id'])
                    logger.error(e)
                else:
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += 1
                        if self.counters['nb_items_stored'].value % self.counters['log_every'] == 0:
                            logger.info("Storage in progress : {0} items written to target".format(self.counters['nb_items_stored'].value))

                p_queue.task_done()

            except KeyboardInterrupt:
                logger.info("Mongoio.dequeue_and_store : User interruption of the process")
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error("An error occured while storing elements to Mongo : {0}".format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error("Too many errors while storing. Process interrupted after {0} errors".format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()
Example #36
    def dequeue_and_store(self, p_queue, p_index, p_nbmax_retry=3):
        """Gets docs from p_queue and stores them in the algolia
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue from which items are picked. Elements have to be lists.
            p_index:            algolia index where to store the docs
            p_nbmax_retry:      number of tries when failing on a request (default is 3)
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        client = algoliasearch.Client(self.app_id, self.api_key)
        index = client.init_index(p_index)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        start = time.time()
        poison_pill = False
        while not (poison_pill):
            try:
                bulk = []
                while (len(bulk) < self.bulk_size):
                    source_doc = p_queue.get()

                    # Manage poison pill
                    if source_doc is None:
                        poison_pill = True
                        p_queue.task_done()
                        break

                    bulk.append(source_doc)
                    p_queue.task_done()

                try_counter = 1
                is_indexed = False
                while try_counter <= p_nbmax_retry and not is_indexed:
                    start_bulking = time.time()
                    try:
                        # Bulk indexation
                        if len(bulk) > 0:
                            index.add_objects(bulk)
                    except Exception as e:
                        logger.error(
                            "Bulk not indexed in algolia - Retry number %i",
                            try_counter)
                        logger.error(e)
                        try_counter += 1
                    else:
                        is_indexed = True
                        now = time.time()
                        elapsed_bulking = now - start_bulking
                        elapsed = now - start
                        with self.counters['nb_items_stored'].get_lock():
                            self.counters['nb_items_stored'].value += len(bulk)
                            self.counters[
                                'whole_storage_time'].value += elapsed
                            self.counters[
                                'bulk_storage_time'].value += elapsed_bulking
                            nb_items = self.counters['nb_items_stored'].value
                            if nb_items % self.counters[
                                    'log_every'] == 0 and nb_items != 0:
                                logger.info(
                                    "Store : {0} items".format(nb_items))
                                logger.debug(
                                    "   -> Avg store time : {0}ms".format(
                                        1000 * self.
                                        counters['whole_storage_time'].value /
                                        nb_items))
                                logger.debug(
                                    "   -> Avg bulk time  : {0}ms".format(
                                        1000 * self.
                                        counters['bulk_storage_time'].value /
                                        nb_items))

                            start = time.time()

                if not is_indexed:
                    start = time.time()
                    logger.error(
                        "Bulk not indexed in algolia : operation aborted after %i retries",
                        try_counter - 1)
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += len(bulk)

            except KeyboardInterrupt:
                logger.info(
                    "ESio.dequeue_and_store : User interruption of the process"
                )
                sys.exit(1)
Example #37
    def dequeue_and_store(self, p_queue, p_collection, p_upsert=True):
        """Gets docs from p_queue and stores them in a mongo collection
             Stops dealing with the queue when receiving a "None" item

            p_queue:             queue from which items are picked. Elements have to be lists.
            p_collection:        mongo collection where to store the docs;
            p_upsert:            if true, new documents are created, if false they are ignored
        """
        logger = get_logger_mp(__name__, self.log_queue, self.log_level,
                               self.formatter)

        # uri for mongo connection
        uri = self._get_mongo_uri()

        # Connect to mongo
        try:
            mongo_client = MongoClient(uri)
            mongo_connect = mongo_client[self.base]
            logger.info('Connection succeeded on %s', uri)
        except PyMongoError as e:
            logger.error('Failed to connect to %s', uri)
            logger.error(e)

        # Loop until receiving the "poison pill" item (meaning: no more elements to read)
        # Main loop max retry
        main_loop_max_retry = 5
        main_loop_retry = 0
        poison_pill = False

        while not (poison_pill):
            try:

                source_doc = p_queue.get()

                # Manage poison pill
                if source_doc is None:
                    poison_pill = True
                    p_queue.task_done()
                    break

                # management of 'update/set' style request
                try:
                    find = source_doc['_mongo_find']
                except KeyError:
                    find = {'_id': source_doc['_id']}

                try:
                    update = source_doc['_mongo_update']
                except KeyError:
                    update = source_doc

                # insert into collection
                try:
                    mongo_connect[p_collection].update(
                        find,
                        update,
                        upsert=p_upsert,
                        multi=True if '_mongo_update' in source_doc else False)
                except Exception as e:
                    with self.counters['nb_items_error'].get_lock():
                        self.counters['nb_items_error'].value += 1
                    logger.error(
                        "Document not inserted in Mongo Collection %s",
                        source_doc['_id'])
                    logger.error(e)
                else:
                    with self.counters['nb_items_stored'].get_lock():
                        self.counters['nb_items_stored'].value += 1
                        if self.counters[
                                'nb_items_stored'].value % self.counters[
                                    'log_every'] == 0:
                            logger.info(
                                "Storage in progress : {0} items written to target"
                                .format(
                                    self.counters['nb_items_stored'].value))

                p_queue.task_done()

            except KeyboardInterrupt:
                logger.info(
                    "Mongoio.dequeue_and_store : User interruption of the process"
                )
                poison_pill = True
                p_queue.task_done()
            except Exception as e:
                logger.error(
                    "An error occured while storing elements to Mongo : {0}".
                    format(e))
                main_loop_retry += 1
                if main_loop_retry >= main_loop_max_retry:
                    logger.error(
                        "Too many errors while storing. Process interrupted after {0} errors"
                        .format(main_loop_retry))
                    poison_pill = True
                    p_queue.task_done()