Example #1
    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        client_options = kwargs.get('clientOptions', {})
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise ConfigurationError(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests')
            aws_args = kwargs['aws']
            # Prefer explicit credentials; otherwise fall back to the
            # default boto3 credential chain.
            if 'access_id' in aws_args and 'secret_key' in aws_args:
                aws = aws_session.Session(
                    aws_access_key_id=aws_args['access_id'],
                    aws_secret_access_key=aws_args['secret_key'])
            else:
                aws = aws_session.Session()
            credentials = aws.get_credentials()
            region = aws.region_name or aws_args.get('region', 'us-east-1')
            aws_auth = AWSV4Sign(credentials, region, 'es')
            client_options['http_auth'] = aws_auth
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        self.elastic = Elasticsearch(
            hosts=[url], **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
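
A minimal usage sketch for the constructor above. The host URL and credentials are placeholders, and the keyword names simply mirror the kwargs the code reads; this is an assumption about how the class is wired up, not taken from its docs:

    # Hypothetical invocation: 'aws' turns on request signing,
    # 'clientOptions' is passed through to the Elasticsearch client.
    docman = DocManager(
        'https://search-example.us-east-1.es.amazonaws.com',
        aws={'access_id': 'AKIA...', 'secret_key': '...',
             'region': 'us-east-1'},
        clientOptions={'timeout': 60})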
Example #2
    def test_types(self):
        trans = DefaultDocumentFormatter().transform_value

        # regex
        _, patt, flags = trans(self.regex).rsplit("/", 2)
        self.assertIn("x", flags)
        self.assertIn("m", flags)
        self.assertNotIn("l", flags)
        self.assertEqual(patt, "hello")

        # binary
        self.assertEqual(trans(self.bin1), "AGhlbGxvAA==")
        self.assertEqual(trans(self.bin2), "AGhlbGxvAA==")

        # datetime
        self.assertEqual(trans(self.date), self.date)

        # UUID
        self.assertEqual(trans(self.xuuid), self.xuuid.hex)

        # Other type
        self.assertEqual(trans(self.oid), str(self.oid))

        # Compound types
        transformed = trans(self.doc)
        for k, v in self.doc.items():
            self.assertEqual(trans(v), transformed[k])
        for el1, el2 in zip(self.lst, map(trans, self.lst)):
            self.assertEqual(trans(el1), el2)

        # Infinity/NaN
        self.assertRaises(ValueError, trans, float("inf"))
        self.assertRaises(ValueError, trans, float("nan"))
Example #3
    def test_types(self):
        trans = DefaultDocumentFormatter().transform_value

        # regex
        _, patt, flags = trans(self.regex).rsplit("/", 2)
        self.assertIn('x', flags)
        self.assertIn('m', flags)
        self.assertNotIn('l', flags)
        self.assertEqual(patt, 'hello')

        # binary
        self.assertEqual(trans(self.bin1), 'AGhlbGxvAA==')
        if PY3:
            self.assertEqual(trans(self.bin2), 'AGhlbGxvAA==')
        else:
            self.assertEqual(trans(self.bin2), self.bin2)

        # datetime
        self.assertEqual(trans(self.date), self.date)

        # UUID
        self.assertEqual(trans(self.xuuid), self.xuuid.hex)

        # Other type
        self.assertEqual(trans(self.oid), str(self.oid))

        # Compound types
        transformed = trans(self.doc)
        for k, v in self.doc.items():
            self.assertEqual(trans(v), transformed[k])
        for el1, el2 in zip(self.lst, map(trans, self.lst)):
            self.assertEqual(trans(el1), el2)
Example #4
    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        client_options = kwargs.get('clientOptions', {})
        client_options.setdefault('sniff_on_start', True)
        client_options.setdefault('sniff_on_connection_fail', True)
        client_options.setdefault('sniffer_timeout', 60)
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.routing = kwargs.get('routing', {})
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
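
A usage sketch for the sniffing defaults above. Because the code uses setdefault(), any user-supplied clientOptions win; the host list here is a placeholder:

    # Hypothetical: explicitly disable sniffing at startup while
    # keeping the other defaults the constructor fills in.
    docman = DocManager(
        ['http://es1:9200', 'http://es2:9200'],
        clientOptions={'sniff_on_start': False})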
Example #5
    def __init__(self, url, unique_id='_id', **kwargs):
        self.kafkaprod = KafkaProducer(
            client_id='mongotokafka-producer-mconnect',
            bootstrap_servers=[url])
        print("__init__ ran")
        print(str(self.kafkaprod.config))
        self.unique_key = unique_id
        self._formatter = DefaultDocumentFormatter()
Example #6
    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):

        self.graph = Graph(url)
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")
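
A usage sketch for the Neo4j-backed constructor above. py2neo's Graph accepts a connection URI, but the exact endpoint here is a placeholder assumption:

    # Hypothetical: bind the doc manager to a local Neo4j instance.
    docman = DocManager('http://localhost:7474/db/data')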
Example #7
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options[
                "connection_class"] = es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations can run in another thread, so a lock is
        # needed to prevent access to the BulkBuffer while documents
        # are being committed to Elasticsearch; otherwise the
        # BulkBuffer could pick up outdated docs from Elasticsearch
        # while a bulk request is still in flight.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)

        # es6 deprecates support for multiple document types;
        # use default_type for consistency and fall back to multiple
        # doc types only if explicitly requested.
        self.create_multi_type = kwargs.get("createMultiType", False)
        self.default_type = kwargs.get("defaultType", "_doc")
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type if self.create_multi_type else self.default_type

        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()
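
A sketch of the commit path the lock above protects. This is a hypothetical illustration of the pattern, assuming BulkBuffer exposes a get_buffer() that drains pending actions; it is not the class's actual commit code:

    from elasticsearch.helpers import bulk

    # Hypothetical commit: the buffer is drained under the lock so the
    # background AutoCommiter never reads documents mid-flush.
    def commit(self):
        with self.lock:
            actions = self.BulkBuffer.get_buffer()
        if actions:
            bulk(self.elastic, actions)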
Example #8
    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='uid', chunk_size=DEFAULT_MAX_BULK, **kwargs):

        self.graph = Graph(url)
        self.url = url
        self.auto_commit_interval = auto_commit_interval
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
        self.kwargs = kwargs.get("clientOptions")
        # b64encode needs bytes on Python 3, and NEO4J_AUTH may be unset.
        self.authorization_token = base64.b64encode(
            os.getenv('NEO4J_AUTH', '').encode('utf-8'))
Example #9
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 es_name_template=os.getenv(
                     'ELASTIC7_DOC_MANAGER_ES_INDEX_NAME_TEMPLATE',
                     DEFAULT_ES_INDEX_NAME_TEMPLATE),
                 **kwargs):
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic7-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
            client_options["use_ssl"] = True
            client_options["verify_certs"] = True
            client_options[
                "connection_class"] = es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.es_name_template = (
            es_name_template
            if self._is_es_name_template_valid(es_name_template)
            else DEFAULT_ES_INDEX_NAME_TEMPLATE)
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations can run in another thread, so a lock is
        # needed to prevent access to the BulkBuffer while documents
        # are being committed to Elasticsearch; otherwise the
        # BulkBuffer could pick up outdated docs from Elasticsearch
        # while a bulk request is still in flight.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()
Example #10
    def test_default_formatter(self):
        formatter = DefaultDocumentFormatter()

        def check_format(document):
            transformed = dict(
                (k, formatter.transform_value(v)) for k, v in document.items())
            self.assertEqual(transformed, formatter.format_document(document))

        # Flat
        check_format(self.doc)

        # Nested
        check_format(self.doc_nested)

        # With a list
        check_format(self.doc_list)
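
A sketch of the recursive behavior the test above relies on. The exact output is an assumption consistent with the assertions, not quoted from the formatter:

    # Assumed behavior: format_document applies transform_value to
    # every value, recursing into nested documents and lists, and
    # passes plain types through unchanged.
    formatter = DefaultDocumentFormatter()
    doc = {'n': 1, 'sub': {'tags': ['a', 'b']}}
    assert formatter.format_document(doc) == {
        'n': 1, 'sub': {'tags': ['a', 'b']}}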
Example #11
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 **kwargs):
        """Establish a connection to Elastic."""
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()
Example #12
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        client_options = kwargs.get('clientOptions', {})
        if 'aws' in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    'aws extras must be installed to sign Elasticsearch '
                    'requests. Install with: '
                    'pip install elastic2-doc-manager[aws]')
            client_options['http_auth'] = create_aws_auth(kwargs['aws'])
            client_options['use_ssl'] = True
            client_options['verify_certs'] = True
            client_options['connection_class'] = \
                es_connection.RequestsHttpConnection
        if not isinstance(url, list):
            url = [url]
        self.elastic = Elasticsearch(hosts=url, **client_options)

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations can run in another thread, so a lock is
        # needed to prevent access to the BulkBuffer while documents
        # are being committed to Elasticsearch; otherwise the
        # BulkBuffer could pick up outdated docs from Elasticsearch
        # while a bulk request is still in flight.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get('autoSendInterval',
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()
Example #13
    def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta", meta_type="mongodb_meta",
                 attachment_field="content", **kwargs):
        self.elastic = Elasticsearch(
            hosts=[url], timeout=200, **kwargs.get('clientOptions', {}))
        self.auto_commit_interval = auto_commit_interval
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()

        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
Example #14
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 **kwargs):
        self.elastic = Elasticsearch(hosts=[url])
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        if self.auto_commit_interval not in [None, 0]:
            self.run_auto_commit()
        self._formatter = DefaultDocumentFormatter()
Example #15
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key='_id',
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 **kwargs):
        self.host = url[0]
        self.port = url[1]
        self.username = "******"
        self.password = "******"
        self.auto_commit_interval = auto_commit_interval
        self.doc_type = 'string'  # default type is string, change if needed
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self._formatter = DefaultDocumentFormatter()
Example #16
    def __init__(self,
                 url,
                 auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
                 unique_key="_id",
                 chunk_size=DEFAULT_MAX_BULK,
                 meta_index_name="mongodb_meta",
                 meta_type="mongodb_meta",
                 attachment_field="content",
                 **kwargs):
        client_options = kwargs.get("clientOptions", {})
        if "aws" in kwargs:
            if not _HAS_AWS:
                raise errors.InvalidConfiguration(
                    "aws extras must be installed to sign Elasticsearch "
                    "requests. Install with: "
                    "pip install elastic2-doc-manager[aws]")
            client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        # The transport settings are identical with or without AWS
        # request signing, so set them once.
        client_options["use_ssl"] = True
        client_options["verify_certs"] = False
        client_options["connection_class"] = \
            es_connection.RequestsHttpConnection

        if not isinstance(url, list):
            url = [url]

        LOG.always('URL IN DOC MANAGER:')
        LOG.always(url)

        # self.elastic = Elasticsearch(hosts=url, **client_options)
        protocol = "http" if (os.environ.get('ELASTIC_SSL_ENABLED')
                              == "false") else "https"
        username = os.environ.get('ELASTIC_USER')
        password = os.environ.get('ELASTIC_PASSWORD')
        hostname = os.environ.get('ELASTIC_HOST')
        port = os.environ.get('ELASTIC_PORT')

        timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
        max_retries = int(
            __get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
        retry_on_timeout = bool(
            int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT',
                                              True)))

        LOG.info(" value of ELASTIC_TIMEOUT: {}".format(timeout))
        LOG.info(" value of ELASTIC_MAX_RETRY: {}".format(max_retries))
        LOG.info(
            " value of ELASTIC_RETRY_ON_TIMEOUT: {}".format(retry_on_timeout))

        # Sniffing is disabled for now; we will revisit it once the
        # connection carries credentials (see SEAR-392 below).
        sniff_on_start = bool(
            int(__get_os_environ_or_default__('ELASTIC_SNIFF_ON_START', True)))
        sniff_on_connection_fail = bool(
            int(
                __get_os_environ_or_default__('ELASTIC_SNIFF_ON_CONN_FAIL',
                                              True)))
        sniffer_timeout = int(
            __get_os_environ_or_default__('ELASTIC_SNIFFER_TIMEOUT', 20))

        if username and password:
            elastic_url = "{0}://{1}:{2}@{3}:{4}/".format(
                protocol, username, password, hostname, port)
        else:
            elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)

        LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
        LOG.always(elastic_url)

        if os.environ.get('ELASTIC_SSL_ENABLED') == "false":
            use_ssl = False
        else:
            use_ssl = True

        # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
        # es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
        # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
        # Sniffing caused an authentication issue - retries appeared to reuse
        # the username/password. We'll revisit integrating sniffing if
        # needed; disabling it for now. SEAR-392
        self.elastic = Elasticsearch(
            hosts=[elastic_url],
            verify_certs=False,
            use_ssl=use_ssl,
            timeout=timeout,
            max_retries=max_retries,
            retry_on_timeout=retry_on_timeout
            # sniff_on_start=sniff_on_start,
            # sniff_on_connection_fail=sniff_on_connection_fail,
            # sniffer_timeout=sniffer_timeout
        )

        self.summary_title = 'dm_ingestion_time'
        self.counter_title = 'dm_ingest'
        self.REQUEST_TIME = Summary(self.summary_title,
                                    'Bulk operations throughput')
        self.ingest_rate = Counter(
            self.counter_title,
            'Number of documents ingested per bulk operation',
            ['collectionName'])

        self.doc_summary_title = 'new_doc_operation_time'
        self.doc_count_title = 'new_doc_operation'
        self.REQUEST_TIME_OP = Summary(
            self.doc_summary_title,
            'Operations on documents for Elasticsearch')
        self.doc_operation_count = Counter(self.doc_count_title,
                                           'Document operation',
                                           ['operation_type', 'index'])

        self._formatter = DefaultDocumentFormatter()
        self.BulkBuffer = BulkBuffer(self)

        # Bulk operations can run in another thread, so a lock is
        # needed to prevent access to the BulkBuffer while documents
        # are being committed to Elasticsearch; otherwise the
        # BulkBuffer could pick up outdated docs from Elasticsearch
        # while a bulk request is still in flight.
        self.lock = threading.Lock()

        self.auto_commit_interval = auto_commit_interval
        self.auto_send_interval = kwargs.get("autoSendInterval",
                                             DEFAULT_SEND_INTERVAL)
        self.meta_index_name = meta_index_name
        self.meta_type = meta_type
        self.unique_key = unique_key
        self.chunk_size = chunk_size
        self.has_attachment_mapping = False
        self.attachment_field = attachment_field
        self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                          self.auto_commit_interval)
        self.auto_commiter.start()
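
A sketch of the environment this constructor expects. The variable names match the os.environ lookups in the code above; the values are placeholders for a local, unauthenticated setup:

    import os

    # Hypothetical local setup: plain HTTP against a single node.
    os.environ.update({
        'ELASTIC_HOST': 'localhost',
        'ELASTIC_PORT': '9200',
        'ELASTIC_SSL_ENABLED': 'false',
        'ELASTIC_USER': '',
        'ELASTIC_PASSWORD': '',
        'ELASTIC_TIMEOUT': '30',
        'ELASTIC_MAX_RETRY': '20',
    })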