def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Connect to Elasticsearch and set up commit/formatting state.

    Supported kwargs:
      - clientOptions: extra keyword options passed to the Elasticsearch
        client constructor.
      - aws: dict with optional 'access_id', 'secret_key' and 'region';
        when present, requests are signed with AWS SigV4 (requires the
        aws extras to be installed).
    """
    client_options = kwargs.get('clientOptions', {})
    if 'aws' in kwargs:
        if not _HAS_AWS:
            raise ConfigurationError('aws extras must be installed to sign Elasticsearch requests')
        aws_args = kwargs['aws']
        # Use explicit credentials when both are supplied; otherwise fall
        # back to boto's default credential resolution chain.
        if 'access_id' in aws_args and 'secret_key' in aws_args:
            session = aws_session.Session(
                aws_access_key_id=aws_args['access_id'],
                aws_secret_access_key=aws_args['secret_key'])
        else:
            session = aws_session.Session()
        credentials = session.get_credentials()
        # BUGFIX: the original read aws_args['region'] directly, raising
        # KeyError when the caller's aws dict omitted 'region' (the
        # intended default lived in a dead, immediately-shadowed
        # assignment). Default to us-east-1 instead.
        region = session.region_name or aws_args.get('region', 'us-east-1')
        aws_auth = AWSV4Sign(credentials, region, 'es')
        client_options['http_auth'] = aws_auth
        client_options['use_ssl'] = True
        client_options['verify_certs'] = True
        client_options['connection_class'] = \
            es_connection.RequestsHttpConnection
    self.elastic = Elasticsearch(hosts=[url], **client_options)
    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    # An interval of None or 0 means "commit manually"; anything else
    # starts the periodic auto-commit loop.
    if self.auto_commit_interval not in [None, 0]:
        self.run_auto_commit()
    self._formatter = DefaultDocumentFormatter()
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
def test_types(self):
    """transform_value renders each supported BSON type correctly."""
    transform = DefaultDocumentFormatter().transform_value

    # Regex values are rendered as /pattern/flags.
    _, pattern, flag_chars = transform(self.regex).rsplit("/")
    self.assertIn("x", flag_chars)
    self.assertIn("m", flag_chars)
    self.assertNotIn("l", flag_chars)
    self.assertEqual(pattern, "hello")

    # Binary payloads become base64 text.
    self.assertEqual(transform(self.bin1), "AGhlbGxvAA==")
    self.assertEqual(transform(self.bin2), "AGhlbGxvAA==")

    # Datetimes pass through unchanged.
    self.assertEqual(transform(self.date), self.date)

    # UUIDs are rendered via their hex form.
    self.assertEqual(transform(self.xuuid), self.xuuid.hex)

    # Any other type falls back to str().
    self.assertEqual(transform(self.oid), str(self.oid))

    # Compound values are transformed element-wise.
    transformed_doc = transform(self.doc)
    for key, value in self.doc.items():
        self.assertEqual(transform(value), transformed_doc[key])
    for original, converted in zip(self.lst, map(transform, self.lst)):
        self.assertEqual(transform(original), converted)

    # Non-finite floats are rejected.
    self.assertRaises(ValueError, transform, float("inf"))
    self.assertRaises(ValueError, transform, float("nan"))
def test_types(self):
    """transform_value renders each supported BSON type correctly.

    Python 2 and 3 differ only in how the second binary fixture is
    treated: Python 3 base64-encodes it, Python 2 passes it through.
    """
    transform = DefaultDocumentFormatter().transform_value

    # Regex values are rendered as /pattern/flags.
    _, pattern, flag_chars = transform(self.regex).rsplit("/")
    self.assertIn('x', flag_chars)
    self.assertIn('m', flag_chars)
    self.assertNotIn('l', flag_chars)
    self.assertEqual(pattern, 'hello')

    # Binary payloads become base64 text; bin2 only on Python 3.
    self.assertEqual(transform(self.bin1), 'AGhlbGxvAA==')
    if PY3:
        self.assertEqual(transform(self.bin2), 'AGhlbGxvAA==')
    else:
        self.assertEqual(transform(self.bin2), self.bin2)

    # Datetimes pass through unchanged.
    self.assertEqual(transform(self.date), self.date)

    # UUIDs are rendered via their hex form.
    self.assertEqual(transform(self.xuuid), self.xuuid.hex)

    # Any other type falls back to str().
    self.assertEqual(transform(self.oid), str(self.oid))

    # Compound values are transformed element-wise.
    transformed_doc = transform(self.doc)
    for key, value in self.doc.items():
        self.assertEqual(transform(value), transformed_doc[key])
    for original, converted in zip(self.lst, map(transform, self.lst)):
        self.assertEqual(transform(original), converted)
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Connect to Elasticsearch with cluster sniffing on by default.

    Supported kwargs: clientOptions (extra client options), aws (SigV4
    signing config; requires the aws extras), routing (per-namespace
    routing configuration).
    """
    client_options = kwargs.get('clientOptions', {})
    # Sniff the cluster topology unless the caller explicitly overrides
    # these options.
    client_options.setdefault('sniff_on_start', True)
    client_options.setdefault('sniff_on_connection_fail', True)
    client_options.setdefault('sniffer_timeout', 60)
    if 'aws' in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                'aws extras must be installed to sign Elasticsearch '
                'requests. Install with: '
                'pip install elastic2-doc-manager[aws]')
        client_options['http_auth'] = create_aws_auth(kwargs['aws'])
        client_options['use_ssl'] = True
        client_options['verify_certs'] = True
        client_options['connection_class'] = \
            es_connection.RequestsHttpConnection
    # IDIOM FIX: isinstance instead of an exact type() comparison, so
    # list subclasses are accepted too.
    if not isinstance(url, list):
        url = [url]
    self.elastic = Elasticsearch(hosts=url, **client_options)
    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.routing = kwargs.get('routing', {})
    # An interval of None or 0 means "commit manually".
    if self.auto_commit_interval not in [None, 0]:
        self.run_auto_commit()
    self._formatter = DefaultDocumentFormatter()
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
def __init__(self, url, unique_id='_id', **kwargs):
    """Create the Kafka producer used to forward MongoDB documents.

    :param url: bootstrap server address for the Kafka cluster.
    :param unique_id: name of the document's unique-key field.
    """
    self.kafkaprod = KafkaProducer(
        client_id='mongotokafka-producer-mconnect',
        bootstrap_servers=[url])
    # BUGFIX: removed stray debug print() calls that dumped the full
    # producer config (potentially including credentials) to stdout on
    # every construction.
    self.unique_key = unique_id
    self._formatter = DefaultDocumentFormatter()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Open a connection to the Neo4j graph at ``url``.

    Stores the commit interval, key field, bulk chunk size and any
    caller-supplied clientOptions for later use.
    """
    self.graph = Graph(url)
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.auto_commit_interval = auto_commit_interval
    # clientOptions may be absent; None is preserved in that case.
    self.kwargs = kwargs.get("clientOptions")
    self._formatter = DefaultDocumentFormatter()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key="_id", chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Connect to Elasticsearch and start the background auto-commiter.

    Supported kwargs: clientOptions, aws (SigV4 signing; requires the
    aws extras), autoSendInterval, createMultiType, defaultType.
    """
    client_options = kwargs.get("clientOptions", {})
    if "aws" in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                "aws extras must be installed to sign Elasticsearch "
                "requests. Install with: "
                "pip install elastic2-doc-manager[aws]")
        client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        client_options["use_ssl"] = True
        client_options["verify_certs"] = True
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection
    # IDIOM FIX: isinstance instead of an exact type() comparison, so
    # list subclasses are accepted too.
    if not isinstance(url, list):
        url = [url]
    self.elastic = Elasticsearch(hosts=url, **client_options)
    self._formatter = DefaultDocumentFormatter()
    self.BulkBuffer = BulkBuffer(self)
    # As bulk operation can be done in another thread
    # lock is needed to prevent access to BulkBuffer
    # while commiting documents to Elasticsearch
    # It is because BulkBuffer might get outdated
    # docs from Elasticsearch if bulk is still ongoing
    self.lock = threading.Lock()
    self.auto_commit_interval = auto_commit_interval
    self.auto_send_interval = kwargs.get("autoSendInterval",
                                         DEFAULT_SEND_INTERVAL)
    # es6 deprecates support for multiple document types
    # using default_type for consistency
    # Will try and use multiple doc types only if explicity specified
    self.create_multi_type = kwargs.get("createMultiType", False)
    self.default_type = kwargs.get("defaultType", "_doc")
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type if self.create_multi_type else self.default_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
    self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                      self.auto_commit_interval)
    self.auto_commiter.start()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='uid', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Open a connection to the Neo4j graph at ``url``.

    The HTTP Basic auth token is derived from the NEO4J_AUTH
    environment variable when it is set.
    """
    self.graph = Graph(url)
    self.url = url
    self.auto_commit_interval = auto_commit_interval
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self._formatter = DefaultDocumentFormatter()
    self.kwargs = kwargs.get("clientOptions")
    # BUGFIX: base64.b64encode requires bytes on Python 3, but
    # os.getenv returns str (or None when NEO4J_AUTH is unset); the
    # original call raised TypeError in both cases. Encode explicitly
    # and leave the token as None when the variable is missing.
    auth = os.getenv('NEO4J_AUTH')
    self.authorization_token = (
        base64.b64encode(auth.encode('utf-8')) if auth is not None else None)
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key="_id", chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content",
             es_name_template=os.getenv(
                 'ELASTIC7_DOC_MANAGER_ES_INDEX_NAME_TEMPLATE',
                 DEFAULT_ES_INDEX_NAME_TEMPLATE),
             **kwargs):
    """Connect to Elasticsearch 7 and start the background auto-commiter.

    ``es_name_template`` controls how target index names are derived; an
    invalid template silently falls back to the default template.
    Supported kwargs: clientOptions, aws (SigV4 signing; requires the
    aws extras), autoSendInterval.
    """
    client_options = kwargs.get("clientOptions", {})
    if "aws" in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                "aws extras must be installed to sign Elasticsearch "
                "requests. Install with: "
                "pip install elastic7-doc-manager[aws]")
        client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        client_options["use_ssl"] = True
        client_options["verify_certs"] = True
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection
    # IDIOM FIX: isinstance instead of an exact type() comparison, so
    # list subclasses are accepted too.
    if not isinstance(url, list):
        url = [url]
    self.elastic = Elasticsearch(hosts=url, **client_options)
    self._formatter = DefaultDocumentFormatter()
    self.es_name_template = es_name_template \
        if self._is_es_name_template_valid(es_name_template) else \
        DEFAULT_ES_INDEX_NAME_TEMPLATE
    self.BulkBuffer = BulkBuffer(self)
    # As bulk operation can be done in another thread
    # lock is needed to prevent access to BulkBuffer
    # while commiting documents to Elasticsearch
    # It is because BulkBuffer might get outdated
    # docs from Elasticsearch if bulk is still ongoing
    self.lock = threading.Lock()
    self.auto_commit_interval = auto_commit_interval
    self.auto_send_interval = kwargs.get("autoSendInterval",
                                         DEFAULT_SEND_INTERVAL)
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
    self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                      self.auto_commit_interval)
    self.auto_commiter.start()
def test_default_formatter(self):
    """format_document equals a per-value transform_value mapping."""
    fmt = DefaultDocumentFormatter()

    def assert_formats(document):
        # Build the expected result field by field.
        expected = {
            key: fmt.transform_value(value)
            for key, value in document.items()
        }
        self.assertEqual(expected, fmt.format_document(document))

    # Flat
    assert_formats(self.doc)
    # Nested
    assert_formats(self.doc_nested)
    # With a list
    assert_formats(self.doc_list)
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK, **kwargs):
    """Establish a connection to Elastic."""
    self.elastic = Elasticsearch(hosts=[url])
    self.auto_commit_interval = auto_commit_interval
    # Default document type; adjust if the target mapping differs.
    self.doc_type = 'string'
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    # None or 0 disables the periodic auto-commit loop.
    if self.auto_commit_interval not in (None, 0):
        self.run_auto_commit()
    self._formatter = DefaultDocumentFormatter()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Connect to Elasticsearch and start the background auto-commiter.

    Supported kwargs: clientOptions, aws (SigV4 signing; requires the
    aws extras), autoSendInterval.
    """
    client_options = kwargs.get('clientOptions', {})
    if 'aws' in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                'aws extras must be installed to sign Elasticsearch '
                'requests. Install with: '
                'pip install elastic2-doc-manager[aws]')
        client_options['http_auth'] = create_aws_auth(kwargs['aws'])
        client_options['use_ssl'] = True
        client_options['verify_certs'] = True
        client_options['connection_class'] = \
            es_connection.RequestsHttpConnection
    # IDIOM FIX: isinstance instead of an exact type() comparison, so
    # list subclasses are accepted too.
    if not isinstance(url, list):
        url = [url]
    self.elastic = Elasticsearch(hosts=url, **client_options)
    self._formatter = DefaultDocumentFormatter()
    self.BulkBuffer = BulkBuffer(self)
    # As bulk operation can be done in another thread
    # lock is needed to prevent access to BulkBuffer
    # while commiting documents to Elasticsearch
    # It is because BulkBuffer might get outdated
    # docs from Elasticsearch if bulk is still ongoing
    self.lock = threading.Lock()
    self.auto_commit_interval = auto_commit_interval
    self.auto_send_interval = kwargs.get('autoSendInterval',
                                         DEFAULT_SEND_INTERVAL)
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
    self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                      self.auto_commit_interval)
    self.auto_commiter.start()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Connect to Elasticsearch using a long (200 s) request timeout.

    Any caller-supplied clientOptions are forwarded to the client.
    """
    options = kwargs.get('clientOptions', {})
    self.elastic = Elasticsearch(hosts=[url], timeout=200, **options)
    self.auto_commit_interval = auto_commit_interval
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    # None or 0 disables the periodic auto-commit loop.
    if self.auto_commit_interval not in (None, 0):
        self.run_auto_commit()
    self._formatter = DefaultDocumentFormatter()
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             **kwargs):
    """Connect to Elasticsearch and record meta-index settings."""
    self.elastic = Elasticsearch(hosts=[url])
    self.auto_commit_interval = auto_commit_interval
    # Default document type; adjust if the target mapping differs.
    self.doc_type = 'string'
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    # None or 0 disables the periodic auto-commit loop.
    if self.auto_commit_interval not in (None, 0):
        self.run_auto_commit()
    self._formatter = DefaultDocumentFormatter()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key='_id', chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             **kwargs):
    """Record connection settings for the target backend.

    ``url`` is expected to be a (host, port) pair; credentials here are
    masked placeholders.
    """
    self.host, self.port = url[0], url[1]
    self.username = "******"
    self.password = "******"
    self.auto_commit_interval = auto_commit_interval
    # Default document type; adjust if the target mapping differs.
    self.doc_type = 'string'
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self._formatter = DefaultDocumentFormatter()
def __init__(self, url, auto_commit_interval=DEFAULT_COMMIT_INTERVAL,
             unique_key="_id", chunk_size=DEFAULT_MAX_BULK,
             meta_index_name="mongodb_meta", meta_type="mongodb_meta",
             attachment_field="content", **kwargs):
    """Connect to Elasticsearch and start the background auto-commiter.

    The connection URL is assembled from ELASTIC_* environment
    variables (host, port, credentials, SSL flag, timeouts) rather than
    from the ``url`` argument, which is only logged. Prometheus
    Summary/Counter metrics are created for bulk ingestion and
    per-document operations.

    NOTE(review): the ``url`` parameter is effectively ignored for the
    actual connection — confirm this is intentional.
    """
    client_options = kwargs.get("clientOptions", {})
    if "aws" in kwargs:
        if not _HAS_AWS:
            raise errors.InvalidConfiguration(
                "aws extras must be installed to sign Elasticsearch "
                "requests. Install with: "
                "pip install elastic2-doc-manager[aws]")
        client_options["http_auth"] = create_aws_auth(kwargs["aws"])
        client_options["use_ssl"] = True
        # Certificate verification is disabled in both branches here and
        # again in the Elasticsearch() call below.
        client_options["verify_certs"] = False
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection
    else:
        client_options["use_ssl"] = True
        client_options["verify_certs"] = False
        client_options[
            "connection_class"] = es_connection.RequestsHttpConnection
    if type(url) is not list:
        url = [url]
    LOG.always('URL IN DOC MANAGER:')
    LOG.always(url)
    # self.elastic = Elasticsearch(hosts=url, **client_options)
    # Connection parameters come from the environment, not from `url`.
    protocol = "http" if (os.environ.get('ELASTIC_SSL_ENABLED') ==
                          "false") else "https"
    username = os.environ.get('ELASTIC_USER')
    password = os.environ.get('ELASTIC_PASSWORD')
    hostname = os.environ.get('ELASTIC_HOST')
    port = os.environ.get('ELASTIC_PORT')
    timeout = int(__get_os_environ_or_default__('ELASTIC_TIMEOUT', 30))
    max_retries = int(
        __get_os_environ_or_default__('ELASTIC_MAX_RETRY', 20))
    # bool(int(...)) so "0"/"1" strings from the environment coerce
    # correctly; the default True passes through int() as 1.
    retry_on_timeout = bool(
        int(__get_os_environ_or_default__('ELASTIC_RETRY_ON_TIMEOUT',
                                          True)))
    LOG.info(" value of ELASTIC_TIMEOUT: {}".format(timeout))
    LOG.info(" value of ELASTIC_MAX_RETRY: {}".format(max_retries))
    LOG.info(
        " value of ELASTIC_RETRY_ON_TIMEOUT: {}".format(retry_on_timeout))
    # We're not using sniffing now - we will fix it using Connection
    # with credentials. These values are read but intentionally not
    # passed to the client (see commented kwargs below).
    sniff_on_start = bool(
        int(__get_os_environ_or_default__('ELASTIC_SNIFF_ON_START', True)))
    sniff_on_connection_fail = bool(
        int(
            __get_os_environ_or_default__('ELASTIC_SNIFF_ON_CONN_FAIL',
                                          True)))
    sniffer_timeout = int(
        __get_os_environ_or_default__('ELASTIC_SNIFFER_TIMEOUT', 20))
    # Embed credentials in the URL only when both are present.
    if username and password:
        elastic_url = "{0}://{1}:{2}@{3}:{4}/".format(
            protocol, username, password, hostname, port)
    else:
        elastic_url = "{0}://{1}:{2}/".format(protocol, hostname, port)
    LOG.always('SELF-ASSEMBLED ELASTIC URL IN DOC MANAGER:')
    LOG.always(elastic_url)
    if os.environ.get('ELASTIC_SSL_ENABLED') == "false":
        use_ssl = False
    else:
        use_ssl = True
    # https://stackoverflow.com/questions/25908484/how-to-fix-read-timed-out-in-elasticsearch
    # es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
    # https://elasticsearch-py.readthedocs.io/en/master/#sniffing
    # Sniffing caused authentication issue - it appears it was using
    # username/password to retry. We'll revisit this later to check if
    # sniff can be integrated in case needed. Disabling it for now.
    # SEAR-392
    self.elastic = Elasticsearch(
        hosts=[elastic_url],
        verify_certs=False,
        use_ssl=use_ssl,
        timeout=timeout,
        max_retries=max_retries,
        retry_on_timeout=retry_on_timeout
        # sniff_on_start=sniff_on_start,
        # sniff_on_connection_fail=sniff_on_connection_fail,
        # sniffer_timeout=sniffer_timeout
    )
    # Prometheus metrics for bulk ingestion throughput/counts.
    self.summary_title = 'dm_ingestion_time'
    self.counter_title = 'dm_ingest'
    self.REQUEST_TIME = Summary(self.summary_title,
                                'Bulk operations throughput')
    self.ingest_rate = Counter(
        self.counter_title,
        'Number of documents ingested per bulk operation',
        ['collectionName'])
    # Prometheus metrics for per-document operations.
    self.doc_summary_title = 'new_doc_operation_time'
    self.doc_count_title = 'new_doc_operation'
    self.REQUEST_TIME_OP = Summary(
        self.doc_summary_title,
        'Operations on documents for Elasticsearch')
    self.doc_operation_count = Counter(self.doc_count_title,
                                       'Document operation',
                                       ['operation_type', 'index'])
    self._formatter = DefaultDocumentFormatter()
    self.BulkBuffer = BulkBuffer(self)
    # As bulk operation can be done in another thread
    # lock is needed to prevent access to BulkBuffer
    # while commiting documents to Elasticsearch
    # It is because BulkBuffer might get outdated
    # docs from Elasticsearch if bulk is still ongoing
    self.lock = threading.Lock()
    self.auto_commit_interval = auto_commit_interval
    self.auto_send_interval = kwargs.get("autoSendInterval",
                                         DEFAULT_SEND_INTERVAL)
    self.meta_index_name = meta_index_name
    self.meta_type = meta_type
    self.unique_key = unique_key
    self.chunk_size = chunk_size
    self.has_attachment_mapping = False
    self.attachment_field = attachment_field
    self.auto_commiter = AutoCommiter(self, self.auto_send_interval,
                                      self.auto_commit_interval)
    self.auto_commiter.start()