def __init__(self, name, **kwargs):
    """Initialise the resource and replace ``self.location`` with a live client.

    ``self.data`` and ``self.location`` are read after the parent initialiser
    runs. The location mapping is expanded into ``Elasticsearch`` keyword
    arguments and the connection is probed with ``info()``.

    :param name: forwarded to the parent initialiser.
    :param kwargs: forwarded to the parent initialiser and, when the location
        is not a plain string, passed to the location ``Mapper``.
    :raises NotFound: when the client rejects the configuration or the
        cluster cannot be reached.
    """
    super().__init__(name=name, **kwargs)
    # NOTE(review): exact type checks are kept as in the original;
    # isinstance() could change behaviour if Mapper subclasses dict/str.
    if type(self.data) is dict:
        self.data = Mapper(self.data)
    if type(self.location) is not str:
        self.location = Mapper(self.location)(kwargs)
    try:
        es = Elasticsearch(
            **self.location,
            request_timeout=0.2,
            retries=False,
            ignore=404)
        # TODO url=self.location, ssl_context, http_auth
        es.info()
        self.location = es
    except ImproperlyConfigured as e:
        raise NotFound("ElasticSearch rejected {}\n-----\n\t{}".format(
            pformat(self.location), e)) from e
    except TransportError as e:
        raise NotFound(
            "Failed to reach ElasticSearch at {}\n-----\n\t{}".format(
                pformat(self.location), e.error)) from e
    except Exception as e:
        # The location may still be a plain string here; do not assume it
        # has a mapping interface when building the message.
        if hasattr(self.location, 'get'):
            host = self.location.get('host')
        else:
            host = self.location
        raise NotFound(
            "Unable to connect to ElasticSearch at host:{}".format(
                host)) from e
def _elasticsearch_connect(timeout=300):
    """
    Open an AWS-signed connection to the configured Elasticsearch domain.

    :param timeout: Seconds before ANY request to Elasticsearch times out.
    Parallel bulk uploads can wait a long time before they start executing,
    so a value of at least 30 seconds is recommended.
    :return: An Elasticsearch connection object.
    """
    log.info('Connecting to %s %s with AWS auth', ELASTICSEARCH_URL,
             ELASTICSEARCH_PORT)
    aws_auth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_host=ELASTICSEARCH_URL,
        aws_region=AWS_REGION,
        aws_service='es',
    )
    aws_auth.encode = lambda x: bytes(x.encode('utf-8'))
    client_options = dict(
        host=ELASTICSEARCH_URL,
        port=ELASTICSEARCH_PORT,
        connection_class=RequestsHttpConnection,
        timeout=timeout,
        max_retries=10,
        retry_on_timeout=True,
        http_auth=aws_auth,
        wait_for_status='yellow',
    )
    client = Elasticsearch(**client_options)
    # Probe the cluster once so a bad connection fails here.
    client.info()
    return client
class ElasticService():
    """Small wrapper around an HTTPS Elasticsearch client.

    The constructor re-raises connection failures. The index and document
    helpers are best-effort: they log failures and return ``None``.
    """

    def __init__(self, host, port, username, password):
        """Connect to ``https://host:port`` with basic auth and probe it.

        :raises Exception: whatever the client raises while connecting;
            the error is logged before being re-raised.
        """
        super().__init__()
        # Certificate verification is disabled below, so silence the
        # urllib3 warnings it would otherwise emit.
        requests.packages.urllib3.disable_warnings()
        try:
            log.info(f"connecting to elastic on host: {host}")
            self.es = Elasticsearch(f"https://{host}:{port}",
                                    http_auth=(username, password),
                                    verify_certs=False)
            self.es.info()
        except Exception as e:
            log.exception(e)
            raise

    def create_index(self, index_name):
        """Create ``index_name``; HTTP 400 responses are ignored."""
        try:
            log.info("creating index if not exists")
            self.es.indices.create(index=index_name, ignore=400)
        except Exception as e:
            log.exception(e)

    def create_doc(self, index_name, id, body):
        """Create document ``id`` with ``body`` in ``index_name``."""
        try:
            log.info("creating doc in elastic")
            self.es.create(index=index_name, id=id, body=body)
        except Exception as e:
            log.exception(e)

    def delete_doc(self, index_name, id):
        """Delete document ``id`` from ``index_name``."""
        try:
            log.info("deleting doc from elastic")
            self.es.delete(index=index_name, id=id)
        except Exception as e:
            log.exception(e)
def elastic():
    """Configure the default connection and return a probed client.

    Host and port come from ``ELASTICSEARCH_HOST`` / ``ELASTICSEARCH_PORT``.
    Returns ``{'response': <client>, 'status': 'OK'}`` on success, or the
    result of ``error(...)`` when the cluster cannot be reached.
    """
    es_host = os.environ['ELASTICSEARCH_HOST']
    es_port = os.environ['ELASTICSEARCH_PORT']
    # TLS settings shared by the default connection and the client below.
    tls_options = {
        'use_ssl': True,
        'verify_certs': True,
        'ca_certs': '/run/secrets/ca.crt',
        'client_cert': '/run/secrets/certificate.crt',
        'client_key': '/run/secrets/certificate.key',
    }
    connections.configure(
        default=dict(hosts=es_host, port=es_port, **tls_options),
    )
    es = Elasticsearch(host=es_host, port=es_port, **tls_options)
    try:
        es.info()
    except es_exceptions.ConnectionError:
        return error('Please check Elasticserach service, %s:%s'
                     % (es_host, es_port))
    return {'response': es, 'status': 'OK'}
async def insert(data):
    """Store each ticker row via ``db.insert_price`` and index it in Elasticsearch.

    NOTE(review): the source had lost its indentation; the nesting of the
    trailing ``refresh``/``print`` statements below is reconstructed and
    should be confirmed against the original file.

    :param data: sequence of dicts; each is read for the keys
        ``exchange_name``, ``symbol``, ``timestamp``, ``datetime``, ``open``,
        ``high``, ``low``, ``close`` and ``volume``.
    """
    server = 'http://es-arques.com:9200/'
    index_name = 'crypto_price_info'
    # # elasticsearch connect
    es = Elasticsearch(server)
    es.info()
    if len(data) > 0:
        for li in data:
            _exchange_name = li['exchange_name']
            _symbol = li['symbol']
            # The period is fixed for every row written by this function.
            _period = 'ticksync'
            _timestamp = li['timestamp']
            _datetime = li['datetime']
            # Price fields go through check_none_value before being stored.
            _open = check_none_value(li['open'])
            _high = check_none_value(li['high'])
            _low = check_none_value(li['low'])
            _close = check_none_value(li['close'])
            _volume = check_none_value(li['volume'])
            params = (_exchange_name, _symbol, _period, _timestamp, _datetime, _open, _high, _low, _close, _volume)
            r = db.insert_price(_exchange_name, params)
            # The raw row (not the cleaned params) is what gets indexed.
            es.index(index=index_name, doc_type='string', body=li)
        es.indices.refresh(index=index_name)
    t = time.time()
    print(t, 'Ticker ok')
def elasticsearch_fail():
    """Send an ``info`` request to example.com:9999 with a 0.1 s timeout.

    NOTE(review): judging by the name, this call is expected to fail.
    """
    target_node = {'host': 'example.com', 'port': 9999}
    client = Elasticsearch([target_node], timeout=0.1)
    client.info()
def setup():
    """Skip the Elasticsearch 1 tests unless a 1.x client and a server exist.

    :raises unittest.SkipTest: when ``elasticsearch>=1.0.0,<2.0.0`` is not
        installed, or the configured server does not answer ``info()``.
    """
    log = logging.getLogger("haystack")
    try:
        import elasticsearch

        version_ok = (1, 0, 0) <= elasticsearch.__version__ < (2, 0, 0)
        if not version_ok:
            # Treat a wrong client version exactly like a missing package.
            raise ImportError
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        log.error(
            "Skipping ElasticSearch 1 tests: 'elasticsearch>=1.0.0,<2.0.0' not installed."
        )
        raise unittest.SkipTest("'elasticsearch>=1.0.0,<2.0.0' not installed.")

    url = settings.HAYSTACK_CONNECTIONS["elasticsearch"]["URL"]
    client = Elasticsearch(url)
    try:
        client.info()
    except ElasticsearchException as exc:
        not_running = "elasticsearch not running on %r" % url
        log.error(not_running, exc_info=True)
        raise unittest.SkipTest(not_running, exc)
def update_my_model_data():
    """Load every file of the news corpus into Django and Elasticsearch.

    Each file in the corpus directory is read as cp949 text, saved as a
    ``Content`` row and indexed into ``articles``. A file that fails is
    reported and skipped so the rest of the corpus is still processed.
    """
    source_dir = "/Users/parkjeongseop/Desktop/Dev/NLP/hw6/ITnews623_sim383/"
    es = Elasticsearch("http://localhost:9200/")
    es.info()
    index_name = 'articles'
    make_index(es, index_name)
    for i in os.listdir(source_dir):
        try:
            print("processing", i)
            # Close the handle deterministically instead of leaking it.
            with open(os.path.join(source_dir, i), 'r',
                      encoding='cp949') as handle:
                this_doc = handle.read()
            # Django Model
            this = Content()
            this.title = i
            this.content = this_doc
            this.save()
            # Elasticsearch Indexing
            doc = {'title': i, 'content': this_doc}
            es.index(index=index_name, doc_type='string', body=doc)
        except Exception as exc:
            # Best-effort import: report the failure and keep going.
            print("ERROR", i, exc)
    es.indices.refresh(index=index_name)
def _elasticsearch_connect():
    """
    Open an AWS-signed connection to the configured Elasticsearch domain.

    Host, port and region are read through ``config`` with local defaults.

    :return: An Elasticsearch connection object.
    """
    host = config("ELASTICSEARCH_URL", default="localhost")
    port = config("ELASTICSEARCH_PORT", default=9200, cast=int)
    region = config("ELASTICSEARCH_AWS_REGION", default="us-east-1")

    aws_auth = AWSRequestsAuth(
        aws_access_key=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
        aws_host=host,
        aws_region=region,
        aws_service="es",
    )
    aws_auth.encode = lambda x: bytes(x.encode("utf-8"))

    client_options = dict(
        host=host,
        port=port,
        connection_class=RequestsHttpConnection,
        timeout=10,
        max_retries=1,
        retry_on_timeout=True,
        http_auth=aws_auth,
        wait_for_status="yellow",
    )
    client = Elasticsearch(**client_options)
    # Probe the cluster once so a bad connection fails here.
    client.info()
    return client
def _elasticsearch_connect():
    """
    Open an AWS-signed connection to the Elasticsearch domain in settings.

    :return: An Elasticsearch connection object.
    """
    aws_auth = AWSRequestsAuth(
        aws_access_key=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
        aws_host=settings.ELASTICSEARCH_URL,
        aws_region=settings.ELASTICSEARCH_AWS_REGION,
        aws_service='es',
    )
    aws_auth.encode = lambda x: bytes(x.encode('utf-8'))

    client_options = dict(
        host=settings.ELASTICSEARCH_URL,
        port=settings.ELASTICSEARCH_PORT,
        connection_class=RequestsHttpConnection,
        timeout=10,
        max_retries=99,
        retry_on_timeout=True,
        http_auth=aws_auth,
        wait_for_status='yellow',
    )
    client = Elasticsearch(**client_options)
    # Probe the cluster once so a bad connection fails here.
    client.info()
    return client
def get_elasticsearch_client(cloud_id=None, elasticsearch_url=None, es_user=None, es_password=None, ctx=None,
                             **kwargs):
    """Get an authenticated elasticsearch client.

    Missing credentials are prompted for interactively. ``timeout`` may be
    passed through ``kwargs`` (default 60); remaining keyword arguments go
    to the ``Elasticsearch`` constructor. Authentication failures are
    reported through ``client_error``.
    """
    if not cloud_id and not elasticsearch_url:
        client_error("Missing required --cloud-id or --elasticsearch-url")

    # don't prompt for these until there's a cloud id or elasticsearch URL
    if not es_user:
        es_user = click.prompt("es_user")
    if not es_password:
        es_password = click.prompt("es_password", hide_input=True)

    hosts = None
    if elasticsearch_url:
        hosts = [elasticsearch_url]
    timeout = kwargs.pop('timeout', 60)

    try:
        es_client = Elasticsearch(hosts=hosts, cloud_id=cloud_id,
                                  http_auth=(es_user, es_password),
                                  timeout=timeout, **kwargs)
        # force login to test auth
        es_client.info()
        return es_client
    except elasticsearch.AuthenticationException as auth_exc:
        error_msg = f'Failed authentication for {elasticsearch_url or cloud_id}'
        client_error(error_msg, auth_exc, ctx=ctx, err=True)
def _elasticsearch_connect() -> Elasticsearch:
    """
    Build an Elasticsearch client for the configured domain.

    Requests are signed with the AWS access key ID and secret access key.

    :return: an Elasticsearch client
    """
    log.info(
        f"Connecting to {ELASTICSEARCH_URL}:{ELASTICSEARCH_PORT} with AWS auth"
    )
    aws_auth = AWSRequestsAuth(
        aws_access_key=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        aws_host=ELASTICSEARCH_URL,
        aws_region=AWS_REGION,
        aws_service="es",
    )
    aws_auth.encode = lambda x: bytes(x.encode("utf-8"))
    client_options = dict(
        host=ELASTICSEARCH_URL,
        port=ELASTICSEARCH_PORT,
        connection_class=RequestsHttpConnection,
        http_auth=aws_auth,
        timeout=TWELVE_HOURS_SEC,
    )
    client = Elasticsearch(**client_options)
    # Probe the cluster once so a bad connection fails here.
    client.info()
    return client
def check_index():
    """Aggregate yesterday's nudge results by category and send them via the bot.

    Any failure is printed and reported to the first chat only.
    """
    try:
        # ES Connect
        # NOTE(review): hosts and credentials are hard-coded here; consider
        # moving them to configuration.
        es_client = Elasticsearch(
            ['121.125.71.147', '121.125.71.148', '121.125.71.149'],
            port=9200,
            timeout=20,
            http_auth=('elastic', 'wtlcnNyrDPVko01lZfIl'))
        es_client.info()

        index_name = "index-nudge-result-analysis"
        query = """ { "size": 0, "query": { "bool": { "filter": [ { "term": { "log_day": "%s" } } ] } },"aggs": { "NAME": { "terms": { "field": "action_body.category", "size": 1000 } } } } """
        response = es_client.search(index=index_name,
                                    body=query % one_days_before)
        list_day = response['aggregations']['NAME']['buckets']

        parts = ["<넛지 성과 분석>(" + today.strftime("%Y-%m-%d") + ")\n"]
        if not list_day:
            parts.append("ALIAS ERROR")
        parts.extend(
            str(day['key']) + " : " + str(day['doc_count']) + "건\n"
            for day in list_day)
        message = "".join(parts)
        print(message)

        for chat_id in ('1228894509', '976803858', '1070666335'):
            bot.sendMessage(chat_id=chat_id, text=str(message))
    except Exception as es_err:
        print(es_err)
        err_message = "넛지 성과 분석 ERROR" + str(es_err)
        bot.sendMessage(chat_id='1228894509', text=err_message)
def connElasticsearch(args):
    '''Try to connect to ElasticSearch.

    ``args`` must provide ``host`` and ``port``. Raises ``Exception`` with a
    connection-failed message when the ``info()`` probe fails. Nothing is
    returned on success.
    '''
    es = Elasticsearch(['%s:%s' % (args['host'], args['port'])])
    try:
        # Probe the connection.
        es.info()
    except Exception:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise Exception('ElasticSearch <%s:%s> 连接失败!' % (args['host'], args['port']))
def v_es(self):
    """Return True when the configured hosts answer ``info()``, else False.

    Any exception raised while creating or probing the client counts as a
    failure; errors from ``_get_config`` itself still propagate.
    """
    configured_hosts = self._get_config("ELASTICSEARCH_HOSTS")
    try:
        Elasticsearch(hosts=configured_hosts).info()
    except Exception:
        return False
    else:
        return True
def __init__(self):
    """Connect to the default local Elasticsearch and create an ``Indexer``.

    The cluster info is printed as a connectivity check. The print uses the
    single-argument call form, which behaves the same on Python 2 and 3.
    """
    indexName = 'andernieuws'
    es = Elasticsearch()
    print(es.info())
    x = Indexer()
    # uncomment the following to create an empty index on your local Elasticsearch
    #indexer.createIndex(es, indexName, '../resources/settings.json', '../resources/mapping.json')
    # uncomment the following to index a directory with ASR files
def getSnapshotClient():
    """Create the module-level ``client`` (a ``SnapshotClient``) and probe it.

    Returns ``False`` when the probe raises ``ConnectionError``.
    NOTE(review): nothing is returned on success (implicit ``None``), which
    is also falsy; confirm how callers test the result.
    """
    global client
    es = Elasticsearch()
    client = SnapshotClient(es)
    try:
        es.info()
    except ConnectionError:
        return False
class ElasticClient:
    """Convenience wrapper around a single-node Elasticsearch client."""

    def __init__(self, host: str, port: int):
        """Connect to ``host:port`` and log the server version and name.

        The process exits (status 0, as before) when Elasticsearch raises
        an ``ElasticsearchException`` during the connection probe.
        """
        try:
            self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
            info = self.es.info()
            logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))
        except ElasticsearchException as e:
            # The message needs a placeholder for the exception; without it
            # the logging module reports a formatting error instead.
            logger.error("Elasticsearch is not available. %s", e)
            # Same exit status as the previous exit(0), without relying on
            # the interactive ``site`` helper.
            raise SystemExit(0)

    def get_articles(self, index, doctype, batch_size):
        """Return up to ``batch_size`` hits that have no ``status`` field."""
        query = '{"query": { "bool": { "must_not": { "exists": { "field": "status" }}}}}'
        result = self.es.search(index=index, doc_type=doctype, size=batch_size, body=query)
        articles = result.get('hits').get('hits')
        return articles if articles is not None else []

    def count(self, index):
        """Return the document count reported for ``index``."""
        return self.es.count(index=index)['count']

    def info(self):
        """Return the cluster ``info()`` response."""
        return self.es.info()

    def check_url(self, url: str, auth_index: str):
        """
        Check if a URL appears in the database.

        Parameters
        ----------
        url: URL for the news stories to be scraped.
        auth_index: es index, also used as the document type.

        Returns
        -------
        found: Boolean. Indicates whether or not a URL was found in the
            database.
        """
        response = self.es.search(index=auth_index, doc_type=auth_index, body={
            "query": {
                "match_phrase": {
                    "url": url
                }
            }
        }, size=0, terminate_after=1, ignore_unavailable=True)
        return response["hits"]["total"] > 0

    def persist(self, index, doctype, payload):
        """Index ``payload`` as a new document."""
        self.es.index(index=index, doc_type=doctype, body=payload)

    def update(self, index, doctype, doc_id, payload):
        """Apply ``payload`` as an update to document ``doc_id``."""
        self.es.update(index=index, doc_type=doctype, id=doc_id, body=payload)
def delete_index(cls):
    """Delete the known indices from the hosted cluster.

    Cluster info is printed before and after. Responses with status 400 or
    404 are ignored for each delete.
    """
    es_client = Elasticsearch([
        'https://search-ticker-sentiment-ohr4wryq6vcybcoqvqumx5bezm.us-east-2.es.amazonaws.com'
    ])
    print(es_client.info())
    for index_name in ('tweet', 'twitter', 'financials', 'stocks_data.py'):
        es_client.indices.delete(index=index_name, ignore=[400, 404])
    print(es_client.info())
def _init_elasticsearch_client(host: str) -> Elasticsearch:
    """Create a client for ``host`` and probe it with ``info()``.

    A connection error is logged, not raised. The value returned is
    whatever was bound before the failure: ``None`` if construction failed,
    otherwise the (unprobed) client.
    """
    es_client = None
    try:
        es_client = Elasticsearch(host)
        es_client.info()
    except exceptions.ConnectionError:
        logger.error(f"Failed to connect to elasticsearch server at '{host}'")
    return es_client
class Catalogue():
    """Holds the catalogue configuration and its Elasticsearch client."""

    def __init__(self, config):
        """Create the client from ``CATALOGUE_ES_HOST`` / ``CATALOGUE_ES_PORT``.

        The configuration and the cluster info are printed. A
        ``ConnectionError`` during the probe is printed and not raised.
        The prints use the single-argument call form, valid on Python 2
        and 3.
        """
        print(config)
        self.config = config
        self.es = Elasticsearch(host=self.config['CATALOGUE_ES_HOST'], port=self.config['CATALOGUE_ES_PORT'])
        try:
            print('Trying to connect to the B&G catalogue')
            print(self.es.info())
        except ConnectionError as e:
            print(e)
def wrap(request, *args, **kwargs):
    """Run the wrapped view only when Elasticsearch answers ``info()``.

    Any exception, including one raised by the wrapped view itself,
    results in a redirect to ``/elastic-connection-error``.
    """
    # Check the state of the Elasticsearch connection.
    try:
        es = Elasticsearch()
        es.info()
        return function(request, *args, **kwargs)
    except es_exceptions.ConnectionError:
        return HttpResponseRedirect('/elastic-connection-error')
    except Exception as generic_exp:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print(str(generic_exp))
        return HttpResponseRedirect('/elastic-connection-error')
def check_server_status(conn=None):
    """Return ``'OK'`` when ``conn.info()`` succeeds, else ``'Connection error'``.

    When ``conn`` is omitted, a client is created from
    ``getElasticsearchServerHostAndPort()``.
    """
    if conn is not None:
        client = conn
    else:
        client = Elasticsearch(hosts=getElasticsearchServerHostAndPort())

    try:
        client.info()
    except (ConnectionError, TransportError):
        return 'Connection error'
    else:
        # no errors!
        return 'OK'
def setup():
    """Skip the tests unless elasticsearch-py and a running server exist.

    :raises SkipTest: when the client library cannot be imported or the
        configured server does not answer ``info()``.
    """
    try:
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        raise SkipTest("elasticsearch-py not installed.")

    url = settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL']
    client = Elasticsearch(url)
    try:
        client.info()
    except ElasticsearchException as exc:
        raise SkipTest("elasticsearch not running on %r" % url, exc)
def get_es_client(user, password, elasticsearch_url=None, cloud_id=None, **kwargs):
    """Get an auth-validated elasticsearch client.

    :param user: username for HTTP basic auth.
    :param password: password for HTTP basic auth.
    :param elasticsearch_url: URL of the instance (or use ``cloud_id``).
    :param cloud_id: cloud deployment id (or use ``elasticsearch_url``).
    :param kwargs: forwarded to the ``Elasticsearch`` constructor.
    :raises AssertionError: when neither a URL nor a cloud id is supplied.
    """
    # Raised explicitly so the check still runs under ``python -O``; the
    # exception type matches the previous ``assert`` statement.
    if not (elasticsearch_url or cloud_id):
        raise AssertionError(
            'You must specify a host or cloud_id to authenticate to an elasticsearch instance')

    hosts = [elasticsearch_url] if elasticsearch_url else elasticsearch_url

    client = Elasticsearch(hosts=hosts, cloud_id=cloud_id, http_auth=(user, password), **kwargs)
    # force login to test auth
    client.info()
    return client
def setup():
    """Skip the tests unless elasticsearch-py and a running server exist.

    :raises unittest.SkipTest: when the client library cannot be imported
        or the configured server does not answer ``info()``.
    """
    try:
        from elasticsearch import Elasticsearch, ElasticsearchException
    except ImportError:
        raise unittest.SkipTest("elasticsearch-py not installed.")

    url = settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL']
    client = Elasticsearch(url)
    try:
        client.info()
    except ElasticsearchException as exc:
        raise unittest.SkipTest("elasticsearch not running on %r" % url, exc)
class EsSearcher:
    """Lazily initialised holder for an Elasticsearch client."""

    def __init__(self):
        # No client exists until initialize() succeeds in creating one.
        self.es = None

    def initialize(self, ip, port):
        """Create the client for ``ip``/``port`` and print the cluster info.

        Returns ``True`` on success. Any exception is printed and turned
        into ``False``. The prints use the single-argument call form, valid
        on Python 2 and 3.
        """
        try:
            self.es = Elasticsearch([ip], port=port)
            print(self.es.info())
            return True
        except Exception as err:
            print("failed to connect to es, err=%s" % err)
            return False
def get_elasticsearch_pulse():
    """Rebuild the global client from ``ES_HOST``/``ES_PORT`` and record availability.

    ``set_service_available`` receives ``True`` after a successful
    ``info()`` probe, and ``False`` (after logging) on any exception.
    """
    global logger, es
    try:
        node = {'host': os.getenv('ES_HOST'), 'port': os.getenv('ES_PORT')}
        es = Elasticsearch([node])
        es.info()
        set_service_available(True)
    except Exception as exc:
        logger.error("elasticsearch unreachable: {}".format(str(exc)))
        set_service_available(False)
def get_es_client(user, password, host=None, cloud_id=None, **kwargs):
    """Get an auth-validated elasticsearch client.

    :param user: username for HTTP basic auth.
    :param password: password for HTTP basic auth.
    :param host: host of the instance (or use ``cloud_id``).
    :param cloud_id: cloud deployment id (or use ``host``).
    :param kwargs: forwarded to the ``Elasticsearch`` constructor.
    :raises AssertionError: when neither a host nor a cloud id is supplied.
    """
    # Raised explicitly so the check still runs under ``python -O``; the
    # exception type matches the previous ``assert`` statement.
    if not (host or cloud_id):
        raise AssertionError(
            'You must specify a host or cloud-id to authenticate to elasticsearch instance')

    hosts = [host] if host else host

    client = Elasticsearch(hosts=hosts, cloud_id=cloud_id, http_auth=(user, password), **kwargs)
    # force login to test auth
    client.info()
    return client
def try_es_connect(attempts=0):
    """Keep trying to reach elasticsearch until it answers or retries run out.

    After each failure the function sleeps ``ELASTIC_WAIT`` and tries again
    while ``attempts`` is below ``ELASTIC_CONNECT_ATTEMPTS``. The last
    ``ElasticsearchException`` is raised once the budget is used up.
    """
    while True:
        try:
            Elasticsearch([ELASTIC_ENDPOINT]).info()
        except ElasticsearchException as ex:
            if attempts >= ELASTIC_CONNECT_ATTEMPTS:
                raise ex
            sleep(ELASTIC_WAIT)
            attempts += 1
        else:
            return
def elasticsearch_fixture(elasticsearch_dir):
    """Make sure an Elasticsearch node is available on localhost.

    If the probe of an already-running cluster fails, Elasticsearch 7.6.1
    is downloaded, unpacked into ``elasticsearch_dir`` and started, followed
    by a fixed 40 second wait.

    :param elasticsearch_dir: path-like directory (must support ``/``)
        that receives the extracted distribution.
    """
    # test if a ES cluster is already running. If not, download and start an ES instance locally.
    try:
        client = Elasticsearch(hosts=[{"host": "localhost"}])
        client.info()
    except Exception:
        print("Downloading and starting an Elasticsearch instance for the tests ...")
        archive_url = "https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.6.1-linux-x86_64.tar.gz"
        # Stream the archive and close both the HTTP response and the
        # tarfile once extraction is done.
        with urllib.request.urlopen(archive_url) as stream:
            with tarfile.open(fileobj=stream, mode="r|gz") as archive:
                archive.extractall(path=elasticsearch_dir)
        es_server = Popen([elasticsearch_dir / "elasticsearch-7.6.1/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT)
        # Fixed pause after launching the server process.
        time.sleep(40)
def get_es():
    """Return the shared client cached on ``settings``, creating it if needed.

    Creation is retried once per item yielded by ``service_retries()``,
    with ``backoff`` after each ``TransportError``.

    :raises RuntimeError: when every attempt fails.
    """
    url = settings.ELASTICSEARCH_URL
    timeout = settings.ELASTICSEARCH_TIMEOUT
    for attempt in service_retries():
        try:
            if hasattr(settings, "_es_instance"):
                return settings._es_instance
            client = Elasticsearch(url, timeout=timeout)
            client.info()
            # Cache only after the probe succeeded.
            settings._es_instance = client
            return settings._es_instance
        except TransportError as exc:
            log.warning("ElasticSearch error: %s", exc.error)
            backoff(failures=attempt)
    raise RuntimeError("Could not connect to ElasticSearch")
def es_auth(user, pw):
    """Return a TLS client authenticated as ``user``; exit if that fails.

    :param user: username for HTTP basic auth.
    :param pw: password for HTTP basic auth.
    :raises SystemExit: when the client cannot be created or probed (same
        exit status as the previous bare ``exit()`` call).
    """
    try:
        es = Elasticsearch(
            [' '],
            http_auth=(user, pw),
            port=443,
            use_ssl=True,
            verify_certs=True,
            ca_certs=certifi.where()
        )
        es.info()
    except Exception:
        # ``Exception`` lets Ctrl-C and SystemExit pass through unchanged.
        print("Unable to Auth to ES")
        raise SystemExit
    return es
def get_es_info(hosts):
    """Print every index, its mapping types and its documents (Python 2).

    NOTE(review): the source had lost its indentation; the loop nesting
    below is reconstructed and should be confirmed against the original.

    :param hosts: passed to ``Elasticsearch``; ``hosts[0]`` is shown in the
        banner line.
    :return: ``0`` when ``info()`` returns a falsy value, otherwise ``None``.
    """
    es = Elasticsearch(hosts)
    info = es.info()
    if not info:
        return 0
    stats = es.indices.stats()
    #print stats
    print "***********************************info of elasticsearch server : %s************************************" % hosts[0]
    for index in stats["indices"].keys():
        indices = dict(es.indices.get(index))
        #print "|Index Name\tDoc Count\tType"
        # One summary row per index: name, document count, mapping types.
        print "|",index,"\t",stats["indices"][index]["total"]["docs"]["count"],"\t",indices[index]["mappings"].keys()
        #print indices[index]["mappings"].keys()
        for type in indices[index]["mappings"].keys():
            print "----------------------------------------------%s-------------------------------------------------------" % type
            # No query body is given to this search call.
            doc = es.search(index=index,doc_type=type)
            #print doc
            for hits in doc["hits"]["hits"]:
                record = dict(hits)
                # Dump each field of the document source as "key : value".
                for key in record["_source"].keys():
                    print key,":",record["_source"][key]
                print "\n"
    print "****************************************************end*********************************************************"
    print "\n\n"
def PublishSamples(self, samples):
    """Publish samples to Elasticsearch service.

    The target index is created first when it does not exist, using the
    mapping that matches the server's major version. Each sample is then
    deep-copied, normalised and stored under its ``sample_uri``.

    Raises:
      ImportError: when the "elasticsearch" package is not installed.
    """
    try:
        from elasticsearch import Elasticsearch
    except ImportError:
        raise ImportError('The "elasticsearch" package is required to use '
                          'the Elasticsearch publisher. Please make sure it '
                          'is installed.')

    es = Elasticsearch([self.es_uri])
    if not es.indices.exists(index=self.es_index):
        # choose whether to use old or new mapings based on
        # the version of elasticsearch that is being used
        major_version = int(es.info()['version']['number'].split('.')[0])
        if major_version < 5:
            es.indices.create(index=self.es_index, body=self.mapping_before_5)
            logging.info('Create index %s and default mappings for'
                         ' elasticsearch version < 5.0.0', self.es_index)
        else:
            es.indices.create(index=self.es_index, body=self.mapping_5_plus)
            logging.info('Create index %s and default mappings for'
                         ' elasticsearch version >= 5.0.0', self.es_index)

    for original in samples:
        document = copy.deepcopy(original)
        # Make timestamp understandable by ES and human.
        document['timestamp'] = self._FormatTimestampForElasticsearch(
            document['timestamp']
        )
        # Keys cannot have dots for ES
        document = self._deDotKeys(document)
        # Add sample to the "perfkit index" of "result type" and using sample_uri
        # as each ES's document's unique _id
        es.create(index=self.es_index, doc_type=self.es_type,
                  id=document['sample_uri'], body=json.dumps(document))
def admin( request ):
    """Administrative stuff like re-indexing.

    Collects server status, shard counts and per-index size and document
    counts when the cluster answers ``ping()``, then renders
    ``webui/search/admin.html`` with that data and the confirmation forms.

    NOTE(review): the source had lost its indentation; the extent of the
    ``if ping:`` block below is reconstructed and should be confirmed.
    """
    target_index = search.DOCSTORE.target_index()
    server_info = []
    index_names = []
    indices = []
    es = Elasticsearch(hosts=settings.DOCSTORE_HOSTS)
    ping = es.ping()
    no_indices = True
    if ping:
        info = es.info()
        info_status = info['status']
        # CSS class for the status row: only 200 counts as success.
        if info_status == 200:
            info_status_class = 'success'
        else:
            info_status_class = 'error'
        server_info.append( {'label':'status', 'data':info_status, 'class':info_status_class} )
        status = es.indices.status()
        shards_success = status['_shards']['successful']
        shards_failed = status['_shards']['failed']
        # Both shard rows share one class, decided by the failed count.
        if shards_failed == 0:
            shards_success_class = 'success'
            shards_failed_class = 'success'
        else:
            shards_success_class = 'error'
            shards_failed_class = 'error'
        server_info.append( {'label':'shards(successful)', 'data':shards_success, 'class':shards_success_class} )
        server_info.append( {'label':'shards(failed)', 'data':shards_failed, 'class':shards_failed_class} )
        # indices
        for name in status['indices'].keys():
            no_indices = False
            # Header row for the index, followed by its size and doc count.
            server_info.append( {'label':name, 'data':'', 'class':''} )
            size = status['indices'][name]['total']['store']['size_in_bytes']
            # Quantize the size in megabytes to one decimal place.
            ONEPLACE = Decimal(10) ** -1
            size_nice = Decimal(size/1024/1024.0).quantize(ONEPLACE)
            size_formatted = '%sMB (%s bytes)' % (size_nice, size)
            num_docs = status['indices'][name]['total']['docs']['num_docs']
            server_info.append( {'label':'size', 'data':size_formatted, 'class':'info'} )
            server_info.append( {'label':'documents', 'data':num_docs, 'class':'info'} )
            index_names.append(name)
            index = {'name':name, 'exists':True}
            indices.append(index)
    indexform = IndexConfirmForm(request=request)
    # The drop form is only created when at least one index was found.
    dropform = None
    if indices:
        dropform = DropConfirmForm(request=request)
    return render(request, 'webui/search/admin.html', {
        'ping': ping,
        'no_indices': no_indices,
        'server_info': server_info,
        'indices': indices,
        'indexform': indexform,
        'dropform': dropform,
        'docstore_index': settings.DOCSTORE_INDEX,
        'target_index': target_index,
    })
def setup():
    """Skip the Elasticsearch 2 tests unless a 2.x client and a server exist.

    :raises unittest.SkipTest: when ``elasticsearch>=2.0.0,<3.0.0`` is not
        installed, or the configured server raises a connection error.
    """
    log = logging.getLogger('haystack')
    try:
        import elasticsearch

        version_ok = (2, 0, 0) <= elasticsearch.__version__ < (3, 0, 0)
        if not version_ok:
            # Treat a wrong client version exactly like a missing package.
            raise ImportError
        from elasticsearch import Elasticsearch, exceptions
    except ImportError:
        log.error("Skipping ElasticSearch 2 tests: 'elasticsearch>=2.0.0,<3.0.0' not installed.")
        raise unittest.SkipTest("'elasticsearch>=2.0.0,<3.0.0' not installed.")

    url = settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL']
    client = Elasticsearch(url)
    try:
        client.info()
    except exceptions.ConnectionError as exc:
        not_running = "elasticsearch not running on %r" % url
        log.error(not_running, exc_info=True)
        raise unittest.SkipTest(not_running, exc)
def get_es_client(self):
    """Build a client from the ``elastic`` config section and print its info.

    Reads ``cluster``, ``user`` and ``password`` from ``self.config``. The
    client is created with TLS and certificate verification turned off.
    """
    settings_source = self.config
    cluster_hosts = [settings_source.get('elastic', 'cluster')]
    credentials = (
        settings_source.get('elastic', 'user'),
        settings_source.get('elastic', 'password'),
    )
    client = Elasticsearch(
        cluster_hosts,
        timeout=40,
        connection_class=RequestsHttpConnection,
        http_auth=credentials,
        use_ssl=False,
        verify_certs=False,
    )
    print("ES client Check:")
    print(client.info())
    return client
def snapshot_indices_from_src_to_s3(config):
    """
    Snapshot the indices named in the config to an S3 repository.

    An S3 snapshot repository is registered on the source cluster and a
    snapshot of the configured indices is started without waiting for
    completion. Errors during these steps are printed, not raised.

    Parameters:
        config: dictionary storing the configuration details
    """
    es_config = config['elasticsearch_config']
    src_seed1 = es_config['es_src_seed1']
    es_s3_repo = es_config['es_repository_name']

    try:
        src_seed2 = es_config['es_src_seed2']
        src_seed3 = es_config['es_src_seed3']
    except KeyError:
        # running in test mode? use a single node
        print("\n[WARN] Only one SOURCE seed node found in the config, falling back to single SOURCE seed...")
        src_seed2 = src_seed3 = src_seed1

    try:
        seed_nodes = [src_seed1, src_seed2, src_seed3]
        src_es = Elasticsearch(seed_nodes, sniff_on_start=True,
                               sniff_on_connection_fail=True, sniffer_timeout=60)
        print("\n[INFO] Connected to src ES cluster: %s" %(src_es.info()))

        # Config lookups stay inside the try block, in their original
        # order, so a missing key is reported through the same error path.
        s3_config = config['aws_s3_config']
        repo_settings = {
            "region": s3_config['aws_region'],
            "bucket": s3_config['s3_bucket_name'],
            "base_path": s3_config['s3_base_path'],
        }
        api_keys = config['aws_api_keys']
        repo_settings["access_key"] = api_keys['aws_access_key']
        repo_settings["secret_key"] = api_keys['aws_secret_key']

        src_es.snapshot.create_repository(
            repository=es_s3_repo,
            body={"type": "s3", "settings": repo_settings},
            request_timeout=30,
            verify=False)

        print("\n[INFO] Snapshotting ES indices: '%s' to S3...\n" %(es_config['index_names']))
        src_es.snapshot.create(
            repository=es_s3_repo,
            snapshot=es_config['snapshot_name'],
            body={"indices": es_config['index_names']},
            wait_for_completion=False)
    except Exception as e:
        print("\n\n[ERROR] Unexpected error: %s" %(str(e)))
def get_elasticsearch_info():
    """Check Elasticsearch connection.

    :return: ``{"status": NO_CONFIG}`` when no URL is configured,
        ``{"status": DOWN}`` on a connection error, otherwise
        ``{"status": UP, "response_microseconds": <int>}`` with the total
        time taken to create and probe the client.
    """
    from elasticsearch import (
        Elasticsearch,
        ConnectionError as ESConnectionError
    )
    try:
        url = settings.HAYSTACK_CONNECTIONS["default"]["URL"]
    except (AttributeError, KeyError) as ex:
        log.error("No elasticsearch connection info found in settings. "
                  "Error: %s", ex)
        return {"status": NO_CONFIG}
    start = datetime.now()
    try:
        search = Elasticsearch(url, request_timeout=TIMEOUT_SECONDS)
        search.info()
    except ESConnectionError:
        return {"status": DOWN}
    del search  # The elasticsearch library has no "close" or "disconnect."
    elapsed = datetime.now() - start
    # ``timedelta.microseconds`` is only the sub-second component; fold in
    # days and seconds so probes of one second or longer are reported right.
    micro = ((elapsed.days * 86400 + elapsed.seconds) * 10 ** 6
             + elapsed.microseconds)
    return {
        "status": UP,
        "response_microseconds": micro,
    }
class Elastic(object):
    """
    Base class for working with an ElasticSearch database
    """

    def __init__(self, di):
        # Connection parameters come from a dict; the client itself is
        # only created when a request first needs it.
        self.host = di['host']
        self.port = int(di['port'])
        self.con = None
        self.index = di['index']

    def __connect(self):
        '''
        Connect to the database
        '''
        node = {'host': self.host, 'port': self.port}
        self.con = Elasticsearch([node])
        self.con.info()

    def search_all(self, query):
        """
        Run a search across all ``<index>-*`` indices; reconnects and
        retries once if there is no connection
        """
        pattern = '{0}-*'.format(self.index)
        try:
            return self.con.search(index=pattern, body=query)
        except (AttributeError, ConnectionError):
            self.__connect()
            return self.con.search(index=pattern, body=query)

    def store(self, query):
        """
        Index ``query`` into today's index; reconnects and retries once if
        there is no connection
        """
        today_suffix = strftime('%Y.%m.%d', localtime())
        daily_index = '{0}-{1}'.format(self.index, today_suffix)
        try:
            self.con.index(index=daily_index, doc_type='logs', body=query)
            return
        except (AttributeError, ConnectionError):
            pass
        try:
            self.__connect()
            self.con.index(index=daily_index, doc_type='logs', body=query)
        except ConnectionError:
            print('failed to connect elasticsearch')
            print(query)
def health(request):
    """Health endpoint: verify Elasticsearch, then hand over to ``check_data``.

    Returns a plain-text 500 response when the client cannot be created,
    the ``info()`` probe raises, or it returns an empty result.
    """
    # check database
    message = 'OK'
    status = 200
    # check elasticsearch
    try:
        client = Elasticsearch(settings.ELASTIC_SEARCH_HOSTS)
        # Explicit check instead of ``assert`` so it is not stripped
        # under ``python -O``.
        if not client.info():
            raise RuntimeError("Elasticsearch info() returned an empty response")
    except Exception:
        log.exception("Elasticsearch connectivity failed")
        message += "\nElasticsearch connectivity failed."
        status = 500
        return HttpResponse(message, content_type='text/plain', status=status)
    # return HttpResponse(message, content_type='text/plain', status=status)
    return check_data(request)
def connect():
    """Return a SigV4-authenticated client for the hosted search domain.

    The cluster info is printed before the client is returned.
    """
    from elasticsearch import Elasticsearch, RequestsHttpConnection
    from requests_aws4auth import AWS4Auth

    REGION = "us-west-2"
    host = 'search-search-jxvug2z72gmuoz6ysdy3rz4z44.us-west-2.es.amazonaws.com'
    # NOTE(review): AWS credentials are hard-coded in source; they should
    # be rotated and loaded from the environment or a secrets store.
    awsauth = AWS4Auth("AKIAIC6D7UKE76OCB6HQ",
                       "/OMQDN2x8+FZyVeIxa7bbtNXswhYB7uIBOkz6rDi",
                       REGION, 'es')

    endpoint = {'host': host, 'port': 443}
    client = Elasticsearch(
        hosts=[endpoint],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection,
    )
    print(client.info())
    return client
def consumeKafkaToEs():
    """Consume ``TOPIC`` from Kafka and index every message into ``test-index``.

    Each message value is decoded as UTF-8 JSON and indexed with doc type
    ``fb``, followed by a one second pause.
    """
    es_node = {'host': ES_HOST, 'port': 9200}
    es = Elasticsearch(hosts=[es_node],
                       connection_class=RequestsHttpConnection)
    print(es.info())

    consumer = KafkaConsumer(
        TOPIC,
        group_id='my-group',
        bootstrap_servers=['localhost:9092'],
        # value_deserializer=lambda m: json.loads(m.decode('utf-8'))
    )
    for record in consumer:
        print(dir(record))
        print("%s:%d:%d: key=%s value=%s" % (record.topic, record.partition,
                                             record.offset, record.key,
                                             record.value))
        payload = json.loads(record.value.decode("utf-8"))
        es.index(index="test-index", doc_type='fb', body=payload)
        print("done insert")
        time.sleep(1)
class Elastic:
    """Small stateful helper around an Elasticsearch client."""

    def __init__(self, credentials):
        """Read ``elastic.ip`` / ``elastic.port``; no connection is made yet."""
        self.ip = credentials["elastic.ip"]
        self.port = credentials["elastic.port"]
        self.connecting = False
        self.connected = False

    def start(self):
        """Create the client, then refresh all indices."""
        self.connect()
        self.refresh("_all")

    def connect(self):
        """Create the client object for the configured host and port."""
        self.connecting = True
        print("[elastic] :connecting...")
        self.es = Elasticsearch(hosts=[{'host': self.ip, 'port': self.port}])

    def putIndex(self, putIndex, theBody):
        """Index ``theBody`` as tweet id 1 in ``putIndex``; print ``created``."""
        # The body is the ``theBody`` argument; the earlier code referenced
        # an undefined name ``doc`` and raised NameError.
        res = self.es.index(index=putIndex, doc_type='tweet', id=1, body=theBody)
        print(res['created'])

    def getIndex(self, getIndex):
        """Fetch tweet id 1 from ``getIndex`` and print its source."""
        res = self.es.get(index=getIndex, doc_type='tweet', id=1)
        print(res['_source'])

    def refresh(self, refIndex):
        """Refresh ``refIndex``, print cluster info and mark as connected."""
        self.es.indices.refresh(index=refIndex)
        print("[elastic] :connected!")
        print(self.es.info())
        self.connected = True

    def search(self, index, query):
        """Run a match-all search on ``test-index`` and print the hits.

        NOTE(review): ``index`` and ``query`` are currently ignored;
        confirm whether callers rely on the fixed search before changing.
        """
        res = self.es.search(index="test-index", body={
            "query": {
                "match_all": {}
            }
        })
        print("Got %d Hits:" % res['hits']['total'])
        for hit in res['hits']['hits']:
            print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
        return res
def load(tweets):
    """Bulk-index ``tweets`` into the configured index.

    The index is created when missing. When it exists, the mapping is
    updated; if that fails the index is deleted and recreated. Documents
    are sent in chunks of ``bulk_chunk_size``, with a final partial chunk
    at the end. Prints use the single-argument call form, valid on
    Python 2 and 3.

    :param tweets: sized sequence of raw documents accepted by ``get_tweet``.
    """
    es = Elasticsearch(host = config.es_host, port = config.es_port)
    es_version_number = es.info()['version']['number']
    tweet_mapping = get_tweet_mapping(es_version_number)
    mapping = {doc_type: tweet_mapping}

    if es.indices.exists(index_name):
        print('index {} already exists'.format(index_name))
        try:
            es.indices.put_mapping(doc_type, tweet_mapping, index_name)
        except ElasticsearchException as e:
            # Mapping could not be updated: rebuild the index from scratch.
            print('error putting mapping:\n'+str(e))
            print('deleting index {}...'.format(index_name))
            es.indices.delete(index_name)
            create_index(es, index_name, mapping)
    else:
        print('index {} does not exist'.format(index_name))
        create_index(es, index_name, mapping)

    bulk_data = []
    list_size = len(tweets)
    for counter, doc in enumerate(tweets, 1):
        tweet = get_tweet(doc)
        bulk_data.append({
            "_index": index_name,
            "_type": doc_type,
            "_id": tweet[id_field],
            "_source": tweet
        })

        # Flush on every full chunk and once more for the final remainder.
        if counter % bulk_chunk_size == 0 or counter == list_size:
            print("ElasticSearch bulk index (index: {INDEX}, type: {TYPE})...".format(INDEX=index_name, TYPE=doc_type))
            success, _ = bulk(es, bulk_data)
            print('ElasticSearch indexed %d documents' % success)
            bulk_data = []
parser = argparse.ArgumentParser() parser.add_argument("--es_host", default="localhost:9200", help="ES Connection String") parser.add_argument("--es_user", default="elastic", help="ES User") parser.add_argument("--es_password", default="changeme", help="ES Password") parser.add_argument("--interval", default=300, help="Interval in Seconds", type=int) parser.add_argument("--start_time", help="Start Time") parser.add_argument("--end_time", help="End Time") parser.add_argument("--watch_template", help="Watch File") options = parser.parse_args() start_time = datetime.strptime(options.start_time, '%Y-%m-%dT%H:%M:%SZ') end_time = datetime.strptime(options.end_time, '%Y-%m-%dT%H:%M:%SZ') client = Elasticsearch(hosts=[options.es_host], http_auth=(options.es_user, options.es_password), use_ssl=False, timeout=300) try: cluster = client.info() except: print("Cluster not accessible") sys.exit(1) watch_template = json.loads(open(options.watch_template).read()) next_time = start_time while next_time < end_time: print("Executing for %s-%s seconds"%(next_time.strftime('%Y-%m-%dT%H:%M:%SZ'),options.interval)) watch_body = watch_template watch_body["metadata"]["time_period"] = "%ss"%options.interval client.transport.perform_request('POST', _make_path('_xpack', 'watcher', 'watch', '_execute'), body={ "trigger_data":{ "scheduled_time":next_time.strftime('%Y-%m-%dT%H:%M:%SZ') },
def elasticsearch_fail():
    """Send an ``info`` request to example.com:9999 with a 0.1 s timeout.

    NOTE(review): judging by the name, this call is expected to fail.
    """
    target_node = {'host': 'example.com', 'port': 9999}
    client = Elasticsearch([target_node], timeout=0.1)
    client.info()
class Test(BaseTest):
    """Integration test for ``filebeat setup --machine-learning``."""

    def init(self):
        """Resolve service URLs, create the Elasticsearch client and fixture paths."""
        self.elasticsearch_url = self.get_elasticsearch_url()
        self.kibana_url = self.get_kibana_url()
        print("Using elasticsearch: {}".format(self.elasticsearch_url))
        self.es = Elasticsearch([self.elasticsearch_url])
        # Keep the HTTP client libraries quiet in the test output.
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)

        self.modules_path = os.path.abspath(self.working_dir +
                                            "/../../../../module")

        self.kibana_path = os.path.abspath(self.working_dir +
                                           "/../../../../build/kibana")

        self.filebeat = os.path.abspath(self.working_dir +
                                        "/../../../../filebeat.test")

        self.index_name = "test-filebeat-ml"

    @parameterized.expand([
        (False,),
        (True,),
    ])
    @unittest.skipIf(not INTEGRATION_TESTS,
                     "integration tests are disabled, run with INTEGRATION_TESTS=1 to enable them.")
    @unittest.skipIf(os.getenv("TESTING_ENVIRONMENT") == "2x",
                     "integration test not available on 2.x")
    @unittest.skipIf(os.name == "nt", "skipped on Windows")
    @unittest.skip("Skipped as flaky: https://github.com/elastic/beats/issues/11629")
    def test_ml_setup(self, modules_flag):
        """ Test ML are installed in all possible ways """
        self._run_ml_test(modules_flag)

    def _run_ml_test(self, modules_flag):
        """Run ``filebeat setup --machine-learning`` twice and check the outcome.

        The first run must create the nginx ML job and datafeed; the second
        run must log that the objects already exist.

        :param modules_flag: when true the nginx module is selected with
            ``--modules=nginx`` on the command line; otherwise it is enabled
            through a ``modules.d/nginx.yml`` file.
        """
        self.init()

        from elasticsearch import AuthorizationException

        es_info = self.es.info()
        version = semver.parse(es_info["version"]["number"])
        # The license and ML endpoints are addressed under /_xpack before 7.0.
        if version["major"] < 7:
            start_trial_api_url = "/_xpack/license/start_trial?acknowledge=true"
            ml_datafeeds_url = "/_xpack/ml/datafeeds/"
            ml_anomaly_detectors_url = "/_xpack/ml/anomaly_detectors/"
        else:
            start_trial_api_url = "/_license/start_trial?acknowledge=true"
            ml_datafeeds_url = "/_ml/datafeeds/"
            ml_anomaly_detectors_url = "/_ml/anomaly_detectors/"

        try:
            self.es.transport.perform_request("POST", start_trial_api_url)
        except AuthorizationException:
            print("License already enabled")

        print("Test modules_flag: {}".format(modules_flag))

        # Clean any previous state
        # NOTE(review): the ids removed here differ from the ids awaited
        # further down - confirm the cleanup targets the right objects.
        for df in self.es.transport.perform_request("GET", ml_datafeeds_url)["datafeeds"]:
            if df["datafeed_id"] == 'filebeat-nginx-access-response_code':
                # Use the version-dependent prefix: the previously hard-coded
                # "/_ml/datafeeds/" ignored the < 7 branch above.
                self.es.transport.perform_request(
                    "DELETE", ml_datafeeds_url + df["datafeed_id"])

        for df in self.es.transport.perform_request("GET", ml_anomaly_detectors_url)["jobs"]:
            if df["job_id"] == 'datafeed-filebeat-nginx-access-response_code':
                self.es.transport.perform_request(
                    "DELETE", ml_anomaly_detectors_url + df["job_id"])

        shutil.rmtree(os.path.join(self.working_dir,
                                   "modules.d"), ignore_errors=True)

        # generate a minimal configuration
        cfgfile = os.path.join(self.working_dir, "filebeat.yml")
        self.render_config_template(
            template_name="filebeat_modules",
            output=cfgfile,
            index_name=self.index_name,
            elasticsearch_url=self.elasticsearch_url,
            kibana_url=self.kibana_url,
            kibana_path=self.kibana_path)

        if not modules_flag:
            # Enable nginx
            os.mkdir(os.path.join(self.working_dir, "modules.d"))
            with open(os.path.join(self.working_dir, "modules.d/nginx.yml"), "wb") as nginx:
                nginx.write("- module: nginx")

        cmd = [
            self.filebeat, "-systemTest",
            "-e", "-d", "*",
            "-c", cfgfile
        ]

        # Skipping dashboard loading to speed up tests
        cmd += ["-E", "setup.dashboards.enabled=false"]
        cmd += ["setup", "--machine-learning"]
        if modules_flag:
            cmd += ["--modules=nginx"]

        output_path = os.path.join(self.working_dir, "output.log")
        output = open(output_path, "ab")
        output.write(" ".join(cmd) + "\n")
        beat = subprocess.Popen(cmd,
                                stdin=None,
                                stdout=output,
                                stderr=output,
                                bufsize=0)

        # Check result
        self.wait_until(lambda: "filebeat-nginx_ecs-access-status_code_rate_ecs" in
                        (df["job_id"] for df in self.es.transport.perform_request(
                            "GET", ml_anomaly_detectors_url)["jobs"]),
                        max_timeout=60)
        self.wait_until(lambda: "datafeed-filebeat-nginx_ecs-access-status_code_rate_ecs" in
                        (df["datafeed_id"] for df in self.es.transport.perform_request("GET", ml_datafeeds_url)["datafeeds"]))

        beat.kill()
        # The child process holds its own copy of the descriptor; release ours.
        output.close()

        # check if fails during trying to setting it up again
        output = open(output_path, "ab")
        output.write(" ".join(cmd) + "\n")
        beat = subprocess.Popen(cmd,
                                stdin=None,
                                stdout=output,
                                stderr=output,
                                bufsize=0)

        for obj in ["Datafeed", "Job", "Dashboard", "Search", "Visualization"]:
            self.wait_log_contains("{obj} already exists".format(obj=obj),
                                   logfile=output_path,
                                   max_timeout=60)

        beat.kill()
        output.close()
class ElasticsearchAPI:
    """
    Each query will have its own index based on query name.
    index_name = query.name
    Doc type = query_name to make it possible to set mapping. Mapping is set per doc_type.
    All rows from a Query should look the same no matter the source.
    This makes all the data from all the servers in the same index.
        Comparable.
        Less indexes.
    """

    def __init__(self, host, port, user, password):
        """Connect to Elasticsearch at ``host``:``port`` and log the server info.

        ``user`` and ``password`` are accepted but not used by the connection
        that is created here.
        """
        logger.info("Connecting to ES %s..." % host)
        self.es = Elasticsearch(hosts=[
            {'host': host, 'port': port}, ])
        logger.debug(self.es.info())

    @staticmethod
    def from_config_manager(config_manager):
        """Build an instance from the ``'Elasticsearch'`` configuration section.

        The section must provide ``host``, ``port``, ``username`` and
        ``password``.
        """
        config = config_manager.get_config('Elasticsearch')

        # Argument order follows __init__(host, port, user, password); the
        # credentials used to be passed the other way round.
        return ElasticsearchAPI(config['host'],
                                config['port'],
                                config['username'],
                                config['password'])

    def consume_all(self, items, doc_type, index_name, id_column_name):
        """Bulk-index ``items`` and refresh afterwards.

        :param items: sized collection of documents (mappings).
        :param doc_type: document type written to each bulk action.
        :param index_name: target index.
        :param id_column_name: key of each document whose value becomes ``_id``.
        :return: the number of items passed in.
        """
        print('Pushing %s docs to index: %s' % (len(items), index_name))
        actions = [
            {
                "_id": doc[id_column_name],
                "_index": index_name,
                "_type": doc_type,
                "_source": doc,
            }
            for doc in items
        ]
        helpers.bulk(self.es, actions)
        self.es.indices.refresh()
        return len(items)

    def find_ids(self, ids, doc_type, index_name):
        """Return those of ``ids`` that exist in ``index_name`` (multi-get lookup)."""
        body = {"ids": ids}
        result = self.es.mget(index=index_name, doc_type=doc_type, body=body)
        if len(result) > 0:
            return [r['_id'] for r in result['docs'] if r['found'] is True]
        return []

    def init_indexes_for(self, sources):
        """Call ``init_index_for_source`` for every entry of ``sources``."""
        # NOTE(review): init_index_for_source is not defined in the part of
        # the class visible here - confirm it exists.
        for source in sources:
            self.init_index_for_source(source)

    def set_mapping(self, doc_type, index_name, mapping):
        """Put ``mapping`` for ``doc_type`` on ``index_name``."""
        self.es.indices.put_mapping(
            index=index_name,
            doc_type=doc_type,
            body=mapping)

    def delete_index(self, index_name):
        """Delete ``index_name``; a 404 response (missing index) is ignored."""
        print('Truncating data in index: %s' % index_name)
        self.es.indices.delete(index=index_name, ignore=404)

    def create_index(self, index_name):
        """Create ``index_name``; a 400 response is ignored."""
        print('Creating index %s' % index_name)
        self.es.indices.create(index_name, ignore=400)
def main():
    """Archive one daily Elasticsearch index to gzip, delete it and upload to S3.

    The index name is ``<logtype>-YYYY.MM.DD`` for the date ``--numdays``
    days before today. All documents are read with a scan/scroll search and
    written to ``<archive>.gz``. Unless ``--dry`` is given the index is then
    deleted. In both cases the archive is uploaded to ``--bucket``.

    Any error while reading the index aborts the run before the index is
    deleted.
    """
    # Imported here so the fix does not depend on the file-level imports.
    from datetime import timedelta

    parser = argparse.ArgumentParser()
    parser.add_argument('--numdays', help='Number of days back from today to archive, default is 0', default=0)
    parser.add_argument('--archive', help='Archive name')
    parser.add_argument('--logtype', help='Log type cwl/logstash', default='cwl')
    parser.add_argument('--esaddress', help='Elasticsearch Address', default='localhost')
    parser.add_argument('--esport', help='Elasticsearch Port', default=9200)
    parser.add_argument('--bucket', help='S3 bucket name')
    parser.add_argument('--awsaccesskey', help='AWS Access Key')
    parser.add_argument('--awssecretkey', help='AWS Secret Key')
    parser.add_argument('--awsregion', help='AWS Region', default='us-east-1')
    parser.add_argument('--dry', help='Dry run', action='store_true')
    args = parser.parse_args()

    archiveName = args.archive

    # Real date arithmetic: subtracting from now.day alone produced day 0 or
    # negative days and never rolled over into the previous month or year.
    indexDate = datetime.now() - timedelta(days=int(args.numdays))
    # The day is zero-padded like the month.
    # NOTE(review): assumes daily indices are named YYYY.MM.DD - confirm.
    indexName = '%s-%s.%02d.%02d' % (
        args.logtype, indexDate.year, indexDate.month, indexDate.day)

    stuffs = []

    awsauth = AWS4Auth(args.awsaccesskey, args.awssecretkey, args.awsregion, 'es')
    es = Elasticsearch(
        hosts=[{'host': args.esaddress, 'port': int(args.esport)}],
        http_auth=awsauth,
        use_ssl=True,
        verify_certs=True,
        connection_class=RequestsHttpConnection
    )
    print(es.info())

    query = {"query": {"match_all": {}}}

    if es.search_exists(index=indexName):
        rs = es.search(index=indexName, scroll='60s', search_type='scan', size=100, body=query)
        scroll_size = rs['hits']['total']
        while scroll_size > 0:
            # Deliberately no blanket except here: a failed scroll used to end
            # the loop silently, after which a truncated archive was written
            # and the index deleted. Let the error abort the run instead.
            rs = es.scroll(scroll_id=rs['_scroll_id'], scroll='60s')
            hits = rs['hits']['hits']
            stuffs += hits
            scroll_size = len(hits)
    else:
        print('Index %s does not exist' % indexName)
        exit()

    # The with-block closes the archive (the old bare `f.close` did nothing).
    with gzip.open(archiveName + '.gz', 'wb') as f:
        for stuff in stuffs:
            f.write(str(stuff))

    if args.dry:
        print("Not deleting index %s" % indexName)
    else:
        es.indices.delete(index=indexName)

    print('Pushing to bucket name %s' % args.bucket)
    s3 = boto3.resource('s3')
    with open(archiveName + '.gz', 'rb') as data:
        s3.Bucket(args.bucket).put_object(Key=archiveName + '.gz', Body=data)
return True parser = argparse.ArgumentParser() parser.add_argument('inputfile', type=str, help='the input file, must be .json created from django dumpdata command') parser.add_argument('index', type=str, help='the elastic index') args = parser.parse_args() input_fp = args.inputfile index = args.index print('connecting to elastic index %s...' % index) es = Elasticsearch() try: print(es.info()) except Exception as e: print(e) raise print('...OK') print('loading data from %s...' % input_fp) DATA = json.load(open(input_fp)) print('found %s objects' % str(len(DATA))) print('this would be a sample document:\n') print(DATA[0]['fields'])
class Connector:
    """Access to an Elasticsearch instance and to the dmon REST API.

    Elasticsearch is reached through ``esInstance``; dmon is reached over
    HTTP at ``http://<esEndpoint>:<dmonPort>/dmon/v1/...``.
    """

    def __init__(self, esEndpoint, dmonPort=5001, esInstanceEndpoint=9200, index="logstash-*"):
        """Create the Elasticsearch client and remember the connection settings.

        :param esEndpoint: host passed to ``Elasticsearch`` and used in dmon URLs.
        :param dmonPort: port used in dmon URLs.
        :param esInstanceEndpoint: stored as given; not used by this class.
        :param index: index (pattern) searched by ``query`` and ``aggQuery``.
        """
        self.esInstance = Elasticsearch(esEndpoint)
        self.esEndpoint = esEndpoint
        self.dmonPort = dmonPort
        self.esInstanceEndpoint = esInstanceEndpoint
        self.myIndex = index

    @staticmethod
    def _timestamp():
        """Return the current local time formatted for log messages."""
        return datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

    def _dmonGet(self, resource, label):
        """GET ``/dmon/v1/<resource>`` and return the decoded JSON body.

        ``label`` only appears in the log message. A failing request is
        logged and terminates the process with exit status 2.
        """
        nUrl = "http://%s:%s/dmon/v1/%s" % (self.esEndpoint, self.dmonPort, resource)
        logger.info('[%s] : [INFO] dmon get %s url -> %s', self._timestamp(), label, nUrl)
        try:
            response = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occured while connecting to dmon with type %s at arguments %s',
                         self._timestamp(), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        return response.json()

    def query(self, queryBody, allm=True, dMetrics=None, debug=False):
        """Run ``queryBody`` against the configured index and flatten the hits.

        :param queryBody: Elasticsearch query body.
        :param allm: when ``False`` only the fields named in ``dMetrics`` are
            read from each hit; otherwise every ``_source`` field is read.
        :param dMetrics: field names, required when ``allm`` is ``False``
            (the process exits if it is empty while there are hits).
        :param debug: print the raw response and every parsed value.
        :return: ``(ListMetrics, res)`` - one dict per hit, and the raw response.
        """
        res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=230)
        separator = "%---------------------------------------------------------%"
        if debug:
            print(separator)
            print("Raw JSON Ouput")
            print(res)
            print("%d documents found" % res['hits']['total'])
            print(separator)

        # NOTE(review): values accumulate across hits, exactly as in the
        # previous implementation: a field missing from a later document
        # keeps the value seen in an earlier one. Confirm this is intended.
        merged = {}
        ListMetrics = []
        for doc in res['hits']['hits']:
            if allm == False:  # noqa: E712 - keeps the original comparison
                if not dMetrics:
                    sys.exit("dMetrics argument not set. Please supply valid list of metrics!")
                wanted = dMetrics
            else:
                wanted = doc['_source']
            for met in wanted:
                value = doc['_source'][met]
                if debug:
                    # prints the values of the metrics that are being read
                    print(separator)
                    print("Parsed Output -> ES doc id, metrics, metrics values.")
                    print("doc id %s) metric %s -> value %s" % (doc['_id'], met, value))
                    print(separator)
                merged[met] = value
            # A snapshot per hit; also well-defined when a hit has no fields
            # (this used to raise NameError for the first hit).
            ListMetrics.append(dict(merged))
        return ListMetrics, res

    def info(self):
        """Return the Elasticsearch ``info()`` response, or an error string on failure."""
        try:
            res = self.esInstance.info()
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occured while connecting to ES dmon with type %s at arguments %s',
                         self._timestamp(), type(inst), inst.args)
            return "An exception has occured with type %s at arguments %s" % (type(inst), inst.args)
        return res

    def roles(self):
        """Return the node roles reported by dmon."""
        return self._dmonGet("overlord/nodes/roles", "roles")

    def createIndex(self, indexName):
        """Create index ``indexName``; failures are logged, not raised."""
        try:
            # Index management lives on the ``indices`` client; the top-level
            # ``create`` call is the document-creation API.
            self.esInstance.indices.create(index=indexName, ignore=400)
            logger.info('[%s] : [INFO] Created index %s', self._timestamp(), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to created index %s with %s and %s',
                         self._timestamp(), indexName, type(inst), inst.args)

    def closeIndex(self, indexName):
        """Close index ``indexName``; failures are logged, not raised."""
        try:
            # ``indices.close`` closes an index; the previous top-level
            # ``close`` call was not an index operation.
            self.esInstance.indices.close(index=indexName)
            logger.info('[%s] : [INFO] Closed index %s', self._timestamp(), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to close index %s with %s and %s',
                         self._timestamp(), indexName, type(inst), inst.args)

    def deleteIndex(self, indexName):
        """Delete index ``indexName``; return the response, or 0 on failure."""
        try:
            res = self.esInstance.indices.delete(index=indexName, ignore=[400, 404])
            logger.info('[%s] : [INFO] Deleted index %s', self._timestamp(), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to delete index %s with %s and %s',
                         self._timestamp(), indexName, type(inst), inst.args)
            return 0
        return res

    def openIndex(self, indexName):
        """Open index ``indexName`` and return the response."""
        res = self.esInstance.indices.open(index=indexName)
        logger.info('[%s] : [INFO] Open index %s', self._timestamp(), indexName)
        return res

    def getIndex(self, indexName):
        """Return the ``indices.get`` response for ``indexName``."""
        res = self.esInstance.indices.get(index=indexName, human=True)
        return res

    def getIndexSettings(self, indexName):
        """Return the settings of ``indexName``."""
        res = self.esInstance.indices.get_settings(index=indexName, human=True)
        return res

    def clusterHealth(self):
        """Return the cluster health response."""
        res = self.esInstance.cluster.health(request_timeout=15)
        return res

    def clusterSettings(self):
        """Return the cluster settings response."""
        res = self.esInstance.cluster.get_settings(request_timeout=15)
        return res

    def clusterState(self):
        """Return the ``cluster.stats`` response."""
        res = self.esInstance.cluster.stats(human=True, request_timeout=15)
        return res

    def nodeInfo(self):
        """Return the ``nodes.info`` response."""
        res = self.esInstance.nodes.info(request_timeout=15)
        return res

    def nodeState(self):
        """Return the ``nodes.stats`` response."""
        res = self.esInstance.nodes.stats(request_timeout=15)
        return res

    def getStormTopology(self):
        """Return the Storm topology detected by dmon."""
        return self._dmonGet("overlord/detect/storm", "storm topology")

    def pushAnomaly(self, anomalyIndex, doc_type, body):
        """Index ``body`` into ``anomalyIndex``; exit with status 2 on failure."""
        try:
            res = self.esInstance.index(index=anomalyIndex, doc_type=doc_type, body=body)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occured while pushing anomaly with type %s at arguments %s',
                         self._timestamp(), type(inst), inst.args)
            print("Can't push anomaly to dmon!")
            sys.exit(2)
        return res

    def getModel(self):
        """Placeholder: returns the literal string ``"getModel"``."""
        return "getModel"

    def pushModel(self):
        """Placeholder: returns the literal string ``"push model"``."""
        return "push model"

    def localData(self):
        """Placeholder: returns the literal string ``"use local data"``."""
        return "use local data"

    def getInterval(self):
        """Return the interval settings reported by dmon."""
        return self._dmonGet("overlord/aux/interval", "interval")

    def aggQuery(self, queryBody):
        """Run ``queryBody`` against the configured index and return the raw response.

        The request timeout is read from the ``ADP_TIMEOUT`` environment
        variable (default 60); the effective value is written back to the
        environment. A failing search is logged and terminates the process
        with exit status 2.
        """
        adt_timeout = os.environ['ADP_TIMEOUT'] = os.getenv('ADP_TIMEOUT', str(60))
        try:
            res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=float(adt_timeout))
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception while executing ES query with %s and %s',
                         self._timestamp(), type(inst), inst.args)
            sys.exit(2)
        return res

    def getNodeList(self):
        '''
        :return: -> returns the list of registered nodes from dmon
        '''
        rdata = self._dmonGet("observer/nodes", "node")
        nodes = []
        for e in rdata['Nodes']:
            for k in e:
                nodes.append(k)
        return nodes

    def getDmonStatus(self):
        """Return the dmon core status."""
        return self._dmonGet("overlord/core/status", "core status")
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000  # Max events to return when streaming results

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{'host': host, 'port': port}])
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_label_query(sketch_id, label_name):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            label_name: Name of the label to search for.

        Returns:
            Elasticsearch query as a dictionary.
        """
        query_dict = {
            'query': {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name': label_name
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
        }
        return query_dict

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
        return query_dict

    @staticmethod
    def _build_field_aggregator(field_name):
        """Build Elasticsearch query for aggregation based on field.

        Args:
            field_name: Field to aggregate.

        Returns:
            Elasticsearch aggregation as a dictionary.
        """
        field_aggregation = {
            'field_aggregation': {
                'terms': {
                    'field': '{0:s}.keyword'.format(field_name)
                }
            }
        }
        return field_aggregation

    def build_query(self, sketch_id, query_string, query_filter, query_dsl,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if not query_dsl:
            if query_filter.get('star', None):
                query_dsl = self._build_label_query(sketch_id, '__ts_star')

            if query_filter.get('events', None):
                events = query_filter['events']
                query_dsl = self._build_events_query(events)

            if not query_dsl:
                query_dsl = {
                    'query': {
                        'bool': {
                            'must': [{
                                'query_string': {
                                    'query': query_string
                                }
                            }]
                        }
                    }
                }
            if query_filter.get('time_start', None):
                # TODO(jberggren): Add support for multiple time ranges.
                query_dsl['query']['bool']['filter'] = {
                    'bool': {
                        'should': [{
                            'range': {
                                'datetime': {
                                    'gte': query_filter['time_start'],
                                    'lte': query_filter['time_end']
                                }
                            }
                        }]
                    }
                }
            if query_filter.get('from', None):
                query_dsl['from'] = query_filter['from']
            if query_filter.get('size', None):
                query_dsl['size'] = query_filter['size']
            if query_filter.get('exclude', None):
                query_dsl['post_filter'] = {
                    'bool': {
                        'must_not': {
                            'terms': {
                                'data_type': query_filter['exclude']
                            }
                        }
                    }
                }
        else:
            query_dsl = json.loads(query_dsl)

        # Make sure we are sorting.
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {
                'datetime': query_filter.get('order', 'asc')
            }

        # Remove any aggregation coming from user supplied Query DSL. We have
        # no way to display this data in a good way today.
        # TODO: Revisit this and figure out if we can display the data.
        if query_dsl.get('aggregations', None):
            del query_dsl['aggregations']

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl[
                    'post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations
        return query_dsl

    def search(self, sketch_id, query_string, query_filter, query_dsl,
               indices, count=False, aggregations=None, return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch.

        This will take a query string from the UI together with a filter
        definition. Based on this it will execute the search request on
        ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format, or the number of matching
            documents when count is set.
        """
        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents matches the query.
        if count:
            del query_dsl['sort']
            count_result = self.client.count(
                body=query_dsl, index=list(indices))
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(
                body=query_dsl,
                index=list(indices),
                search_type=search_type,
                scroll=scroll_timeout)

        # Suppress the lint error because elasticsearch-py adds parameters
        # to the function with a decorator and this makes pylint sad.
        # pylint: disable=unexpected-keyword-arg
        return self.client.search(
            body=query_dsl,
            index=list(indices),
            search_type=search_type,
            _source_include=return_fields,
            scroll=scroll_timeout)

    def search_stream(
            self, sketch_id=None, query_string=None, query_filter=None,
            query_dsl=None, indices=None, return_fields=None):
        """Search ElasticSearch and stream the results with the scroll API.

        This will take a query string from the UI together with a filter
        definition. Based on this it will execute the search request on
        ElasticSearch and get result back.

        Args :
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply. Missing
                'size' and 'terminate_after' entries are filled in on the
                dictionary that is passed in.
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return

        Returns:
            Generator of event documents in JSON format
        """
        # The declared default is None; without this guard the lookups below
        # raised AttributeError when no filter was given.
        if query_filter is None:
            query_filter = {}

        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(
            sketch_id=sketch_id,
            query_string=query_string,
            query_dsl=query_dsl,
            query_filter=query_filter,
            indices=indices,
            return_fields=return_fields,
            enable_scroll=True)

        scroll_id = result['_scroll_id']
        scroll_size = result['hits']['total']

        for event in result['hits']['hits']:
            yield event

        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            scroll_size = len(result['hits']['hits'])
            for event in result['hits']['hits']:
                yield event

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.get(
                index=searchindex_id,
                id=event_id,
                doc_type='_all',
                _source_exclude=['timesketch_label'])
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents.
        """
        if not indices:
            return 0
        result = self.client.count(index=indices)
        return result.get('count', 0)

    def set_label(self, searchindex_id, event_id, event_type, sketch_id,
                  user_id, label, toggle=False, single_update=True):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
                (add/remove). The default is False.
            single_update: Boolean if the label should be indexed immediately.
                The default is True.

        Returns:
            Dict with the update script (source, lang, params) when
            single_update is False, or None if this is a single update.
        """
        # Elasticsearch painless script.
        update_body = {
            'script': {
                'lang': 'painless',
                'source': ADD_LABEL_SCRIPT,
                'params': {
                    'timesketch_label': {
                        'name': str(label),
                        'user_id': user_id,
                        'sketch_id': sketch_id
                    }
                }
            }
        }

        if toggle:
            update_body['script']['source'] = TOGGLE_LABEL_SCRIPT

        if not single_update:
            script = update_body['script']
            return dict(
                source=script['source'], lang=script['lang'],
                params=script['params']
            )

        doc = self.client.get(
            index=searchindex_id, id=event_id, doc_type='_all')
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            # The document has no label list yet: add an empty one first so
            # the script has something to work on.
            doc = {'doc': {'timesketch_label': []}}
            self.client.update(
                index=searchindex_id,
                doc_type=event_type,
                id=event_id,
                body=doc)

        self.client.update(
            index=searchindex_id,
            id=event_id,
            doc_type=event_type,
            body=update_body)

        return None

    def create_index(self, index_name=None, doc_type='generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. Default is a generated UUID.
            doc_type: Name of the document type. Default id generic_event.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        # Generate the name per call. A `uuid4().hex` default in the signature
        # is evaluated once, so every call shared the same "generated" name.
        if index_name is None:
            index_name = uuid4().hex

        _document_mapping = {
            doc_type: {
                'properties': {
                    'timesketch_label': {
                        'type': 'nested'
                    }
                }
            }
        }

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError('Unable to connect to Timesketch backend.')
        # We want to return unicode here to keep SQLalchemy happy.
        if not isinstance(index_name, six.text_type):
            index_name = codecs.decode(index_name, 'utf-8')

        if not isinstance(doc_type, six.text_type):
            doc_type = codecs.decode(doc_type, 'utf-8')

        return index_name, doc_type

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

        Args:
            index_name: Name of the index to delete.
        """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend: {}'.format(e)
                )

    def import_event(
            self, index_name, event_type, event=None,
            event_id=None, flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Add event to Elasticsearch.

        Events are queued and sent in bulk every flush_interval events.
        Calling this without an event sends whatever is still queued.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
            event_id: Event Elasticsearch ID

        Returns:
            Total number of events queued through this instance so far.
        """
        if event:
            # Make sure we have decoded strings in the event dict. Only byte
            # strings are decoded; text keys and values are kept as they are.
            event = {
                (codecs.decode(k, 'utf8')
                 if isinstance(k, six.binary_type) else k):
                (codecs.decode(v, 'utf8')
                 if isinstance(v, six.binary_type) else v)
                for k, v in event.items()
            }

            # Header needed by Elasticsearch when bulk inserting.
            header = {
                'index': {
                    '_index': index_name,
                    '_type': event_type
                }
            }

            if event_id:
                # Event has "lang" defined if there is a script used for import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                # An existing document is updated instead of indexed.
                header = {
                    'update': {
                        '_index': index_name,
                        '_type': event_type,
                        '_id': event_id
                    }
                }

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1
            if self.import_counter['events'] % int(flush_interval) == 0:
                self.flush_queued_events()
        else:
            # Import the remaining events in the queue.
            self.flush_queued_events()

        return self.import_counter['events']

    def flush_queued_events(self):
        """Send all queued events to Elasticsearch and empty the queue."""
        if self.import_events:
            self.client.bulk(body=self.import_events)
            # Reset the queue so the same events are not sent again by the
            # next flush.
            self.import_events = []

    @property
    def version(self):
        """Get Elasticsearch version.

        Returns:
          Version number as a string.
        """
        version_info = self.client.info().get('version')
        return version_info.get('number')
class ElasticSearchUtil:
    """Thin convenience wrapper around a single Elasticsearch client.

    Every method forwards to the matching call on ``self.conn`` and returns
    the client's response unchanged unless stated otherwise.
    """

    def __init__(self, host):
        # Assign ``conn`` before building the client so that close() (and
        # therefore __del__) is safe even if the constructor below raises.
        self.conn = None
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return the info document of the connected ES cluster.
        :return: the response of ``info()``
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert one document into the given index and type. An id may be
        supplied; when it is omitted ES generates one.
        :param index: index to insert into
        :param type: document type to insert into
        :param body: document to insert -> dict
        :param id: custom document id
        :return: the response of the index call
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk insert.
        The bulk API expects a flat list of the form
        [{optionType: {Condition}}, {data}, ...] where
            optionType is one of index, delete, update,
            Condition may set the index and type for that single record,
            data is the single record to insert/update.
        Here every record gets an empty ``index`` action, so the ``index``
        and ``type`` arguments apply to all of them.
        :param index: default index for the inserted records
        :param type: default document type for the inserted records
        :param dataFrame: data set to insert (needs ``to_dict(orient='records')``)
        :return: the bulk response, or the error message as a string when
                 the bulk call raises
        '''
        actions = []
        for record in dataFrame.to_dict(orient='records'):
            # Each record is preceded by its action header.
            actions.append({"index": {}})
            actions.append(record)
        try:
            return self.conn.bulk(index=index, doc_type=type, body=actions)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return: the response of the delete call
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete every document in the index that matches the query.
        :param index:
        :param query: query body in DSL syntax
        :param type:
        :return: the response of the delete-by-query call
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete every document in the given index.
        :param index:
        :param type:
        :return: the delete-by-query response, or "<error> -> <index>" as a
                 string when the call raises
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Search the index for all documents matching the body.
        :param index:
        :param type:
        :param body: filter expression in DSL syntax
        :return: the response of the search call
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Fetch the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return: the response of the get call
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: the values to update
        :return: the response of the update call
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        '''
        Close the client connection, if any, and forget it.
        Safe to call repeatedly and on a partially constructed instance.
        '''
        conn = getattr(self, 'conn', None)
        self.conn = None
        if conn is None:
            return
        try:
            conn.close()
        except Exception:
            # Deliberate best effort: this also runs from __del__, where
            # raising would only produce interpreter noise.
            pass
def restore_indices_from_s3_to_dest(config):
    """
    Restore the specified indices from the snapshot specified in the config file.

    The indices are restored at the specified 'dest' ElasticSearch Node.
    ElasticSearch automatically replicates the indices across the ES cluster
    after the restore.

    The function registers the S3 snapshot repository on the destination
    cluster, closes every listed index, starts the restore without waiting
    for completion, and finally re-opens the indices via ``reopen_indices``.
    Errors are printed, not raised (missing mandatory config keys read
    before the connection is attempted still raise ``KeyError``).

    Parameters:
        config: dictionary storing the configuration details
    """
    dest_seed1 = config['elasticsearch_config']['es_dest_seed1']
    es_s3_repo = config['elasticsearch_config']['es_repository_name']
    index_list = config['elasticsearch_config']['index_names'].split(',')

    try:
        dest_seed2 = config['elasticsearch_config']['es_dest_seed2']
        dest_seed3 = config['elasticsearch_config']['es_dest_seed3']
    except KeyError:
        # running in test mode? use a single node
        print("\n[WARN] Are you running in test mode? Have you defined >1 dest node in the conf?")
        print("\n[WARN] Falling back to a single dest node...")
        dest_seed2 = dest_seed3 = dest_seed1

    # Bound before the try so the ``finally`` clause can tell whether a
    # connection was ever established.
    dest_es = None
    try:
        # specify all 3 dest ES nodes in the connection string
        dest_es = Elasticsearch([dest_seed1, dest_seed2, dest_seed3],
                                sniff_on_start=True,
                                sniff_on_connection_fail=True,
                                sniffer_timeout=60)

        dest_es.snapshot.create_repository(
            repository=es_s3_repo,
            body={
                "type": "s3",
                "settings": {
                    "region": config['aws_s3_config']['aws_region'],
                    "bucket": config['aws_s3_config']['s3_bucket_name'],
                    "base_path": config['aws_s3_config']['s3_base_path'],
                    "access_key": config['aws_api_keys']['aws_access_key'],
                    "secret_key": config['aws_api_keys']['aws_secret_key']
                }
            },
            request_timeout=30,
            verify=False)

        print("\n[INFO] Connected to dest ES cluster: %s" % (dest_es.info()))

        # must close indices before restoring:
        for index in index_list:
            try:
                print("[INFO] Closing index: '%s'" % (index))
                dest_es.indices.close(index=index, ignore_unavailable=True)
            except NotFoundError:
                print("\n\n[WARN] Index '%s' not present on Target ES cluster - could not close it." % (index))
            except Exception as e:
                # Both placeholders need a value; supplying only the error
                # text raised a TypeError inside this handler.
                print("\n\n[ERROR] Unexpected error '%s' while trying to close index: '%s'" % (str(e), index))

        print("\n[INFO] Restoring ES indices: '%s' from S3 snapshot...\n"
              % (config['elasticsearch_config']['index_names']))

        dest_es.snapshot.restore(
            repository=es_s3_repo,
            snapshot=config['elasticsearch_config']['snapshot_name'],
            body={"indices": config['elasticsearch_config']['index_names']},
            wait_for_completion=False)

    except Exception as e:
        print("\n\n[ERROR] Unexpected error: %s" % (str(e)))
    finally:
        if dest_es is None:
            # The client was never created, so there is nothing to re-open.
            print("\n[WARN] No connection to dest ES cluster - skipping re-open of indices")
        else:
            print("\n[INFO] (finally) Re-opening indices: '%s'" % (str(index_list)))
            reopen_indices(dest_es, index_list)
class ElasticSearchDB(object):
  """
  .. class:: ElasticSearchDB

  Wrapper around an Elasticsearch client that reports results as
  S_OK / S_ERROR structures.

  :param str clusterName: name of the cluster, filled in after a successful connection
  :param int RESULT_SIZE: The number of data points which will be returned by the query.
  """
  __chunk_size = 1000  # number of documents per bulk request
  __url = ""
  __timeout = 120  # default request timeout handed to the client
  clusterName = ''
  RESULT_SIZE = 10000

  ########################################################################
  def __init__(self, host, port, user=None, password=None, indexPrefix='', useSSL=True):
    """ c'tor

    :param self: self reference
    :param str host: host name of the Elasticsearch server
    :param int port: port of the Elasticsearch server (formatted with %d)
    :param str user: user name to access the db
    :param str password: if the db is password protected we need to provide a password
    :param str indexPrefix: it is the indexPrefix used to get all indexes
    :param bool useSSL: We can disable using secure connection. By default we use secure connection.
    """
    self.__indexPrefix = indexPrefix
    self._connected = False

    # The URL scheme depends on whether credentials were given, not on useSSL.
    if user and password:
      gLogger.debug("Specified username and password")
      self.__url = "https://%s:%s@%s:%d" % (user, password, host, port)
    else:
      gLogger.debug("Username and password not specified")
      self.__url = "http://%s:%d" % (host, port)

    gLogger.verbose("Connecting to %s:%s, useSSL = %s" % (host, port, useSSL))

    if useSSL:
      bd = BundleDeliveryClient()
      retVal = bd.getCAs()
      if not retVal['OK']:
        # fall back to the certifi bundle when the CAs cannot be obtained
        gLogger.error("CAs file does not exists:", retVal['Message'])
        casFile = certifi.where()
      else:
        casFile = retVal['Value']

      self.__client = Elasticsearch(self.__url,
                                    timeout=self.__timeout,
                                    use_ssl=True,
                                    verify_certs=True,
                                    ca_certs=casFile)
    else:
      self.__client = Elasticsearch(self.__url, timeout=self.__timeout)

    self.__tryToConnect()

  def getIndexPrefix(self):
    """
    It returns the index prefix given to the constructor.
    """
    return self.__indexPrefix

  ########################################################################
  def query(self, index, query):
    """ Executes a query and returns its result (uses ES DSL language).

    :param self: self reference
    :param basestring index: index name
    :param dict query: It is the query in ElasticSearch DSL language
    :return: S_OK(search result) or S_ERROR on a RequestError
    """
    try:
      esDSLQueryResult = self.__client.search(index=index, body=query)
      return S_OK(esDSLQueryResult)
    except RequestError as e:
      return S_ERROR(e)

  def _Search(self, indexname):
    """
    it returns the object which can be used for retrieving certain values from the DB
    """
    return Search(using=self.__client, index=indexname)

  ########################################################################
  def _Q(self, name_or_query='match', **params):
    """
    It is a wrapper to ElasticDSL Query module used to create a query object.

    :param str name_or_query is the type of the query
    """
    return Q(name_or_query, **params)

  def _A(self, name_or_agg, aggsfilter=None, **params):
    """
    It is a wrapper to ElasticDSL aggregation module, used to create an aggregation
    """
    return A(name_or_agg, aggsfilter, **params)

  ########################################################################
  def __tryToConnect(self):
    """Before we use the database we try to connect and retrieve the cluster name

    :param self: self reference
    """
    try:
      if self.__client.ping():
        # Returns True if the cluster is running, False otherwise
        result = self.__client.info()
        self.clusterName = result.get("cluster_name", " ")  # pylint: disable=no-member
        gLogger.info("Database info", result)
        self._connected = True
      else:
        self._connected = False
        gLogger.error("Cannot ping ElasticsearchDB!")
    except ConnectionError as e:
      gLogger.error(repr(e))
      self._connected = False

  ########################################################################
  def getIndexes(self):
    """
    It returns the available indexes...
    """
    # we only return indexes which belong to a specific prefix
    # for example 'lhcb-production' or 'dirac-production etc.
    return list(self.__client.indices.get_alias("%s*" % self.__indexPrefix))

  ########################################################################
  def getDocTypes(self, indexName):
    """
    :param str indexName is the name of the index...
    :return S_OK(mappings of the first index that has any) or S_ERROR
    """
    result = []
    try:
      gLogger.debug("Getting mappings for ", indexName)
      result = self.__client.indices.get_mapping(indexName)
    except Exception as e:  # pylint: disable=broad-except
      gLogger.error(e)

    doctype = ''
    for indexConfig in result:
      mappings = result[indexConfig].get('mappings')
      if not mappings:
        # there is a case when the mapping exits and the value is None...
        # this is usually an empty index or a corrupted index.
        gLogger.warn("Index does not have mapping %s!" % indexConfig)
        continue
      # we suppose the mapping of all indexes are the same...
      doctype = mappings
      break

    if not doctype:
      return S_ERROR("%s does not exists!" % indexName)

    return S_OK(doctype)

  ########################################################################
  def exists(self, indexName):
    """
    it checks the existence of an index

    :param str indexName: the name of the index
    """
    return self.__client.indices.exists(indexName)

  ########################################################################
  def createIndex(self, indexPrefix, mapping, period=None):
    """
    :param str indexPrefix: it is the index name.
    :param dict mapping: the configuration of the index.
    :param str period: We can specify, which kind of index will be created.
                       Currently only daily and monthly indexes are supported.
    :return: S_OK(full index name) or S_ERROR
    """
    fullIndex = generateFullIndexName(indexPrefix, period)  # we have to create an index each day...
    if self.exists(fullIndex):
      return S_OK(fullIndex)

    try:
      gLogger.info("Create index: ", fullIndex + str(mapping))
      self.__client.indices.create(fullIndex, body={'mappings': mapping})
      return S_OK(fullIndex)
    except Exception as e:  # pylint: disable=broad-except
      gLogger.error("Can not create the index:", e)
      return S_ERROR("Can not create the index")

  def deleteIndex(self, indexName):
    """
    :param str indexName the name of the index to be deleted...
    :return: S_OK(indexName) when the deletion is acknowledged, S_ERROR otherwise
    """
    try:
      retVal = self.__client.indices.delete(indexName)
    except NotFoundError as e:
      return S_ERROR(DErrno.EELNOFOUND, e)
    except ValueError as e:
      return S_ERROR(DErrno.EVALUE, e)

    if retVal.get('acknowledged'):
      # if the value exists and the value is not None
      return S_OK(indexName)

    return S_ERROR(retVal)

  def index(self, indexName, doc_type, body):
    """
    :param str indexName: the name of the index to be used...
    :param str doc_type: the type of the document
    :param dict body: the data which will be indexed
    :return: the index name in case of success.
    """
    try:
      res = self.__client.index(index=indexName,
                                doc_type=doc_type,
                                body=body)
    except TransportError as e:
      return S_ERROR(e)

    if res.get('created') or res.get('result') == 'created':
      # the created index exists but the value can be None.
      return S_OK(indexName)

    return S_ERROR(res)

  def bulk_index(self, indexprefix, doc_type, data, mapping=None, period=None):
    """
    :param str indexPrefix: index name.
    :param str doc_type: the type of the document
    :param list data: contains a list of dictionary
    :param dict mapping: the mapping used by elasticsearch
    :param str period: We can specify which kind of indices will be created.
                       Currently only daily and monthly indexes are supported.
    :return: S_OK(number of inserted documents) or S_ERROR

    Note: each row is used directly as the document source, so its
    'timestamp' entry is rewritten in place (epoch milliseconds).
    """
    gLogger.info("%d records will be insert to %s" % (len(data), doc_type))
    if mapping is None:
      mapping = {}

    indexName = generateFullIndexName(indexprefix, period)
    gLogger.debug("inserting datat to %s index" % indexName)
    if not self.exists(indexName):
      retVal = self.createIndex(indexprefix, mapping, period)
      if not retVal['OK']:
        return retVal

    docs = []
    for row in data:
      body = {
          '_index': indexName,
          '_type': doc_type,
          '_source': row
      }

      if 'timestamp' not in row:
        gLogger.warn("timestamp is not given! Note: the actual time is used!")

      # if the timestamp is not provided, we use the current utc time.
      timestamp = row.get('timestamp', int(Time.toEpoch()))
      try:
        if isinstance(timestamp, datetime):
          body['_source']['timestamp'] = int(timestamp.strftime('%s')) * 1000
        # NOTE(review): basestring exists only on Python 2 - confirm the
        # interpreter this module targets before porting.
        elif isinstance(timestamp, basestring):
          timeobj = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')
          body['_source']['timestamp'] = int(timeobj.strftime('%s')) * 1000
        else:  # we assume the timestamp is an unix epoch time (integer).
          body['_source']['timestamp'] = timestamp * 1000
      except (TypeError, ValueError) as e:
        # in case we are not able to convert the timestamp to epoch time....
        gLogger.error("Wrong timestamp", e)
        body['_source']['timestamp'] = int(Time.toEpoch()) * 1000
      docs.append(body)

    try:
      res = bulk(self.__client, docs, chunk_size=self.__chunk_size)
    except BulkIndexError as e:
      return S_ERROR(e)

    if res[0] == len(docs):
      # we have inserted all documents...
      return S_OK(len(docs))
    return S_ERROR(res)

  def getUniqueValue(self, indexName, key, orderBy=False):
    """
    :param str indexName the name of the index which will be used for the query
    :param str key: the field whose distinct values are collected
    :param dict orderBy it is a dictionary in case we want to order the result {key:'desc'} or {key:'asc'}
    It returns a list of unique value for a certain key from the dictionary.
    Only documents of the last 30 days are considered.
    """
    query = self._Search(indexName)

    endDate = datetime.utcnow()
    startDate = endDate - timedelta(days=30)

    timeFilter = self._Q('range',
                         timestamp={'lte': int(Time.toEpoch(endDate)) * 1000,
                                    'gte': int(Time.toEpoch(startDate)) * 1000, })
    query = query.filter('bool', must=timeFilter)

    # the ordering is the only difference between the two aggregation forms
    termsArgs = {'field': key, 'size': self.RESULT_SIZE}
    if orderBy:
      termsArgs['order'] = orderBy
    query.aggs.bucket(key, 'terms', **termsArgs).metric(key, 'cardinality', field=key)

    try:
      query = query.extra(size=self.RESULT_SIZE)  # do not need the raw data.
      gLogger.debug("Query", query.to_dict())
      result = query.execute()
    except TransportError as e:
      return S_ERROR(e)

    values = [bucket['key'] for bucket in result.aggregations[key].buckets]
    gLogger.debug("Nb of unique rows retrieved", len(values))
    return S_OK(values)

  def pingDB(self):
    """
    Try to connect to the database

    :return: S_OK(TRUE/FALSE)
    """
    connected = False
    try:
      connected = self.__client.ping()
    except ConnectionError as e:
      gLogger.error("Cannot connect to the db", repr(e))
    return S_OK(connected)

  def deleteByQuery(self, indexName, query):
    """
    Delete data by query

    :param str indexName: the name of the index
    :param str query: the query that we want to issue the delete on
    :return: S_OK(message) or S_ERROR(exception)
    """
    try:
      self.__client.delete_by_query(index=indexName, body=query)
    except Exception as inst:
      gLogger.error("ERROR: Couldn't delete data")
      return S_ERROR(inst)
    return S_OK('Successfully deleted data from index %s' % indexName)
class ElasticsearchAPI:
    """
    Each query will have its own index based on query name.
    index_name = query.name
    Doc type = query_name to make it possible to set mapping. Mapping is set per doc_type.
    All rows from a Query should look the same no matter the source.
    This makes all the data from all the servers in the same index.
        Comparable.
        Less indexes.
    """

    def __init__(self, host, port, user, password):
        """Connect to Elasticsearch at ``host``:``port``.

        NOTE(review): ``user`` and ``password`` are accepted but not
        forwarded to the client - confirm whether authentication is needed.
        """
        logger.info("Connecting to ES %s..." % host)
        self.es = Elasticsearch(hosts=[{"host": host, "port": port}])
        logger.debug(self.es.info())

    @staticmethod
    def from_config_manager(config_manager):
        """Build an instance from the "Elasticsearch" section of the config."""
        config = config_manager.get_config("Elasticsearch")
        # Keyword arguments keep the credentials in the right slots; passing
        # them positionally had username and password swapped.
        return ElasticsearchAPI(
            config["host"],
            config["port"],
            user=config["username"],
            password=config["password"],
        )

    def consume_collection(self, calculated_delta):
        """Bulk-index the delta rows of ``calculated_delta``.

        Every row is tagged with the source name under "measure_source" and
        indexed with the doc type "<query_name>_type".

        Returns the number of rows pushed.
        """
        assert type(calculated_delta) is CalculatedData
        query_name = calculated_delta.source.query.query_name
        db_name = calculated_delta.source.source_name
        docs = calculated_delta.delta_rows
        index_name = self.get_index_names(db_name)

        logger.debug("Pushing %s docs to index: %s" % (len(docs), index_name))
        print("Pushing %s docs to index: %s" % (len(docs), index_name))

        actions = []
        for doc in docs:
            d = doc.as_dict()
            d["measure_source"] = db_name
            actions.append({
                "_index": index_name,
                "_type": query_name + "_type",
                "_source": d,
            })
        helpers.bulk(self.es, actions)
        # Make the new documents searchable right away.
        self.es.indices.refresh()
        return len(docs)

    def init_indexes_for(self, sources):
        """Create the index and mapping for every source in ``sources``."""
        for source in sources:
            self.init_index_for_source(source)

    def init_index_for_source(self, source):
        """Create the index for ``source`` and install its query mapping."""
        assert type(source) is Source
        db_name = source.source_name
        index_name = self.get_index_names(db_name=db_name)
        self.create_index(index_name)
        self.set_mapping(index_name, source.query.query_name, source.query.mapping)

    def set_mapping(self, index_name, query_name, source_mapping):
        """Put the mapping for doc type "<query_name>_type" on ``index_name``.

        The mapping always contains "timestamp" and "key_col"; entries of
        ``source_mapping`` are added on top and win on a name clash.
        """
        mapping = {
            "properties": {
                "timestamp": {"type": "date", "format": "date_hour_minute_second"},
                "key_col": {"index": "not_analyzed", "type": "string"},
            }
        }
        mapping["properties"].update(source_mapping.items())

        self.es.indices.put_mapping(index=index_name, doc_type=query_name + "_type", body=mapping)

    def delete_index(self, index_name):
        """Delete ``index_name``; a missing index (404) is ignored."""
        logger.info("Truncating data in index: %s" % index_name)
        self.es.indices.delete(index=index_name, ignore=404)

    def get_index_names(self, db_name):
        """Return the lower-cased "hist-<db_name>" index name, with
        backslashes in ``db_name`` replaced by dashes."""
        hist = "hist-%s" % (db_name.replace("\\", "-"))
        return hist.lower()

    def create_index(self, index_name):
        """Create ``index_name``; a 400 response from the server is ignored."""
        print("Creating index %s" % index_name)
        self.es.indices.create(index_name, ignore=400)
class Ncli:
    """Command-line helper operating on the Elasticsearch indexes named in
    a YAML parameter file."""

    _version = 1.0
    _yaml = 'nets.yaml'

    def __init__(self, yamlfile):
        """Load the parameters from ``yamlfile`` and connect to Elasticsearch.

        The process exits when Elasticsearch cannot be reached.
        """
        with open(yamlfile) as f:
            # safe_load: a parameter file needs no arbitrary object
            # construction, and yaml.load() without an explicit Loader is
            # rejected by current PyYAML releases.
            self.parameters = yaml.safe_load(f)
        logging.basicConfig(format=self.parameters['logging']['format'])
        self.logger = logging.getLogger("NETS")
        self.logger.level = logging.INFO
        try:
            self.es = Elasticsearch(hosts=[
                {'host': self.parameters['elasticsearch']['host'],
                 'port': self.parameters['elasticsearch']['port']}])
            info = self.es.info()
            self.logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))
        except ElasticsearchException:
            self.logger.info("Elasticsearch is not available.")
            # NOTE(review): exits with status 0 although start-up failed -
            # confirm whether callers rely on that before changing it.
            exit(0)

    def indexinfo(self, target):
        """Return ``(name, doctype)`` of the configured index whose type is
        ``target``; returns None when no such index is configured."""
        for item in self.parameters['elasticsearch']['indexes']:
            if item['type'] == target:
                return item['name'], item['doctype']

    # display status check and exit
    def status(self):
        """Log document counts for the raw and enhanced article indexes."""
        idx_client = IndicesClient(self.es)
        for idx in ['raw-article', 'enhanced-article']:
            es_index = self.indexinfo(idx)[0]
            if idx_client.exists(es_index):
                self.logger.info("%s contains %s documents." % (idx, self.es.count(index=es_index)['count']))
                # The processed count applies to the raw articles (the index
                # whose "status" field pipeline/reset work on). The previous
                # comparison against 'article' could never match a value of
                # this loop.
                if idx == 'raw-article':
                    query = {"query": {"term": {"status": 1}}}
                    self.logger.info(
                        "%s articles have been processed." % self.es.count(index=es_index, body=query)['count'])
            else:
                self.logger.info("%s does not exist" % es_index)

    # initialize articles or events index.
    def initialize(self, idx):
        """Drop the index for ``idx`` if it exists and create it again; the
        'event' index additionally gets the event mapping."""
        es_index, es_doctype = self.indexinfo(idx)
        self.logger.info("Initializing %s" % es_index)
        idx_client = IndicesClient(self.es)
        if idx_client.exists(es_index):
            idx_client.delete(es_index)
        idx_client.create(es_index)
        if idx == 'event':
            idx_client.put_mapping(doc_type=es_doctype, index=[es_index], body=event_mapping())
        self.logger.info("%s ready." % es_index)

    # find n articles and run them through the pipeline
    def pipeline(self, n):
        """Send up to ``n`` raw articles with status 0 through the pipeline."""
        self.eventpipeline = Pipeline(self.parameters)
        es_index, es_doctype = self.indexinfo('raw-article')
        self.logger.info("Send %s articles through the pipeline" % n)
        query = '{"query": { "bool": { "must": { "match": { "status" : 0 }}}}}'
        result = self.es.search(index=es_index, doc_type=es_doctype, size=n, body=query)
        articles = result['hits']['hits']
        self.eventpipeline.batch(articles)

    # load articles from json files in a directory
    def load(self):
        """Index every JSON line of every file in the articles directory
        into the raw-article index, dropping any '_id' field first."""
        self.logger.info("Load articles")
        es_index, es_doctype = self.indexinfo('raw-article')
        path = self.parameters['directories']['articles']
        files = [join(path, f) for f in listdir(path) if isfile(join(path, f))]
        for filename in files:
            with open(filename) as data_file:
                rows = [json.loads(row) for row in data_file]
            for article in rows:
                # Let Elasticsearch assign the id.
                if '_id' in article:
                    del article['_id']
                self.es.index(index=es_index, doc_type=es_doctype, body=article)

    def reset(self, n):
        """Set the status of up to ``n`` processed raw articles back to 0."""
        resetpayload = {"doc": {"status": 0}}
        self.logger.info("reset %s raw articles" % n)
        es_index, es_doctype = self.indexinfo('raw-article')
        query = '{"query": { "bool": { "must": { "match": { "status": "1" }}}}}'
        result = self.es.search(index=es_index, doc_type=es_doctype, size=n, body=query)
        articles = result['hits']['hits']
        tic = 0
        for article in articles:
            aid = article["_id"]
            self.es.update(index=es_index, doc_type=es_doctype, id=aid, body=resetpayload)
            # progress marker after every 500 updates
            tic = tic + 1
            if tic == 500:
                print("...", tic)
                tic = 0