def count_records_in_ebola_data():
    """Print the number of documents in the ``ebola_data`` index.

    Connects to the hard-coded Elasticsearch endpoint with HTTP basic auth,
    runs a match-all count query and prints the ``count`` field of the
    response.

    :return: None
    """
    # NOTE(review): endpoint and credentials are embedded in source; consider
    # moving them to configuration.
    url_EShost = "http://52.7.75.159:9020/"
    index = 'ebola_data'
    es = Elasticsearch(
        url_EShost,
        connection_class=RequestsHttpConnection,
        http_auth=('lorelei', 'thorthor'),
    )
    query = {'query': ESQueryBuilders.build_match_all_query()}
    # print() with a single argument behaves the same on Python 2 and 3.
    print(es.count(index=index, body=query)['count'])
def new_es_instance(num_nodes, http_port, timeout=30):
    """Return a client once ``num_nodes`` local nodes answer requests.

    Makes up to ``timeout`` attempts, sleeping one second after each failed
    one. Every attempt builds a client for 127.0.0.1 on the ports
    ``http_port`` .. ``http_port + num_nodes - 1``, waits for the cluster to
    report ``num_nodes`` nodes and issues a count request to check that the
    cluster can actually serve queries.

    :param num_nodes: number of nodes expected to join the cluster
    :param http_port: HTTP port of the first node
    :param timeout: maximum number of attempts
    :return: the ``Elasticsearch`` client of the first successful attempt
    :raises AssertionError: if no attempt succeeded
    """
    logging.info("Waiting for %s nodes to join the cluster", num_nodes)
    for _ in range(0, timeout):
        # TODO(simonw): ask Honza if there is a better way to do this?
        try:
            es = Elasticsearch([{"host": "127.0.0.1", "port": http_port + x}
                                for x in range(0, num_nodes)])
            es.cluster.health(wait_for_nodes=num_nodes)
            es.count()  # can we actually search or do we get a 503? -- anyway retry
            return es
        except (ConnectionError, TransportError):
            pass
        time.sleep(1)
    # Raised explicitly: an ``assert False`` would be stripped under ``-O``
    # and the function would silently return None.
    raise AssertionError("Timed out waiting for %s nodes for %s seconds" % (num_nodes, timeout))
def create_client(http_port, timeout=30):
    """Return a client for the local node once it answers requests.

    Makes up to ``timeout`` attempts, sleeping one second after each failed
    one. Every attempt connects to 127.0.0.1:``http_port``, waits for the
    cluster to report one node and issues a count request to check that the
    node can actually serve queries.

    :param http_port: HTTP port of the node
    :param timeout: maximum number of attempts
    :return: the ``Elasticsearch`` client of the first successful attempt
    :raises AssertionError: if no attempt succeeded
    """
    logging.info("Waiting for node to startup")
    for _ in range(0, timeout):
        # TODO: ask Honza if there is a better way to do this?
        try:
            client = Elasticsearch([{"host": "127.0.0.1", "port": http_port}])
            client.cluster.health(wait_for_nodes=1)
            client.count()  # can we actually search or do we get a 503? -- anyway retry
            return client
        except (ConnectionError, TransportError):
            pass
        time.sleep(1)
    # Raised explicitly: an ``assert False`` would be stripped under ``-O``
    # and the function would silently return None.
    raise AssertionError("Timed out waiting for node for %s seconds" % timeout)
def create_client(http_port=DEFAULT_HTTP_TCP_PORT, timeout=30):
    """Return a client for the node on localhost once it answers requests.

    Makes up to ``timeout`` attempts, sleeping one second after each failed
    one. Every attempt connects to localhost:``http_port``, waits for the
    cluster to report one node and issues a count request to check that the
    node can actually serve queries.

    :param http_port: HTTP port of the node
    :param timeout: maximum number of attempts
    :return: the ``Elasticsearch`` client of the first successful attempt
    :raises AssertionError: if no attempt succeeded
    """
    logging.info('Waiting for node to startup')
    for _ in range(0, timeout):
        # TODO: ask Honza if there is a better way to do this?
        try:
            client = Elasticsearch([{'host': 'localhost', 'port': http_port}])
            client.cluster.health(wait_for_nodes=1)
            client.count()  # can we actually search or do we get a 503? -- anyway retry
            return client
        except (ConnectionError, TransportError):
            pass
        time.sleep(1)
    # Raised explicitly: an ``assert False`` would be stripped under ``-O``
    # and the function would silently return None.
    raise AssertionError('Timed out waiting for node for %s seconds' % timeout)
def main():
    """Print source/target document counts, reindexing in between on request.

    The counts of the ``source`` and ``target`` indices are always printed.
    When the parsed arguments have ``apply`` set, ``source`` is reindexed into
    ``target`` and both counts are printed a second time.
    """
    args = parse_args()
    should_apply = args.apply
    print(should_apply)

    client = Elasticsearch([{'host': host}])
    print_count("Source [before]", client.count(index=source))
    print_count("Target [before]", client.count(index=target))

    if not should_apply:
        # Dry run: report only.
        return

    reindex(client, source, target, chunk_size=5000, scroll='30m')
    print_count("Source [after]", client.count(index=source))
    print_count("Target [after]", client.count(index=target))
def es_count(self, p_host, p_port, p_index, p_query=None):
    """
    === Returns the Number of Documents That Match a Query ===

    The result is the response from elastic search. The value is in the "count" field of the response.

    - ``p_host`` - Elasticsearch server
    - ``p_port`` - Port of the es server
    - ``p_index`` - Name of the index to query
    - ``p_query`` - Query to run

    | ${res} = | Es Count | localhost | 9200 | myIndex | {"query": {"query_string": {"query": "searched value"}}} |

    ``${res}`` contains the number of docs

    Fails with an ``AssertionError`` when the client cannot be created
    (including a non-numeric port) or when the count request fails.
    """
    # Es client
    try:
        param = [{'host': p_host, 'port': int(p_port)}]
        es = Elasticsearch(param)
    except Exception:
        # The message is interpolated here; passing the values as extra
        # AssertionError arguments left the placeholders unformatted. The port
        # uses %s because p_port may be the very value int() just rejected.
        raise AssertionError("Connection error on %s:%s" % (p_host, p_port))
    try:
        result = es.count(index=p_index, body=p_query)
    except Exception:
        raise AssertionError("Count error on %s:%s/%s for query : %s"
                             % (p_host, p_port, p_index, p_query))
    return result['count']
def _client_test_case(self, instance):
    """Check that the instance's log messages reached Elasticsearch.

    Finds the logger service of ``instance`` (the environment service whose
    application name equals ``self.name``), queries today's ``logstash-*``
    index on the host it returned, and asserts that at least two records
    belong to ``instance.id`` and that every returned record has the
    expected message and fields.

    A ``TransportError`` from Elasticsearch is reported as a test failure.
    """
    expected_keys = [
        "@severity",
        "@timestamp",
        "filename",
        "instId",
        "jobId",
        "stepId",
        "stepname",
        "host",
        "@message",
    ]
    try:
        loggers = [
            inst
            for inst in instance.environment.services
            if instance.organization.application(inst.applicationId).name == self.name
        ]
        host = loggers[0].returnValues["logger.logger-server"]
        es = Elasticsearch([{"host": host}])
        index_name = "logstash-" + datetime.utcnow().strftime("%Y.%m.%d")
        # count() returns a response dict; the number itself is under "count".
        # Comparing the dict with 2 is a TypeError on Python 3.
        records_count = es.count(
            index_name, body={"query": {"term": {"instId": instance.id}}}
        )["count"]
        self.assertTrue(records_count >= 2,
                        "Expected at least two messages in index, got %s" % records_count)
        records = es.search(index=index_name, body={"query": {"match_all": {}}})["hits"]["hits"]
        for record in records:
            self.assertEqual(record["_source"]["@message"], "Hello from execrun!")
            for key in expected_keys:
                self.assertIn(
                    key, record["_source"],
                    "Message saved to elasticsearch should contain field %s" % key
                )
    except TransportError as e:
        self.fail("Can not retrieve count of log messages: %s %s" % (e.status_code, e.error))
def es_count(self, p_host, p_port, p_index, p_query=None):
    """
    Returns the number of documents that match a query

    The result is the response from elastic search. The value is in the "count" field of the response.

    {p_host} Elasticsearch server\n
    {p_port} Port of the es server\n
    {p_index} Name of the index to query\n
    {p_query} Query to run\n

    | ${res} = | es count | localhost | 9200 | myIndex | {"query":{"query_string":{"query": "searched value"}}} |

    ${res} contains the number of docs

    Fails with an AssertionError when the client cannot be created
    (including a non-numeric port) or when the count request fails.
    """
    # Es client
    try:
        param = [{'host': p_host, 'port': int(p_port)}]
        es = Elasticsearch(param)
    except Exception:
        # The message is interpolated here; passing the values as extra
        # AssertionError arguments left the placeholders unformatted. The port
        # uses %s because p_port may be the very value int() just rejected.
        raise AssertionError("Connexion error on %s:%s" % (p_host, p_port))
    try:
        result = es.count(index=p_index, body=p_query)
    except Exception:
        raise AssertionError("Count error on %s:%s/%s for query : %s"
                             % (p_host, p_port, p_index, p_query))
    return result['count']
def query_and_dump_reults(args):
    """Export the selected fields of every document matching a query to CSV.

    Reads ``hostname``, ``port``, ``index``, ``query``, ``doc_type``,
    ``target`` and ``fields`` from ``args``. The query defaults to match-all
    and the target file to ``output.csv``. A progress bar sized by the count
    of matching documents is updated after every row.
    """
    es = Elasticsearch([args.hostname + ':' + str(args.port)])

    query = args.query if args.query is not None else '{"query":{"match_all":{}}}'
    doc_type = args.doc_type  # may be None, meaning no type restriction is passed
    target = args.target if args.target is not None else "output.csv"

    nhits = es.count(index=args.index, body=query)['count']
    bar = progressbar.ProgressBar(max_value=nhits)
    documents = helpers.scan(es, index=args.index, query=query, doc_type=doc_type)
    fields = args.fields.split(',')

    with open(target, 'w') as csvfile:
        datawriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_NONNUMERIC)
        # Header row first, then one row per document.
        datawriter.writerow(fields)
        for written, hit in enumerate(documents, 1):
            source = hit['_source']
            datawriter.writerow([get_var(source, field) for field in fields])
            bar.update(written)
    bar.finish()
def count(self, p_index, p_query=None):
    """Gets the number of docs for a query

    p_index:    elasticsearch index where to query
    p_query:    the query to process (defaults to an empty query body)

    return the number of docs from the index p_index and the query p_query

    Exits the process with EXIT_IO_ERROR when the client cannot be created.
    A failing count request is logged and its exception re-raised.
    """
    if p_query is None:
        # A fresh dict per call instead of one shared mutable default.
        p_query = {}

    param = [{'host': self.host, 'port': self.port}]
    try:
        es = Elasticsearch(param)
        logger.info('Connected to ES Server: %s', json.dumps(param))
    except Exception as e:
        logger.error('Connection failed to ES Server : %s', json.dumps(param))
        logger.error(e)
        sys.exit(EXIT_IO_ERROR)

    try:
        result = es.count(index=p_index, body=p_query)
        logger.info('Count the number of items from %s for the query %s', p_index, p_query)
    except Exception as e:
        logger.error('Error querying the index %s with query %s', p_index, p_query)
        logger.error(e)
        # Previously execution fell through to ``return result['count']`` with
        # ``result`` unbound, hiding the real error behind UnboundLocalError.
        raise

    return result['count']
def get(self, indexs, logdir="/root", datadir="/opt"):
    """Dump an index to a file and upload it when the index has grown.

    The last seen document count is kept in ``<logdir>/<indexs>``. If the
    current count is not larger than the stored one, nothing is exported and
    ``False`` is returned. Otherwise the new count is stored, all documents
    are fetched ten at a time and appended (as the ``str()`` of their
    ``_source``) to ``<datadir>/<indexs>``, which is then uploaded through
    ``awsglacier`` and removed.

    :param indexs: name of the index to export
    :param logdir: directory holding the per-index count file
    :param datadir: directory for the temporary dump file
    :return: ``False`` when there is nothing new to export, otherwise ``None``
    """
    es = Elasticsearch(self.host)
    try:
        count = int(es.count(index=indexs)["count"])

        logfile = logdir + "/" + indexs
        if os.path.isfile(logfile):
            with open(logfile, "r") as handle:
                value = int(handle.readline())
            if count <= value:
                return False
        with open(logfile, "w") as handle:
            handle.write(str(count))

        datafile = datadir + "/" + indexs
        # Floor division gives the same page count on Python 2 and 3; pages
        # 0..num are fetched, so the last one may be empty.
        num = count // 10
        with open(datafile, "a") as out:
            for page in range(num + 1):
                rs = es.search(index=indexs, from_=page * 10, size=10)
                for doc in rs["hits"]["hits"]:
                    out.write(str(doc["_source"]) + "\n")

        if os.path.isfile(datafile):
            glacier = awsglacier(self.region, self.access, self.secret)
            glacier.uploadfile(datafile)
            os.remove(datafile)
    except ElasticsearchException:
        print("elasticsearch exceptiont")
def get_ids_with_response_status(status):
    """Return the ``_id`` of every ``gor`` document with the given status.

    Prints the total number of documents in the ``gor`` index and the number
    whose ``Resp_Status`` matches ``status``, then fetches that many
    ``RequestResponse`` hits in one search and returns their ids.

    :param status: response status to match (converted with ``str``)
    :return: list of document ids
    """
    es = Elasticsearch(["http://localhost:9200"])

    total_num_docs = es.count(index="gor", body={"query": {"match_all": {}}})['count']
    # This number is the total number of documents inside the gor index that
    # correspond to various queries.
    print("The total number of docs is: " + str(total_num_docs))

    status_query = {"bool": {"must": {"match": {"Resp_Status": str(status)}}}}
    filtered_num = es.count(index="gor", body={"query": status_query})['count']

    # One request sized to the match count returns every matching id.
    res = es.search(
        index="gor",
        doc_type="RequestResponse",
        body={"query": status_query, "size": int(filtered_num), "fields": ["_id"]},
        request_timeout=300,
    )
    print(str(filtered_num) + " documents in the index have response status of "
          + str(status) + "...")

    return [d['_id'] for d in res['hits']['hits']]
class ElasticClient:
    """Thin wrapper around an ``Elasticsearch`` client bound to one host."""

    def __init__(self, host: str, port: int):
        """Connect to ``host``:``port`` and log the server version and name.

        Terminates the process when Elasticsearch raises on connection.
        """
        try:
            self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
            info = self.es.info()
            logger.info("Connected to Elasticsearch v. %s, name: %s"
                        % (info['version']['number'], info['name']))
        except ElasticsearchException as e:
            # The exception needs a placeholder; passing it as a bare extra
            # argument makes the logging module fail while formatting.
            logger.error("Elasticsearch is not available: %s", e)
            # NOTE(review): exits with status 0 although this is a failure
            # path -- confirm whether callers rely on that.
            exit(0)

    def get_articles(self, index, doctype, batch_size):
        """Return up to ``batch_size`` hits that have no ``status`` field."""
        query = '{"query": { "bool": { "must_not": { "exists": { "field": "status" }}}}}'
        result = self.es.search(index=index, doc_type=doctype, size=batch_size, body=query)
        articles = result.get('hits').get('hits')
        return articles if articles is not None else []

    def count(self, index):
        """Return the number of documents in ``index``."""
        return self.es.count(index=index)['count']

    def info(self):
        """Return the server info reported by Elasticsearch."""
        return self.es.info()

    def check_url(self, url: str, auth_index: str):
        """
        Check if a URL appears in the database.

        Parameters
        ----------

        url: URL for the news stories to be scraped.

        auth_index: es index (also used as the document type)

        Returns
        -------

        found: Boolean. Indicates whether or not a URL was found in the database.
        """
        response = self.es.search(
            index=auth_index,
            doc_type=auth_index,
            body={"query": {"match_phrase": {"url": url}}},
            # No documents are needed, only whether at least one matches.
            size=0,
            terminate_after=1,
            ignore_unavailable=True,
        )
        return response["hits"]["total"] > 0

    def persist(self, index, doctype, payload):
        """Index ``payload`` as a new document."""
        self.es.index(index=index, doc_type=doctype, body=payload)

    def update(self, index, doctype, doc_id, payload):
        """Apply the update ``payload`` to document ``doc_id``."""
        self.es.update(index=index, doc_type=doctype, id=doc_id, body=payload)
def CheckStatus():
    """Print copy progress and set ``finished`` once the counts match.

    Reads the module-level ``url``, ``source``, ``target`` and ``start_time``
    and sets the module-level ``finished`` flag to ``True`` when the target
    index holds as many documents as the source index.
    """
    global start_time
    global finished
    es = Elasticsearch([url])
    # Compare the counts only. The full responses also carry per-request
    # metadata such as shard totals, so comparing the dicts can stay unequal
    # even after every document has been copied.
    source_count = es.count(index=source)['count']
    target_count = es.count(index=target)['count']
    elapsed = int(time.time() - start_time)
    print("copied " + str(target_count) + " of " + str(source_count)
          + " ( elapsed: " + str(elapsed) + " sec. )")
    if source_count == target_count:
        finished = True
def test_from_redis_to_elasticsearch(self):
    '''
    Crunch the mock data, then check that ten users were indexed.
    '''
    GameCruncher().crunch()
    # NOTE(review): presumably gives Elasticsearch time to make the documents
    # countable -- confirm.
    sleep(2)
    response = Elasticsearch(ES_NODES).count(index=RIOT_USERS_INDEX)
    self.assertEqual(10, response['count'])
def main():
    """Print the number of documents newer than a given start time.

    Parses the command line, counts the documents of the chosen index whose
    range field is greater than or equal to the start expression, and prints
    the result as ``metric item_count int <count>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-H', '--host', default='localhost',
                        help='Elasticsearch host')
    parser.add_argument('-P', '--port', default=9200,
                        help='Elasticsearch HTTP port')
    parser.add_argument('-s', '--ssl', default=False, action='store_true',
                        help='Use SSL for connection')
    parser.add_argument('-u', '--username', help='HTTP auth username')
    parser.add_argument('-p', '--password', help='HTTP auth password')
    parser.add_argument('-U', '--url_prefix', default='',
                        help='URL prefix for HTTP requests')
    parser.add_argument('-i', '--index', default='_all',
                        help='Index that should be searched. Default: _all')
    parser.add_argument('-f', '--field', default='@timestamp',
                        help='Field the range should be bound to. Default: @timestamp')
    parser.add_argument('-r', '--range', default='now-1h',
                        help='Start time to search back for entries. Default: now-1h')
    args = parser.parse_args()

    host_config = {
        'host': args.host,
        'port': args.port,
        'url_prefix': args.url_prefix,
        'use_ssl': args.ssl,
    }
    if args.username is not None:
        # Only send credentials when a username was given; formatting
        # unconditionally sent the literal "None:None".
        host_config['http_auth'] = '{}:{}'.format(args.username, args.password)
    es = Elasticsearch([host_config])

    search_filter = {
        'query': {
            'filtered': {
                'filter': {
                    'range': {
                        args.field: {
                            'gte': args.range
                        }
                    }
                }
            }
        }
    }
    count = es.count(index=args.index, body=search_filter)
    print('metric item_count int {}'.format(count['count']))
def docs_density_frequency_statistics(elasticsearch_host, index_name, doc_type, output_file=None):
    """
    We will vary the 'density' (the percentage of exists queries that should match) and print out
    the CUMULATIVE frequency at each density point. Interpret the results with care.

    Because of timeout issues, this function has to be called in lags at times. Change the 'range' line
    in the function to start from where you left off the last time something crashes.

    The entry for density 0 is replaced at the end by the count of a match-all query.

    :param elasticsearch_host: the elasticsearch host
    :param index_name: the name of the elasticsearch index
    :param doc_type: the type in the index
    :param output_file: if given, the result is written there as JSON; otherwise it is pretty-printed
    :return: None
    """
    webpage_properties = MappingAnalyses._get_list_of_all_webpage_properties()
    should = [TableFunctions.build_constant_score_exists_clause(prop)
              for prop in webpage_properties]
    bool_query = BuildCompoundESQueries.BuildCompoundESQueries.build_bool_arbitrary(should=should)

    # key is the 'density' in percent, value is total number of docs retrieved.
    cumul_freq_dict = dict()
    es = Elasticsearch(elasticsearch_host)
    for i in range(0, 101, 5):
        bool_query['bool']['minimum_should_match'] = str(i) + '%'
        count = es.count(index=index_name, doc_type=doc_type,
                         body={'query': bool_query})['count']
        print(str(i) + '\t' + str(count))
        cumul_freq_dict[i] = count

    match_all = {'query': TableFunctions.build_match_all_query()}
    cumul_freq_dict[0] = es.count(index=index_name, doc_type=doc_type, body=match_all)['count']

    if output_file:
        # The context manager closes the file even if dumping fails.
        with codecs.open(output_file, 'w', 'utf-8') as out:
            json.dump(cumul_freq_dict, out)
            out.write('\n')
    else:
        pp = pprint.PrettyPrinter(indent=4)
        pp.pprint(cumul_freq_dict)
def main():
    """Write a 360 x 150 table of event counts to a tab-separated file.

    For every column value (0-149, matched against ``ai``) and row value
    (0-359, matched against ``cr``) the cell holds the number of documents in
    ``events-2015.05.*`` with ``et`` matching ``AD_SHOW`` and ``fr`` not
    matching ``true``.
    """
    client = Elasticsearch(
        [{'host': eslogin.host, 'port': eslogin.port}],
        http_auth=(eslogin.user, eslogin.password),
    )

    columns = range(150)
    index = range(360)
    df = pd.DataFrame(index=index, columns=columns).fillna(0)

    def cell_query(ai, cr):
        """Build the count query for one table cell."""
        return {'query': {'bool': {
            'must': [
                {'match': {'ai': ai}},
                {'match': {'cr': cr}},
                {'match': {'et': 'AD_SHOW'}},
            ],
            'must_not': [{'match': {'fr': 'true'}}],
        }}}

    # One count request per cell, column by column.
    for ai in columns:
        for cr in index:
            response = client.count(index="events-2015.05.*", body=cell_query(ai, cr))
            df.loc[cr, ai] = response['count']

    df.to_csv("../data/ad_show_5_2015.tab", sep='\t')
def count_es(es_url, es_index, min_date, max_date):
    """Count ``creativeworks`` documents created within a date range.

    :param es_url: Elasticsearch URL; falls back to ``settings.ELASTICSEARCH['URL']``
    :param es_index: index name; falls back to ``settings.ELASTICSEARCH['INDEX']``
    :param min_date: inclusive lower bound for ``date_created``
    :param max_date: inclusive upper bound for ``date_created``
    :return: number of matching documents
    """
    url = es_url or settings.ELASTICSEARCH['URL']
    es_client = Elasticsearch(
        url,
        retry_on_timeout=True,
        timeout=settings.ELASTICSEARCH['TIMEOUT'],
    )

    index_name = es_index or settings.ELASTICSEARCH['INDEX']
    created_between = {'gte': min_date.isoformat(), 'lte': max_date.isoformat()}
    response = es_client.count(
        index=index_name,
        doc_type='creativeworks',
        body={'query': {'range': {'date_created': created_between}}},
    )
    return response['count']
def main(argv):
    """Run the benchmark described by the command-line arguments.

    Options (long forms in parentheses):
      -h                   print usage and exit
      -e (--es-server)     Elasticsearch host, default ``localhost``
      -q (--queries)       query file
      -i (--index)         index name, default ``bench``
      -t (--type)          document type, default ``data``
      -b (--benchtype)     benchmark type, default ``search``
      -n (--numthreads)    number of threads, default 1

    The document count of the index is fetched once, then one
    ``BenchmarkThread`` per requested thread is created, started and joined.

    :param argv: argument list without the program name
    """
    es_server = 'localhost'
    query_file = ''
    index = 'bench'
    doc_type = 'data'
    bench_type = 'search'
    num_threads = 1
    help_msg = 'esbench.py -e <es-server> -q <queries> -i <index> -t <doc-type> -b <bench-type> -n <num-threads>'

    try:
        # "n:" and the "=" on es-server were missing from the option spec, so
        # -n was rejected as unknown and --es-server could not take a value.
        opts, args = getopt.getopt(
            argv, 'he:q:i:t:b:n:',
            ['es-server=', 'queries=', 'index=', 'type=', 'benchtype=', 'numthreads='])
    except getopt.GetoptError:
        print(help_msg)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(help_msg)
            sys.exit()
        elif opt in ('-e', '--es-server'):
            es_server = arg
        elif opt in ('-q', '--queries'):
            query_file = arg
        elif opt in ('-i', '--index'):
            index = arg
        elif opt in ('-t', '--type'):
            doc_type = arg
        elif opt in ('-b', '--benchtype'):
            bench_type = arg
        elif opt in ('-n', '--numthreads'):
            num_threads = int(arg)

    es = Elasticsearch(hosts=['http://%s:9200' % es_server], timeout=600)
    count = es.count(index=index)['count']
    del es

    threads = []
    print('[Main Thread] Initializing %d threads...' % num_threads)
    for i in range(0, num_threads):
        queries = load_queries(bench_type=bench_type, query_file=query_file,
                               record_count=count)
        thread = BenchmarkThread(thread_id=i, bench_type=bench_type, es_server=es_server,
                                 index=index, doc_type=doc_type, queries=queries)
        threads.append(thread)

    print('[Main Thread] Starting threads...')
    for thread in threads:
        thread.start()

    print('[Main Thread] Waiting for threads to join...')
    for thread in threads:
        thread.join()
class Index(object):
    """A single Elasticsearch index holding documents of type ``mof``."""

    def __init__(self, name=None):
        self.name = name
        self.doc_type = 'mof'
        self.es = Elasticsearch()

    def init_app(self, app):
        """Take the index name from ``app.config['ES_INDEX']``."""
        self.name = app.config['ES_INDEX']

    def initialize(self):
        """Create the index from ``ES_INDEX_BODY``, dropping an existing one first."""
        existing = self.es.indices.get_aliases()
        if self.name in existing:
            print("deleting old index")
            self.drop()
        self.es.indices.create(index=self.name, body=ES_INDEX_BODY)

    def drop(self):
        """Delete the index."""
        self.es.indices.delete(self.name)

    def add(self, doc_id, data):
        """Index one document under ``doc_id`` and refresh the index."""
        self.es.index(index=self.name, doc_type=self.doc_type, id=doc_id, body=data)
        self.es.indices.refresh(self.name)

    def bulk_add(self, documents):
        """Bulk-index ``(doc_id, data)`` pairs and refresh the index."""

        def actions():
            # Generated lazily, one action per document.
            for doc_id, data in documents:
                yield {
                    '_id': doc_id,
                    '_index': self.name,
                    '_type': self.doc_type,
                    '_source': data,
                }

        helpers.bulk_index(client=self.es, docs=actions(), raise_on_error=True)
        self.es.indices.refresh(self.name)

    def count(self):
        """Return the number of documents in the index."""
        response = self.es.count(index=self.name)
        return response['count']

    def search(self, query):
        """Run ``query`` against the index and return the raw response."""
        return self.es.search(index=self.name, body={'query': query})
class ResultDB(BaseResultDB):
    """Result store that keeps every project as a document type of one index."""

    collection_prefix = ''

    def __init__(self, url, database='resultdb'):
        # ``url`` is accepted but not used here; the client is built with its
        # default settings.
        self.conn = Elasticsearch()
        self.database = database

    def _parse(self, data):
        """Return the stored document of a search hit."""
        return data["_source"]

    def _stringify(self, data):
        """JSON-encode the ``result`` entry of ``data`` in place, if present."""
        if 'result' in data:
            data['result'] = json.dumps(data['result'])
        return data

    def save(self, project, taskid, url, result):
        """Index the result of ``taskid`` and return the index response."""
        document = {
            'taskid': taskid,
            'url': url,
            'result': result,
            'updatetime': time.time(),
        }
        return self.conn.index(index=self.database, doc_type=project,
                               id=taskid, body=document)

    def select(self, project, fields=None, offset=0, limit=0):
        """Return up to ``limit`` stored results (10 when ``limit`` is 0)."""
        page_size = 10 if limit == 0 else limit
        response = self.conn.search(index=self.database, doc_type=project,
                                    fields=fields, _source=True,
                                    from_=offset, size=page_size)
        return [self._parse(hit) for hit in response["hits"]["hits"]]

    def count(self, project):
        """Return the number of results stored for ``project``."""
        response = self.conn.count(index=self.database, doc_type=project)
        return response['count']

    def get(self, project, taskid, fields=None):
        """Return the stored document of ``taskid``; ``fields`` is not used."""
        return self.conn.get_source(index=self.database, doc_type=project, id=taskid)
class ElasticsearchBackend(BaseBackend):
    """Backend that stores every schema in an index (and type) of its own name."""

    def __init__(self, conn):
        if Elasticsearch is None:
            raise ImportError("Plz. install elasticsearch library for ElasticsearchBackend.")
        # Client options come from the ``elasticsearch`` entry of conn.options.
        self._es = Elasticsearch(**conn.options.get('elasticsearch', {}))
        super().__init__(conn)

    def _gen_es_id_for_data(self, schema_name, _data):
        """Build the document id from the primary-key values of ``_data``."""
        key_names = self._conn.schema.get_primary_key(schema_name)
        return '_n_'.join([str(_data[key]) for key in key_names])

    def _gen_es_id_for_id(self, _id):
        """Build the document id from an id given as a string, a tuple/list or another value.

        Composite ids are joined exactly like ``_gen_es_id_for_data`` joins
        key values, so non-string parts are converted with ``str`` first.
        """
        if isinstance(_id, str):
            return _id
        if isinstance(_id, (tuple, list)):
            # str() keeps this consistent with _gen_es_id_for_data; a plain
            # join raised TypeError for e.g. integer key parts.
            return '_n_'.join(str(part) for part in _id)
        return _id

    def put_item(self, schema_name, _data, overwrite=False):
        """Store ``_data``; without ``overwrite`` only creation is allowed.

        :return: True when the response reports a version greater than 0
        """
        op_type = 'create' if not overwrite else 'index'
        result = self._es.index(index=schema_name, doc_type=schema_name,
                                id=self._gen_es_id_for_data(schema_name, _data),
                                body=_data, op_type=op_type)
        return result.get('_version', 0) > 0

    def get_item(self, schema_name, _id):
        """Return the stored document for ``_id``.

        :raises ItemNotFound: when Elasticsearch reports the document missing
        """
        try:
            result = self._es.get(index=schema_name, doc_type=schema_name,
                                  id=self._gen_es_id_for_id(_id))
        except NotFoundError:
            raise ItemNotFound("Item not found for id {} in {}.".format(_id, schema_name))
        return result['_source']

    def delete_item(self, schema_name, _id):
        """Delete the document for ``_id``; return whether it was found."""
        result = self._es.delete(index=schema_name, doc_type=schema_name,
                                 id=self._gen_es_id_for_id(_id))
        return result['found'] is True

    def query(self, schema_name, _w, limit=10):
        """Same as :meth:`scan`."""
        return self.scan(schema_name, _w, limit)

    def scan(self, schema_name, _w, limit=10):
        """Return up to ``limit`` documents matching the condition ``_w``."""
        query = elastic_parse_wt(_w, {})
        query["size"] = limit
        result = self._es.search(index=schema_name, doc_type=schema_name, body=query)
        return [hit['_source'] for hit in result["hits"]["hits"]]

    def query_count(self, schema_name, _w):
        """Return the number of documents matching the condition ``_w``."""
        query = elastic_parse_wt(_w, {})
        result = self._es.count(index=schema_name, doc_type=schema_name, body=query)
        return result.get('count', 0)
def indexing_status_page(request, id):
    """Report how much of a crawl has been indexed, as HTML or JSON.

    The percentage is the document count of ``index-<crawl id>`` relative to
    the crawl's successful crawls, at least 1 once it can be computed and 0
    when anything goes wrong while computing it. ``?type=JSON`` selects the
    JSON response; otherwise ``indexing_status.html`` is rendered.
    """
    es = Elasticsearch()
    crawl_info = CrawlInfo.objects.get(id=id)

    try:
        index_name = "index-%d" % crawl_info.id
        es.indices.refresh(index=index_name)
        indexed = es.count(index_name, crawl_info.type).get('count')
        percentage = max(1, int(indexed * 100 / crawl_info.successful_crawls))
    except Exception:
        # Best effort: any failure is shown as no progress.
        percentage = 0

    wants_json = request.GET.get('type', 'HTML') == 'JSON'
    if not wants_json:
        return render(request, 'indexing_status.html', {'percent': percentage})

    result = json.dumps({'status': 'OK', 'percent': percentage},
                        ensure_ascii=False, encoding='utf8')
    return HttpResponse(result, content_type='application/json; charset=utf-8')
def export_edges(index, file, qs='*'):
    """Write one JSON line per sender-to-recipient edge of the matching emails.

    Selects ``emails`` documents that match the query string ``qs`` and have
    a ``senders`` field, and emits ``{"from": first sender, "to": recipient}``
    for every address in ``tos``, ``ccs`` and ``bccs``.

    :param index: index to read from
    :param file: path of the output file (overwritten)
    :param qs: query-string expression, default ``*``
    """
    es = Elasticsearch()
    body = {
        "query": {
            "bool": {
                "must": [
                    {"query_string": {"query": qs}},
                    {
                        "filtered": {
                            "query": {"bool": {"must": [{"match_all": {}}]}},
                            "filter": {
                                "bool": {
                                    "must": [{"exists": {"field": "senders"}}],
                                    "should": [
                                        {"exists": {"field": "tos"}},
                                        {"exists": {"field": "ccs"}},
                                        {"exists": {"field": "bccs"}},
                                    ],
                                }
                            },
                        }
                    },
                ]
            }
        },
        "sort": {},
    }

    def rcvrs(fields):
        """All recipients of one hit: to, cc and bcc, in that order."""
        return fields.get("tos", []) + fields.get("ccs", []) + fields.get("bccs", [])

    count = es.count(index=index, doc_type="emails", body=body)["count"]
    # TODO add batch processing
    addrs = es.search(index=index, doc_type="emails", size=count, from_=0,
                      fields=["senders", "tos", "ccs", "bccs"], body=body)

    # A flat comprehension handles an empty result; reduce() without an
    # initial value raised TypeError when there were no hits.
    edges = [
        {"from": hit["fields"]["senders"][0], "to": rcvr}
        for hit in addrs["hits"]["hits"]
        for rcvr in rcvrs(hit["fields"])
    ]

    with open(file, "w") as text_file:
        for edge in edges:
            text_file.write(json.dumps(edge) + "\n")
class DatastoreConnection:
    """Single entry point to the Elasticsearch client and the data managers."""

    def __init__(self):
        self._es = Elasticsearch()
        # Both managers receive this connection object.
        self._patients = PatientManager(self)
        self._vocabularies = VocabularyManager(self)

    # -- indexing ---------------------------------------------------------

    def index_patients(self, filename):
        """Index the patients found in ``filename``."""
        return self._patients.index(filename)

    def index_hpo(self, filename):
        """Index ``filename`` into the ``hpo`` vocabulary using ``OBOParser``."""
        return self._vocabularies.index(filename=filename, index='hpo', Parser=OBOParser)

    def index_genes(self, filename):
        """Index ``filename`` into the ``genes`` vocabulary using ``GeneParser``."""
        return self._vocabularies.index(filename=filename, index='genes', Parser=GeneParser)

    # -- lookups ----------------------------------------------------------

    def get_vocabulary_term(self, id, index='_all'):
        """Look up the vocabulary term ``id``, by default across all indices."""
        return self._vocabularies.get_term(id, index=index)

    def find_similar_patients(self, patient, n=5):
        """Return the n most similar patients to the given query api.Patient"""
        return self._patients.find_similar_patients(n=n, patient=patient)

    # -- pass-through to the Elasticsearch client -------------------------

    def search(self, *args, **kwargs):
        """Forward to the client's ``search``."""
        return self._es.search(*args, **kwargs)

    def bulk(self, *args, **kwargs):
        """Forward to the client's ``bulk``."""
        return self._es.bulk(*args, **kwargs)

    def index(self, *args, **kwargs):
        """Forward to the client's ``index``."""
        return self._es.index(*args, **kwargs)

    def count(self, *args, **kwargs):
        """Forward to the client's ``count``."""
        return self._es.count(*args, **kwargs)

    @property
    def indices(self):
        """The client's ``indices`` attribute."""
        return self._es.indices
def from_elasticsearch(host, index, query, port=9200, pagination=100):
    """ Create Bag from Elasticsearch Query

    The matching documents are split into partitions of ``pagination`` hits;
    each partition is one task that runs the query with its own
    ``from``/``size`` window.

    :param host: Elasticsearch host name
    :param index: index to query
    :param query: query clause (the value of the ``query`` key)
    :param port: HTTP port, default 9200
    :param pagination: number of hits per partition, default 100

    >>> b = from_elasticsearch(host='hostname', index='reddit',
    ...                        query={"match": {'body':'Python'}})
    """
    es = Elasticsearch([{'host': host, 'port': port}])
    count = es.count(index=index, body={'query': query})['count']
    # float() forces true division so a trailing partial page is kept even
    # where "/" floors two integers (Python 2 without the division import).
    npartitions = int(ceil(count / float(pagination)))
    name = 'elasticsearch' + next(tokens)

    dsk = dict()
    for i in range(npartitions):
        kwargs = {'index': index,
                  'body': {'query': query,
                           'from': pagination * i,
                           'size': pagination}}
        dsk[(name, i)] = (get_results, es, kwargs)

    return Bag(dsk, name, npartitions)
def main():
    """Fill a 360 x 150 table with event counts and save it tab-separated.

    Each cell is the number of ``events-2015.05.*`` documents matching the
    column value on ``ai``, the row value on ``cr`` and ``AD_SHOW`` on
    ``et``, excluding those matching ``true`` on ``fr``.
    """
    connection = [{'host': eslogin.host, 'port': eslogin.port}]
    credentials = (eslogin.user, eslogin.password)
    es = Elasticsearch(connection, http_auth=credentials)

    columns = range(150)
    index = range(360)
    df = pd.DataFrame(index=index, columns=columns)
    df = df.fillna(0)

    # The exclusion clause is the same for every cell.
    excluded = [{'match': {'fr': 'true'}}]

    for column in columns:
        for row in index:
            required = [
                {'match': {'ai': column}},
                {'match': {'cr': row}},
                {'match': {'et': 'AD_SHOW'}},
            ]
            body = {'query': {'bool': {'must': required, 'must_not': excluded}}}
            df.loc[row, column] = es.count(index="events-2015.05.*", body=body)['count']

    df.to_csv("../data/ad_show_5_2015.tab", sep='\t')
def main():
    """Count, show or delete log entries in every index matching a pattern.

    The optional ``program``, ``fleet`` and ``message`` options are combined
    into one ``bool``/``must`` query. For each matching index the document
    count is printed; then either a search is run and its hits printed
    (``opts.query > 0``) or the matching documents are deleted and the index
    force-merged (``opts.delete``).
    """
    (opts, args) = parse_opts()
    es = Elasticsearch(
        [{'host': opts.es_host, 'port': opts.es_port}],
        timeout=1200,
        retry_on_timeout=True,
    )
    print('Cluster: {}'.format(es.info().get('cluster_name')))
    indices = es.indices.get(index=opts.index_pattern).keys()

    # Exact-match filters first, then the optional phrase match.
    term_filters = [('program', opts.program), ('fleet', opts.fleet)]
    queries = [{'term': {field: value}} for field, value in term_filters if value]
    if opts.message:
        queries.append({'match_phrase':{'message': opts.message}})

    body = {'query': {'bool': {'must': queries}}} if len(queries) > 0 else None

    for index in indices:
        count = es.count(index=index, body=body).get('count')
        print('{:22} count: {:6}'.format(index, count))

        if opts.query > 0:
            found = es.search(index=index, body=body)
            print_logs(found['hits']['hits'])
            continue

        if opts.delete:
            rval = es.delete_by_query(index=index, body=body)
            rval2 = es.indices.forcemerge(
                index=index,
                params={'only_expunge_deletes':'true'}
            )
            print('{:22} Deleted: {:10} Failed: {}'.format(index, rval['deleted'], rval2['_shards']['failed']))
class ElasticSearchClient(object):
    """Small convenience wrapper around one Elasticsearch connection."""

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.connect()

    def connect(self):
        """Create the client for ``self.host``:``self.port``."""
        node = {'host': self.host, 'port': self.port}
        self.es = Elasticsearch(hosts=[node])

    def count(self, index):
        """
        Count the documents of an index.

        :param index: index name
        :return: the count response for ``index``
        """
        return self.es.count(index=index)

    def delete(self, index, doc_type, id):
        """
        Delete one specific document from an index.

        :param index: index name
        :param doc_type: document type
        :param id: document id
        :return: None
        """
        self.es.delete(index=index, doc_type=doc_type, id=id)

    def get(self, index, id):
        """Return the get response for document ``id`` of ``index``."""
        return self.es.get(index=index, id=id)

    def search(self, index, doc_type, constraint, size=20):
        """Return the ``_source`` of up to ``size`` documents matching ``constraint``.

        Any error is printed and ``None`` is returned.
        """
        try:
            match_query = {"query": {"match": constraint}}
            response = self.es.search(index=index, doc_type=doc_type,
                                      body=match_query, size=size)
            return [hit["_source"] for hit in response['hits']['hits']]
        except Exception as err:
            print(err)
class Queries(object):
    """Checks an ingested log against the indices created for a filter type."""

    def __init__(self, log_data, filter_type):
        # NOTE(review): fixed 70 second wait before querying -- presumably to
        # let ingestion finish; confirm.
        time.sleep(70)
        self.es_client = Elasticsearch()
        self.__all_documents_count(log_data, filter_type)

    def __all_documents_count(self, log_data, filter_type):
        """Report whether ``<filter_type>-*`` holds as many documents as ``log_data``."""
        indexed = self.es_client.count(index=filter_type + '-*')
        same_size = indexed['count'] == len(log_data)

        if not same_size:
            Status.show(
                'The {} index data count {} differs from the ingested log, please check the filters'
                .format(filter_type, indexed['count']), False)
            return

        Status.show(
            'The created Elasticserach index {} has the same amount of items {} as the {}.log'
            .format(filter_type, indexed['count'], filter_type), True)

    def __document_time(self):
        """Placeholder; not implemented."""
        pass
def testdata(self, eshosts, indexname, query_body, count_query_body, highligth):
    """Run a log query and return everything needed to display the results.

    :param eshosts: hosts passed to the Elasticsearch client
    :param indexname: index to query
    :param query_body: search body; its response must contain a ``groupDate`` aggregation
    :param count_query_body: body of the count request
    :param highligth: passed through to ``format_logdata``
    :return: dict with ``logkey``, ``logtotal`` (the count response),
        ``status``, ``logdata`` and ``echart``; on a ``RequestError`` only
        ``status`` (False) and ``msg``
    """
    es = Elasticsearch(eshosts)
    logtotal = es.count(indexname, body=count_query_body)
    mapping = es.indices.get_mapping(index=indexname)

    try:
        response = es.search(indexname, body=query_body)
    except RequestError:
        # The message says that a non-existent field was used as a query condition.
        return {'status': False, 'msg': '有不存在的field被当作条件查询'}

    echart = self.generate_echart_data(response['aggregations']['groupDate']['buckets'])
    logdata = self.format_logdata(response['hits']['hits'], highligth)
    logkey = self.get_key(mapping)

    return {
        'logkey': logkey,
        'logtotal': logtotal,
        'status': True,
        'logdata': logdata,
        'echart': echart
    }
def run(self):
    """Write the ``path`` of documents whose path starts with the monitoring key.

    Pages through the ``disthene`` index in chunks of ``self.chunk_size`` and
    writes one path per line to the task output. Paging stops once the offset
    exceeds 5000.
    """
    output = self.output().open('w')
    elastic = Elasticsearch(hosts=self.hosts)
    query = '{"query": {"prefix": {"path": "%s"}}}' % self.monitoring_key
    count = elastic.count(index="disthene", body=query)['count']

    limit = 5000
    offset = 0
    pages = int(count / self.chunk_size) + 1
    for _ in range(pages):
        result = elastic.search(index="disthene", body=query, size=self.chunk_size,
                                from_=offset, stored_fields="path")
        for hit in result["hits"]["hits"]:
            output.write("{}\n".format(hit["_source"]["path"]))
        offset += self.chunk_size
        if offset > limit:
            break

    output.close()
def export():
    """Dump every ``script`` document of ``script_data`` into a bulk-style text file.

    For each document an action line naming the target index and a running
    ``_id`` is written, followed by a line with the ``str()`` of the
    document's ``_source``.
    """
    es = Elasticsearch(["ubuntu3:9200"])
    index_name = "script_data"
    type_name = "script"
    target_index_name = "script_data"
    # file_name = "D:\search_text.json"
    file_name = "/home/hadoop/search_text.json"

    count = es.count(index=index_name, doc_type=type_name)['count']
    body = {"size": count}
    data = es.search(index=index_name, doc_type=type_name, body=body)['hits']['hits']

    # Iterate over every hit: the previous range(len(data) - 1) silently
    # dropped the last document.
    lines = []
    for i, hit in enumerate(data):
        lines.append("{\"index\":{\"_index\":\"" + target_index_name
                     + "\",\"_id\":" + str(i) + "}}\n")
        # NOTE(review): str() of a dict is a Python repr, not JSON -- confirm
        # what the consumer of this file expects.
        lines.append(str(hit['_source']))
        lines.append("\n")

    # The context manager guarantees the data is flushed and the file closed.
    with codecs.open(file_name, 'w', encoding="utf-8") as out:
        out.write("".join(lines))
class Search(object):
    """Query helper bound to the module-level ``hosts``."""

    def __init__(self):
        self.es = Elasticsearch(hosts, http_compress=True)

    def multi_get(self):
        """Return all ``message`` hits of ``log-2018.03.21``, fetched ``LIMIT`` at a time.

        The first page also supplies the total number of hits, which decides
        how many further pages are requested.
        """
        import json  # local import: only this method needs it

        index = ["log-2018.03.21"]

        def page_request(offset):
            """Build the two-line msearch body (header, query) for one page."""
            header = json.dumps({"index": index})
            query = json.dumps({"query": {"match_all": {}},
                                "from": offset, "size": LIMIT})
            return header + "\n" + query + "\n"

        res = self.es.msearch(page_request(0), doc_type='message')
        first_page = res['responses'][0]['hits']
        total = first_page['total']
        hits = first_page['hits']

        # Earlier defects fixed here: the request used an undefined ``es``,
        # the body contained the literal text LIMIT instead of its value, and
        # ``hits`` was appended to itself instead of gaining the new page.
        for i in range(total // LIMIT):
            res = self.es.msearch(page_request(LIMIT * (1 + i)), doc_type='message')
            hits.extend(res['responses'][0]['hits']['hits'])
        return hits

    def _count(self, index=None, item=None, value=None):
        """Return the number of documents whose field ``item`` has the term ``value``."""
        body = {
            "query": {
                "term": {
                    item: value,
                }
            }
        }
        res = self.es.count(index=index, body=body)
        return res['count']
def get(self, pid, record, **kwargs):
    """Handle GET request.

    Responds with the number of ``events.pageviews`` documents whose
    ``id_bibrec`` matches ``pid.pid_value`` (0 when the count response is
    empty or has no ``count``).
    """
    es = Elasticsearch(CFG_ELASTICSEARCH_SEARCH_HOST)
    conditions = [
        {"match": {"id_bibrec": pid.pid_value}},
        {"match": {"_type": "events.pageviews"}},
    ]
    results = es.count(index=ES_INDEX, body={"query": {"bool": {"must": conditions}}})
    page_views = results.get('count', 0) if results else 0
    return make_response(jsonify(page_views), 200)
def es_get_all_ips(str_existing_index):
    """Returns list of list_of_ips stored in given Elasticsearch index

    The IPs are the bucket keys of a terms aggregation on the ``ip`` field,
    sized by the document count of the index. The number found is printed and
    ``ask_continue()`` is called before returning.
    """
    es = Elasticsearch(([{'host': get_es_cluster_ip()}]))
    count = es.count(index=str_existing_index)['count']
    res = es.search(index=str_existing_index, body={
        "size": 0,
        "aggs": {
            "all_ip": {
                "terms": {
                    "field": "ip",
                    # At least 1: an empty index would otherwise request a
                    # bucket size of 0. NOTE(review): newer Elasticsearch
                    # versions are understood to reject that -- confirm for
                    # the deployed version.
                    "size": max(count, 1)
                }
            }
        }
    })
    list_ips = [bucket['key'] for bucket in res['aggregations']['all_ip']['buckets']]
    print('Found ' + str(len(list_ips)) + ' IPs in Elasticsearch index ' + str_existing_index)
    ask_continue()
    return list_ips
def init(self):
    """Init the connection to the ES server.

    Returns None when the export is disabled, otherwise the client. Exits
    with status 2 when the client cannot be created. The index is created
    when counting its documents fails.
    """
    if not self.export_enable:
        return None

    try:
        es = Elasticsearch(hosts=['{0}:{1}'.format(self.host, self.port)])
    except Exception as e:
        logger.critical("Cannot connect to ElasticSearch server %s:%s (%s)" % (self.host, self.port, e))
        sys.exit(2)
    logger.info("Connected to the ElasticSearch server %s:%s" % (self.host, self.port))

    try:
        index_count = es.count(index=self.index)['count']
    except Exception:
        # Index did not exist, it will be created at the first write
        # Create it...
        es.indices.create(self.index)
        return es

    logger.info("There is already %s entries in the ElasticSearch %s index" % (index_count, self.index))
    return es
def init(self):
    """Init the connection to the ES server.

    Returns None when the export is disabled, otherwise the client. Exits
    with status 2 when the client cannot be created. The index is created
    when counting its documents fails.
    """
    if not self.export_enable:
        return None

    try:
        client = Elasticsearch(hosts=['{}:{}'.format(self.host, self.port)])
    except Exception as e:
        logger.critical("Cannot connect to ElasticSearch server %s:%s (%s)" % (self.host, self.port, e))
        sys.exit(2)
    logger.info("Connected to the ElasticSearch server %s:%s" % (self.host, self.port))

    index_count = None
    try:
        index_count = client.count(index=self.index)['count']
    except Exception:
        # Index did not exist, it will be created at the first write
        # Create it...
        client.indices.create(self.index)
    else:
        logger.info("There is already %s entries in the ElasticSearch %s index" % (index_count, self.index))

    return client
def get_count(ids, ip):
    """Return the number of ``blogposts`` documents for the given blog sites.

    ``ids`` is a comma-separated string of blog-site ids; spaces in it
    are ignored. ``ip`` is the Elasticsearch host to connect to.
    """
    client = Elasticsearch([{'host': ip}])

    site_ids = ids.replace(' ', '').split(',')
    terms_clause = {"terms": {"blogsite_id": site_ids, "boost": 1}}
    body = {"query": {"bool": {"must": [terms_clause]}}}

    response = client.count(index="blogposts", body=body)
    return int(response['count'])
class ElkTask():
    """Runs threshold checks against the logs held in Elasticsearch."""

    def __init__(self):
        """Create the client and issue a cluster-health request (10 s timeout)."""
        elk_addr = "http://123.56.9.150:9200/"
        self.es = Elasticsearch(elk_addr)
        self.es.cluster.health(request_timeout=10)

    def do_user_agent_reg(self, threshold, reg_string):
        """Tell whether enough "RTR" log entries have a matching user agent.

        Counts the documents whose ``logs.http_user_agent.raw`` matches
        the regular expression ``reg_string`` and whose
        ``logs.source_type.raw`` equals ``"RTR"``.

        :param threshold: minimum count for the check to succeed
        :param reg_string: regular expression applied to the user agent
        :return: True when the count is >= ``threshold``; False when it is
            lower or when the request fails (the error is printed)
        """
        query = {
            "query": {
                "filtered": {
                    "query": {
                        "regexp": {
                            "logs.http_user_agent.raw": "%s" % (reg_string)
                        }
                    },
                    "filter": {
                        "term": {"logs.source_type.raw": "RTR"}
                    }
                }
            }
        }
        try:
            res = self.es.count(body=query)
            return res["count"] >= threshold
        except Exception as e:
            # "as" form: valid on Python 2.6+ and Python 3, unlike
            # "except Exception, e" which only parses on Python 2.
            print(e)
            return False
def handle_with_partitioning(self, **options):
    """Run the partitioned duplicate-``_id`` check and log a summary.

    Reads ``es_hostname``, ``es_timeout``, ``partition_size``, ``index``
    and ``parallelism`` from ``options``. The index's document count
    divided by the partition size (rounded up) gives the number of
    partitions; ``self.count_duplication_by_partitions`` is mapped over
    the partition numbers on a thread pool. A truthy ``stop_after``
    option replaces the number of partitions actually run.

    Afterwards ``self._duplicated_doc_ids`` is summarised (count, max,
    p95, p75 of its values) in a warning, or an info message is logged
    when it is empty.
    # NOTE(review): presumably the mapped workers fill
    # self._duplicated_doc_ids -- confirm in count_duplication_by_partitions.
    """
    self._es_client_config = {
        "hosts": options["es_hostname"],
        "timeout": options["es_timeout"],
    }
    self._partition_size = options["partition_size"]
    client = Elasticsearch(**self._es_client_config)
    self._index = options["index"]
    parallelism = options["parallelism"]

    doc_count = client.count(index=options["index"])["count"]
    _log.info(f"Found {doc_count:,} docs in index {self._index}")

    self._num_partitions = ceil(doc_count / self._partition_size)
    _log.info(
        f"Running a total of {self._num_partitions:,} agg queries, "
        f"each returning up to {self._partition_size:,} buckets "
        f"that capture the degree of duplication of a duplicated _id. "
        f"Queries will be distributed among {parallelism} parallel threads."
    )

    with ThreadPool(parallelism) as pool:
        partitions_to_run = options.get("stop_after") or self._num_partitions
        pool.map(self.count_duplication_by_partitions,
                 range(0, partitions_to_run))

    if not self._duplicated_doc_ids:
        _log.info("No duplicate documents with the same _id field found.")
        return

    duped_id_count = len(self._duplicated_doc_ids)
    ordered = sorted(self._duplicated_doc_ids.values())

    def percentile(pct):
        # Nearest-rank percentile over the ascending duplication counts.
        return ordered[int(ceil((duped_id_count * pct) / 100)) - 1]

    max_dupe = ordered[-1]
    p75 = percentile(75)
    p95 = percentile(95)
    _log.warning(
        f"Found {duped_id_count:,} _ids with more than one doc in the index. "
        f"Max duplication (p100) = {max_dupe}; p95 = {p95}; p75 = {p75}"
    )
def es_log_count_search(idx, key, value, interval="1h"):
    '''
    Print how many recent documents in an index match a field value.

    :param idx: index name
    :param key: field to query
    :param value: value to match
    :param interval: length of the look-back period, appended to "now-"
    :return: None (the count is printed)
    '''
    es = Elasticsearch(['ops-es.00joy.com'], scheme='https', port=443)

    match_clause = {"match": {key: value}}
    # Only documents whose @timestamp is newer than "now-<interval>".
    recent_only = {"range": {"@timestamp": {"gt": "now-" + interval}}}

    body = {
        "query": {
            "bool": {
                "must": [match_clause],
                "filter": [recent_only],
            }
        }
    }

    ret = es.count(index=idx, body=body)
    print(ret['count'])
def connectDB(esIndex, nodes, rootLogger):
    '''
    Function to connect to Elasticsearch DB. Uses default parameters.

    The connection is verified by counting the documents of ``esIndex``;
    the count is logged on success. On failure an error is logged and the
    process exits.

    Args:
        esIndex (str): Elasticsearch index of concern
        nodes (list): list of string values of node information;
            e.g. ['127.0.0.1:9200', '127.0.0.2:9200']
        rootLogger (obj): reference of rootLogger object

    Returns:
        obj: elasticsearch object reference

    Raises:
        SystemExit: when the count request raises ConnectionError or
            ElasticsearchException.
    '''
    es = Elasticsearch(list(nodes))

    try:
        # Get no. of documents
        numDocs = es.count(index=esIndex,
                           body={"query": {"match_all": {}}})['count']
    except ConnectionError:
        rootLogger.error('Error talking to ES DB. Check if DB is started up.')
        # NOTE(review): POSIX exit statuses are 8-bit, so 500 is presumably
        # seen by the parent as 244 -- confirm before relying on the value.
        sys.exit(500)
    except ElasticsearchException:
        # The exception class is passed as a lazy %-argument; without the
        # placeholder logging cannot format the record and the message
        # is lost.
        rootLogger.error("Unexpected error: %s", sys.exc_info()[0])
        sys.exit(500)
    else:
        rootLogger.info(
            f'Connection successful. Number of documents found: {numDocs}')
        return es
def get_length():
    """Print the count response for selected ``image_cells`` documents.

    A document is selected when ``isDebris`` matches "true" or when its
    ``annotation`` term is not "null". The client connects on port 9211.
    """
    client = Elasticsearch(port=9211)

    is_debris = {"match": {"isDebris": "true"}}
    has_annotation = {
        "bool": {"must_not": [{"term": {"annotation": "null"}}]}
    }
    body = {"query": {"bool": {"should": [is_debris, has_annotation]}}}

    print(client.count(index="image_cells", body=body))
def elastic_processor(user, mapping, is_save):
    """Process every record of an Elasticsearch source in parallel chunks.

    The mapping is first passed through ``pre_processing``. Its
    ``source.path`` is split on "/" into host and port, and
    ``source.iterator`` names the index. The index's documents are divided
    into one chunk per CPU and each chunk is handed to
    ``elastic_processing`` in its own worker process.

    :param user: dict with at least a ``username`` key
    :param mapping: mapping definition with a ``source`` section
    :param is_save: flag forwarded unchanged to every worker
    :return: list with one ``elastic_processing`` result per chunk; an
        empty list when the index holds no documents
    :raises Exception: "Invalid source path" when the source cannot be
        parsed or counted (the original error is chained as the cause)
    """
    username = user['username']
    mapping = pre_processing(username=username, mapping=mapping)
    path = mapping['source']['path'].split("/")

    try:
        host, port = path[0], path[1]
        es = Elasticsearch([{'host': host, 'port': port}])
        index_name = mapping['source']['iterator']
        count = es.count(index=index_name)['count']
    except Exception as err:
        raise Exception("Invalid source path") from err

    # An empty index would give a chunk size of 0, which range() rejects
    # as a step and Pool() rejects as a process count.
    if count == 0:
        return []

    number_record_in_file = math.ceil(count / multiprocessing.cpu_count())
    tasks = [
        (mapping, start, number_record_in_file, user, is_save,
         host, port, index_name)
        for start in range(0, count, number_record_in_file)
    ]

    pool = Pool(len(tasks))
    try:
        logging.warning(tasks)
        return pool.map(elastic_processing, tasks)
    finally:
        # Release the workers even when a task raised.
        pool.close()
        pool.join()
class ElasticSearchClass(object):
    """Small convenience wrapper around an authenticated Elasticsearch client."""

    def __init__(self, host, port, user, passwrod):
        """Store the connection settings and connect immediately."""
        self.host, self.port = host, port
        self.user, self.password = user, passwrod
        self.connect()

    def connect(self):
        """(Re)create ``self.es`` from the stored host, port and credentials."""
        node = {'host': self.host, 'port': self.port}
        credentials = (self.user, self.password)
        self.es = Elasticsearch(hosts=[node], http_auth=credentials)

    def count(self, indexname):
        """
        :param indexname: name of the index
        :return: the count response for the whole index
        """
        response = self.es.count(index=indexname)
        return response

    def delete(self, indexname, doc_type, id):
        """
        Delete one specific document from an index.

        :param indexname: name of the index
        :param doc_type: type of the document
        :param id: id of the document
        :return: None
        """
        self.es.delete(index=indexname, doc_type=doc_type, id=id)

    def get(self, indexname, id):
        """Return the document with the given id from the index."""
        document = self.es.get(index=indexname, id=id)
        return document

    def search(self, indexname, size=10):
        """Return up to ``size`` hits sorted by ``@timestamp`` descending.

        Any error is printed and None is returned instead.
        """
        result = None
        try:
            result = self.es.search(index=indexname, size=size,
                                    sort="@timestamp:desc")
        except Exception as err:
            print(err)
        return result
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000  # Max events to return when streaming results

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{'host': host, 'port': port}])
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_labels_query(sketch_id, labels):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            labels: List of label names.

        Returns:
            Elasticsearch query as a dictionary.
        """
        label_query = {'bool': {'should': [], 'minimum_should_match': 1}}

        for label in labels:
            nested_query = {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name': label
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
            label_query['bool']['should'].append(nested_query)
        return label_query

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
        return query_dict

    def build_query(self, sketch_id, query_string, query_filter,
                    query_dsl=None, aggregations=None):
        """Build Elasticsearch DSL query.

        A user supplied ``query_dsl`` or an ``events`` filter short-circuits
        the build: the result is returned without sort, pagination or
        aggregations being added.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if query_dsl:
            query_dsl = json.loads(query_dsl)
            # Remove any aggregation coming from user supplied Query DSL.
            # We have no way to display this data in a good way today.
            if query_dsl.get('aggregations', None):
                del query_dsl['aggregations']
            return query_dsl

        if query_filter.get('events', None):
            events = query_filter['events']
            return self._build_events_query(events)

        query_dsl = {
            'query': {
                'bool': {
                    'must': [],
                    'must_not': [],
                    'filter': []
                }
            }
        }

        # TODO: Remove when old UI has been deprecated.
        if query_filter.get('star', None):
            label_query = self._build_labels_query(sketch_id, ['__ts_star'])
            query_string = '*'
            query_dsl['query']['bool']['must'].append(label_query)

        # TODO: Remove when old UI has been deprecated.
        if query_filter.get('time_start', None):
            query_dsl['query']['bool']['filter'] = [{
                'bool': {
                    'should': [{
                        'range': {
                            'datetime': {
                                'gte': query_filter['time_start'],
                                'lte': query_filter['time_end']
                            }
                        }
                    }]
                }
            }]

        if query_string:
            query_dsl['query']['bool']['must'].append(
                {'query_string': {'query': query_string}})

        # New UI filters
        if query_filter.get('chips', None):
            labels = []
            must_filters = query_dsl['query']['bool']['must']
            must_not_filters = query_dsl['query']['bool']['must_not']
            datetime_ranges = {
                'bool': {
                    'should': [],
                    'minimum_should_match': 1
                }
            }

            for chip in query_filter['chips']:
                if chip['type'] == 'label':
                    labels.append(chip['value'])

                elif chip['type'] == 'term':
                    term_filter = {
                        'match_phrase': {
                            '{}'.format(chip['field']): {
                                'query': "{}".format(chip['value'])
                            }
                        }
                    }
                    if chip['operator'] == 'must':
                        must_filters.append(term_filter)
                    elif chip['operator'] == 'must_not':
                        must_not_filters.append(term_filter)

                elif chip['type'] == 'datetime_range':
                    # The chip value is "<start>,<end>".
                    start = chip['value'].split(',')[0]
                    end = chip['value'].split(',')[1]
                    range_filter = {
                        'range': {
                            'datetime': {
                                'gte': start,
                                'lte': end
                            }
                        }
                    }
                    datetime_ranges['bool']['should'].append(range_filter)

            label_filter = self._build_labels_query(sketch_id, labels)
            must_filters.append(label_filter)
            must_filters.append(datetime_ranges)

        # Pagination
        if query_filter.get('from', None):
            query_dsl['from'] = query_filter['from']

        # Number of events to return
        if query_filter.get('size', None):
            query_dsl['size'] = query_filter['size']

        # Make sure we are sorting.
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {'datetime': query_filter.get('order', 'asc')}

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl['post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations

        return query_dsl

    def search(self, sketch_id, query_string, query_filter, query_dsl,
               indices, count=False, aggregations=None, return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch.

        This will take a query string from the UI together with a filter
        definition. Based on this it will execute the search request on
        ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format, or the number of
            matching documents when ``count`` is set.
        """
        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents matches the query.
        if count:
            # build_query() returns early (without a sort) for user supplied
            # DSL and for event-id queries, so the key may be absent.
            query_dsl.pop('sort', None)
            count_result = self.client.count(body=query_dsl,
                                             index=list(indices))
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(body=query_dsl,
                                      index=list(indices),
                                      search_type=search_type,
                                      scroll=scroll_timeout)

        # The argument " _source_include" changed to "_source_includes" in
        # ES version 7. This check add support for both version 6 and 7 clients.
        # pylint: disable=unexpected-keyword-arg
        if self.version.startswith('6'):
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_include=return_fields,
                                                scroll=scroll_timeout)
        else:
            _search_result = self.client.search(body=query_dsl,
                                                index=list(indices),
                                                search_type=search_type,
                                                _source_includes=return_fields,
                                                scroll=scroll_timeout)

        return _search_result

    def search_stream(self, sketch_id=None, query_string=None,
                      query_filter=None, query_dsl=None, indices=None,
                      return_fields=None, enable_scroll=True):
        """Search ElasticSearch.

        This will take a query string from the UI together with a filter
        definition. Based on this it will execute the search request on
        ElasticSearch and get result back.

        Note that ``size`` and ``terminate_after`` are written into the
        passed ``query_filter`` when they are missing.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return
            enable_scroll: Boolean determining whether scrolling is enabled.

        Returns:
            Generator of event documents in JSON format
        """
        # The signature allows query_filter to be omitted; treat that as
        # "no filters" instead of failing on None.get().
        if query_filter is None:
            query_filter = {}

        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(sketch_id=sketch_id,
                             query_string=query_string,
                             query_dsl=query_dsl,
                             query_filter=query_filter,
                             indices=indices,
                             return_fields=return_fields,
                             enable_scroll=enable_scroll)

        if enable_scroll:
            scroll_id = result['_scroll_id']
            scroll_size = result['hits']['total']
        else:
            scroll_id = None
            scroll_size = 0

        # Elasticsearch version 7.x returns total hits as a dictionary.
        # TODO: Refactor when version 6.x has been deprecated.
        if isinstance(scroll_size, dict):
            scroll_size = scroll_size.get('value', 0)

        for event in result['hits']['hits']:
            yield event

        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            # An empty page ends the loop.
            scroll_size = len(result['hits']['hits'])
            for event in result['hits']['hits']:
                yield event

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            if self.version.startswith('6'):
                event = self.client.get(index=searchindex_id,
                                        id=event_id,
                                        doc_type='_all',
                                        _source_exclude=['timesketch_label'])
            else:
                event = self.client.get(index=searchindex_id,
                                        id=event_id,
                                        doc_type='_all',
                                        _source_excludes=['timesketch_label'])
            return event
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents; 0 when no indices are given or the
            request fails with NotFoundError/RequestError.
        """
        if not indices:
            return 0
        try:
            result = self.client.count(index=indices)
        except (NotFoundError, RequestError):
            es_logger.error('Unable to count indexes (index not found)',
                            exc_info=True)
            return 0
        return result.get('count', 0)

    def set_label(self, searchindex_id, event_id, event_type, sketch_id,
                  user_id, label, toggle=False, single_update=True):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
            single_update: Boolean if the label should be indexed immediately.
                The default is True; when False nothing is written and the
                update script is returned instead.

        Returns:
            Dict with the update script (source, lang, params) when
            single_update is False, or None if this is a single update.
        """
        # Elasticsearch painless script.
        update_body = {
            'script': {
                'lang': 'painless',
                'source': ADD_LABEL_SCRIPT,
                'params': {
                    'timesketch_label': {
                        'name': str(label),
                        'user_id': user_id,
                        'sketch_id': sketch_id
                    }
                }
            }
        }

        if toggle:
            update_body['script']['source'] = TOGGLE_LABEL_SCRIPT

        if not single_update:
            script = update_body['script']
            return dict(source=script['source'],
                        lang=script['lang'],
                        params=script['params'])

        doc = self.client.get(index=searchindex_id,
                              id=event_id,
                              doc_type='_all')
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            # The document has no label list yet: add an empty one before
            # the script update runs.
            doc = {'doc': {'timesketch_label': []}}
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body=doc)

        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body=update_body)

        return None

    def create_index(self, index_name=None, doc_type='generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. When omitted a new UUID is
                generated for every call.
            doc_type: Name of the document type. Default id generic_event.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        # The UUID must be generated per call: a default of uuid4().hex in
        # the signature is evaluated once, when the class is defined, and
        # would hand the same name to every caller.
        if index_name is None:
            index_name = uuid4().hex

        _document_mapping = {
            'properties': {
                'timesketch_label': {
                    'type': 'nested'
                },
                'datetime': {
                    'type': 'date'
                }
            }
        }

        # TODO: Remove when we deprecate Elasticsearch version 6.x
        if self.version.startswith('6'):
            _document_mapping = {doc_type: _document_mapping}

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError('Unable to connect to Timesketch backend.')
            except RequestError:
                index_exists = self.client.indices.exists(index_name)
                es_logger.warning(
                    'Attempting to create an index that already exists '
                    '({0:s} - {1:s})'.format(index_name, str(index_exists)))

        # We want to return unicode here to keep SQLalchemy happy.
        if six.PY2:
            if not isinstance(index_name, six.text_type):
                index_name = codecs.decode(index_name, 'utf-8')

            if not isinstance(doc_type, six.text_type):
                doc_type = codecs.decode(doc_type, 'utf-8')

        return index_name, doc_type

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

        Args:
            index_name: Name of the index to delete.
        """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend: {}'.format(e))

    def import_event(self, index_name, event_type, event=None, event_id=None,
                     flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Add event to Elasticsearch.

        Events are queued and sent in bulk every ``flush_interval`` events.
        Calling without an event sends whatever is still queued.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
            event_id: Event Elasticsearch ID

        Returns:
            Number of events queued so far.
        """
        if event:
            # Iterate over a snapshot: decoding a key can add a new entry,
            # which is not allowed while iterating the dict itself.
            for k, v in list(event.items()):
                if not isinstance(k, six.text_type):
                    k = codecs.decode(k, 'utf8')

                # Make sure we have decoded strings in the event dict.
                if isinstance(v, six.binary_type):
                    v = codecs.decode(v, 'utf8')

                event[k] = v

            # Header needed by Elasticsearch when bulk inserting.
            header = {
                'index': {
                    '_index': index_name,
                }
            }
            update_header = {'update': {'_index': index_name, '_id': event_id}}

            # TODO: Remove when we deprecate Elasticsearch version 6.x
            if self.version.startswith('6'):
                header['index']['_type'] = event_type
                update_header['update']['_type'] = event_type

            if event_id:
                # Event has "lang" defined if there is a script used for import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                header = update_header

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1

            if self.import_counter['events'] % int(flush_interval) == 0:
                try:
                    self.client.bulk(body=self.import_events)
                except (ConnectionTimeout, socket.timeout):
                    # TODO: Add a retry here.
                    es_logger.error('Unable to add events', exc_info=True)
                self.import_events = []
        else:
            # Import the remaining events in the queue.
            if self.import_events:
                try:
                    self.client.bulk(body=self.import_events)
                except (ConnectionTimeout, socket.timeout):
                    # TODO: Add a retry here.
                    es_logger.error('Unable to add events', exc_info=True)
                else:
                    # Sent successfully: drop them so a later flush does
                    # not index the same events again.
                    self.import_events = []

        return self.import_counter['events']

    def flush_queued_events(self):
        """Send all queued events in one bulk request and empty the queue."""
        if self.import_events:
            self.client.bulk(body=self.import_events)
            # Only reached when the bulk call did not raise.
            self.import_events = []

    @property
    def version(self):
        """Get Elasticsearch version.

        Each access asks the server via ``client.info()``.

        Returns:
            Version number as a string.
        """
        version_info = self.client.info().get('version')
        return version_info.get('number')
class ESStorage(LogStorage):
    """Command-log storage backed by Elasticsearch."""

    def __init__(self, config):
        """Create the client from a config mapping.

        Keys read: ``HOSTS`` (passed as ``hosts``), ``OTHER`` (extra client
        keyword arguments), ``INDEX`` (default 'jumpserver') and
        ``DOC_TYPE`` (default 'command_store').
        """
        hosts = config.get("HOSTS")
        # "or {}" also covers an explicit None, which ** cannot expand.
        kwargs = config.get("OTHER") or {}
        self.index = config.get("INDEX") or 'jumpserver'
        self.doc_type = config.get("DOC_TYPE") or 'command_store'
        self.es = Elasticsearch(hosts=hosts, **kwargs)

    @staticmethod
    def make_data(command):
        """Build the document stored for one command record.

        Copies the user, asset, system_user, input, output, session and
        timestamp fields and adds ``date``, the timestamp as a UTC-aware
        datetime.
        """
        data = dict(
            user=command["user"],
            asset=command["asset"],
            system_user=command["system_user"],
            input=command["input"],
            output=command["output"],
            session=command["session"],
            timestamp=command["timestamp"],
        )
        data["date"] = datetime.fromtimestamp(command['timestamp'],
                                              tz=pytz.UTC)
        return data

    def bulk_save(self, command_set, raise_on_error=True):
        """Index all commands of ``command_set`` with one bulk helper call."""
        actions = [
            dict(
                _index=self.index,
                _type=self.doc_type,
                _source=self.make_data(command),
            )
            for command in command_set
        ]
        return bulk(self.es, actions, index=self.index,
                    raise_on_error=raise_on_error)

    def save(self, command):
        """Save a single command to the store."""
        data = self.make_data(command)
        return self.es.index(index=self.index, doc_type=self.doc_type,
                             body=data)

    @staticmethod
    def get_query_body(match=None, exact=None, date_from=None, date_to=None):
        """Build the search body for a time window plus field criteria.

        ``date_to`` defaults to now and ``date_from`` to seven days before
        ``date_to``. ``match`` entries become ``match`` clauses, ``exact``
        entries become ``term`` filters. Results are sorted by timestamp,
        newest first.
        """
        if date_to is None:
            date_to = datetime.now()
        if date_from is None:
            date_from = date_to - timedelta(days=7)

        time_from = date_from.timestamp()
        time_to = date_to.timestamp()

        body = {
            "query": {
                "bool": {
                    "must": [],
                    "filter": [{
                        "range": {
                            "timestamp": {
                                "gte": time_from,
                                "lte": time_to,
                            }
                        }
                    }]
                }
            },
            "sort": {
                "timestamp": {
                    "order": "desc"
                }
            }
        }
        if match:
            for k, v in match.items():
                body["query"]["bool"]["must"].append({"match": {k: v}})
        if exact:
            for k, v in exact.items():
                body["query"]["bool"]["filter"].append({"term": {k: v}})
        return body

    @staticmethod
    def _split_criteria(user, asset, system_user, input, session):
        """Sort the given criteria into (match, exact) dictionaries.

        user/asset/system_user are exact terms; session/input are matched.
        Falsy values are left out.
        """
        exact = {}
        match = {}
        if user:
            exact["user"] = user
        if asset:
            exact["asset"] = asset
        if system_user:
            exact["system_user"] = system_user
        if session:
            match["session"] = session
        if input:
            match["input"] = input
        return match, exact

    def filter(self, date_from=None, date_to=None, user=None, asset=None,
               system_user=None, input=None, session=None):
        """Search the command index and return the ``hits`` section."""
        match, exact = self._split_criteria(user, asset, system_user,
                                            input, session)
        body = self.get_query_body(match, exact, date_from, date_to)
        data = self.es.search(index=self.index, doc_type=self.doc_type,
                              body=body)
        return data["hits"]

    def count(self, date_from=None, date_to=None, user=None, asset=None,
              system_user=None, input=None, session=None):
        """Return how many stored commands match the criteria."""
        match, exact = self._split_criteria(user, asset, system_user,
                                            input, session)
        body = self.get_query_body(match, exact, date_from, date_to)
        # The count request is sent without the sort section.
        del body["sort"]
        # Scoped to the same index/doc type as filter(); without them the
        # count would span every index on the cluster.
        data = self.es.count(index=self.index, doc_type=self.doc_type,
                             body=body)
        return data["count"]

    def __getattr__(self, item):
        """Delegate unknown attributes to the Elasticsearch client."""
        # If "es" itself is missing (e.g. __init__ has not run), looking
        # it up here would re-enter __getattr__ forever.
        if item == "es":
            raise AttributeError(item)
        return getattr(self.es, item)

    def all(self):
        """Return all data (not supported)."""
        raise NotImplementedError("Not support")

    def ping(self):
        """Return the client's ping result, or False if pinging raised."""
        try:
            return self.es.ping()
        except Exception:
            return False
class ElasticSearchUtility:
    """
    class to communicate with ElasticSearch
    """

    def __init__(self):
        self.es = Elasticsearch(hosts=[ES_HOST], timeout=750)

    def index_exists(self, index_name):
        """
        Tell whether the given index exists
        :param index_name: name of the index
        :return: result of the indices.exists call
        """
        return self.es.indices.exists(index_name)

    def create_index(self, index_name, body):
        """
        Created a new index. If it already exists, deletes that first.
        :param index_name: index to create
        :param body: index creation body
        """
        if self.es.indices.exists(index_name):
            print("deleting '%s' index..." % index_name)
            res = self.es.indices.delete(index=index_name)
            print(" response: '%s'" % res)

        print("creating '%s' index..." % index_name)
        res = self.es.indices.create(index=index_name, body=body)
        print(" response: '%s'" % res)

    def get_doc_count(self, index_name, doc_type):
        """
        Get total number of documents in a given index
        :param index_name: name of the index
        :param doc_type: type of the document
        :return: total number of documents
        """
        return self.es.count(index_name, doc_type)["count"]

    def store_index(self, index, doc_type, source_list, init_id):
        """
        Store all data in source list as a unique document in given ElasticSearch index-type
        :param index: name of the index
        :param doc_type: type of the document
        :param source_list: list of document source to insert into given index-type
        :param init_id: initial id for the document
        """
        # Ids are consecutive, starting at init_id.
        bulk_actions = [
            ElasticSearchUtility.__index_data_body(
                index, doc_type, init_id + offset, source["_source"])
            for offset, source in enumerate(source_list)
        ]

        # Single-argument print() gives the same output on Python 2 and 3.
        print('inserting -  %s' % len(bulk_actions))
        helpers.bulk(self.es, bulk_actions)

    def get_all_terms(self, index, doc_type, doc_id, field):
        """
        Get all terms for given field of given index-doc_type-doc_id
        :param index: name of the index
        :param doc_type: type of the document
        :param doc_id: id of the document
        :param field: field to get term vectors of
        :return: all terms for given document
        """
        term_vector = self.es.termvectors(index, doc_type, id=doc_id,
                                          field_statistics=False,
                                          fields=[field], offsets=False,
                                          positions=False)
        # Per-field data sits under "term_vectors" in the response, the
        # same path get_sparse_tf_features() reads.
        all_terms = term_vector["term_vectors"][field]["terms"].keys()
        return all_terms

    def _scroll_pages(self, first_page):
        """
        Yield the hit list of a scroll search page by page
        :param first_page: response of the initial search(scroll=...) call
        :return: generator of hit lists; the last one yielded is empty
        """
        page = first_page
        remaining = page['hits']['total']
        while remaining > 0:
            hits = page['hits']['hits']
            yield hits
            # An empty page means the scroll is exhausted.
            remaining = len(hits)
            page = self.es.scroll(scroll_id=page['_scroll_id'], scroll='10m')

    def get_all_ids(self, index_name, doc_type, query_body):
        """
        Returns all ids of given index for given query
        :param index_name: Name of the index
        :param doc_type: Type of the document
        :param query_body: search query
        :return: List of ids of entire index
        """
        print('getting all ids...')

        first_page = self.es.search(
            index=index_name,
            doc_type=doc_type,
            scroll='10m',
            size=10000,
            fields=['_id'],
            body=query_body)

        id_list = []
        size = 0
        for hits_list in self._scroll_pages(first_page):
            id_list.extend(hit['_id'] for hit in hits_list)
            size += len(hits_list)
            print("scrolled -  %s" % size)

        return id_list

    def single_feature_matrix(self, index, doc_type, field, feature):
        """
        Fetch all documents containing given feature along with its tf as a score
        from ElasticSearch in format {id: tf}
        :param index: name of the index
        :param doc_type: type of the document
        :param field: the field to extract features from
        :param feature: the feature to extract
        :return: the dictionary of the format {id: tf}
        """
        # NOTE(review): the term query is fixed to "body_shingles" while
        # `field` is only passed to the script -- confirm this is intended.
        query_body = {
            "query": {
                "function_score": {
                    "query": {
                        "term": {
                            "body_shingles": {
                                "value": feature
                            }
                        }
                    },
                    "functions": [
                        {
                            "script_score": {
                                "script": {
                                    "file": "getFeatureValue",
                                    "params": {
                                        "term": feature,
                                        "field": field
                                    }
                                }
                            }
                        }
                    ],
                    "boost_mode": "replace"
                }
            }
        }

        first_page = self.es.search(
            index=index,
            doc_type=doc_type,
            scroll='10m',
            size=10000,
            body=query_body,
            fields=["stream_id"])

        out_dict = dict()
        for hits_list in self._scroll_pages(first_page):
            for hit in hits_list:
                out_dict[hit["_id"]] = hit["_score"]

        return out_dict

    @staticmethod
    def __index_data_body(index, doc_type, doc_id, source):
        """
        Create index data body for insertion based on given parameters
        :param index: name of the index
        :param doc_type: type of the document
        :param doc_id: unique id for index source
        :param source: data source
        :return: index data to insert
        """
        index_data = {
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            "_source": source
        }
        return index_data

    @staticmethod
    def get_match_query(field, value):
        """
        creates match query body for given field and value
        :param field: document field
        :param value: value for the field
        :return: the query body
        """
        query_body = {
            "query": {
                "match": {
                    field: value
                }
            }
        }
        return query_body

    def get_field_values(self, index, doc_type, field):
        """
        Get dictionary of id:field_value for given index-type and field
        :param index: name of the index
        :param doc_type: type of the document
        :param field: field to get value of
        :return: id:field_value dictionary
        """
        query_body = {
            "query": {
                "match_all": {}
            }
        }

        first_page = self.es.search(
            index=index,
            doc_type=doc_type,
            scroll='10m',
            size=10000,
            body=query_body,
            fields=[field])

        out_dict = dict()
        for hits_list in self._scroll_pages(first_page):
            for hit in hits_list:
                # Only the first value of the field is kept.
                out_dict[hit["_id"]] = hit["fields"][field][0]

        return out_dict

    def get_all_grams(self, index, doc_type, field, unigrams=False):
        """
        Get all unique grams from entire index for given field if unigrams is False,
        otherwise get only unigrams
        :param index: name of the index
        :param doc_type: type of the document
        :param field: name of the index field
        :param unigrams: use the "getUnigrams" script instead of "getGrams"
        :return: the set of all grams
        """
        print('Getting all grams...')

        script_file = "getUnigrams" if unigrams else "getGrams"

        query_body = {
            "script_fields": {
                "grams": {
                    "script": {
                        "file": script_file,
                        "params": {
                            "field": field
                        }
                    }
                }
            }
        }

        first_page = self.es.search(
            index=index,
            doc_type=doc_type,
            scroll='10m',
            size=1000,
            body=query_body)

        grams = set()
        size = 0
        for hits_list in self._scroll_pages(first_page):
            for hit in hits_list:
                try:
                    field_value = hit["fields"]["grams"]
                except KeyError:
                    # Hits without script output are skipped.
                    continue
                grams.update(value.encode('UTF8') for value in field_value)
            size += len(hits_list)
            print(size)

        return grams

    def get_sparse_tf_features(self, index, doc_type, field, doc_id, terms_to_include=None):
        """
        Get an dictionary of format {term1:tf1, term2:tf2, ...} for all terms in given field
        of given index's doc_id where tf id greater than 0
        Only single-word terms are returned.
        :param index: name of the index
        :param doc_type: type of the document
        :param field: field to get terms of
        :param doc_id: index document id
        :param terms_to_include: optional container; when given, only terms
            found in it are returned
        :return: dictionary of the format {term1:tf1, term2:tf2, ...} for all terms having tf > 0
        """
        # POST trec_spam/documents/1/_termvector?field_statistics=false
        #                                       &positions=false&offsets=false
        #                                       &fields=body_shingles
        response = self.es.termvectors(index, doc_type, doc_id,
                                       fields=[field],
                                       field_statistics=False,
                                       positions=False, offsets=False)

        try:
            terms = response["term_vectors"][field]["terms"]
        except KeyError:
            return dict()

        out_dict = dict()
        for term in terms:
            tf = terms[term]["term_freq"]
            is_single_word = len(term.split(' ')) == 1
            if tf <= 0 or not is_single_word:
                continue
            try:
                decoded_term = str(term)
                if terms_to_include is None or decoded_term in terms_to_include:
                    out_dict[decoded_term] = tf
            except Exception:
                # Best effort: terms that cannot be converted or looked up
                # are left out rather than failing the whole document.
                pass

        return out_dict

    def get_field_values_for_docs(self, index, doc_type, field, docs_list):
        """
        Get value of given field for all given docs in given order
        :param index: name of the index
        :param doc_type: type of the document
        :param field: field to retrieve value of
        :param docs_list: list of documents for which values are to be retrieved
        :return: list of values of given field for their corresponding docs
        """
        values_list = []
        for doc in docs_list:
            response = self.es.get(index, doc, doc_type=doc_type, fields=[field])
            values_list.append(str(response["fields"][field][0]))
        return values_list
class ElasticsearchDataStore(datastore.DataStore):
    """Implements the datastore on top of an Elasticsearch client."""

    def __init__(self, host=u'127.0.0.1', port=9200):
        """Create a Elasticsearch client.

        Args:
            host: Hostname or IP address of the Elasticsearch node.
            port: HTTP port of the Elasticsearch node.
        """
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{u'host': host, u'port': port}])
        # Number of events queued so far through import_event().
        self.import_counter = Counter()
        # Pending bulk payload: alternating action headers and event bodies.
        self.import_events = []

    @staticmethod
    def _build_label_query(sketch_id, label_name):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            label_name: Name of the label to search for.

        Returns:
            Elasticsearch query as a dictionary.
        """
        query_dict = {
            u'query': {
                u'filtered': {
                    u'filter': {
                        u'nested': {
                            u'filter': {
                                u'bool': {
                                    u'must': [{
                                        u'term': {
                                            u'timesketch_label.name':
                                                label_name
                                        }
                                    }, {
                                        u'term': {
                                            u'timesketch_label.sketch_id':
                                                sketch_id
                                        }
                                    }]
                                }
                            },
                            u'path': u'timesketch_label'
                        }
                    }
                }
            }
        }
        return query_dict

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of dictionaries, each carrying an u'event_id' key
                holding an Elasticsearch document ID.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event[u'event_id'] for event in events]
        query_dict = {u'query': {u'ids': {u'values': events_list}}}
        return query_dict

    @staticmethod
    def _build_field_aggregator(field_name):
        """Build Elasticsearch query for aggregation based on field.

        Args:
            field_name: Field to aggregate.

        Returns:
            Elasticsearch aggregation as a dictionary.
        """
        field_aggregation = {
            u'field_aggregation': {
                u'terms': {
                    u'field': field_name,
                    u'size': 0
                }
            }
        }
        return field_aggregation

    def build_query(self, sketch_id, query_string, query_filter, query_dsl,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: User supplied Elasticsearch DSL query. When truthy it
                is parsed with json.loads(), so it is expected to be a JSON
                string.
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if not query_dsl:
            if query_filter.get(u'star', None):
                query_dsl = self._build_label_query(sketch_id, u'__ts_star')

            # An explicit event list takes precedence over the star filter.
            if query_filter.get(u'events', None):
                events = query_filter[u'events']
                query_dsl = self._build_events_query(events)

            if not query_dsl:
                query_dsl = {
                    u'query': {
                        u'filtered': {
                            u'query': {
                                u'query_string': {
                                    u'query': query_string
                                }
                            }
                        }
                    }
                }

            # NOTE(review): this assignment replaces the nested label filter
            # built for u'star', and raises KeyError for the ids query built
            # for u'events' (which has no u'filtered' key). Confirm callers
            # never combine those filters with u'time_start'.
            if query_filter.get(u'time_start', None):
                query_dsl[u'query'][u'filtered'][u'filter'] = {
                    u'range': {
                        u'datetime': {
                            u'gte': query_filter[u'time_start'],
                            u'lte': query_filter[u'time_end']
                        }
                    }
                }

            if query_filter.get(u'exclude', None):
                query_dsl[u'filter'] = {
                    u'not': {
                        u'terms': {
                            u'data_type': query_filter[u'exclude']
                        }
                    }
                }
        else:
            query_dsl = json.loads(query_dsl)

        # Make sure we are sorting.
        if not query_dsl.get(u'sort', None):
            query_dsl[u'sort'] = {
                u'datetime': query_filter.get(u'order', u'asc')
            }

        # Remove any aggregation coming from user supplied Query DSL. We have
        # no way to display this data in a good way today.
        # TODO: Revisit this and figure out if we can display the data.
        if query_dsl.get(u'aggregations', None):
            del query_dsl[u'aggregations']

        # Add any pre defined aggregations
        data_type_aggregation = self._build_field_aggregator(u'data_type')
        if aggregations:
            # Non-dict aggregations are ignored: nothing is attached.
            if isinstance(aggregations, dict):
                if query_filter.get(u'exclude', None):
                    aggregations = {
                        u'exclude': {
                            u'filter': {
                                u'not': {
                                    u'terms': {
                                        u'field_aggregation':
                                            query_filter[u'exclude']
                                    }
                                }
                            },
                            u'aggregations': aggregations
                        },
                        u'data_type':
                            data_type_aggregation[u'field_aggregation']
                    }
                query_dsl[u'aggregations'] = aggregations
        else:
            query_dsl[u'aggregations'] = data_type_aggregation
        return query_dsl

    def search(self, sketch_id, query_string, query_filter, query_dsl,
               indices, aggregations=None, return_results=True):
        """Search ElasticSearch.

        This will take a query string from the UI together with a filter
        definition. Based on this it will execute the search request on
        ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: User supplied Elasticsearch DSL query
            indices: List of indices to query
            aggregations: Dict of Elasticsearch aggregations
            return_results: Boolean indicating if results should be returned

        Returns:
            The response of the Elasticsearch search call, or an empty result
            stub when there are no indices to query.
        """
        # Limit the number of returned documents.
        DEFAULT_LIMIT = 500  # Maximum events to return
        LIMIT_RESULTS = query_filter.get(u'limit', DEFAULT_LIMIT)

        # Exit early if we have no indices to query
        if not indices:
            return {u'hits': {u'hits': [], u'total': 0}, u'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get(u'events', None):
            indices = {event[u'index'] for event in query_filter[u'events']}

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = u'query_then_fetch'
        if not return_results:
            search_type = u'count'

        # Suppress the lint error because elasticsearch-py adds parameters
        # to the function with a decorator and this makes pylint sad.
        # pylint: disable=unexpected-keyword-arg
        return self.client.search(
            body=query_dsl,
            index=list(indices),
            size=LIMIT_RESULTS,
            search_type=search_type,
            _source_include=[
                u'datetime', u'timestamp', u'message', u'timestamp_desc',
                u'timesketch_label', u'tag'
            ])

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document as returned by the client, without the
            timesketch_label field. Aborts the request with
            HTTP_STATUS_CODE_NOT_FOUND when the document does not exist.
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.get(index=searchindex_id, id=event_id,
                                   _source_exclude=[u'timesketch_label'])
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents; 0 when no indices are given.
        """
        if not indices:
            return 0
        result = self.client.count(index=indices)
        return result.get(u'count', 0)

    def set_label(self, searchindex_id, event_id, event_type, sketch_id,
                  user_id, label, toggle=False):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
                (add/remove). The default is False.
        """
        doc = self.client.get(index=searchindex_id, id=event_id)
        try:
            doc[u'_source'][u'timesketch_label']
        except KeyError:
            # The document has no label list yet: create an empty one first
            # so the update script below has a field to work on.
            # pylint: disable=redefined-variable-type
            doc = {u'doc': {u'timesketch_label': []}}
            self.client.update(index=searchindex_id, doc_type=event_type,
                               id=event_id, body=doc)

        # Choose the correct script.
        script_name = u'add_label'
        if toggle:
            script_name = u'toggle_label'
        script = {
            u'script': {
                u'file': script_name,
                u'params': {
                    u'timesketch_label': {
                        u'name': str(label),
                        u'user_id': user_id,
                        u'sketch_id': sketch_id
                    }
                }
            }
        }
        self.client.update(index=searchindex_id, id=event_id,
                           doc_type=event_type, body=script)

    def create_index(self, index_name=None, doc_type=u'generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. When omitted (None) a new UUID is
                generated for this call.
            doc_type: Name of the document type. Default id generic_event.

        Returns:
            Index name in string format.
            Document type in string format.

        Raises:
            RuntimeError: If the backend cannot be reached while creating
                the index.
        """
        # A default of uuid4().hex in the signature is evaluated only once,
        # at definition time, which made every default call share one name.
        if index_name is None:
            index_name = uuid4().hex

        _document_mapping = {
            doc_type: {
                u'properties': {
                    u'timesketch_label': {
                        u'type': u'nested'
                    }
                }
            }
        }

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={u'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError(u'Unable to connect to Timesketch backend.')

        # We want to return unicode here to keep SQLalchemy happy.
        index_name = unicode(index_name.decode(encoding=u'utf-8'))
        doc_type = unicode(doc_type.decode(encoding=u'utf-8'))
        return index_name, doc_type

    def import_event(self, flush_interval, index_name, event_type, event=None):
        """Add event to Elasticsearch.

        Events are queued and sent with the bulk API every flush_interval
        events. Calling without an event flushes whatever is still queued.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary

        Returns:
            Number of events queued so far.
        """
        if event:
            # Make sure we have decoded strings in the event dict.
            event = {
                k.decode(u'utf8'): v.decode(u'utf8')
                for k, v in event.items()
            }

            # Header needed by Elasticsearch when bulk inserting.
            self.import_events.append(
                {u'index': {
                    u'_index': index_name,
                    u'_type': event_type
                }})
            self.import_events.append(event)
            self.import_counter[u'events'] += 1

            if self.import_counter[u'events'] % int(flush_interval) == 0:
                self.client.bulk(index=index_name, doc_type=event_type,
                                 body=self.import_events)
                self.import_events = []
        else:
            if self.import_events:
                self.client.bulk(index=index_name, doc_type=event_type,
                                 body=self.import_events)
                # Drop the flushed payload so another flush (or a later
                # import) cannot index the same events a second time.
                self.import_events = []
        return self.import_counter[u'events']
index=rec["_index"], doc_type=rec["_type"], id=rec["_id"], body={"doc": {"srcSite": sS, "destSite": dS}} ) print "records:", recs, "\t remaining:", q.qsize(), "\ttotal rec:", totr q.task_done() print "make sure we are connected right." import requests res = requests.get("http://cl-analytics.mwt2.org:9200") print (res.content) es = Elasticsearch([{"host": "cl-analytics.mwt2.org", "port": 9200}]) print "documents to look into:" print es.count(index=ind) usrc = {"size": 0, "aggregations": {"unique_vals": {"terms": {"field": "@message.src", "size": 1000}}}} udest = {"size": 0, "aggregations": {"unique_vals": {"terms": {"field": "@message.dest", "size": 1000}}}} usrcs = [] udests = [] res = es.search(index=ind, body=usrc, size=10000) for tag in res["aggregations"]["unique_vals"]["buckets"]: usrcs.append(tag["key"]) res = es.search(index=ind, body=udest, size=10000) for tag in res["aggregations"]["unique_vals"]["buckets"]: udests.append(tag["key"]) print "unique sources: ", len(usrcs)
"--path", action="store", default=None, help= "Path to git repo. Commits used as data to load into Elasticsearch. (Default: None", ) args = parser.parse_args() # instantiate es client, connects to localhost:9200 by default es = Elasticsearch(args.host) # we load the repo and all commits load_repo(es, path=args.path) # run the bulk operations success, _ = bulk(es, UPDATES, index="git") print("Performed %d actions" % success) # we can now make docs visible for searching es.indices.refresh(index="git") # now we can retrieve the documents initial_commit = es.get(index="git", id="20fbba1230cabbc0f4644f917c6c2be52b8a63e8") print("%s: %s" % (initial_commit["_id"], initial_commit["_source"]["committed_date"])) # and now we can count the documents print(es.count(index="git")["count"], "documents in index")
class Elastic(object):
    """Thin convenience wrapper around one Elasticsearch index.

    All fields of the index share a single custom similarity, registered
    under the name ``SIMILARITY``.
    """

    FIELD_CATCHALL = "catchall"
    DOC_TYPE = "doc"  # we don't make use of types
    SIMILARITY = "sim"  # we always use this similarity
    ANALYZER_STOP_STEM = "english"
    ANALYZER_STOP = "stop_en"

    def __init__(self, index_name):
        """Creates a client for ELASTIC_HOSTS bound to the given index.

        :param index_name: name of the index this object operates on
        """
        self.__es = Elasticsearch(hosts=ELASTIC_HOSTS)
        self.__index_name = index_name

    @staticmethod
    def analyzed_field(analyzer=ANALYZER_STOP):
        """Returns the mapping for analyzed fields.

        :param analyzer: name of the analyzer; valid options:
            [ANALYZER_STOP, ANALYZER_STOP_STEM]
        """
        if analyzer not in {Elastic.ANALYZER_STOP, Elastic.ANALYZER_STOP_STEM}:
            # NOTE(review): terminates the process with exit status 0 even
            # though this is an error path; kept as-is for callers.
            print("Error: Analyzer", analyzer, "is not valid.")
            exit(0)
        return {
            "type": "string",
            "term_vector": "with_positions_offsets",
            "analyzer": analyzer,
            "similarity": Elastic.SIMILARITY
        }

    @staticmethod
    def notanalyzed_field():
        """Returns the mapping for not-analyzed fields."""
        return {
            "type": "string",
            "index": "not_analyzed",
            "similarity": Elastic.SIMILARITY
        }

    def __gen_similarity(self, model="BM25", params=None):
        """Gets the custom similarity function.

        :param model: name of the elastic similarity model
        :param params: optional dictionary of model parameters; it is copied,
            never modified
        :return: {SIMILARITY: {<params>, "type": model}}
        """
        # Work on a copy: writing "type" into the argument would alter the
        # caller's dict (and, with a mutable default, every later call).
        similarity = dict(params) if params else {}
        similarity["type"] = model
        return {Elastic.SIMILARITY: similarity}

    def __gen_analyzers(self):
        """Gets custom analyzers.

        We include customized analyzers in the index setting, a field may or
        may not use it.
        """
        analyzer = {"type": "standard", "stopwords": "_english_"}
        analyzers = {"analyzer": {Elastic.ANALYZER_STOP: analyzer}}
        return analyzers

    def analyze_query(self, query, analyzer=ANALYZER_STOP):
        """Analyzes the query.

        :param query: raw query
        :param analyzer: name of analyzer
        :return: the analyzed tokens, ordered by position and joined by
            single spaces
        """
        tokens = self.__es.indices.analyze(index=self.__index_name,
                                           body=query,
                                           analyzer=analyzer)["tokens"]
        ordered = sorted(tokens, key=lambda x: x["position"])
        return " ".join(t["token"] for t in ordered)

    def get_mapping(self):
        """Returns mapping definition for the index."""
        mapping = self.__es.indices.get_mapping(index=self.__index_name,
                                                doc_type=self.DOC_TYPE)
        return mapping[self.__index_name]["mappings"][
            self.DOC_TYPE]["properties"]

    def get_settings(self):
        """Returns index settings."""
        return self.__es.indices.get_settings(
            index=self.__index_name)[self.__index_name]["settings"]["index"]

    def __update_settings(self, settings):
        """Updates the index settings.

        The index is closed before the settings are written and re-opened
        and refreshed afterwards.
        """
        self.__es.indices.close(index=self.__index_name)
        self.__es.indices.put_settings(index=self.__index_name, body=settings)
        self.__es.indices.open(index=self.__index_name)
        self.__es.indices.refresh(index=self.__index_name)

    def update_similarity(self, model="BM25", params=None):
        """Updates the similarity function "sim", which is fixed for all
        index fields.

        The method and param should match elastic settings:
        https://www.elastic.co/guide/en/elasticsearch/reference/2.3/index-modules-similarity.html

        :param model: name of the elastic model
        :param params: dictionary of params based on elastic (optional)
        """
        old_similarity = self.get_settings()["similarity"]
        new_similarity = self.__gen_similarity(model, params)
        # We only update the similarity if it is different from the old one.
        # this avoids unnecessary closing of the index
        if old_similarity != new_similarity:
            self.__update_settings({"similarity": new_similarity})

    def delete_index(self):
        """Deletes an index."""
        self.__es.indices.delete(index=self.__index_name)
        print("Index <" + self.__index_name + "> has been deleted.")

    def create_index(self, mappings, force=False):
        """Creates index (if it doesn't exist).

        :param mappings: field mappings
        :param force: forces index creation (overwrites if already exists)
        """
        if self.__es.indices.exists(self.__index_name):
            if force:
                self.delete_index()
            else:
                print("Index already exists. No changes were made.")
                return

        # Start from the general elastic settings. Copy them so the
        # module-level ELASTIC_SETTINGS is not rewritten by the assignments
        # below.
        body = dict(ELASTIC_SETTINGS)

        # sets the global index settings
        # number of shards should be always set to 1; otherwise the stats
        # would not be correct
        body["settings"] = {
            "analysis": self.__gen_analyzers(),
            "similarity": self.__gen_similarity(),
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            },
        }

        # sets the field mappings
        body["mappings"] = {self.DOC_TYPE: {"properties": mappings}}

        # creates the index
        self.__es.indices.create(index=self.__index_name, body=body)
        print("New index <" + self.__index_name + "> is created.")

    def add_docs_bulk(self, docs):
        """Adds a set of documents to the index in a bulk.

        :param docs: dictionary {doc_id: doc}
        """
        actions = [
            {
                "_index": self.__index_name,
                "_type": self.DOC_TYPE,
                "_id": doc_id,
                "_source": doc
            }
            for doc_id, doc in docs.items()
        ]
        # Nothing is sent for an empty dictionary.
        if actions:
            helpers.bulk(self.__es, actions)

    def add_doc(self, doc_id, contents):
        """Adds a document with the specified contents to the index.

        :param doc_id: document ID
        :param contents: content of document
        """
        self.__es.index(index=self.__index_name, doc_type=self.DOC_TYPE,
                        id=doc_id, body=contents)

    def get_doc(self, doc_id, fields=None, source=True):
        """Gets a document from the index based on its ID.

        :param doc_id: document ID
        :param fields: list of fields to return (default: all)
        :param source: return document source as well (default: yes)
        """
        return self.__es.get(index=self.__index_name, doc_type=self.DOC_TYPE,
                             id=doc_id, fields=fields, _source=source)

    def search(self, query, field, num=100, fields_return="", start=0):
        """Searches in a given field using the similarity method configured
        in the index for that field.

        :param query: query string
        :param field: field to search in
        :param num: number of hits to return (default: 100)
        :param fields_return: additional document fields to be returned
        :param start: starting offset (default: 0)
        :return: dictionary of document IDs with scores
        """
        hits = self.__es.search(index=self.__index_name, q=query, df=field,
                                _source=False, size=num,
                                fields=fields_return,
                                from_=start)["hits"]["hits"]
        return {hit["_id"]: hit["_score"] for hit in hits}

    def get_field_stats(self, field):
        """Returns stats of the given field."""
        return self.__es.field_stats(
            index=self.__index_name,
            fields=[field])["indices"]["_all"]["fields"][field]

    def get_fields(self):
        """Returns name of fields in the index."""
        return list(self.get_mapping().keys())

    # =========================================
    # ================= Stats =================
    # =========================================

    def __get_termvector(self, doc_id, field, term_stats=False):
        """Returns a term vector for a given document field, including
        global field and term statistics.

        Term stats can have a serious performance impact; should be set to
        true only if it is needed!

        :param doc_id: document ID
        :param field: field name
        :param term_stats: whether term statistics are requested
        :return: the "terms" part of the response for the field, or an empty
            dictionary if the response carries none
        """
        tv = self.__es.termvectors(index=self.__index_name,
                                   doc_type=self.DOC_TYPE, id=doc_id,
                                   fields=field, term_statistics=term_stats)
        return tv.get("term_vectors", {}).get(field, {}).get("terms", {})

    def __get_coll_termvector(self, term, field):
        """Returns a term vector containing collection stats of a term.

        The statistics are read from the term vector of the first document
        that matches the term; an empty dictionary is returned when nothing
        matches.
        """
        hits = self.search(term, field, num=1)
        doc_id = next(iter(hits.keys())) if len(hits) > 0 else None
        return self.__get_termvector(doc_id, field,
                                     term_stats=True) if doc_id else {}

    def num_docs(self):
        """Returns the number of documents in the index."""
        return self.__es.count(index=self.__index_name,
                               doc_type=self.DOC_TYPE)["count"]

    def num_fields(self):
        """Returns number of fields in the index."""
        return len(self.get_mapping())

    def doc_count(self, field):
        """Returns number of documents with at least one term for the given
        field."""
        return self.get_field_stats(field)["doc_count"]

    def coll_length(self, field):
        """Returns length of field in the collection."""
        return self.get_field_stats(field)["sum_total_term_freq"]

    def avg_len(self, field):
        """Returns average length of a field in the collection."""
        return self.coll_length(field) / self.doc_count(field)

    def doc_length(self, doc_id, field):
        """Returns length of a field in a document."""
        return sum(self.term_freqs(doc_id, field).values())

    def doc_freq(self, term, field):
        """Returns document frequency for the given term and field."""
        tv = self.__get_coll_termvector(term, field)
        return tv.get(term, {}).get("doc_freq", 0)

    def coll_term_freq(self, term, field):
        """Returns collection term frequency for the given term and field."""
        tv = self.__get_coll_termvector(term, field)
        return tv.get(term, {}).get("ttf", 0)

    def term_freqs(self, doc_id, field):
        """Returns term frequencies for a given document and field.

        :return dictionary of terms with their frequencies;
            {term: freq, ...}
        """
        tv = self.__get_termvector(doc_id, field)
        return {term: val["term_freq"] for term, val in tv.items()}

    def term_freq(self, doc_id, field, term):
        """Returns frequency of a term in a given document and field."""
        return self.term_freqs(doc_id, field).get(term, 0)
# load_repo is defined elsewhere in this file; it is called with the client
# and the repository path taken from the command-line arguments.
load_repo(es, path=args.path)

# Send REPO_ACTIONS to the 'git' index through the bulk helper and report
# the first element of its result.
success, _ = bulk(es, REPO_ACTIONS, index='git', raise_on_error=True)
print('Performed %d actions' % success)

# Read the 'elasticsearch' repo document back and show its description.
es_repo = es.get(index='git', doc_type='repos', id='elasticsearch')
repo_id = es_repo['_id']
repo_description = es_repo['_source']['description']
print('%s: %s' % (repo_id, repo_description))

# Scripted update: append the tag "java" to the document's tags.
add_java_tag = {
    "script": {
        "inline": "ctx._source.tags.add(params.tag)",
        "params": {
            "tag": "java"
        }
    }
}
es.update(index='git', doc_type='repos', id='elasticsearch',
          body=add_java_tag)

# Refresh so the documents are available for search, then count them.
es.indices.refresh(index='git')
doc_count = es.count(index='git')['count']
print(doc_count, 'documents in index')
# read in all zip files from depots
os.chdir('C:\\Users\\Administrator\\Documents')
file = pandas.read_csv('Depot_PLZ_Zuordnung_2016_12.csv', sep=';',
                       converters={'depot': str, 'postcode': str})

# Fragments of the JSON count query; the postcode is spliced in between.
frag1 = '{"query": { "match": { "zip":"'
frag2 = '"}}}'

total_count = []   # per-postcode record counts, in processing order
zipCollector = []  # postcodes processed in this run
records_fetched = 0  # running sum of total_count

# query all zips in file and count records respectively
for zip in file['postcode']:
    zipCollector.append(zip)
    match = frag1 + zip + frag2
    # choose valid index name
    res = es.count(index="customer-d01", body=match)
    total_count.append(res['count'])
    records_fetched += res['count']
    # Stop once enough records are covered. The threshold has to be compared
    # with the summed counts: comparing the list itself with an int raises
    # TypeError on Python 3 and is always true on Python 2.
    if records_fetched > 400000:
        break

# write collected zips to delta.csv to be picked up by cdh
print("Requested amount of zips has been fetched. Writing to file path now. Starting revalidation process.")
zipCollector = pandas.DataFrame(zipCollector)
zipCollector.to_csv(path_or_buf="C:\\uniserv\\cdh\\temp\\plzdelta\\delta.csv",
                    index=False, encoding="UTF-8", header=False)

# slice data to contain everything from last cutoff and write to wd
# crtrow[0] is the first row whose postcode equals the last one processed.
crtrow = file[file['postcode'] == zip].index.tolist()
out_df = file[(crtrow[0] + 1):len(file)]
path = os.getcwd() + "\\Depot_PLZ_Zuordnung_2016_12.csv"
# NOTE(review): the remainder is written without a header row although the
# read above addresses the 'depot' / 'postcode' columns by name - confirm
# the header is restored before the next run.
out_df.to_csv(path_or_buf=path, index=False, encoding="UTF-8", header=False,
              sep=";")

# invoke shell commands to call revalidation and update-histnames
os.chdir("C:\\Uniserv\\cdh\\tools")
def getTopBoard():
    """Collect the dashboard top-board counters and return them as JSON.

    The counters come from three places: the rows of
    ``dashboard.topboardQuery`` (relational DB), the ``Rules_BlackList`` and
    ``Rules_CNC`` tables, and count/search requests against the ``gsp*``
    Elasticsearch indices.

    :return: JSON string of the ``result`` dictionary
    """
    # Executed first; the rows are only iterated after the ES counts below.
    results = db_session.execute(dashboard.topboardQuery)

    total = 0
    before_total = 0

    # Blacklist rule count from the relational DB (source 750).
    totalMaliciousFileCountRDBMS = Rules_BlackList.query.filter_by(
        source=750).count()
    # C&C rule count from the relational DB.
    totalMaliciousUrlCountRDBMS = Rules_CNC.query.count()

    # One client serves every request of this function; the port is cast to
    # int for all of them.
    es = Elasticsearch([{
        'host': app.config['ELASTICSEARCH_URI'],
        'port': int(app.config['ELASTICSEARCH_PORT'])
    }])

    # Today's URI analysis counts (NPC, then IMAS).
    totalTodayUriAnalySisCountNPC = _topboard_count(
        es, todayURLFileCount("uri", "NPC"))
    totalTodayUriAnalySisCountIMAS = _topboard_count(
        es, todayURLFileCount("uri", "IMAS"))
    # Today's file analysis counts (NPC, IMAS, ZombieZero).
    totalTodayMaliciousFileCountNPC = _topboard_count(
        es, todayURLFileCount("file", "NPC"))
    totalTodayMaliciousFileCountIMAS = _topboard_count(
        es, todayURLFileCount("file", "IMAS"))
    totalTodayMaliciousFileCountZombieZero = _topboard_count(
        es, todayURLFileCount("file", "zombie zero"))

    # Yesterday's malicious URL and file totals.
    totalYesterdayMaliciousUrlCount = _topboard_hits_total(
        es, dashboard.yesterdayUrlFileAnalysis(request, "uri"))
    totalYesterdayMaliciousFileCount = _topboard_hits_total(
        es, dashboard.yesterdayUrlFileAnalysis(request, "file"))

    # Every counter starts at zero, in this key order.
    result = dict()
    for key in (
            'spread', 'cnc', 'bcode',
            'before_spread', 'before_cnc', 'before_bcode',
            'link', 'before_link', 'uri', 'before_uri',
            'file', 'before_file',
            'totalTodayUriAnalysisCount',
            'totalTodayUriAnalysisCountNPC',
            'totalTodayUriAnalysisCountIMAS',
            'totalTodayMaliciousFileCount',
            'totalTodayMaliciousFileCountNPC',
            'totalTodayMaliciousFileCountIMAS',
            'totalTodayMaliciousFileCountZombieZero',
            'totalMaliciousUrlQuery',
            'totalYesterdayMaliciousUrlCount',
            'totalYesterdayMaliciousFileCount'):
        result[key] = 0

    # region: DB query
    # Rows dated today feed the current counters, all other rows the
    # "before_" counters. The date string is computed once so all rows are
    # compared against the same day.
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    code_to_key = {"003": "spread", "001": "cnc", "-": "bcode"}
    for _row in results:
        is_today = _row['date'] == today
        key = code_to_key.get(_row['Code'])
        if key is not None:
            result[key if is_today else 'before_' + key] = _row['count']
        # Rows with other codes still contribute to the totals.
        if is_today:
            total += _row['count']
        else:
            before_total += _row['count']
    # endregion: DB query

    # region: ES query
    # Temporary workaround for the url_crawleds index problem (18-03-06):
    # all gsp* indices are searched and the buckets are split by type key.
    res = es.search(index="gsp*",
                    body=dashboard.topboardEsQuery("now-1d/d", "now/d"),
                    request_timeout=30)
    total += _topboard_tally_buckets(
        res['aggregations']['types']['buckets'], result, '')

    res = es.search(index="gsp*",
                    body=dashboard.topboardEsQuery("now-2d/d", "now-1d/d"),
                    request_timeout=30)
    before_total += _topboard_tally_buckets(
        res['aggregations']['types']['buckets'], result, 'before_')
    # endregion: ES query

    # The aggregate "today" queries are disabled, so these two stay at 0.
    result['totalTodayUriAnalysisCount'] = 0
    result['totalTodayMaliciousFileCount'] = 0
    result['totalMaliciousUrlCount'] = totalMaliciousUrlCountRDBMS
    result['totalYesterdayMaliciousUrlCount'] = totalYesterdayMaliciousUrlCount
    result['totalYesterdayMaliciousFileCount'] = (
        totalYesterdayMaliciousFileCount)
    result['totalTodayUriAnalysisCountNPC'] = totalTodayUriAnalySisCountNPC
    result['totalTodayUriAnalysisCountIMAS'] = totalTodayUriAnalySisCountIMAS
    result['totalTodayMaliciousFileCountNPC'] = totalTodayMaliciousFileCountNPC
    result['totalTodayMaliciousFileCountIMAS'] = (
        totalTodayMaliciousFileCountIMAS)
    result['totalTodayMaliciousFileCountZombieZero'] = (
        totalTodayMaliciousFileCountZombieZero)
    # NOTE(review): this replaces the 'cnc' value taken from the DB rows with
    # the blacklist rule count, and 'cnc_before' is a hard-coded 13 (the rows
    # fill 'before_cnc', a different key) - confirm both are intended.
    result['cnc'] = totalMaliciousFileCountRDBMS
    result['cnc_before'] = 13
    result['total'] = total
    result['before_total'] = before_total
    return json.dumps(result)


def _topboard_count(es, body):
    """Return the 'count' of a count request on gsp* analysis_results."""
    res = es.count(index="gsp*", doc_type="analysis_results", body=body)
    return res['count']


def _topboard_hits_total(es, body):
    """Return hits.total (as int) of a search on gsp* analysis_results."""
    res = es.search(index="gsp*", doc_type="analysis_results", body=body)
    return int(res['hits']['total'])


def _topboard_tally_buckets(buckets, result, prefix):
    """Store known type buckets in result under prefix and return their sum."""
    type_to_key = {
        "link_dna_tuple5": "link",
        "url_jobs": "uri",
        "url_crawleds": "file",
    }
    subtotal = 0
    for bucket in buckets:
        key = type_to_key.get(bucket['key'])
        if key is None:
            # Buckets of other types are neither stored nor summed.
            continue
        result[prefix + key] = bucket['doc_count']
        subtotal += bucket['doc_count']
    return subtotal
class ESearch(): def __init__(self): """ Initialize class parameters """ # Connection object self._es = None self._index_name = "article_data" self._hash_field = "URL" self._dict_of_duplicate_docs = {} def connect_to_es(self, host_name=ELASTIC_SEARCH_ENDPOINT): """ Establishes a connection to the Elastic search server. If server if pingable, returns connection object. Else return None :return: connection-object """ self._es = Elasticsearch(hosts=[host_name], timeout=60) # Ping the connection to check if it's alive if self._es.ping(): return self._es return None def index_exists(self, index_name=None): if not index_name: index_name = self._index_name return self._es.indices.exists(index_name) def _make_mapping(self): """ Creates the index with the correct mapping :return: """ m = Mapping() # add fields m.field('Title', 'text') m.field('Text', 'text') m.field('Publish_Date', 'date') # date type complicates matters across websites m.field('URL', 'text') m.field('Scrape_Date', 'date') # date type complicates matters across websites m.field('Source', 'text') m.field('Search_Keyword', 'text') # save list as text? 
m.field('SE_Is_Risk', 'boolean') m.field('GP_Is_Risk', 'boolean') m.field('RG_Is_Risk', 'boolean') m.field('SE_Risk_Rating', 'float') m.field('GP_Risk_Rating', 'float') m.field('RG_Risk_Rating', 'float') m.field('SE_SnP_Open', 'float') m.field('SE_SnP_Close', 'float') m.field('SE_AbbV_Open', 'float') m.field('SE_AbbV_Close', 'float') m.field('SE_XBI_Open', 'float') m.field('SE_XBI_Close', 'float') m.field('SE_SnP_Open_Plus1', 'float') m.field('SE_SnP_Close_Plus1', 'float') m.field('SE_AbbV_Open_Plus1', 'float') m.field('SE_AbbV_Close_Plus1', 'float') m.field('SE_XBI_Open_Plus1', 'float') m.field('SE_XBI_Close_Plus1', 'float') m.field('SE_SentimentScore', 'float') m.field('SE_SentimentPolarity', 'float') m.field('CompositeScore', 'float') m.field('RG_FDA_Warning', 'boolean') m.field('GP_SentimentScore', 'float') m.field('GP_SentimentPolarity', 'float') m.field('GP_Location', 'text') m.field('GP_Country', 'text') m.field('Article_references', 'float') m.field('Is_source_type_RG', 'boolean') m.field('Is_source_type_SE', 'boolean') m.field('Is_source_type_GP', 'boolean') # save the mapping into index 'my-index' try: m.save(self._index_name) except Exception as e: print("Could not save schema!", e) def create_index(self): """ Creates the index if it doesn't exist :return: """ # create the index if it doesn't exist if not self.index_exists(): try: index.create() self._make_mapping() print("Index was created :", index.exists()) except Exception as e: print("~~~Index exists error") print(e) return -1 else: print("Index already exists", self._index_name) return 0 def get_index_mapping(self): """ Retrieves the index mapping :return: Index mapping JSON object if success, -1 if error """ try: return self._es.indices.get_mapping(index=self._index_name) except Exception as e: print("~~~Get index mapping error") print(e) return -1 def get_count(self, search_obj=None): return self._es.count(index=self._index_name, body=search_obj) def upload_dataframe(self, df): """ Uploads a 
dataframe into the index :param df: Dataframe (pandas) :return: 0 if success, -1 if failure """ def rec_to_actions(df): for record in df.to_dict(orient="records"): yield ('{ "index" : { "_index" : "%s", "_type" : "%s" }}' % (self._index_name, "_doc")) yield (json.dumps(record, default=int)) if not self.index_exists(): print("!!!INDEX DOES NOT EXIST -- RETURNING!!!") return -1 try: # make the bulk call, and get a response response = self._es.bulk(rec_to_actions(df)) # return a dict if not response["errors"]: print("Records uploaded") else: print("Could not upload data ") print(response) return -1 except Exception as e: print("\nERROR:", e) return -1 return 0 # Process documents returned by the current search/scroll def _populate_dict_of_duplicate_docs(self, hits): for item in hits: combined_key = str(item['_source'][self._hash_field]) _id = item["_id"] # _Title = item["_source"]["Title"] hashval = hashlib.md5(combined_key.encode('utf-8')).digest() # If the hashval is new, then we will create a new key # in the dict_of_duplicate_docs, which will be # assigned a value of an empty array. # We then immediately push the _id onto the array. # If hashval already exists, then # we will just push the new _id onto the existing array self._dict_of_duplicate_docs.setdefault(hashval, []).append(_id) # Loop over all documents in the index, and populate the # dict_of_duplicate_docs data structure. 
def _scroll_over_all_docs(self):
    """Scroll through every document of the index, page by page.

    Each page of hits (including the final empty one) is handed to
    ``self._populate_dict_of_duplicate_docs``.
    """
    response = self._es.search(
        index=self._index_name,
        scroll='1m',
        body={"query": {"match_all": {}}},
    )
    cursor = response['_scroll_id']
    batch = response['hits']['hits']
    self._populate_dict_of_duplicate_docs(batch)

    # An empty page marks the end of the scroll.
    while len(batch) > 0:
        response = self._es.scroll(scroll_id=cursor, scroll='2m')
        batch = response['hits']['hits']
        self._populate_dict_of_duplicate_docs(batch)
        # The scroll id may change between pages, so always carry the latest.
        cursor = response['_scroll_id']


def _loop_over_hashes_and_remove_duplicates(self):
    """Delete every document that repeats the URL of an earlier document.

    Only hash buckets holding more than one id are inspected. Inside a
    bucket the documents are regrouped by their ``URL`` source field; the
    first id of each URL is kept and the remaining ids are deleted. Every
    inspected URL is printed before deletion starts, and deletion stops at
    the first failing delete call.
    """
    reported_urls = []
    doomed_ids = []

    for bucket_ids in self._dict_of_duplicate_docs.values():
        if len(bucket_ids) <= 1:
            continue

        # Fetch the documents that collided on the same hash value.
        fetched = self._es.mget(index=self._index_name, body={"ids": bucket_ids})

        # Equal hashes are re-checked against the actual URL.
        ids_by_url = {}
        for doc in fetched['docs']:
            url = doc["_source"].get("URL")
            ids_by_url.setdefault(url, []).append(doc["_id"])

        for url, ids in ids_by_url.items():
            reported_urls.append(url)
            # Keep the first document of each URL, drop the rest.
            doomed_ids.extend(ids[1:])

    for url in reported_urls:
        print(url)

    for doc_id in doomed_ids:
        try:
            self._es.delete(index=self._index_name, id=doc_id)
        except Exception as e:
            print(e)
            break


def remove_duplicates(self):
    """Collect duplicate candidates from the whole index, then delete them."""
    self._scroll_over_all_docs()
    self._loop_over_hashes_and_remove_duplicates()
class Es_connector:
    """Convenience wrapper around an Elasticsearch client bound to one index.

    Every method targets ``self.index`` / ``self.doc_type``. The scrolling
    helpers read ``self.size`` documents per page.

    NOTE(review): ``getTweets``, ``getFilteredTweets`` and ``initMABED``
    append to the shared ``self.result`` list and never reset it, so repeated
    calls on one instance accumulate hits. Kept as-is because callers may
    read ``self.result`` directly.
    """

    def __init__(self, host='localhost', port=9200, user='******', password='******', timeout=1000, index="test2", doc_type="tweet"):
        """Store the configuration and open the Elasticsearch client."""
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.timeout = timeout
        self.index = index
        self.doc_type = doc_type
        self.size = 500  # page size for searches and scroll batches
        self.body = {"query": {"match_all": {}}}  # default "everything" query
        self.result = []  # accumulator used by the tweet getters
        self.es = Elasticsearch([self.host], http_auth=(self.user, self.password), port=self.port, timeout=self.timeout, use_ssl=False)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _require_index(self, announce=False):
        """Raise SystemExit when the configured index does not exist.

        ``raise SystemExit`` is what the interactive-only ``exit()`` builtin
        does; raising directly also works when the ``site`` module is not
        loaded.
        """
        if self.es.indices.exists(index=self.index):
            return
        if announce:
            print("Index " + self.index + " not exists")
        raise SystemExit

    def _scroll_hits(self, body, scroll='15m'):
        """Yield each non-empty page of hits for ``body`` via the scroll API.

        The next page is requested only after the consumer has finished with
        the current one, and the latest scroll id is always used.
        """
        data = self.es.search(index=self.index, doc_type=self.doc_type, scroll=scroll, size=self.size, body=body)
        hits = data['hits']['hits']
        while hits:
            yield hits
            data = self.es.scroll(scroll_id=data['_scroll_id'], scroll=scroll)
            hits = data['hits']['hits']

    def _collect_tweets(self, query):
        """Append all hits for ``query`` (three source fields only) to ``self.result``."""
        self._require_index()
        body = {"_source": ["text", "timestamp_ms", "imagesCluster"], "query": query}
        for hits in self._scroll_hits(body):
            self.result.extend(hits)
        return self.result

    def _update_matching(self, query, field, value):
        """Set ``field`` to ``value`` on every document matched by ``query``."""
        self._require_index()
        for hits in self._scroll_hits(query):
            for item in hits:
                self.update_field(item['_id'], field, value)
        return True

    def _similar_words(self, body, stopwords_file_path, words, count, workers):
        """Return the ``count`` words closest to ``words`` in a Word2Vec model.

        The model is cached on disk under ``models/`` keyed by ``words``; it
        is trained on the tokenized text of all hits for ``body`` when no
        cached file exists.
        """
        stopwords = utils.load_stopwords(stopwords_file_path)
        self._require_index(announce=True)
        print(body)
        filepath = "models/" + words.replace(" ", "").replace(",", "") + ".model"
        if Path(filepath).is_file():
            model = gensim.models.Word2Vec.load(filepath)
        else:
            tweets = []
            for hits in self._scroll_hits(body, scroll='2m'):
                # extend() keeps this linear; list concatenation per page was quadratic.
                tweets.extend(self.tokenize(item['_source']['text'], stopwords) for item in hits)
            model = gensim.models.Word2Vec(tweets, min_count=1, workers=workers, negative=20)
            model.save(filepath)
        pwords = self.tokenize(words, stopwords)
        print("pwords")
        print(pwords)
        # gensim >= 4 only exposes similarity queries on ``model.wv``; older
        # releases without ``wv`` fall back to the model itself.
        vectors = getattr(model, 'wv', model)
        return vectors.most_similar(positive=pwords, topn=count)

    # ------------------------------------------------------------------
    # Single-request operations
    # ------------------------------------------------------------------
    def search(self, query):
        """Run ``query`` and return the raw response (at most ``self.size`` hits)."""
        return self.search_size(query, self.size)

    def search_size(self, query, size=500):
        """Run ``query`` and return the raw response with at most ``size`` hits."""
        return self.es.search(index=self.index, doc_type=self.doc_type, body=query, size=size)

    def count(self, query):
        """Return the raw count response for ``query``."""
        return self.es.count(index=self.index, doc_type=self.doc_type, body=query)

    def post(self, query):
        """Index ``query`` as a new document and return the raw response."""
        return self.es.index(index=self.index, doc_type=self.doc_type, body=query)

    def update_field(self, id, field, value):
        """Set one field of a document; return the response, or False if not "updated"."""
        return self.update(id, {"doc": {field: value}})

    def update(self, id, query):
        """Apply the update body ``query``; return the response, or False if not "updated"."""
        res = self.es.update(index=self.index, doc_type=self.doc_type, id=id, body=query)
        return res if res['result'] == "updated" else False

    def delete(self, id):
        """Delete a document; return the response, or False if not "deleted"."""
        res = self.es.delete(index=self.index, doc_type=self.doc_type, id=id)
        return res if res['result'] == "deleted" else False

    def get(self, id):
        """Fetch a document by id; return the response, or False if not found."""
        res = self.es.get(index=self.index, doc_type=self.doc_type, id=id)
        return res if res['found'] else False

    # ------------------------------------------------------------------
    # Scrolling searches
    # ------------------------------------------------------------------
    def bigSearch(self, query):
        """Return every hit matching ``query`` as one list."""
        self._require_index()
        results = []
        for hits in self._scroll_hits(query):
            results.extend(hits)
        return results

    def init_paginatedSearch(self, query):
        """Start a scroll and return its first page.

        :return: dict with ``results`` (first page of hits), ``sid`` (scroll
            id), ``scroll_size`` (hits still to fetch) and ``total``
        """
        self._require_index()
        data = self.es.search(index=self.index, doc_type=self.doc_type, scroll='15m', size=self.size, body=query)
        hits = data['hits']['hits']
        total = data['hits']['total']
        # Elasticsearch 7 reports the total as {"value": n, "relation": ...};
        # reduce it to the number so the subtraction below works on both forms.
        if isinstance(total, dict):
            total = total['value']
        return {
            "results": list(hits),
            "sid": data['_scroll_id'],
            "scroll_size": total - len(hits),
            "total": total
        }

    def loop_paginatedSearch(self, sid, scroll_size):
        """Fetch the next page of a scroll started by ``init_paginatedSearch``.

        Nothing is requested when ``scroll_size`` is not positive; the inputs
        are then echoed back with an empty ``results`` list.
        """
        results = []
        if scroll_size > 0:
            data = self.es.scroll(scroll_id=sid, scroll='15m')
            results = list(data['hits']['hits'])
            sid = data['_scroll_id']
            scroll_size = len(results)
        return {"results": results, "sid": sid, "scroll_size": scroll_size}

    def getTweets(self):
        """Append every document of the index to ``self.result`` and return it."""
        return self._collect_tweets({"match_all": {}})

    def getFilteredTweets(self, session, status):
        """Append documents whose ``session_<session>`` field is in ``status``."""
        return self._collect_tweets({"terms": {'session_' + session: status}})

    def initMABED(self):
        """Append every document of the index to ``self.result`` and return it."""
        return self._collect_tweets({"match_all": {}})

    def update_all(self, field, value):
        """Set ``field`` to ``value`` on every document of the index."""
        return self._update_matching(self.body, field, value)

    def update_query(self, query, field, value):
        """Set ``field`` to ``value`` on every document matched by ``query``."""
        return self._update_matching(query, field, value)

    def remove_field_all(self, field):
        """Remove ``field`` from every document; False when the index is missing."""
        if not self.es.indices.exists(index=self.index):
            return False
        for hits in self._scroll_hits(self.body):
            for item in hits:
                # NOTE(review): the field name is spliced into the script
                # text; only pass trusted field names.
                self.update(item['_id'], {"script": "ctx._source.remove(\"" + field + "\")"})
        return True

    def bigTweetTextSearch(self, query):
        """Return the ``text`` source field of every hit matching ``query``."""
        self._require_index()
        texts = []
        for hits in self._scroll_hits(query):
            texts.extend(item['_source']['text'] for item in hits)
        return texts

    def bigSearchMean(self, query):
        """Return the mean ``_score`` over all hits, or 0.0 when nothing matches."""
        self._require_index()
        score_sum = 0
        count = 0
        for hits in self._scroll_hits(query):
            score_sum += sum(item['_score'] for item in hits)
            count += len(hits)
        if count == 0:
            return 0.0
        return score_sum / count

    def bigSearchSSE(self, query, mean):
        """Return the sum of squared deviations of every hit's ``_score`` from ``mean``."""
        self._require_index()
        sse = 0
        for hits in self._scroll_hits(query):
            # Accumulate across all hits; assigning here would keep only the last term.
            sse += sum((item['_score'] - mean) ** 2 for item in hits)
        return sse

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------
    def tokenize(self, text, stopwords):
        """Split ``text`` into lower-case tokens stripped of surrounding punctuation.

        Tokens are dropped when they have 3 characters or fewer, are
        stopwords, contain "http" or "cluster", or contain no ASCII letter.
        The length and stopword checks run before punctuation is stripped.
        """
        # split the documents into tokens based on whitespaces
        raw_tokens = text.lower().replace("...", "").replace("…", "").replace("..", "").split()
        return [
            token.strip(string.punctuation)
            for token in raw_tokens
            if len(token) > 3
            and token not in stopwords
            and 'http' not in token
            and 'cluster' not in token
            and re.search('[a-zA-Z]', token)
        ]

    def range_tweets(self, start, end, stopwords_file_path, words, count):
        """Words most similar to ``words`` among tweets strictly between ``start`` and ``end``."""
        body = {
            "query": {
                "bool": {
                    "should": {"match": {"text": {"query": words}}},
                    "filter": {"range": {"@timestamp": {"gt": str(start), "lt": str(end)}}}
                }
            }
        }
        return self._similar_words(body, stopwords_file_path, words, count, workers=1)

    def w2v_tweets(self, stopwords_file_path, words, count):
        """Words most similar to ``words`` among all tweets of the index."""
        body = {"query": {"bool": {"should": {"match": {"text": {"query": words}}}}}}
        return self._similar_words(body, stopwords_file_path, words, count, workers=10)
class ElasticSearchClass(object): def __init__(self, host, port, user=None, pwd=None): self.host = host self.port = port if user is not None and pwd is not None: self.es = Elasticsearch(hosts=[{ 'host': self.host, 'port': self.port }], http_auth=(user, pwd)) else: self.es = Elasticsearch(hosts=[{ 'host': self.host, 'port': self.port }]) def isValid(self): try: self.es.ping() return True except: return False def count(self, indexName): """ :param indexname: :return: 统计index总数 """ return self.es.count(index=indexName) def delete(self, indexName, docType, id): """ :param indexname: :param doc_type: :param id: :return: 删除index中具体的一条 """ self.es.delete(index=indexName, doc_type=docType, id=id) def get(self, indexName, docType, id): return self.es.get(index=indexName, doc_type=docType, id=id) def search(self, indexName, size=10): try: return self.es.search(index=indexName, size=size, sort="@timestamp:desc") except Exception as err: print(err) def createIndex(self, indexName, body): try: self.es.indices.delete(index=indexName) except elasticsearch.NotFoundError: pass self.es.indices.create(index=indexName, body=body) def indexDocument(self, indexName, docType, body, docId=None): if docId is not None: self.es.index(index=indexName, doc_type=docType, id=docId, body=body) else: self.es.index(index=indexName, doc_type=docType, body=body) #https://github.com/elastic/elasticsearch-py/issues/508 def bulkIndexDocument(self, actions): success, _ = bulk(self.es, actions) return success def moreLikeThis(self, indexName, docType, id, mltFields, search_size=2, min_term_freq=1, min_doc_freq=1): return self.es.search( body={ "size": search_size, "query": { "more_like_this": { "fields": mltFields, "like": [{ "_index": indexName, "_type": docType, "_id": id }], "min_term_freq": min_term_freq, "min_doc_freq": min_doc_freq } } }) def termVector(self, indexName, docType, id): return self.es.termvectors(indexName, docType, id) '''