def createFeatureMatrix(key, filemodel, tset):
    es = Elasticsearch()
    docmap = {}
    wordcount = -1
    for word in spamwords:
        #print word.rstrip()
        wordcount = wordcount + 1
        #doc = {"fields": ["_id"], "query": {"bool": {"must": [{"match_phrase": {"text": "viagra"}}, {"match": {"split": "train"}}]}}}
        if key == "train":
            res = es.search(index="email_index", doc_type="document",
                            body={"fields": ["_id"],
                                  "query": {"bool": {"must": [{"match": {"text": word}},
                                                              {"match": {"split": "train"}}]}},
                                  "size": 75000})
        else:
            res = es.search(index="email_index", doc_type="document",
                            body={"fields": ["_id"],
                                  "query": {"bool": {"must": [{"match": {"text": word}},
                                                              {"match": {"split": "test"}}]}},
                                  "size": 75000})
        total = res['hits']['total']
        if total != 0:
            for hit in res['hits']['hits']:
                docid = int(hit['_id'])
                tf = 1
                if docid in docmap:
                    print docid
                    fmap = docmap[docid]
                else:
                    fmap = {}
                fmap[wordcount] = float(tf)
                docmap[int(docid)] = fmap
    createFile(docmap, wordcount, filemodel, tset)
def autoComplete(self, query, key, myindex, mysize):
    self.index = myindex
    values = []
    es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}])
    query = '.*' + query + '.*'
    # UPPER case
    res = es.search(index=myindex, size=mysize, body={"query": {"regexp": {key: query.upper()}}})
    for doc in res['hits']['hits']:
        values.append(doc['_source'][key])
    # lower case
    res = es.search(index=myindex, size=mysize, body={"query": {"regexp": {key: query.lower()}}})
    for doc in res['hits']['hits']:
        values.append(doc['_source'][key])
    # Title case
    res = es.search(index=myindex, size=mysize, body={"query": {"regexp": {key: query.title()}}})
    for doc in res['hits']['hits']:
        values.append(doc['_source'][key])
    values = sorted(set(values))  # Remove duplicates and sort
    return values
def get_all_vars(var_list):
    es = Elasticsearch('https://*****:*****@fbc3032a2a91be69517a70b3d75f4eaa.us-east-1.aws.found.io:9243')
    q = 'viglink'
    qq = {
        'from': 0,
        'size': 10000,
        'query': {
            'query_string': {'query': q}
        },
        "_source": var_list
    }
    r = es.search(body=qq, scroll='1m', index='products', doc_type='product')
    total = r['hits']['total']
    batches = int(math.ceil(total * 1.0 / qq['size']))
    h = []
    for i in range(0, batches):
        if i == 0:
            r = es.search(body=qq, scroll='1m', index='products', doc_type='product')
            sid = r['_scroll_id']
            hits = r['hits']['hits']
            h.extend(hits)
            print(len(hits))
        else:
            r = es.scroll(scroll_id=sid, scroll='1m')
            sid = r['_scroll_id']
            hits = r['hits']['hits']
            h.extend(hits)
            print(len(hits))
            print("batch: " + str(i))
    hvar = {}
    df = pandas.DataFrame()
    for var in var_list:
        hvar[var] = [x['_source'][var] for x in h]
    df = pandas.DataFrame(hvar)
    return df
def search(request):
    if request.method == 'POST':
        data = request.POST
        if not data:
            return _error_response(request, "Failed. No query received")
        query = data['query']
        es = Elasticsearch(['es'])
        result = es.search(index='listing_index', body={'query': {'query_string': {'query': query}}})
        courses_data = result['hits']['hits']
        courses_list = []
        for c in courses_data:
            course = {}
            course['name'] = c['_source']['name']
            course['pk'] = c['_source']['pk']
            course['description'] = c['_source']['description']
            courses_list.append(course)
        # return a list of dictionaries (each dictionary is a course)
        return JsonResponse(courses_list, safe=False)
    else:
        es = Elasticsearch(['es'])
        result = es.search(index='listing_index',
                           body={'query': {'query_string': {'query': 'calculus'}}, 'size': 10})
        courses_data = result['hits']['hits']
        courses_list = []
        for c in courses_data:
            course = {}
            course['name'] = c['_source']['name']
            course['pk'] = c['_source']['pk']
            course['description'] = c['_source']['description']
            courses_list.append(course)
        return JsonResponse(result, safe=False)
    return JsonResponse({'work': True, 'resp': courses_list}, safe=False)
class ElasticSearchManager(object):
    def __init__(self, index=None, doc_type=None, *args, **kwargs):
        self.index = index
        self.doc_type = doc_type
        self.obj_es = Elasticsearch()

    def search(self, query=None, *args, **kwargs):
        data = self.obj_es.search(index=self.index, doc_type=self.doc_type,
                                  body={"query": {"match": query}})
        return fetch_source(data['hits']['hits'])

    def get(self, *args, **kwargs):
        data = self.obj_es.get(index=self.index, doc_type=self.doc_type, id=kwargs['id'])
        return data['_source']

    def get_list(self, *args, **kwargs):
        data = self.obj_es.search(index=self.index, body={"query": {"match_all": {}}})
        return fetch_source(data['hits']['hits'])

    def insert(self, data=None):
        data = json.loads(data)
        data['user_name'] = data['user']['screen_name']
        del data['user']
        del data['entities']
        res = self.obj_es.index(index=self.index, doc_type=self.doc_type, id=data['id'], body=data)
        logger.info("Getting stream:{0}".format(res))

    def delete(self, data=None):
        pass

    def update(self, data=None):
        pass
def query_elastic(string):
    es = Elasticsearch()
    res = es.search(index="documents_analyzed", doc_type="articles",
                    body={"query": {"match": {"_all": string}}})
    tamano = res['hits']['total']
    res = es.search(index="documents_analyzed", doc_type="articles",
                    body={"size": tamano,
                          "query": {"match": {"_all": string}},
                          "sort": {"date": {"order": "desc"}}})
    # res['hits']['hits'] is a JSON list with the documents 0 to n
    return res['hits']['hits']
class ScoreMerge(object):
    def __init__(self):
        self.es = Elasticsearch()
        self.count = 0
        self.category = open("/home/eunsoo/Downloads/tutorial/tutorial/category.txt", "r").read().split()

    def scoreMerge(self):
        self.es.indices.delete(index='merge', ignore=[400, 404])
        for cat in self.category:
            self.count = self.count + 1
            acc_score = 0.0
            post_count = 0
            search_results = self.es.search(index="scoretest", doc_type='categorized',
                                            body={"query": {"match": {"category": cat}}})
            if 'hits' in search_results:
                #print search_results['hits']['hits']
                for search_result in search_results['hits']['hits']:
                    acc_score = acc_score + search_result['_source']['score']
                    post_count = post_count + 1
            self.es.index(index="mergetest", doc_type="merge", id=cat,
                          body={"category": cat, "acc_score": acc_score, "post_count": post_count})
            print cat + "successfully merged"

    def displaySortedCategory(self):
        search_results = self.es.search(index="mergetest", doc_type='merge',
                                        body={"sort": {"acc_score": {"order": "desc"}}})
        print search_results
        #print "category / acc_likes / acc_comments_count / acc_score / post_count"
        for search_result in search_results['hits']['hits']:
            print ("%s/%10f/%10f" % (search_result['_source']['category'],
                                     search_result['_source']['acc_score'],
                                     search_result['_source']['post_count']))
class ElasticCom(object):
    def __init__(self, index, doc_type, hosts='localhost:9200', **kwargs):
        self.index = index
        self.doc_type = doc_type
        self.es = Elasticsearch(hosts=hosts, **kwargs)

    def search_and_export_to_dict(self, *args, **kwargs):
        _id = kwargs.pop('_id', True)
        data_key = kwargs.pop('data_key', kwargs.get('fields')) or '_source'
        kwargs = dict({'index': self.index, 'doc_type': self.doc_type}, **kwargs)
        if kwargs.get('size', None) is None:
            kwargs['size'] = 1
            t = self.es.search(*args, **kwargs)
            kwargs['size'] = t['hits']['total']
        return get_search_hits(self.es.search(*args, **kwargs), _id=_id, data_key=data_key)

    def search_and_export_to_df(self, *args, **kwargs):
        convert_numeric = kwargs.pop('convert_numeric', True)
        convert_dates = kwargs.pop('convert_dates', 'coerce')
        df = pd.DataFrame(self.search_and_export_to_dict(*args, **kwargs))
        if convert_numeric:
            df = df.convert_objects(convert_numeric=convert_numeric, copy=True)
        if convert_dates:
            df = df.convert_objects(convert_dates=convert_dates, copy=True)
        return df
def search(query_word):
    result = []
    es = Elasticsearch()
    query1 = {"query": {"wildcard": {"name": {"value": "*" + query_word + "*"}}}}
    res = es.search(index="urban", body=query1)
    if res['hits']['total'] == 0:
        res = es.search(index="champ", body=query1)
        if res['hits']['total'] == 0:
            return 0
    ret = res['hits']['hits']
    temp = defaultdict(int)
    for item in ret:
        ids = item['_source']['business_id']
        query2 = {"query": {"match": {"business_id": ids}}}
        res = es.search(index="my_data", body=query2)
        for item in res['hits']['hits'][0]['_source']['word_freq']:
            temp[item[0]] += item[1]
    words = []
    for item in temp:
        words.append((item, temp[item]))
    tags = make_tags(words, maxsize=80)
    create_tag_image(tags, 'static/cloud_large.jpg', size=(900, 600), fontname='Lobster')
class LogData(object):
    def __init__(self, host=None, port=None, index_name=None, start_time=None, end_time=None, add_query=None):
        """LogData is a class to collect logs from Elasticsearch.

        Parameters
        ----------
        host: string, Elasticsearch IP
        index_name: string, index name in Elasticsearch
        start_time: date string, format looks like '2016-03-24T00:00:00'
        end_time: date string, format looks like '2016-03-24T00:00:00'
            if this field is empty, 'now' will be used.

        Returns
        -------
        """
        if host:
            self.host = host
        else:
            raise ValueError("Elasticsearch host cannot be empty.")
        if port:
            self.port = port
        else:
            self.port = 9200
        if index_name:
            self.index_name = index_name
        else:
            raise ValueError("index name cannot be empty.")
        if start_time:
            self.start_time = start_time
        else:
            self.start_time = '1987-03-24T00:00:00'
        if end_time:
            self.end_time = end_time
        else:
            self.end_time = 'now'
        self.add_query = add_query
        self.es = Elasticsearch([{'host': self.host, 'port': self.port}])
        self.total, self.browser_ids = self._total_log_browser_ids()

    def _total_log_browser_ids(self):
        search_result = self.es.search(index=self.index_name,
                                       body=search_syntax(start=0, size=0,
                                                          start_time=self.start_time,
                                                          end_time=self.end_time,
                                                          add_query=self.add_query))
        return (search_result['hits']['total'],
                [bucket['key'] for bucket in search_result['aggregations']['identity_tag']['buckets']])

    def result(self, start=0, size=0):
        search_result = self.es.search(index=self.index_name,
                                       body=search_syntax(start=start, size=size,
                                                          start_time=self.start_time,
                                                          end_time=self.end_time,
                                                          add_query=self.add_query))
        return search_result['hits']['hits']
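# Usage sketch (not part of the original source): a minimal example of driving the LogData
# class above. It assumes the `search_syntax` helper referenced by the class exists and that
# the target index exposes an `identity_tag` terms aggregation; the host, port, and index
# name below are placeholders.
log_data = LogData(host='127.0.0.1', port=9200, index_name='access-log-*',
                   start_time='2016-03-24T00:00:00')
print(log_data.total)             # total number of matching log entries
print(log_data.browser_ids[:10])  # first few aggregated identity_tag keys
for hit in log_data.result(start=0, size=20):
    print(hit['_source'])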
class ElasticClient:
    def __init__(self, host: str, port: int):
        try:
            self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
            info = self.es.info()
            logger.info("Connected to Elasticsearch v. %s, name: %s" % (info['version']['number'], info['name']))
        except ElasticsearchException as e:
            logger.error("Elasticsearch is not available: %s", e)
            exit(0)

    def get_articles(self, index, doctype, batch_size):
        query = '{"query": { "bool": { "must_not": { "exists": { "field": "status" }}}}}'
        result = self.es.search(index=index, doc_type=doctype, size=batch_size, body=query)
        articles = result.get('hits').get('hits')
        return articles if articles is not None else []

    def count(self, index):
        return self.es.count(index=index)['count']

    def info(self):
        return self.es.info()

    def check_url(self, url: str, auth_index: str):
        """
        Private function to check if a URL appears in the database.

        Parameters
        ----------
        url: URL for the news stories to be scraped.
        auth_index: es index

        Returns
        -------
        found: Boolean. Indicates whether or not a URL was found in the database.
        """
        response = self.es.search(index=auth_index, doc_type=auth_index,
                                  body={"query": {"match_phrase": {"url": url}}},
                                  size=0, terminate_after=1, ignore_unavailable=True)
        return response["hits"]["total"] > 0

    def persist(self, index, doctype, payload):
        self.es.index(index=index, doc_type=doctype, body=payload)

    def update(self, index, doctype, doc_id, payload):
        self.es.update(index=index, doc_type=doctype, id=doc_id, body=payload)
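# Usage sketch (not part of the original source): exercising ElasticClient above against a
# local node. The host, index name, and document payload are placeholders, and `logger` is
# assumed to be configured by the surrounding module.
client = ElasticClient('localhost', 9200)
if not client.check_url('https://example.com/story', 'articles'):
    client.persist('articles', 'articles', {'url': 'https://example.com/story'})
pending = client.get_articles('articles', 'articles', batch_size=50)
print(len(pending), 'articles without a status field')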
def search(request):
    # Get the search keyword
    text = request.POST.get('search_content')
    # Establish the connection
    es = Elasticsearch([{'host': '10.10.20.26', 'port': 9200}])
    # Call the elasticsearch-py API to query by username/email/password, limiting the number of results
    print text
    # Determine the query type, taken from the front-end page
    querytype = request.POST.get('way')
    print querytype
    if querytype == 'username' or querytype == 'none':
        # Query by username
        searchResult = es.search(index='_all', body={"query": {"term": {'username': text}},
                                                     "highlight": {"fields": {"username": {}}},
                                                     "size": 100})
    elif querytype == 'email':
        # Query by email
        searchResult = es.search(index='_all', body={"query": {"term": {'email': text}}, "size": 50})
    elif querytype == 'passwd':
        # Query by password
        searchResult = es.search(index='_all', body={"query": {"term": {'password': text}}, "size": 50})
    # elif querytype == 'multi':
    #     # Multi-field search
    #     searchResult = es.search(index='_all', body={"multi_match": {"query": text, "fields": ["username", "email"]}})
    # List of search results
    search_list = []
    # List of matched collections
    collection_list = []
    # List of data sources
    source_list = []
    # Collect the search results
    a = searchResult['hits']
    b = a['hits']
    for i in range(len(b)):
        c = b[i]['_source']
        print(c)
        t = b[i]['_index']
        print(t)
        print '************************'
        search_list.append(c)
        collection_list.append(t)
    # Look up the data source for each matched collection
    for element in collection_list:
        targetSource = es.search(index='collectionlist2', body={"query": {"match": {'collectionName': element}}})
        deal1 = targetSource['hits']
        deal2 = deal1['hits']
        for j in range(len(deal2)):
            deal3 = deal2[j]['_source']
            deal4 = deal3['source']
            source_list.append(deal4)
    # Total number of hits
    total = a['total']
    # Query time taken
    took = searchResult['took']
    ccc = search_count(user=request.user, content=text)
    ccc.save()
    # Return the search results and data-source information
    return render_to_response("newmainpage.html",
                              {"tplfile": "newsearchresult.html", "username": request.user,
                               "list": search_list, "source_list": source_list,
                               "total": total, "took": took})
class ElasticsearchServices:
    def __init__(self):
        self.es = Elasticsearch()

    # body should be in json format
    def feed_data(self, index, doc_type, body):
        res = None
        try:
            res = self.es.index(index=index, doc_type=doc_type, body=body)
        except:
            for e in sys.exc_info():
                print "Unexpected error:", e
            pass
        return res

    # body should be in json format
    def search(self, index, doc_type, body, size=20):
        res = self.es.search(index=index, doc_type=doc_type, body=body, size=size)
        return res

    def search_scroll(self, index, doc_type, body, scroll="2m", size=20):
        res = self.es.search(index=index, doc_type=doc_type, scroll=scroll,
                             search_type='scan', size=size, body=body)
        return res

    def scroll(self, scroll_id, scroll="2m"):
        res = None
        try:
            res = self.es.scroll(scroll_id=scroll_id, scroll=scroll)
        except NotFoundError as e:
            print e
            res = {"hits": {"hits": []}}
        return res

    def get_total_hit(self, res):
        return res['hits']['total']
class MorelikethisBolt(Bolt):
    connections = {}

    def initialize(self, conf, context):
        host = conf.get('zeit.recommend.elasticsearch.host', 'localhost')
        port = conf.get('zeit.recommend.elasticsearch.port', 9200)
        self.es = Elasticsearch(hosts=[{'host': host, 'port': port}])
        script_path = os.path.dirname(os.path.realpath(__file__))
        raw = open(script_path + '/stopwords.txt', 'r').read()
        self.stopwords = raw.decode('utf-8').split('\n')[3:]

    def recommend(self, paths, top_n=10):
        b = {
            'query': {
                'bool': {
                    'should': [
                        {'ids': {'values': paths, 'boost': 1}},
                        {'match_all': {'boost': 0}}
                    ]
                }
            }
        }
        items = self.es.search(doc_type='item', body=b, size=len(paths) or 3)
        legacy_get = lambda i: i['_source'].get('body', i['_source']['teaser'])
        hits = [legacy_get(i) for i in items['hits']['hits']]
        b = {
            'query': {
                'more_like_this': {
                    'like_text': ' '.join(hits),
                    'stop_words': self.stopwords
                }
            }
        }
        items = self.es.search(doc_type='item', body=b, fields='', size=top_n)
        return list(i['_id'] for i in items['hits']['hits'])

    def process(self, tup):
        if tup.stream == 'control':
            action, user = tup.values
            if action == 'connect':
                self.connections[user] = int(time.time())
            elif action == 'disconnect':
                del self.connections[user]
        elif tup.stream == 'default':
            user, paths = tup.values
            if user in self.connections:
                log('[MorelikethisBolt] Incoming: %s' % user)
                recommendations = self.recommend(paths)
                paths = list(set(paths))[:10]
                emit([user, paths, recommendations])
def newsearch(query_word):
    result = []
    es = Elasticsearch()
    query1 = {"query": {"wildcard": {"name": {"value": "*" + query_word + "*"}}}}
    res = es.search(index="urban", body=query1)
    if res['hits']['total'] == 0:
        res = es.search(index="champ", body=query1)
        if res['hits']['total'] == 0:
            return 0
    ret = res['hits']['hits']
    temp = defaultdict(int)
    items = []
    for item in ret:
        ids = item['_source']['business_id']
        query2 = {"query": {"match": {"business_id": ids}}}
        res = es.search(index="alchem", body=query2)
        for item in res['hits']['hits'][0]['_source']['word_freq']:
            items.append(item)
            temp[item['text'].encode('utf-8')] += 1
    words = []
    for item in items:
        t = {}
        scale = 1
        if 'sentiment' not in item:
            continue
        elif 'type' in item['sentiment']:
            if item['sentiment']['type'] == 'positive':
                scale = 1.75
                t['color'] = (0, 255, 0)
            elif item['sentiment']['type'] == 'negative':
                scale = 1.25
                t['color'] = (255, 0, 0)
        elif item['sentiment'] == 'positive':
            scale = 1.75
            t['color'] = (0, 255, 0)
        elif item['sentiment'] == 'negative':
            scale = 1.25
            t['color'] = (255, 0, 0)
        elif item['sentiment'] == 'neutral':
            t['color'] = (0, 0, 255)
        else:
            t['color'] = (128, 128, 128)
        t['tag'] = item['text'].encode('utf-8')
        t['size'] = int(math.ceil(temp[item['text']] * float(item['relevance']) * 30 * scale))
        words.append(t)
    create_tag_image(words, 'static/cloud_large.jpg', size=(900, 600), fontname='Philosopher')
class ElasticWrapper:
    def __init__(self):
        self.es = Elasticsearch(["http://mixednode1:9200"], use_ssl=False)

    def insert(self, docs, index, docs_type):
        total = len(docs)
        i = 0
        index_doc = []
        for doc in docs:
            i += 1
            index_doc.append({"_index": index, "_type": docs_type, "_id": doc["id"], "_source": doc})
            if len(index_doc) == 1000:
                logging.info("indexing %s/%s" % (i, total))
                helpers.bulk(self.es, index_doc)
                index_doc = []
        if len(index_doc) != 0:
            logging.info("indexing %s/%s" % (i, total))
            helpers.bulk(self.es, index_doc)

    def get_one_tweet(self):
        res = self.es.search(index=TWEETS, body={"query": {"match_all": {}}})
        result = res['hits']['hits'][0]["_source"]
        return result

    def get_day_tweets(self, day, offset):
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        to_date = day.strftime("%Y-%m-%dT23:59:59Z")
        res = self.es.search(index=TWEETS,
                             body={"query": {"range": {"created_at": {"gte": from_date, "lte": to_date}}},
                                   "size": 1000, "from": offset})
        result = res['hits']['hits']
        result = list(map((lambda x: x['_source']), result))
        return result

    def tweets_count_for_day(self, day):
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        to_date = day.strftime("%Y-%m-%dT23:59:59Z")
        res = self.es.search(index=TWEETS,
                             body={"query": {"range": {"created_at": {"gte": from_date, "lte": to_date}}},
                                   "size": 0})
        result = res['hits']['total']
        return result

    def articles_count_from(self, day):
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        res = self.es.search(index=DW_NEWS,
                             body={"query": {"range": {"created_at": {"gte": from_date}}}, "size": 0})
        result = res['hits']['total']
        return result

    def get_articles_from(self, day, offset):
        from_date = day.strftime("%Y-%m-%dT00:00:00Z")
        res = self.es.search(index=DW_NEWS,
                             body={"query": {"range": {"created_at": {"gte": from_date}}},
                                   "size": 1000, "from": offset})
        result = res['hits']['hits']
        result = list(map((lambda x: x['_source']), result))
        return result
class ESHelper(object):
    def __init__(self, host, port):
        self.es = Elasticsearch(host=host, port=port)
        self.logger = logging.getLogger(__name__)

    @staticmethod
    def _build_agg_query(typ):
        """helper method to build aggregation query for sensors"""
        query = config.AVG_QUERY
        query['aggs']['per_day']['aggs'] = {
            'avg_' + config.INDEXES[typ]['avg_field']: {
                'avg': {
                    'field': config.INDEXES[typ]['avg_field']
                }
            }
        }
        return query

    @staticmethod
    def _build_latest_query(typ):
        """helper method to build latest record from an index based on timestamp field"""
        return {
            "query": {
                "match_all": {}
            },
            "size": 1,
            "sort": [
                {
                    config.INDEXES[typ]['latest_field']: {
                        "order": "desc"
                    }
                }
            ]
        }

    def get_data(self):
        """get aggregated sensor reading data based on the request Data filter parameter"""
        typ = request.args['Data']
        results = self.es.search(index=config.INDEXES[typ]['index'], body=self._build_agg_query(typ))
        data = []
        for bucket in results['aggregations']['per_day']['buckets']:
            dt = datetime.strptime(bucket['key_as_string'], '%Y-%m-%dT%H:%M:%S.%fZ')
            fld = config.INDEXES[typ]['avg_field']
            if bucket['avg_' + fld]:
                data.append({'x': dt.strftime('%Y-%m-%d'), 'y': bucket['avg_' + fld]['value']})
        return jsonify({'result': [data, ], 'date': True})

    def get_latest_reading(self, typ):
        """return latest reading from given sensor based on timestamp field"""
        results = self.es.search(index=config.INDEXES[typ]['index'], body=self._build_latest_query(typ))
        return jsonify(results['hits']['hits'][0]['_source'])
class ElasticSearching:
    def __init__(self):
        self.es = Elasticsearch([{'host': '210.107.192.201', 'port': 9200}])

    def search(self, query, scheme, alpha, beta, gamma):
        #content = query.replace(r"/", ',')
        content = query
        token = content.split(' ')
        content = [x for idx, x in enumerate(token) if not idx == 0]
        content = ' '.join(content)
        analyzer = 'my_DFR_analyzer'
        resTitle = self.es.search(index=scheme, doc_type='article', q='title:' + content,
                                  analyzer=analyzer, size=1000)
        resAbstract = self.es.search(index=scheme, doc_type='article', q='Abstract:' + content,
                                     analyzer=analyzer, size=1000)
        resBody = self.es.search(index=scheme, doc_type='article', q='body:' + content,
                                 analyzer=analyzer, size=1000)
        v = pd.DataFrame()
        l = pd.DataFrame()
        for entry in resTitle['hits']['hits']:
            pmcid = entry['_source']['pmcid']
            score = entry['_score']
            l = l.append(pd.DataFrame({'pmcid': [pmcid], 'title': [score]}))
        v = l
        l = pd.DataFrame()
        for entry in resAbstract['hits']['hits']:
            pmcid = entry['_source']['pmcid']
            score = entry['_score']
            l = l.append(pd.DataFrame({'pmcid': [pmcid], 'abstract': [score]}))
        v = pd.merge(v, l, how='outer', on=['pmcid'])
        l = pd.DataFrame()
        for entry in resBody['hits']['hits']:
            pmcid = entry['_source']['pmcid']
            score = entry['_score']
            l = l.append(pd.DataFrame({'pmcid': [pmcid], 'body': [score]}))
        v = pd.merge(v, l, how='outer', on=['pmcid'])
        v = v.fillna(0)
        v['score'] = alpha * v['title'] + beta * v['abstract'] + gamma * v['body']
        return v.ix[:, ['pmcid', 'score']]

    def test(self):
        with open('summary2.csv', 'rb') as csvfile:
            reader = csv.DictReader(csvfile)
            for idx, item in enumerate(reader):
                query = item['summary'].replace(r"/", ',')
                abstractQuery = {"abstract": query}
                res = self.es.search(index="trec", doc_type="BM25", q=query,
                                     analyzer="my_BM25_analyzer", size=100)
                for doc in res['hits']['hits']:
                    text = '{0},{1}'.format(doc['_source']['pmcid'], doc['_score'])
                    print text
def elasticsearch(query_phrase, index_name):
    # Query Elasticsearch to see what URLs contain this query
    #result = es.search(index="movie_db", body={'query': {'match': {'description': 'CIA'}}})
    from elasticsearch import Elasticsearch
    es = Elasticsearch(['52.88.228.98'])
    # Query Elasticsearch to see what URLs contain this query
    res = es.search(index_name, q=query_phrase)
    # Calculate hits out of the total amount
    Hits = res['hits']['total']
    # Find the total amount via a query that matches everything (a 'wiki' search);
    # modify later to a direct call that gets the total amount
    total_Hits = (es.search(index_name, q='wiki'))['hits']['total']
    # Output
    print(("Got %d Hits:" % res['hits']['total']) + ' out of total: ' + str(total_Hits))
    return res
def index(request):
    q_dict = request.GET
    if q_dict:
        es = Elasticsearch()
        response_list = []
        if 'color' in q_dict:
            color_to_parse = q_dict['color']
            colors = color_to_parse.split('_')
            for c in colors:
                try:
                    get_size = es.search(index=c)['hits']['total']
                    result_raw = es.search(index=c, filter_path=['hits.hits._*'], size=get_size)
                    #print "result_raw"
                    #print result_raw
                    result_list = result_raw["hits"]["hits"]
                except:
                    result_list = []
                for i in result_list:
                    response_list.append(i['_source'])
            response = {"trashList": response_list}
            print JsonResponse(response)
            return JsonResponse(response)
        elif 'route' in q_dict:
            index = q_dict['route'] + 'routing'
            print index
            try:
                result_raw = es.search(index=index, filter_path=['hits.hits._*'], size=1)
                result = result_raw['hits']['hits'][0]['_source']
            except:
                result = {}
            #print JsonResponse(response)
            return JsonResponse(result)
        elif 'routelist' in q_dict:
            index = q_dict['routelist'] + 'routinglist'
            print index
            try:
                result_raw = es.search(index=index, filter_path=['hits.hits._*'], size=1)
                result = result_raw['hits']['hits'][0]['_source']
            except:
                result = {}
            #print JsonResponse(response)
            return JsonResponse(result)
    else:
        #return HttpResponse("Hello, world. You're at the polls index.")
        template = loader.get_template('part1/index.html')
        #print "view.index requested"
        return HttpResponse(template.render(request))
def searchByParams(index, doc_type):
    es = Elasticsearch()
    start = request.args.get('from') or 0
    size = request.args.get('size') or 10
    if 'type' in request.args:
        query = es.search(index, doc_type,
                          body={'query': {'prefix': {request.args['type']: request.args['q']}}},
                          from_=start, size=size)
    else:
        query = es.search(index, doc_type, q=request.args.get('q'),
                          default_operator='AND', size=size, from_=start)
    return jsonify(query['hits'])
def search():
    es = Elasticsearch()
    query1 = {"query": {"match": {"city": "Urbana"}}}
    query2 = {"query": {"match": {"city": "Champaign"}}}
    res1 = es.search(index="business", body=query1, size=300)
    res2 = es.search(index="business", body=query2, size=400)
    print("Got %d Hits:" % res1['hits']['total'])
    print("Got %d Hits:" % res2['hits']['total'])
    urban = []
    champ = []
    for hit in res1['hits']['hits']:
        urban.append(hit['_source']['business_id'])
    for hit in res2['hits']['hits']:
        champ.append(hit['_source']['business_id'])
    reviews = []
    for ids in urban:
        query = {"query": {"match": {"business_id": ids}}}
        res = es.search(index="reviews", body=query)
        count = res['hits']['total']
        #print("Got %d Hits:" % res['hits']['total'])
        res = es.search(index="reviews", body=query, size=count + 1)
        for hit in res['hits']['hits']:
            reviews.append(hit['_source'])
    print len(reviews)
    for ids in champ:
        query = {"query": {"match": {"business_id": ids}}}
        res = es.search(index="reviews", body=query)
        count = res['hits']['total']
        #print("Got %d Hits:" % res['hits']['total'])
        res = es.search(index="reviews", body=query, size=count + 1)
        for hit in res['hits']['hits']:
            reviews.append(hit['_source'])
    print len(reviews)
    temp = []
    for item in reviews:
        temp.append(item['business_id'])
    print len(temp), len(set(temp))
    save_reviews(reviews)
def get_judge_res(judge_image_dir):
    es = Elasticsearch(esport)
    judge_image_dir = 'judgeresult:' + judge_image_dir
    search_size = 20
    search_offset = 0
    print request.args
    try:
        if 'offset' in request.args:
            search_offset = int(request.args.get('offset'))
        if 'size' in request.args:
            search_size = int(request.args.get('size'))
        res_index = es.search(index=judge_image_dir, size=search_size, from_=search_offset)
    except:
        del es
        return 'Error: index does not exist\n'
    res_lst = []
    for item in res_index['hits']['hits']:
        res_lst.append(item['_source']['file'])
    res_dict = {
        'total': res_index['hits']['total'],
        'file_list': res_lst,
        'from_': search_offset,
        'size': len(res_index['hits']['hits'])
    }
    json_res = json.dumps(res_dict)
    del es
    return json_res
def show(ctx, path, order):
    router = Router(open(ctx.obj['CONFIG']))
    route = router.match(path)
    logging.debug("Matched route: %s" % route)
    if not route:
        print 'No queries matched'
        return
    es = Elasticsearch(hosts=route.get('elasticsearch_url'))
    request_body = {}
    for non_mandatory_key in ['sort', 'query']:
        value = route.get(non_mandatory_key)
        if value:
            request_body[non_mandatory_key] = value
    if order == 'asc':
        request_body['sort'] = {'@timestamp': 'asc'}
    elif order == 'desc':
        request_body['sort'] = {'@timestamp': 'desc'}
    elif order:
        click.echo("Unknown order format: %s" % order, err=True)
        return 1
    logging.debug("Query: %s" % (request_body,))
    result = es.search(index=route.get('index'), doc_type=None, body=request_body)
    hits = result['hits']['hits']
    template = Template(route.get("format", "{{ __at_timestamp }} {{ message }}"))
    for hit in hits:
        doc = hit['_source']
        doc['__at_timestamp'] = doc.get('@timestamp')
        print template.render(doc)
def reindex(old_index, new_index, s):
    '''
    Function to reindex by scan and scroll combined with a bulk insert.
    old_index is the index to take docs from, new_index is the one the docs go to.
    s is the size of each bulk insert - should set this as high as the RAM
    on the machine you run it on allows. 500-1000 seems reasonable for t2.medium.
    '''
    def create_bulk_insert_string(results, index):
        ret_str = ''
        for hit in results:
            ret_str += '{"create":{"_index":"' + index + '","_type":"variant","_id":"' + hit['_id'] + '"}}\n'
            ret_str += json.dumps(hit) + '\n'
        return ret_str

    es = Elasticsearch('localhost:9200')
    s = es.search(index=old_index, body='{"query": {"match_all": {}}}',
                  search_type='scan', scroll='5m', size=s)
    curr_done = 0
    try:
        while True:
            # do this loop until failure
            r = es.scroll(s['_scroll_id'], scroll='5m')
            this_l = [res['_source'] for res in r['hits']['hits']]
            this_str = create_bulk_insert_string(this_l, new_index)
            es.bulk(body=this_str, index=new_index, doc_type='variant')
            curr_done += len(this_l)
    except:
        print('{} documents inserted'.format(curr_done))
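# Usage sketch (not part of the original source): how the reindex() function above might be
# invoked, assuming a local Elasticsearch node that still supports the old scan/scroll
# search_type and an existing source index. The index names and batch size are placeholders.
if __name__ == '__main__':
    # copy every document from the old index into the new one, 1000 docs per bulk request
    reindex('variants_v1', 'variants_v2', 1000)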
def search(query='', field='q1', _operator='and',
           sort=[('_score', 'desc'), ('quoted_by', 'desc')],
           _filter={}, size=1000, _id=False):
    es = Elasticsearch([elasticsearch_setting])
    if query:
        es_query = {
            'match': {
                field: {
                    'query': query,
                    'operator': _operator,
                    'minimum_should_match': '85%'
                }
            }
        }
    else:
        es_query = {"match_all": {}}
    body = {
        "query": {
            "filtered": {
                "query": es_query,
                "filter": _filter
            }
        },
        'size': size
    }
    sort_item = _build_sort(sort)
    if sort_item:
        body.update({'sort': sort_item})
    logger.debug(body)
    result = es.search(index='qwerty', body=body, _source=True, timeout=55)
    if _id:
        return (x for x in result['hits']['hits'])
    return (x['_source'] for x in result['hits']['hits'])
def iter_elastic_query(instance, index, field, subfield=None):
    es = Elasticsearch(instance)
    # initial search
    resp = es.search(index, body={"query": {"match_all": {}}}, scroll='5m')
    scroll_id = resp.get('_scroll_id')
    if scroll_id is None:
        return
    while True:
        for hit in resp['hits']['hits']:
            s = hit['_source']
            try:
                if subfield is not None:
                    print(s[field][subfield])
                    yield s[field][subfield]
                else:
                    yield s[field]
            except ValueError:
                logging.warning("Unable to process row: %s" % str(hit))
        scroll_id = resp.get('_scroll_id')
        # end of scroll
        if scroll_id is None or not resp['hits']['hits']:
            break
        # fetch the next page of results before looping again
        resp = es.scroll(scroll_id=scroll_id, scroll='5m')
def search_index(index, searchdict, start=0, host='127.0.0.1', port=9200):
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-body.html
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html
    '''
    #print searchdict
    pprint(searchdict)
    #import pdb; pdb.set_trace()
    thisurl = 'http://%s:%s/%s/_search' % (host, port, path)
    r = requests.get(thisurl, data=json.dumps(searchdict), verify=False)
    print r.reason
    '''
    maxcount = 10000
    es = Elasticsearch()
    res = es.search(index=index, body=searchdict, size=maxcount, scroll='1m')
    # hits.total is the total count of matches, but not the amount returned
    #total = res['hits']['total']
    scroll = es.scroll(scroll_id=res['_scroll_id'])
    res['hits']['hits'] += scroll['hits']['hits']
    return res
class CategoryModule(object):
    def __init__(self, index, doc_type, host='172.19.1.77', user='******', passwd='123456', port=3306):
        self.time_str = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        self.redis_client = PyBfdRedis.newClient('192.168.40.37:26379', 'bfdopen')
        self.es_client = Elasticsearch([{'host': '192.168.61.89', 'port': 9200}], timeout=1000)
        self.mysql_client = InserMysql(host=host, user=user, passwd=passwd, db=db, port=port)
        self.index = index
        self.doc_type = doc_type

    def __topn(self, items, n):
        result = {}
        total = 0
        for item in sorted(items, key=lambda x: x[1], reverse=True)[:n]:
            total += item[1]
            result.update({item[0]: item[1]})
        return (result, total)

    def __output2redis(self, key, value):
        if value['detail']:
            print key, PyBfdRedis.set(self.redis_client, key, json.dumps(value, ensure_ascii=False))

    def getNumWithTime(self, from_time, to_time):
        query = {"query": {"filtered": {"query": {"match_all": {}},
                                        "filter": {"range": {"hbase_time": {"from": from_time,
                                                                            "to": to_time}}}}},
                 "size": 0}
        rest = self.es_client.search(index='%s' % self.index, doc_type='%s' % self.doc_type,
                                     body=query, timeout=100000)
        # size is 0, so only the hit count is of interest
        return rest['hits']['total']
def GET(self):
    es = Elasticsearch(conf['fulltext']['serviceUrl'])
    if web.input(wildcard_query=None).wildcard_query:
        query = {
            "wildcard": {
                "_all": web.input().query
            }
        }
        self.set_wildcard_query(True)
    else:
        query = {
            "multi_match": {
                "query": web.input().query,
                "operator": "and",
                "fields": ["text", "pageName", "tags"]
            }
        }
        self.set_wildcard_query(False)
    res = es.search(index=conf['fulltext']['indexName'],
                    body={"query": query, "fields": ["pageName", "path", "fsPath", "text"]})
    rows = []
    for a in res['hits']['hits']:
        fields = a['fields']
        fs_path = os.path.normpath('%s/%s.md' % (self.data_dir, fields['path'][0]))
        page_chapters, h1 = extract_description(fs_path)
        rows.append({
            'h1': h1 if h1 else fields['path'][0],
            'file': fields['path'][0],
            'chapters': page_chapters
        })
    values = dict(query=web.input().query, ans=rows)
    return self._render('search.html', values)
def collect(self, raw_output_file='data_notDienGiaDung.csv'):
    elastic_client = None
    elastic_client = Elasticsearch(hosts=[{'host': '10.3.70.221', 'port': 9200}])
    if elastic_client.ping():
        print('Yay Connected')
    else:
        print('Awww it could not connect!')

    # Exclude documents that phrase-match any of these consumer-electronics terms or brands.
    excluded_phrases = [
        "điện tử", "đèn", "tivi", "tủ lạnh", "máy giặt", "phòng ngủ", "phòng bếp",
        "nhà bếp", "điều hoà", "quạt", "lò vi sóng", "samsung", "sony", "lg",
        "toshiba", "Sunhouse", "Kangaroo", "Bluestone", "Asanzo", "gia đình",
        "sinh hoạt", "tiêu dùng",
    ]
    search_object = {
        "size": 10000,
        "query": {
            "bool": {
                "must_not": [
                    {
                        "multi_match": {
                            "query": phrase,
                            "fields": ["_all"],
                            "type": "phrase",
                        }
                    }
                    for phrase in excluded_phrases
                ]
            }
        }
    }
    search_object = json.dumps(search_object)
    # res = elastic_client.search(index=-7823841914345959386)
    res = elastic_client.search(index='urldata_2020_02', body=search_object)
    data = res['hits']['hits']
    result = pd.DataFrame()
    for item in data:
        result = result.append(item['_source'], ignore_index=True)
    result.to_csv(raw_output_file)
"gte": last30.isoformat(), "lte": t.isoformat() } } }], "must_not": [], "should": [] } }, "size": 0, "aggs": { "dh": { "date_histogram": { "field": "harvest_date", "interval": "day" } } } } rv = es.search(**{"index": "stats", "doc_type": "search", "body": stats_query}) min_days = 31 for b in rv["aggregations"]["dh"]["buckets"]: min_days = min( min_days, (t - datetime.strptime(b["key_as_string"], "%Y-%m-%dT%H:%M:%S.%fZ")).days) print(min_days)
class ElasticSearchSeqSource(base.DataSource):
    """
    Data source which executes arbitrary queries on ElasticSearch

    This is the tabular reader: will return dataframes. Nested return items
    will become dict-like objects in the output.

    Parameters
    ----------
    query: str
        Query to execute. Can either be in Lucene single-line format, or a
        JSON structured query (presented as text)
    qargs: dict
        Further parameters to pass to the query, such as set of indexes to
        consider, filtering, ordering. See
        http://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
    es_kwargs: dict
        Settings for the ES connection, e.g., a simple local connection may be
        ``{'host': 'localhost', 'port': 9200}``. Other keywords to the Plugin
        that end up here and are material:

        scroll: str
            how long the query is live for, default ``'100m'``
        size: int
            the paging size when downloading, default 1000.
    metadata: dict
        Extra information for this source.
    """
    container = 'python'

    def __init__(self, query, qargs, es_kwargs, metadata):
        self._query = query
        self._qargs = qargs
        self._scroll = es_kwargs.pop('scroll', '100m')
        self._size = es_kwargs.pop('size', 1000)  # default page size
        self._es_kwargs = es_kwargs
        self._dataframe = None
        self.es = Elasticsearch([es_kwargs])  # maybe should be (more) global?
        super(ElasticSearchSeqSource, self).__init__(container=self.container,
                                                     metadata=metadata)

    def _run_query(self, size=None):
        if size is None:
            size = self._size
        try:
            q = json.loads(self._query)
            if 'query' not in q:
                q = {'query': q}
            s = self.es.search(body=q, size=size, scroll=self._scroll, **self._qargs)
        except (JSONDecodeError, TypeError):
            s = self.es.search(q=self._query, size=size, scroll=self._scroll, **self._qargs)
        sid = s['_scroll_id']
        scroll_size = s['hits']['total']
        while scroll_size > len(s['hits']['hits']):
            page = self.es.scroll(scroll_id=sid, scroll=self._scroll)
            sid = page['_scroll_id']
            s['hits']['hits'].extend(page['hits']['hits'])
        self.es.clear_scroll(scroll_id=sid)
        return s

    def _get_schema(self, retry=2):
        """Get schema from first 10 hits or cached dataframe"""
        return base.Schema(datashape=None, dtype=None, shape=None,
                           npartitions=1, extra_metadata={})

    def _get_partition(self, _):
        """Downloads all data

        ES has a hard maximum of 10000 items to fetch. Otherwise need to
        implement paging, known to ES as "scroll"
        https://stackoverflow.com/questions/41655913/elk-how-do-i-retrieve-more-than-10000-results-events-in-elastic-search
        """
        results = self._run_query()
        return [r['_source'] for r in results['hits']['hits']]
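# Usage sketch (not part of the original source): instantiating ElasticSearchSeqSource above
# directly, assuming a local cluster and an index named 'logs'. The Lucene-style query string
# and the qargs/es_kwargs values are placeholders.
source = ElasticSearchSeqSource(
    query='status:200',                      # Lucene single-line query; a JSON body also works
    qargs={'index': 'logs'},                 # passed through to es.search()
    es_kwargs={'host': 'localhost', 'port': 9200, 'size': 500, 'scroll': '10m'},
    metadata={},
)
records = source._get_partition(0)           # list of _source dicts, one per hit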
#!/usr/bin/python
import codecs
import csv
import sys
from elasticsearch import Elasticsearch

reload(sys)
sys.setdefaultencoding("UTF-8")

keywords = []
with codecs.open(sys.argv[1], 'r', 'utf-8') as data_file:
    data = csv.reader(data_file, delimiter=";")
    for line in data:
        if line:
            keywords.append(line[0])

es = Elasticsearch("https://search-iprice-production-3-f3orjkipgmnoxt4qzf6zuervfu.ap-southeast-1.es.amazonaws.com:443")
query = '{"size": 0, "query" : { "term" : { "masterbrain" : "%s" } }}'
for k in keywords:
    ret = es.search("product_*_20160121", "product", query % k.lower())
    hits = ret['hits']['total']
    print k, ";", hits
def all_host(request):
    """
    List all hosts: query every host IP from ES, fetch the abnormal host IPs from the
    database, and mark each host with a status (healthy or abnormal) based on those IPs.
    :param request:
    :return:
    """
    # Response data: host list, total host count, abnormal host count
    data = []
    all_total = 0
    unhealth_total = 0
    sys_ips = []
    # Collect the abnormal hosts
    ips = []
    items, total = get_unhealth_host()
    if total:
        for item in items:
            ips.append(item['ip'])
    # First try to fetch the data from redis
    # try:
    #     cache_data = red.get('all_host')
    #     if cache_data:
    #         data = eval(cache_data)[0]
    #         all_total = eval(cache_data)[1]
    #         unhealth_total = eval(cache_data)[2]
    #         for per_data in data:
    #             sys_ips.append(per_data['ip'])
    #
    #         # Present in the database but missing from syslog
    #         if total:
    #             for item in items:
    #                 if item['ip'] not in sys_ips:
    #                     data.append({'ip': item['ip'], 'name': item['hostname'], 'status': 1})
    #                     unhealth_total += 1
    #                     all_total += 1
    #
    #         return JsonResponse({'code': 200, 'message': 'OK', 'data': data,
    #                              'all_total': all_total, 'unhealth_total': unhealth_total})
    # except Exception as e:
    #     logger.error(e)

    # Filter indices by the 'syslog' prefix and today's date
    today = datetime.now().strftime('%Y%m%d')
    try:
        # Get all indices
        global es_ip, es_port
        MAIN_URL = "http://" + es_ip + ":" + str(es_port)
        # MAIN_URL = "http://192.168.1.243:9200"
        rs = RequestSimulator(MAIN_URL)
        get_data = {'pretty': ''}
        resp = rs.get(url='/_cat/indices', params=get_data, ignore_http_error=True)
        res = resp.read().decode('utf-8')
        lines = res.split('\n')
        lines = lines[:-1]
        indexs = []
        for line in lines:
            temp_list = line.split()
            temp_str = temp_list[2]
            if temp_str.startswith('syslog') and temp_str.endswith(today):
                indexs.append(temp_str)
        # Total number of indices
        all_total = len(indexs)
        # Iterate over the indices and pull out each hostname
        # Connect to ES
        es = Elasticsearch(es_server_ip_port)
        body = {"query": {"match_all": {}}, "size": 1}
        # Number of abnormal hosts
        unhealth_total = 0
        sys_ips = []
        for index in indexs:
            # Read from ES
            result = es.search(index=index, body=body, ignore_unavailable=True)
            ip = result['hits']['hits'][0]['_source']['type']  # host IP
            sys_ips.append(ip)
            name = result['hits']['hits'][0]['_source']['host']
            status = 0  # 0 means the host is healthy, 1 means it is abnormal
            if ip in ips:
                status = 1
                unhealth_total += 1
            data.append({'ip': ip, 'name': name, 'status': status})
        # Present in the database but missing from syslog
        if total:
            for item in items:
                if item['ip'] not in sys_ips:
                    data.append({
                        'ip': item['ip'],
                        'name': item['hostname'],
                        'status': 1
                    })
                    unhealth_total += 1
                    all_total += 1
        # Cache the result in redis
        # red.setex('all_host', [data, all_total, unhealth_total], 30)
    except Exception as e:
        logger.error(e)
    for per_data in data:
        try:
            if per_data['status'] == 1:
                for per_item in items:
                    if per_item['ip'] == per_data['ip']:
                        info = eval(per_item['info'])
                        per_data['file_error_path'] = info.get('file_error_path', '')
                        per_data['file_error_hash'] = info.get('file_error_hash', '')
                obj = BlackboxHost.objects.filter(hostip=per_data['ip'])
                if obj.exists():
                    is_protect = obj[0].status
                else:
                    is_protect = 1
                per_data['is_protect'] = is_protect
        except Exception as e:
            logger.error(e)
            per_data['status'] = 1
        # is_block=1 means not blocked, 0 means blocked
        per_data['is_block'] = 1
    return JsonResponse({
        'code': 200,
        'message': 'OK',
        'data': data,
        'all_total': all_total,
        'unhealth_total': unhealth_total
    })
def syslog_incr_count(request):
    """
    How many syslog entries were produced in each 4-hour window.
    :param request:
    :return:
    """
    # Data to return
    result_list = []
    es = Elasticsearch(jc.es_server_ip_port)  # connect to ES
    now = datetime.now()
    now_str = datetime.strftime(now, '%Y-%m-%d %H:%M:%S')  # 2018-03-19 17:28:40
    # e.g. round 18 minutes down to 10, 29 down to 20, 40 stays at 40
    # minute = str((now.minute//10)*10)
    # if minute == '0':
    #     minute = '00'
    # Build the string for a round time (10 min, 20 min, 30 min, ...)
    # new_time = now_str[:10] + 'T' + now_str[11:14] + '00:00.000Z'  # time format ES needs, e.g. 2018-03-20T14:00:00.000Z
    return_time = datetime.strptime(now_str[:-5] + '00:00', '%Y-%m-%d %H:%M:%S')  # convert '2018-03-20 14:00:00' to a datetime
    # pre_time = new_time + '||-12h'  # the previous 12 hours, e.g. 2018-03-20T14:00:00.000Z||-10m
    # Because the history needs to be shown, also take the five preceding windows
    for i in range(6):
        # End time of the query, e.g. datetime.datetime(2018, 3, 22, 19, 0, 0)
        new_time = return_time - timedelta(hours=4 * i)
        # Start time of the query, e.g. datetime.datetime(2018, 3, 22, 15, 0, 0)
        pre_time = return_time - timedelta(hours=4 * (i + 1))
        # Time format ES needs, e.g. 2018-03-22T19:00:00.000Z; times stored in ES are offset by -8 hours
        new_time_for = datetime.strftime((new_time - timedelta(hours=8)), "%Y-%m-%dT%H:%M:%S.000Z")
        # Time format ES needs, e.g. 2018-03-20T18:50:00.000Z; times stored in ES are offset by -8 hours
        pre_time_for = datetime.strftime(pre_time - timedelta(hours=8), "%Y-%m-%dT%H:%M:%S.000Z")
        body = {
            "query": {
                "bool": {
                    "must": [
                        {"match_phrase": {"_type": "sysLog"}},  # rules that must match
                    ],
                    "filter": {
                        "range": {
                            "@timestamp": {
                                "gte": pre_time_for,
                                "lt": new_time_for
                            }
                        }
                    }  # time range filter
                }
            },
        }
        try:
            result = es.search(index=['syslog*'], body=body,
                               ignore_unavailable=True)['hits']['total']  # read from ES
        except Exception as e:
            logger.error(e)
            result = 0
        per_dict = {}
        per_dict[str(result)] = [
            datetime.strftime(pre_time, '%d日%H时'),
            datetime.strftime(new_time, '%d日%H时')
        ]
        result_list.append(per_dict)
    return JsonResponse({'code': 200, 'total': result_list})
class Test(BaseTest):
    def init(self):
        self.elasticsearch_url = self.get_elasticsearch_url()
        print("Using elasticsearch: {}".format(self.elasticsearch_url))
        self.es = Elasticsearch([self.elasticsearch_url])
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.getLogger("elasticsearch").setLevel(logging.ERROR)
        self.modules_path = os.path.abspath(self.working_dir + "/../../../../module")
        self.filebeat = os.path.abspath(self.working_dir + "/../../../../filebeat.test")
        self.index_name = "test-filebeat-modules"
        body = {"transient": {"script.max_compilations_rate": "2000/1m"}}
        self.es.transport.perform_request('PUT', "/_cluster/settings", body=body)

    @parameterized.expand(load_fileset_test_cases)
    @unittest.skipIf(
        not INTEGRATION_TESTS,
        "integration tests are disabled, run with INTEGRATION_TESTS=1 to enable them.")
    @unittest.skipIf(
        os.getenv("TESTING_ENVIRONMENT") == "2x",
        "integration test not available on 2.x")
    def test_fileset_file(self, module, fileset, test_file):
        self.init()
        # generate a minimal configuration
        cfgfile = os.path.join(self.working_dir, "filebeat.yml")
        self.render_config_template(
            template_name="filebeat_modules",
            output=cfgfile,
            index_name=self.index_name,
            elasticsearch_url=self.elasticsearch_url,
        )
        self.run_on_file(module=module, fileset=fileset, test_file=test_file, cfgfile=cfgfile)

    def run_on_file(self, module, fileset, test_file, cfgfile):
        print("Testing {}/{} on {}".format(module, fileset, test_file))
        try:
            self.es.indices.delete(index=self.index_name)
        except:
            pass
        self.wait_until(lambda: not self.es.indices.exists(self.index_name))
        cmd = [
            self.filebeat, "-systemTest",
            "-e", "-d", "*", "-once",
            "-c", cfgfile,
            "-E", "setup.ilm.enabled=false",
            "-modules={}".format(module),
            "-M", "{module}.*.enabled=false".format(module=module),
            "-M", "{module}.{fileset}.enabled=true".format(module=module, fileset=fileset),
            "-M", "{module}.{fileset}.var.input=file".format(module=module, fileset=fileset),
            "-M", "{module}.{fileset}.var.paths=[{test_file}]".format(
                module=module, fileset=fileset, test_file=test_file),
            "-M", "*.*.input.close_eof=true",
        ]
        # Based on the convention that if a name contains -json the json format is needed. Currently used for LS.
        if "-json" in test_file:
            cmd.append("-M")
            cmd.append("{module}.{fileset}.var.format=json".format(module=module, fileset=fileset))

        output_path = os.path.join(self.working_dir)
        output = open(os.path.join(output_path, "output.log"), "ab")
        output.write(bytes(" ".join(cmd) + "\n", "utf-8"))

        # Use a fixed timezone so results don't vary depending on the environment.
        # Don't use UTC to avoid hiding that non-UTC timezones are not being converted as needed;
        # this can happen because UTC tends to be the default timezone in date parsers when no
        # other timezone is specified.
        local_env = os.environ.copy()
        local_env["TZ"] = 'Etc/GMT+2'
        subprocess.Popen(cmd, env=local_env, stdin=None, stdout=output,
                         stderr=subprocess.STDOUT, bufsize=0).wait()

        # Make sure index exists
        self.wait_until(lambda: self.es.indices.exists(self.index_name))
        self.es.indices.refresh(index=self.index_name)
        # Loads the first 100 events to be checked
        res = self.es.search(index=self.index_name,
                             body={
                                 "query": {"match_all": {}},
                                 "size": 100,
                                 "sort": {"log.offset": {"order": "asc"}}
                             })
        objects = [o["_source"] for o in res["hits"]["hits"]]
        assert len(objects) > 0
        for obj in objects:
            assert obj["event"]["module"] == module, "expected event.module={} but got {}".format(
                module, obj["event"]["module"])
            assert "error" not in obj, "not error expected but got: {}".format(obj)

            if (module == "auditd" and fileset == "log") \
                    or (module == "osquery" and fileset == "result"):
                # There are dynamic fields that are not documented.
                pass
            else:
                self.assert_fields_are_documented(obj)
        self._test_expected_events(test_file, objects)

    def _test_expected_events(self, test_file, objects):
        # Generate expected files if GENERATE env variable is set
        if os.getenv("GENERATE"):
            with open(test_file + "-expected.json", 'w') as f:
                # Flatten and clean up objects.
                # This makes sure that when generated on different machines / versions the expected.json stays the same.
                for k, obj in enumerate(objects):
                    objects[k] = self.flatten_object(obj, {}, "")
                    clean_keys(objects[k])
                json.dump(objects, f, indent=4, separators=(',', ': '), sort_keys=True)

        with open(test_file + "-expected.json", "r") as f:
            expected = json.load(f)

        assert len(expected) == len(objects), "expected {} events to compare but got {}".format(
            len(expected), len(objects))

        for ev in expected:
            clean_keys(ev)
            found = False
            for obj in objects:
                # Flatten objects for easier comparing
                obj = self.flatten_object(obj, {}, "")
                clean_keys(obj)
                if ev == obj:
                    found = True
                    break
            assert found, "The following expected object was not found:\n {}\nSearched in: \n{}".format(
                pretty_json(ev), pretty_json(objects))