def query_articles(self, query, prefs):
    client = connections.get_connection()
    search = Search(using=client, index='articles')
    q = Q('bool', must=[Q('exists', field='watson_analyzed'),
                        Q('match', watson_success=True),
                        Q('match', body=query)])
    search = search.query(q)
    search.execute()
    documents = []
    for hit in search[:100]:
        if '#' not in hit.url and '?' not in hit.url:
            documents.append({
                'id': hit.meta.id,
                'title': hit.title,
                'body': hit.body,
                'url': hit.url,
                'score': hit.meta.score,
                'tone': dict(
                    joy=hit.tone.joy,
                    fear=hit.tone.fear,
                    sadness=hit.tone.sadness,
                    disgust=hit.tone.disgust,
                    anger=hit.tone.anger
                ),
                'top_image': hit.top_image
            })
    if len(documents) < 10:
        return documents
    else:
        return select_k_and_sort(documents, prefs)
def consensus(offset=60):
    """
    Check for 'eth.chain.new_head' messages and return the max number of
    clients that had the same head during the last `offset` seconds.
    """
    s = Search(client)
    # s = s.query(Q('match', message='eth.chain.new_head'))
    s = s.filter('exists', field='json_message.eth.chain.new_head.block_number')
    s = s.sort({'json_message.eth.chain.new_head.ts': {'order': 'desc', 'ignore_unmapped': 'true'}})
    response = s.execute()

    # Get latest block number
    x = max(hit['_source']['json_message']['eth.chain.new_head']['block_number']
            for hit in response.hits.hits)

    # By default, the buckets are ordered by their doc_count descending
    # s.aggs.bucket('by_block_hash', 'terms', field='json_message.eth.chain.new_head.block_hash', size=3)

    # Reach consensus around latest block number
    s = Search(client)
    s = s.filter(time_range_filter(field="json_message.eth.chain.new_head.ts", offset=offset))
    s.aggs.bucket('latest', 'range',
                  field='json_message.eth.chain.new_head.block_number',
                  ranges=[{"from": x - 1, "to": x + 1}]).bucket(
        'by_block_hash', 'terms',
        field='json_message.eth.chain.new_head.block_hash', size=3)
    # s = s[10:10]
    response = s.execute()
    # pprint(response)
    if response:
        return max(tag.doc_count
                   for tag in response.aggregations.latest.buckets[0].by_block_hash.buckets)
    else:
        return 0
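# Hedged sketch (an assumption, not from the source): a plausible implementation of the
# time_range_filter() helper called by consensus() above and tx_propagation() further
# down -- a range filter on the given timestamp field covering the last `offset`
# seconds. The real helper in the source project may build the filter differently.
from datetime import datetime, timedelta

from elasticsearch_dsl import Q


def time_range_filter(field, offset=60):
    since = datetime.utcnow() - timedelta(seconds=offset)
    return Q('range', **{field: {'gte': since.isoformat()}})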
def query_articles(self, query):
    client = connections.get_connection()
    search = Search(using=client)
    q = Q('match', body=query)
    search = search.query(q)
    search.execute()
    for hit in search:
        yield {
            'title': hit.title,
            'body': hit.body,
            'top_image': hit.top_image
        }
def interface_get_highlights():
    wiki_field = 'wiki_content'
    qb_field = 'qb_content'
    text = request.form['text']
    s = Search(index='qb')[0:20].query(
        'multi_match', query=text, fields=[wiki_field, qb_field])
    s = s.highlight(wiki_field).highlight(qb_field)
    results = list(s.execute())

    if len(results) == 0:
        highlights = {'wiki': [''], 'qb': [''], 'guess': ''}
    else:
        guessForEvidence = request.form['guessForEvidence']
        guessForEvidence = guessForEvidence.split("style=\"color:blue\">")[1].split("</a>")[0].lower()

        guess = None
        for index, item in enumerate(results):
            if item.page.lower().replace("_", " ")[0:25] == guessForEvidence:
                guess = results[index]
                break
        if guess is None:
            print("expanding search")
            s = Search(index='qb')[0:80].query(
                'multi_match', query=text, fields=[wiki_field, qb_field])
            s = s.highlight(wiki_field).highlight(qb_field)
            results = list(s.execute())
            for index, item in enumerate(results):
                if item.page.lower().replace("_", " ")[0:25] == guessForEvidence:
                    guess = results[index]
                    break
            if guess is None:
                highlights = {'wiki': [''], 'qb': [''], 'guess': ''}
                return jsonify(highlights)

        _highlights = guess.meta.highlight
        try:
            wiki_content = list(_highlights.wiki_content)
        except AttributeError:
            wiki_content = ['']
        try:
            qb_content = list(_highlights.qb_content)
        except AttributeError:
            qb_content = ['']
        highlights = {'wiki': wiki_content, 'qb': qb_content, 'guess': guess.page}
    return jsonify(highlights)
def autocomplete_institutions_titles():
    name = str(request.args.get('name'))
    if not name:
        return Response({'Autocomplete query requires a "?name={} parameter'}, status=500)
    page = int(request.args.get('page') or 1)
    size = int(request.args.get('size') or SIZE)
    start = (page - 1) * size
    s_title = Search(index=ELASTIC_INDEX).query(
        'match_phrase_prefix', name={'query': name, 'slop': 5})[start:start + size]
    res_title = s_title.execute()
    s_alias = Search(index=ELASTIC_INDEX).query(
        'match_phrase_prefix', other_names={'query': name, 'slop': 5})[start:start + size]
    res_alias = s_alias.execute()
    comb = list(set(get_names(res_alias) + get_names(res_title)))
    return Response(json.dumps(comb), status=200)
def search(self, **params):
    limit_cat = params.get('cat', "").strip()
    limit_forum = params.get('forum', "").strip()
    limit_count = int(params.get('count', 100))
    limit_size_min = human2bytes(params.get('min', "0b"))
    limit_size_max = human2bytes(params.get('max', "0b"))
    limit_wild = int(params.get('wild', 0))

    arg = params.get('query', '').strip()
    if not arg:
        arg = "hobbit"

    s = Search(using=es, index=ela_index)
    if limit_size_min:
        s = s.filter("range", size={'gte': limit_size_min})
    if limit_size_max:
        s = s.filter("range", size={'lte': limit_size_max})

    arg = arg.split(' ')
    if limit_wild:
        q = Q("wildcard", name="*" + arg.pop(0) + "*")
        for a in arg:
            q = q & Q("wildcard", name="*" + a + "*")
    else:
        q = Q("match", name=arg.pop(0))
        for a in arg:
            q = q & Q("match", name=a)

    if len(limit_cat):
        for a in limit_cat.split(' '):
            q = q & Q("match", category=a)
    if len(limit_forum):
        for a in limit_forum.split(' '):
            q = q & Q("match", forum=a)

    s = s.query(q)
    #cherrypy.log("query is "+str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    #cherrypy.log("query have "+str(size)+" elements")
    if size > limit_count:
        size = limit_count
    s = s.sort('-size')
    s = s.extra(size=size)
    r = s.execute()
    data = []
    for b in r:
        a = [b.id, b.size, b.name, b.category, b.forum,
             b.date[0] if b.date else '', b.hash]
        data.append(a)
    return {'data': data}
def search(self, **params):
    limit_author = params.get('author', "").strip()
    limit_title = params.get('title', "").strip()
    limit_count = int(params.get('count', 10))
    limit_wild = int(params.get('wild', 0))
    q = None

    if not limit_author and not limit_title:
        limit_title = "hobbit"

    s = Search(using=es, index=ela_index)

    arg = limit_title.split(' ')
    arg = [x for x in arg if x]
    if len(arg):
        if limit_wild:
            q = Q("wildcard", title="*" + arg.pop(0) + "*")
            for a in arg:
                q = q & Q("wildcard", title="*" + a + "*")
        else:
            q = Q("match", title=arg.pop(0))
            for a in arg:
                q = q & Q("match", title=a)

    arg = limit_author.split(' ')
    arg = [x for x in arg if x]
    if len(arg):
        for a in arg:
            if q:
                q = q & Q("match", author=a)
            else:
                q = Q("match", author=a)

    s = s.query(q)
    #cherrypy.log("query is "+str(s.to_dict()))
    r = s.execute()
    size = r.hits.total
    if size > limit_count:
        size = limit_count
    s = s.sort('-date')
    s = s.extra(size=size)
    r = s.execute()
    #cherrypy.log("result is "+str(r))
    data = []
    for b in r:
        a = [b.id, b.author, b.title, b.size, b.date]
        data.append(a)
    return {'data': data}
def reverse():
    try:
        lon = float(request.args.get('lon'))
        lat = float(request.args.get('lat'))
    except (TypeError, ValueError):
        lon = lat = None
    if not lat or not lon:
        abort(400, "missing 'lon' or 'lat': /?lon=2.0984&lat=48.0938")
    s = Search(es).index(INDEX).query(MatchAll()).extra(size=1).sort({
        "_geo_distance": {
            "coordinate": {
                "lat": lat,
                "lon": lon
            },
            "order": "asc"
        }})
    _type = request.args.get('type', None)
    if _type:
        s = s.query({'match': {'type': _type}})
    results = s.execute()
    if len(results.hits) < 1:
        notfound.debug('reverse: lat: {}, lon: {}, type: {}'.format(lat, lon, _type))
    debug = 'debug' in request.args
    data = to_geo_json(results, debug=debug)
    data = json.dumps(data, indent=4 if debug else None)
    response = Response(data, mimetype='application/json')
    cors(response)
    return response
def test_inner_hits_are_wrapped_in_response(data_client):
    s = Search(index='git')[0:1].query(
        'has_parent', parent_type='repo', inner_hits={}, query=Q('match_all'))
    response = s.execute()
    commit = response.hits[0]
    assert isinstance(commit.meta.inner_hits.repo, response.__class__)
    assert repr(commit.meta.inner_hits.repo[0]).startswith("<Hit(git/doc/elasticsearch-dsl-py): ")
def search(self, doc_type, query=""):
    """
    Execute search query and retrieve results

    :param doc_type: Type in ElasticSearch
    :param query: search query
    :return: list with results
    """
    results = []
    if type(query) in [str, unicode] and type(doc_type) == DocTypeMeta:
        q = Q("multi_match", query=query.lower(), fields=["title"])
        s = Search()
        s = s.using(self.client)
        s = s.index(self.index_name)
        s = s.doc_type(doc_type)
        s = s.query(q)
        print("search query: " + str(s.to_dict()))
        response = s.execute()
        for resp in response:
            results.append(resp)
    return results
def categories(self):
    s = Search(
        using=docstore._get_connection(settings.DOCSTORE_HOSTS),
        index=settings.DOCSTORE_INDEX,
        doc_type='articles'
    ).fields([
        'title', 'title_sort', 'categories',
    ])[0:docstore.MAX_SIZE]
    if not settings.MEDIAWIKI_SHOW_UNPUBLISHED:
        s = s.query('match', published=True)
    response = s.execute()
    pages = []
    for hit in response:
        page = Page()
        page.url_title = hit.title[0]
        page.title = hit.title[0]
        page.title_sort = hit.title_sort[0]
        page.categories = hit.get('categories', [])
        pages.append(page)
    articles = sorted(pages, key=lambda page: page.title_sort)
    categories = {}
    for page in articles:
        for category in page.categories:
            # exclude internal editorial categories
            if category not in settings.MEDIAWIKI_HIDDEN_CATEGORIES:
                if category not in categories.keys():
                    categories[category] = []
                # pages already sorted so category lists will be sorted
                if page not in categories[category]:
                    categories[category].append(page)
    return categories
def gracc_query_apel(year, month):
    index = osg_summary_index
    starttime = datetime.datetime(year, month, 1)
    onemonth = dateutil.relativedelta.relativedelta(months=1)
    endtime = starttime + onemonth
    s = Search(using=es, index=index)
    s = s.query('bool', filter=[
        Q('range', EndTime={'gte': starttime, 'lt': endtime})
        & Q('terms', VOName=vo_list)
        & (Q('term', ResourceType='Batch')
           | (Q('term', ResourceType='Payload') & Q('term', Grid='Local')))
    ])

    bkt = s.aggs
    bkt = bkt.bucket('Cores', 'terms', size=MAXSZ, field='Processors')
    bkt = bkt.bucket('VO', 'terms', size=MAXSZ, field='VOName')
    bkt = bkt.bucket('DN', 'terms', size=MAXSZ, field='DN')
    bkt = bkt.bucket('Site', 'terms', size=MAXSZ, missing=MISSING, field='OIM_ResourceGroup')
    #bkt = bkt.bucket('Site', 'terms', size=MAXSZ, field='SiteName')
    #bkt = bkt.bucket('Site', 'terms', size=MAXSZ, field='WLCGAccountingName')
    add_bkt_metrics(bkt)
    bkt = bkt.bucket('SiteName', 'terms', size=MAXSZ, field='SiteName')
    add_bkt_metrics(bkt)

    response = s.execute()
    return response
def get_files_in_path(self, dir_path):
    ''' gets all es file names from es in a given path '''
    dir_hash = FileResource.get_hash(dir_path)
    #s = FileResource.search().query("match", path["hashdir"] = dir_hash)
    #s = FileResource.search().query("multi_match", query=dir_hash, fields=['path.hashdir'])
    # [{"query": {"match_all": {"index": "content_crawler", "body": {"query": {"term": {"path.hashdir": "b5844a9aba1536cc74682d8bfa28553b5dfd8a8a"}}}, "doc_type": "file_resource"}}
    # Search.query() builds a DSL query; it does not accept index/doc_type/body kwargs,
    # so target the index and doc type on the Search itself and use a term query.
    s = Search(index=self.index, doc_type=self.type).query(
        "term", file_dir_hash=dir_hash)
    response = s.execute()

    files = []
    for hit in s:
        files.append(hit.file_uri)
    return files
def search(self, **params):
    index = params.get('index', self.index)
    search = Search(using=self.client, index=index)

    page = params.get('page', None)
    per_page = params.get('per_page', None)
    if page and per_page:
        page = page - 1
        search._extra = {'from': page, 'size': per_page}

    sort = params.get('sort', None)
    if sort and sort.replace('-', '') in ['created_at', 'level']:
        search = search.sort(sort)

    date_filter = self._filter_by_date_interval(params)
    if date_filter:
        search = search.filter(date_filter)

    level = params.get('group_by', None)
    if level:
        search = search.query('match', level=level)

    hits = search.execute()

    format = params.get('format', 'object')
    if format == 'dict':
        return self._to_dict(hits)
    else:
        return self._to_logs(hits)
def authors(self, num_columns=0):
    """
    @param num_columns: int If non-zero, break up list into columns
    """
    s = Search(
        using=docstore._get_connection(settings.DOCSTORE_HOSTS),
        index=settings.DOCSTORE_INDEX,
        doc_type='authors'
    ).fields([
        'url_title', 'title', 'title_sort', 'lastmod'
    ])[0:docstore.MAX_SIZE]
    response = s.execute()
    authors = []
    for hit in response:
        url_title = hit.url_title[0]
        title = hit.title[0]
        title_sort = hit.title_sort[0]
        lastmod = hit.lastmod[0]
        if title and title_sort:
            author = Author()
            author.url_title = url_title
            author.title = title
            author.title_sort = title_sort
            author.lastmod = datetime.strptime(lastmod, mediawiki.TS_FORMAT)
            authors.append(author)
    authors = sorted(authors, key=lambda a: a.title_sort)
    if num_columns:
        return _columnizer(authors, num_columns)
    return authors
def exists(self):
    find_instance = Search(using=self.es, index=self.index) \
        .query(Q("match", Id=self.sf_id))
    response = find_instance.execute()
    return response
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z",
    #  "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}
    start_message = 'scenario.p2p_connect.starting.clients.sequentially'
    stop_message = 'scenario.p2p_connect.stopping.clients'

    s = Search(client)
    s = s.filter('bool', should=[F('term', message=start_message),
                                 F('term', message=stop_message)])
    s = s.fields(['message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc, we want the latest events
    response = s.execute()

    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'

    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
def dates():
    """Return maximum and minimum date from dataset."""
    q = Search(using=client, index=TRENDS_INDEX)[0:0]
    q.aggs.bucket('min_date', 'min', field=TRENDS_DATE_FIELD)
    q.aggs.bucket('max_date', 'max', field=TRENDS_DATE_FIELD)
    res = q.execute().aggregations
    # maximum comes from the max aggregation, minimum from the min aggregation
    return jsonify({'maximum': res.max_date.value_as_string,
                    'minimum': res.min_date.value_as_string})
def handle(self, *args, **options):
    min_id = FailureLine.objects.order_by('id').values_list("id", flat=True)[0] - 1
    chunk_size = options['chunk_size']

    if options["recreate"]:
        connection.indices.delete(TestFailureLine._doc_type.index, ignore=404)
        TestFailureLine.init()
    else:
        if connection.indices.exists(TestFailureLine._doc_type.index):
            self.stderr.write("Index already exists; can't perform import")
            return

    while True:
        rows = (FailureLine.objects
                .filter(id__gt=min_id)
                .order_by('id')
                .values("id", "job_guid", "action", "test", "subtest",
                        "status", "expected", "message",
                        "best_classification_id", "best_is_verified"))[:chunk_size]
        if not rows:
            break
        es_lines = []
        for item in rows:
            es_line = failure_line_from_value(item)
            if es_line:
                es_lines.append(es_line)
        self.stdout.write("Inserting %i rows" % len(es_lines))
        bulk_insert(es_lines)
        min_id = rows[len(rows) - 1]["id"]
        time.sleep(options['sleep'])

    s = Search(doc_type=TestFailureLine).params(search_type="count")
    self.stdout.write("Index contains %i documents" % s.execute().hits.total)
def get_journals_by_collection_institution(collection_acronym, page_from=0, page_size=1000):
    search = Search(index=INDEX).query(
        "nested", path="collections",
        query=Q("match", collections__acronym=collection_acronym))
    search = search.filter("exists", field="sponsors")
    search = search[page_from:page_size]
    search_response = search.execute()

    meta = {
        'total': search_response.hits.total,
    }

    sponsors = {}
    for journal in search_response:
        j = {'jid': journal.jid,
             'title': journal.title,
             'current_status': journal.current_status,
             'last_issue': journal.last_issue,
             'issue_count': journal.issue_count}
        for sponsor in journal['sponsors']:
            sponsors.setdefault(sponsor, []).append(j)

    result = {
        'meta': meta,
        'objects': sponsors
    }
    return result
def get_highlights():
    wiki_field = 'wiki_content'
    qb_field = 'qb_content'
    text = request.form['text']
    s = Search(index='qb')[0:10].query(
        'multi_match', query=text, fields=[wiki_field, qb_field])
    s = s.highlight(wiki_field).highlight(qb_field)
    results = list(s.execute())
    if len(results) == 0:
        highlights = {'wiki': [''], 'qb': [''], 'guess': ''}
    else:
        guess = results[0]  # take the best answer
        _highlights = guess.meta.highlight
        try:
            wiki_content = list(_highlights.wiki_content)
        except AttributeError:
            wiki_content = ['']
        try:
            qb_content = list(_highlights.qb_content)
        except AttributeError:
            qb_content = ['']
        highlights = {'wiki': wiki_content, 'qb': qb_content, 'guess': guess.page}
    return jsonify(highlights)
def search(self, text: str, max_n_guesses: int,
           normalize_score_by_length=False,
           wiki_boost=1, qb_boost=1):
    if not self.exists():
        raise ValueError('The index does not exist, you must create it before searching')

    if wiki_boost != 1:
        wiki_field = 'wiki_content^{}'.format(wiki_boost)
    else:
        wiki_field = 'wiki_content'

    if qb_boost != 1:
        qb_field = 'qb_content^{}'.format(qb_boost)
    else:
        qb_field = 'qb_content'

    s = Search(index=self.name)[0:max_n_guesses].query(
        'multi_match', query=text, fields=[wiki_field, qb_field]
    )
    results = s.execute()
    guess_set = set()
    guesses = []
    if normalize_score_by_length:
        query_length = len(text.split())
    else:
        query_length = 1

    for r in results:
        if r.page in guess_set:
            continue
        else:
            guess_set.add(r.page)  # record the page so duplicate hits are skipped
            guesses.append((r.page, r.meta.score / query_length))
    return guesses
def process(self, start_time: datetime, end_time: datetime, input: DataFrame):
    logger.debug('Start: %s End: %s Log: index=%s fields=%s' % (
        start_time.isoformat(), end_time.isoformat(), str(self.indices), str(self.fields)))
    search = Search(using=self.client, index=self.indices[0])
    search = search.filter(
        Range(**{'@timestamp': {'gte': start_time.isoformat(), 'lte': end_time.isoformat()}}))
    for k, v in self.fields.items():
        if isinstance(v, list):
            for sv in v:
                search = search.query("match", **{k: sv})
        else:
            search = search.query("match", **{k: v})

    logger.debug('ES Query: %s' % str(search.to_dict()))
    response = search.execute()
    logger.debug('Results: success:%d failed:%d hits:%d' % (
        response._shards.successful, response._shards.failed, len(response.hits)))

    for hit in response:
        # filter out the meta key and flatten the values
        row = {k: str(hit[k]) for k in hit if k != 'meta'}
        logger.debug(row)
        input = input.append(row, ignore_index=True)
    return input
def pages():
    """Returns list of published light Page objects.

    @returns: list
    """
    KEY = 'encyc-front:pages'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='articles').filter('term', published_encyc=True)[0:MAX_SIZE]
        s = s.sort('title_sort')
        s = s.fields([
            'url_title',
            'title',
            'title_sort',
            'published',
            'modified',
            'categories',
        ])
        response = s.execute()
        data = [
            Page(
                url_title = hitvalue(hit, 'url_title'),
                title = hitvalue(hit, 'title'),
                title_sort = hitvalue(hit, 'title_sort'),
                published = hitvalue(hit, 'published'),
                modified = hitvalue(hit, 'modified'),
                categories = hit.get('categories', []),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    return data
def authors(num_columns=None):
    """Returns list of published light Author objects.

    @returns: list
    """
    KEY = 'encyc-front:authors'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='authors')[0:MAX_SIZE]
        s = s.sort('title_sort')
        s = s.fields([
            'url_title',
            'title',
            'title_sort',
            'published',
            'modified',
        ])
        response = s.execute()
        data = [
            Author(
                url_title = hitvalue(hit, 'url_title'),
                title = hitvalue(hit, 'title'),
                title_sort = hitvalue(hit, 'title_sort'),
                published = hitvalue(hit, 'published'),
                modified = hitvalue(hit, 'modified'),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    if num_columns:
        return _columnizer(data, num_columns)
    return data
def sources():
    """Returns list of published light Source objects.

    @returns: list
    """
    KEY = 'encyc-front:sources'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='sources')[0:MAX_SIZE]
        s = s.sort('encyclopedia_id')
        s = s.fields([
            'encyclopedia_id',
            'published',
            'modified',
            'headword',
            'media_format',
            'img_path',
        ])
        response = s.execute()
        data = [
            Source(
                encyclopedia_id = hitvalue(hit, 'encyclopedia_id'),
                published = hitvalue(hit, 'published'),
                modified = hitvalue(hit, 'modified'),
                headword = hitvalue(hit, 'headword'),
                media_format = hitvalue(hit, 'media_format'),
                img_path = hitvalue(hit, 'img_path'),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    return data
def get_highlights(text):
    # query top 10 guesses
    s = Search(index='qb_ir_instance_of')[0:10].query(
        'multi_match', query=text,
        fields=['wiki_content', 'qb_content', 'source_content'])
    s = s.highlight('qb_content').highlight('wiki_content')
    results = list(s.execute())

    if len(results) == 0:
        highlights = {'wiki': [''], 'qb': [''], 'guess': ''}
        return highlights

    guess = results[0]  # take the best answer
    _highlights = guess.meta.highlight

    try:
        wiki_content = list(_highlights.wiki_content)
    except AttributeError:
        wiki_content = ['']

    try:
        qb_content = list(_highlights.qb_content)
    except AttributeError:
        qb_content = ['']

    highlights = {'wiki': wiki_content, 'qb': qb_content, 'guess': guess.page}
    return highlights
def search():
    q = request.args.get('q')
    #resp = es.search(index='hoe', doc_type='record', q=q, body=aggs)
    #logging.info(q)
    s = Search(using=es, index='hoe', doc_type='record')
    s.aggs.bucket('library_place', 'terms', field='library-place')
    s.aggs.bucket('type', 'terms', field='type')
    s.aggs.bucket('genre', 'terms', field='genre')
    s.aggs.bucket('keywords', 'terms', field='keywords.label')
    s.aggs.bucket('author', 'terms', field='author.literal')
    s.query = Q('multi_match', query=q, fields=['_all'])
    filters = []
    if 'filter' in request.args:
        filters = request.args.getlist('filter')
        logging.info(filters)
        for filter in filters:
            cat, val = filter.split(':')
            cat = cat.replace('_', '-')
            filter_dict = {}
            filter_dict.setdefault(cat, val)
            logging.info(cat)
            s.filter = F('term', **filter_dict)
    #if request.args
    resp = s.execute()
    #logging.info(resp)
    #logging.info(resp.aggregations.per_category.buckets)
    return render_template('resultlist.html',
                           records=resp.to_dict().get('hits'),
                           facets=resp.aggregations.to_dict(),
                           header=q, query=q, filters=filters)
def from_es_id(cls, es, es_id, access_token, instance, version=None):
    index_exists = es.indices.exists(index=cls.ES_INDEX)
    type_exists = es.indices.exists_type(index=cls.ES_INDEX, doc_type=cls.ES_TYPE)
    if not all([index_exists, type_exists]):
        raise Exception('Elastic index or type does not exist. '
                        'Cannot find {c} in Elasticsearch '
                        'to create an instance'.format(c=cls.__name__))

    find_instance = Search(using=es, index=cls.ES_INDEX) \
        .query(Q("match", _id=es_id))
    r = find_instance.execute()
    if not r:
        raise Exception('Cannot find elasticsearch {t} '
                        'instance from elasticsearch '
                        'id:{id}'.format(t=cls.__name__, id=es_id))

    sf_id = r[0]._d_.pop('Id', None)
    if sf_id is None:
        raise Exception('Missing a valid SF Id in '
                        'Elasticsearch document id:{i}'.format(i=sf_id))
    sf_data = r[0]._d_
    return cls(es=es, sf_id=sf_id, sf_data=sf_data,
               access_token=access_token, instance=instance)
def search(self, args, es_client=client):
    search = Search(using=es_client, index=SearchableEvent.meta.index)

    if args.get('name'):
        search = search.query('fuzzy', name=args['name'])
        search = search.highlight('name')
    if args.get('description'):
        search = search.query('match', description=args['description'])
        search = search.highlight('description')
    if args.get('location-name'):
        search = search.query('fuzzy', location_name=args['location_name'])
        search = search.highlight('location_name')
    if args.get('organizer-name'):
        search = search.query('fuzzy', organizer_name=args['organizer_name'])
        search = search.highlight('organizer_name')
    if args.get('organizer-description'):
        search = search.query('fuzzy', organizer_description=args['organizer_description'])
        search = search.highlight('organizer_description')

    return [to_dict(r) for r in search.execute()]
def is_alive():
    find_string = datetime.utcnow().strftime("%Y-%m")
    from_date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")

    s = Search(using=es, index="bitshares-" + find_string)
    s.query = Q("range", block_data__block_time={'gte': from_date, 'lte': "now"})
    s.aggs.metric("max_block_time", "max", field="block_data.block_time")

    json_response = {
        "server_time": datetime.utcnow(),
        "head_block_timestamp": None,
        "head_block_time": None
    }

    try:
        response = s.execute()
        if response.aggregations.max_block_time.value is not None:
            json_response["head_block_time"] = str(response.aggregations.max_block_time.value_as_string)
            json_response["head_block_timestamp"] = response.aggregations.max_block_time.value
            json_response["deltatime"] = abs(
                (datetime.utcfromtimestamp(json_response["head_block_timestamp"] / 1000) -
                 json_response["server_time"]).total_seconds())
            if json_response["deltatime"] < 30:
                json_response["status"] = "ok"
            else:
                json_response["status"] = "out_of_sync"
                json_response["error"] = "last_block_too_old"
        else:
            json_response["status"] = "out_of_sync"
            json_response["deltatime"] = "Infinite"
            json_response["query_index"] = find_string
            json_response["query_from_date"] = from_date
            json_response["error"] = "no_blocks_last_24_hours"
    except NotFoundError:
        json_response["status"] = "out_of_sync"
        json_response["deltatime"] = "Infinite"
        json_response["error"] = "index_not_found"
        json_response["query_index"] = find_string

    return json_response
def get_aggregated_filtered_statistics(filters):
    s = Search(using=es, doc_type='associations')
    s = filter_association_search(s, filters)
    agg_chr = A("terms", field="snp.chr")
    agg_type = A("terms", field="snp.coding")
    agg_annotation = A({
        "nested": {"path": "snp.annotations"},
        "aggs": {
            "annotations": {
                "terms": {"field": "snp.annotations.effect"}
            }
        }
    })
    agg_maf = A("range", field="maf", ranges=[
        {"to": 0.01},
        {"from": 0.01, "to": 0.05001},
        {"from": 0.05001, "to": 0.1001},
        {"from": 0.1001}])
    agg_mac = A("range", field="mac", ranges=[{"to": 6}, {"from": 6}])
    s.aggs.bucket('maf_count', agg_maf)
    s.aggs.bucket('mac_count', agg_mac)
    s.aggs.bucket('chr_count', agg_chr)
    s.aggs.bucket('type_count', agg_type)
    s.aggs.bucket('annotation_count', agg_annotation)
    agg_results = s.execute().aggregations
    return (agg_results.chr_count.buckets,
            agg_results.maf_count.buckets,
            agg_results.mac_count.buckets,
            agg_results.type_count.buckets,
            agg_results.annotation_count.annotations.buckets)
def search(request):
    if not request.GET.get('q'):
        return bad_request('no search term')

    term = request.GET.get('q')
    hits = []
    s = Search(index=KORPUS_INDEX)
    s = s.source(includes=['pk', 'rec', 'vrsta', 'podvrsta'])
    s.query = MultiMatch(type='bool_prefix', query=remove_punctuation(term), fields=['oblici'])
    try:
        response = s.execute()
        for hit in response.hits.hits:
            hits.append({
                'vrsta': hit['_source']['vrsta'],
                'vrsta_text': VRSTE_RECI[hit['_source']['vrsta']],
                'rec': hit['_source']['rec'],
                'pk': hit['_source']['pk']
            })
        return Response(hits, status=HTTP_200_OK, content_type=JSON)
    except ElasticsearchException as error:
        return server_error(error.args)
def get_all_data_sender_mobile_numbers(dbm):
    all_data_senders_count = get_all_data_senders_count(dbm)
    search_parameters = {
        "response_fields": ['mobile_number'],
        "number_of_results": all_data_senders_count,
        "start_result_number": 0
    }
    es = Elasticsearch(hosts=[{"host": ELASTIC_SEARCH_HOST, "port": ELASTIC_SEARCH_PORT}])
    search = Search(using=es, index=dbm.database_name, doc_type=REPORTER_DOC_TYPE)
    search = _add_non_contact_filter(search)
    search = _add_non_deleted_ds_filter(search)
    search = _restrict_test_ds_filter(search)
    search = _add_pagination_criteria(search_parameters, search)
    search = _add_response_fields(search_parameters, search)
    search_results = search.execute()
    return [safe_getattr(item, 'mobile_number')[0] for item in search_results.hits]
def get_trx():
    trx = request.args.get('trx', "738be2bd22e2da31d587d281ea7ee9bd02b9dbf0")
    from_ = request.args.get('from_', 0)
    size = request.args.get('size', 10)

    s = Search(using=es, index="graphene-*", extra={"size": size, "from": from_})
    q = Q("match", block_data__trx_id=trx)
    s.query = q
    response = s.execute()

    results = []
    for hit in response:
        # print hit.to_dict()
        results.append(hit.to_dict())
    return jsonify(results)
def tx_propagation(client_count, offset=10):
    """
    Check for 'eth.tx.tx_new' messages and return the max number of clients
    that had the same tx during the last `offset` seconds.
    """
    s = Search(client)
    # s = s.query(Q("match", message='eth.tx.received'))
    s = s.filter('exists', field='json_message.eth.tx.received.tx_hash')
    s = s.filter(time_range_filter(field="json_message.eth.tx.received.ts", offset=offset))
    s.aggs.bucket('by_tx', 'terms',
                  field='json_message.eth.tx.received.tx_hash', size=client_count)
    # s = s[0:1000]
    response = s.execute()
    if response:
        return max(tag.doc_count for tag in response.aggregations.by_tx.buckets)
    else:
        return 0
def search_substrings_and(self, substrings, _index):
    """
    Search for documents containing all substrings from the input list
    (a conjunction of all substrings from 'substrings').

    :param substrings: substring list
    :param _index: ES index
    """
    start = clock()
    # create the first query object, then add other queries to it using the 'and' operation
    q = Q("match", content=substrings[0])
    for substring in substrings[1:]:
        q = q & Q("match", content=substring)
    # create search object from 'q' query object
    s = Search(index=_index).using(self.client).query(q)
    response = s.execute()
    end = clock() - start

    self.time = end
    self.total = response.hits.total
    for substring in substrings[:-1]:
        self.query += "'{}' & ".format(substring)
    self.query += substrings[-1]
    self.__save__()
def get_topics_aggregations(topic_modelling, topic_weight_threshold, is_multi_corpus):
    s = Search(using=ES_CLIENT, index=f"{ES_INDEX_TOPIC_DOCUMENT}_{topic_modelling}") \
        .filter("range", topic_weight={"gte": topic_weight_threshold})
    s.aggs.bucket(name='topics', agg_type="terms", field='topic_id', size=10000) \
        .metric("topic_weight", agg_type="sum", field="topic_weight")
    if is_multi_corpus:
        s.aggs['topics'].bucket(name="corpus", agg_type="terms", field="document_corpus", size=10000) \
            .metric("topic_weight", agg_type="sum", field="topic_weight")
    result = s.execute()
    topic_info_dict = dict(
        (bucket.key, {
            "count": bucket.doc_count,
            "weight_sum": bucket.topic_weight.value,
            "corpus_weights": dict(
                (bucket_corpus.key, {
                    "count": bucket_corpus.doc_count,
                    "weight_sum": bucket_corpus.topic_weight.value,
                })
                for bucket_corpus in bucket.corpus.buckets
            ) if is_multi_corpus else None
        })
        for bucket in result.aggregations.topics.buckets
    )
    return topic_info_dict
def search_pokemon(es: Elasticsearch, search_query: str, page: int):
    s = Search(using=es)
    q = Q({
        'function_score': {
            'query': {
                'multi_match': {
                    'query': search_query,
                    'fields': [
                        'doc.name',
                        'doc.abilities.ability.name',
                        'doc.forms.name',
                        'doc.moves.move.name'
                    ],
                    'fuzziness': 'AUTO',
                    'prefix_length': 2
                }
            }
        }
    })
    s = s.query(q)[(page - 1) * 10:page * 10]
    res = s.execute()
    return res
def get_entity(self, sport, element):
    search = Search(using=self.es)
    if sport == Sport.SOCCER:
        search = search.index('soccer-entity')
    if sport == Sport.BASKETBALL:
        search = search.index('basketball-entity')
    search = search.query(Match(_id=element[0]))
    response = search.execute()

    if len(response) > 0:
        entity = {'name': response[0]['name']}
        if 'abstract' in response[0]:
            entity['abstract'] = response[0]['abstract']
        else:
            entity['abstract'] = 'None'
        if 'type' in response[0]:
            entity['type'] = response[0]['type']
        else:
            entity['type'] = 'None'
    else:
        entity = {'name': element[0], 'abstract': 'None', 'type': 'None'}

    entity['similarity'] = round(element[1], 2)
    entity['sport'] = sport.value
    return entity
def search(query, filter=None):
    s = Search(index='policy-index').query(
        "multi_match", query=query,
        fields=["title", "school", "department", "administrator", "author",
                "state", "city", "latitude", "longitude", "link", "tags",
                "abstract", "text"],
        fuzziness="AUTO").extra(from_=0, size=100)
    if filter is not None and len(filter) > 0:
        years = []
        schools = []
        for f in filter:
            try:
                f = int(f)
                years.append(Q('range', published_date={'gte': date(f, 1, 1), 'lt': date(f, 12, 31)}))
            except ValueError:
                schools.append(Q('match_phrase', school=f))
        if len(years) > 0 and len(schools) == 0:
            s = s.query("bool", filter=functools.reduce(operator.or_, years))
        if len(schools) > 0 and len(years) == 0:
            s = s.query("bool", filter=functools.reduce(operator.or_, schools))
        if len(schools) > 0 and len(years) > 0:
            combined = functools.reduce(operator.or_, years) & functools.reduce(operator.or_, schools)
            s = s.query("bool", filter=combined)
    response = s.execute()
    return response
def get_metrics_data(self, query):
    """
    Get the metrics data from Elasticsearch given a DSL query

    :param query: query to be sent to Elasticsearch
    :return: a dict with the results of executing the query
    """
    if self.es_url.startswith("http"):
        url = self.es_url
    else:
        url = 'http://' + self.es_url
    es = Elasticsearch(url)
    s = Search(using=es, index=self.es_index)
    s = s.update_from_dict(query)
    try:
        response = s.execute()
        return response.to_dict()
    except Exception as e:
        print()
        print("In get_metrics_data: Failed to fetch data.\n Query: {}, \n Error Info: {}"
              .format(query, e.info))
        raise
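# Hypothetical usage sketch for get_metrics_data() above (the `metrics` instance and
# the field names in the query dict are assumptions, not taken from the source):
# run a small terms aggregation and read the result back as a plain dict.
sample_query = {
    "size": 0,
    "query": {"range": {"timestamp": {"gte": "now-90d/d"}}},
    "aggs": {"by_author": {"terms": {"field": "author", "size": 10}}},
}
# result = metrics.get_metrics_data(sample_query)
# buckets = result["aggregations"]["by_author"]["buckets"]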
def get_performed_users(self):
    """
    Returns the users that performed actions within the search filters
    """
    search = Search(using=self.es, index=self.index)
    for query in self.searchfilter.values():
        search = search.query(query)

    search.aggs.bucket(
        "user_names",
        "terms",
        field=self.get_field_name("userIdentity.userName"),
        size=5000,
    )
    response = search.execute()

    user_names = {}
    for user in response.aggregations.user_names.buckets:
        if user.key == "HIDDEN_DUE_TO_SECURITY_REASONS":
            # This happens when a user logs in with the wrong username
            continue
        user_names[user.key] = True
    return user_names
def getHostBytes(client, starttime, endtime):
    s = Search(using=client, index="htcondor-xfer-stats2-*")
    s = s.filter('range', **{'@timestamp': {'gte': starttime, 'lt': endtime}})
    # Remove records with more than 1 TB of data transferred, bug:
    # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7575,0
    s = s.filter('range', bytes={'from': 0, 'to': 1024**4})

    bkt = s.aggs
    bkt = bkt.bucket('hosts', 'terms', size=MAXSZ, field='host.name.keyword')
    bkt = bkt.metric('Bytes', 'sum', field='bytes')
    bkt = bkt.metric('loss', 'avg', field='lost')
    print(s.to_dict())
    response = s.execute()

    hosts = {}
    for tag in response.aggregations.hosts:
        hosts[tag.key] = {
            'bytes': tag.Bytes.value,
            'bytes_str': convert_gb(tag.Bytes.value),
            'loss': tag.loss.value
        }
    return hosts
def test_time_field_query(es):
    """Test executing query of fetch time field.

    Notes:
        if is_fetch is ticked, this function checks if the entered TIME_FIELD returns results.

    Args:
        es(Elasticsearch): an Elasticsearch object to which we run the test.

    Returns:
        (dict). The results of the query if they are returned.
    """
    query = QueryString(query=TIME_FIELD + ':*')
    search = Search(using=es, index=FETCH_INDEX).query(query)[0:1]
    response = search.execute().to_dict()
    _, total_results = get_total_results(response)

    if total_results == 0:
        # failed in getting the TIME_FIELD
        return_error("Fetch incidents test failed.\nDate field value incorrect [{}].".format(TIME_FIELD))
    else:
        return response
def companySearch(search):
    s = Search(using=es)
    search['offset'] = int(search['offset'])
    s = s.index('job_index')
    s = s.query('match_phrase', company=search['company'])
    s = s[search['offset']:search['offset'] + 10]
    response = s.execute()

    resultlist = []
    print(response.hits.total)
    for hit in response.hits:
        result = {}
        result['id'] = hit.meta.id
        result['score'] = hit.meta.score
        result['title'] = hit['title']
        result['summary'] = hit['summary'][:180]
        result['url'] = 'www.indeed.com' + hit['url']
        result['company'] = hit['company']
        result['location'] = hit['location']
        result['postingdate'] = str(datetime.datetime.fromordinal(hit['date']))
        resultlist.append(result)
    return resultlist
def get_need_content_from_id(content_id, index='rastarockets_needs'):
    """
    Return need content from unique ID

    :param content_id: Need content unique ID
    :type content_id: str

    :param index: Index name (optional)
    :type index: str

    :return: NeedContent if exist
    :rtype: NeedContent|None
    """
    search = Search(using=current_app.els_client,
                    index=index,
                    doc_type='content').query('term', _id=content_id)

    response = search.execute()
    if response.hits.total > 0:
        return NeedContent(response.hits[0])
    else:
        return None
def test_checking_of_missing_user(es, data):
    user_count = 0
    hit_count = 0
    if data['index'] == 'users':
        if data['action'] == 'add':
            # Loads user data
            test_data = load_csv_to_dict(data['datafile'])
            for user in test_data:
                user_count += 1
                query = query_construct(test_data.fieldnames, user)
                q = Q('bool', must=query)
                s = Search(using=es, index='users')
                s = s.query(q)
                response = s.execute()
                print(response)
                for hit in response:
                    hit_count += 1
    # Check if the expected user count is equal to the actual user count
    assert user_count == hit_count, \
        'Expected {} users but ES returning {} users'.format(user_count, hit_count)
def query_distinct_event_ids(self):
    es_query = []
    es_query.append({
        'match': {
            'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME
        }
    })
    query = Q({'bool': {'must': es_query}})
    s = Search(using=self.Client, index="winlogbeat-*").query(query)
    # source() returns a new Search object, so reassign to keep the field filter
    s = s.source(includes=['winlog.event_id', 'winlog.event_data.LogString'])
    s.aggs.bucket('distinct_event_ids', 'terms', field='winlog.event_id', size=1000)
    response = s.execute()

    sorted_distinct_event_ids = sorted(
        response.aggregations.distinct_event_ids,
        key=lambda kv: (kv.doc_count, kv.key),
        reverse=True)
    for e in sorted_distinct_event_ids:
        print("{0:50} {1}".format(e.key, e.doc_count))
def search(self, query, get_count=False, includes=None, size=1000):
    if self.DebugQuery:
        pprint.pprint(query)

    s = Search(using=self.Client, index=WINLOGBEAT_INDEX).query(query)
    if self.DTRange is not None:
        s = s.filter('range', **self.DTRange)

    if includes is None:
        includes = ['winlog.provider_name', 'winlog.event_id']
    # source() returns a new Search object, so reassign to keep the field filter
    s = s.source(includes=includes)

    if get_count:
        return s.count()

    if self.Scan:
        return s.scan()
    else:
        s = s[0:size]
        return s.execute().hits

    return None
def getAllDocs1(es, indices):
    hitCount = 0
    index = "netflow-v5-2017.10.11"
    #qDict = {'size': 1000000, 'query': {'match_all': {} }}
    qDict = {'size': 1000, 'sort': ['_doc']}  # or just {'sort': [ '_doc']}
    qDict = {'size': 500, 'query': {'match_all': {}}}
    qDict = {'query': {'terms': {'_id': 'AV8LhQqZyn_BE1UV4cVe'}}}
    s = Search(using=es, index=index)
    s.update_from_dict(qDict)
    total = s.count()
    s = s[0:total]
    results = s.execute()
    print("results.hits.total={} s.count()={} results.hits.hits={}".format(
        results.hits.total, s.count(), len(results.hits.hits)))
    #s = s[0:s.count()-1]
    #results = s.execute()
    print("Results: {}".format(len(results)))
    for result in s.scan():
        print(str(hitCount) + ": " + result.to_dict()["@timestamp"])
        hitCount += 1
    exit()
def dump_event_counts(self):
    s = Search(using=self.Client, index=WINLOGBEAT_INDEX)
    # source() returns a new Search object, so reassign to keep the field filter
    s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])
    s.aggs.bucket('distinct_provider_names', 'terms', field='winlog.provider_name', size=100000)
    response = s.execute()

    sorted_distinct_provider_names = sorted(
        response.aggregations.distinct_provider_names,
        key=lambda kv: (kv.doc_count, kv.key),
        reverse=True)

    max_provider_name_len = 0
    for e in sorted_distinct_provider_names:
        str_len = len(e.key)
        if max_provider_name_len < str_len:
            max_provider_name_len = str_len

    fmt_str = "{0:%d} {1}" % max_provider_name_len
    for e in sorted_distinct_provider_names:
        print(fmt_str.format(e.key, e.doc_count))
def get(self, request, **kwargs):
    """
    search for an acronym
    :param request: the HTTP GET request
    :return: JSON
    """
    if "search_query" in request.GET:
        query = str(request.GET.get("search_query"))
        s = Search(index="acronyms").query("match", acronym=query)
        response = s.execute()
        logging.debug(response)
        hits_list = []
        for hit in response.to_dict()["hits"]["hits"]:
            hit = hit['_source']
            hits_list.append(hit)
        return JsonResponse({'data': hits_list})
    return JsonResponse({'status': "error", 'detail': "please include a query"}, status=400)
def execute_elastic_query(args):
    logger.debug(args)
    query = args.query
    host = args.host
    port = args.port
    from_time = "now-{seconds}s".format(seconds=args.seconds)
    aggregate = need_aggregate(args)
    index = build_indices(indices_count=args.indices_count,
                          index_pattern=args.index_pattern,
                          index_prefix=args.index_prefix)

    client = Elasticsearch(hosts=["{}:{}".format(host, port)])
    s = Search(using=client, index=index) \
        .query("query_string", query=query, analyze_wildcard=True) \
        .query("range", **{"@timestamp": {"gte": "{}".format(from_time)}})
    if aggregate:
        s.aggs.bucket(args.aggregation_name,
                      A(args.aggregation_type, field=args.aggregation_field))
    return s.execute()
def search(query, page, sort):
    q = Q("match_all")
    word = query
    if word[0] == '"' and word[-1] == '"':
        q = q & Q("match_phrase", words="{}".format(word))
    else:
        words = word.split(' ')
        for w in words:
            q = q & Q("wildcard", words="{}".format(w))

    if ":" in sort:
        sort_arr = sort.split(":")
        search = Search(using=client, index="file").query(q).sort(
            {sort_arr[0]: {"order": sort_arr[1]}})
    else:
        search = Search(using=client, index="file").query(q)

    total = search.count()
    max_pages = total // per_page
    search = search[(page - 1) * per_page:page * per_page]
    response = search.execute()
    return [file_result_from_hit(hit) for hit in response], max_pages + 1, total
def get_summary(self):
    s = Search(index=self.index)
    # Filter by date to approximately 20 years ago, to ensure there aren't more
    # than 10000 buckets
    date_20_years_ago = (datetime.utcnow() - timedelta(days=int(20*365.25))).date()
    s = s.filter('range', **{'last_updated': {'gte': str(date_20_years_ago)}})
    s.aggs.bucket('daily_workflows', 'date_histogram',
                  field='last_updated', format="yyyy-MM-dd", interval='day') \
        .bucket('recid', 'terms', field='recid')
    result = s.execute().aggregations.to_dict()

    # flatten summary
    processed_result = []
    _daily_workflows = result['daily_workflows']
    for day in _daily_workflows['buckets']:
        for recid in day['recid']['buckets']:
            record_search = self.search(term=recid['key'], fields=['recid'])
            record = record_search[0] if len(record_search) == 1 else record_search[1]
            processed_result.append(record.as_custom_dict(exclude=[]))

    return processed_result
def get_trade_history(size=10, from_date='2015-10-10', to_date='now',
                      sort_by='-operation_id_num', search_after=None,
                      base="1.3.0", quote="1.3.121"):
    s = Search(using=es, index="bitshares-*")
    s = s.extra(size=size)
    if search_after and search_after != '':
        s = s.extra(search_after=search_after.split(','))

    q = Q()
    q = q & Q("match", operation_type=4)
    q = q & Q("match", operation_history__op_object__is_maker=True)
    q = q & Q("match", operation_history__op_object__fill_price__base__asset_id=base)
    q = q & Q("match", operation_history__op_object__fill_price__quote__asset_id=quote)

    range_query = Q("range", block_data__block_time={'gte': from_date, 'lte': to_date})
    s.query = q & range_query

    s = s.sort(*sort_by.split(','))
    response = s.execute()

    return [hit.to_dict() for hit in response]
def autocomplete_view(request):
    query = request.GET.get('term', '')
    # resp = models.client.suggest(
    #     index='review',
    #     body={
    #         'perfume': {
    #             "text": query,
    #             "completion": {
    #                 "field": 'perfume',
    #             }
    #         }
    #     }
    # )
    s = Search(using=models.client, index="review")
    s = s.filter("term", perfume=query)
    resp = s.execute()

    perfumes = []
    for hit in resp:
        perfumes.append(hit.perfume)
    data = json.dumps(perfumes)
    mimetype = 'application/json'
    return HttpResponse(data, mimetype)
def gracc_query_apel(year, month):
    index = osg_summary_index
    starttime = datetime.datetime(year, month, 1)
    onemonth = dateutil.relativedelta.relativedelta(months=1)
    endtime = starttime + onemonth
    s = Search(using=es, index=index)
    s = s.query('bool', filter=[
        Q('range', EndTime={'gte': starttime, 'lt': endtime})
        & Q('terms', VOName=vo_list)
        & (Q('term', ResourceType='Batch')
           | (Q('term', ResourceType='Payload') & Q('term', Grid='Local')))
    ])

    bkt = s.aggs
    bkt = bkt.bucket('Cores', 'terms', size=MAXSZ, field='Processors')
    bkt = bkt.bucket('VO', 'terms', size=MAXSZ, field='VOName')
    bkt = bkt.bucket('DN', 'terms', size=MAXSZ, field='DN')
    bkt = bkt.bucket('Site', 'terms', size=MAXSZ, missing=MISSING, field='OIM_ResourceGroup')
    #bkt = bkt.bucket('Site', 'terms', size=MAXSZ, field='SiteName')
    #bkt = bkt.bucket('Site', 'terms', size=MAXSZ, field='WLCGAccountingName')
    add_bkt_metrics(bkt)
    bkt = bkt.bucket('SiteName', 'terms', size=MAXSZ, field='SiteName')
    add_bkt_metrics(bkt)

    response = s.execute()
    return response
def search(self, text: str, max_n_guesses: int,
           normalize_score_by_length=False,
           wiki_boost=1, qb_boost=1):
    if not self.exists():
        raise ValueError('The index does not exist, you must create it before searching')

    if wiki_boost != 1:
        wiki_field = 'wiki_content^{}'.format(wiki_boost)
    else:
        wiki_field = 'wiki_content'

    if qb_boost != 1:
        qb_field = 'qb_content^{}'.format(qb_boost)
    else:
        qb_field = 'qb_content'

    s = Search(index=self.name)[0:max_n_guesses].query(
        'multi_match', query=text, fields=[wiki_field, qb_field])
    results = s.execute()
    guess_set = set()
    guesses = []
    if normalize_score_by_length:
        query_length = len(text.split())
    else:
        query_length = 1

    for r in results:
        if r.page in guess_set:
            continue
        else:
            guess_set.add(r.page)  # record the page so duplicate hits are skipped
            guesses.append((r.page, r.meta.score / query_length))
    return guesses