def test_user_org_filter_custom_user(self):
    """Check that user_organization_filtering() filters by the user's orgs.

    Two organizations are attached to a freshly created user. After the
    filtering call, the ``bool.should`` list of the resulting query must
    contain a match clause for each organization's ``filter`` value.
    """
    first_org = Organization(name='testName', filter='testFilter')
    first_org.save()
    second_org = Organization(name='otherTestName', filter='otherTestFilter')
    second_org.save()

    user = XDSUser.objects.create_user('*****@*****.**',
                                       'test1234',
                                       first_name='Jane',
                                       last_name='doe')
    for org in (first_org, second_org):
        user.organizations.add(org)

    query = XSEQueries('test', 'test', user=user)
    org_clauses = (Q("match", filter=first_org.filter)
                   | Q("match", filter=second_org.filter))
    expected_search = Search(using='default', index='test').query(org_clauses)

    query.user_organization_filtering()
    result = query.search

    # Each expected clause must appear among the clauses actually generated.
    for position in (0, 1):
        self.assertIn(
            expected_search.to_dict()['query']['bool']['should'][position],
            result.to_dict()['query']['bool']['should'])
def query(self):
    """
    Build the Elasticsearch query for EfficiencyReport information.

    The search is restricted to probes matching the fifebatch wildcard and
    to records whose ``EndTime`` lies in ``[start_time, end_time)``. The hit
    window is sliced to ``[0:0]``, so only aggregations are requested:
    records are split into ``Success``/``Failure`` on ``Resource_ExitCode``,
    then bucketed by ``VOName`` and ``CommonName``, with ``Count`` and
    ``CoreHours`` summed per bucket.

    The search is returned unexecuted.

    :return elasticsearch_dsl.Search: Search object containing ES query
    """
    wildcardProbeNameq = 'condor:fifebatch?.fnal.gov'

    starttimeq = self.start_time.isoformat()
    endtimeq = self.end_time.isoformat()

    s = Search(using=self.client, index=self.indexpattern) \
        .filter("wildcard", ProbeName=wildcardProbeNameq) \
        .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq})[0:0]

    # Aggregations (attached in place to s.aggs)
    Buckets = s.aggs.bucket(
        'group_status', 'filters',
        filters={
            'Success': {'bool': {'must': {'term': {'Resource_ExitCode': 0}}}},
            'Failure': {
                'bool': {'must_not': {'term': {'Resource_ExitCode': 0}}}}}) \
        .bucket('group_VO', 'terms', field='VOName', size=2**31-1) \
        .bucket('group_CommonName', 'terms', field='CommonName',
                size=2**31-1)

    # Metrics
    Buckets.metric('numJobs', 'sum', field='Count')\
        .metric('WallHours', 'sum', field='CoreHours')

    if self.verbose:
        # Call form prints the same text on Python 2 and parses on Python 3.
        print(s.to_dict())

    return s
def query(self, client):
    """Build the query used to grab wasted hours and return the Search object.

    The search matches probes against the fifebatch wildcard and keeps
    records whose ``EndTime`` lies in ``[start_time, end_time)``. Records are
    split into ``Success``/``Failure`` on ``Resource_ExitCode``, then bucketed
    by ``VOName`` and ``CommonName``. Per bucket it counts ``GlobalJobId``
    values and sums a scripted wall-hours value.

    The search is returned unexecuted.

    :param client: Elasticsearch client passed to ``Search(using=...)``
    :return: the configured Search object
    """
    wildcardProbeNameq = 'condor:fifebatch?.fnal.gov'

    starttimeq = self.dateparse(self.start_time)
    endtimeq = self.dateparse(self.end_time)

    s = Search(using=client,
               index=indexpattern_generate(self.start_time, self.end_time))\
        .query("wildcard", ProbeName=wildcardProbeNameq)\
        .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq})

    # Aggregations
    a1 = A('filters',
           filters={
               'Success': {'bool': {'must': {'term': {'Resource_ExitCode': 0}}}},
               'Failure': {'bool': {'must_not': {'term': {'Resource_ExitCode': 0}}}}})
    a2 = A('terms', field='VOName')
    a3 = A('terms', field='CommonName')

    Buckets = s.aggs.bucket('group_status', a1)\
        .bucket('group_VO', a2)\
        .bucket('group_CommonName', a3)

    # Metrics
    # TODO: figure out how to total jobs
    # The metrics are attached in place, so the return value is not kept.
    Buckets.metric('numJobs', 'value_count', field='GlobalJobId')\
        .metric('WallHours', 'sum',
                script="(doc['WallDuration'].value*doc['Processors'].value/3600)")

    if self.verbose:
        # Call form prints the same text on Python 2 and parses on Python 3.
        print(s.to_dict())

    return s
def query(self, client):
    """Build the Elasticsearch search for this report and return it.

    The search is restricted to records whose ``VOName`` matches
    ``*Role=Production*`` and equals the configured VO name, whose
    ``EndTime`` lies in ``[start_time, end_time)`` and whose
    ``ResourceType`` is ``Payload``. The search is returned unexecuted.

    :param client: Elasticsearch client passed to ``Search(using=...)``
    :return: the configured Search object
    """
    # Set up our search parameters
    voq = self.config.get("query", "{}_voname".format(self.vo.lower()))
    productioncheck = '*Role=Production*'

    # start_time/end_time are split into integer components for datetime().
    start_date = self.datesplit_pattern.split(self.start_time)
    starttimeq = datetime(*[int(elt) for elt in start_date]).isoformat()

    end_date = self.datesplit_pattern.split(self.end_time)
    endtimeq = datetime(*[int(elt) for elt in end_date]).isoformat()

    # Generate the index pattern based on the start and end dates
    indexpattern = indexpattern_generate(start_date, end_date)

    if self.verbose:
        # print() writes to sys.stdout, as the old "print >> sys.stdout" did.
        print(indexpattern)
        sleep(3)

    # Elasticsearch query
    resultset = Search(using=client, index=indexpattern) \
        .query("wildcard", VOName=productioncheck) \
        .filter(Q({"term": {"VOName": voq}})) \
        .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
        .filter(Q({"term": {"ResourceType": "Payload"}}))

    if self.verbose:
        print(resultset.to_dict())

    return resultset
def query(self):
    """
    Build the Elasticsearch query for EfficiencyReport information.

    The search keeps ``Payload`` records whose ``EndTime`` lies in
    ``[start_time, end_time)``. If the VO is named in the ``noproduction``
    list, ``VOName`` is matched as a wildcard against the configured VO
    name. Otherwise ``VOName`` must match ``*Role=Production*`` and equal
    the configured VO name.

    The search is returned unexecuted.

    :return elasticsearch_dsl.Search: Search object containing ES query
    """
    # Set up our search parameters
    vo_section = self.vo.lower()
    # The original wrapped this in "voname".format(...), a no-op because the
    # string has no placeholder; the key has always been plain "voname".
    voq = self.config.get(vo_section, "voname")
    productioncheck = '*Role=Production*'

    starttimeq = self.start_time.isoformat()
    endtimeq = self.end_time.isoformat()

    self.logger.info(self.indexpattern)
    if self.verbose:
        sleep(3)

    # Elasticsearch query
    s = Search(using=self.client, index=self.indexpattern) \
        .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
        .filter("term", ResourceType="Payload")

    no_production_vos = self.config.get('noproduction', 'list').split(',')
    if vo_section in no_production_vos:
        s = s.filter("wildcard", VOName=voq)
    else:
        s = s.filter("wildcard", VOName=productioncheck)\
            .filter("term", VOName=voq)

    if self.verbose:
        # Call form prints the same text on Python 2 and parses on Python 3.
        print(s.to_dict())

    return s
async def get_lists(database: plugins.configuration.DBConfig) -> dict:
    """
    Collect every mailing list found in the mbox index.

    :param database: a Pony Mail database configuration
    :return: A dictionary of all mailing lists found, mapping the list name
             to ``{"count": <activity doc count>, "private": <bool>}``.
             ``count`` stays 0 for lists with no documents dated on or
             after ``ACTIVITY_TIMESPAN``.
    """
    lists = {}
    db = plugins.database.Database(database)
    limit = database.max_lists

    async def per_list_buckets(search):
        """Run `search` with a per-list terms aggregation; return its buckets."""
        # Aggregations are attached to the Search object in place.
        search.aggs.bucket("per_list", "terms", field="list_raw", size=limit)
        res = await db.search(index=db.dbs.db_mbox, body=search.to_dict(), size=0)
        return res["aggregations"]["per_list"]["buckets"]

    def to_list_name(raw_key):
        """Turn a raw key like ``<dev.example.org>`` into ``dev@example.org``."""
        return raw_key.strip("<>").replace(".", "@", 1)

    try:
        # Private emails first, then public, so a list holding both kinds
        # ends up marked public (the later assignment wins).
        for is_private in (True, False):
            s = Search(using=db.client, index=db.dbs.db_mbox).filter("term", private=is_private)
            for ml in await per_list_buckets(s):
                lists[to_list_name(ml["key"])] = {
                    "count": 0,  # Filled in by the activity pass below
                    "private": is_private,
                }

        # Get recent activity, if any
        s = Search(using=db.client, index=db.dbs.db_mbox)
        s = s.filter('range', date={'gte': ACTIVITY_TIMESPAN})
        for ml in await per_list_buckets(s):
            list_name = to_list_name(ml["key"])
            if list_name in lists:
                lists[list_name]["count"] = ml["doc_count"]
    finally:
        # Close the client even when a search fails, so the connection
        # is not leaked.
        await db.client.close()

    return lists
def test_query_combination():
    """Print the request bodies produced by combining queries with |, & and ~."""
    combined_queries = (
        Q("match", title='python') | Q("match", title='django'),  # OR
        Q("match", title='python') & Q("match", title='django'),  # AND
        ~Q("match", title="python"),  # NOT
    )
    for q in combined_queries:
        s = Search().query(q)
        print(s.to_dict())
async def get_lists(database: plugins.configuration.DBConfig) -> dict:
    """
    Collect every mailing list found in the ``<db_prefix>-mbox`` index.

    :param database: a Pony Mail database configuration
    :return: A dictionary of all mailing lists found, mapping the list name
             to ``{"count": <doc count>, "private": <bool>}``
    """
    lists = {}
    mbox_index = database.db_prefix + "-mbox"
    client = AsyncElasticsearch([
        {
            "host": database.hostname,
            "port": database.port,
            "url_prefix": database.url_prefix or "",
            "use_ssl": database.secure,
        },
    ])

    try:
        # Public emails first, then private. A list holding both kinds is
        # overwritten by the private pass, so it ends up with the private
        # count and private=True.
        for is_private in (False, True):
            s = Search(using=client, index=mbox_index).query("match", private=is_private)
            # NOTE(review): no explicit `size` on the terms aggregation, so
            # the server-side default bucket limit applies — confirm that is
            # intended.
            s.aggs.bucket("per_list", "terms", field="list_raw")
            res = await client.search(index=mbox_index, body=s.to_dict(), size=0)
            for ml in res["aggregations"]["per_list"]["buckets"]:
                # "<dev.example.org>" -> "dev@example.org"
                list_name = ml["key"].strip("<>").replace(".", "@", 1)
                lists[list_name] = {
                    "count": ml["doc_count"],
                    "private": is_private,
                }
    finally:
        # Close the client even when a search fails, so the connection
        # is not leaked.
        await client.close()

    return lists
def test_filters():
    """Print the bodies produced by .filter(), the bool long-hand and .exclude()."""
    wanted_tags = ['search', 'python']

    s = Search().filter('terms', tags=wanted_tags)
    print(s.to_dict())
    # {'query': {'bool': {'filter': [{'terms': {'tags': ['search', 'python']}}]}}}

    # Long-hand spelling of the same filter; note that it is applied on top
    # of the search built above.
    s = s.query('bool', filter=[Q('terms', tags=wanted_tags)])
    print(s.to_dict())

    s = s.exclude('terms', tags=wanted_tags)
    # or:
    # s = s.query('bool', filter=[~Q('terms', tags=['search', 'python'])])
    print(s.to_dict())
def test_sorting():
    """Print a search body sorted on three keys.

    The keys are ``category``, ``-title`` and a dict specifying ascending
    order with ``avg`` mode for ``lines``.
    """
    sort_keys = [
        'category',
        '-title',
        {"lines": {"order": "asc", "mode": "avg"}},
    ]
    s = Search().sort(*sort_keys)
    print(s.to_dict())
def _execute(search: Search) -> dict:
    """Execute `search` and return the response as a plain dict.

    When DEBUG logging is enabled, both the request body and the response
    are logged as indented JSON.

    :param search: the Search object to execute
    :return: the response converted with ``to_dict()``
    """
    debug_enabled = log.isEnabledFor(logging.DEBUG)

    if debug_enabled:
        log.debug(json.dumps(search.to_dict(), indent=4))

    # Convert the response once and reuse it for logging and the return value.
    result = search.execute().to_dict()

    if debug_enabled:
        log.debug(json.dumps(result, indent=4))

    return result
def do_search(self, search_params):
    """
    Run an ingredient search and return the recipes it yields.

    Params:
        search_params(dict):
            'ingredients': str of space delimited keywords
    """
    # Only ingredients are matched for now; a match on 'name' was left out
    # to keep this simple (it would add to the score but is not essential).
    ingredients_query = Q("match", ingredients=search_params['ingredients'])

    # Prepare the search against the configured index with our client
    es_search = Search(index=settings.SEARCH_SERVICE['ES_INDEX'])
    es_search = es_search.using(self.client).query(ingredients_query)

    # Cap the number of results, from settings
    max_results = settings.SEARCH_SERVICE['ES_MAX_RESULTS']
    es_search = es_search[:max_results]

    # Log the query_params and JSON query used
    logger.debug(json.dumps(search_params))
    logger.debug(json.dumps(es_search.to_dict()))

    es_search.execute()
    return get_recipes_from_search(es_search)
def query_datasets(self, index, offset, page_size):
    """Return the total number of datasets and one page of dataset names.

    All dataset keys are fetched with a terms aggregation (up to MAX_SIZE
    buckets) and then paginated in Python. The request is roughly:

    {
      "aggs": {
        "datasets": {
          "terms": {
            "field": "dataset.keyword",
            "size": MAX_SIZE
          }
        }
      },
      "size": 0
    }

    :return: tuple ``(total, names[offset:offset + page_size])``
    """
    s = Search(using=self.client, index=index).extra(size=0)
    s.aggs.bucket('datasets', A('terms', field='dataset.keyword', size=MAX_SIZE))
    if self.logger:
        self.logger.debug(s.to_dict())

    buckets = s.execute().aggregations.to_dict()['datasets']['buckets']
    datasets = [bucket['key'] for bucket in buckets]
    page = datasets[offset:offset + page_size]
    return len(datasets), page
def query_types_by_dataset(self, index, dataset, offset, page_size):
    """Return the total number of types in a dataset and one page of them.

    All type keys are fetched with a terms aggregation (up to MAX_SIZE
    buckets) and then paginated in Python. The request is roughly:

    {
      "query": {
        "term": {
          "dataset.keyword": "area_of_interest"
        }
      },
      "aggs": {
        "types": {
          "terms": {
            "field": "dataset_type.keyword",
            "size": MAX_SIZE
          }
        }
      },
      "size": 0
    }

    :return: tuple ``(total, types[offset:offset + page_size])``
    """
    dataset_query = Q('term', dataset__keyword=dataset)
    type_agg = A('terms', field='dataset_type.keyword', size=MAX_SIZE)

    s = Search(using=self.client, index=index).extra(size=0).query(dataset_query)
    s.aggs.bucket('types', type_agg)
    if self.logger:
        self.logger.debug(s.to_dict())

    buckets = s.execute().aggregations.to_dict()['types']['buckets']
    types = [bucket['key'] for bucket in buckets]
    page = types[offset:offset + page_size]
    return len(types), page
def get_permission(self, user_id, file_ids):
    """Fetch the best-scoring permission document for `user_id` over `file_ids`.

    A document must have a ``file_id`` in `file_ids`. It may additionally
    match any of three weighted clauses: the user is the owner (boost 100),
    ``share_mode`` is 1 and the user is in ``users_shared`` (boost 5 each),
    or ``share_mode`` is 2. Only ``owner``, ``share_mode`` and ``editable``
    are returned, and only the first hit is requested.

    :return: the executed search response
    """
    is_owner = query.Term(owner={'value': user_id, 'boost': 100})
    shared_with_user = query.Bool(must=[
        query.Term(share_mode={'value': 1, 'boost': 5}),
        query.Term(users_shared={'value': user_id, 'boost': 5}),
    ])
    share_mode_two = query.Term(share_mode=2)

    access_clauses = query.Bool(
        should=[is_owner, shared_with_user, share_mode_two])
    query_conditions = query.Bool(
        must=[query.Terms(file_id=file_ids), access_clauses])

    file_es = Search().query(query_conditions)
    file_es = file_es.source(['owner', 'share_mode', 'editable'])
    file_es = file_es[0:1]

    print(json.dumps(file_es.to_dict()))

    return file_es.using(self.es).index(self._index).execute()
def filter(cls, user_id, params, order=None, limit=None, offset=0):
    """Filter indexed objects using a query string.

    :param user_id: user identifier, used as the index name
    :type user_id: str
    :param params: parameters to add in query string, will be form of
                   name:value joined with AND
    :type params: dict
    :param order: accepted for interface compatibility; not used here
    :param limit: restrict result to this limit
    :type limit: int
    :param offset: start result list from this offset
    :type offset: int

    :return list
    """
    # XXX security: names and values are interpolated verbatim into a
    # query_string query, so callers must not pass untrusted input
    # unescaped.
    q_str = ' AND '.join('%s:%s' % (k, v) for k, v in params.items())

    client = cls.client()
    s = Search(using=client, index=user_id, doc_type=cls.doc_type). \
        query("query_string", query=q_str)

    if limit is None:
        # An offset without a limit used to raise TypeError on
        # `offset + None`; set only the starting position instead.
        if offset:
            s = s.extra(from_=offset)
    elif limit or offset:
        s = s[offset:(offset + limit)]

    log.debug("Filter index %s %s with : %s",
              user_id, cls.doc_type, s.to_dict())

    res = s.execute()
    return cls._format_list_result(res)
def search(self, criteria, key_list=None):
    """
    Build and run an ElasticSearch query, then format the hits.

    Args:
        criteria(schemas/search-layer-criteria.json): Criteria to use to
            initiate search; its "search" entry is turned into the query.
        key_list(list): List of keys to receive back from a search.

    Returns:
        dict: each element in the outer dict represents a search "hit" with
            the returned keys specified in key_list.
    """
    es_query = self._build_query(criteria.get("search"))

    request = Search(using=self.connection)
    request = request.index(self.index).sort("_uid").query(es_query)

    # Slicing a Search is the same as sending {from: 0, size: N} in the
    # request body; the upper limit comes from the elasticsearch config.
    request = request[0:self.upper_limit]

    self.search_container.logger.debug(
        "Executing the following search query: {0}".format(request.to_dict()))

    search_results = request.execute()
    formatter = SearchFormatter(criteria, search_results, key_list)
    return formatter.get_formatted_results()
async def elastic_filter(
    cls, *, query: str, offset: int, limit: int
) -> List[Union[SearchModel, Dict[str, Any]]]:
    """
    Look up existing models in ElasticSearch by a string query.

    A non-empty `query` is sent as a ``phrase_prefix`` multi-match over
    ``__es_search_fields``; an empty one leaves the search unrestricted.
    Each hit becomes ``__es_search_type(id=..., **source)``, or a plain
    dict when no search type is set.

    >>> await Model.elastic_filter(query="La")
    """
    elastic_query = Search()
    if query:
        prefix_match = MultiMatch(
            type="phrase_prefix", query=query, fields=cls.__es_search_fields
        )
        elastic_query = elastic_query.query(prefix_match)

    # Pagination window: from=offset, size=limit.
    window = slice(offset, offset + limit)
    elastic_query = elastic_query[window]

    search_res = await elastic_client.search(elastic_query.to_dict())
    hits = search_res.get("hits", {}).get("hits", [])

    build = cls.__es_search_type or dict
    results = []
    for hit in hits:
        logger.debug(hit)
        item = build(id=hit.get("_id"), **hit.get("_source", {}))
        results.append(item)
    return results
def getLastReported(client, endtime=None):
    """Return hosts that reported within the last year but not the last week.

    :param client: Elasticsearch client passed to ``Search(using=...)``
    :param endtime: exclusive upper bound on ``@timestamp``; defaults to the
        current time at call time. The previous default,
        ``datetime.datetime.now()``, was evaluated once at import and went
        stale in long-running processes.
    :return: dict mapping host name to ``{'max_time': <value>,
        'max_time_str': 'YYYY-mm-dd HH:MM:SS'}``
    """
    now = datetime.datetime.now()
    if endtime is None:
        endtime = now
    starttime = now - datetime.timedelta(days=365)
    week_ago = now - datetime.timedelta(days=7)

    s = Search(using=client, index="htcondor-xfer-stats2-*")
    s = s.filter('range', **{'@timestamp': {'gte': starttime, 'lt': endtime}})

    # Aggregations are attached in place: per host, the largest CreateDate.
    s.aggs.bucket('hosts', 'terms', size=MAXSZ, field='host.name.keyword') \
        .bucket('max_time', 'max', field='CreateDate')

    print(s.to_dict())
    response = s.execute()

    hosts = {}
    for tag in response.aggregations.hosts:
        if tag.max_time.value is None:
            continue

        # The value is divided by 1000 before conversion, i.e. treated as
        # epoch milliseconds.
        last_seen = datetime.datetime.fromtimestamp(tag.max_time.value / 1000)

        # Discount hosts seen in the last week
        if last_seen > week_ago:
            continue

        hosts[tag.key] = {
            'max_time': tag.max_time.value,
            'max_time_str': last_seen.strftime('%Y-%m-%d %H:%M:%S')
        }

    return hosts
def _filter(self, req=None, data=None):
    """Run every filter class over an empty Search and return the body dict.

    :param req: request to filter with; when falsy, a GET request for '/'
        carrying `data` is built with RequestFactory
    :param data: query data for the generated request (defaults to ``{}``)
    :return: ``to_dict()`` of the search after all filters were applied
    """
    if not req:
        req = RequestFactory().get('/', data=data or {})

    search = Search()
    for filter_class in self.filter_classes:
        backend = filter_class()
        search = backend.filter_queryset(req, search, self.view_class)
    return search.to_dict()
def query(self):
    """
    Build the Elasticsearch query for EfficiencyReport information.

    The search keeps ``Payload`` records from the configured flocking
    probes whose ``EndTime`` lies in ``[start_time, end_time)``. It requests
    only a terms aggregation on ``OIM_Facility``, ordered by the summed
    ``CoreHours``. The search is returned unexecuted.

    :return elasticsearch_dsl.Search: Search object containing ES query
    """
    # Gather parameters, format them for the query
    starttimeq = self.start_time.isoformat()
    endtimeq = self.end_time.isoformat()
    report_section = self.config[self.report_type.lower()]
    probelist = report_section['OSG_flocking_probe_list']

    if self.verbose:
        for detail in (self.indexpattern, probelist):
            self.logger.info(detail)

    # Elasticsearch query and aggregations
    end_time_window = {"gte": starttimeq, "lt": endtimeq}
    s = Search(using=self.client, index=self.indexpattern)
    s = s.filter("range", EndTime=end_time_window)
    s = s.filter("terms", ProbeName=probelist)
    s = s.filter("term", ResourceType="Payload")
    s = s[0:0]  # Size 0 to return only aggregations

    facility_bucket = s.aggs.bucket('OIM_Facility',
                                    'terms',
                                    field='OIM_Facility',
                                    size=MAXINT,
                                    order={'CoreHours': 'desc'})
    facility_bucket.metric('CoreHours', 'sum', field='CoreHours')

    print(s.to_dict())
    return s
def filter(cls, user_id, params, order=None, limit=None, offset=0):
    """Filter indexed objects using a query string.

    :param user_id: user identifier, used as the index name
    :type user_id: str
    :param params: parameters to add in query string, will be form of
                   name:value joined with AND
    :type params: dict
    :param order: accepted for interface compatibility; not used here
    :param limit: restrict result to this limit
    :type limit: int
    :param offset: start result list from this offset
    :type offset: int

    :return list
    """
    # XXX security: names and values are interpolated verbatim into a
    # query_string query, so callers must not pass untrusted input
    # unescaped.
    q_str = ' AND '.join('%s:%s' % (k, v) for k, v in params.items())

    client = cls.client()
    s = Search(using=client, index=user_id, doc_type=cls.doc_type). \
        query("query_string", query=q_str)

    if limit is None:
        # An offset without a limit used to raise TypeError on
        # `offset + None`; set only the starting position instead.
        if offset:
            s = s.extra(from_=offset)
    elif limit or offset:
        s = s[offset:(offset + limit)]

    log.debug("Filter index %s %s with : %s",
              user_id, cls.doc_type, s.to_dict())

    res = s.execute()
    return cls._format_list_result(res)
def search_query(queries, index):
    """Combine `queries` into one bool query, run it and return the hits.

    :param queries: iterable of dicts (``None`` entries are skipped), each
        with a ``'query'`` entry passed to ``Q`` and a ``'query_type'`` of
        ``'must'``, ``'must_not'`` or ``'filter'``
    :param index: index name to search
    :return: ``{'hits': <count>, 'data': [<hit as dict>, ...]}``
    """
    # TODO: create a base DocType class with method, get_index_by_name('dummy_movies')
    # TODO: create ability to search across indexes, using Search()
    s = Search(index=index)

    query_obj = {
        'must': [],
        'must_not': [],
        'filter': []
    }
    for query in queries:
        if query is not None:
            q = Q(query['query'])
            query_obj[query['query_type']].append(q)

    s = s.query(Q('bool', **query_obj))
    print('query --> ', s.to_dict())

    # Issue the count request once and reuse it for logging and the result.
    hit_count = s.count()
    print('count --> ', hit_count)

    return {
        'hits': hit_count,
        'data': [h.to_dict() for h in s]
    }
def recommends(a):
    """Return non-draft articles similar to `a`.

    A ``more_like_this`` query seeded with the article's primary key is
    wrapped in a ``function_score`` query using the AGEISM and RANDOMISE
    functions (summed). The first ``settings.SUGGESTION_COUNT`` hits are
    looked up among non-draft articles; hits without one are dropped.
    """
    seed_document = {
        "_id": a.pk,
        "_index": "article-index",
        "_type": "article_index"
    }
    similar = Search().query(
        "more_like_this",
        stop_words=MINUS_WORDS,
        like=seed_document,
        fields=["authors^2", "cats", "title^2", "content"])

    search_body = {
        "query": {
            "function_score": {
                "query": similar.to_dict()["query"],
                "functions": [AGEISM, RANDOMISE],
                "score_mode": "sum"
            }
        }
    }
    r = connections.get_connection().search(index="article-index",
                                            body=search_body)
    hits = r["hits"]["hits"][:settings.SUGGESTION_COUNT]

    candidates = [
        models.Article.nondraft.filter(pk=hit["_id"]).first() for hit in hits
    ]
    return [article for article in candidates if article is not None]
def search_buy(self, query, ptype, cond):
    """Search the 'buy' index for the given keywords.

    :param query: list of keywords; matched as terms against ``hand_kw``,
        ``jieba_kw`` and ``synonym``, and space-joined for a match on ``raw``
    :param ptype: key into ``self.pt_dict``; ``''`` disables the filter
    :param cond: key into ``self.c_dict``; ``''`` disables the filter
    :return: the executed search response
    """
    s = Search(index='buy')
    if ptype != '':
        s = s.filter('match', ptype=self.pt_dict[ptype])
    if cond != '':
        s = s.filter('match', cond=self.c_dict[cond])

    # Any of the keyword fields, or the raw text, may match. An earlier
    # draft expressed this as a multi_match on 'raw' OR-ed with the terms
    # queries.
    keyword_clauses = [
        Q("terms", hand_kw=query),
        Q("terms", jieba_kw=query),
        Q("terms", synonym=query),
        Q("match", raw=' '.join(query)),
    ]
    s = s.query(Q("bool", should=keyword_clauses))

    pprint(s.to_dict())
    return s.execute()
def index_single(es, network, channel, date, lines):
    """Replace the indexed lines for one network/channel/date log.

    Existing 'moffle' documents for the network/channel/date are deleted,
    then the new lines are bulk-indexed with up to three attempts.

    :param es: Elasticsearch client
    :param lines: iterable of ``(line_number, line)`` pairs
    :return: number of successfully indexed actions, or ``None`` when no
        line produced an index action
    :raises Exception: re-raises the last bulk error after three failures
    """
    # Delete existing
    delete_existing = Search(using=es, index='moffle') \
        .query("term", network=network) \
        .query("term", channel=channel) \
        .query("term", date=date)

    es.delete_by_query(
        index='moffle',
        body=delete_existing.to_dict(),
    )

    candidate_actions = (
        line_to_index_action(network, channel, date, i, line)
        for i, line in lines
    )
    actions = [action for action in candidate_actions if action]
    if not actions:
        return None

    # The attempt counter lives outside the loop. It used to be reset on
    # every iteration, so a persistent failure retried forever.
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            success_count, _ = bulk(es, actions)
        except Exception as e:
            log("{}/{}/{}: Attempt {}/{}: {}".format(
                network, channel, date, attempt, max_attempts, e))
            if attempt == max_attempts:
                raise
        else:
            log("{}/{}/{}: indexed {} lines".format(
                network, channel, date, success_count))
            return success_count
def search(self, doc_type, query=""):
    """
    Execute search query and retrieve results

    The query is lower-cased and matched against the ``title`` field. If
    `query` is not a string or `doc_type` is not a ``DocTypeMeta`` instance,
    no search is run and an empty list is returned.

    :param doc_type: Type in ElasticSearch
    :param query: search query
    :return: list with results
    """
    # (str, type(u"")) covers str and unicode on Python 2 and collapses to
    # str on Python 3, where the name `unicode` does not exist.
    string_types = (str, type(u""))
    if not (isinstance(query, string_types)
            and isinstance(doc_type, DocTypeMeta)):
        return []

    q = Q("multi_match",
          query=query.lower(),
          fields=["title"])

    s = Search()
    s = s.using(self.client)
    s = s.index(self.index_name)
    s = s.doc_type(doc_type)
    s = s.query(q)
    # Call form prints the same text on Python 2 and parses on Python 3.
    print("search query: " + str(s.to_dict()))

    response = s.execute()
    return list(response)
def process(self, start_time:datetime, end_time:datetime, input:DataFrame):
    """Append matching log documents to `input` as rows.

    Documents are taken from the first configured index only
    (``self.indices[0]``), restricted to ``@timestamp`` within
    ``[start_time, end_time]`` (both inclusive) and to every field/value
    pair in ``self.fields``. A list value adds one match clause per element.

    :param start_time: inclusive lower bound on ``@timestamp``
    :param end_time: inclusive upper bound on ``@timestamp``
    :param input: frame to extend; returned unchanged when there are no hits
    :return: a frame holding the rows of `input` followed by one row per
        hit, with every value converted to ``str`` and the index renumbered
    """
    # Local import: only the DataFrame name is known to be imported at
    # file level.
    import pandas as pd

    logger.debug('Start: %s End: %s Log: index=%s fields=%s',
                 start_time.isoformat(), end_time.isoformat(),
                 str(self.indices), str(self.fields))

    search = Search(using=self.client, index=self.indices[0])
    search = search.filter(Range(** {'@timestamp': {'gte': start_time.isoformat(), 'lte': end_time.isoformat()}}))

    for k, v in self.fields.items():
        values = v if isinstance(v, list) else [v]
        for sv in values:
            search = search.query("match", **{k: sv})

    logger.debug('ES Query: %s', str(search.to_dict()))
    response = search.execute()

    logger.debug('Results: success:%d failed:%d hits:%d',
                 response._shards.successful, response._shards.failed,
                 len(response.hits))

    rows = []
    for hit in response:
        # filter out the meta key and flatten the values
        row = {k: str(hit[k]) for k in hit if k != 'meta'}
        logger.debug(row)
        rows.append(row)

    if not rows:
        return input

    # One concat instead of DataFrame.append per hit: append copied the
    # whole frame each time and was removed in pandas 2.0.
    return pd.concat([input, pd.DataFrame(rows)], ignore_index=True)
def catalog_search():
    """Search imagery metadata using the request's query parameters.

    Recognised parameters:
      * ``st``    - keep documents with ``date >= st``
      * ``et``    - keep documents with ``date <= et``
      * ``wkt``   - keep documents whose ``bounds`` match the WKT geometry
      * ``debug`` - return the request body instead of running the search

    :return: list of ``_source`` dicts (at most 1000), the request body
        when ``debug`` is present, or an error dict when no filter
        parameter was recognised
    """
    # query_params is None when the request has no query string; fall back
    # to an empty dict so the membership tests below do not raise.
    params = app.current_request.query_params or {}

    s = Search(using=client, index='imagery', doc_type="metadata")
    filter_count = 0
    max_results = 1000

    if "st" in params:
        s = s.filter('range', date={'gte': params["st"]})
        filter_count += 1
    if "et" in params:
        s = s.filter('range', date={'lte': params["et"]})
        filter_count += 1
    if "wkt" in params:
        shape_filter = {"shape": wkt.loads(params["wkt"])}
        s = s.filter('geo_shape', bounds=shape_filter)
        filter_count += 1

    if "debug" in params:
        return s.to_dict()

    if filter_count == 0:
        return {'Search Failed': 'No search parameters were recognized'}

    s = s[0:max_results]
    result = s.execute().to_dict()
    return [hit["_source"] for hit in result["hits"]["hits"]]
def test_add_filter_no_param(self):
    """add_filters() must leave the search untouched when no args are given."""
    field_query = FieldSearchQuery(args={})
    untouched = Search()
    body_before = untouched.to_dict()

    body_after = field_query.add_filters(untouched, 'units', 'units').to_dict()

    # Expected: the query is not modified when there are no parameters
    self.assertEqual(body_before, body_after)
def _filter(self, req=None, data=None):
    """Run every filter class over an empty Search and return the body dict.

    :param req: request to filter with; when falsy, a GET request for '/'
        carrying `data` is built with RequestFactory
    :param data: query data for the generated request (defaults to ``{}``)
    :return: ``to_dict()`` of the search after all filters were applied
    """
    if not req:
        req = RequestFactory().get('/', data=data or {})

    search = Search()
    for filter_class in self.filter_classes:
        backend = filter_class()
        search = backend.filter_queryset(req, search, self.view_class)
    return search.to_dict()
def _data(self, request, cleaned, *args, explain=None, **kwargs):
    """Build and run the aggregation-only statistics search.

    No hits are requested (``size`` is 0). The aggregations cover documents
    by type and month, datasets by institution, category, tags, formats and
    openness scores.

    :param explain: when ``'1'``, return the request body instead of
        executing the search
    :return: the request body dict or the executed search response
    :raises falcon.HTTPBadRequest: when Elasticsearch raises a
        TransportError; the description carries the reported reason
    """
    search = Search(using=connection, index=indicies, extra={'size': 0})
    aggs = search.aggs

    # Documents per type, each split into monthly buckets.
    by_type = aggs.bucket('documents_by_type',
                          TermsFacet(field='_type').get_aggregation())
    monthly = DateHistogramFacet(field='created',
                                 interval='month',
                                 min_doc_count=0)
    by_type.bucket('by_month', monthly.get_aggregation())

    institution_facet = NestedFacet('institution',
                                    TermsFacet(field='institution.id'))
    aggs.bucket('datasets_by_institution', institution_facet.get_aggregation())

    category_facet = NestedFacet(
        'category',
        TermsFacet(field='category.id', min_doc_count=1, size=50))
    aggs.bucket('datasets_by_category', category_facet.get_aggregation())

    aggs.bucket('datasets_by_tags',
                TermsFacet(field='tags').get_aggregation())
    aggs.bucket('datasets_by_formats',
                TermsFacet(field='formats').get_aggregation())
    aggs.bucket('datasets_by_openness_scores',
                TermsFacet(field='openness_scores').get_aggregation())

    if explain == '1':
        return search.to_dict()

    try:
        return search.execute()
    except TransportError as err:
        raise falcon.HTTPBadRequest(
            description=err.info['error']['reason'])
def searchTweets(keyword, latlondist):
    """Scan the 'tweets' index and return the matches as a JSON string.

    :param keyword: phrase to match against ``text``; ``None`` disables it
    :param latlondist: JSON string with ``dist``, ``lat`` and ``lon`` keys
        for a geo-distance filter on ``location``; ``None`` disables it
    :return: JSON string ``{"tweets": [{"name", "text", "sentiment", "lat",
        "lon"}, ...]}``
    :raises KeyError: when the AWS access or secret key is not configured
    """
    # Fail fast when the AWS credentials are missing.
    if TwitterHelper.AWS_ACCESS_KEY is None:
        raise KeyError("Please set the AWS_ACCESS_KEY env. variable")

    if TwitterHelper.AWS_SECRET_KEY is None:
        raise KeyError("Please set the AWS_SECRET_KEY env. variable")

    s = Search()

    if latlondist is not None:
        locJson = json.loads(latlondist)
        s = s.query({"filtered": {"query": {"match_all": {}},
                                  "filter": {"geo_distance": {
                                      "distance": locJson['dist'],
                                      "location": {"lat": locJson['lat'],
                                                   "lon": locJson['lon']}}}}})

    if keyword is not None:
        s = s.query(Q("match_phrase", text=keyword))

    scanResp = helpers.scan(client=TwitterHelper.ES, query=s.to_dict(),
                            scroll="1m", index="tweets", timeout="1m")

    tweets = []
    for resp in scanResp:
        hit = resp['_source']
        tweets.append({
            'name': hit['name'],
            'text': hit['text'],
            'sentiment': hit['sentiment'],
            'lat': hit['location']['lat'],
            'lon': hit['location']['lon'],
        })

    return json.dumps({'tweets': tweets})
def get_registered_datasender_count(dbm, questionnaire_name):
    """Count non-void 'reporter' documents linked to a questionnaire.

    The questionnaire name is normalised with lowercase_and_strip_accents
    and matched as a term on ``projects_value``. The search is sent with
    ``search_type='count'`` and the reported total is returned.
    """
    es = Elasticsearch(
        hosts=[{"host": ELASTIC_SEARCH_HOST, "port": ELASTIC_SEARCH_PORT}])

    search = Search(using=es, index=dbm.database_name, doc_type='reporter')
    project_term = lowercase_and_strip_accents(questionnaire_name)
    search = search.query("term", projects_value=project_term)
    search = search.query("term", void=False)

    response = es.search(index=dbm.database_name,
                         doc_type='reporter',
                         body=search.to_dict(),
                         search_type='count')
    return response['hits']['total']
def search_my_data(self, username, q, offset, limit):
    """Build the 'des-files' search over a user's own files.

    Every space-separated token of `q` other than AND/OR/NOT (in any case)
    is wrapped in ``*`` wildcards. Results are limited to files the user
    has a permissions entry for, on the default storage system, under a
    path starting with the username and outside ``<username>/.Trash``.

    :return: the configured Search object, not executed
    """
    boolean_operators = ["AND", "OR", "NOT"]
    tokens = [
        token if token.upper() in boolean_operators else "*" + token + "*"
        for token in q.split(" ")
    ]
    q = " ".join(tokens)

    own_permission = Q("term", permissions__username=username)
    under_user_path = Q('bool',
                        must=[Q({'prefix': {'path._exact': username}})])
    trash_path = '{}/.Trash'.format(username)
    outside_trash = Q('bool',
                      must_not=[Q({'prefix': {'path._exact': trash_path}})])

    search = Search(index='des-files')
    search = search.filter("nested", path="permissions", query=own_permission)
    search = search.query("query_string",
                          query=q,
                          fields=["name", "name._exact", "keywords"])
    search = search.query(under_user_path)
    search = search.filter("term", system='designsafe.storage.default')
    search = search.query(outside_trash)
    search = search.extra(from_=offset, size=limit)

    logger.info(search.to_dict())
    return search
def get_update_list_single_process(self):
    """Find the units that need updating, per source type.

    For every type in ``self.source_keymap`` the whole index is scanned,
    fetching only the id, ``<type>.sidstOpdateret`` (last updated) and
    ``<type>.samtId``. A unit is collected when its ``samtId`` is greater
    than the one recorded in ``self.make_samtid_dict()``; units unknown to
    that map are compared against a dummy entry with ``samtid=-1``.

    The stored sidstopdateret may be inaccurate and far too old, so the
    largest value in the database cannot simply be used. The original
    author notes that the scan downloads roughly 600 documents a second.

    :return: dict mapping each type to ``{'units': [(enhedsnummer,
        sidstopdateret), ...], 'sidstopdateret': <earliest collected>}``.
        With no collected units, ``sidstopdateret`` stays at "now + 1 day"
        in UTC.
    """
    enh_samtid_map = self.make_samtid_dict()

    # Starting point for the per-type minimum: later than any real update.
    tomorrow_utc = (datetime.datetime.utcnow().replace(tzinfo=pytz.utc)
                    + datetime.timedelta(days=1))
    update_dicts = {
        type_name: {'units': [], 'sidstopdateret': tomorrow_utc}
        for type_name in self.source_keymap.values()
    }
    if len(enh_samtid_map) == 0:
        return update_dicts

    dummy = CvrConnection.update_info(samtid=-1,
                                      sidstopdateret=self.dummy_date)
    print('Get update time for all data')

    for _type in self.source_keymap.values():
        sidst_key = '{0}.sidstOpdateret'.format(_type)
        samt_key = '{0}.samtId'.format(_type)
        field_list = ['_id', sidst_key, samt_key]

        search = Search(using=self.elastic_client, index=self.index)
        search = search.query('match_all')
        search = search.fields(fields=field_list)
        search = search.params(scroll=self.elastic_search_scroll_time,
                               size=2**12)
        print('ElasticSearch Query: ', search.to_dict())

        type_entry = update_dicts[_type]
        for cvr_update in tqdm.tqdm(search.scan()):
            enhedsnummer = int(cvr_update.meta.id)
            raw_dat = cvr_update.to_dict()

            # Requested fields come back as lists; take the first value.
            samtid = raw_dat[samt_key][0] if samt_key in raw_dat else None
            sidstopdateret = (raw_dat[sidst_key][0]
                              if sidst_key in raw_dat else None)
            if sidstopdateret is None or samtid is None:
                continue

            if enhedsnummer in enh_samtid_map:
                current_update = enh_samtid_map[enhedsnummer]
            else:
                current_update = dummy

            if samtid > current_update.samtid:
                utc_sidstopdateret = utc_transform(sidstopdateret)
                type_entry['sidstopdateret'] = min(
                    utc_sidstopdateret, type_entry['sidstopdateret'])
                type_entry['units'].append(
                    (enhedsnummer, utc_sidstopdateret))

    print('Update Info: ')
    print([(k, v['sidstopdateret'], len(v['units']))
           for k, v in update_dicts.items()])
    return update_dicts
def test_simple_search():
    """Run a simple match query on ``title`` and print each hit's title."""
    s = Search().query("match", title="python")
    # {'query': {'match': {'title': 'python'}}}
    print(s.to_dict())

    response = s.execute()
    # Was the statement form `print response`, which is a SyntaxError on
    # Python 3 and inconsistent with the print() calls around it.
    print(response)

    for hit in s:
        print(hit.title)
async def get_lists(database: plugins.configuration.DBConfig) -> dict:
    """
    Collect every mailing list found in the ``<db_prefix>-mbox`` index.

    :param database: a Pony Mail database configuration
    :return: A dictionary of all mailing lists found, mapping the list name
             to ``{"count": <doc count>, "private": <bool>}``
    """
    lists = {}
    db = plugins.database.Database(database)
    mbox_index = database.db_prefix + "-mbox"
    limit = 8192  # maximum number of list buckets requested per aggregation

    try:
        # Public emails first, then private. A list holding both kinds is
        # overwritten by the private pass, so it ends up with the private
        # count and private=True.
        for is_private in (False, True):
            s = Search(using=db.client, index=mbox_index).filter("term", private=is_private)
            # Aggregations are attached to the Search object in place.
            s.aggs.bucket("per_list", "terms", field="list_raw", size=limit)
            res = await db.search(index=mbox_index, body=s.to_dict(), size=0)
            for ml in res["aggregations"]["per_list"]["buckets"]:
                # "<dev.example.org>" -> "dev@example.org"
                list_name = ml["key"].strip("<>").replace(".", "@", 1)
                lists[list_name] = {
                    "count": ml["doc_count"],
                    "private": is_private,
                }
    finally:
        # Close the client even when a search fails, so the connection
        # is not leaked.
        await db.client.close()

    return lists
def delete(self, region, date):
    """Delete every document for `region` on `date`.

    The index and document type come from the application config; both
    conditions are applied as term filters of a delete-by-query request.
    """
    index = app.config['ELASTICSEARCH_INDEX']
    doc_type = app.config['ELASTICSEARCH_TYPE']

    matching = Search(using=self.es, index=index, doc_type=doc_type)
    matching = matching.filter('term', region=region)
    matching = matching.filter('term', date=date)

    self.es.delete_by_query(index=index,
                            doc_type=doc_type,
                            body=matching.to_dict())
def query(self):
    """Return the request body built from this object's filters and queries.

    All of ``self.filters`` are applied first, then all of ``self.queries``,
    each in order.
    """
    search = Search()
    for search_filter in self.filters:
        search = search.filter(search_filter)
    for search_query in self.queries:
        search = search.query(search_query)
    return search.to_dict()
def _from_uuid(cls, uuid):
    """Build an instance from the first document matching `uuid`.

    ``uuid`` and ``epoch`` are lifted out of the document (``None`` when
    absent); the remaining fields are stored as ``data``. Indexing the first
    hit raises if the search returned none.
    """
    response = Search(using=es, index=CLUSTER_NAME) \
        .query('match', uuid=uuid) \
        .execute()
    fields = response[0].to_dict()

    instance = cls()
    instance.uuid = fields.pop('uuid', None)
    instance.epoch = fields.pop('epoch', None)
    instance.data = fields
    return instance
def test_post_filter(app):
    """Test post filter.

    A request argument with a filter definition must produce a
    ``post_filter``; an unknown argument must not.
    """
    urlargs = MultiDict()
    defs = {
        'type': terms_filter('type'),
        'subtype': terms_filter('subtype'),
    }

    with app.test_request_context('?type=test'):
        search = Search().query(Q(query='value'))
        search, args = _post_filter(search, urlargs, defs)
        body = search.to_dict()
        assert 'post_filter' in body
        assert body['post_filter'] == {'terms': {'type': ['test']}}
        assert args['type'] == 'test'

    with app.test_request_context('?anotertype=test'):
        search = Search().query(Q(query='value'))
        search, args = _post_filter(search, urlargs, defs)
        assert 'post_filter' not in search.to_dict()
def _queryElasticsearch(self, from_date, to_date, query):
    """Yield every raw-index document with EndTime between the two dates.

    A default Elasticsearch client is created and the index name is read
    from the ``ElasticSearch``/``raw_index`` configuration entry. The
    `query` argument is accepted but not used.
    """
    logging.debug("Connecting to ES")
    client = Elasticsearch()

    logging.debug("Beginning search")
    raw_index = self._config['ElasticSearch']['raw_index']
    end_time_range = {'EndTime': {'from': from_date, 'to': to_date}}
    s = Search(using=client, index=raw_index).filter('range', **end_time_range)

    logging.debug("About to execute query:\n%s" % str(s.to_dict()))

    # scan() iterates over all matching documents.
    for document in s.scan():
        yield document
def test_default_facets_factory(app):
    """Test aggregations.

    For an index with facet definitions, the factory must add the
    aggregations, a post filter and a query filter. For an index without
    definitions it must add none of them.
    """
    defs = {
        'aggs': {
            'type': {'terms': {'field': 'upload_type'}},
            'subtype': {'terms': {'field': 'subtype'}},
        },
        'filters': {
            'subtype': terms_filter('subtype'),
        },
        'post_filters': {
            'type': terms_filter('type'),
        },
    }
    app.config['RECORDS_REST_FACETS']['testidx'] = defs

    with app.test_request_context('?type=a&subtype=b'):
        search = Search().query(Q(query='value'))
        search, urlkwargs = default_facets_factory(search, 'testidx')
        configured = search.to_dict()
        assert configured['aggs'] == defs['aggs']
        assert 'post_filter' in configured
        assert configured['query']['bool']['filter'][0]['terms']['subtype']

        search = Search().query(Q(query='value'))
        search, urlkwargs = default_facets_factory(search, 'anotheridx')
        unconfigured = search.to_dict()
        assert 'aggs' not in unconfigured
        assert 'post_filter' not in unconfigured
        assert 'bool' not in unconfigured['query']
def test_query_filter(app):
    """Test query filter."""
    urlargs = MultiDict()
    defs = {
        'type': terms_filter('type'),
        'subtype': terms_filter('subtype'),
    }

    # A known facet wraps the original query in a bool with a filter.
    with app.test_request_context('?type=test'):
        search = Search().query(Q('multi_match', query='value'))
        body = search.to_dict()
        search, args = _query_filter(search, urlargs, defs)
        filtered = search.to_dict()
        assert 'post_filter' not in filtered
        assert filtered['query']['bool']['must'][0] == body['query']
        assert filtered['query']['bool']['filter'] == [
            {'terms': {'type': ['test']}}
        ]
        assert args['type'] == 'test'

    # An unknown argument leaves the query body unchanged.
    with app.test_request_context('?anotertype=test'):
        search = Search().query(Q(query='value'))
        body = search.to_dict()
        query, args = _query_filter(search, urlargs, defs)
        assert query.to_dict() == body
def index_single(es, network, channel, date, lines):
    """Re-index one day of one channel's log into the ``moffle`` index.

    Documents already stored for the same network/channel/date are deleted
    first, then every line that matches ``LINE`` is bulk-indexed as a
    ``logline`` document. Lines that do not match are skipped silently.

    :param es: Elasticsearch client used for the delete and the bulk index
    :param network: network name stored on every document
    :param channel: channel name stored on every document
    :param date: date value stored on every document
    :param lines: iterable of ``(line_no, line)`` pairs
    """
    log("Processing {}/{}/{}".format(network, channel, date))

    # Delete existing
    stale = Search(using=es, index='moffle')
    for field, wanted in (('network', network),
                          ('channel', channel),
                          ('date', date)):
        stale = stale.query("term", **{field: wanted})
    es.delete_by_query(index='moffle', body=stale.to_dict())

    actions = []
    for line_no, line in lines:
        match = LINE.match(line)
        if not match:
            # What happened here?
            continue

        doc = match.groupdict()
        doc['text'] = doc['text'].strip()
        doc['line_type'] = TYPE_MAP[doc['line_type']]
        doc.update({
            '_index': 'moffle',
            '_type': 'logline',
            'network': network,
            'channel': channel,
            'date': date,
            'line_no': line_no,
        })
        actions.append(doc)

    # Nothing parsed means nothing to send.
    if actions:
        log(bulk(es, actions))
def build_search_query(params: dict):
    """Search the ``logger`` index with a query string built from ``params``.

    Non-empty ``remote_host`` and ``application_name`` entries are each
    turned into a clause followed by ``AND``; ``params['body']`` is then
    appended as the final part of the query string.

    :param params: request parameters; must contain a ``body`` key
    :return: list of raw hit dicts, or ``None`` when executing the search
        fails (the exception is printed)
    :raises KeyError: if ``params`` has no ``body`` key
    """
    s = Search(using=client, index="logger")

    query_string = ""
    for key, value in params.items():
        if key in ("remote_host", "application_name") and value != "":
            query_string += "'" + key + "': '" + value + "' AND "
    query_string += params['body']
    print(query_string)

    # Search objects are immutable: query() returns a modified copy, so the
    # result has to be kept or the query is silently dropped.
    s = s.query(Q("query_string", query=query_string))
    print(s.to_dict())

    try:
        response = s.execute()
        return list(response.to_dict()['hits']['hits'])
    except Exception as e:
        # Best effort: report the failure and let the caller see None.
        print(e)
        return None
def search(self, criteria, key_list=None):
    """
    Run an ElasticSearch query built from the given criteria.

    Args:
        criteria(schemas/search-layer-criteria.json): Criteria to use to initiate search.
        key_list(list): List of keys to receive back from a search.

    Returns:
        dict: each element in the outer dict represents a search "hit" with the returned keys specified in key_list.
    """
    es_query = self._build_query(criteria.get("search"))
    request = Search(using=self.connection).index(self.index).sort("_uid").query(es_query)

    # Slicing a Search is equivalent to sending {from: 0, size: upper_limit};
    # upper_limit comes from the elasticsearch config.
    request = request[0:self.upper_limit]

    self.search_container.logger.debug(
        "Executing the following search query: {0}".format(request.to_dict()))
    search_results = request.execute()

    return SearchFormatter(criteria, search_results, key_list).get_formatted_results()
def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.

    The method builds one elasticsearch_dsl ``Search``: filters from the
    regular parameters, then returned fields, sorting, pagination, facet,
    signature and histogram aggregations, and finally executes it.

    :raises MissingArgumentError: when ``_fields`` is missing or empty
    :raises BadArgumentError: when ``_results_number`` is above 1000
    :returns: ``{'query': ..., 'indices': ...}`` when ``_return_query`` is
        set, without contacting Elasticsearch.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source, and no ``return`` is visible after the
    execution loop -- the function looks truncated; confirm against the
    full file.
    """
    # Require that the list of fields be passed.
    if not kwargs.get('_fields'):
        raise MissingArgumentError('_fields')
    self.all_fields = kwargs['_fields']
    self._build_fields()

    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = []
    histogram_intervals = {}

    for field, sub_params in params.items():
        # Filters built for the same field are combined into sub_filters.
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                    if results_number > 1000:
                        raise BadArgumentError('_results_number too large')
                elif param.name == '_facets_size':
                    facets_size = param.value[0]

                for f in self.histogram_fields:
                    if param.name == '_histogram_interval.%s' % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]
            name = '%s.%s' % (
                field_data['namespace'],
                field_data['in_database_name']
            )

            # Normalize values: dates become strings, enum values and
            # operator-less strings are lower-cased.
            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            # Operators needing wildcards, and the associated value
            # transformation with said wildcards.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '$': '%s*',  # starts with
                '^': '*%s'  # ends with
            }
            # Operators needing ranges, and the associated Elasticsearch
            # comparison operator.
            operator_range = {
                '>': 'gt',
                '<': 'lt',
                '>=': 'gte',
                '<=': 'lte',
            }

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]

                    if not isinstance(val, basestring) or ' ' not in val:
                        # There's only one term and no white space, this
                        # is a simple term filter.
                        filter_value = val
                    else:
                        # If the term contains white spaces, we want to
                        # perform a phrase query.
                        filter_type = 'query'
                        args = Q(
                            'simple_query_string',
                            query=param.value[0],
                            fields=[name],
                            default_operator='and',
                        ).to_dict()
                else:
                    # There are several terms, this is a terms filter.
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_range:
                filter_type = 'range'
                filter_value = {
                    operator_range[param.operator]: param.value
                }
            elif param.operator == '__null__':
                filter_type = 'missing'
                args['field'] = name
            elif param.operator in operator_wildcards:
                filter_type = 'query'

                # Wildcard operations are better applied to a non-analyzed
                # field (called "full") if there is one.
                if field_data['has_full_version']:
                    name = '%s.full' % name

                q_args = {}
                q_args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
                query = Q('wildcard', **q_args)
                args = query.to_dict()

            if filter_value is not None:
                args[name] = filter_value

            if args:
                new_filter = F(filter_type, **args)
                if param.operator_not:
                    new_filter = ~new_filter

                # Range filters on the same field are ANDed together;
                # every other kind of filter is ORed.
                if sub_filters is None:
                    sub_filters = new_filter
                elif filter_type == 'range':
                    sub_filters &= new_filter
                else:
                    sub_filters |= new_filter

                continue

        if sub_filters is not None:
            filters.append(sub_filters)

    # All per-field filters must match.
    search = search.filter(F('bool', must=filters))

    # Restricting returned fields.
    fields = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue

            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product and descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            field_name = self.get_field_name(value, full=False)

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    # NOTE(review): results_from, results_number and facets_size are only
    # bound if the matching meta parameters were seen in the loop above;
    # presumably get_parameters() always supplies them -- confirm.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    for param in params['_facets']:
        for value in param.value:
            if not value:
                continue

            field_name = self.get_field_name(value)
            search.aggs.bucket(
                value,
                'terms',
                field=field_name,
                size=facets_size,
            )

    # Create signature aggregations.
    if params.get('_aggs.signature'):
        sig_bucket = A(
            'terms',
            field=self.get_field_name('signature'),
            size=facets_size,
        )
        for param in params['_aggs.signature']:
            for value in param.value:
                if not value:
                    continue

                if value.startswith('_histogram.'):
                    # This is a histogram aggregation we want to run,
                    # not a terms aggregation.
                    field_name = value[len('_histogram.'):]
                    if field_name not in self.histogram_fields:
                        continue

                    # "cond and a or b": date fields get a date_histogram,
                    # everything else a plain histogram.
                    histogram_type = (
                        self.all_fields[field_name]['query_type'] == 'date'
                        and 'date_histogram' or 'histogram'
                    )
                    sig_bucket.bucket(
                        'histogram_%s' % field_name,
                        histogram_type,
                        field=self.get_field_name(field_name),
                        interval=histogram_intervals[field_name],
                    )
                else:
                    sig_bucket.bucket(
                        value,
                        'terms',
                        field=self.get_field_name(value),
                        size=facets_size,
                    )

        search.aggs.bucket('signature', sig_bucket)

    # Create histograms.
    for f in self.histogram_fields:
        if params.get('_histogram.%s' % f):
            histogram_type = (
                self.all_fields[f]['query_type'] == 'date'
                and 'date_histogram' or 'histogram'
            )
            date_bucket = A(
                histogram_type,
                field=self.get_field_name(f),
                interval=histogram_intervals[f],
            )
            for param in params['_histogram.%s' % f]:
                for value in param.value:
                    if not value:
                        continue

                    field_name = self.get_field_name(value)
                    val_bucket = A(
                        'terms',
                        field=field_name,
                        size=facets_size,
                    )
                    date_bucket.bucket(value, val_bucket)

            search.aggs.bucket('histogram_%s' % f, date_bucket)

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            # The total comes from a separate count() call.
            total = search.count()
            aggregations = self.format_aggregations(results.aggregations)
            break  # Yay! Results!
        except NotFoundError, e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                break
def generate_where(self, query, where, is_root=False):
    """Translate a ``where`` clause tree into an Elasticsearch search body.

    Each entry of ``where["clauses"]`` is turned into a must/should/filter
    clause through ``translate_clause_helper``; nested ``"clauses"``
    entries are compiled by a recursive call and collected as sub-queries.

    :param query: the full query, passed through to the recursive call and
        to ``generate_query_boilerplate``
    :param where: dict with at least ``"clauses"`` and ``"type"``;
        ``"variable"``, ``"fields"`` and ``"filters"`` are read when needed
    :param is_root: True for the outermost call; selects
        ``generate_query_boilerplate`` over ``generate_source_fields``
    :returns: a dict with keys ``"search"`` and ``"type"``; if any
        sub-queries were produced, the list of sub-query results with this
        level's dict appended last.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; the placement of some inner blocks
    (notably the ``for f in c["fields"]`` loop) should be confirmed
    against the full file.
    """
    where_clauses = where["clauses"]
    source_fields = set()
    musts = []
    shoulds = []
    filters = []
    must_nots = []
    sub_queries = []
    # Optional clauses grouped by predicate so that several alternatives
    # for one predicate can be merged into a single DisMax later.
    shoulds_by_predicate = {}
    unbound_subquery_variables = set()
    for clause in where_clauses:
        # Operator clauses (e.g. union) are handled in the second pass.
        if "operator" in clause:
            continue
        if "fields" in clause:
            fields = clause["fields"]
            source_fields |= set([field["name"] for field in fields])
        if("constraint" in clause):
            es_clause = self.translate_clause_helper(clause, fields, True)
        elif "clauses" in clause:
            sub_query = self.generate_where(query, clause, False)
            sub_query["clause_fields"] = []
            # if sub_query contains variable of parent query
            # create clause that filters on variable of parent query
            contains_parent_variable = False
            for c in clause["clauses"]:
                if "variable" in c:
                    if c["variable"] == where["variable"]:
                        contains_parent_variable = True
                    else:
                        unbound_subquery_variables.add(c["variable"])
                        for f in c["fields"]:
                            if not f["name"].startswith("content") and not f["name"] == "raw_content":
                                sub_query["clause_fields"].append({"name": f["name"], "variable": c["variable"]})
            if contains_parent_variable:
                # Placeholder constraint on the parent's fields, tagged
                # with the nested clause's id.
                sub_query_clause = {}
                sub_query_clause["constraint"] = "__placeholder__"
                sub_query_clause["isOptional"] = False
                sub_query_clause["fields"] = where["fields"]
                source_fields |= set([field["name"] for field in where["fields"]])
                sub_query_clause["_id"] = clause["_id"]
                es_clause = self.translate_clause_helper(sub_query_clause, where["fields"], True)
                sub_query["clause_fields"] = where["fields"]
                sub_query["clause_id"] = clause["_id"]
            else:
                es_clause = None
                #sub_query["clause_fields"] = where["fields"]
            sub_queries.append(sub_query)
            # else
            # create clause that's constrained on variable of clause
            #clause["constraint"] = "__placeholder__"
            #es_clause = self.translate_clause_helper(clause, fields, True)
            #sub_query["clause_name"] = es_clause["_name"]
        else:
            # this is a we need an answer for this clause
            # Translated unless this is the root call and the
            # "filter_for_fields_of_unbound_variables" option is set falsy.
            if not is_root or "filter_for_fields_of_unbound_variables" \
                    not in self.elasticsearch_compiler_options or \
                    self.elasticsearch_compiler_options["filter_for_fields_of_unbound_variables"]:
                es_clause = self.translate_clause_helper(clause, fields, False)
            else:
                es_clause = None
        if es_clause:
            if clause.get("isOptional", False):
                predicate = clause.get("predicate")
                if predicate not in shoulds_by_predicate:
                    shoulds_by_predicate[predicate] = list()
                shoulds_by_predicate.get(predicate).append(es_clause)
            else:
                musts.append(es_clause)
    # Second pass: clauses on variables that only nested sub-queries bind
    # become placeholder filters, recorded on the last sub-query.
    if unbound_subquery_variables:
        sub_query = sub_queries[-1]
        sub_query["variable_to_clause_id"] = {}
        for clause in where_clauses:
            if "operator" in clause:
                if "union" == clause["operator"].lower():
                    union_shoulds = []
                    for uc in clause["clauses"]:
                        if "variable" in uc and uc["variable"] in unbound_subquery_variables:
                            uc["constraint"] = "__placeholder__"
                            uc_es_clause = self.translate_clause_helper(uc, uc["fields"], True)
                            if uc["variable"] not in sub_queries[-1]["variable_to_clause_id"]:
                                sub_query["variable_to_clause_id"][uc["variable"]] = []
                            variable_to_clause_id = sub_query["variable_to_clause_id"][uc["variable"]]
                            variable_to_clause_id.append(uc["_id"])
                            union_shoulds.append(uc_es_clause)
                    union_q = Bool(should=union_shoulds)
                    # must or filter?
                    filters.append(union_q)
            elif "constraint" not in clause and "clauses" not in clause:
                if "variable" in clause and\
                        clause["variable"] in unbound_subquery_variables:
                    clause["constraint"] = "__placeholder__"
                    es_clause = self.translate_clause_helper(clause, clause["fields"], True)
                    if clause["variable"] not in sub_queries[-1]["variable_to_clause_id"]:
                        sub_query["variable_to_clause_id"][clause["variable"]] = []
                    variable_to_clause_id = sub_query["variable_to_clause_id"][clause["variable"]]
                    variable_to_clause_id.append(clause["_id"])
                    # must or filter?
                    filters.append(es_clause)
    # Several optional clauses for one predicate collapse into a DisMax.
    for key, value in shoulds_by_predicate.iteritems():
        if len(value) > 1:
            shoulds.append(DisMax(queries=value))
        else:
            shoulds.append(value[0])
    if "filters" in where:
        filter_clauses = where["filters"]
        for f in filter_clauses:
            source_fields = self.generate_filter(f, filters, source_fields)
    if self.elasticsearch_compiler_options.get("convert_text_filters_to_shoulds", False):
        # DisMax filters without any Range sub-query are moved from
        # filters to shoulds; everything else stays a filter.
        valid_filters = list()
        converted_filters = list()
        for f in filters:
            is_matches = False
            if isinstance(f, DisMax):
                is_matches = True
                for q in f.queries:
                    if isinstance(q, Range):
                        is_matches = False
                        break
            if is_matches:
                converted_filters.append(f)
            else:
                valid_filters.append(f)
        shoulds.extend(converted_filters)
        filters= valid_filters
    q = Bool(must=musts, should=shoulds, filter=filters, must_not=must_nots)
    if ("boost_musts" in self.elasticsearch_compiler_options and len(musts) > 0) or\
            "boost_shoulds" in self.elasticsearch_compiler_options:
        if "boost_musts" in self.elasticsearch_compiler_options\
                and len(musts) == 1:
            # A single must is simply demoted to a should.
            shoulds.extend(musts)
            q = Bool(should=shoulds, filter=filters, must_not=must_nots)
        else:
            boost = 10.0
            weighted_by_musts = []
            musts_temp = musts
            if "boost_musts" in self.elasticsearch_compiler_options:
                shoulds.extend(musts)
                musts_temp = []
            if len(shoulds) > 0:
                extra_minimum_should_match = 0
                if len(shoulds) >= 2 and "boost_shoulds"\
                        in self.elasticsearch_compiler_options:
                    extra_minimum_should_match = 1
                # One Bool per required number of matching shoulds, from
                # all of them downwards, halving the boost each step.
                for x in range(0, len(shoulds) - extra_minimum_should_match):
                    weighted_q = Bool(
                        must=musts_temp,
                        should=shoulds,
                        filter=filters,
                        must_not=must_nots,
                        boost=boost,
                        minimum_should_match=len(shoulds) - x)
                    weighted_by_musts.append(weighted_q)
                    boost = boost / 2
                weighted_must = Bool(should=weighted_by_musts, disable_coord=True)
                q = weighted_must
    s = Search()
    s.query = q
    if is_root:
        s = self.generate_query_boilerplate(query, s, source_fields)
    else:
        s = self.generate_source_fields(s, source_fields)
    es_result = {}
    es_result["search"] = self.clean_dismax(s.to_dict())
    es_result["type"] = where["type"]
    if len(sub_queries) > 0:
        # With sub-queries the return value is a list, this level last.
        sub_queries.append(es_result)
        return sub_queries
    return es_result
def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.

    The method builds one elasticsearch_dsl ``Search``: filters from the
    regular parameters, then returned fields, sorting, pagination and
    aggregations, and finally executes it, dropping indices that
    Elasticsearch reports as missing.

    :raises MissingArgumentError: when ``_fields`` is missing or empty
    :raises BadArgumentError: when ``_results_number`` is above 1000 or
        negative, when ``_facets_size`` is above 10000, or when a
        ``RequestError`` can be traced back to one of the keyword arguments
    :returns: ``{'query': ..., 'indices': ...}`` when ``_return_query`` is
        set, without contacting Elasticsearch.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source, and no ``return`` is visible after the
    execution loop -- the function looks truncated; confirm against the
    full file.
    """
    # Require that the list of fields be passed.
    if not kwargs.get('_fields'):
        raise MissingArgumentError('_fields')
    self.all_fields = kwargs['_fields']

    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = []
    histogram_intervals = {}

    for field, sub_params in params.items():
        # Filters built for the same field are combined into sub_filters.
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                # By default, all param values are turned into lists,
                # even when they have and can have only one value.
                # For those we know there can only be one value,
                # so we just extract it from the made-up list.
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                    if results_number > 1000:
                        raise BadArgumentError(
                            '_results_number',
                            msg=(
                                '_results_number cannot be greater '
                                'than 1,000'
                            )
                        )
                    if results_number < 0:
                        raise BadArgumentError(
                            '_results_number',
                            msg='_results_number cannot be negative'
                        )
                elif param.name == '_facets_size':
                    facets_size = param.value[0]
                    # Why cap it?
                    # Because if the query is covering a lot of different
                    # things you can get a really really large query
                    # which can hog resources excessively.
                    # Downloading, as an example, 100k facets (and 0 hits)
                    # when there is plenty of data yields a 11MB JSON
                    # file.
                    if facets_size > 10000:
                        raise BadArgumentError(
                            '_facets_size greater than 10,000'
                        )

                for f in self.histogram_fields:
                    if param.name == '_histogram_interval.%s' % f:
                        histogram_intervals[f] = param.value[0]

                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]
            name = self.get_full_field_name(field_data)

            # Normalize values: dates become strings, enum values and
            # operator-less strings are lower-cased.
            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            # Operators needing wildcards, and the associated value
            # transformation with said wildcards.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '^': '%s*',  # starts with
                '$': '*%s'  # ends with
            }
            # Operators needing ranges, and the associated Elasticsearch
            # comparison operator.
            operator_range = {
                '>': 'gt',
                '<': 'lt',
                '>=': 'gte',
                '<=': 'lte',
            }

            args = {}
            filter_type = 'term'
            filter_value = None

            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]

                    if not isinstance(val, basestring) or ' ' not in val:
                        # There's only one term and no white space, this
                        # is a simple term filter.
                        filter_value = val
                    else:
                        # If the term contains white spaces, we want to
                        # perform a phrase query.
                        filter_type = 'query'
                        args = Q(
                            'simple_query_string',
                            query=param.value[0],
                            fields=[name],
                            default_operator='and',
                        ).to_dict()
                else:
                    # There are several terms, this is a terms filter.
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_range:
                filter_type = 'range'
                filter_value = {
                    operator_range[param.operator]: param.value
                }
            elif param.operator == '__null__':
                filter_type = 'missing'
                args['field'] = name
            elif param.operator == '__true__':
                filter_type = 'term'
                filter_value = True
            elif param.operator == '@':
                filter_type = 'regexp'
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator in operator_wildcards:
                filter_type = 'query'

                # Wildcard operations are better applied to a non-analyzed
                # field (called "full") if there is one.
                if field_data['has_full_version']:
                    name = '%s.full' % name

                q_args = {}
                q_args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
                query = Q('wildcard', **q_args)
                args = query.to_dict()

            if filter_value is not None:
                args[name] = filter_value

            if args:
                new_filter = F(filter_type, **args)
                if param.operator_not:
                    new_filter = ~new_filter

                # Range filters on the same field are ANDed together;
                # every other kind of filter is ORed.
                if sub_filters is None:
                    sub_filters = new_filter
                elif filter_type == 'range':
                    sub_filters &= new_filter
                else:
                    sub_filters |= new_filter

                continue

        if sub_filters is not None:
            filters.append(sub_filters)

    # All per-field filters must match.
    search = search.filter(F('bool', must=filters))

    # Restricting returned fields.
    fields = []

    # We keep track of the requested columns in order to make sure we
    # return those column names and not aliases for example.
    self.request_columns = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue

            self.request_columns.append(value)
            field_name = self.get_field_name(value, full=False)
            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product then descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            field_name = self.get_field_name(value)

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    # NOTE(review): results_from, results_number and facets_size are only
    # bound if the matching meta parameters were seen in the loop above;
    # presumably get_parameters() always supplies them -- confirm.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    # A falsy facets size skips aggregations entirely.
    if facets_size:
        self._create_aggregations(
            params,
            search,
            facets_size,
            histogram_intervals
        )

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    # One entry is recorded per index dropped by the retry loop below.
    errors = []

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            # The total comes from a separate count() call.
            total = search.count()

            aggregations = getattr(results, 'aggregations', {})
            if aggregations:
                aggregations = self.format_aggregations(aggregations)

            shards = getattr(results, '_shards', {})

            break  # Yay! Results!
        except NotFoundError, e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            errors.append({
                'type': 'missing_index',
                'index': missing_index,
            })

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                shards = None
                break
        except RequestError as exception:
            # Try to handle it gracefully if we can find out what
            # input was bad and caused the exception.
            try:
                bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                    exception.error
                )[-1]
                # Loop over the original parameters to try to figure
                # out which *key* had the bad input.
                for key, value in kwargs.items():
                    if value == bad_input:
                        raise BadArgumentError(key)
            except IndexError:
                # Not an ElasticsearchParseException exception
                pass
            # No offending argument identified: re-raise the RequestError.
            raise
def get(self, **kwargs):
    """Return a list of results and aggregations based on parameters.

    The list of accepted parameters (with types and default values) is in
    the database and can be accessed with the super_search_fields service.

    Exact-value, range and null conditions are added as filters; wildcard
    operators and white-space phrases are added as queries. Returned
    fields, sorting, pagination and facets follow, and the search is then
    executed, dropping indices that Elasticsearch reports as missing.

    :raises NotImplementedError: when a parameter's operator produces
        neither a filter nor a query
    :raises BadArgumentError: when a column, sort or facet field is
        unknown, or a column is not allowed to be returned
    :returns: ``{'query': ..., 'indices': ...}`` when ``_return_query`` is
        set, without contacting Elasticsearch.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source, and no ``return`` is visible after the
    execution loop -- the function looks truncated; confirm against the
    full file.
    """
    # Filter parameters and raise potential errors.
    params = self.get_parameters(**kwargs)

    # Find the indices to use to optimize the elasticsearch query.
    indices = self.get_indices(params['date'])

    # Create and configure the search object.
    search = Search(
        using=self.get_connection(),
        index=indices,
        doc_type=self.config.elasticsearch.elasticsearch_doctype,
    )

    # Create filters.
    filters = None

    for field, sub_params in params.items():
        # Filters built for the same field are combined into sub_filters.
        sub_filters = None
        for param in sub_params:
            if param.name.startswith('_'):
                if param.name == '_results_offset':
                    results_from = param.value[0]
                elif param.name == '_results_number':
                    results_number = param.value[0]
                # Don't use meta parameters in the query.
                continue

            field_data = self.all_fields[param.name]

            name = '%s.%s' % (
                field_data['namespace'],
                field_data['in_database_name']
            )

            # Normalize values: dates become strings, enum values and
            # operator-less strings are lower-cased.
            if param.data_type in ('date', 'datetime'):
                param.value = datetimeutil.date_to_string(param.value)
            elif param.data_type == 'enum':
                param.value = [x.lower() for x in param.value]
            elif param.data_type == 'str' and not param.operator:
                param.value = [x.lower() for x in param.value]

            args = {}
            filter_type = 'term'
            filter_value = None
            if not param.operator:
                # contains one of the terms
                if len(param.value) == 1:
                    val = param.value[0]
                    if not isinstance(val, basestring) or (
                        isinstance(val, basestring) and ' ' not in val
                    ):
                        filter_value = val

                    # If the term contains white spaces, we want to perform
                    # a phrase query. Thus we do nothing here and let this
                    # value be handled later.
                else:
                    filter_type = 'terms'
                    filter_value = param.value
            elif param.operator == '=':
                # is exactly
                if field_data['has_full_version']:
                    name = '%s.full' % name
                filter_value = param.value
            elif param.operator == '>':
                # greater than
                filter_type = 'range'
                filter_value = {
                    'gt': param.value
                }
            elif param.operator == '<':
                # lower than
                filter_type = 'range'
                filter_value = {
                    'lt': param.value
                }
            elif param.operator == '>=':
                # greater than or equal to
                filter_type = 'range'
                filter_value = {
                    'gte': param.value
                }
            elif param.operator == '<=':
                # lower than or equal to
                filter_type = 'range'
                filter_value = {
                    'lte': param.value
                }
            elif param.operator == '__null__':
                # is null
                filter_type = 'missing'
                args['field'] = name

            if filter_value is not None:
                args[name] = filter_value

            if args:
                if param.operator_not:
                    new_filter = ~F(filter_type, **args)
                else:
                    new_filter = F(filter_type, **args)

                # Enum values on the same field are ORed together; every
                # other data type is ANDed.
                if sub_filters is None:
                    sub_filters = new_filter
                elif param.data_type == 'enum':
                    sub_filters |= new_filter
                else:
                    sub_filters &= new_filter

                continue

            # These use a wildcard and thus need to be in a query
            # instead of a filter.
            operator_wildcards = {
                '~': '*%s*',  # contains
                '$': '%s*',  # starts with
                '^': '*%s'  # ends with
            }
            if param.operator in operator_wildcards:
                if field_data['has_full_version']:
                    name = '%s.full' % name

                query_type = 'wildcard'
                args[name] = (
                    operator_wildcards[param.operator] % param.value
                )
            elif not param.operator:
                # This is a phrase that was passed down.
                query_type = 'simple_query_string'
                args['query'] = param.value[0]
                args['fields'] = [name]
                args['default_operator'] = 'and'

            if args:
                query = Q(query_type, **args)
                if param.operator_not:
                    query = ~query
                search = search.query(query)
            else:
                # If we reach this point, that means the operator is
                # not supported, and we should raise an error about that.
                raise NotImplementedError(
                    'Operator %s is not supported' % param.operator
                )

        # Filters from different fields are ANDed together.
        if filters is None:
            filters = sub_filters
        elif sub_filters is not None:
            filters &= sub_filters

    # NOTE(review): filters is still None here when no parameter produced
    # a filter -- confirm search.filter() accepts that.
    search = search.filter(filters)

    # Restricting returned fields.
    fields = []
    for param in params['_columns']:
        for value in param.value:
            if not value:
                continue

            try:
                field_ = self.all_fields[value]
            except KeyError:
                # That is not a known field, we can't restrict on it.
                raise BadArgumentError(
                    value,
                    msg='Unknown field "%s", cannot return it' % value
                )

            if not field_['is_returned']:
                # Returning this field is not allowed.
                raise BadArgumentError(
                    value,
                    msg='Field "%s" is not allowed to be returned' % value
                )

            field_name = '%s.%s' % (
                field_['namespace'],
                field_['in_database_name']
            )

            fields.append(field_name)

    search = search.fields(fields)

    # Sorting.
    sort_fields = []
    for param in params['_sort']:
        for value in param.value:
            if not value:
                continue

            # Values starting with a '-' are sorted in descending order.
            # In order to retrieve the database name of the field, we
            # must first remove the '-' part and add it back later.
            # Example: given ['product', '-version'], the results will be
            # sorted by ascending product and descending version.
            desc = False
            if value.startswith('-'):
                desc = True
                value = value[1:]

            try:
                field_ = self.all_fields[value]
            except KeyError:
                # That is not a known field, we can't sort on it.
                raise BadArgumentError(
                    value,
                    msg='Unknown field "%s", cannot sort on it' % value
                )

            field_name = '%s.%s' % (
                field_['namespace'],
                field_['in_database_name']
            )

            if desc:
                # The underlying library understands that '-' means
                # sorting in descending order.
                field_name = '-' + field_name

            sort_fields.append(field_name)

    search = search.sort(*sort_fields)

    # Pagination.
    # NOTE(review): results_from and results_number are only bound if the
    # matching meta parameters were seen in the loop above; presumably
    # get_parameters() always supplies them -- confirm.
    results_to = results_from + results_number
    search = search[results_from:results_to]

    # Create facets.
    for param in params['_facets']:
        for value in param.value:
            try:
                field_ = self.all_fields[value]
            except KeyError:
                # That is not a known field, we can't facet on it.
                raise BadArgumentError(
                    value,
                    msg='Unknown field "%s", cannot facet on it' % value
                )

            field_name = '%s.%s' % (
                field_['namespace'],
                field_['in_database_name']
            )

            if field_['has_full_version']:
                # If the param has a full version, that means what matters
                # is the full string, and not its individual terms.
                field_name += '.full'

            search.aggs.bucket(
                value,
                'terms',
                field=field_name,
                size=self.config.facets_max_number
            )

    # Query and compute results.
    hits = []

    if params['_return_query'][0].value[0]:
        # Return only the JSON query that would be sent to elasticsearch.
        return {
            'query': search.to_dict(),
            'indices': indices,
        }

    # We call elasticsearch with a computed list of indices, based on
    # the date range. However, if that list contains indices that do not
    # exist in elasticsearch, an error will be raised. We thus want to
    # remove all failing indices until we either have a valid list, or
    # an empty list in which case we return no result.
    while True:
        try:
            results = search.execute()
            for hit in results:
                hits.append(self.format_fields(hit.to_dict()))

            # The total comes from a separate count() call.
            total = search.count()
            aggregations = self.format_aggregations(results.aggregations)
            break  # Yay! Results!
        except NotFoundError, e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            if missing_index in indices:
                del indices[indices.index(missing_index)]
            else:
                # Wait what? An error caused by an index that was not
                # in the request? That should never happen, but in case
                # it does, better know it.
                raise

            if indices:
                # Update the list of indices and try again.
                # Note: we need to first empty the list of indices before
                # updating it, otherwise the removed indices never get
                # actually removed.
                search = search.index().index(*indices)
            else:
                # There is no index left in the list, return an empty
                # result.
                hits = []
                total = 0
                aggregations = {}
                break
class TopologyData(object):
    """Base class for models backed by Elasticsearch entries, not db tables.

    Subclasses are expected to override ``_DOC_TYPE`` and ``_INDEX_PREFIX``;
    both are used to configure the search object and to pick the index that
    new records are posted to.
    """

    _DOC_TYPE = ""
    _INDEX_PREFIX = ""

    def __init__(self):
        """Open a connection and prepare a search bound to this model."""
        self.conn = es_conn()
        self.search = Search(self.conn)

        # Using the private setters over methods simplifies mocking for
        # unit tests.
        # pylint: disable=W0212
        self.search._doc_type = self._DOC_TYPE
        self.search._index = es_indices(self._INDEX_PREFIX, self.conn)

    @classmethod
    def _sort_arg(cls, key, order):
        """Return key as, key or -key, depending on the sort order.

        :arg key: name of the field to sort on
        :arg order: one of "+", "asc", "-" or "desc"
        :return: ``key`` for ascending order, ``"-" + key`` for descending
        :raises ValueError: if ``order`` is not one of the accepted values
        """
        if order in ("+", "asc"):
            return key              # translates to [{key: {'order': 'asc'}}]
        if order in ("-", "desc"):
            return "-" + key        # translates to [{key: {'order': 'desc'}}]
        raise ValueError("Valid order values are in [+, -, asc, desc]")

    def get(self, count=1, sort_key="@timestamp", sort_order="desc"):
        """Return the first ``count`` hits of the sorted search.

        :arg count: maximum number of records to return
        :arg sort_key: field to sort on
        :arg sort_order: "+"/"asc" or "-"/"desc"
        :return: the response of executing the search
        :raises ValueError: if ``sort_order`` is not a valid order
        :raises ElasticsearchException: if the query fails; it is logged and
            re-raised unchanged
        """
        from elasticsearch import ElasticsearchException

        try:
            # Search.sort() returns a modified copy instead of changing the
            # search in place, so the result has to be kept; otherwise the
            # requested ordering is silently dropped.
            self.search = self.search.sort(
                self._sort_arg(sort_key, sort_order))
            self.search = self.search[0:count]

            logger.debug("[get] search = %s", self.search.to_dict())
            # pylint: disable=W0212
            logger.debug("[get] index = %s", self.search._index)
            logger.debug("[get] doc_type = %s", self._DOC_TYPE)

            return self.search.execute()

        except ElasticsearchException as exc:
            # Log the exception itself: ``exc.message`` does not exist on
            # Python 3 and is empty for multi-argument exceptions.
            logger.debug("get from ES failed, exception was %s", exc)
            raise

        except ValueError as exc:
            logger.exception(exc)
            raise

    def post(self, body, **_):
        """Post a record to the database.

        :arg body: record body as JSON object
        :arg _: Unused.
        :return: id of the inserted record

        """

        logger.debug("post called with body = %s", json.dumps(body))

        response = self.conn.create(
            daily_index(self._INDEX_PREFIX),
            self._DOC_TYPE,
            body,
            refresh=True)

        logger.debug('[post] response = %s', json.dumps(response))
        return response['_id']
elif args.cmd == "missingparameter": s = query_missingparam(s, args.parameter, args.method, args.responsecode, args.invert) querytype = QUERY_SEARCH elif args.cmd == "headervalues": s = query_headervals(s, args.header) querytype = QUERY_VALUES elif args.cmd == "search": s = query(s, " ".join(args.query)) querytype = QUERY_SEARCH else: argparser.print_help() sys.exit(1) if querytype == QUERY_SEARCH: if args.fields: print_debug(s.to_dict()) r = s.scan() else: add_default_aggregation(s) print_debug(s.to_dict()) r = s.execute() elif querytype == QUERY_VALUES: print_debug(s.to_dict()) r = s.execute() if querytype == QUERY_SEARCH: if not r: print("No matches!") sys.exit(0) if args.fields: for d in r:
class Elastic(LogProvider):
    """Log provider that reads and writes events in an Elasticsearch index.

    The query being built is kept in ``self.search``; the filter methods
    replace it with a narrowed copy, and the read methods execute it.
    """

    def __init__(self, config_file='config.cfg'):
        """Read the ``elastic`` section of `config_file` and connect.

        Missing options fall back to: use_ssl=True, host=127.0.0.1,
        version=2, index=nxapi, doc_type=events.

        :param str config_file: path of the configuration file to read
        """
        super(Elastic, self).__init__()

        self.percentage = 10.0
        self.minimum_occurrences = 250

        # The ConfigParser documentation points out that there's no way to force defaults config option
        # outside the "DEFAULT" section.
        config = ConfigParser()
        config.read(config_file)
        if not config.has_section('elastic'):
            config.add_section('elastic')

        defaults = {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2', 'index': 'nxapi',
                    'doc_type': 'events'}
        for option, value in defaults.items():
            if not config.has_option('elastic', option):
                config.set('elastic', option, value)

        self.version = config.getint('elastic', 'version')
        self.index = config.get('elastic', 'index')
        use_ssl = config.getboolean('elastic', 'use_ssl')
        host = config.get('elastic', 'host')
        self.doc_type = config.get('elastic', 'doc_type')
        self.client = connections.create_connection(
            hosts=[host],
            use_ssl=use_ssl,
            index=self.index,
            version=self.version,
            doc_type=self.doc_type,
            timeout=30,
            retry_on_timeout=True,
        )

        Event.init(index=self.index)
        index = Index(self.index, using=self.client)
        index.doc_type(Event)
        self.initialize_search()

    def initialize_search(self):
        """Start a fresh, unfiltered search on the configured index."""
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def export_search(self):
        """Return the current search object."""
        return self.search

    def import_search(self, search):
        """Replace the current search object with `search`."""
        self.search = search

    def get_filters(self):
        """Return the current search as a dict."""
        return self.search.to_dict()

    def add_filters(self, filters, regexp=False, negative=False):
        """
        Add `filters` to the query.
         `filters is a dict of the form {'field': value, field2: value2}, but you can also use a list of values
         instead of a `str`. They'll be added as a _or_ (and not a _and_).

        :param dict filters: field name -> value, or list/set of values
        :param bool regexp: accepted for interface compatibility; not used by this implementation
        :param bool negative: when true, exclude the matching documents instead of requiring them
        :return: None; `self.search` is replaced by the narrowed search
        """
        # We need to use multi_match, since we get the fields names dynamically.
        for key, value in filters.items():
            if isinstance(value, set):
                value = list(value)

            # There is no need to process empty values.
            if not value:
                continue

            if isinstance(value, list):
                # Any of the listed values may match (OR), and the whole
                # alternative is then required or excluded.
                any_of = reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])
                if negative:
                    clause = Q('bool', must_not=[any_of])
                else:
                    clause = Q('bool', must=[any_of])
            else:
                clause = Q("multi_match", query=value, fields=[key])
                if negative:
                    clause = ~clause

            self.search = self.search.query(clause)

    def get_top(self, field, size=250):
        """
        Get the top values for the given `field`

        :param str field: the field to filter on
        :param int size: how many top values to return, top
        :return dict of int: A structure of the form {value: number_of_hits, value2: numer_of_hits2}
        """
        # NOTE(review): `size` is not forwarded to the terms aggregation, so
        # the number of buckets is whatever the server defaults to.
        saved_search = self.search
        ret = dict()

        if field in ['uri', 'vers', 'comments', 'server']:
            field = ''.join((field, '.raw'))

        try:
            if VERSION < (5, 0, 0):
                self.search = self.search.params(search_type='count', default_operator='AND')
            else:
                self.search = self.search.params(search_type='query_then_fetch')
            # This documented at https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
            # search_type='count' has been deprecated in ES 2.0
            self.search.aggs.bucket('TEST', 'terms', field=field)
            for hit in self.search.execute(ignore_cache=True).aggregations['TEST']['buckets']:
                ret[hit['key']] = hit['doc_count']
        finally:
            # Always put the caller's search back, even if the query failed.
            self.search = saved_search

        return ret

    def get_relevant_ids(self, fields, percentage=0, minimum_occurrences=0):
        """ This function is supposed to return the id that are reparteed/present on the `fields`.

         :param list of str fields: fields whose distinct values are counted for every id
         :param float percentage: threshold in percent; falls back to `self.percentage` when falsy
         :param float minimum_occurrences: threshold on distinct values; falls back to
            `self.minimum_occurrences` when falsy
         :return set of int: the ids that were kept
         """
        # The fallback used to be stored in a misspelled local, so the
        # instance default was never applied.
        minimum_occurrences = minimum_occurrences or self.minimum_occurrences
        percentage = percentage or self.percentage

        ret = set()
        ids = set(i['id'] for i in self.search.execute())  # get all possible ID

        for _id in ids:
            saved_search = self.search
            try:
                self.add_filters({'id': _id})

                # Get how many different fields there are for a given `id`
                data = collections.defaultdict(set)
                fields_counter = collections.defaultdict(int)
                for res in self.search.execute():
                    for field in fields:
                        if res[field] not in data[field]:
                            fields_counter[field] += 1.0
                        data[field].add(res[field])

                # Ignore id that are present on less than 10% of different values of each fields
                # NOTE(review): the first branch logs "Discarding" but only
                # skips to the next field; kept as in the original.
                for field, content in data.items():
                    if len(content) < minimum_occurrences:
                        logging.debug('Discarding id \033[32m%s\033[0m only present %d times.', _id, len(content))
                        continue
                    _percentage = len(content) / fields_counter[field] * 100.0
                    if _percentage > percentage:
                        continue
                    logging.debug('Discarding id \033[32m%s\033[0m present in %d%% of different values of the \033[32m%s\033[0m field', _id, _percentage, field)
                    break
                else:
                    ret.add(_id)
            finally:
                # Drop the per-id filter again, even if the query failed.
                self.search = saved_search

        return ret

    def reset_filters(self):
        """Drop every filter by starting a fresh search."""
        self.initialize_search()

    def get_results(self):
        """
        Return a `Result` object obtained from the execution of the search `self.search`.

        :return Result: The `Result` object obtained from the execution of the search `self.search`.
        """
        return self.search.scan()

    def commit(self):
        """Process list of dict (yes) and push them to DB """
        self.total_objs += len(self.nlist)
        count = 0

        def gen_events(events):
            # The bulk API expects an action line followed by the document.
            # Target the configured index, like Event(_index=...) below.
            for d in events:
                yield {'index': {'_index': self.index, '_type': 'events'}}
                yield d.to_dict()

        events = list()
        for entry in self.nlist:
            event = Event(_index=self.index)
            for key, value in entry.items():
                setattr(event, key, value)

            event.whitelisted = False
            event.comments = "import on"+str(datetime.datetime.now())
            events.append(event)
            count += 1

        try:
            self.client.bulk(gen_events(events))
            ## ToDo parse the bulk response to selectively loop over events to events.save() whatever happens
        except TransportError:
            logging.warning("We encountered an error trying to continue.")
            # Fall back to saving the events one by one.
            for event in events:
                event.save(using=self.client)
                ## ToDo find a way to change the hardcoded 'events' for ES doctype
                ## elasticsearch_dsl Issue 689

        self.total_commits += count
        logging.debug("Written %s events", self.total_commits)
        del self.nlist[:]
def _build_query(self):
    """Turn the recorded steps into an Elasticsearch request body.

    Walks ``self.steps`` in order, combining queries and filters and
    collecting sort keys, source fields and aggregations. Also stores the
    collected source fields and the list/dict result flags on the instance.

    Returns the request body as a dict. Raises NotImplementedError for an
    unknown step.
    """
    combined = Q()
    fields = ['id']
    ordering = []
    aggs = {}
    raw_query = None
    list_mode = dict_mode = False

    for step, payload in self.steps:
        if step == 'query':
            combined &= self._process_queries(payload)
        elif step == 'filter':
            combined &= self._process_filters(payload)
        elif step == 'order_by':
            # A leading '-' asks for descending order on that key.
            ordering.extend(
                {name[1:]: 'desc'} if name.startswith('-') else name
                for name in payload)
        elif step == 'source':
            fields.extend(payload)
        elif step == 'values':
            fields.extend(payload)
            list_mode, dict_mode = True, False
        elif step == 'values_dict':
            if payload:
                fields.extend(payload)
            list_mode, dict_mode = False, True
        elif step == 'aggregate':
            aggs.update(payload)
        elif step == 'filter_query_string':
            raw_query = payload
        else:
            raise NotImplementedError(step)

    # A raw query string gets the boosts and filters implemented by
    # `search.filters:SearchQueryFilter`, so relevance scoring is defined
    # in a single place.
    from olympia.search.filters import SearchQueryFilter

    search = Search().query(combined)
    if raw_query:
        search = SearchQueryFilter().apply_search_query(raw_query, search)
    if ordering:
        search = search.sort(*ordering)
    if fields:
        search = search.source(fields)

    body = search.to_dict()

    # Paging and aggregations are written into the body by hand for now,
    # which keeps the partial port to elasticsearch-dsl simple.
    if self.start:
        body['from'] = self.start
    if self.stop is not None:
        body['size'] = self.stop - self.start
    if aggs:
        body['aggs'] = aggs

    self.source = fields
    self.as_list = list_mode
    self.as_dict = dict_mode
    return body
# -*- encoding: utf-8 -*-
import json

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

client = Elasticsearch()

# Build a match query on the `nick` field of "test-index".
s = Search(using=client, index="test-index").query("match", nick=u"压力")

# Send the same query body through the low-level client, asking for
# 3 hits starting at offset 3.
testresult = client.search(index='test-index', body=s.to_dict(), size=3, from_=3)
# Single-argument print() calls give the same output on Python 2 and 3;
# the former print statements were a syntax error on Python 3.
print('============= %s' % (testresult,))

# Run the query through elasticsearch_dsl as well, then show the query body.
response = s.execute()
print(s.to_dict())
__author__ = "Monica Fernandez"

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

client = Elasticsearch()

# Build the search one clause at a time: each query() call returns a new
# search with the extra term clause added.
s = Search()
s = s.query("term", title="Example")
s = s.query("term", author="fracma")

# Show the request body that the search would send.
print(s.to_dict())