def test_scan_iterates_through_all_docs(data_client):
    s = Search(index='git').filter('term', _type='commits')

    commits = list(s.scan())

    assert 52 == len(commits)
    assert set(d['_id'] for d in DATA if d['_type'] == 'commits') == set(c.meta.id for c in commits)

def GetAuditData(self, case, child_id, data_type, start=None, length=None, str_query=None, sort=None, order=None):
    q = ['w32registryraw', 'filedownloadhistory', 'urlhistory', 'timeline',
         'w32apifiles', 'w32rawfiles', 'w32eventlogs']

    if data_type in q:
        query = search_queries.GetGeneratorQuery(data_type, str_query, case, child_id, start, length, sort, order)
    else:
        s = Search()
        s = s[0:1000]
        t = Q('query_string', default_field="ComputerName.raw", query=child_id) & \
            Q('query_string', default_field="CaseInfo.case_name", query=case)
        query = s.query(t).filter('term', AuditType__Generator=data_type)

    try:
        r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search',
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret

    data = []
    try:
        for x in r.json()['hits']['hits']:
            data.append(x)
    except KeyError:
        return data

    return data

def BuildRootTree(self):
    s = Search()
    t = Q('query_string', query="*")
    aggs_casenum = A('terms', field="CaseInfo.case_name", size=0)
    s.aggs.bucket('casenum', aggs_casenum)
    query = s.query(t)

    try:
        r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search',
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret

    data = [{
        "id": "current_inv", "parent": "#", "text": "Current Investigations", "type": "root"
    }, {
        "id": "comp_inv", "parent": "#", "text": "Completed Investigations", "type": "root"
    }]

    for x in r.json()['aggregations']['casenum']['buckets']:
        data.append({
            "id": x['key'],
            "parent": "current_inv",
            "text": x['key'],
            "children": True,
            "type": "case"
        })

    return data

def get_files_in_path(self, dir_path):
    ''' gets all es file names from es in a given path '''
    dir_hash = FileResource.get_hash(dir_path)
    # Equivalent raw body: {"query": {"term": {"file_dir_hash": dir_hash}}}
    s = Search(index=self.index, doc_type=self.type).query("term", file_dir_hash=dir_hash)
    response = s.execute()
    files = []
    for hit in response:
        files.append(hit.file_uri)
    return files

def BuildAuditAggs(self, child_id, parent_id):
    s = Search()
    s = s[0]
    t = Q('query_string', default_field="CaseInfo.case_name", query=parent_id) & Q('match', ComputerName=child_id)
    aggs_generator = A('terms', field='AuditType.Generator', size=0)
    s.aggs.bucket('datatypes', aggs_generator)
    query = s.query(t)

    try:
        r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search',
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret

    data = []
    exclude = ['w32processes-memory', 'stateagentinspector', 'w32disks']

    for y in r.json()['aggregations']['datatypes']['buckets']:
        if not y['key'] in exclude:
            data.append({
                "id": y['key'],
                "parent": child_id,
                "text": y['key'],
                "type": "audit",
                "a_attr": {"href": "#" + y['key'] + '/' + parent_id + "/" + child_id}
            })

    return data

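# A minimal sketch of running the same kind of terms aggregation natively through
# elasticsearch-dsl instead of POSTing query.to_dict() with requests. The host, index
# name, and agg size below are placeholders, not taken from the snippet above.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, A

client = Elasticsearch(hosts=["localhost:9200"])
s = Search(using=client, index="hx_audits")[0:0]            # size 0: only the buckets matter
s.aggs.bucket('datatypes', A('terms', field='AuditType.Generator', size=100))

response = s.execute()
for bucket in response.aggregations.datatypes.buckets:      # same data as r.json()['aggregations']...
    print(bucket.key, bucket.doc_count)
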
def get(self, request, *args, **kwargs):
    q = request.GET.get('q')

    # Make search.
    queries = [
        query.Q('match', slug=self._phrase(q)),  # Slug.
        query.Q('match', type=self._phrase(q)),  # Type.
        query.Q('match', search_names=self._phrase(q)),  # Name.
        query.Q('prefix', carrier=q),  # Shelf carrier.
        query.Q('term', region=q)  # Shelf region.
    ]
    sq = query.Bool(should=queries)

    # Search.
    res = {'apps': [], 'brands': [], 'collections': [], 'shelves': []}
    es = Search(using=FeedItemIndexer.get_es(),
                index=self.get_feed_element_index())
    feed_elements = es.query(sq).execute().hits
    if not feed_elements:
        return response.Response(res, status=status.HTTP_404_NOT_FOUND)

    # Deserialize.
    ctx = {'app_map': self.get_apps(request, self.get_app_ids_all(feed_elements)),
           'request': request}
    for feed_element in feed_elements:
        item_type = feed_element.item_type
        serializer = self.SERIALIZERS[item_type]
        data = serializer(feed_element, context=ctx).data
        res[self.PLURAL_TYPES[item_type]].append(data)

    # Return.
    return response.Response(res, status=status.HTTP_200_OK)

def GetAuditDataMain(self, data):
    s = Search()
    s = s[0:1000]
    s = s.highlight('*')
    s = s.highlight_options(require_field_match=False)
    t = Q('query_string', query=data) & \
        ~Q('query_string', default_field="AuditType.Generator", query="stateagentinspector") & \
        ~Q('query_string', default_field="AuditType.Generator", query="w32processes-tree")
    query = s.query(t)

    try:
        r = requests.post(self.es_host + ":" + self.es_port + self.index + self.type_audit_type + '/_search',
                          data=json.dumps(query.to_dict()),
                          auth=(self.elastic_user, self.elastic_pass),
                          verify=False)
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret

    data = []
    try:
        for x in r.json()['hits']['hits']:
            # .items() instead of the Python 2-only .iteritems()
            for y, v in x['highlight'].items():
                data.append({
                    "doc_id": x['_id'],
                    "endpoint": x['_parent'],
                    "audittype": x['_source']['AuditType']['Generator'],
                    "field": y,
                    "response": v
                })
    except KeyError:
        pass

    return data

def search(self):
    self.reindex(Addon)
    qs = Search(using=amo.search.get_es(),
                index=AddonIndexer.get_index_alias(),
                doc_type=AddonIndexer.get_doctype_name())
    return qs.filter('term', id=self.addon.pk).execute()[0]

def query(self, client):
    """Method that actually queries elasticsearch"""
    # Set up our search parameters
    voq = self.config.get("query", "{}_voname".format(self.vo.lower()))
    productioncheck = '*Role=Production*'

    start_date = self.datesplit_pattern.split(self.start_time)
    starttimeq = datetime(*[int(elt) for elt in start_date]).isoformat()

    end_date = self.datesplit_pattern.split(self.end_time)
    endtimeq = datetime(*[int(elt) for elt in end_date]).isoformat()

    # Generate the index pattern based on the start and end dates
    indexpattern = indexpattern_generate(start_date, end_date)

    if self.verbose:
        print(indexpattern, file=sys.stdout)
        sleep(3)

    # Elasticsearch query
    resultset = Search(using=client, index=indexpattern) \
        .query("wildcard", VOName=productioncheck) \
        .filter(Q({"term": {"VOName": voq}})) \
        .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
        .filter(Q({"term": {"ResourceType": "Payload"}}))

    if self.verbose:
        print(resultset.to_dict())

    return resultset

def _filter(self, req=None, data=None):
    req = req or RequestFactory().get('/', data=data or {})
    queryset = Search()
    for filter_class in self.filter_classes:
        queryset = filter_class().filter_queryset(req, queryset, self.view_class)
    return queryset.to_dict()

def searchTweets(keyword, latlondist):
    # Variables that contain the user credentials to access the Twitter API
    if TwitterHelper.AWS_ACCESS_KEY == None:
        raise KeyError("Please set the AWS_ACCESS_KEY env. variable")
    if TwitterHelper.AWS_SECRET_KEY == None:
        raise KeyError("Please set the AWS_SECRET_KEY env. variable")

    s = Search()
    if latlondist != None:
        locJson = json.loads(latlondist)
        s = s.query({"filtered": {"query": {"match_all": {}},
                                  "filter": {"geo_distance": {"distance": locJson['dist'],
                                                              "location": {"lat": locJson['lat'],
                                                                           "lon": locJson['lon']}}}}})
    if keyword != None:
        q = Q("match_phrase", text=keyword)
        s = s.query(q)

    scanResp = helpers.scan(client=TwitterHelper.ES, query=s.to_dict(),
                            scroll="1m", index="tweets", timeout="1m")

    arr = []
    for resp in scanResp:
        hit = resp['_source']
        d = {}
        d['name'] = hit['name']
        d['text'] = hit['text']
        d['sentiment'] = hit['sentiment']
        d['lat'] = hit['location']['lat']
        d['lon'] = hit['location']['lon']
        arr.append(d)

    allD = {}
    allD['tweets'] = arr
    mapInput = json.dumps(allD)
    return mapInput

def test_scan_iterates_through_all_docs(data_client):
    s = Search(index='flat-git')

    commits = list(s.scan())

    assert 52 == len(commits)
    assert set(d['_id'] for d in FLAT_DATA) == set(c.meta.id for c in commits)

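# A small sketch contrasting execute() with scan(): execute() returns a single page of
# hits (10 by default), while scan() uses the scroll API to stream every matching
# document. The client and index name below are assumptions, not part of the tests above.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch()
s = Search(using=client, index='flat-git').query('match_all')

page = s.execute()             # first page only; response.hits.total has the full count
everything = list(s.scan())    # scroll-based generator over all matching hits, unsorted

print(len(page.hits), len(everything))
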
def handle(self, *args, **options):
    min_id = FailureLine.objects.order_by('id').values_list("id", flat=True)[0] - 1
    chunk_size = options['chunk_size']

    if options["recreate"]:
        connection.indices.delete(TestFailureLine._doc_type.index, ignore=404)
        TestFailureLine.init()
    else:
        if connection.indices.exists(TestFailureLine._doc_type.index):
            self.stderr.write("Index already exists; can't perform import")
            return

    while True:
        rows = (FailureLine.objects
                .filter(id__gt=min_id)
                .order_by('id')
                .values("id", "job_guid", "action", "test", "subtest",
                        "status", "expected", "message",
                        "best_classification_id", "best_is_verified"))[:chunk_size]
        if not rows:
            break
        es_lines = []
        for item in rows:
            es_line = failure_line_from_value(item)
            if es_line:
                es_lines.append(es_line)
        self.stdout.write("Inserting %i rows" % len(es_lines))
        bulk_insert(es_lines)
        min_id = rows[len(rows) - 1]["id"]
        time.sleep(options['sleep'])

    s = Search(doc_type=TestFailureLine).params(search_type="count")
    self.stdout.write("Index contains %i documents" % s.execute().hits.total)

def authors(self, num_columns=0):
    """
    @param num_columns: int If non-zero, break up list into columns
    """
    s = Search(
        using=docstore._get_connection(settings.DOCSTORE_HOSTS),
        index=settings.DOCSTORE_INDEX,
        doc_type='authors'
    ).fields([
        'url_title', 'title', 'title_sort', 'lastmod'
    ])[0:docstore.MAX_SIZE]
    response = s.execute()

    authors = []
    for hit in response:
        url_title = hit.url_title[0]
        title = hit.title[0]
        title_sort = hit.title_sort[0]
        lastmod = hit.lastmod[0]
        if title and title_sort:
            author = Author()
            author.url_title = url_title
            author.title = title
            author.title_sort = title_sort
            author.lastmod = datetime.strptime(lastmod, mediawiki.TS_FORMAT)
            authors.append(author)
    authors = sorted(authors, key=lambda a: a.title_sort)

    if num_columns:
        return _columnizer(authors, num_columns)
    return authors

def test_inner_hits_are_wrapped_in_response(data_client):
    s = Search(index='git')[0:1].query('has_parent', parent_type='repo', inner_hits={}, query=Q('match_all'))
    response = s.execute()

    commit = response.hits[0]
    assert isinstance(commit.meta.inner_hits.repo, response.__class__)
    assert repr(commit.meta.inner_hits.repo[0]).startswith("<Hit(git/doc/elasticsearch-dsl-py): ")

def from_es_id(cls, es, es_id, access_token, instance, version=None):
    index_exists = es.indices.exists(index=cls.ES_INDEX)
    type_exists = es.indices.exists_type(index=cls.ES_INDEX, doc_type=cls.ES_TYPE)
    if not all([index_exists, type_exists]):
        raise Exception('Elastic index or type does not exist. '
                        'Cannot find {c} in Elasticsearch '
                        'to create an instance'.format(c=cls.__name__))

    find_instance = Search(using=es, index=cls.ES_INDEX) \
        .query(Q("match", _id=es_id))
    r = find_instance.execute()
    if not r:
        raise Exception('Cannot find elasticsearch {t} '
                        'instance from elasticsearch '
                        'id:{id}'.format(t=cls.__name__, id=es_id))

    sf_id = r[0]._d_.pop('Id', None)
    if sf_id is None:
        raise Exception('Missing a valid SF Id in '
                        'Elasticsearch document id:{i}'.format(i=sf_id))

    sf_data = r[0]._d_
    return cls(es=es, sf_id=sf_id, sf_data=sf_data,
               access_token=access_token, instance=instance)

def categories(self):
    s = Search(
        using=docstore._get_connection(settings.DOCSTORE_HOSTS),
        index=settings.DOCSTORE_INDEX,
        doc_type='articles'
    ).fields([
        'title', 'title_sort', 'categories',
    ])[0:docstore.MAX_SIZE]
    if not settings.MEDIAWIKI_SHOW_UNPUBLISHED:
        s = s.query('match', published=True)
    response = s.execute()

    pages = []
    for hit in response:
        page = Page()
        page.url_title = hit.title[0]
        page.title = hit.title[0]
        page.title_sort = hit.title_sort[0]
        page.categories = hit.get('categories', [])
        pages.append(page)
    articles = sorted(pages, key=lambda page: page.title_sort)

    categories = {}
    for page in articles:
        for category in page.categories:
            # exclude internal editorial categories
            if category not in settings.MEDIAWIKI_HIDDEN_CATEGORIES:
                if category not in categories.keys():
                    categories[category] = []
                # pages already sorted so category lists will be sorted
                if page not in categories[category]:
                    categories[category].append(page)
    return categories

def index_single(es, network, channel, date, lines):
    # Delete existing
    delete_existing = Search(
        using=es,
        index='moffle',
    ).query(
        "term", network=network,
    ).query(
        "term", channel=channel,
    ).query(
        "term", date=date,
    )
    es.delete_by_query(
        index='moffle',
        body=delete_existing.to_dict(),
    )

    actions = [x for x in (line_to_index_action(network, channel, date, i, line) for i, line in lines) if x]

    # Keep the retry counter outside the loop so failed attempts accumulate
    retries = 0
    while actions:
        try:
            success_count, _ = bulk(es, actions)
            log("{}/{}/{}: indexed {} lines".format(network, channel, date, success_count))
            return success_count
        except Exception as e:
            retries += 1
            log("{}/{}/{}: Attempt {}/3: {}".format(network, channel, date, retries, e))
            if retries > 3:
                raise

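# Recent elasticsearch-dsl releases expose delete() directly on Search, wrapping the
# delete-by-query API; a rough sketch of the deletion above (host and values assumed):
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()
(Search(using=es, index='moffle')
    .query('term', network='freenode')
    .query('term', channel='#python')
    .query('term', date='2016-01-01')
    .delete())
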
def exists(self):
    find_instance = Search(using=self.es, index=self.index) \
        .query(Q("match", Id=self.sf_id))
    response = find_instance.execute()
    return response

def BuildRootTree(self):
    s = Search()
    t = Q('has_parent', type='hostname', query=Q('query_string', query="*"))
    aggs = A('terms', field='AuditType.Generator', size=16)
    s.aggs.bucket('datatypes', aggs)
    query = s.query(t)

    try:
        r = requests.post(self.es_host + self.es_port + self.index + self.type_audit_type + '/_search',
                          data=json.dumps(query.to_dict()))
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret

    data = [{
        "id": "stackable", "parent": "#", "text": "Stackable Data"
    }]

    i = ['w32services', 'w32tasks', 'w32scripting-persistence', 'w32prefetch',
         'w32network-dns', 'urlhistory']

    for x in r.json()['aggregations']['datatypes']['buckets']:
        if x['key'] not in i:
            pass
        else:
            data.append({
                "id": x['key'],
                "parent": "stackable",
                "text": x['key'],
                "children": True
            })

    return data

def es_read(self, log_id, offset):
    """
    Returns the logs matching log_id in Elasticsearch and next offset.
    Returns '' if no log is found or there was an error.

    :param log_id: the log_id of the log to read.
    :type log_id: str
    :param offset: the offset start to read log from.
    :type offset: str
    """
    # Offset is the unique key for sorting logs given log_id.
    s = Search(using=self.client) \
        .query('match', log_id=log_id) \
        .sort('offset')

    s = s.filter('range', offset={'gt': offset})

    logs = []
    if s.count() != 0:
        try:
            logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                .execute()
        except Exception as e:
            msg = 'Could not read log with log_id: {}, ' \
                  'error: {}'.format(log_id, str(e))
            self.log.exception(msg)
    return logs

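# A minimal sketch of the read pattern above: filter on a sort key, order by it, and
# slice the Search to page through results. Client, index, field names and the log_id
# value are placeholders, not taken from the method above.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch()
PAGE, PAGE_SIZE, last_offset = 0, 100, 0

s = (Search(using=client, index='task-logs')
     .query('match', log_id='example-dag-task-1')
     .filter('range', offset={'gt': last_offset})
     .sort('offset'))

if s.count():
    hits = s[PAGE * PAGE_SIZE:(PAGE + 1) * PAGE_SIZE].execute()
    for hit in hits:
        print(hit.offset)
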
def search():
    q = request.args.get('q')
    #resp = es.search(index='hoe', doc_type='record', q=q, body=aggs)
    #logging.info(q)
    s = Search(using=es, index='hoe', doc_type='record')
    s.aggs.bucket('library_place', 'terms', field='library-place')
    s.aggs.bucket('type', 'terms', field='type')
    s.aggs.bucket('genre', 'terms', field='genre')
    s.aggs.bucket('keywords', 'terms', field='keywords.label')
    s.aggs.bucket('author', 'terms', field='author.literal')
    s.query = Q('multi_match', query=q, fields=['_all'])
    filters = []
    if 'filter' in request.args:
        filters = request.args.getlist('filter')
        logging.info(filters)
        for filter in filters:
            cat, val = filter.split(':')
            cat = cat.replace('_', '-')
            filter_dict = {}
            filter_dict.setdefault(cat, val)
            logging.info(cat)
            s.filter = F('term', **filter_dict)
    #if request.args
    resp = s.execute()
    #logging.info(resp)
    #logging.info(resp.aggregations.per_category.buckets)
    return render_template('resultlist.html',
                           records=resp.to_dict().get('hits'),
                           facets=resp.aggregations.to_dict(),
                           header=q, query=q, filters=filters)

def pages():
    """Returns list of published light Page objects.

    @returns: list
    """
    KEY = 'encyc-front:pages'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='articles').filter('term', published_encyc=True)[0:MAX_SIZE]
        s = s.sort('title_sort')
        s = s.fields([
            'url_title', 'title', 'title_sort', 'published', 'modified', 'categories',
        ])
        response = s.execute()
        data = [
            Page(
                url_title=hitvalue(hit, 'url_title'),
                title=hitvalue(hit, 'title'),
                title_sort=hitvalue(hit, 'title_sort'),
                published=hitvalue(hit, 'published'),
                modified=hitvalue(hit, 'modified'),
                categories=hit.get('categories', []),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    return data

def get_highlights():
    wiki_field = 'wiki_content'
    qb_field = 'qb_content'
    text = request.form['text']
    s = Search(index='qb')[0:10].query(
        'multi_match', query=text, fields=[wiki_field, qb_field])
    s = s.highlight(wiki_field).highlight(qb_field)
    results = list(s.execute())

    if len(results) == 0:
        highlights = {'wiki': [''], 'qb': [''], 'guess': ''}
    else:
        guess = results[0]  # take the best answer
        _highlights = guess.meta.highlight

        try:
            wiki_content = list(_highlights.wiki_content)
        except AttributeError:
            wiki_content = ['']

        try:
            qb_content = list(_highlights.qb_content)
        except AttributeError:
            qb_content = ['']

        highlights = {'wiki': wiki_content, 'qb': qb_content, 'guess': guess.page}
    return jsonify(highlights)

def authors(num_columns=None):
    """Returns list of published light Author objects.

    @returns: list
    """
    KEY = 'encyc-front:authors'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='authors')[0:MAX_SIZE]
        s = s.sort('title_sort')
        s = s.fields([
            'url_title', 'title', 'title_sort', 'published', 'modified',
        ])
        response = s.execute()
        data = [
            Author(
                url_title=hitvalue(hit, 'url_title'),
                title=hitvalue(hit, 'title'),
                title_sort=hitvalue(hit, 'title_sort'),
                published=hitvalue(hit, 'published'),
                modified=hitvalue(hit, 'modified'),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    if num_columns:
        return _columnizer(data, num_columns)
    return data

def get_journals_by_collection_institution(collection_acronym, page_from=0, page_size=1000):
    search = Search(index=INDEX).query(
        "nested", path="collections",
        query=Q("match", collections__acronym=collection_acronym))

    search = search.filter("exists", field="sponsors")

    search = search[page_from:page_size]

    search_response = search.execute()

    meta = {
        'total': search_response.hits.total,
    }

    sponsors = {}
    for journal in search_response:
        j = {'jid': journal.jid,
             'title': journal.title,
             'current_status': journal.current_status,
             'last_issue': journal.last_issue,
             'issue_count': journal.issue_count}

        for sponsor in journal['sponsors']:
            sponsors.setdefault(sponsor, []).append(j)

    result = {
        'meta': meta,
        'objects': sponsors
    }

    return result

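# Sketch of what the nested query above expands to; the index and acronym value are
# placeholders, and to_dict() is a handy way to check the generated body. Note that the
# double underscore in collections__acronym becomes a dot in the field name.
from elasticsearch_dsl import Search, Q

s = Search(index='journals').query(
    'nested', path='collections',
    query=Q('match', collections__acronym='scl'))
# {'query': {'nested': {'path': 'collections',
#                       'query': {'match': {'collections.acronym': 'scl'}}}}}
print(s.to_dict())
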
def search(self, text: str, max_n_guesses: int,
           normalize_score_by_length=False,
           wiki_boost=1, qb_boost=1):
    if not self.exists():
        raise ValueError('The index does not exist, you must create it before searching')

    if wiki_boost != 1:
        wiki_field = 'wiki_content^{}'.format(wiki_boost)
    else:
        wiki_field = 'wiki_content'

    if qb_boost != 1:
        qb_field = 'qb_content^{}'.format(qb_boost)
    else:
        qb_field = 'qb_content'

    s = Search(index=self.name)[0:max_n_guesses].query(
        'multi_match', query=text, fields=[wiki_field, qb_field]
    )
    results = s.execute()
    guess_set = set()
    guesses = []
    if normalize_score_by_length:
        query_length = len(text.split())
    else:
        query_length = 1

    for r in results:
        if r.page in guess_set:
            continue
        else:
            guesses.append((r.page, r.meta.score / query_length))
            guess_set.add(r.page)
    return guesses

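# The 'field^boost' syntax used above is standard multi_match field boosting; a minimal,
# self-contained sketch (index name, query text and boost are illustrative only):
from elasticsearch_dsl import Search

s = Search(index='qb')[0:10].query(
    'multi_match', query='Who wrote Hamlet?',
    fields=['wiki_content^2', 'qb_content'])   # wiki matches count double in scoring
print(s.to_dict())
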
def sources():
    """Returns list of published light Source objects.

    @returns: list
    """
    KEY = 'encyc-front:sources'
    TIMEOUT = 60*5
    data = cache.get(KEY)
    if not data:
        s = Search(doc_type='sources')[0:MAX_SIZE]
        s = s.sort('encyclopedia_id')
        s = s.fields([
            'encyclopedia_id', 'published', 'modified', 'headword', 'media_format', 'img_path',
        ])
        response = s.execute()
        data = [
            Source(
                encyclopedia_id=hitvalue(hit, 'encyclopedia_id'),
                published=hitvalue(hit, 'published'),
                modified=hitvalue(hit, 'modified'),
                headword=hitvalue(hit, 'headword'),
                media_format=hitvalue(hit, 'media_format'),
                img_path=hitvalue(hit, 'img_path'),
            )
            for hit in response
            if hitvalue(hit, 'published')
        ]
        cache.set(KEY, data, TIMEOUT)
    return data

def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z",
    #  "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}
    start_message = 'scenario.p2p_connect.starting.clients.sequentially'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool', should=[F('term', message=start_message), F('term', message=stop_message)])
    s = s.fields(['message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # desc, we want the latest events
    response = s.execute()

    events = []  # youngest to oldest, last should be a stop message
    for h in response:
        msg = 'start' if h['message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'

    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))

def build_query(self, start_date, end_date, **kwargs):
    """Build the elasticsearch query."""
    agg_query = Search(using=self.client,
                       index=self.index,
                       doc_type=self.doc_type)[0:0]

    if start_date is not None or end_date is not None:
        time_range = {}
        if start_date is not None:
            time_range['gte'] = start_date.isoformat()
        if end_date is not None:
            time_range['lte'] = end_date.isoformat()
        agg_query = agg_query.filter(
            'range',
            **{self.time_field: time_range})

    term_agg = agg_query.aggs
    for term in self.aggregated_fields:
        term_agg = term_agg.bucket(term, 'terms', field=term, size=0)
    term_agg.metric('total', 'sum', field='count')

    if self.copy_fields:
        term_agg.metric(
            'top_hit', 'top_hits', size=1, sort={'timestamp': 'desc'}
        )

    for query_param, filtered_field in self.required_filters.items():
        if query_param in kwargs:
            agg_query = agg_query.filter(
                'term', **{filtered_field: kwargs[query_param]}
            )

    return agg_query

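# A condensed sketch of the kind of query this method builds: size 0, an optional range
# filter on the time field, and a terms bucket with a sum metric. Index and field names
# here are examples, not the class's actual configuration.
from datetime import datetime
from elasticsearch_dsl import Search

agg_query = Search(index='stats-file-download')[0:0].filter(
    'range', timestamp={'gte': datetime(2020, 1, 1).isoformat()})
agg_query.aggs.bucket('file_key', 'terms', field='file_key', size=100) \
    .metric('total', 'sum', field='count')
print(agg_query.to_dict())
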
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections

BASE_URL = "https://hackzurich-api.migros.ch/hack/recipe/recipes_de/_search"

connections.create_connection(hosts=[BASE_URL], http_auth=('hackzurich2020', 'uhSyJ08KexKn4ZFS'))

s = Search().query("match", title="Suppe")

response = s.execute()
for hit in response:
    print(hit.meta.score, hit.title)

def test_no_querystring(self):
    es = Search()
    query = queries.get_es_query(es, "", "", [])
    assert query == es.query()

from elasticsearch import Elasticsearch from elasticsearch_dsl import Search client = Elasticsearch() s = Search(using=client, index="my-index") \ .filter("term", category="search") \ .query("match", title="python") \ .exclude("match", description="beta") s.aggs.bucket('per_tag', 'terms', field='tags') \ .metric('max_lines', 'max', field='lines') response = s.execute() for hit in response: print(hit.meta.score, hit.title) for tag in response.aggregations.per_tag.buckets: print(tag.key, tag.max_lines.value)
                    required=True,
                    help='Index to search')
parser.add_argument('--query', default=None, nargs=argparse.REMAINDER,
                    help='Lucene query')

args = parser.parse_args()
index = args.index

if args.query:
    query = ' '.join(args.query)
    try:
        client = Elasticsearch()
        s = Search(using=client, index=index)
        q = Q('query_string', query=query)
        s = s.query(q)
        response = s[0:10].execute()
        for r in response:
            print('DATE= %s URL=%s' % (r['date'], r['url']))
            print('AUTHOR= %s' % r['author'])
            print('TITLE= %s' % r['title'])
            print('KEYWORDS= %s' % r['keywords'])
            print('----------------------------------------')
        print('%d Documents' % response.hits.total)
    except NotFoundError:
        print('Index %s does not exist' % index)

def core_search(query):
    es = Elasticsearch(['http://elasticsearch613:9200/'])
    q = Q("multi_match", query=query, fields=['title', 'text', 'cat', 'tags'])
    ss = Search(using=es, index='blog').query(q)
    return ss

def search(cls): return Search(using=get_es(), index=get_index_name(), doc_type=cls.get_doctype())
def search(author):
    s = Search().filter('term', author=author)
    response = s.execute()
    return response

def test_get_es_search(self, es_data_client):
    view = self.create_view(es_data_client)
    expected = Search(using=es_data_client, index='test', doc_type=DataDocType)
    assert view.get_es_search().to_dict() == expected.to_dict()

import pandas as pd
import time
from datetime import datetime
from subprocess import call

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

start = time.clock()

raw_index = "mordred_raw"  # raw index name
enriched_index = "mordred_enriched"  # enriched index name
repo_url = "https://github.com/chaoss/grimoirelab-mordred.git"  # github repository url

es = Elasticsearch('http://localhost:9200', verify_certs=False)

call("p2o.py --enrich --index " + raw_index + " --index-enrich " + enriched_index +
     " -e http://localhost:9200 --no_inc --debug git " + repo_url, shell=True)

s = Search(using=es, index=enriched_index)
s.aggs.bucket('by_authors', 'terms', field='author_name', size=10000).metric(
    'first_commit', 'min', field='author_date'
)  # aggregate on the basis of author name and find the oldest commit date for each of them
s = s.sort("author_date")
result = s.execute()

buckets_result = result['aggregations']['by_authors']['buckets']
buckets = []

for bucket in buckets_result:
    first_commit = bucket['first_commit']['value'] / 1000
    buckets.append({
        'first_commit': datetime.utcfromtimestamp(first_commit),
        'author': bucket['key'],
        'commit_count': bucket['doc_count']
    })

def __init__(self, data=None, files=None, auto_id='id_%s', prefix=None, initial=None, error_class=ErrorList, label_suffix=None, empty_permitted=False, field_order=None, use_required_attribute=None, renderer=None, user=None, has_combo=False): super().__init__(data, files, auto_id, prefix, initial, error_class, label_suffix, empty_permitted, field_order, use_required_attribute, renderer) # Get topic_modellings s = Search(using=ES_CLIENT, index=ES_INDEX_TOPIC_MODELLING).filter('term', is_ready=True) \ .source(['name', 'algorithm', 'number_of_topics', 'number_of_documents', 'source', 'datetime_from', 'datetime_to' # 'perplexity', 'purity', 'contrast', 'coherence', # 'tau_smooth_sparse_theta', 'tau_smooth_sparse_phi', # 'tau_decorrelator_phi', 'tau_coherence_phi', ])[:500] group = None if not user.is_superuser: group = get_user_group(user) topic_modellings = s.execute() topic_modellings = sorted(topic_modellings, key=lambda x: x.number_of_documents, reverse=True) topic_modellings = (( tm.name.lower(), f"{tm.name.replace('bigartm', 'tm')} - {tm.number_of_topics} топиков - {tm.number_of_documents} текстов - " + (f"{tm.source} - " if hasattr(tm, 'source') and tm.source else f"Все СМИ ") + (f"С {tm.datetime_from[:10]} - " if hasattr(tm, 'datetime_from') and tm.datetime_from else f"") + (f"По {tm.datetime_to[:10]} - " if hasattr(tm, 'datetime_to') and tm.datetime_to else f"")) for tm in topic_modellings if user.is_superuser or (group and tm.name.lower( ) in group.topic_modelling_names.split(","))) if has_combo: combo_indices = ES_CLIENT.indices.get_alias( f"{ES_INDEX_TOPIC_COMBOS}_*").keys() tms_with_combo = [ ind.replace(f"{ES_INDEX_TOPIC_COMBOS}_", "").lower() for ind in combo_indices ] topic_modellings = filter(lambda x: x[0] in tms_with_combo, topic_modellings) self.fields['topic_weight_threshold'].required = False self.fields['topic_modelling'].choices = topic_modellings # Get topic_weight_thresholds self.fields[ 'topic_weight_threshold'].choices = get_topic_weight_threshold_options( user.is_superuser or hasattr(user, "expert"))
def get_data_dsl(self):
    # The `using` parameter specifies the Elasticsearch client instance, and `index` narrows the
    # search scope. `index` accepts a list of indices, e.g. index=["bank", "banner", "country"],
    # and also wildcard patterns such as index=["b*"], which matches every index starting with
    # "b". A concrete doc_type can likewise be specified on the Search object.
    s = Search(using=self.es, index=self.index_name)
    res = s.query("match", serialNo="368400630043389952").query(
        "match", is_result="1").highlight("is_result").execute()
    print(type(res))

def build_brand_es(self, args, category_search_condition):
    keyword_es = Search() \
        .query(category_search_condition)
    keyword_es = keyword_es.sort(*self.sort_condition(args))
    keyword_es = self.add_page_limit_to_brand_es(args, keyword_es)
    return keyword_es

def test_upercase_and_lowercase_search_give_same_results(self): """Pretty self-explanatory function name, isn't it ?""" if not self.manager.connected_to_es: return # 1. Index lowercase stuffs text_lc = 'test' topic_1_lc = TopicFactory(forum=self.forum, author=self.user, title=text_lc) tag_lc = TagFactory(title=text_lc) topic_1_lc.tags.add(tag_lc) topic_1_lc.subtitle = text_lc topic_1_lc.save() post_1_lc = PostFactory(topic=topic_1_lc, author=self.user, position=1) post_1_lc.text = post_1_lc.text_html = text_lc post_1_lc.save() tuto_lc = PublishableContentFactory(type='TUTORIAL') tuto_draft_lc = tuto_lc.load_version() tuto_lc.title = text_lc tuto_lc.authors.add(self.user) subcategory_lc = SubCategoryFactory(title=text_lc) tuto_lc.subcategory.add(subcategory_lc) tuto_lc.tags.add(tag_lc) tuto_lc.save() tuto_draft_lc.description = text_lc tuto_draft_lc.repo_update_top_container(text_lc, tuto_lc.slug, text_lc, text_lc) chapter1_lc = ContainerFactory(parent=tuto_draft_lc, db_object=tuto_lc) extract_lc = ExtractFactory(container=chapter1_lc, db_object=tuto_lc) extract_lc.repo_update(text_lc, text_lc) published_lc = publish_content(tuto_lc, tuto_draft_lc, is_major_update=True) tuto_lc.sha_public = tuto_draft_lc.current_version tuto_lc.sha_draft = tuto_draft_lc.current_version tuto_lc.public_version = published_lc tuto_lc.save() # 2. Index uppercase stuffs text_uc = 'TEST' topic_1_uc = TopicFactory(forum=self.forum, author=self.user, title=text_uc) topic_1_uc.tags.add( tag_lc) # Note: a constraint forces tags title to be unique topic_1_uc.subtitle = text_uc topic_1_uc.save() post_1_uc = PostFactory(topic=topic_1_uc, author=self.user, position=1) post_1_uc.text = post_1_uc.text_html = text_uc post_1_uc.save() tuto_uc = PublishableContentFactory(type='TUTORIAL') tuto_draft_uc = tuto_uc.load_version() tuto_uc.title = text_uc tuto_uc.authors.add(self.user) tuto_uc.subcategory.add(subcategory_lc) tuto_uc.tags.add(tag_lc) tuto_uc.save() tuto_draft_uc.description = text_uc tuto_draft_uc.repo_update_top_container(text_uc, tuto_uc.slug, text_uc, text_uc) chapter1_uc = ContainerFactory(parent=tuto_draft_uc, db_object=tuto_uc) extract_uc = ExtractFactory(container=chapter1_uc, db_object=tuto_uc) extract_uc.repo_update(text_uc, text_uc) published_uc = publish_content(tuto_uc, tuto_draft_uc, is_major_update=True) tuto_uc.sha_public = tuto_draft_uc.current_version tuto_uc.sha_draft = tuto_draft_uc.current_version tuto_uc.public_version = published_uc tuto_uc.save() # 3. Index and search: self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 0) # index for model in self.indexable: if model is FakeChapter: continue self.manager.es_bulk_indexing_of_model(model) self.manager.refresh_index() result = self.client.get(reverse('search:query') + '?q=' + text_lc, follow=False) self.assertEqual(result.status_code, 200) response_lc = result.context['object_list'].execute() self.assertEqual(response_lc.hits.total, 8) result = self.client.get(reverse('search:query') + '?q=' + text_uc, follow=False) self.assertEqual(result.status_code, 200) response_uc = result.context['object_list'].execute() self.assertEqual(response_uc.hits.total, 8) for responses in zip( response_lc, response_uc): # we should get results in the same order! self.assertEqual(responses[0].meta.id, responses[1].meta.id)
def get_queryset(self): return Search(using=amo.search.get_es(), index=AddonIndexer.get_index_alias(), doc_type=AddonIndexer.get_doctype_name())
def test_change_topic_impacts_posts(self): if not self.manager.connected_to_es: return # 1. Create a hidden forum belonging to a hidden group and add staff in it. text = 'test' group = Group.objects.create(name='Les illuminatis anonymes de ZdS') _, hidden_forum = create_category_and_forum(group) self.staff.groups.add(group) self.staff.save() # 2. Create a normal topic and index it topic_1 = TopicFactory(forum=self.forum, author=self.user, title=text) post_1 = PostFactory(topic=topic_1, author=self.user, position=1) post_1.text = post_1.text_html = text post_1.save() self.manager.es_bulk_indexing_of_model(Topic) self.manager.es_bulk_indexing_of_model(Post) self.manager.refresh_index() self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 2) # indexing ok result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 1) # ok self.assertEqual(response[0].meta.doc_type, Post.get_es_document_type()) self.assertEqual(response[0].forum_pk, self.forum.pk) self.assertEqual(response[0].topic_pk, topic_1.pk) self.assertEqual(response[0].topic_title, topic_1.title) # 3. Change topic title and reindex topic_1.title = 'new title' topic_1.save() self.manager.es_bulk_indexing_of_model(Topic) self.manager.es_bulk_indexing_of_model(Post) self.manager.refresh_index() result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 1) # ok self.assertEqual(response[0].topic_title, topic_1.title) # title was changed # 4. connect with staff and move topic self.assertTrue( self.client.login(username=self.staff.username, password='******')) data = {'move': '', 'forum': hidden_forum.pk, 'topic': topic_1.pk} response = self.client.post(reverse('topic-edit'), data, follow=False) self.assertEqual(302, response.status_code) self.manager.es_bulk_indexing_of_model(Topic) self.manager.es_bulk_indexing_of_model(Post) self.manager.refresh_index() result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual( response.hits.total, 1) # Note: without staff, would not get any results (see below) self.assertEqual(response[0].forum_pk, hidden_forum.pk) # post was updated with new forum # 5. Topic is now hidden self.client.logout() result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 0) # ok
""" Configuration is global so no client needs to be passed around. """ from elasticsearch_dsl import connections """ Default connection used where no other connection specified. Any configuration methods just pass all parameters to the underlying elasticsearch-py client. """ connections.create_connection(hosts=["localhost"]) """ Optionally specify an alias for the connection in case of multiple connections. """ connections.create_connection("prod", hosts=["localhost"]) s = Search(using="prod") s.count() """ You can always just pass in your own client instance """ s = Search(using=Elasticsearch()) s.count() """ Any method on Search returns a clone so you need to always assign it back to the same variable. """ s = Search() s = s.params(q="fix")
def test_boosts(self): """Check if boosts are doing their job""" if not self.manager.connected_to_es: return # 1. Create topics (with identical titles), posts (with identical texts), an article and a tuto text = 'test' topic_1_solved_sticky = TopicFactory(forum=self.forum, author=self.user) topic_1_solved_sticky.title = text topic_1_solved_sticky.subtitle = '' topic_1_solved_sticky.solved_by = self.user topic_1_solved_sticky.is_sticky = True topic_1_solved_sticky.save() post_1 = PostFactory(topic=topic_1_solved_sticky, author=self.user, position=1) post_1.text = post_1.text_html = text post_1.save() post_2_useful = PostFactory(topic=topic_1_solved_sticky, author=self.user, position=2) post_2_useful.text = post_2_useful.text_html = text post_2_useful.is_useful = True post_2_useful.like = 5 post_2_useful.dislike = 2 # l/d ratio above 1 post_2_useful.save() topic_2_locked = TopicFactory(forum=self.forum, author=self.user, title=text) topic_2_locked.title = text topic_2_locked.subtitle = '' topic_2_locked.is_locked = True topic_2_locked.save() post_3_ld_below_1 = PostFactory(topic=topic_2_locked, author=self.user, position=1) post_3_ld_below_1.text = post_3_ld_below_1.text_html = text post_3_ld_below_1.like = 2 post_3_ld_below_1.dislike = 5 # l/d ratio below 1 post_3_ld_below_1.save() tuto = PublishableContentFactory(type='TUTORIAL') tuto_draft = tuto.load_version() tuto.title = text tuto.authors.add(self.user) tuto.save() tuto_draft.repo_update_top_container(text, tuto.slug, text, text) chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto) chapter1.repo_update(text, 'Who cares ?', 'Same here') ExtractFactory(container=chapter1, db_object=tuto) published_tuto = publish_content(tuto, tuto_draft, is_major_update=True) tuto.sha_public = tuto_draft.current_version tuto.sha_draft = tuto_draft.current_version tuto.public_version = published_tuto tuto.save() article = PublishedContentFactory(type='ARTICLE', title=text) published_article = PublishedContent.objects.get(content_pk=article.pk) opinion_not_picked = PublishedContentFactory(type='OPINION', title=text) published_opinion_not_picked = PublishedContent.objects.get( content_pk=opinion_not_picked.pk) opinion_picked = PublishedContentFactory(type='OPINION', title=text) opinion_picked.sha_picked = opinion_picked.sha_draft opinion_picked.date_picked = datetime.datetime.now() opinion_picked.save() published_opinion_picked = PublishedContent.objects.get( content_pk=opinion_picked.pk) for model in self.indexable: if model is FakeChapter: continue self.manager.es_bulk_indexing_of_model(model) self.manager.refresh_index() self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 10) # 2. Reset all boosts to 1 for doc_type in settings.ZDS_APP['search']['boosts']: for key in settings.ZDS_APP['search']['boosts'][doc_type]: settings.ZDS_APP['search']['boosts'][doc_type][key] = 1.0 # 3. 
Test posts result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 3) # score are equals without boost: self.assertTrue(response[0].meta.score == response[1].meta.score == response[2].meta.score) settings.ZDS_APP['search']['boosts']['post']['if_first'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 3) self.assertTrue(response[0].meta.score == response[1].meta.score > response[2].meta.score) self.assertEqual(response[2].meta.id, str( post_2_useful.pk)) # post 2 is the only one not first settings.ZDS_APP['search']['boosts']['post']['if_first'] = 1.0 settings.ZDS_APP['search']['boosts']['post']['if_useful'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 3) self.assertTrue(response[0].meta.score > response[1].meta.score == response[2].meta.score) self.assertEqual(response[0].meta.id, str(post_2_useful.pk)) # post 2 is useful settings.ZDS_APP['search']['boosts']['post']['if_useful'] = 1.0 settings.ZDS_APP['search']['boosts']['post']['ld_ratio_above_1'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 3) self.assertTrue(response[0].meta.score > response[1].meta.score == response[2].meta.score) self.assertEqual(response[0].meta.id, str( post_2_useful.pk)) # post 2 have a l/d ratio of 5/2 settings.ZDS_APP['search']['boosts']['post']['ld_ratio_above_1'] = 1.0 settings.ZDS_APP['search']['boosts']['post'][ 'ld_ratio_below_1'] = 2.0 # no one would do that in real life result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Post.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 3) self.assertTrue(response[0].meta.score > response[1].meta.score == response[2].meta.score) self.assertEqual(response[0].meta.id, str( post_3_ld_below_1.pk)) # post 3 have a l/d ratio of 2/5 settings.ZDS_APP['search']['boosts']['post']['ld_ratio_below_1'] = 1.0 # 4. 
Test topics result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Topic.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) # score are equals without boost: self.assertTrue(response[0].meta.score == response[1].meta.score) settings.ZDS_APP['search']['boosts']['topic']['if_sticky'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Topic.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) self.assertTrue(response[0].meta.score > response[1].meta.score) self.assertEqual(response[0].meta.id, str(topic_1_solved_sticky.pk)) # topic 1 is sticky settings.ZDS_APP['search']['boosts']['topic']['if_sticky'] = 1.0 settings.ZDS_APP['search']['boosts']['topic']['if_solved'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Topic.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) self.assertTrue(response[0].meta.score > response[1].meta.score) self.assertEqual(response[0].meta.id, str(topic_1_solved_sticky.pk)) # topic 1 is solved settings.ZDS_APP['search']['boosts']['topic']['if_solved'] = 1.0 settings.ZDS_APP['search']['boosts']['topic'][ 'if_locked'] = 2.0 # no one would do that in real life result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + Topic.get_es_document_type(), follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) self.assertTrue(response[0].meta.score > response[1].meta.score) self.assertEqual(response[0].meta.id, str(topic_2_locked.pk)) # topic 2 is locked settings.ZDS_APP['search']['boosts']['topic'][ 'if_locked'] = 1.0 # no one would do that in real life # 5. Test published contents result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 5) # score are equals without boost: self.assertTrue( response[0].meta.score == response[1].meta.score == response[2]. 
meta.score == response[3].meta.score == response[4].meta.score) settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_article'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 5) self.assertTrue(response[0].meta.score > response[1].meta.score) self.assertEqual(response[0].meta.id, str(published_article.pk)) # obvious settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_article'] = 1.0 settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_tutorial'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 5) self.assertTrue(response[0].meta.score > response[1].meta.score) self.assertEqual(response[0].meta.id, str(published_tuto.pk)) # obvious settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_tutorial'] = 1.0 settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_opinion'] = 2.0 settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_opinion_not_picked'] = 4.0 # Note: in "real life", unpicked opinion would get a boost < 1. result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 5) self.assertTrue(response[0].meta.score > response[1].meta.score > response[2].meta.score) self.assertEqual( response[0].meta.id, str(published_opinion_not_picked.pk)) # unpicked opinion got first self.assertEqual(response[1].meta.id, str(published_opinion_picked.pk)) settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_opinion'] = 1.0 settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_opinion_not_picked'] = 1.0 settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_medium_or_big_tutorial'] = 2.0 result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 5) self.assertTrue(response[0].meta.score > response[1].meta.score) self.assertEqual(response[0].meta.id, str(published_tuto.pk)) # obvious settings.ZDS_APP['search']['boosts']['publishedcontent'][ 'if_medium_or_big_tutorial'] = 1.0 # 6. Test global boosts # NOTE: score are NOT the same for all documents, no matter how hard it tries to, small differences exists for model in self.indexable: # set a huge number to overcome the small differences: settings.ZDS_APP['search']['boosts'][ model.get_es_document_type()]['global'] = 10.0 result = self.client.get(reverse('search:query') + '?q=' + text, follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 10) self.assertEqual(response[0].meta.doc_type, model.get_es_document_type()) # obvious settings.ZDS_APP['search']['boosts'][ model.get_es_document_type()]['global'] = 1.0
def test_change_publishedcontents_impacts_chapter(self): if not self.manager.connected_to_es: return # 1. Create middle-size content and index it text = 'test' tuto = PublishableContentFactory(type='TUTORIAL') tuto_draft = tuto.load_version() tuto.title = text tuto.authors.add(self.user) tuto.save() tuto_draft.repo_update_top_container( text, tuto.slug, text, text) # change title to be sure it will match chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto) chapter1.repo_update(text, text, text) extract = ExtractFactory(container=chapter1, db_object=tuto) extract.repo_update(text, text) published = publish_content(tuto, tuto_draft, is_major_update=True) tuto.sha_public = tuto_draft.current_version tuto.sha_draft = tuto_draft.current_version tuto.public_version = published tuto.save() self.manager.es_bulk_indexing_of_model(PublishedContent) self.manager.refresh_index() self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 2) # indexing ok result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) chapters = [r for r in response if r.meta.doc_type == 'chapter'] self.assertEqual(chapters[0].meta.doc_type, FakeChapter.get_es_document_type()) self.assertEqual(chapters[0].meta.id, published.content_public_slug + '__' + chapter1.slug) # 2. Change tuto: delete chapter and insert new one ! tuto = PublishableContent.objects.get(pk=tuto.pk) tuto_draft = tuto.load_version() tuto_draft.children[0].repo_delete() # chapter 1 is gone ! another_text = 'another thing' self.assertTrue( text not in another_text ) # to prevent a future modification from breaking this test chapter2 = ContainerFactory(parent=tuto_draft, db_object=tuto) chapter2.repo_update(another_text, another_text, another_text) extract2 = ExtractFactory(container=chapter2, db_object=tuto) extract2.repo_update(another_text, another_text) published = publish_content(tuto, tuto_draft, is_major_update=False) tuto.sha_public = tuto_draft.current_version tuto.sha_draft = tuto_draft.current_version tuto.public_version = published tuto.save() self.manager.es_bulk_indexing_of_model(PublishedContent) self.manager.refresh_index() self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 2) # 2 objects, not 3 ! result = self.client.get(reverse('search:query') + '?q=' + text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() contents = [r for r in response if r.meta.doc_type != 'chapter'] self.assertEqual(response.hits.total, len(contents)) # no chapter found anymore result = self.client.get(reverse('search:query') + '?q=' + another_text + '&models=content', follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() chapters = [r for r in response if r.meta.doc_type == 'chapter'] self.assertEqual(response.hits.total, 1) self.assertEqual(chapters[0].meta.doc_type, FakeChapter.get_es_document_type()) self.assertEqual(chapters[0].meta.id, published.content_public_slug + '__' + chapter2.slug) # got new chapter
def test_category_and_subcategory_impact_search(self): """If two contents do not belong to the same (sub)category""" if not self.manager.connected_to_es: return text = 'Did you ever hear the tragedy of Darth Plagueis The Wise?' # 1. Create two contents with different subcategories category_1 = 'category 1' subcategory_1 = SubCategoryFactory(title=category_1) category_2 = 'category 2' subcategory_2 = SubCategoryFactory(title=category_2) tuto_1 = PublishableContentFactory(type='TUTORIAL') tuto_1_draft = tuto_1.load_version() tuto_1.title = text tuto_1.authors.add(self.user) tuto_1.subcategory.add(subcategory_1) tuto_1.save() tuto_1_draft.description = text tuto_1_draft.repo_update_top_container(text, tuto_1.slug, text, text) chapter_1 = ContainerFactory(parent=tuto_1_draft, db_object=tuto_1) extract_1 = ExtractFactory(container=chapter_1, db_object=tuto_1) extract_1.repo_update(text, text) published_1 = publish_content(tuto_1, tuto_1_draft, is_major_update=True) tuto_1.sha_public = tuto_1_draft.current_version tuto_1.sha_draft = tuto_1_draft.current_version tuto_1.public_version = published_1 tuto_1.save() tuto_2 = PublishableContentFactory(type='TUTORIAL') tuto_2_draft = tuto_2.load_version() tuto_2.title = text tuto_2.authors.add(self.user) tuto_2.subcategory.add(subcategory_2) tuto_2.save() tuto_2_draft.description = text tuto_2_draft.repo_update_top_container(text, tuto_2.slug, text, text) chapter_2 = ContainerFactory(parent=tuto_2_draft, db_object=tuto_2) extract_2 = ExtractFactory(container=chapter_2, db_object=tuto_2) extract_2.repo_update(text, text) published_2 = publish_content(tuto_2, tuto_2_draft, is_major_update=True) tuto_2.sha_public = tuto_2_draft.current_version tuto_2.sha_draft = tuto_2_draft.current_version tuto_2.public_version = published_2 tuto_2.save() # 2. Index: self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 0) # index for model in self.indexable: if model is FakeChapter: continue self.manager.es_bulk_indexing_of_model(model) self.manager.refresh_index() result = self.client.get(reverse('search:query') + '?q=' + text, follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 4) # Ok # 3. Test result = self.client.get(reverse('search:query') + '?q=' + text + '&model=content&subcategory=' + subcategory_1.slug, follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) self.assertEqual([ int(r.meta.id) for r in response if r.meta.doc_type == 'publishedcontent' ][0], published_1.pk) self.assertEqual([ r.meta.id for r in response if r.meta.doc_type == 'chapter' ][0], tuto_1.slug + '__' + chapter_1.slug) result = self.client.get(reverse('search:query') + '?q=' + text + '&model=content&subcategory=' + subcategory_2.slug, follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 2) self.assertEqual([ int(r.meta.id) for r in response if r.meta.doc_type == 'publishedcontent' ][0], published_2.pk) self.assertEqual([ r.meta.id for r in response if r.meta.doc_type == 'chapter' ][0], tuto_2.slug + '__' + chapter_2.slug)
def test_basic_search(self): """Basic search and filtering""" if not self.manager.connected_to_es: return # 1. Index and test search: text = 'test' topic_1 = TopicFactory(forum=self.forum, author=self.user, title=text) post_1 = PostFactory(topic=topic_1, author=self.user, position=1) post_1.text = post_1.text_html = text post_1.save() # create a middle-size content and publish it tuto = PublishableContentFactory(type='TUTORIAL') tuto_draft = tuto.load_version() tuto.title = text tuto.authors.add(self.user) tuto.save() tuto_draft.repo_update_top_container( text, tuto.slug, text, text) # change title to be sure it will match chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto) extract = ExtractFactory(container=chapter1, db_object=tuto) extract.repo_update(text, text) published = publish_content(tuto, tuto_draft, is_major_update=True) tuto.sha_public = tuto_draft.current_version tuto.sha_draft = tuto_draft.current_version tuto.public_version = published tuto.save() # nothing has been indexed yet: self.assertEqual( len( self.manager.setup_search(Search().query( MatchAll())).execute()), 0) # index for model in self.indexable: if model is FakeChapter: continue self.manager.es_bulk_indexing_of_model(model) self.manager.refresh_index() result = self.client.get(reverse('search:query') + '?q=' + text, follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, 4) # get 4 results # 2. Test filtering: topic_1 = Topic.objects.get(pk=topic_1.pk) post_1 = Post.objects.get(pk=post_1.pk) published = PublishedContent.objects.get(pk=published.pk) ids = { 'topic': [topic_1.es_id], 'post': [post_1.es_id], 'content': [ published.es_id, published.content_public_slug + '__' + chapter1.slug ], } search_groups = [ k for k, v in settings.ZDS_APP['search']['search_groups'].items() ] group_to_model = { k: v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items() } for doc_type in search_groups: result = self.client.get(reverse('search:query') + '?q=' + text + '&models=' + doc_type, follow=False) self.assertEqual(result.status_code, 200) response = result.context['object_list'].execute() self.assertEqual(response.hits.total, len(ids[doc_type])) # get 1 result of each … for i, r in enumerate(response): self.assertIn( r.meta.doc_type, group_to_model[doc_type]) # … and only of the right type … self.assertEqual(r.meta.id, ids[doc_type][i]) # … with the right id !
def search(self, query_text, locale=None, fields=None): """ Return relevant articles given search text. Finding the query term in the title of an article is given twice as much weight as finding the text in the body. After the most relevant articles are obtained, they are ranked by the ranking module (uses view counts here, but can be easily extended). Args: query_text(str): Text to be searched. locale(str): String to filter results by location. fields(list(str)): If specified, restrict the fields returned to this list. Returns: list[dict]: Returns a ranked list of dictionaries representing articles [ { 'id': str, 'title': str, 'body': str, 'locale': str, }, . . ] """ # Create Search object to "match" query text against the title and body # of articles stored in the Knowledge base. s = Search( using=self.client, index=self.INDEX, doc_type=self.TYPE ).query( 'multi_match', query=query_text, fields=['title^2', 'body'] ) # If locale is provided, use it to filter the set of documents that are # queried for. if locale: s = s.filter('term', locale=locale) # Restrict fields if specified. s = s.source(fields) response = s.execute() results, result_dict = [], {} for hit in response: article_id = hit.meta['id'] result_dict[article_id] = hit.__dict__['_d_'] result_dict[article_id]['id'] = article_id # Retrieve view count for each relevant article. results.append((article_id, self.redis.get(article_id))) # Rank results using Ranking function. Currently sorts relevant results by # view counts. ranked_results = Ranker.rank(results) ranked_articles = [result_dict[article_id] for article_id in ranked_results] return ranked_articles
def test_hidden_forums_give_no_results_if_user_not_allowed(self):
    """Long name, isn't it?"""

    if not self.manager.connected_to_es:
        return

    # 1. Create a hidden forum belonging to a hidden staff group.
    text = 'test'

    group = Group.objects.create(name='Les illuminatis anonymes de ZdS')
    _, hidden_forum = create_category_and_forum(group)

    self.staff.groups.add(group)
    self.staff.save()

    topic_1 = TopicFactory(forum=hidden_forum, author=self.staff, title=text)
    post_1 = PostFactory(topic=topic_1, author=self.user, position=1)
    post_1.text = post_1.text_html = text
    post_1.save()

    self.manager.es_bulk_indexing_of_model(Topic)
    self.manager.es_bulk_indexing_of_model(Post)
    self.manager.refresh_index()

    self.assertEqual(len(self.manager.setup_search(Search().query(MatchAll())).execute()), 2)  # indexing ok

    # 2. Search without being logged in and get no result.
    result = self.client.get(reverse('search:query') + '?q=' + text, follow=False)
    self.assertEqual(result.status_code, 200)

    response = result.context['object_list'].execute()
    self.assertEqual(response.hits.total, 0)

    # 3. Connect with user (not a member of the group), search, and get no result.
    self.assertTrue(self.client.login(username=self.user.username, password='******'))

    result = self.client.get(reverse('search:query') + '?q=' + text, follow=False)
    self.assertEqual(result.status_code, 200)

    response = result.context['object_list'].execute()
    self.assertEqual(response.hits.total, 0)

    # 4. Connect with staff, search, and get the topic and the post.
    self.client.logout()
    self.assertTrue(self.client.login(username=self.staff.username, password='******'))

    result = self.client.get(reverse('search:query') + '?q=' + text, follow=False)
    self.assertEqual(result.status_code, 200)

    response = result.context['object_list'].execute()
    self.assertEqual(response.hits.total, 2)  # ok!
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch()

## do a match query
s = Search().using(client).query("match", title="use")
for hit in s:
    print(hit.title)
print(100 * "*")

## do a term query
s = Search().using(client).query("term", title="snowball")
response = s.execute()
if len(response) == 0:
    print("query %s is empty" % s.to_dict())
for hit in response:
    print(hit.title)
print(100 * "*")

## do a terms query
s = Search().using(client).query("terms", tags=["test"])
for hit in s:
    print(hit.title)
print(100 * "*")
def build_query_body(item_uri):
    """Build query dict ready to pass to Elasticsearch search instance for
    retrieving a single item by URI."""
    search = Search(index='pips').query('term', _id=item_uri)
    return search.to_dict()
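A short sketch of how the body returned by build_query_body() might be fed to a raw client. The client host and the example URI are assumptions; only the 'pips' index name comes from the function above.

# Sketch only: default local client and a made-up item URI.
from elasticsearch import Elasticsearch

es = Elasticsearch()
body = build_query_body('urn:example:item:42')
# body is roughly {'query': {'term': {'_id': 'urn:example:item:42'}}};
# the index is not part of the body, so it is passed separately here.
response = es.search(index='pips', body=body)
print(response['hits']['total'])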
def get_count(self):
    s = Search(using=self.es, index=self.index_pattern)
    if self.query:
        s.update_from_dict({"query": self.query})
    return s.count()
import numpy as np
from matplotlib_venn import venn3
from matplotlib import pyplot as plt
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

# Define a default Elasticsearch client
#connections.create_connection(hosts=['http://172.20.30.70:9200/'])
#elasticServer = 'http://172.20.30.70:9200/'  # prod
elasticServer = 'http://172.20.31.19:9200/'  # dev
client = Elasticsearch(hosts=[elasticServer])

#q = Q('bool', must=[Q('match', index='propertypriceregister'), Q('match', Type='propertypriceregister')])
q = Q('match', id='_search')

s = Search(using=client, index="propertypriceregister", doc_type="propertypriceregister").query()
s2 = Search(using=client, index="daft", doc_type="daftproperty").query()
s3 = Search(using=client, index="myhome", doc_type="myhomeproperty").query()
s4 = Search(using=client, index="daftdrop", doc_type="daftdropproperty").query()

'''
count = s.count()
for i in range(0, (count / 1000) + 1):
    response = s[(i*1000):((i+1)*1000)].execute()
    #response = s.execute()
    print('Total %d hits found.' % response.hits.total)
'''
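If the intent of the commented-out block is to walk every document in the index, a sketch using scan() avoids the manual from/size slicing by streaming hits through the scroll API. Only the Search object s from above is reused; everything else is an assumption.

# Sketch only: count documents by iterating the whole index with scan().
total = 0
for hit in s.scan():
    total += 1
print('Total %d hits found.' % total)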
from connectors.elasticsearch_connector import ElasticsearchConnector
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import MultiMatch

es_man = ElasticsearchConnector(host="localhost", port="9220")
es_man.connect()

index = "fdg-article"
search_key = "publisher"
uuid = "7457b5f27a46e69e3f891767d01d2c6f6c5829132a4c03e78f4858828d539fc983a70a3c43ff8a082a8650c0972349eafb50bda7a44062428e2b405249a387f3"
a = "7457b5f27a46e69e3f891767d01d2c6f6c5829132a4c03e78f4858828d539fc983a70a3c43ff8a082a8650c0972349eafb50bda7a44062428e2b405249a387f3"

s = Search(using=es_man.es, index=index).query("match", publisher=uuid)
s = s.query("match", publisher=uuid)
s = s.execute()

multi_match = MultiMatch(query=uuid, fields=['publisher'])
s2 = Search(using=es_man.es, index=index).query(multi_match)
s2 = s2.execute()

art_index = "fdg-textscore"
art_id1 = "8796aff2a14a1ea1539265f76b044f1faf00304d6d9e237aaa21da6c4bab2166f0bed8a2eb99a05d18bc945f811f250da1f4c5a4acbfff1a213bc773894edcd1"
art_id2 = "8796aff2a14a1ea1539265f76b044f1faf00304d6d9e237aaa21da6c4bab2166be47575aa0278fdaaccf0d0aac645381b6db09383e58519fc3589398b63777b3"

sss = Search(using=es_man.es, index=art_index) \
    .filter("terms", _id=[art_id1, art_id2])
response = sss.execute()

# ------------------------------------------
def __len__(self):
    """Returns the total number of entries in the collection."""
    return Search(using=self.client, index=self.name).execute().hits.total
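A sketch of an alternative __len__ using Search.count(), which asks Elasticsearch for the total directly instead of executing a full search and reading hits.total. The attributes are the same ones used above; whether the change is worthwhile depends on the surrounding class.

def __len__(self):
    """Returns the total number of entries in the collection."""
    # count() issues a count request rather than fetching a page of hits.
    return Search(using=self.client, index=self.name).count()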
def _Search(self, indexname):
    """Returns a Search object which can be used for retrieving certain values from the DB."""
    return Search(using=self.__client, index=indexname)
def _build_query(self):
    query = Q()
    source = ['id']
    sort = []
    aggregations = {}
    query_string = None
    as_list = as_dict = False

    for action, value in self.steps:
        if action == 'order_by':
            for key in value:
                if key.startswith('-'):
                    sort.append({key[1:]: 'desc'})
                else:
                    sort.append(key)
        elif action == 'values':
            source.extend(value)
            as_list, as_dict = True, False
        elif action == 'values_dict':
            if value:
                source.extend(value)
            as_list, as_dict = False, True
        elif action == 'query':
            query &= self._process_queries(value)
        elif action == 'filter':
            query &= self._process_filters(value)
        elif action == 'source':
            source.extend(value)
        elif action == 'aggregate':
            aggregations.update(value)
        elif action == 'filter_query_string':
            query_string = value
        else:
            raise NotImplementedError(action)

    # If we have a raw query string we are going to apply all sorts
    # of boosts and filters to improve relevance scoring.
    #
    # We are using the same rules that `search.filters:SearchQueryFilter`
    # implements to have a single-source of truth for how our
    # scoring works.
    from olympia.search.filters import SearchQueryFilter

    search = Search().query(query)

    if query_string:
        search = SearchQueryFilter().apply_search_query(query_string, search)

    if sort:
        search = search.sort(*sort)

    if source:
        search = search.source(source)

    body = search.to_dict()

    # These are manually added for now to simplify a partial port to
    # elasticsearch-dsl
    if self.start:
        body['from'] = self.start

    if self.stop is not None:
        body['size'] = self.stop - self.start

    if aggregations:
        body['aggs'] = aggregations

    self.source, self.as_list, self.as_dict = source, as_list, as_dict
    return body
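For reference, a sketch of the kind of body this method assembles when the steps contain a query, an order_by and a slice. The field names and values are illustrative assumptions, and the SearchQueryFilter boosting path is skipped.

# Sketch only: mirrors how _build_query builds the final body, with
# hypothetical field names standing in for _process_queries() output.
from elasticsearch_dsl import Q, Search

search = Search().query(Q('term', addon_type=1))
search = search.sort({'created': 'desc'})   # as produced by an 'order_by' step of ['-created']
search = search.source(['id', 'name'])      # 'id' is always included by default

body = search.to_dict()
body['from'] = 0    # self.start
body['size'] = 25   # self.stop - self.start
# body now holds 'query', 'sort', '_source', 'from' and 'size' keys,
# ready to be sent as a raw search body.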