def test_es_manager(self):
    """Test the behavior of the ``es_manager`` command"""

    if not self.index_manager.connected_to_es:
        return

    # in the beginning: the void
    self.assertTrue(self.index_manager.index not in self.index_manager.es.cat.indices())

    text = "Ceci est un texte de test"

    # create a topic with a post
    topic = TopicFactory(forum=self.forum, author=self.user, title=text)
    post = PostFactory(topic=topic, author=self.user, position=1)
    post.text = post.text_html = text
    post.save()

    topic = Topic.objects.get(pk=topic.pk)
    post = Post.objects.get(pk=post.pk)

    self.assertFalse(topic.es_already_indexed)
    self.assertTrue(topic.es_flagged)
    self.assertFalse(post.es_already_indexed)
    self.assertTrue(post.es_flagged)

    # create a medium-size tutorial and publish it
    tuto = PublishableContentFactory(type="TUTORIAL")
    tuto.authors.add(self.user)
    tuto.save()

    tuto_draft = tuto.load_version()
    chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
    chapter1.repo_update(text, text, text)
    extract1 = ExtractFactory(container=chapter1, db_object=tuto)
    version = extract1.repo_update(text, text)
    published = publish_content(tuto, tuto_draft, is_major_update=True)

    tuto.sha_public = version
    tuto.sha_draft = version
    tuto.public_version = published
    tuto.save()

    published = PublishedContent.objects.get(content_pk=tuto.pk)
    self.assertFalse(published.es_already_indexed)
    self.assertTrue(published.es_flagged)

    # 1. test "index-all"
    call_command("es_manager", "index_all")
    self.assertTrue(self.index_manager.es.indices.exists(self.index_manager.index))
    self.index_manager.index_exists = True

    topic = Topic.objects.get(pk=topic.pk)
    post = Post.objects.get(pk=post.pk)

    self.assertTrue(topic.es_already_indexed)
    self.assertFalse(topic.es_flagged)
    self.assertTrue(post.es_already_indexed)
    self.assertFalse(post.es_flagged)

    published = PublishedContent.objects.get(content_pk=tuto.pk)
    self.assertTrue(published.es_already_indexed)
    self.assertFalse(published.es_flagged)

    s = Search()
    s = s.query(MatchAll())  # ``query`` returns a clone, so reassign
    results = self.index_manager.setup_search(s).execute()
    self.assertEqual(len(results), 4)  # get 4 results, one of each type

    must_contain = {"post": False, "topic": False, "publishedcontent": False, "chapter": False}
    id_must_be = {
        "post": str(post.pk),
        "topic": str(topic.pk),
        "publishedcontent": str(published.pk),
        "chapter": tuto.slug + "__" + chapter1.slug,
    }

    for hit in results:
        doc_type = hit.meta.doc_type
        must_contain[doc_type] = True
        self.assertEqual(hit.meta.id, id_must_be[doc_type])

    # check every type was seen (``all(must_contain)`` would only test the
    # dict keys, which are always truthy strings)
    self.assertTrue(all(must_contain.values()))

    # 2. test "clear"
    self.assertTrue(self.index_manager.index in self.index_manager.es.cat.indices())  # index in

    call_command("es_manager", "clear")
    self.assertFalse(self.index_manager.es.indices.exists(self.index_manager.index))
    self.index_manager.index_exists = False

    # must reset every object
    topic = Topic.objects.get(pk=topic.pk)
    post = Post.objects.get(pk=post.pk)

    self.assertFalse(topic.es_already_indexed)
    self.assertTrue(topic.es_flagged)
    self.assertFalse(post.es_already_indexed)
    self.assertTrue(post.es_flagged)

    published = PublishedContent.objects.get(content_pk=tuto.pk)
    self.assertFalse(published.es_already_indexed)
    self.assertTrue(published.es_flagged)

    self.assertTrue(self.index_manager.index not in self.index_manager.es.cat.indices())  # index wiped out!

    # 3. test "setup"
    call_command("es_manager", "setup")
    self.assertTrue(self.index_manager.es.indices.exists(self.index_manager.index))
    self.index_manager.index_exists = True
    self.assertTrue(self.index_manager.index in self.index_manager.es.cat.indices())  # index back in ...

    s = Search()
    s = s.query(MatchAll())
    results = self.index_manager.setup_search(s).execute()
    self.assertEqual(len(results), 0)  # ... but with nothing in it

    result = self.index_manager.es.indices.get_settings(index=self.index_manager.index)
    settings_index = result[self.index_manager.index]["settings"]["index"]
    self.assertTrue("analysis" in settings_index)  # custom analyzer was set up

    # 4. test "index-flagged" once ...
    call_command("es_manager", "index_flagged")

    topic = Topic.objects.get(pk=topic.pk)
    post = Post.objects.get(pk=post.pk)

    self.assertTrue(topic.es_already_indexed)
    self.assertFalse(topic.es_flagged)
    self.assertTrue(post.es_already_indexed)
    self.assertFalse(post.es_flagged)

    published = PublishedContent.objects.get(content_pk=tuto.pk)
    self.assertTrue(published.es_already_indexed)
    self.assertFalse(published.es_flagged)

    s = Search()
    s = s.query(MatchAll())
    results = self.index_manager.setup_search(s).execute()
    self.assertEqual(len(results), 4)  # get the 4 results back
from elasticsearch import Elasticsearch
from elasticsearch_dsl import connections, Document, Text, Keyword, Date, Search

connections.create_connection(hosts=['127.0.0.1'], timeout=20)
es = Elasticsearch()


class Blogpost(Document):
    # field names must match the attributes assigned below
    # (``title``, not ``Title``)
    title = Text()
    published_date = Date()
    published_by = Text()
    tags = Keyword()
    body = Text()

    class Index:
        name = 'blog_index'  # index will be created automatically


bg = Blogpost()
bg.title = "A better way to use elasticsearch python"
bg.published_date = "12-01-2019"
bg.published_by = "Kartik"
bg.tags = ['tech', 'elasticsearch', 'python']
bg.body = "Who does not love object-oriented programming paradigms"
bg.save()

s = Search(using=es, index="blog_index") \
    .filter("term", tags="tech") \
    .query("match", title="python") \
    .exclude("match", body="Hello world")
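# A short, hedged follow-up to the snippet above: executing the chained
# query and iterating the hits (assumes the index has been populated).
response = s.execute()
for hit in response:
    # every hit exposes its fields as attributes, plus metadata under ``.meta``
    print(hit.meta.score, hit.title)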
def __init__(self, host: str, index_name: str, *args, **kwargs):
    self.search = Search(using=Elasticsearch(hosts=host), index=index_name)
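# Hedged usage sketch for the constructor above; ``SearchWrapper`` is a
# hypothetical name for its enclosing class, which the snippet does not show:
#
#   wrapper = SearchWrapper(host="localhost:9200", index_name="articles")
#   response = wrapper.search.query("match", title="python").execute()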
def dashboard(request):
    count_ips, count_domains = total_data()
    count_bugs = total_bug()
    total = {"ips": count_ips, "domains": count_domains, "bugs": count_bugs}

    # Asset data
    data = properly.objects.order_by("-id").all()

    # Chart statistics
    payload = {
        "size": 0,
        "aggs": {
            "sales": {
                "date_histogram": {
                    "field": "published_from",
                    "interval": STATIC_TASKS,
                    "format": "yyyy-MM-dd"
                }
            }
        }
    }
    s = Search(using=es, index='w12scan').from_dict(payload)
    res = s.execute().to_dict()
    try:
        charts = res["aggregations"]["sales"]["buckets"]
    except KeyError:
        charts = []
    data_chart = {"labels": [], "data": []}
    for item in charts:
        count = item["doc_count"]
        if count == 0:
            continue
        data_chart["labels"].append(item["key_as_string"])
        data_chart["data"].append(item["doc_count"])

    # Bar chart
    names = count_name(6)
    data_bar = {"labels": [], "data": []}
    for item in names:
        data_bar["labels"].append(item["key"])
        data_bar["data"].append(item["doc_count"])

    # Node monitor
    nodenames = redis_con.keys("w12_node_*")
    nodes = []
    for nodename in nodenames:
        dd = redis_con.hgetall(nodename)
        tem_dict = {}
        tem_dict["nodename"] = lstrsub(nodename, "w12_node_")
        tem_dict["last_time"] = dd.get("last_time", 0)
        tem_dict["tasks"] = dd.get("tasks", "error")
        tem_dict["running"] = dd.get("running", "error")
        tem_dict["finished"] = dd.get("finished", "error")
        tem_dict["status"] = "Running"
        if time.time() - float(tem_dict["last_time"]) > 60 * 5:
            tem_dict["status"] = "Pending"
        tem_dict["time"] = smartDate(float(tem_dict["last_time"]))
        nodes.append(tem_dict)

    # Vulnerability (bug) chart for domains
    dd = es.indices.get_mapping(index='w12scan', doc_type='domains')
    dd = dd["w12scan"]["mappings"]["domains"]["properties"]
    data_bugs = []
    if "bugs" in dd:
        bug_type = dd["bugs"]["properties"].keys()
        index = 0
        for bug_name in bug_type:
            index += 1
            count = get_bug_count('domains', bug_name)
            dd = {}
            _cls = ["primary", "info", "danger", "success", "warning"]
            dd["label"] = bug_name
            dd["count"] = count
            dd["cls"] = _cls[index % 5]
            data_bugs.append(dd)

    return render(request, "frontend/dashboard.html",
                  {"total": total, "zc_data": data, "data_chart": data_chart,
                   "data_bar": data_bar, "nodes": nodes, "data_bugs": data_bugs})
def index(request):
    page = request.GET.get("p", "1")
    q = request.GET.get("q", None)
    try:
        page = int(page)
    except ValueError:
        page = 1
    if page <= 0:
        page = 1
    es = Elasticsearch(ELASTICSEARCH_HOSTS)
    start_time = datetime.now()
    keywords = None
    if q is None:
        _search = {
            "from": (page - 1) * 20,
            "size": 20,
            "sort": {"published_from": {"order": "desc"}}
        }
    else:
        _search, keywords = k2e_search(q, page)
    s = Search(using=es, index='w12scan').from_dict(_search)
    count = s.execute().hits.total

    # Pagination logic
    max_page = math.ceil(count / 20)
    if page <= 5:
        paginations = range(1, 10)
    elif page + 5 > max_page:
        paginations = range(max_page - 5, max_page + 5)
    else:
        paginations = range(page - 5, page + 5)
    temp_pagin = []
    for i in paginations:
        if i <= max_page:
            temp_pagin.append(i)
    paginations = temp_pagin
    pagination = {
        "max_page": str(max_page),
        "current": page,
        "pre": str(page - 1) if page - 1 > 0 else "1",
        "next": str(page + 1) if page + 1 <= max_page else str(max_page),
        "paginations": paginations,
        "keyword": ""
    }
    if q is not None:
        pagination["keyword"] = "&q=" + q
    # End of pagination

    datas = []
    for hit in s:
        doc_type = hit.meta.doc_type
        id = hit.meta.id
        d = {}
        if doc_type == "ips":
            d.update(hit.to_dict())
            if d.get("infos"):
                d["info_tags"] = []
                for info in d["infos"]:
                    d["info_tags"].append("{}/{}".format(info["port"], info.get("name", "unknown")))
                d["infos"] = json.dumps(d["infos"], indent=2)
            # Asset association
            d["proper"] = is_proper(d["target"], "ip")
        elif doc_type == "domains":
            d.update(hit.to_dict())
            d["target"] = d.get("title") or d.get("url")
            if d.get("ip"):
                ip = d.get("ip")
                ip_info = es_search_ip(ip, True)
                if ip_info:
                    d["location"] = ip_info.location
            d["proper"] = is_proper(d["url"], "domain")
        d["doc_type"] = doc_type
        d["id"] = id
        d["published_from"] = datetime_string_format(d["published_from"])
        datas.append(d)

    # Left-hand statistics panel
    statistics = {}
    # 1. Component statistics
    apps = count_app()
    countrys = count_country()
    names = count_name()
    ports = count_port()
    statistics["apps"] = apps
    statistics["countrys"] = countrys
    statistics["names"] = names
    statistics["ports"] = ports

    # Total elapsed time
    end_time = (datetime.now() - start_time).total_seconds()
    return render(request, "frontend/recent.html",
                  {"datas": datas, "count": count, "second": end_time,
                   "pagination": pagination, "statistics": statistics,
                   "keyword": keywords})
from datetime import datetime

from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch_dsl import Search

from es_models import Items, Bids
# from view_helper import create_bid_id

es = Elasticsearch()
s = Search(es, index='item', doc_type='item')

INITIAL_ITEM_STATUS = 'Available'


def create_bid_id(bidder, item):
    return bidder + '_' + item.item_name


def add_item(seller, item_name, min_bid):
    item = Items(meta={'id': item_name})
    item.item_name = item_name
    item.status = INITIAL_ITEM_STATUS
    item.seller = seller
    # item.created_at = datetime.now().strftime("%a %b %d %H:%M:%S %Y")
    item.min_bid = min_bid
    item.sold_to = None
    item.save()
    return item


def create_bid(item, bidder, bid_amount):
    bid = Bids()
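# Hedged sketch of how the truncated ``create_bid`` might continue; the field
# names mirror the ``add_item`` pattern above and are assumptions:
#
#   bid.meta.id = create_bid_id(bidder, item)
#   bid.bidder = bidder
#   bid.bid_amount = bid_amount
#   bid.item_name = item.item_name
#   bid.save()
#   return bid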
def checkticket(ip_src, cipher, msg_type, timestamp):
    # Wait for the output to reach Elasticsearch
    time.sleep(3)
    es = Elasticsearch('10.0.19.112:9200')
    s = Search(using=es, index="cipher-*")
    s = s[0:10000]

    # Check old cipher
    if msg_type == 12:
        q = (Q('match', layers__kerberos_msg_type=11) |
             Q('match', layers__kerberos_msg_type=13)) & \
            Q('match', layers__ip_dst__keyword=ip_src) & \
            Q('match', layers__kerberos_cipher__keyword=cipher)
    if msg_type == 14:
        q = Q('match', layers__kerberos_msg_type=13) & \
            Q('match', layers__ip_dst__keyword=ip_src) & \
            Q('match', layers__kerberos_cipher__keyword=cipher)
    s1 = s.query(q)
    response = s1.execute()
    if len(response) != 0:
        print('matched with old cipher at ' + str(timestamp))
    else:
        # Check whether the ticket (TKT) has expired
        qtime = Q('range', timestamp={'gte': int(timestamp), 'lte': int(timestamp) + 1000}) & \
            Q('match', layers__ip_dst__keyword=ip_src) & \
            Q('match', layers__kerberos_error_code=32)
        s2 = s.query(qtime)
        response2 = s2.execute()
        if len(response2) != 0:
            print('TKT Expired at ' + str(timestamp))
        else:
            # Update the packet-* index
            time.sleep(5)
            s = Search(using=es, index="packet-*")
            s = s[0:10000]
            qsilver = Q('match', layers__kerberos_cipher__keyword=cipher)
            s3 = s.query(qsilver)
            response3 = s3.execute()
            for h in response3:
                id = h.meta.id
                index = h.meta.index
                if msg_type == 12:
                    detect_golden.detect_golden(ip_src)
                    es.update(index=index, doc_type='doc', id=id,
                              body={'doc': {'indicator': 'attack: Golden Ticket is used'}})
                if msg_type == 14:
                    es.update(index=index, doc_type='doc', id=id,
                              body={'doc': {'indicator': 'attack: Silver Ticket is used'}})
            if msg_type == 12:
                print('Golden ticket was used on ' + str(ip_src) + ' at ' + str(timestamp))
            if msg_type == 14:
                print('Silver ticket was used on ' + str(ip_src) + ' at ' + str(timestamp))
def search(search_params, index, page_size, ip, request,
           filter_dead, page=1) -> Tuple[List[Hit], int, int, str]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class:`ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the total count of
    pages, the number of results, and the query suggestion.
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the
    # API with its corresponding field in Elasticsearch. "None" means that
    # the names are identical.
    filters = [
        ('extension', None),
        ('categories', None),
        ('aspect_ratio', None),
        ('size', None),
        ('source', 'provider'),
        ('license', 'license__keyword'),
        ('license_type', 'license__keyword')
    ]
    for api_field, elasticsearch_field in filters:
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)

    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})

    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)
    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects \
            .filter(filter_content=True) \
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string',
                    query=query,
                    fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string',
                        query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator, term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string',
                        query=title,
                        fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string',
                        fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})

    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature',
                                   field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool',
              must=s.query,
              should=queries,
              minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection
    # of top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    # ``extra`` returns a clone, so the result must be reassigned
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(
        s, start, end, page_size, search_response, request, filter_dead)

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)
    return results, page_count, result_count, suggestion
def search(self, keyword):
    s = Search(using=self.client, index=self.news_index) \
        .query("match", content=keyword)
    response = s.execute()
    return response
from elasticsearch_dsl import Search
from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT_EVAL, ES_INDEX_DOCUMENT
from sklearn.metrics import classification_report, roc_auc_score

tm_name = "bigartm_two_years_old_parse"
criterion_id = 35

s = Search(using=ES_CLIENT, index=f"{ES_INDEX_DOCUMENT_EVAL}_{tm_name}_{criterion_id}")
s = s.source(('value', 'document_es_id'))

# Values
document_values = dict((h.document_es_id, h.value) for h in s.scan())

# Resonance thresholds
std = Search(using=ES_CLIENT, index=f"{ES_INDEX_DOCUMENT}") \
    .filter("range", num_views={"gt": 0})
std.aggs.bucket("sources", agg_type="terms", field="source", size=100) \
    .metric("stats", agg_type="extended_stats", field="num_views")
r = std.execute()
source_resonances = dict(
    (bucket.key, bucket.stats) for bucket in r.aggregations.sources.buckets)
source_resonance_means = dict(
    (source, stats.avg) for source, stats in source_resonances.items())
source_resonance_stds = dict(
    (source, stats.std_deviation) for source, stats in source_resonances.items())
sigma_threshold = 1
source_resonance_thresholds = dict(
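# Hedged completion of the truncated last line: given the per-source means
# and standard deviations computed above, a natural reading of
# ``sigma_threshold`` is a mean-plus-one-sigma cutoff per source; this is an
# assumption, not the original code:
#
#   source_resonance_thresholds = dict(
#       (source, source_resonance_means[source]
#                + sigma_threshold * source_resonance_stds[source])
#       for source in source_resonances
#   )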
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch(timeout=60)
s = Search(using=client, index="winlogbeat-*")
# ``source`` returns a clone, so the result must be reassigned
s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])

maps = {}
count = 0
for hit in s.scan():
    if hit.winlog.provider_name not in maps:
        maps[hit.winlog.provider_name] = {}

    if hit.winlog.event_id not in maps[hit.winlog.provider_name]:
        maps[hit.winlog.provider_name][hit.winlog.event_id] = 1
    else:
        maps[hit.winlog.provider_name][hit.winlog.event_id] += 1

    count += 1
    if count % 1000 == 0:
        print("Progress %d" % count)

for provider_name, v in maps.items():
    for event_id, count in v.items():
        print('%s: %d (%d)' % (provider_name, event_id, count))
def search(page='0', dic=None, content=None):
    limitpage = 15
    validresult = False
    orderlabel = 0
    orderarray = []
    if content is not None:
        q = Q("multi_match",
              query=content,
              fields=['ip', 'name', 'product', 'script', 'detail', 'head',
                      'hackinfo', 'keywords', 'disclosure'])
    else:
        searcharray = []
        keys = dic.keys()
        orderlabel = 0
        for key in keys:
            if key == 'name':
                searcharray.append(Q('term', name=dic[key]))
            if key == 'ip':
                searcharray.append(Q('term', ip=dic[key]))
            if key == 'port':
                searcharray.append(Q('term', port=dic[key]))
            if key == 'state':
                searcharray.append(Q('term', state=dic[key]))
            if key == 'timesearch':
                searcharray.append(Q('match', timesearch=dic[key]))
            if key == 'keywords':
                searcharray.append(Q('match', keywords=dic[key]))
            if key == 'product':
                searcharray.append(Q('match', product=dic[key]))
            if key == 'version':
                searcharray.append(Q('match', version=dic[key]))
            if key == 'script':
                searcharray.append(Q('match', script=dic[key]))
            if key == 'hackinfo':
                searcharray.append(Q('match', hackinfo=dic[key]))
            if key == 'head':
                searcharray.append(Q('match', head=dic[key]))
            if key == 'detail':
                searcharray.append(Q('match', detail=dic[key]))
            if key == 'disclosure':
                searcharray.append(Q('match', disclosure=dic[key]))
            if key == 'webtitle':
                searcharray.append(Q('match', webtitle=dic[key]))
            if key == 'webkeywords':
                searcharray.append(Q('match', webkeywords=dic[key]))
            if key == 'order':
                orderarray.append(dic[key])
                orderlabel = 1
        q = Q('bool', must=searcharray)

    if orderlabel == 0:
        s = Search(index='datap', doc_type='snifferdata').query(q)
    else:
        s = Search(index='datap', doc_type='snifferdata').query(q).sort(orderarray[0])

    # Equivalent raw-dict form of such a query:
    # s = Search.from_dict({
    #     "query": {
    #         "bool": {
    #             "must": [
    #                 {"term": {"name": "http"}},
    #                 {"term": {"port": "80"}},
    #                 {"match": {"head": "manager"}},
    #             ]
    #         }
    #     }
    # })

    s = s[int(page) * limitpage:int(page) * limitpage + limitpage]
    response = s.execute()
    if response.success():
        portarray = []
        count = response.hits.total
        print('total number of hits: %d' % count)
        if count == 0:
            pagecount = 0
        elif count % limitpage > 0:
            pagecount = int((count + limitpage - 1) / limitpage)
        else:
            pagecount = count // limitpage
        from nmaptoolbackground.model import ports
        count = len(response)
        print('actual number returned: %d' % count)
        if count > 0:
            for temp in response:
                dic = temp.to_dict()
                aport = ports.Port(
                    ip=getproperty(dic, 'ip'),
                    port=getproperty(dic, 'port'),
                    timesearch=getproperty(dic, 'timesearch'),
                    state=getproperty(dic, 'state'),
                    name=getproperty(dic, 'name'),
                    product=getproperty(dic, 'product'),
                    version=getproperty(dic, 'version'),
                    script=base64.b64encode(str(getproperty(dic, 'script'))),
                    detail=getproperty(dic, 'detail'),
                    head=getproperty(dic, 'head'),
                    city='',
                    hackinfo=getproperty(dic, 'hackinfo'),
                    disclosure=getproperty(dic, 'disclosure'),
                    keywords=getproperty(dic, 'keywords'),
                    webtitle=base64.b64encode(str(getproperty(dic, 'webtitle'))),
                    webkeywords=getproperty(dic, 'webkeywords'))
                portarray.append(aport)
        return portarray, count, pagecount
    else:
        print('query failed')
        return [], 0, 0
def results():
    page = request.args
    page_number = int(page.get('page_number')) if page.get('page_number', '') != '' else 1
    query = page.get('query', '')
    if len(query) == 0:
        return redirect('/')

    search = Search(index=index_name)

    # boost field weights
    boost_weight = [i + 1 for i in get_classifier().predict([extract_features(query)])[0]]
    fields_list = query_helper.boost_fields(boost_weight)

    # supports '|', '+', '-', "" phrase search, '*', etc.
    s = search.query('simple_query_string',
                     fields=fields_list,
                     query=query,
                     default_operator='and')
    # highlight
    query_helper.highlight(s, fields_list)

    start = 0 + (page_number - 1) * 10
    end = 10 + (page_number - 1) * 10
    message = []
    response = s[start:end].execute()

    # if there are no results, switch to a disjunctive search
    if response.hits.total == 0 and len(query) > 0:
        message.append(f'Unknown search term: {query}, switching to disjunction.')
        s = search.query('simple_query_string',
                         fields=fields_list,
                         query=query,
                         default_operator='or')
        response = s[start:end].execute()

    # insert data into response
    result_list = query_helper.parse_result(response)

    # if there are results, insert them into query_index
    query_id = 0
    qs = ""
    try:
        qs = Query.select().where(Query.query == query)
    except Query.DoesNotExist:
        qs = None
    if result_list:
        if not qs:
            q1 = Query(query=query, result=json.dumps(result_list))
            q1.save()
            query_id = q1.id
            q = SearchQuery(query=query,
                            suggest=query,
                            meta={'index': 'query_index', 'id': query_id})
            q.save()
        else:
            query_id = Query.get(Query.query == query).id
    result_num = response.hits.total

    return render_result({
        'result_list': result_list,
        'result_num': result_num,
        'query_id': query_id,
        'query': query,
        'page_number': page_number,
        'message': message,
        'page_size': 10
    })
def search(search_params, index, page_size, ip, page=1) -> Response:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class:`~cccatalog.api.search_serializers.SearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param page: The results page number.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :return: An Elasticsearch Response object.
    """
    s = Search(index=index)

    # Paginate search query.
    start_slice = page_size * (page - 1)
    end_slice = page_size * page
    if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW:
        raise ValueError("Deep pagination is not allowed.")
    s = s[start_slice:end_slice]

    # If any filters are specified, add them to the query.
    if 'li' in search_params.data or 'lt' in search_params.data:
        license_field = 'li' if 'li' in search_params.data else 'lt'
        license_filters = []
        for _license in search_params.data[license_field].split(','):
            license_filters.append(Q('term', license__keyword=_license))
        s = s.filter('bool', should=license_filters, minimum_should_match=1)
    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects \
            .filter(filter_content=True) \
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    if 'q' in search_params.data:
        keywords = ' '.join(search_params.data['q'].lower().split(','))
        s = s.query('constant_score',
                    filter=Q('multi_match',
                             query=keywords,
                             fields=['tags.name', 'title'],
                             operator='AND'))
    else:
        if 'creator' in search_params.data:
            creator = search_params.data['creator']
            s = s.query('constant_score', filter=Q('match', creator=creator))
        if 'title' in search_params.data:
            title = search_params.data['title']
            s = s.query('constant_score', filter=Q('match', title=title))
        if 'tags' in search_params.data:
            tags = ' '.join(search_params.data['tags'].lower().split(','))
            s = s.query('constant_score',
                        filter=Q('multi_match',
                                 fields=['tags.name'],
                                 query=tags))

    # ``extra`` returns a clone, so the result must be reassigned
    s = s.extra(track_scores=True)
    s = s.params(preference=str(ip))
    search_response = s.execute()
    return search_response
def aggregate_by_event_data(self, event_id=None, event_data_name="Image",
                            sub_event_data_name=None, bucket_size=1000,
                            sub_bucket_size=100, threshold=None,
                            filter_event_data_name='',
                            filter_event_data_value='',
                            aggregate_by_hostname=False):
    es_query = self.get_default_query()

    if event_id is not None:
        es_query.append({'match': {'winlog.event_id': event_id}})

    if filter_event_data_name:
        filter_field_name = 'winlog.event_data.' + filter_event_data_name
        es_query.append({'match': {filter_field_name: filter_event_data_value}})

    query = Q({'bool': {'must': es_query}})
    s = Search(using=self.Client, index="winlogbeat-*").query(query)

    if self.DTRange is not None:
        s = s.filter('range', **self.DTRange)

    # ``source`` returns a clone, so the result must be reassigned
    s = s.source(includes=['winlog.*'])

    if aggregate_by_hostname:
        b = s.aggs.bucket(event_data_name, 'terms',
                          field='agent.hostname', size=bucket_size)
    else:
        b = s.aggs

    b = b.bucket(event_data_name, 'terms',
                 field='winlog.event_data.' + event_data_name,
                 size=bucket_size)

    if threshold:
        # https://github.com/ongr-io/ElasticsearchDSL/blob/master/docs/Aggregation/Pipeline/BucketSelector.md
        # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html
        threshold_bucket_name = event_data_name + "_counts"
        b.bucket(threshold_bucket_name, 'cardinality', field='@timestamp')
        b.pipeline('threshold_bucket_selector', 'bucket_selector',
                   buckets_path={"counts": threshold_bucket_name},
                   script='params.counts > %d' % threshold)

    if sub_event_data_name:
        b.bucket(sub_event_data_name, 'terms',
                 field='winlog.event_data.' + sub_event_data_name,
                 size=sub_bucket_size)

    if self.DebugQuery:
        pprint.pprint(s.to_dict())

    # note: ``scan()`` streams hits without aggregations, so the aggregation
    # results always come from ``execute()``
    if self.Scan:
        s.scan()
    response = s.execute()

    return response.aggregations[event_data_name]
def get_topics_from_topic_index(self):
    s = Search(using=self.client, index=self.topic_index)
    response = s.scan()
    for hit in response:
        yield hit
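# Hedged usage sketch (``indexer`` is a stand-in for an instance of the class
# that defines the generator above); ``scan()`` streams every hit, so this
# walks the whole topic index without manual pagination:
#
#   for topic in indexer.get_topics_from_topic_index():
#       print(topic.meta.id)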
from core.search.cache_data import cache_paper_info

# Constants
NUM_PAPERS = 1

# Elastic search client
client = Elasticsearch(conf.get("elasticsearch.hostname"))

THRESHOLD_DATE = datetime(2019, 3, 6, 10, 43, 45, 734484)

# Memory for deleting entries which no longer exist
last_papers = set()

while True:
    # Specify the query
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.sort({"CreatedDate": {"order": "desc"}})
    paper_info_s = paper_info_s.update_from_dict({
        "query": {
            "bool": {
                "must_not": [
                    {"exists": {"field": "FieldsOfStudy"}}
                ],
                "must": {
                    "range": {"CreatedDate": {"lt": THRESHOLD_DATE}}
                }
            }
        }
    })
    paper_info_s = paper_info_s.source(['PaperId'])

    # Get number of query results
    results = paper_info_s[:NUM_PAPERS]
    papers = [x.PaperId for x in results.execute()]

    # Check if the paper has been seen before, and thus needs to be deleted
    checked_papers = last_papers.intersection(set(papers))
    if checked_papers:
        delete_info_s = Search(index='paper_info', using=client)
        # a ``terms`` query matches any of the collected ids; a ``match``
        # query does not accept a list
        delete_info_s = delete_info_s.query("terms", PaperId=list(checked_papers))
        delete_info_s.delete()
    last_papers = set(papers).difference(checked_papers)
def aggregate(self, keyword):
    s = Search(using=self.client, index=self.news_index).query("match", content=keyword)
    response = s.execute()
    # note: no aggregation has been defined on ``s`` at this point, so
    # ``response.aggregations`` stays empty unless one is added first
    for tag in response.aggregations:
        print(tag)
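# A minimal sketch of the missing step in the method above: define a bucket
# on ``s.aggs`` before executing. The ``tags`` keyword field is an assumption.
#
#   s.aggs.bucket('per_tag', 'terms', field='tags')
#   response = s.execute()
#   for bucket in response.aggregations.per_tag.buckets:
#       print(bucket.key, bucket.doc_count)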
def search_elastic(term='', user=None, sort='id', order='desc',
                   category='0_0', quality_filter='0', page=1,
                   rss=False, admin=False, logged_in_user=None,
                   per_page=75, max_search_results=1000):
    # This function can easily be memcached now

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'comments': 'comment_count',
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)

    es_sort = es_sort_keys[sort_]

    order_keys = {'desc': 'desc', 'asc': 'asc'}

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        es_sort = es_sort_keys['id']
        order = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if 'desc' == order:
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'   # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client, index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        s = s.query('simple_query_string',
                    # Query both fields, latter for words with >15 chars
                    fields=['display_name', 'display_name.fullword'],
                    analyzer='my_search_analyzer',
                    default_operator="AND",
                    query=term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        max_page = min(page, int(math.ceil(max_search_results / float(per_page))))
        from_idx = (max_page - 1) * per_page
        to_idx = min(max_search_results, max_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pandas as pd

# We will create an object for accessing the ElasticSearch instance.
# In this case, we will assume that it is running on our system with its
# REST interface available on port 9200.
# verify_certs will be necessary if you're connecting to an ElasticSearch
# over TLS (https) with a bad certificate.
es = Elasticsearch('http://localhost:9200', verify_certs=False)

# The enriched indexes store one document per commit.
# The query builds buckets of commits, grouped by author name,
# aggregated as first commit for each of these authors.
s = Search(using=es, index='git')
# the method is ``bucket`` (singular); it attaches a terms aggregation by
# author, with a min sub-aggregation for each author's first commit
s.aggs.bucket('by_authors', 'terms', field='author_name', size=10000) \
    .metric('first_commit', 'min', field='author_date')
s = s.sort('author_date')
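# Hedged continuation: ``pandas`` is imported above but never used, which
# suggests the buckets end up in a DataFrame; this sketch executes the
# aggregation and tabulates each author's first commit.
response = s.execute()
buckets = response.aggregations.by_authors.buckets
df = pd.DataFrame([(b.key, b.first_commit.value_as_string) for b in buckets],
                  columns=['author_name', 'first_commit'])
print(df.head())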
def search(event_name):
    s = Search().filter('term', event_name=event_name)
    response = s.execute()
    return response
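# ``Search()`` with no ``using=`` argument relies on a default connection
# having been registered beforehand; a minimal sketch of that setup (the
# host is an assumption):
#
#   from elasticsearch_dsl import connections
#   connections.create_connection(hosts=['localhost'], timeout=20)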
def default_search(param):
    if param is None:
        return Search()
    return param
def detail(request, id):
    """
    IP / domain detail view
    :param request:
    :param id:
    :return:
    """
    data = es_search_ip_by_id(id)
    if not data:
        raise Http404
    data = data[0]
    doc_type = data["_type"]
    data = data["_source"]
    data["published_from"] = datetime_string_format(data["published_from"])
    if doc_type == "ips":
        target = data["target"]
        data["proper"] = is_proper(target, "ip")
        # Associated domains
        union_domains = es_search_domain_by_ip(target, True)
        # Historical IP records
        historys = es_search_ip(target)
        for h in historys:
            h["published_from"] = datetime_string_format(h["published_from"])
        # Associated IPs from the same /24 (class C) network
        c_data = []
        temp_ips = target.split(".")
        if len(temp_ips) == 4:
            del temp_ips[-1]
        query_ip = '.'.join(temp_ips) + ".*"
        payload = {
            "query": {
                "wildcard": {"target": query_ip}
            },
            "collapse": {
                "field": "target"
            },
            "sort": {
                "published_from": {"order": "desc"}
            },
            "from": 0,
            "size": 10000
        }
        s = Search(using=es, index='w12scan', doc_type='ips').from_dict(payload)
        res = s.execute()
        for hit in res:
            cid = hit.meta.id
            d = hit.to_dict()
            if d["target"] != target:
                if isinstance(d["target"], list):
                    d["target"] = d["target"][0]
                # Domains hosted on this neighboring IP
                sub_data = []
                sub_domain = es_search_domain_by_ip(d["target"], True)
                for sub in sub_domain:
                    dd = {}
                    dd.update(sub)
                    sub_data.append(dd)
                extrainfo = ""
                for k in d.get("infos", []):
                    extrainfo += "{0}/{1} ".format(k.get("port", ""), k.get("name", "unknown"))
                c_data.append({"id": cid, "ip": d["target"],
                               "data": sub_data, "extrainfo": extrainfo})
        # Sort c_data by the last octet of the IP
        c_data.sort(key=lambda a: int(a.get("ip", 0).split(".")[3]))
        return render(request, "frontend/ip_detail.html",
                      {"data": data, "union": union_domains, "c_data": c_data,
                       "third_infomation": third_info(target), "historys": historys})
    elif doc_type == "domains":
        ip = data["ip"]
        target = data["url"]
        data["proper"] = is_proper(target, "domain")
        # Fields to display
        field = ["title", "status_code", "X-Powered-By", "Server"]
        uldata = []
        for f in field:
            if f in data:
                uldata.append((f, data[f]))
        hit = es_search_ip(ip, deduplicat=True)
        historys = es_search_domain_by_url(target)
        for h in historys:
            h["published_from"] = datetime_string_format(h["published_from"])
        ip_data = {}
        if hit:
            ip_data["id"] = hit.meta.id
            ip_data["ip"] = list(hit.target)[0]
        # Fetch subdomains
        try:
            sub_domain = get_fld(target, fix_protocol=True)
        except Exception:
            sub_domain = None
        sub_domain_data = []
        if sub_domain:
            payload = {
                "query": {
                    "wildcard": {"url": "*." + sub_domain}
                },
                "collapse": {
                    "field": "url"
                },
                "sort": {
                    "published_from": {"order": "desc"}
                },
                "from": 0,
                "size": 10000
            }
            s = Search(using=es, index='w12scan', doc_type='domains').from_dict(payload)
            for hit in s:
                dd = {}
                dd.update(hit.to_dict())
                if isinstance(dd["url"], list):
                    dd["url"] = dd["url"][0]
                dd["id"] = hit.meta.id
                dd["published_from"] = datetime_string_format(dd["published_from"])
                sub_domain_data.append(dd)
        return render(request, "frontend/domain_detail.html",
                      {"data": data, "ip_data": ip_data, "sub_domain": sub_domain_data,
                       "third_infomation": third_info(ip), "historys": historys,
                       "uldata": uldata})
def get_queryset(self):
    if not self.index_manager.connected_to_es:
        messages.warning(self.request, _('Impossible de se connecter à Elasticsearch'))
        return []

    if self.search_query:
        # Search in the forums the user is allowed to visit
        self.authorized_forums = get_authorized_forums(self.request.user)

        search_queryset = Search()

        # Restrict (sub)category if any
        if self.search_form.cleaned_data['category']:
            self.content_category = self.search_form.cleaned_data['category']
        if self.search_form.cleaned_data['subcategory']:
            self.content_subcategory = self.search_form.cleaned_data['subcategory']

        # Mark that contents must come from library if required
        self.from_library = False
        if self.search_form.cleaned_data['from_library'] == 'on':
            self.from_library = True

        # Set up the different querysets (according to the selected models, if any)
        part_querysets = []
        chosen_groups = self.search_form.cleaned_data['models']

        if chosen_groups:
            models = []
            for group in chosen_groups:
                if group in settings.ZDS_APP['search']['search_groups']:
                    models.append(settings.ZDS_APP['search']['search_groups'][group][1])
        else:
            models = [v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items()]

        models = reduce(operator.concat, models)

        for model in models:
            part_querysets.append(getattr(self, 'get_queryset_{}s'.format(model))())

        queryset = part_querysets[0]
        for query in part_querysets[1:]:
            queryset |= query

        # Weighting:
        weight_functions = []
        for _type, weights in list(settings.ZDS_APP['search']['boosts'].items()):
            if _type in models:
                weight_functions.append({
                    'filter': Match(_type=_type),
                    'weight': weights['global']
                })

        scored_queryset = FunctionScore(query=queryset,
                                        boost_mode='multiply',
                                        functions=weight_functions)
        search_queryset = search_queryset.query(scored_queryset)

        # Highlighting:
        search_queryset = search_queryset.highlight_options(
            fragment_size=150,
            number_of_fragments=5,
            pre_tags=['[hl]'],
            post_tags=['[/hl]'])
        search_queryset = search_queryset.highlight('text').highlight('text_html')

        # Executing:
        return self.index_manager.setup_search(search_queryset)

    return []
def zc_detail(request, id):
    try:
        m = properly.objects.get(id=id)
    except properly.DoesNotExist:
        m = None
    if m is None:
        raise Http404

    # Process the domain assets
    show_data = {}
    domains = m.domains.splitlines()
    show_data['domains'] = domains
    payload = {
        "query": {
            "bool": {
                "should": []
            }
        },
        "collapse": {
            "field": "url"
        },
        "sort": {
            "published_from": {"order": "desc"}
        },
        "from": 0,
        "size": 10000
    }
    temp_list = []
    for temp in domains:
        if "*" not in temp and not temp.startswith("http"):
            temp = "http*" + temp
        temp_list.append({
            "wildcard": {
                "url": temp
            }
        })
    payload["query"]["bool"]["should"] = temp_list
    domains_data = []
    apps = set()
    if temp_list:
        s = Search(using=es, index='w12scan', doc_type='domains').from_dict(payload)
        for hit in s:
            dd = {}
            dd.update(hit.to_dict())
            dd["id"] = hit.meta.id
            if isinstance(dd["url"], list):
                dd["url"] = dd["url"][0]
            if dd.get("app"):
                apps |= set(dd.get("app"))
            domains_data.append(dd)

    # Extract IPs from the domains and add them to the IP assets
    temp_ips = set()
    for domain in domains_data:
        ip = domain.get("ip")
        if ip:
            temp_ips.add(ip)

    # Process the IP assets
    ips = m.ips.splitlines()
    show_data["ips"] = ips
    temp_ips |= set(ips)
    temp_list = []
    for temp in temp_ips:
        _ip = temp
        if "*" in _ip:
            temp_list.append({
                "wildcard": {
                    "target": _ip
                }
            })
        elif "/" in _ip:
            try:
                net = ipaddress.ip_network(_ip)
            except Exception as e:
                print(e)
                net = None
            if net:
                for i in net:
                    if i not in temp_ips:
                        temp_list.append({
                            "term": {
                                "target": str(i)
                            }
                        })
        else:
            temp_list.append({
                "term": {
                    "target": _ip
                }
            })
    payload = {
        "query": {
            "bool": {
                "should": []
            }
        },
        "collapse": {
            "field": "target"
        },
        "sort": {
            "published_from": {"order": "desc"}
        },
        "from": 0,
        "size": 10000
    }
    payload["query"]["bool"]["should"] = temp_list
    ips_data = []
    # Per-service-name statistics for the IPs
    statics_services = {}
    if temp_list:
        s = Search(using=es, index='w12scan', doc_type='ips').from_dict(payload)
        for hit in s:
            dd = {}
            dd.update(hit.to_dict())
            dd["id"] = hit.meta.id
            if isinstance(dd["target"], list):
                dd["target"] = dd["target"][0]
            ips_data.append(dd)
            # Statistics
            if dd.get("infos"):
                for item in dd.get("infos"):
                    name = item.get("name", None)
                    if not name:
                        continue
                    if name not in statics_services:
                        statics_services[name] = 0
                    statics_services[name] += 1
    data_pie = {
        "labels": list(statics_services.keys()),
        "data": list(statics_services.values())
    }
    return render(request, "frontend/zc-detail.html",
                  {"model": m, "domains": domains_data, "show_data": show_data,
                   "apps": apps, "ips": ips_data, "data_pie": data_pie})
def search_results(self, request, query_term):
    """ Display results based on search term. """
    is_gene_suggest = False
    if request.method == "GET":
        client = Elasticsearch([ES_HOST], timeout=60)
        search_gene = Search().using(client).doc_type('genes').source(
            exclude=['isoforms.cds', 'isoforms.exons', 'GO'])
        if query_term is None:
            studies = Study.objects.all()
            phenotypes = Phenotype.objects.all()
            # The Elasticsearch query cannot be made before knowing the
            # ordering, the page number, etc., as this is taken into account
            # by elasticsearch.py
        else:
            studies = Study.objects.filter(
                Q(name__icontains=query_term) |
                Q(phenotype__trait_ontology_name__icontains=query_term) |
                Q(phenotype__name__icontains=query_term) |
                Q(phenotype__description__icontains=query_term) |
                Q(publication_pmid__icontains=query_term) |
                Q(publication_pmcid__icontains=query_term)).order_by('n_hits_perm').reverse()
            phenotypes = Phenotype.objects.filter(
                Q(name__icontains=query_term) |
                Q(description__icontains=query_term)).order_by('name')
            # Add chromosome position search for genomic regions
            try:
                int(query_term)
                isnum = True
            except ValueError:
                isnum = False
            import re
            pattern = re.compile(
                r"(Chr|CHR|chr)+\s?([0-9]{1,2})+(-|:)?(\d*)\s*(-|:|)?\s*(\d+|)")
            if isnum:
                # Only a number, look for neighboring genes on all chromosomes.
                q = QES('range', positions={"gte": int(query_term), 'lte': int(query_term)})
                search_gene = search_gene.query(q)
            elif pattern.match(query_term):
                # Specific genomic range
                splitted = re.split(
                    r"(Chr|CHR|chr)+\s?([0-9]{1,2})+(-|:)?(\d*)\s*(-|:|)?\s*(\d+|)",
                    query_term)
                chr = int(splitted[2])
                s_p = None
                e_p = None
                if splitted[4]:
                    s_p = int(splitted[4])
                if splitted[6]:
                    e_p = int(splitted[6])
                # Need to retrieve all genes that overlap somehow with that
                # region (all-in, right part in, left part in, etc)
                q = QES('match', chr='chr' + str(chr))
                search_gene = search_gene.query(q)
                if s_p:
                    if e_p:
                        # Look for genes overlapping with region of interest
                        q = (QES('range', positions={'gte': s_p, 'lte': e_p}) |
                             QES('range', positions={'gte': s_p, 'lte': s_p}) |
                             QES('range', positions={'gte': e_p, 'lte': e_p}))
                    else:
                        q = (QES('range', positions={'gte': s_p, 'lte': s_p}) |
                             QES('range', positions={'gte': s_p}))
                    search_gene = search_gene.query(q)
            else:
                # other type of request
                is_gene_suggest = True
                search_gene = search_gene.suggest(
                    'gene_suggest', query_term,
                    completion={'field': 'suggest', 'size': 200})

        # custom ordering
        ordering = request.query_params.get('ordering', None)
        ordering_fields = {
            'studies': ['name', 'genotype', 'phenotype', 'method', 'transformation'],
            'phenotypes': ['name', 'description'],
            'genes': ['name', 'chr', 'start', 'end', 'SNPs_count',
                      'association_count', 'description']
        }
        if ordering is not None:
            from django.db.models.functions import Lower
            inverted = False
            if ordering.startswith('-'):
                inverted = True
                ordering = ordering[1:]
            if ordering in ordering_fields['studies'] and studies:
                if ordering == 'phenotype' or ordering == 'genotype':
                    # Need to reference the names and not the internal IDs
                    # for ordering
                    ordering += '__name'
                studies = studies.order_by(Lower(ordering)).reverse()
                if inverted:
                    studies = studies.reverse()
            if ordering in ordering_fields['phenotypes'] and phenotypes:
                phenotypes = phenotypes.order_by(Lower(ordering))
                if inverted:
                    phenotypes = phenotypes.reverse()
            if ordering in ordering_fields['genes']:
                if ordering == 'start' or ordering == 'end':
                    ordering += '_position'
                if inverted:
                    ordering = "-" + ordering
                # ``sort`` returns a clone, so the result must be reassigned
                search_gene = search_gene.sort(ordering)

        n_genes = search_gene.count()
        if studies:
            pagest = self.paginate_queryset(studies)
            study_serializer = StudySerializer(pagest, many=True)
        else:
            study_serializer = StudySerializer(studies, many=True)

        if n_genes:
            size = min(200, search_gene.count())
            if is_gene_suggest:
                size = 0
            results = search_gene[0:size].execute()
            if is_gene_suggest:
                genes = results.to_dict()['suggest']['gene_suggest'][0]['options']
            else:
                genes = results.to_dict()['hits']['hits']
            genes_out = []
            for gene in genes:
                genes_out.append(gene["_source"])
            pagege = self.paginate_queryset(genes_out)
        else:
            genes = []
            pagege = []

        if phenotypes:
            pagephe = self.paginate_queryset(phenotypes)
            phenotype_serializer = PhenotypeListSerializer(pagephe, many=True)
        else:
            phenotype_serializer = PhenotypeListSerializer(phenotypes, many=True)

        counts = [len(genes), len(phenotypes), len(studies)]
        PAGE_SIZE = 25.
        import math
        page_counts = [
            int(math.ceil(float(len(genes)) / PAGE_SIZE)),
            int(math.ceil(float(len(phenotypes)) / PAGE_SIZE)),
            int(math.ceil(float(len(studies)) / PAGE_SIZE))
        ]
        data = {
            'study_search_results': study_serializer.data,
            'phenotype_search_results': phenotype_serializer.data,
            'gene_search_results': pagege,
            'counts': counts,
            'page_counts': page_counts
        }

        if any([studies, genes, phenotypes]):
            return self.get_paginated_response(data)
        else:
            return Response({
                'results': {i: data[i] for i in data if i != 'counts'},
                'count': counts,
                'page_count': [0, 0, 0]
            })
def _Search(self, indexname):
    """
    Return a Search object which can be used for retrieving certain values
    from the DB.
    """
    return Search(using=self.__client, index=indexname)
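# Hedged usage sketch (``obj`` stands in for an instance of the class that
# defines ``_Search`` above; the index and field names are assumptions):
#
#   s = obj._Search('my-index').query('match', title='python')
#   for hit in s.execute():
#       print(hit.meta.id, hit.title)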
def analyze_git(es_read, es_write, es_read_index, es_write_index, git_enrich,
                size, incremental):
    query = {"match_all": {}}
    sort = [{"metadata__timestamp": {"order": "asc"}}]

    if incremental.lower() == 'true':
        search = Search(using=es_write, index=es_write_index)
        # from:to parameters (=> from: 0, size: 0)
        search = search[0:0]
        # aggregations are attached in place, so no reassignment is needed
        search.aggs.metric('max_date', 'max', field='metadata__timestamp')

        try:
            response = search.execute()

            if response.to_dict()['aggregations']['max_date']['value'] is None:
                msg = "No data for 'metadata__timestamp' field found in "
                msg += es_write_index + " index"
                logging.warning(msg)
                init_write_index(es_write, es_write_index)
            else:
                # Incremental case: retrieve items from the last item in the
                # ES write index
                max_date = response.to_dict()['aggregations']['max_date']['value_as_string']
                max_date = date_parser.parse(max_date).isoformat()

                logging.info("Starting retrieval from: " + max_date)
                query = {"range": {"metadata__timestamp": {"gte": max_date}}}
        except NotFoundError:
            logging.warning("Index not found: " + es_write_index)
            init_write_index(es_write, es_write_index)
    else:
        init_write_index(es_write, es_write_index)

    search_query = {"query": query, "sort": sort}
    logging.info(search_query)
    logging.info("Start reading items...")

    commits = []
    cont = 0

    for hit in helpers.scan(es_read, search_query, scroll='300m',
                            index=es_read_index, preserve_order=True):
        cont = cont + 1
        item = hit["_source"]
        commits.append(item)
        logging.debug("[Hit] metadata__timestamp: " + item['metadata__timestamp'])

        if cont % size == 0:
            logging.info("Total Items read: " + str(cont))
            events_df = eventize_and_enrich(commits, git_enrich)
            upload_data(events_df, es_write_index, es_write)
            commits = []
            events_df = None

    # In case we have some commits pending, process them
    if len(commits) > 0:
        logging.info("Total Items read: " + str(cont))
        events_df = eventize_and_enrich(commits, git_enrich)
        upload_data(events_df, es_write_index, es_write)
if __name__ == '__main__':
    # Selection of the arguments.
    index, nrounds, R, k, alpha, beta, query = parse_and_select()

    # If the user introduced the argument (--query) but not the terms to search.
    if not query:
        print('No query parameters passed!')
        raise SystemExit
    else:
        query_dict = ini_query_dic(query)
        try:
            # Start the Elasticsearch client
            client = Elasticsearch()
            s = Search(using=client, index=index)

            # The query is solved and, using the k most relevant documents
            # retrieved, the query is updated with Rocchio's rule for
            # ``nrounds`` iterations.
            for _ in range(nrounds):
                # Creation of the query. Note that ``query`` returns a clone,
                # so each round stacks its query on top of ``s`` from the
                # previous rounds.
                q = Q('query_string', query=query[0])
                for i in range(1, len(query)):
                    q &= Q('query_string', query=query[i])
                s = s.query(q)

                # We select the k most relevant documents.
                response = s[0:k].execute()

                # We compute the TFIDF representation for each document and
                # store them in a list
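# Hedged sketch of the truncated TFIDF step: one way to obtain per-document
# term statistics is the termvectors API; the 'text' field name is an
# assumption, and the actual weighting scheme is not shown in the snippet:
#
#   tfidf_docs = []
#   for hit in response:
#       tv = client.termvectors(index=index, id=hit.meta.id,
#                               fields=['text'], term_statistics=True)
#       tfidf_docs.append(tv['term_vectors']['text']['terms'])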
def _find(params, total_only=False, make_suggestions=False, min_suggestion_score=0.8):
    search_query = Search(index=settings.SEARCH_INDEX_NAME)
    if make_suggestions:
        # XXX research if it's better to use phrase suggesters and if
        # that works
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#phrase-suggester
        search_query = search_query.suggest("title_suggestions",
                                            params["query"],
                                            term={"field": "title"})
        search_query = search_query.suggest("body_suggestions",
                                            params["query"],
                                            term={"field": "body"})

    sub_queries = []
    sub_queries.append(Q("match", title={"query": params["query"], "boost": 2.0}))
    sub_queries.append(Q("match", body={"query": params["query"], "boost": 1.0}))
    if " " in params["query"]:
        sub_queries.append(Q("match_phrase", title={"query": params["query"], "boost": 10.0}))
        sub_queries.append(Q("match_phrase", body={"query": params["query"], "boost": 5.0}))

    sub_query = query.Bool(should=sub_queries)

    if params["locales"]:
        search_query = search_query.filter("terms", locale=params["locales"])
    if params["archive"] == "exclude":
        search_query = search_query.filter("term", archived=False)
    elif params["archive"] == "only":
        search_query = search_query.filter("term", archived=True)

    search_query = search_query.highlight_options(
        pre_tags=["<mark>"],
        post_tags=["</mark>"],
        number_of_fragments=3,
        fragment_size=120,
        encoder="html",
    )
    search_query = search_query.highlight("title", "body")

    if params["sort"] == "relevance":
        search_query = search_query.sort("_score", "-popularity")
        search_query = search_query.query(sub_query)
    elif params["sort"] == "popularity":
        search_query = search_query.sort("-popularity", "_score")
        search_query = search_query.query(sub_query)
    else:
        popularity_factor = 10.0
        boost_mode = "sum"
        score_mode = "max"
        search_query = search_query.query(
            "function_score",
            query=sub_query,
            functions=[
                query.SF(
                    "field_value_factor",
                    field="popularity",
                    factor=popularity_factor,
                    missing=0.0,
                )
            ],
            boost_mode=boost_mode,
            score_mode=score_mode,
        )

    search_query = search_query.source(excludes=["body"])
    search_query = search_query[params["size"] * (params["page"] - 1):
                                params["size"] * params["page"]]

    retry_options = {
        "retry_exceptions": (
            # This is the standard operational exception.
            exceptions.ConnectionError,
            # This can happen if the search happened right as the index had
            # just been deleted due to a fresh re-indexing happening in Yari.
            exceptions.NotFoundError,
            # This can happen when the index simply isn't ready yet.
            exceptions.TransportError,
        ),
        # The default in redo is 60 seconds. Let's tone that down.
        "sleeptime": settings.ES_RETRY_SLEEPTIME,
        "attempts": settings.ES_RETRY_ATTEMPTS,
        "jitter": settings.ES_RETRY_JITTER,
    }
    with retrying(search_query.execute, **retry_options) as retrying_function:
        response = retrying_function()

    if total_only:
        return response.hits.total

    metadata = {
        "took_ms": response.took,
        "total": {
            # The `response.hits.total` is a `elasticsearch_dsl.utils.AttrDict`
            # instance. Pluck only the exact data needed.
            "value": response.hits.total.value,
            "relation": response.hits.total.relation,
        },
        "size": params["size"],
        "page": params["page"],
    }
    documents = []
    for hit in response:
        try:
            body_highlight = list(hit.meta.highlight.body)
        except AttributeError:
            body_highlight = []
        try:
            title_highlight = list(hit.meta.highlight.title)
        except AttributeError:
            title_highlight = []

        d = {
            "mdn_url": hit.meta.id,
            "score": hit.meta.score,
            "title": hit.title,
            "locale": hit.locale,
            "slug": hit.slug,
            "popularity": hit.popularity,
            "archived": hit.archived,
            "summary": hit.summary,
            "highlight": {
                "body": body_highlight,
                "title": title_highlight,
            },
        }
        documents.append(d)

    try:
        suggest = getattr(response, "suggest")
    except AttributeError:
        suggest = None

    suggestions = []
    if suggest:
        suggestion_strings = _unpack_suggestions(
            params["query"],
            response.suggest,
            ("body_suggestions", "title_suggestions"),
        )

        for score, string in suggestion_strings:
            # note: the ``or 1`` makes this condition always true, so the
            # score threshold is effectively disabled
            if score > min_suggestion_score or 1:
                # Sure, this is a different way to spell, but what will it
                # yield if you actually search for it?
                total = _find(params, total_only=True)
                if total["value"] > 0:
                    suggestions.append({
                        "text": string,
                        "total": {
                            # This 'total' is an `AttrDict` instance.
                            "value": total.value,
                            "relation": total.relation,
                        },
                    })
                    # Since they're sorted by score, it's usually never
                    # useful to suggest more than exactly one good
                    # suggestion.
                    break

    return {
        "documents": documents,
        "metadata": metadata,
        "suggestions": suggestions,
    }