def clusters():
    """Render the AJAX version of the clustering page.

    Intended to replace display_clusters() once tested. Shows either the
    clusters whose person name starts with the ``filter`` query parameter,
    or a randomized sample of still-unnamed clusters.
    """
    person = request.values.get('filter')
    Face._index.refresh()
    total = Face.search().count()
    named = Face.search().filter("exists", field="person").count()
    # Guard against an empty index: avoid ZeroDivisionError in the ratio.
    ratio = named / total if total else 0
    status = "{:.1%} ({} out of {}) faces are named. Clusters count: {}".format(
        ratio, named, total, Cluster.search().count())
    # Collect all known person names (for the filter UI).
    a = A("terms", field="person.raw", size=10000)
    ps = Search()
    ps.aggs.bucket("persons", a)
    psr = ps.execute()
    persons = [b.key for b in psr.aggs['persons']]
    if person:
        # Named clusters matching the requested prefix, biggest first.
        s = Cluster.search().filter("prefix", person=person).sort("-face_count")
        results = s[0:10000].execute()
    else:
        # Unnamed clusters: average a random score with face_count so large
        # clusters tend to surface while the ordering still varies per load.
        s = Cluster.search().exclude("exists", field="person")
        s.query = FunctionScore(
            query=s.query,
            functions=[SF('random_score', weight=100),
                       SF('field_value_factor', field="face_count", weight=1)],
            score_mode="avg",
            boost_mode="replace")
        results = s[0:50].execute()
    return render_template('clusters.html', clusters=results,
                           persons=persons, status=status)
def get_queryset_posts(self):
    """Build the scored search query for forum posts.

    Hits are restricted to visible posts in forums the user may read.
    The score is boosted when the post opens its topic, when it is
    flagged as useful, and according to whether its like/dislike ratio
    is above or below 1.0.
    """
    boosts = settings.ZDS_APP['search']['boosts']['post']
    base_query = (
        Match(_type='post')
        & Terms(forum_pk=self.authorized_forums)
        & Term(is_visible=True)
        & MultiMatch(query=self.search_query, fields=['text_html'])
    )
    scoring_functions = [
        {'filter': Match(position=1), 'weight': boosts['if_first']},
        {'filter': Match(is_useful=True), 'weight': boosts['if_useful']},
        {'filter': Range(like_dislike_ratio={'gt': 1}), 'weight': boosts['ld_ratio_above_1']},
        {'filter': Range(like_dislike_ratio={'lt': 1}), 'weight': boosts['ld_ratio_below_1']},
    ]
    return FunctionScore(query=base_query, boost_mode='multiply', functions=scoring_functions)
def get_queryset_posts(self):
    """Build the scored query over forum posts.

    Limits hits to visible messages in forums the user is allowed to
    read; weights apply for topic-opening posts, useful posts, and a
    like/dislike ratio above or below 1.0.
    """
    post_boosts = settings.ZDS_APP["search"]["boosts"]["post"]
    text_query = (
        Match(_type="post")
        & Terms(forum_pk=self.authorized_forums)
        & Term(is_visible=True)
        & MultiMatch(query=self.search_query, fields=["text_html"])
    )
    weights = [
        {"filter": Match(position=1), "weight": post_boosts["if_first"]},
        {"filter": Match(is_useful=True), "weight": post_boosts["if_useful"]},
        {"filter": Range(like_dislike_ratio={"gt": 1}), "weight": post_boosts["ld_ratio_above_1"]},
        {"filter": Range(like_dislike_ratio={"lt": 1}), "weight": post_boosts["ld_ratio_below_1"]},
    ]
    return FunctionScore(query=text_query, boost_mode="multiply", functions=weights)
def search_releases(term: str, user_id: str):
    """Search music releases, ranking previously listened releases higher.

    Matches *term* as a bool_prefix query against the title (and its
    search-as-you-type sub-fields) and multiplies in the per-user listen
    count so familiar releases surface first.
    """
    s = Release.search()
    title_match = MultiMatch(
        query=term,
        type="bool_prefix",
        fields=["title", "title._2gram", "title._3gram"],
    )
    listen_boost = FieldValueFactor(field=f'user_listens.{user_id}', missing=0)
    s.query = FunctionScore(query=title_match, functions=[listen_boost])
    hits = s.execute()
    results = []
    for hit in hits:
        results.append({
            "id": hit.id,
            "title": hit.title,
            "listens": hit.user_listens.to_dict().get(user_id, 0),
        })
    return results
def get_queryset_topics(self):
    """Build the scored search query for forum topics.

    Hits are limited to forums the user is allowed to read; solved,
    sticky and locked topics receive the configured score boosts.
    """
    topic_boosts = settings.ZDS_APP["search"]["boosts"]["topic"]
    base = (
        Match(_type="topic")
        & Terms(forum_pk=self.authorized_forums)
        & MultiMatch(query=self.search_query, fields=["title", "subtitle", "tags"])
    )
    weights = [
        {"filter": Match(is_solved=True), "weight": topic_boosts["if_solved"]},
        {"filter": Match(is_sticky=True), "weight": topic_boosts["if_sticky"]},
        {"filter": Match(is_locked=True), "weight": topic_boosts["if_locked"]},
    ]
    return FunctionScore(query=base, boost_mode="multiply", functions=weights)
def query(self, search, query):
    """Attach highlighting, nested section/domain queries and the page-rank script."""
    search = search.highlight_options(**self._highlight_options)
    should_clauses = self._get_queries(
        query=query,
        fields=self.fields,
    )
    # One nested query per nested document type.
    for path, nested_fields in (('sections', self._section_fields),
                                ('domains', self._domain_fields)):
        should_clauses.append(
            self._get_nested_query(query=query, path=path, fields=nested_fields)
        )
    ranked = FunctionScore(
        query=Bool(should=should_clauses),
        script_score=self._get_script_score(),
    )
    return search.query(ranked)
def get_queryset_publishedcontents(self):
    """Build the scored query over published contents.

    Matches the search terms against title, description, categories,
    tags and full text, then weights hits by content type (tutorials,
    articles, opinions) using the configured boosts.
    """
    boosts = settings.ZDS_APP['search']['boosts']['publishedcontent']
    base = (
        Match(_type='publishedcontent')
        & MultiMatch(query=self.search_query,
                     fields=['title', 'description', 'categories', 'tags', 'text'])
    )
    weight_specs = [
        (Match(content_type='TUTORIAL'), 'if_tutorial'),
        (Match(content_type='TUTORIAL') & Match(has_chapters=True), 'if_medium_or_big_tutorial'),
        (Match(content_type='ARTICLE'), 'if_article'),
        (Match(content_type='OPINION'), 'if_opinion'),
        (Match(content_type='OPINION') & Match(picked=False), 'if_opinion_not_picked'),
    ]
    functions = [{'filter': flt, 'weight': boosts[key]} for flt, key in weight_specs]
    return FunctionScore(query=base, boost_mode='multiply', functions=functions)
def get(self, request, *args, **kwargs):
    """Return up to 10 matching topics as a JSON payload.

    Reads the search terms from the ``q`` GET parameter. An empty result
    list is returned when Elasticsearch is unreachable or no query was
    given.
    """
    if 'q' in request.GET:
        self.search_query = ''.join(request.GET['q'])
    results = []
    if self.index_manager.connected_to_es and self.search_query:
        # Restrict to forums the current user is allowed to read.
        self.authorized_forums = get_authorized_forums(self.request.user)
        search_queryset = Search()
        query = Match(_type='topic') \
            & Terms(forum_pk=self.authorized_forums) \
            & MultiMatch(query=self.search_query, fields=['title', 'subtitle', 'tags'])
        # Boost solved, sticky and locked topics by the configured weights.
        functions_score = [
            {'filter': Match(is_solved=True), 'weight': settings.ZDS_APP['search']['boosts']['topic']['if_solved']},
            {'filter': Match(is_sticky=True), 'weight': settings.ZDS_APP['search']['boosts']['topic']['if_sticky']},
            {'filter': Match(is_locked=True), 'weight': settings.ZDS_APP['search']['boosts']['topic']['if_locked']}
        ]
        scored_query = FunctionScore(query=query, boost_mode='multiply', functions=functions_score)
        search_queryset = search_queryset.query(scored_query)[:10]
        # Build the result
        for hit in search_queryset.execute():
            result = {'id': hit.pk, 'url': str(hit.get_absolute_url), 'title': str(hit.title)}
            results.append(result)
    data = {'results': results}
    return HttpResponse(json.dumps(data), content_type='application/json')
def get(self, request, *args, **kwargs):
    """Return up to 10 matching published contents as a JSON payload.

    Reads the search terms from the ``q`` GET parameter; content ids
    listed in the comma-separated ``excluded`` GET parameter are
    filtered out of the results.
    """
    if "q" in request.GET:
        self.search_query = "".join(request.GET["q"])
    excluded_content_ids = request.GET.get("excluded", "").split(",")
    results = []
    if self.index_manager.connected_to_es and self.search_query:
        self.authorized_forums = get_authorized_forums(self.request.user)
        search_queryset = Search()
        # Skip contents the caller explicitly excluded (e.g. already shown).
        if len(excluded_content_ids) > 0 and excluded_content_ids != [""]:
            search_queryset = search_queryset.exclude(
                "terms", content_pk=excluded_content_ids)
        query = Match(_type="publishedcontent") & MultiMatch(
            query=self.search_query, fields=["title", "description"])
        # Weight hits by content type using the configured boosts.
        functions_score = [
            {
                "filter": Match(content_type="TUTORIAL"),
                "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
                ["if_tutorial"],
            },
            {
                "filter": Match(content_type="ARTICLE"),
                "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
                ["if_article"],
            },
            {
                "filter": Match(content_type="OPINION"),
                "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]
                ["if_opinion"],
            },
        ]
        scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
        search_queryset = search_queryset.query(scored_query)[:10]
        # Build the result
        for hit in search_queryset.execute():
            result = {
                "id": hit.content_pk,
                "pubdate": hit.publication_date,
                "title": str(hit.title),
                "description": str(hit.description),
            }
            results.append(result)
    data = {"results": results}
    return HttpResponse(json_handler.dumps(data), content_type="application/json")
def get(self, request, *args, **kwargs):
    """Return up to 10 matching topics as a JSON payload.

    Reads the search terms from the ``q`` GET parameter; an empty list
    is returned when Elasticsearch is unreachable or no query is given.
    """
    if "q" in request.GET:
        self.search_query = "".join(request.GET["q"])
    results = []
    if self.index_manager.connected_to_es and self.search_query:
        # Restrict to forums the current user is allowed to read.
        self.authorized_forums = get_authorized_forums(self.request.user)
        search_queryset = Search()
        query = (Match(_type="topic")
                 & Terms(forum_pk=self.authorized_forums)
                 & MultiMatch(query=self.search_query, fields=["title", "subtitle", "tags"]))
        # Boost solved, sticky and locked topics by the configured weights.
        functions_score = [
            {
                "filter": Match(is_solved=True),
                "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_solved"]
            },
            {
                "filter": Match(is_sticky=True),
                "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_sticky"]
            },
            {
                "filter": Match(is_locked=True),
                "weight": settings.ZDS_APP["search"]["boosts"]["topic"]["if_locked"]
            },
        ]
        scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
        search_queryset = search_queryset.query(scored_query)[:10]
        # Build the result
        for hit in search_queryset.execute():
            result = {
                "id": hit.pk,
                "url": str(hit.get_absolute_url),
                "title": str(hit.title),
                "subtitle": str(hit.subtitle),
                "forumTitle": str(hit.forum_title),
                "forumUrl": str(hit.forum_get_absolute_url),
                "pubdate": str(hit.pubdate),
            }
            results.append(result)
    data = {"results": results}
    return HttpResponse(json_handler.dumps(data), content_type="application/json")
def TagBoost(slugs, boost_mode="multiply", weight=5):
    """Build a FunctionScore boosting documents tagged with any included slug.

    NOTE(review): _parse_slugs also returns excluded slugs, but they are
    ignored here — only the included slugs contribute to the boost.
    Confirm whether exclusion was meant to lower the score.
    """
    included, excluded = _parse_slugs(slugs)
    return FunctionScore(boost_mode=boost_mode, functions=[{
        # Weight only applies to documents whose nested tags match an
        # included slug; everything else keeps its base score.
        "filter": Nested(path="tags", filter=Terms(**{"tags.slug": included})),
        "weight": weight
    }])
def build_search(self):
    """Extend the parent search with the configured booster scoring functions."""
    search = super(SearchQuery, self).build_search()
    if self.boosters:
        booster_queries = [booster.to_query() for booster in self.boosters]
        search.query = FunctionScore(query=search.query, functions=booster_queries)
    # Workaround until https://github.com/elastic/elasticsearch-dsl-py/pull/474
    # is merged and released
    search = search.fields([])
    return search
def query(self, search, query):
    """Manipulates the query to support nested queries and a custom rank for pages."""
    search = search.highlight_options(**self._highlight_options)
    all_queries = []
    # match query for the title (of the page) field.
    for operator in self.operators:
        query_string = self._get_text_query(
            query=query,
            fields=self.fields,
            operator=operator,
        )
        all_queries.append(query_string)
    # nested query for search in sections
    sections_nested_query = self.generate_nested_query(
        query=query,
        path='sections',
        fields=self._section_fields,
        inner_hits={
            'highlight': dict(
                self._highlight_options,
                fields={
                    'sections.title': {},
                    'sections.content': {},
                }
            )
        }
    )
    # nested query for search in domains
    domains_nested_query = self.generate_nested_query(
        query=query,
        path='domains',
        fields=self._domain_fields,
        inner_hits={
            'highlight': dict(
                self._highlight_options,
                fields={
                    'domains.name': {},
                    'domains.docstrings': {},
                }
            )
        }
    )
    all_queries.extend([sections_nested_query, domains_nested_query])
    # Any single clause may match; the script score then re-ranks pages.
    final_query = FunctionScore(
        query=Bool(should=all_queries),
        script_score=self._get_script_score(),
    )
    search = search.query(final_query)
    return search
def get(self, request, *args, **kwargs):
    """Return up to 10 matching published contents as a JSON payload.

    Reads the search terms from the ``q`` GET parameter; content ids in
    the comma-separated ``excluded`` GET parameter are filtered out.
    """
    if 'q' in request.GET:
        self.search_query = ''.join(request.GET['q'])
    excluded_content_ids = request.GET.get('excluded', '').split(',')
    results = []
    if self.index_manager.connected_to_es and self.search_query:
        self.authorized_forums = get_authorized_forums(self.request.user)
        search_queryset = Search()
        # Skip contents the caller explicitly excluded (e.g. already shown).
        if len(excluded_content_ids) > 0 and excluded_content_ids != ['']:
            search_queryset = search_queryset.exclude(
                'terms', content_pk=excluded_content_ids)
        query = Match(_type='publishedcontent') & MultiMatch(
            query=self.search_query, fields=['title', 'description'])
        # Weight hits by content type using the configured boosts.
        functions_score = [{
            'filter': Match(content_type='TUTORIAL'),
            'weight': settings.ZDS_APP['search']['boosts']['publishedcontent']
            ['if_tutorial']
        }, {
            'filter': Match(content_type='ARTICLE'),
            'weight': settings.ZDS_APP['search']['boosts']['publishedcontent']
            ['if_article']
        }, {
            'filter': Match(content_type='OPINION'),
            'weight': settings.ZDS_APP['search']['boosts']['publishedcontent']
            ['if_opinion']
        }]
        scored_query = FunctionScore(query=query, boost_mode='multiply', functions=functions_score)
        search_queryset = search_queryset.query(scored_query)[:10]
        # Build the result
        for hit in search_queryset.execute():
            result = {
                'id': hit.content_pk,
                'pubdate': hit.publication_date,
                'title': str(hit.title),
                'description': str(hit.description)
            }
            results.append(result)
    data = {'results': results}
    return HttpResponse(json_handler.dumps(data), content_type='application/json')
def query(self, search, query):
    """Apply a simple-query-string search, or randomize the order when no query is given."""
    if not query:
        # No search terms: shuffle results with a time-seeded random score.
        search.query = FunctionScore(
            query=Q(),
            functions=[SF('random_score', seed=int(time.time()))],
        )
        return search
    return search.query(
        "simple_query_string",
        fields=self.fields,
        query=query,
        default_operator='and',
    )
def get_queryset(self):
    """Build and run the global search across the selected models.

    Returns [] when Elasticsearch is unreachable or no query was given.
    """
    if not self.index_manager.connected_to_es:
        messages.warning(self.request, _(u'Impossible de se connecter à Elasticsearch'))
        return []
    if self.search_query:
        # find forums the user is allowed to visit
        self.authorized_forums = get_authorized_forums(self.request.user)
        search_queryset = Search()
        # setting the different querysets (according to the selected models, if any)
        part_querysets = []
        chosen_groups = self.search_form.cleaned_data['models']
        if chosen_groups:
            models = []
            for group in chosen_groups:
                if group in settings.ZDS_APP['search']['search_groups']:
                    models.append(settings.ZDS_APP['search']['search_groups'][group][1])
        else:
            # .items() instead of the Python-2-only .iteritems() so the code
            # also runs on Python 3; iteration behaviour is unchanged.
            models = [v[1] for k, v in settings.ZDS_APP['search']['search_groups'].items()]
        models = reduce(operator.concat, models)
        for model in models:
            part_querysets.append(getattr(self, 'get_queryset_{}s'.format(model))())
        # OR together the per-model queries.
        queryset = part_querysets[0]
        for query in part_querysets[1:]:
            queryset |= query
        # weighting: apply each model's global boost.
        weight_functions = []
        for _type, weights in settings.ZDS_APP['search']['boosts'].items():
            if _type in models:
                weight_functions.append({'filter': Match(_type=_type), 'weight': weights['global']})
        scored_queryset = FunctionScore(query=queryset, boost_mode='multiply', functions=weight_functions)
        search_queryset = search_queryset.query(scored_queryset)
        # highlighting:
        search_queryset = search_queryset.highlight_options(
            fragment_size=150, number_of_fragments=5, pre_tags=['[hl]'], post_tags=['[/hl]'])
        search_queryset = search_queryset.highlight('text').highlight('text_html')
        # executing:
        return self.index_manager.setup_search(search_queryset)
    return []
def random(request):
    """Redirect to a random case whose opinion text exceeds 1,000 words."""
    search = (
        CaseDocument.search()
        .source(['frontend_url'])
        .filter('range', analysis__word_count={'gte': 1000})
    )
    # Replace the score entirely with a random one so each qualifying case
    # is equally likely; wrapping the existing query keeps the filter above.
    search.query = FunctionScore(
        query=search.query,
        functions=[SF('random_score')],
        boost_mode='replace',
    )
    case = search[0].execute()[0]
    return HttpResponseRedirect(case.frontend_url)
def query(self, search, query):
    """
    Manipulates the query to support nested queries and a custom rank for pages.

    If `self.projects` was given, we use it to filter the documents that
    match the same project and version.
    """
    search = search.highlight_options(**self._highlight_options)
    search = search.source(excludes=self.excludes)
    queries = self._get_queries(
        query=query,
        fields=self.fields,
    )
    # Nested queries search inside the page's sections and domains.
    sections_nested_query = self._get_nested_query(
        query=query,
        path='sections',
        fields=self._section_fields,
    )
    domains_nested_query = self._get_nested_query(
        query=query,
        path='domains',
        fields=self._domain_fields,
    )
    queries.extend([sections_nested_query, domains_nested_query])
    bool_query = Bool(should=queries)
    if self.projects:
        # Limit hits to the requested (project, version) pairs.
        versions_query = [
            Bool(must=[
                Term(project={'value': project}),
                Term(version={'value': version}),
            ])
            for project, version in self.projects.items()
        ]
        bool_query = Bool(must=[bool_query, Bool(should=versions_query)])
    final_query = FunctionScore(
        query=bool_query,
        script_score=self._get_script_score(),
    )
    search = search.query(final_query)
    return search
def search_keywords(self, words):
    """Search specs matching any of *words*, re-ranked by opinion statistics.

    Each word becomes a MultiMatch over the spec fields and the per-word
    queries are OR-ed together; the script score then folds in a sigmoid
    of the positive/negative opinion balance and the share of positive
    opinions.
    """
    s = Search(using=self.client)
    or_queries = [MultiMatch(query=w, fields=self.spec_fields) for w in words]
    q = reduce(operator.or_, or_queries)
    script = {
        'script': {
            'source': "_score * sigmoid(2.74, 1, Math.sqrt(doc['info.opinion_stats.positive'].value) - doc['info.opinion_stats.negative'].value) * (doc['info.opinion_stats.positive'].value + 1) / (doc['info.opinion_stats.total'].value + 1)"
        }
    }
    # The earlier `s.query = q` assignment was dead code — it was
    # immediately overwritten by the FunctionScore below, so it is removed.
    s.query = FunctionScore(query=q, script_score=script)
    return s.execute()
def get_queryset_publishedcontents(self):
    """Search in PublishedContent objects.

    Matches the search terms against title, description, categories,
    subcategories, tags and full text; optionally restricted to library
    content and the selected (sub)category; hits are then boosted by
    content type.
    """
    query = Match(_type="publishedcontent") & MultiMatch(
        query=self.search_query, fields=["title", "description", "categories", "subcategories", "tags", "text"]
    )
    if self.from_library:
        # `&=` binds after `|`: this restricts to tutorials OR articles.
        query &= Match(content_type="TUTORIAL") | Match(content_type="ARTICLE")
    if self.content_category:
        query &= Match(categories=self.content_category)
    if self.content_subcategory:
        query &= Match(subcategories=self.content_subcategory)
    # Content-type boosts come from settings.
    functions_score = [
        {
            "filter": Match(content_type="TUTORIAL"),
            "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_tutorial"],
        },
        {
            "filter": Match(content_type="TUTORIAL") & Match(has_chapters=True),
            "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_medium_or_big_tutorial"],
        },
        {
            "filter": Match(content_type="ARTICLE"),
            "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_article"],
        },
        {
            "filter": Match(content_type="OPINION"),
            "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion"],
        },
        {
            "filter": Match(content_type="OPINION") & Match(picked=False),
            "weight": settings.ZDS_APP["search"]["boosts"]["publishedcontent"]["if_opinion_not_picked"],
        },
    ]
    scored_query = FunctionScore(query=query, boost_mode="multiply", functions=functions_score)
    return scored_query
def run(self): emails = { 'breached': set(), 'unbreached': set(), } # contact_email exists must = [Q('exists', field='contact_email')] # matches source if specified if self.source: must.append(Q({'term': {'analysis.source': self.source}})) # not already tagged with breached s = Search(using=self.es).\ query(FunctionScore( query=Q('bool', must=must, must_not=[Q('exists', field='analysis.breached')]), functions=[SF('random_score', seed=int(time.time()))] )).\ source(['contact_email']) print('%s breached: source=%s limit=%s' % (datetime.now().isoformat(), self.source, self.limit)) print('query=\n%s' % json.dumps(s.to_dict())) for filing in s[:self.limit]: email = filing['contact_email'] if not email or email in emails['breached'] or email in emails[ 'unbreached']: continue breached = self.is_breached(email) emails['breached' if breached else 'unbreached'].add(email) docs = [] print('done source=%s' % self.source) if emails['breached']: docs += self.tag_by_email(list(emails['breached']), True) if emails['unbreached']: docs += self.tag_by_email(list(emails['unbreached']), False) try: lib.bulk_update(self.es, docs) except Exception as e: print('error indexing: %s' % e)
def get_queryset_topics(self):
    """Build the scored query over topics restricted to readable forums.

    Score boosts apply to solved, sticky and locked topics.
    """
    boosts = settings.ZDS_APP['search']['boosts']['topic']
    query = (Match(_type='topic')
             & Terms(forum_pk=self.authorized_forums)
             & MultiMatch(query=self.search_query,
                          fields=['title', 'subtitle', 'tags']))
    scoring = []
    for condition, key in ((Match(is_solved=True), 'if_solved'),
                           (Match(is_sticky=True), 'if_sticky'),
                           (Match(is_locked=True), 'if_locked')):
        scoring.append({'filter': condition, 'weight': boosts[key]})
    return FunctionScore(query=query, boost_mode='multiply', functions=scoring)
def browse(request):
    """Render a random sample of comments with aggregate statistics.

    Optionally filtered by ``source`` or by Title II stance
    (``titleii`` = 'pro'/'anti'/'unknown') from the query string.
    """
    s = Search(using=es)
    description = None
    # Randomize result order so each page load shows a fresh sample.
    s.query = FunctionScore(
        query=s.query,
        functions=[SF('random_score', seed=int(time.time()))])
    if 'source' in request.GET:
        source = request.GET['source']
        s = s.filter('terms', **{'analysis.source': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in request.GET:
        title_ii = request.GET['titleii']
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')
    # Aggregations powering the stats table below.
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket(
        'email_confirmation',
        A('filters', filters={
            'true': {
                'term': {
                    'emailConfirmation': 'true'
                }
            },
            'false': {
                'term': {
                    'emailConfirmation': 'false'
                }
            }
        }))
    s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))
    # s.aggs.bucket('email_confirmation', A('filters', field='analysis.fulladdress'))
    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Emails': {
            'Unique': 0,
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        }
    })
    response = s[:50].execute()
    total = s.count()
    # Fold the aggregation buckets into the human-readable stats table.
    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count
    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count
    stats['Emails']['Unique'] = response.aggregations.unique_emails.value
    for bucket, value in response.aggs.email_confirmation.to_dict(
    )['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    # Anything without an explicit true/false flag counts as missing.
    stats['Email Confirmation']['Missing'] = (
        total - stats['Email Confirmation']['True'] -
        stats['Email Confirmation']['False'])
    context = {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total
    }
    return render(request, 'listing.html', context)
def get_sort_popularity(self, request):
    """Build the composite popularity score used for sorting.

    Combines a status/recency score with a contribution-activity score;
    for authenticated users, additional first-match boosts are OR-ed in
    for matching skills, favourite themes, and proximity to the user's
    location.
    """
    # Base score: status weight decayed by creation date, plus the
    # contribution count decayed by contribution recency.
    score = FunctionScore(
        score_mode='sum',
        functions=[
            SF(
                'field_value_factor',
                field='status_score',
                weight=10,
                factor=10
            ),
            SF(
                'gauss',
                weight=0.1,
                created={
                    'scale': "365d"
                },
            ),
        ]
    ) | FunctionScore(
        score_mode='multiply',
        functions=[
            SF(
                'field_value_factor',
                field='contribution_count',
                missing=0
            ),
            SF(
                'gauss',
                weight=0.1,
                multi_value_mode='avg',
                contributions={
                    'scale': '5d'
                },
            ),
        ]
    )
    if request.user.is_authenticated:
        if request.user.skills:
            # Boost documents whose required expertise matches the user's
            # skills; score_mode='first' keeps only the matching weight,
            # with the trailing weight-0 function as the fallback.
            score = score | FunctionScore(
                score_mode='first',
                functions=[
                    SF({
                        'filter': Nested(
                            path='expertise',
                            query=Q(
                                'terms',
                                expertise__id=[skill.pk for skill in request.user.skills.all()]
                            )
                        ),
                        'weight': 1,
                    }),
                    SF({'weight': 0}),
                ]
            )
        if request.user.favourite_themes:
            # Same first-match pattern for favourite themes.
            score = score | FunctionScore(
                score_mode='first',
                functions=[
                    SF({
                        'filter': Nested(
                            path='theme',
                            query=Q(
                                'terms',
                                theme__id=[theme.pk for theme in request.user.favourite_themes.all()]
                            )
                        ),
                        'weight': 1,
                    }),
                    SF({'weight': 0}),
                ]
            )
        # Prefer the user's location position; fall back to their place.
        position = None
        if request.user.location and request.user.location.position:
            position = {
                'lat': request.user.location.position.latitude,
                'lon': request.user.location.position.longitude
            }
        elif request.user.place and request.user.place.position:
            position = {
                'lat': request.user.place.position.latitude,
                'lon': request.user.place.position.longitude
            }
        if position:
            # Gaussian decay of the score with distance from the user.
            score = score | FunctionScore(
                score_mode='first',
                functions=[
                    SF({
                        'filter': {'exists': {'field': 'position'}},
                        'weight': 1,
                        'gauss': {
                            'position': {
                                'origin': position,
                                'scale': "100km"
                            },
                            'multi_value_mode': 'max',
                        },
                    }),
                    SF({'weight': 0}),
                ]
            )
    return score
def SponsoredBoost(field_name, boost_mode="multiply", weight=5):
    """Return a FunctionScore that boosts documents where *field_name* exists."""
    sponsored_function = {
        "filter": Exists(field=field_name),
        "weight": weight,
    }
    return FunctionScore(boost_mode=boost_mode, functions=[sponsored_function])
def handle(self, *args, **options):
    """Match every corrected NACP declaration against candidate originals.

    For each declaration marked as corrected, builds a scored query over
    the declarant's name, post, office, region and family members, then
    searches same-year, uncorrected declarations (excluding the document
    itself) and records whether a match was found.
    """
    corrected = NACPDeclaration.search().filter("term", intro__corrected=True)
    cntr = 0
    success_rate = 0
    for i, d in enumerate(corrected.scan()):
        # The full name must match entirely; it dominates the score (boost=10).
        must = [
            ConstantScore(query=Q(
                "multi_match",
                query=d.general.full_name,
                operator="and",
                fields=[
                    "general.last_name",
                    "general.name",
                    "general.patronymic",
                    "general.full_name",
                ],
            ), boost=10)
        ]
        # Optional corroborating signals: post, office and region similarity.
        should = [
            ConstantScore(query=Q(
                "match",
                general__post__post={
                    "query": d.general.post.post,
                    "minimum_should_match": "50%"
                },
            ), boost=2),
            ConstantScore(query=Q(
                "match",
                general__post__office={
                    "query": d.general.post.office,
                    "minimum_should_match": "50%"
                },
            ), boost=2),
            ConstantScore(query=Q(
                "match",
                general__post__region={
                    "query": d.general.post.region.replace(" область", ""),
                    "minimum_should_match": "60%"
                },
            ), boost=1)
        ]
        # Family member names add further evidence of a match.
        for fam in getattr(d.general, "family", []):
            should.append(
                ConstantScore(query=Q(
                    "multi_match",
                    query=fam.family_name,
                    operator="and",
                    fields=["general.family.family_name"]),
                    boost=2))
        candidates = NACPDeclaration.search() \
            .query(
                FunctionScore(
                    query=Q("bool", must=must, should=should),
                    score_mode="sum"
                )
            ) \
            .filter("term", intro__declaration_year=d.intro.declaration_year) \
            .query(~Q('term', _id=d.meta.id)) \
            .filter("term", intro__corrected=False) \
            .query(
                ConstantScore(
                    query=Q("term", intro__doc_type=d.intro.doc_type),
                    boost=0
                )
            )
        if options["store_matches"]:
            # Keep highlighted fragments so matches can be reviewed later.
            candidates = candidates \
                .highlight_options(
                    order='score',
                    fragment_size=500,
                    number_of_fragments=100,
                    pre_tags=['||!'],
                    post_tags=["||"]) \
                .highlight(
                    "general.full_name",
                    "general.post.region",
                    "general.post.office",
                    "general.post.post",
                    "general.family.family_name")
        candidates = candidates.execute()
        success = self.store_example(
            d, candidates,
            debug=options["debug"],
            store_matches=options["store_matches"])
        if success:
            success_rate += 1
        cntr += 1
        # Periodic progress report with the running success rate.
        if cntr and cntr % 5000 == 0:
            self.stdout.write("%s declarations processed, SR: %s%%" %
                              (cntr, success_rate / cntr * 100))
    self.stdout.write("%s declarations processed, SR: %s%%" %
                      (cntr, success_rate / cntr * 100))
    if options["store_matches"]:
        self.save_to_excel(options["store_matches"])
def browse(request, sentiment=None, group=None):
    """Render a random sample of FCC comments with aggregate statistics.

    Optionally filtered by comment *group* (source) or by Title II
    *sentiment* ('pro', 'anti' or 'unknown').
    """
    s = Search(using=es, index="fcc-comments")
    description = None
    # Randomize result order so each page load shows a fresh sample.
    s.query = FunctionScore(
        query=s.query,
        functions=[SF('random_score', seed=int(time.time()))]
    )
    if group:
        source = group
        s = s.filter('terms', **{'analysis.source.keyword': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
        details = SOURCE_MAP.get(source, {}).get('details') or ""
        url = SOURCE_MAP.get(source, {}).get('url') or ""
    elif sentiment:
        title_ii = sentiment
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')
        details, url = "", None
    # Aggregations powering the stats table below.
    s.aggs.bucket("date", A('date_histogram', field='date_submission', interval='month'))
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('email_domain', A('terms', field='analysis.throwawayemail'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket('ingestion', A('terms', field='analysis.ingestion_method.keyword'))
    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))
    # s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))
    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Throwaway Email': {
            'True': 0,
            'False': 0
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        },
        'Filing Method': {
            'API': 0,
            'Spreadsheet': 0,
            'Direct': 0
        },
        'Filing Dates': OrderedDict({
        })
    })
    response = s[:50].execute()
    total = s.count()
    # NOTE(review): the +14400 offset looks like a fixed UTC-4 adjustment
    # of the epoch-millisecond bucket key — confirm intent.
    for bucket in response.aggregations.date.buckets:
        d = datetime.fromtimestamp((bucket.key/1000.) + 14400)
        title = "%s/17 - %s" % (d.strftime("%m"), d.strftime("%B"))
        stats['Filing Dates'][title] = bucket.doc_count
    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count
    for bucket in response.aggregations.email_domain.buckets:
        if bucket.key == 1:
            stats['Throwaway Email']['True'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Throwaway Email']['False'] = bucket.doc_count
    for bucket in response.aggregations.ingestion.buckets:
        if bucket.key == "api":
            stats['Filing Method']['API'] = bucket.doc_count
        elif bucket.key == "csv":
            stats['Filing Method']['Spreadsheet'] = bucket.doc_count
        elif bucket.key == "direct":
            stats['Filing Method']['Direct'] = bucket.doc_count
    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count
    # stats['Emails']['Unique'] = response.aggregations.unique_emails.value
    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    # Anything without an explicit true/false flag counts as missing.
    stats['Email Confirmation']['Missing'] = (
        total - stats['Email Confirmation']['True'] - stats['Email Confirmation']['False']
    )
    context = {
        'description': description,
        'details': details,
        'url': url,
        'stats': stats,
        'results': response,
        'comment_count': total
    }
    return render(request, 'listing.html', context)
def get_queryset(self):
    """Build and run the global search across the selected models.

    Returns [] when Elasticsearch is unreachable or no query was given.
    """
    if not self.index_manager.connected_to_es:
        messages.warning(self.request, _("Impossible de se connecter à Elasticsearch"))
        return []
    if self.search_query:
        # Searches forums the user is allowed to visit
        self.authorized_forums = get_authorized_forums(self.request.user)
        search_queryset = Search()
        # Restrict (sub)category if any
        if self.search_form.cleaned_data["category"]:
            self.content_category = self.search_form.cleaned_data[
                "category"]
        if self.search_form.cleaned_data["subcategory"]:
            self.content_subcategory = self.search_form.cleaned_data[
                "subcategory"]
        # Mark that contents must come from library if required
        self.from_library = False
        if self.search_form.cleaned_data["from_library"] == "on":
            self.from_library = True
        # Setting the different querysets (according to the selected models, if any)
        part_querysets = []
        chosen_groups = self.search_form.cleaned_data["models"]
        if chosen_groups:
            models = []
            for group in chosen_groups:
                if group in settings.ZDS_APP["search"]["search_groups"]:
                    models.append(settings.ZDS_APP["search"]
                                  ["search_groups"][group][1])
        else:
            models = [
                v[1] for k, v in settings.ZDS_APP["search"]
                ["search_groups"].items()
            ]
        models = reduce(operator.concat, models)
        for model in models:
            part_querysets.append(
                getattr(self, f"get_queryset_{model}s")())
        # OR together the per-model queries.
        queryset = part_querysets[0]
        for query in part_querysets[1:]:
            queryset |= query
        # Weighting: apply each model's global boost.
        weight_functions = []
        for _type, weights in list(
                settings.ZDS_APP["search"]["boosts"].items()):
            if _type in models:
                weight_functions.append({
                    "filter": Match(_type=_type),
                    "weight": weights["global"]
                })
        scored_queryset = FunctionScore(query=queryset,
                                        boost_mode="multiply",
                                        functions=weight_functions)
        search_queryset = search_queryset.query(scored_queryset)
        # Highlighting:
        search_queryset = search_queryset.highlight_options(
            fragment_size=150,
            number_of_fragments=5,
            pre_tags=["[hl]"],
            post_tags=["[/hl]"])
        search_queryset = search_queryset.highlight("text").highlight(
            "text_html")
        # Executing:
        return self.index_manager.setup_search(search_queryset)
    return []