from elasticsearch_dsl import Q
from elasticsearch_dsl.query import MultiMatch, SF


def get_search_query(phrase):
    # Match the phrase against several talk fields, then scale each hit's
    # relevance by its view count via a field_value_factor function.
    query = Q(
        'function_score',
        query=MultiMatch(
            fields=['name', 'description', 'speaker', 'transcript'],
            query=phrase),
        functions=[SF('field_value_factor', field='number_of_views')])
    # TalkDocument is the Document class for the talks index, defined
    # elsewhere in the project.
    return TalkDocument.search().query(query)
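# A minimal usage sketch: assumes TalkDocument is registered against a talks
# index and a default Elasticsearch connection is configured; the phrase is
# illustrative only.
search = get_search_query('python generators')
for hit in search.execute():
    print(hit.meta.score, hit.name)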
from elasticsearch_dsl import Q, Search as DslSearch  # DslSearch: assumed alias
from elasticsearch_dsl.query import Match, MatchPhrase, SF


def search_close(self, origin_timestamp, channel, qterm, number_results):
    """
    Find log entries close to the origin timestamp, filter them by channel,
    highlight qterm and return them sorted by date.

    :param origin_timestamp: origin timestamp to find logs around
    :param channel: channel to filter on
    :param qterm: term to be highlighted
    :param number_results: how many results to return
    :return: list of sorted log entries (Elasticsearch response)
    :rtype: ``list``
    """
    # Prepare query
    s = DslSearch(using=self._es, index=self._index_prefix.format('*'))

    # Function score: the positive clauses are only used for highlighting,
    # not for scoring, so give them a very low significance.
    main_query_boosting = 1e-15
    pos = MatchPhrase(msg={'query': qterm, 'boost': main_query_boosting}) | \
        Match(**{'username': {'query': qterm, 'boost': main_query_boosting}}) | \
        Match(channel={'query': qterm, 'boost': main_query_boosting}) | \
        Match(msg={'query': qterm, 'boost': main_query_boosting})
    main_query = (pos | Q('match_all'))

    # Rank entries by their distance in time from the origin timestamp.
    function_score_query = Q(
        'function_score',
        query=main_query,
        functions=[
            SF(
                'exp',
                **{
                    '@timestamp': {
                        "origin": origin_timestamp,
                        "scale": "1m",
                        "decay": 0.999
                    }
                })
        ])
    s = s.query(function_score_query)

    # Filter by channel
    s = s.filter('term', **{'channel.keyword': channel})

    # Number of results
    s = s[0:number_results]

    # Highlight
    s = s.highlight_options(order='score')
    s = s.highlight('msg', number_of_fragments=0)
    s = s.highlight('username')
    s = s.highlight('channel')

    # Execute
    response = s.execute()

    # Sort results by date
    response_sorted = sorted(response, key=lambda hit: hit['@timestamp'])

    return response_sorted
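# A self-contained sketch of the decay scoring above: it builds the same
# exp-decay function_score stand-alone and prints the request body that would
# be sent to Elasticsearch (no cluster needed); the timestamp is illustrative.
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import SF

decay = SF('exp', **{'@timestamp': {
    'origin': '2019-03-01T12:00:00', 'scale': '1m', 'decay': 0.999}})
print(Q('function_score', query=Q('match_all'), functions=[decay]).to_dict())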
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import ConstantScore, SF


def build_search_company_query(params):
    term = params.pop('term', None)

    # Perform an OR operation for items in the same group and an AND
    # operation across groups, e.g.
    # (NORTH_EAST OR NORTH_WEST) AND (AEROSPACE OR AIRPORTS).
    # ConstantScore gives each sibling filter an equal score.
    must = []
    for key, values in params.items():
        should = [
            ConstantScore(filter=Q('term', **{key: value}))
            for value in values
        ]
        must.append(Q('bool', should=should, minimum_should_match=1))

    should = []
    if term:
        should.append(
            Q('bool',
              should=[
                  ConstantScore(filter=Q('term', keyword_wildcard=term)),
                  ConstantScore(filter=Q('match_phrase', wildcard=term)),
                  ConstantScore(filter=Q('match', wildcard=term)),
                  ConstantScore(
                      filter=Q('match_phrase', casestudy_wildcard=term)),
                  ConstantScore(filter=Q('match', casestudy_wildcard=term))
              ],
              minimum_should_match=1))
        # Rank companies whose name matches the term directly above the rest.
        return Q('function_score',
                 query=Q('bool',
                         must=must,
                         should=should,
                         minimum_should_match=1 if should else 0),
                 functions=[
                     SF({
                         'weight': 5,
                         'filter': (Q('match_phrase', name=term)
                                    | Q('match', name=term))
                     })
                 ],
                 boost_mode='sum')
    else:
        return Q('bool',
                 must=must,
                 should=should,
                 minimum_should_match=1 if should else 0)
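# Usage sketch: the group keys and wildcard fields mirror the function above;
# the concrete filter values and search term are illustrative only.
query = build_search_company_query({
    'sectors': ['AEROSPACE', 'AIRPORTS'],
    'regions': ['NORTH_EAST', 'NORTH_WEST'],
    'term': 'engines',
})
print(query.to_dict())  # inspect the generated function_score body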
import json
import time
from datetime import datetime

from elasticsearch_dsl import Q, Search
from elasticsearch_dsl.query import FunctionScore, SF


def run(self):
    emails = {
        'breached': set(),
        'unbreached': set(),
    }

    # contact_email must exist
    must = [Q('exists', field='contact_email')]

    # match the source, if one was specified
    if self.source:
        must.append(Q({'term': {'analysis.source': self.source}}))

    # Sample filings not already tagged as breached; random_score with a
    # time-based seed picks a different sample on every run.
    s = Search(using=self.es).\
        query(FunctionScore(
            query=Q('bool',
                    must=must,
                    must_not=[Q('exists', field='analysis.breached')]),
            functions=[SF('random_score', seed=int(time.time()))]
        )).\
        source(['contact_email'])

    print('%s breached: source=%s limit=%s' %
          (datetime.now().isoformat(), self.source, self.limit))
    print('query=\n%s' % json.dumps(s.to_dict()))

    for filing in s[:self.limit]:
        email = filing['contact_email']
        if not email or email in emails['breached'] \
                or email in emails['unbreached']:
            continue
        breached = self.is_breached(email)
        emails['breached' if breached else 'unbreached'].add(email)

    docs = []
    print('done source=%s' % self.source)
    if emails['breached']:
        docs += self.tag_by_email(list(emails['breached']), True)
    if emails['unbreached']:
        docs += self.tag_by_email(list(emails['unbreached']), False)

    try:
        # lib is the project's bulk-indexing helper module.
        lib.bulk_update(self.es, docs)
    except Exception as e:
        print('error indexing: %s' % e)
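# Stand-alone sketch of the sampling trick above: random_score with a
# time-based seed shuffles matching documents so each run draws a different
# subset; prints only the query body, no cluster required.
import time
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import SF

sample = Q('function_score',
           query=Q('exists', field='contact_email'),
           functions=[SF('random_score', seed=int(time.time()))])
print(sample.to_dict())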
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import SF


def get_search_results(term, page, size):
    """Search companies by term.

    Wildcard search of companies by the provided term. Companies that have
    only one sector are ranked higher.

    Arguments:
        term {str} -- Search term to match on
        page {int} -- Page number to query
        size {int} -- Number of results per page

    Returns:
        dict -- Companies that match the term

    """
    start = (page - 1) * size
    end = start + size
    # search is the project module that defines CompanyDocType.
    query = search.CompanyDocType.search().query(
        'function_score',
        query=Q('match', _all=term),
        functions=[SF('field_value_factor', field='has_single_sector')])
    return query[start:end].execute().to_dict()
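# Usage sketch: assumes the project's `search` module and a reachable
# cluster; the term and paging values are illustrative only.
results = get_search_results(term='aerospace', page=1, size=10)
print(results['hits']['total'])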
import time
from collections import OrderedDict

from django.shortcuts import render
from elasticsearch_dsl import A, Search
from elasticsearch_dsl.query import FunctionScore, SF


def browse(request):
    # es (client) and SOURCE_MAP are module-level names defined elsewhere.
    s = Search(using=es)
    description = None
    # Shuffle results with a time-seeded random_score so each page load
    # shows a different sample.
    s.query = FunctionScore(
        query=s.query,
        functions=[SF('random_score', seed=int(time.time()))])
    if 'source' in request.GET:
        source = request.GET['source']
        s = s.filter('terms', **{'analysis.source': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in request.GET:
        title_ii = request.GET['titleii']
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')

    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))
    s.aggs.bucket('unique_emails',
                  A('cardinality', field='contact_email.raw'))
    # s.aggs.bucket('email_confirmation', A('filters', field='analysis.fulladdress'))

    stats = OrderedDict({
        'Comment Form': {'On-site': 0, 'Off-site': 0},
        'Emails': {'Unique': 0},
        'Address': {'Full Address': 0, 'Partial Address': 0},
        'Email Confirmation': {'True': 0, 'False': 0, 'Missing': 0}
    })

    response = s[:50].execute()
    total = s.count()

    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count

    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count

    stats['Emails']['Unique'] = response.aggregations.unique_emails.value

    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']

    stats['Email Confirmation']['Missing'] = (
        total
        - stats['Email Confirmation']['True']
        - stats['Email Confirmation']['False'])

    context = {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total
    }
    return render(request, 'listing.html', context)
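# Stand-alone sketch of the named-filters aggregation used above: one bucket
# per named term filter; prints only the aggregation body, no cluster needed.
from elasticsearch_dsl import A

email_confirmation = A('filters', filters={
    'true': {'term': {'emailConfirmation': 'true'}},
    'false': {'term': {'emailConfirmation': 'false'}}})
print(email_confirmation.to_dict())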
import time
from collections import OrderedDict
from datetime import datetime

from django.shortcuts import render
from elasticsearch_dsl import A, Search
from elasticsearch_dsl.query import FunctionScore, SF


def browse(request, sentiment=None, group=None):
    # es (client) and SOURCE_MAP are module-level names defined elsewhere.
    s = Search(using=es, index="fcc-comments")
    description = None
    details, url = "", None  # defaults so the context below is always defined
    s.query = FunctionScore(
        query=s.query,
        functions=[SF('random_score', seed=int(time.time()))]
    )
    if group:
        source = group
        s = s.filter('terms', **{'analysis.source.keyword': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
        details = SOURCE_MAP.get(source, {}).get('details') or ""
        url = SOURCE_MAP.get(source, {}).get('url') or ""
    elif sentiment:
        title_ii = sentiment
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')
        details, url = "", None

    s.aggs.bucket('date', A('date_histogram', field='date_submission',
                            interval='month'))
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('email_domain', A('terms', field='analysis.throwawayemail'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket('ingestion',
                  A('terms', field='analysis.ingestion_method.keyword'))
    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))
    # s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))

    stats = OrderedDict({
        'Comment Form': {'On-site': 0, 'Off-site': 0},
        'Throwaway Email': {'True': 0, 'False': 0},
        'Address': {'Full Address': 0, 'Partial Address': 0},
        'Email Confirmation': {'True': 0, 'False': 0, 'Missing': 0},
        'Filing Method': {'API': 0, 'Spreadsheet': 0, 'Direct': 0},
        'Filing Dates': OrderedDict({})
    })

    response = s[:50].execute()
    total = s.count()

    for bucket in response.aggregations.date.buckets:
        # date_histogram keys are epoch milliseconds; shift by 14400s (UTC-4)
        # before formatting the month label.
        d = datetime.fromtimestamp((bucket.key / 1000.) + 14400)
        title = "%s/17 - %s" % (d.strftime("%m"), d.strftime("%B"))
        stats['Filing Dates'][title] = bucket.doc_count

    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count

    for bucket in response.aggregations.email_domain.buckets:
        if bucket.key == 1:
            stats['Throwaway Email']['True'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Throwaway Email']['False'] = bucket.doc_count

    for bucket in response.aggregations.ingestion.buckets:
        if bucket.key == "api":
            stats['Filing Method']['API'] = bucket.doc_count
        elif bucket.key == "csv":
            stats['Filing Method']['Spreadsheet'] = bucket.doc_count
        elif bucket.key == "direct":
            stats['Filing Method']['Direct'] = bucket.doc_count

    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count

    # stats['Emails']['Unique'] = response.aggregations.unique_emails.value
    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']

    stats['Email Confirmation']['Missing'] = (
        total
        - stats['Email Confirmation']['True']
        - stats['Email Confirmation']['False']
    )

    context = {
        'description': description,
        'details': details,
        'url': url,
        'stats': stats,
        'results': response,
        'comment_count': total
    }
    return render(request, 'listing.html', context)
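# Minimal sketch of the bucket-key conversion above: date_histogram keys are
# epoch milliseconds, hence the division by 1000 before
# datetime.fromtimestamp (which interprets the result in local time, which is
# why the view adds a fixed offset); the key value is illustrative only.
from datetime import datetime

key = 1493596800000  # 2017-05-01T00:00:00Z in epoch milliseconds
d = datetime.fromtimestamp(key / 1000.)
print(d.strftime("%m - %B"))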
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import FunctionScore, Nested, SF


def get_sort_popularity(self, request):
    # Base popularity: status (boosted, decayed by age) summed with
    # engagement (contribution count, decayed by contribution recency).
    score = FunctionScore(
        score_mode='sum',
        functions=[
            SF(
                'field_value_factor',
                field='status_score',
                weight=10,
                factor=10
            ),
            SF(
                'gauss',
                weight=0.1,
                created={
                    'scale': "365d"
                },
            ),
        ]
    ) | FunctionScore(
        score_mode='multiply',
        functions=[
            SF(
                'field_value_factor',
                field='contribution_count',
                missing=0
            ),
            SF(
                'gauss',
                weight=0.1,
                multi_value_mode='avg',
                contributions={
                    'scale': '5d'
                },
            ),
        ]
    )

    if request.user.is_authenticated:
        if request.user.skills:
            # Boost results matching the user's skills; score_mode='first'
            # with a zero-weight fallback keeps non-matching docs at 0.
            score = score | FunctionScore(
                score_mode='first',
                functions=[
                    SF({
                        'filter': Nested(
                            path='expertise',
                            query=Q(
                                'terms',
                                expertise__id=[
                                    skill.pk
                                    for skill in request.user.skills.all()
                                ]
                            )
                        ),
                        'weight': 1,
                    }),
                    SF({'weight': 0}),
                ]
            )
        if request.user.favourite_themes:
            score = score | FunctionScore(
                score_mode='first',
                functions=[
                    SF({
                        'filter': Nested(
                            path='theme',
                            query=Q(
                                'terms',
                                theme__id=[
                                    theme.pk for theme in
                                    request.user.favourite_themes.all()
                                ]
                            )
                        ),
                        'weight': 1,
                    }),
                    SF({'weight': 0}),
                ]
            )

        position = None
        if request.user.location and request.user.location.position:
            position = {
                'lat': request.user.location.position.latitude,
                'lon': request.user.location.position.longitude
            }
        elif request.user.place and request.user.place.position:
            position = {
                'lat': request.user.place.position.latitude,
                'lon': request.user.place.position.longitude
            }

        if position:
            # Boost results close to the user's position.
            score = score | FunctionScore(
                score_mode='first',
                functions=[
                    SF({
                        'filter': {'exists': {'field': 'position'}},
                        'weight': 1,
                        'gauss': {
                            'position': {
                                'origin': position,
                                'scale': "100km"
                            },
                            'multi_value_mode': 'max',
                        },
                    }),
                    SF({'weight': 0}),
                ]
            )

    return score
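# Stand-alone sketch of the composition used above: OR-ing FunctionScore
# queries with `|` yields a bool query whose score sums both branches; prints
# the request body only, and the field names are illustrative.
from elasticsearch_dsl.query import FunctionScore, SF

base = FunctionScore(
    score_mode='sum',
    functions=[SF('field_value_factor', field='status_score', weight=10)])
recency = FunctionScore(
    score_mode='multiply',
    functions=[SF('gauss', created={'scale': '30d'})])
print((base | recency).to_dict())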
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import MultiMatch, SF


def get_search_query(phrase):
    # Same pattern as the first snippet, against a books index: match the
    # phrase on several fields and boost popular books by view count.
    query = Q(
        'function_score',
        query=MultiMatch(
            fields=['title', 'author', 'publisher'],
            query=phrase),
        functions=[SF('field_value_factor', field='number_of_views')])
    # BooksIndex is the Document/Index class defined elsewhere.
    return BooksIndex.search().query(query)