def tweetQuery(query: str, current_user: User = Depends(get_current_user)):
    a_like_count = A("sum", field="like_count")
    a_retweet_count = A("sum", field="retweet_count")
    a_keywords = A("significant_text", field="text")

    search = Search(using=es, index="tweets-*").query("match", text=query)
    search.aggs.bucket("total_likes", a_like_count)
    search.aggs.bucket("total_retweets", a_retweet_count)
    search.aggs.bucket("keywords", a_keywords)

    res = search.execute().to_dict()
    tweets = {}
    ids = ""
    for tweet in res["hits"]["hits"]:
        tweets[tweet["_id"]] = tweet
        ids += tweet["_id"] + ","
    ids = ids[:-1]

    updated_tweets = tweet_update(tweets, ids)
    return updated_tweets
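# tweetQuery() requests three aggregations but only consumes the hits; if the
# totals and keywords are wanted, they can be read from the same response
# dict. A minimal sketch, reusing the bucket names registered above:
total_likes = res["aggregations"]["total_likes"]["value"]
total_retweets = res["aggregations"]["total_retweets"]["value"]
# significant_text yields a bucket list of statistically notable terms
keywords = [b["key"] for b in res["aggregations"]["keywords"]["buckets"]]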
def _aggregate_duplicates(form_model, search_parameters, search):
    if search_parameters == 'exactmatch':
        search = search.params(search_type="count")
        form_fields_to_be_filtered = []
        aggs = _aggregate_exact_match_duplicates(form_model.form_fields, form_model.id,
                                                 search.aggs, search,
                                                 form_fields_to_be_filtered)
        aggs.bucket('tag', 'terms', field='ds_id_exact', size=0, min_doc_count=2) \
            .bucket('tag', 'top_hits', size=(2 ** 7))
        setattr(form_model, "filter_fields", form_fields_to_be_filtered)
    elif search_parameters == 'datasender':
        search = search.params(search_type="count")
        a = A("terms", field='ds_id_exact', size=0, min_doc_count=2)
        b = A("top_hits", size=(2 ** 10))
        search.aggs.bucket('tag', a).bucket('tag', b)
    else:
        search = search.params(search_type="count")
        a = A("terms", field=form_model.id + '_' + search_parameters + '_unique_code_exact',
              size=0, min_doc_count=2)
        b = A("top_hits", size=(2 ** 10))
        search.aggs.bucket('tag', a).bucket('tag', b)
    return search
def get_genre_agg():
    s = Search(using=es)
    s = s.index('imdb')
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    # logger.debug('genre agg is %s', json.dumps(ret.aggs.to_dict(), indent=2))
    return [x['key'] for x in ret.aggs.to_dict()['genres']['buckets']]
def first_words(index='codetoname', language='python'):
    es = elasticsearch.Elasticsearch()

    # update first name
    s = elasticsearch_dsl.Search(using=es, index=index, doc_type=language) \
        .query('bool', filter=Q('exists', field='feature') & Q('missing', field='first_name'))
    for hit in s.scan():
        data = hit.to_dict()
        feature = json.loads(data['feature'])
        data['first_name'] = firstname(feature['name'], language)
        es.index(index=index, doc_type=language, id=hit.meta.id, body=data)
    es.indices.refresh(index=index)

    # aggregation
    s = elasticsearch_dsl.Search(using=es, index=index, doc_type=language) \
        .query('bool', filter=Q('exists', field='feature'))
    a = A('terms', field='first_name')
    s.aggs.bucket('first_name_terms', a)
    response = s.execute()
    words = []
    for item in response.aggregations.first_name_terms.buckets:
        percentage = item.doc_count / float(response.hits.total) * 100
        words.append({'word': item.key, 'percentage': percentage})
    return words
def etl(index='cf_rfem_hist_price', start_date='2018-12-26',
        end_date='2019-03-25', symbol='rfem'):
    ESLowLevelClientByConnection.get_instance()
    search = Search(index=index, using='high_level_client')[0:100]
    search.query = Q(
        Bool(must=[
            Range(date={'gte': start_date, 'lte': end_date}),
            Term(symbol=symbol)
        ]))
    # built here but never attached to the search, so it has no effect
    aggs = A(
        DateHistogram(field='date', interval='1d', format='yyyy-MM-dd',
                      min_doc_count=1))
    response = search.execute()
    hits = response['hits']['hits']
    # one (changeOverTime, changePercent, volume) row per hit
    XX = []
    for hit in hits:
        X = []
        X.append(hit['_source']['changeOverTime'])
        X.append(hit['_source']['changePercent'])
        X.append(hit['_source']['volume'])
        XX.append(X)
    return XX
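# If the daily histogram built in etl() is actually wanted, it has to be
# registered on the search before executing; a minimal sketch (the bucket
# name "by_day" is an arbitrary choice, not from the original snippet):
search.aggs.bucket('by_day', aggs)
response = search.execute()
for day in response.aggregations.by_day.buckets:
    print(day.key_as_string, day.doc_count)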
def bollinger_band(index='cf_etf_hist_price', start_date='2018-12-26',
                   end_date='2019-03-25', symbol='rfem'):
    ESLowLevelClientByConnection.get_instance()
    search = Search(index=index, using='high_level_client')[0:0]
    search.query = Q(
        Bool(must=[
            Range(date={'gte': start_date, 'lte': end_date}),
            Term(symbol=symbol)
        ]))
    aggs = A(
        DateHistogram(field='date', interval='1d', format='yyyy-MM-dd',
                      min_doc_count=1))
    # typical price = (high + low + close) / 3, computed per daily bucket
    aggs_tp = A(
        ScriptedMetric(
            init_script='state.totals=[]',
            map_script='state.totals.add((doc.high.value+doc.low.value+doc.close.value)/3)',
            combine_script='double total=0; for (t in state.totals) {total += t} return total',
            reduce_script='double total=0; for (t in states) {total += t} return total'))
    aggs_moving_avg = A(
        MovingAvg(model='simple', window=20, buckets_path='tp.value'))
    aggs_bbu = A(
        BucketScript(buckets_path={'SMA': '20_trading_days_moving_avg'},
                     script='params.SMA + 0.5'))
    aggs_bbl = A(
        BucketScript(buckets_path={'SMA': '20_trading_days_moving_avg'},
                     script='params.SMA - 0.5'))
    # tp, the moving average, and both band scripts must be siblings inside
    # each date-histogram bucket so the buckets_path references resolve
    bollinger = search.aggs.bucket('Bollinger_band', aggs)
    bollinger.pipeline('tp', aggs_tp)
    bollinger.pipeline('20_trading_days_moving_avg', aggs_moving_avg)
    bollinger.pipeline('BBU', aggs_bbu)
    bollinger.pipeline('BBL', aggs_bbl)
    response = search.execute()
    print(response.to_dict())
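# A minimal sketch of walking the Bollinger-band buckets computed above;
# names match the aggregations registered in bollinger_band(). The moving
# average (and hence BBU/BBL) may be absent from the earliest buckets, so the
# value is looked up defensively; the agg name starts with a digit, so
# getattr() is required instead of attribute access.
for day in response.aggregations.Bollinger_band.buckets:
    sma = getattr(day, '20_trading_days_moving_avg', None)
    if sma is not None:
        print(day.key_as_string, day.tp.value, sma.value,
              day.BBU.value, day.BBL.value)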
def num_repos(self):
    if self._es.indices.exists(index=self._es_index):
        s = elasticsearch_dsl.Search(using=self._es,
                                     index=self._es_index,
                                     doc_type=self._language)
        s.aggs.bucket('num_repos', A('cardinality', field='repo.github_id'))
        response = s.execute()
        return response.aggregations.num_repos.value
    return 0
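# cardinality is an approximate metric (HyperLogLog++); if near-exact counts
# matter, Elasticsearch's precision_threshold parameter (up to 40000) trades
# memory for accuracy. A hedged variant of the bucket above:
s.aggs.bucket('num_repos',
              A('cardinality', field='repo.github_id',
                precision_threshold=40000))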
def get_aggregations_for_choice_fields(dbm, form_model, local_time_delta,
                                       pagination_params, sort_params,
                                       search_parameters):
    search = _create_search(dbm, form_model, local_time_delta,
                            pagination_params, sort_params, search_parameters)
    search = search.params(search_type="count")
    field_names = []
    for field in form_model.choice_fields:
        field_name = es_questionnaire_field_name(field.code, form_model.id)
        a = A("terms", field=field_name + '_exact', size=0)
        search.aggs.bucket(field_name, a)
        field_names.append(field_name)
    search_results = search.execute()
    aggs_results = [
        _get_aggregation_result(field_name, search_results)
        for field_name in field_names
    ]
    return aggs_results, search_results.hits.total
def sources(request):
    s = Search(using=es, index='fcc-comments')
    a = A('terms', field='analysis.source.keyword', size=50)
    s.aggs.bucket('sources', a)
    response = s.execute()
    context = {'sources': []}
    for source in response.aggregations.sources.buckets:
        context['sources'].append({
            'key': source.key,
            'count': source.doc_count,
            'name': SOURCE_MAP.get(source.key, {}).get('name'),
            'url': SOURCE_MAP.get(source.key, {}).get('url')
        })
    return render(request, 'sources.html', context)
def index(request):
    s = Search(using=es)
    total = s.count()
    pro_titleii = s.query('match', **{'analysis.titleii': True}).count()
    anti_titleii = s.query('match', **{'analysis.titleii': False}).count()
    unknown_titleii = total - pro_titleii - anti_titleii
    context = {
        'total_comments': s.count(),
        'title_ii': {
            'pro': pro_titleii / total * 100,
            'anti': anti_titleii / total * 100,
            'unknown': unknown_titleii / total * 100
        }
    }
    a = A('terms', field='analysis.source')
    s.aggs.bucket('sources', a)
    response = s.execute()
    context['sources'] = []
    for source in response.aggregations.sources.buckets:
        if source.key == 'unknown':
            continue
        context['sources'].append({
            'key': source.key,
            'count': source.doc_count,
            'name': SOURCE_MAP.get(source.key, {}).get('name'),
            'url': SOURCE_MAP.get(source.key, {}).get('url')
        })
        print(source.key, source.doc_count)
    # context['sources'] = s.aggs['sources']
    return render(request, 'index.html', context)
def browse(request):
    s = Search(using=es)
    description = None
    s.query = FunctionScore(
        query=s.query,
        functions=[SF('random_score', seed=int(time.time()))])
    if 'source' in request.GET:
        source = request.GET['source']
        s = s.filter('terms', **{'analysis.source': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in request.GET:
        title_ii = request.GET['titleii']
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket(
        'email_confirmation',
        A('filters', filters={
            'true': {'term': {'emailConfirmation': 'true'}},
            'false': {'term': {'emailConfirmation': 'false'}}
        }))
    s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))
    # s.aggs.bucket('email_confirmation', A('filters', field='analysis.fulladdress'))
    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Emails': {
            'Unique': 0,
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        }
    })
    response = s[:50].execute()
    total = s.count()
    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count
    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count
    stats['Emails']['Unique'] = response.aggregations.unique_emails.value
    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    stats['Email Confirmation']['Missing'] = (
        total
        - stats['Email Confirmation']['True']
        - stats['Email Confirmation']['False'])
    context = {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total
    }
    return render(request, 'listing.html', context)
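# The named filters buckets above can also be read without round-tripping
# through to_dict(); a minimal sketch against the same 'email_confirmation'
# aggregation (bucket names are plain attributes on the response object):
ec = response.aggregations.email_confirmation.buckets
stats['Email Confirmation']['True'] = ec.true.doc_count
stats['Email Confirmation']['False'] = ec.false.doc_count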
def index(self, address=None, *args, **kwargs):
    if not address:
        return dict(results=[])

    search = model.Geocomplete().search()

    address_query = Q()
    postal_code_query = Q()

    (address, postal_code) = self.geocomplete_town_input_parser(address)
    if address:
        address_query = Q('match', name=address)
    if postal_code:
        postal_code_query = Q('match', postal_code=postal_code)

    weight_scoring_function = SF('field_value_factor',
                                 factor=1,
                                 modifier='none',
                                 field='weight')
    search.query = Q('function_score',
                     query=address_query & postal_code_query,
                     functions=[weight_scoring_function])

    dedup_docs = A('top_hits', size=1, sort={'postal_code.raw': 'asc'})
    dedup = A('terms', field='name.raw', size=5, order={'score_sort': 'desc'})
    score_sort = A('max', script=dict(lang='expression', script='_score'))

    dedup.bucket('dedup_docs', dedup_docs)
    dedup.bucket('score_sort', score_sort)
    search.aggs.bucket('dedup', dedup)

    # Do not compute the results, we are only interested in the aggregations
    raw_res = search[0:0].execute()

    res = list()
    for bucket in raw_res.aggregations.dedup.buckets:
        for source_doc in bucket['dedup_docs']['hits']['hits']:
            fields = source_doc['_source']
            name = fields['name']
            complement = fields['complement']
            postal_code = fields['postal_code']
            country = 'France'
            geoloc = fields['geolocation']
            coordinates = dict(lat=geoloc['lat'], lon=geoloc['lon'])
            res.append(
                dict(name=name,
                     complement=complement,
                     postal_code=postal_code,
                     country=country,
                     coordinates=coordinates))
    return dict(results=res)
metric.save()

# And now search
s = Search()
s = s.source(['user_id', 'value', 'metadata'])
s.to_dict()
response = s.execute()
print(response.success())
print(response.hits.total)

# Get all records with a time metric value between 2000 and 30000, returning only
s = Search().filter('range', value={"gte": 2000, "lte": 30000})

# Number of records per user
# Define aggregation
agg = A('terms', field='user_id', include='user_id')
# init search
s = Search()
# Size 0
s = s[0:0]
# Add aggregation
s.aggs.bucket('group_by_user', agg)
# Dict representation
s.to_dict()

######################
# Average time by segment
s = Search()
# Set size to 0, we only want aggregations
s = s[0:0]
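# The "average time by segment" example above stops before the aggregation
# itself; one plausible completion, hedged: the segment field name is not
# shown anywhere above, so 'segment' is an assumption, while 'value' is the
# metric field used in the earlier range filter.
agg = A('terms', field='segment')
agg.metric('avg_time', 'avg', field='value')
s.aggs.bucket('group_by_segment', agg)
response = s.execute()
for b in response.aggregations.group_by_segment.buckets:
    print(b.key, b.avg_time.value)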
s = Search()
s = s.filter('terms', tags=['search', 'python'])
# Same as
s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])])
# We can use exclude too
s = s.exclude('terms', tags=['search', 'python'])

#####################################################
# AGGREGATIONS
from elasticsearch_dsl.aggs import A

a = A('terms', field='category')
a.metric('clicks_per_category', 'sum', field='clicks') \
    .bucket('tags_per_category', 'terms', field='tags')

# This is how you add aggregations to the search object
s = Search()
a = A('terms', field='category')
s.aggs.bucket('category_terms', a)
s.aggs.bucket('per_category', 'terms', field='category')
s.aggs['per_category'].metric('clicks_per_category', 'sum', field='clicks')
s.aggs['per_category'].bucket('tags_per_category', 'terms', field='tags')
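# A minimal sketch of executing the search above and walking its nested
# aggregation; the index name is hypothetical and the connection is assumed
# to be configured elsewhere.
s = s.index('my-index')[0:0]  # hits are not needed, only aggregations
response = s.execute()
for cat in response.aggregations.per_category.buckets:
    print(cat.key, cat.clicks_per_category.value)
    for tag in cat.tags_per_category.buckets:
        print('  ', tag.key, tag.doc_count)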
def browse(request, sentiment=None, group=None):
    s = Search(using=es, index="fcc-comments")
    description = None
    # defaults for when neither group nor sentiment is given
    details, url = "", None
    s.query = FunctionScore(
        query=s.query,
        functions=[SF('random_score', seed=int(time.time()))]
    )
    if group:
        source = group
        s = s.filter('terms', **{'analysis.source.keyword': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
        details = SOURCE_MAP.get(source, {}).get('details') or ""
        url = SOURCE_MAP.get(source, {}).get('url') or ""
    elif sentiment:
        title_ii = sentiment
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')
        details, url = "", None
    s.aggs.bucket("date", A('date_histogram', field='date_submission', interval='month'))
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('email_domain', A('terms', field='analysis.throwawayemail'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket('ingestion', A('terms', field='analysis.ingestion_method.keyword'))
    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))
    # s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))
    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Throwaway Email': {
            'True': 0,
            'False': 0
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        },
        'Filing Method': {
            'API': 0,
            'Spreadsheet': 0,
            'Direct': 0
        },
        'Filing Dates': OrderedDict({})
    })
    response = s[:50].execute()
    total = s.count()
    for bucket in response.aggregations.date.buckets:
        # bucket.key is epoch milliseconds; the +14400s shifts UTC by four hours
        d = datetime.fromtimestamp((bucket.key / 1000.) + 14400)
        title = "%s/17 - %s" % (d.strftime("%m"), d.strftime("%B"))
        stats['Filing Dates'][title] = bucket.doc_count
    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count
    for bucket in response.aggregations.email_domain.buckets:
        if bucket.key == 1:
            stats['Throwaway Email']['True'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Throwaway Email']['False'] = bucket.doc_count
    for bucket in response.aggregations.ingestion.buckets:
        if bucket.key == "api":
            stats['Filing Method']['API'] = bucket.doc_count
        elif bucket.key == "csv":
            stats['Filing Method']['Spreadsheet'] = bucket.doc_count
        elif bucket.key == "direct":
            stats['Filing Method']['Direct'] = bucket.doc_count
    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count
    # stats['Emails']['Unique'] = response.aggregations.unique_emails.value
    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    stats['Email Confirmation']['Missing'] = (
        total
        - stats['Email Confirmation']['True']
        - stats['Email Confirmation']['False']
    )
    context = {
        'description': description,
        'details': details,
        'url': url,
        'stats': stats,
        'results': response,
        'comment_count': total
    }
    return render(request, 'listing.html', context)
# -*- coding: utf-8 -*-
import json

from elasticsearch_dsl.aggs import A

from iadz.third_party.elasticsearch.es_fields import create_connection, create_search

if __name__ == "__main__":
    client = create_connection()
    s = create_search()
    a = A("terms", field="category")
    s.aggs.bucket("category_terms", a)
    response = s.execute()
    print(f"Response: {json.dumps(response.to_dict())}")