Example #1
def tweetQuery(query: str, current_user: User = Depends(get_current_user)):

    # Metric aggregations over all matching tweets
    a_like_count = A("sum", field="like_count")
    a_retweet_count = A("sum", field="retweet_count")
    # significant_text surfaces characteristic terms in the matched tweets
    a_keywords = A("significant_text", field="text")

    search = Search(using=es, index="tweets-*").query("match", text=query)

    search.aggs.metric("total_likes", a_like_count)
    search.aggs.metric("total_retweets", a_retweet_count)
    search.aggs.bucket("keywords", a_keywords)

    res = search.execute().to_dict()

    tweets = {}
    for tweet in res["hits"]["hits"]:
        tweets[tweet["_id"]] = tweet

    # comma-separated id list for the downstream update call
    ids = ",".join(tweets)

    updated_tweets = tweet_update(tweets, ids)

    return updated_tweets
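The three aggregations above are attached but never read back. A minimal sketch of pulling them out of the same `res` dict (the names match the buckets wired above):

# sum aggregations yield a single value; significant_text yields buckets
total_likes = res["aggregations"]["total_likes"]["value"]
total_retweets = res["aggregations"]["total_retweets"]["value"]
keywords = [b["key"] for b in res["aggregations"]["keywords"]["buckets"]]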
Example #2
def _aggregate_duplicates(form_model, search_parameters, search):
    # Only aggregation results are needed, never the hits themselves
    search = search.params(search_type="count")

    if search_parameters == 'exactmatch':
        form_fields_to_be_filtered = []
        aggs = _aggregate_exact_match_duplicates(form_model.form_fields,
                                                 form_model.id, search.aggs,
                                                 search,
                                                 form_fields_to_be_filtered)
        aggs.bucket('tag', 'terms', field='ds_id_exact', size=0, min_doc_count=2)\
            .bucket('tag', 'top_hits', size=2 ** 7)
        setattr(form_model, "filter_fields", form_fields_to_be_filtered)

    elif search_parameters == 'datasender':
        a = A("terms", field='ds_id_exact', size=0, min_doc_count=2)
        b = A("top_hits", size=2 ** 10)
        search.aggs.bucket('tag', a).bucket('tag', b)

    else:
        a = A("terms",
              field=form_model.id + '_' + search_parameters +
              '_unique_code_exact',
              size=0,
              min_doc_count=2)
        b = A("top_hits", size=2 ** 10)
        search.aggs.bucket('tag', a).bucket('tag', b)
    return search
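For reference, a hedged sketch of reading the duplicate buckets built above (the nested top_hits child is deliberately also named 'tag'):

response = search.execute()
for bucket in response.aggregations.tag.buckets:
    # each terms bucket carries its duplicate submissions in the nested top_hits
    duplicates = bucket.tag.hits.hits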
Example #3
def get_genre_agg():
    s = Search(using=es)
    s = s.index('imdb')
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    # logger.debug('genre agg is %s', json.dumps(ret.aggs.to_dict(), indent=2))
    return [bucket.key for bucket in ret.aggregations.genres.buckets]
Example #4
def first_words(index='codetoname', language='python'):
    es = elasticsearch.Elasticsearch()

    # update first name
    s = elasticsearch_dsl.Search(using=es, index=index, doc_type=language)\
        .query('bool', filter=Q('exists', field='feature') & Q('missing', field='first_name'))
    for hit in s.scan():
        data = hit.to_dict()
        feature = json.loads(data['feature'])
        data['first_name'] = firstname(feature['name'], language)
        es.index(index=index, doc_type=language, id=hit.meta.id, body=data)
    es.indices.refresh(index=index)

    # aggregation
    s = elasticsearch_dsl.Search(using=es, index=index, doc_type=language)\
        .query('bool', filter=Q('exists', field='feature'))
    a = A('terms', field='first_name')
    s.aggs.bucket('first_name_terms', a)
    response = s.execute()

    words = []
    for item in response.aggregations.first_name_terms.buckets:
        percentage = item.doc_count / float(response.hits.total) * 100
        words.append({'word': item.key, 'percentage': percentage})
    return words
Example #5
def etl(index='cf_rfem_hist_price',
        start_date='2018-12-26',
        end_date='2019-03-25',
        symbol='rfem'):
    ESLowLevelClientByConnection.get_instance()
    search = Search(index=index, using='high_level_client')[0:100]
    # Filter on the requested date window and symbol
    search.query = Q(
        Bool(must=[
            Range(date={'gte': start_date, 'lte': end_date}),
            Term(symbol=symbol)
        ]))
    response = search.execute()
    hits = response['hits']['hits']
    # One feature row per document: change over time, percent change, volume
    XX = []
    for hit in hits:
        XX.append([
            hit['_source']['changeOverTime'],
            hit['_source']['changePercent'],
            hit['_source']['volume'],
        ])
    return XX
Example #6
def bollinger_band(index='cf_etf_hist_price',
                   start_date='2018-12-26',
                   end_date='2019-03-25',
                   symbol='rfem'):
    ESLowLevelClientByConnection.get_instance()
    # size 0: only the aggregations are of interest
    search = Search(index=index, using='high_level_client')[0:0]
    # Filter on the requested date window and symbol
    search.query = Q(
        Bool(must=[
            Range(date={'gte': start_date, 'lte': end_date}),
            Term(symbol=symbol)
        ]))
    # One bucket per trading day
    aggs = A(
        DateHistogram(field='date',
                      interval='1d',
                      format='yyyy-MM-dd',
                      min_doc_count=1))
    # Typical price per day: (high + low + close) / 3
    aggs_tp = A(
        ScriptedMetric(
            init_script='state.totals=[]',
            map_script='state.totals.add((doc.high.value+doc.low.value+doc.close.value)/3)',
            combine_script='double total=0; for (t in state.totals) {total += t} return total',
            reduce_script='double total=0; for (t in states) {total += t} return total'))
    aggs_moving_avg = A(
        MovingAvg(model='simple', window=20, buckets_path='tp.value'))
    aggs_bbu = A(
        BucketScript(buckets_path={'SMA': '20_trading_days_moving_avg'},
                     script='params.SMA + 0.5'))
    aggs_bbl = A(
        BucketScript(buckets_path={'SMA': '20_trading_days_moving_avg'},
                     script='params.SMA - 0.5'))
    # Attach the metric and pipeline aggregations as siblings inside each daily bucket
    day = search.aggs.bucket('Bollinger_band', aggs)
    day.metric('tp', aggs_tp)
    day.pipeline('20_trading_days_moving_avg', aggs_moving_avg)
    day.pipeline('BBU', aggs_bbu)
    day.pipeline('BBL', aggs_bbl)
    response = search.execute()
    print(response.to_dict())
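A sketch of walking the daily buckets instead of dumping the whole response; the names match the aggregations wired above, and the band values only appear once the bucket_script inputs exist:

for day in response.aggregations.Bollinger_band.buckets:
    bucket = day.to_dict()
    if 'BBU' in bucket and 'BBL' in bucket:
        print(day.key_as_string, bucket['BBU']['value'], bucket['BBL']['value'])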
Example #7
    def num_repos(self):
        if self._es.indices.exists(index=self._es_index):
            s = elasticsearch_dsl.Search(using=self._es,
                                         index=self._es_index,
                                         doc_type=self._language)
            # cardinality = approximate count of distinct repositories
            s.aggs.bucket('num_repos', A('cardinality',
                                         field='repo.github_id'))
            response = s.execute()
            return response.aggregations.num_repos.value
        return 0
Example #8
def get_aggregations_for_choice_fields(dbm, form_model, local_time_delta,
                                       pagination_params, sort_params,
                                       search_parameters):
    search = _create_search(dbm, form_model, local_time_delta,
                            pagination_params, sort_params, search_parameters)
    search = search.params(search_type="count")
    field_names = []
    for field in form_model.choice_fields:
        field_name = es_questionnaire_field_name(field.code, form_model.id)
        a = A("terms", field=field_name + '_exact', size=0)
        search.aggs.bucket(field_name, a)
        field_names.append(field_name)
    search_results = search.execute()
    aggs_results = [
        _get_aggregation_result(field_name, search_results)
        for field_name in field_names
    ]
    return aggs_results, search_results.hits.total
Example #9
def sources(request):
    s = Search(using=es, index='fcc-comments')
    
    a = A('terms', field='analysis.source.keyword', size=50)
    s.aggs.bucket('sources', a)
    response = s.execute()
    
    context = { 'sources': [] }
    
    for source in response.aggregations.sources.buckets:
        context['sources'].append({
            'key': source.key,
            'count': source.doc_count,
            'name': SOURCE_MAP.get(source.key, {}).get('name'),
            'url': SOURCE_MAP.get(source.key, {}).get('url')
        })
    
    return render(request, 'sources.html', context)
Example #10
def index(request):

    s = Search(using=es)

    total = s.count()
    pro_titleii = s.query('match', **{'analysis.titleii': True}).count()
    anti_titleii = s.query('match', **{'analysis.titleii': False}).count()
    unknown_titleii = total - pro_titleii - anti_titleii

    context = {
        'total_comments': total,
        'title_ii': {
            'pro': pro_titleii / total * 100,
            'anti': anti_titleii / total * 100,
            'unknown': unknown_titleii / total * 100
        }
    }
    a = A('terms', field='analysis.source')
    s.aggs.bucket('sources', a)
    response = s.execute()
    context['sources'] = []
    for source in response.aggregations.sources.buckets:
        if source.key == 'unknown':
            continue

        context['sources'].append({
            'key': source.key,
            'count': source.doc_count,
            'name': SOURCE_MAP.get(source.key, {}).get('name'),
            'url': SOURCE_MAP.get(source.key, {}).get('url')
        })

        print(source.key, source.doc_count)
    # context['sources'] = s.aggs['sources']

    return render(request, 'index.html', context)
Example #11
def browse(request):

    s = Search(using=es)
    description = None

    s.query = FunctionScore(
        query=s.query, functions=[SF('random_score', seed=int(time.time()))])

    if 'source' in request.GET:
        source = request.GET['source']
        s = s.filter('terms', **{'analysis.source': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
    elif 'titleii' in request.GET:
        title_ii = request.GET['titleii']
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')

    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))

    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))

    s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))

    # s.aggs.bucket('email_confirmation', A('filters', field='analysis.fulladdress'))

    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Emails': {
            'Unique': 0,
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        }
    })

    response = s[:50].execute()
    total = s.count()
    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count

    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count

    stats['Emails']['Unique'] = response.aggregations.unique_emails.value

    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    stats['Email Confirmation']['Missing'] = (
        total - stats['Email Confirmation']['True'] -
        stats['Email Confirmation']['False'])

    context = {
        'description': description,
        'stats': stats,
        'results': response,
        'comment_count': total
    }

    return render(request, 'listing.html', context)
Example #12
    def index(self, address=None, *args, **kwargs):
        if not address:
            return dict(results=[])

        search = model.Geocomplete().search()

        address_query = Q()
        postal_code_query = Q()

        (address, postal_code) = self.geocomplete_town_input_parser(address)

        if address:
            address_query = Q('match', name=address)

        if postal_code:
            postal_code_query = Q('match', postal_code=postal_code)

        weight_scoring_function = SF('field_value_factor',
                                     factor=1,
                                     modifier='none',
                                     field='weight')

        search.query = Q('function_score',
                         query=address_query & postal_code_query,
                         functions=[weight_scoring_function])

        dedup_docs = A('top_hits', size=1, sort={'postal_code.raw': 'asc'})

        dedup = A('terms',
                  field='name.raw',
                  size=5,
                  order={'score_sort': 'desc'})

        score_sort = A('max', script=dict(lang='expression', script='_score'))

        dedup.bucket('dedup_docs', dedup_docs)
        dedup.bucket('score_sort', score_sort)
        search.aggs.bucket('dedup', dedup)

        # Do not compute the results, we are only interested by the aggregations
        raw_res = search[0:0].execute()

        res = list()
        for bucket in raw_res.aggregations.dedup.buckets:
            for source_doc in bucket['dedup_docs']['hits']['hits']:
                fields = source_doc['_source']

                name = fields['name']
                complement = fields['complement']
                postal_code = fields['postal_code']
                country = 'France'

                geoloc = fields['geolocation']
                coordinates = dict(lat=geoloc['lat'], lon=geoloc['lon'])

                res.append(
                    dict(name=name,
                         complement=complement,
                         postal_code=postal_code,
                         country=country,
                         coordinates=coordinates))

        return dict(results=res)
Example #13
from elasticsearch_dsl import Q, Search
from elasticsearch_dsl.aggs import A

metric.save()

# And now search
s = Search()
s = s.source(['user_id', 'value', 'metadata'])
s.to_dict()
response = s.execute()
print(response.success())
print(response.hits.total)

# Get all records with a time metric value between 2000 and 30000
s = Search().filter('range', value={"gte": 2000, "lte": 30000})

# Number of records per user
# Define aggregation
agg = A('terms', field='user_id')
# init search
s = Search()
# Size 0
s = s[0:0]
# Add aggregation
s.aggs.bucket('group_by_user', agg)
# Dict representation
s.to_dict()

######################
# Average time by segment
s = Search()

# Set size to 0, we only want aggregations
s = s[0:0]
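# Sketch, not in the original: assuming a keyword field named 'segment'
# (hypothetical), the per-segment average of the time metric would be
s.aggs.bucket('by_segment', 'terms', field='segment') \
    .metric('avg_time', 'avg', field='value')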
######################
# Filtering
s = Search()
s = s.filter('terms', tags=['search', 'python'])
# Same as
s = s.query('bool', filter=[Q('terms', tags=['search', 'python'])])

# We can use exclude too
s = s.exclude('terms', tags=['search', 'python'])



#####################################################
# AGGREGATIONS

from elasticsearch_dsl.aggs import A

a = A('terms', field='category')


a.metric('clicks_per_category', 'sum', field='clicks')\
        .bucket('tags_per_category', 'terms', field='tags')

# This is how you add aggregations to the search object
s = Search()
a = A('terms', field='category')
s.aggs.bucket('category_terms', a)


s.aggs.bucket('per_category', 'terms', field='category')
s.aggs['per_category'].metric('clicks_per_category', 'sum', field='clicks')
s.aggs['per_category'].bucket('tags_per_category', 'terms', field='tags')
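# Sketch, not in the original: execute the search above and walk the nested
# aggregation; the bucket/metric names are the ones wired up just above
response = s.execute()
for cat in response.aggregations.per_category.buckets:
    print(cat.key, cat.clicks_per_category.value)
    for tag in cat.tags_per_category.buckets:
        print('  ', tag.key, tag.doc_count)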
Example #15
def browse(request, sentiment=None, group=None):

    s = Search(using=es, index="fcc-comments")
    # defaults so the rendering context below is always defined
    description, details, url = None, "", None

    s.query = FunctionScore(
        query=s.query, functions=[SF('random_score', seed=int(time.time()))]
    )

    if group:
        source = group
        s = s.filter('terms', **{'analysis.source.keyword': [source]})
        description = SOURCE_MAP.get(source, {}).get('name') or source
        details = SOURCE_MAP.get(source, {}).get('details') or ""
        url = SOURCE_MAP.get(source, {}).get('url') or ""

    elif sentiment:
        title_ii = sentiment
        if title_ii == 'pro':
            s = s.filter('terms', **{'analysis.titleii': [True]})
            description = "Pro Title II"
        elif title_ii == 'anti':
            description = 'Anti Title II'
            s = s.filter('terms', **{'analysis.titleii': [False]})
        elif title_ii == 'unknown':
            description = 'Uncategorized'
            s = s.exclude('exists', field='analysis.titleii')
        details, url = "", None
    
    s.aggs.bucket("date", A('date_histogram', field='date_submission', interval='month'))
    s.aggs.bucket('address', A('terms', field='analysis.fulladdress'))
    s.aggs.bucket('email_domain', A('terms', field='analysis.throwawayemail'))
    s.aggs.bucket('site', A('terms', field='analysis.onsite'))
    s.aggs.bucket('ingestion', A('terms', field='analysis.ingestion_method.keyword'))
    s.aggs.bucket('email_confirmation', A('filters', filters={
        'true': {'term': {'emailConfirmation': 'true'}},
        'false': {'term': {'emailConfirmation': 'false'}}
    }))

    # s.aggs.bucket('unique_emails', A('cardinality', field='contact_email.raw'))


    stats = OrderedDict({
        'Comment Form': {
            'On-site': 0,
            'Off-site': 0
        },
        'Throwaway Email': {
            'True': 0,
            'False': 0
        },
        'Address': {
            'Full Address': 0,
            'Partial Address': 0,
        },
        'Email Confirmation': {
            'True': 0,
            'False': 0,
            'Missing': 0
        },
        'Filing Method': {
            'API': 0,
            'Spreadsheet': 0,
            'Direct': 0
        },
        'Filing Dates': OrderedDict()
    })

    response = s[:50].execute()
    total = s.count()

    for bucket in response.aggregations.date.buckets:
        # bucket.key is epoch milliseconds; +14400 s shifts 4 hours (US Eastern offset)
        d = datetime.fromtimestamp((bucket.key / 1000.) + 14400)
        title = "%s/17 - %s" % (d.strftime("%m"), d.strftime("%B"))
        stats['Filing Dates'][title] = bucket.doc_count

    for bucket in response.aggregations.address.buckets:
        if bucket.key == 1:
            stats['Address']['Full Address'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Address']['Partial Address'] = bucket.doc_count

    for bucket in response.aggregations.email_domain.buckets:
        if bucket.key == 1:
            stats['Throwaway Email']['True'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Throwaway Email']['False'] = bucket.doc_count

    for bucket in response.aggregations.ingestion.buckets:
        if bucket.key == "api":
            stats['Filing Method']['API'] = bucket.doc_count
        elif bucket.key == "csv":
            stats['Filing Method']['Spreadsheet'] = bucket.doc_count
        elif bucket.key == "direct":
            stats['Filing Method']['Direct'] = bucket.doc_count


    for bucket in response.aggregations.site.buckets:
        if bucket.key == 1:
            stats['Comment Form']['On-site'] = bucket.doc_count
        elif bucket.key == 0:
            stats['Comment Form']['Off-site'] = bucket.doc_count

    # stats['Emails']['Unique'] = response.aggregations.unique_emails.value

    for bucket, value in response.aggs.email_confirmation.to_dict()['buckets'].items():
        if bucket == 'true':
            stats['Email Confirmation']['True'] = value['doc_count']
        elif bucket == 'false':
            stats['Email Confirmation']['False'] = value['doc_count']
    stats['Email Confirmation']['Missing'] = (
        total - stats['Email Confirmation']['True'] - stats['Email Confirmation']['False']
    )

    context = {
        'description': description,
        'details': details,
        'url': url,
        'stats': stats,
        'results': response,
        'comment_count': total
    }

    return render(request, 'listing.html', context)
# -*- coding: utf-8 -*-
import json
from elasticsearch_dsl.aggs import A

from iadz.third_party.elasticsearch.es_fields import create_connection, create_search

if __name__ == "__main__":

    client = create_connection()

    s = create_search()

    a = A("terms", field="category")
    s.aggs.bucket("category_terms", a)

    response = s.execute()

    print(f"Response: {json.dumps(response.to_dict())}")