Example #1
    def get_entities_by_sport_and_query(self, sport, query):
        search = Search(using=self.es)
        search = search[0:5]
        if sport == Sport.SOCCER:
            search = search.index('soccer-entity')
        elif sport == Sport.BASKETBALL:
            search = search.index('basketball-entity')
        if query:
            query = '*{}*'.format(query)
            search = search.query(
                QueryString(query=query, fields=['name^5', 'abstract']))
        hits = []
        for hit in search.execute():
            id = hit.meta['id']
            hit = hit.to_dict()
            entity = {'id': id, 'name': hit['name']}
            if 'abstract' in hit:
                entity['abstract'] = hit['abstract']
            else:
                entity['abstract'] = 'None'
            if 'type' in hit:
                entity['type'] = hit['type']
            else:
                entity['type'] = 'None'
            hits.append(entity)
        return hits
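This method assumes a Sport enum and an elasticsearch client stored on self.es. A minimal sketch of those assumed pieces (the class name and structure are guesses, not from the original project):

from enum import Enum

from elasticsearch import Elasticsearch


class Sport(Enum):
    SOCCER = 'soccer'
    BASKETBALL = 'basketball'


# Hypothetical wrapper; only the attribute the method uses is shown.
class EntityRepository:
    def __init__(self):
        self.es = Elasticsearch()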
Example #2
def test_basic_stats(app, db, es, locations, event_queues, minimal_record):
    """Test basic statistics results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files
        metadata=minimal_record,
        n_records=10,
        n_versions=2,
        n_files=3,
        event_data={'user_id': '1'},
        # 4 event timestamps
        start_date=datetime(2018, 1, 1, 13),
        end_date=datetime(2018, 1, 1, 15),
        interval=timedelta(minutes=30))

    # Events indices
    prefix = app.config['SEARCH_INDEX_PREFIX']

    # 2 versions * 10 records * 3 files * 4 events -> 240
    assert search.index(prefix + 'events-stats-file-download').count() == 240
    # 2 versions * 10 records * 4 events -> 80
    assert search.index(prefix + 'events-stats-record-view').count() == 80

    # Aggregations indices
    # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks

    # 30 day-aggregation documents
    assert search.index(prefix + 'stats-file-download').count() == 30

    # 30 day-aggregation documents
    assert search.index(prefix + 'stats-record-view').count() == 30

    # 2 bookmarks for each of the two aggregation types
    assert search.index(prefix + 'stats-bookmarks').count() == 4

    # Records index
    for _, record, _ in records:
        doc = current_search_client.get(
            index=build_alias_name('records'),
            id=str(record.id),
            params={'_source_includes': '_stats'})
        assert doc['_source']['_stats'] == {
            # 4 view events
            'views': 4.0,
            'version_views': 8.0,
            # 4 view events over 2 different hours
            'unique_views': 2.0,
            'version_unique_views': 2.0,
            # 4 download events * 3 files
            'downloads': 12.0,
            'version_downloads': 24.0,
            # 4 download events * 3 files over 2 different hours
            'unique_downloads': 2.0,
            'version_unique_downloads': 2.0,
            # 4 download events * 3 files * 10 bytes
            'volume': 120.0,
            'version_volume': 240.0,
        }
Example #3
def test_large_stats(app, db, es, locations, event_queues, minimal_record):
    """Test a larger number of events, aggregations, and results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files
        metadata=minimal_record,
        n_records=3,
        n_versions=4,
        n_files=2,
        event_data={'user_id': '1'},
        # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day)
        start_date=datetime(2018, 3, 1),
        end_date=datetime(2018, 5, 1),
        interval=timedelta(hours=12))

    # Events indices
    prefix = app.config['SEARCH_INDEX_PREFIX']

    # 4 versions * 3 records * 2 files * 122 events -> 2928
    assert search.index(prefix + 'events-stats-file-download').count() == 2928
    # 4 versions * 3 records * 122 events -> 1464
    assert search.index(prefix + 'events-stats-record-view').count() == 1464

    # Aggregations indices
    # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks
    q = search.index(prefix + 'stats-file-download')
    q = q.doc_type('file-download-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records
    q = search.index(prefix + 'stats-record-view')
    q = q.doc_type('record-view-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    # Records index
    for _, record, _ in records:
        doc = current_search_client.get(
            index=build_alias_name('records'),
            id=str(record.id),
            params={'_source_includes': '_stats'})
        assert doc['_source']['_stats'] == {
            # 122 view events
            'views': 122.0,
            'version_views': 488.0,
            # 122 view events, each in a different hour
            'unique_views': 122.0,
            'version_unique_views': 122.0,
            # 122 download events * 2 files
            'downloads': 244.0,
            'version_downloads': 976.0,
            # 122 download events over 122 different hours
            'unique_downloads': 122.0,
            'version_unique_downloads': 122.0,
            # 244 downloads * 10 bytes each
            'volume': 2440.0,
            'version_volume': 9760.0,
        }
Example #4
def test_aggregations_process(script_info, event_queues, es, indexed_events):
    """Test "aggregations process" CLI command."""
    search = Search(using=es)
    runner = CliRunner()

    # Invalid argument
    result = runner.invoke(
        stats, ['aggregations', 'process', 'invalid-aggr-type', '--eager'],
        obj=script_info)
    assert result.exit_code == 2
    assert 'Invalid aggregation type(s):' in result.output

    result = runner.invoke(
        stats,
        ['aggregations', 'process', 'file-download-agg',
         '--start-date=2018-01-01', '--end-date=2018-01-10', '--eager'],
        obj=script_info)
    assert result.exit_code == 0

    agg_alias = search.index('stats-file-download')

    es.indices.refresh(index='*')
    assert agg_alias.count() == 10
    assert agg_alias.doc_type('file-download-agg-bookmark').count() == 0
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 10
    assert search.index('stats-file-download-2018-01').count() == 10

    # Run again over same period, but update the bookmark
    result = runner.invoke(
        stats,
        ['aggregations', 'process', 'file-download-agg',
         '--start-date=2018-01-01', '--end-date=2018-01-10', '--eager',
         '--update-bookmark'],
        obj=script_info)
    assert result.exit_code == 0

    es.indices.refresh(index='*')
    assert agg_alias.count() == 12
    assert agg_alias.doc_type('file-download-agg-bookmark').count() == 2
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 10
    assert search.index('stats-file-download-2018-01').count() == 12

    # Run over all the events via celery task
    result = runner.invoke(
        stats,
        ['aggregations', 'process', 'file-download-agg', '--update-bookmark'],
        obj=script_info)
    assert result.exit_code == 0

    es.indices.refresh(index='*')
    assert agg_alias.count() == 54
    assert agg_alias.doc_type('file-download-agg-bookmark').count() == 8
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 46
    assert search.index('stats-file-download-2018-01').count() == 36
    assert search.index('stats-file-download-2018-02').count() == 18
Example #5
def test_large_stats(app, db, es, locations, event_queues, minimal_record):
    """Test a larger number of events, aggregations, and results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files
        metadata=minimal_record,
        n_records=3,
        n_versions=4,
        n_files=2,
        event_data={'user_id': '1'},
        # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day)
        start_date=datetime(2018, 3, 1),
        end_date=datetime(2018, 5, 1),
        interval=timedelta(hours=12))

    # Events indices
    # 4 versions * 3 records * 2 files * 122 events -> 2928
    assert search.index('events-stats-file-download').count() == 2928
    # 4 versions * 3 records * 122 events -> 1464
    assert search.index('events-stats-record-view').count() == 1464

    # Aggregations indices
    # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks
    q = search.index('stats-file-download')
    q = q.doc_type('file-download-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records
    q = search.index('stats-record-view')
    q = q.doc_type('record-view-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    # Records index
    for _, record, _ in records:
        doc = (RecordsSearch().get_record(
            record.id).source(include='_stats').execute()[0])
        assert doc['_stats'] == {
            # 122 view events
            'views': 122.0,
            'version_views': 488.0,
            # 122 view events, each in a different hour
            'unique_views': 122.0,
            'version_unique_views': 122.0,
            # 122 download events * 2 files
            'downloads': 244.0,
            'version_downloads': 976.0,
            # 122 download events over 122 different hours
            'unique_downloads': 122.0,
            'version_unique_downloads': 122.0,
            # 244 downloads * 10 bytes each
            'volume': 2440.0,
            'version_volume': 9760.0,
        }
Example #6
def movie_page(mid):
    s = Search(using=es)
    s = s.index('imdb')
    s = s.filter('term', _id=mid)
    ret = s.execute()
    return render_template('single.html',
                           movie=get_movie_detail(ret.hits[0].to_dict()))
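For a single-document lookup like this, the ids query is an alternative to a term filter on _id. A sketch against the same assumed es client and 'imdb' index:

from elasticsearch_dsl import Search

s = Search(using=es).index('imdb').filter('ids', values=[mid])
ret = s.execute()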
Example #7
def get_genre_agg():
    s = Search(using=es)
    s = s.index('imdb')
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    # logger.debug('genre agg is %s', json.dumps(ret.aggs.to_dict(), indent=2))
    return [x['key'] for x in ret.aggs.to_dict()['genres']['buckets']]
    def search(self, doc_type, query=""):
        """
        Execute search query and retrive results

        :param doc_type: Type in ElasticSearch
        :param query: search query
        :return: list with results
        """
        results = []
        if type(query) in [str, unicode] and type(doc_type) == DocTypeMeta:
            q = Q("multi_match",
                  query=query.lower(),
                  fields=["title"])

            s = Search()
            s = s.using(self.client)
            s = s.index(self.index_name)
            s = s.doc_type(doc_type)
            s = s.query(q)
            print "search query: " + str(s.to_dict())

            response = s.execute()

            for resp in response:
                results.append(resp)
        return results
Example #9
def get_genre_agg():
    s = Search(using=es)
    s = s.index('imdb')
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    return [
        x['key'] for x in ret.to_dict()['aggregations']['genres']['buckets']
    ]
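Examples #7 and #9 read the same buckets two ways: ret.aggs.to_dict()['genres']['buckets'] versus ret.to_dict()['aggregations']['genres']['buckets']. A hedged variant, assuming the same es client and 'imdb' index, that also skips fetching hits since only the aggregation is needed:

from elasticsearch_dsl import A, Search

def get_genre_agg():
    s = Search(using=es).index('imdb').extra(size=0)  # aggs only, no hits
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    # Attribute access is equivalent to the dict lookups above.
    return [b.key for b in ret.aggregations.genres.buckets]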
Example #10
def test_failing_processors(app, es, event_queues, caplog):
    """Test events that raise an exception when processed."""
    search = Search(using=es)

    current_queues.declare()
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    def _raises_on_second_call(doc):
        if _raises_on_second_call.calls == 1:
            _raises_on_second_call.calls += 1
            raise Exception('mocked-exception')
        _raises_on_second_call.calls += 1
        return doc

    _raises_on_second_call.calls = 0

    queue = current_queues.queues['stats-file-download']
    indexer = EventsIndexer(queue, preprocessors=[_raises_on_second_call])

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 4
    assert not es.indices.exists('events-stats-file-download-2018-01-01')
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert not es.indices.exists('events-stats-file-download-2018-01-03')
    assert not es.indices.exists('events-stats-file-download-2018-01-04')
    assert not es.indices.exists_alias(name='events-stats-file-download')

    with caplog.at_level(logging.ERROR):
        indexer.run()  # 2nd event raises exception and is dropped

    # Check that the error was logged
    error_logs = [r for r in caplog.records if r.levelno == logging.ERROR]
    assert len(error_logs) == 1
    assert error_logs[0].msg == 'Error while processing event'
    assert error_logs[0].exc_info[1].args[0] == 'mocked-exception'

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 0
    assert search.index('events-stats-file-download').count() == 3
    assert search.index('events-stats-file-download-2018-01-01').count() == 1
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert search.index('events-stats-file-download-2018-01-03').count() == 1
    assert search.index('events-stats-file-download-2018-01-04').count() == 1
Example #11
def test_basic_stats(app, db, es, locations, event_queues, minimal_record):
    """Test basic statistics results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files
        metadata=minimal_record,
        n_records=10,
        n_versions=2,
        n_files=3,
        event_data={'user_id': '1'},
        # 4 event timestamps
        start_date=datetime(2018, 1, 1, 13),
        end_date=datetime(2018, 1, 1, 15),
        interval=timedelta(minutes=30))
    # Events indices
    # 2 versions * 10 records * 3 files * 4 events -> 240
    assert search.index('events-stats-file-download').count() == 240
    # 2 versions * 10 records * 4 events -> 80
    assert search.index('events-stats-record-view').count() == 80

    # Aggregations indices
    # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks
    assert search.index('stats-file-download').count() == 32  # 2bm + 30d
    assert search.index('stats-record-view').count() == 32  # 2bm + 30d

    # Records index
    for _, record, _ in records:
        doc = (RecordsSearch().get_record(
            record.id).source(include='_stats').execute()[0])
        assert doc['_stats'] == {
            # 4 view events
            'views': 4.0,
            'version_views': 8.0,
            # 4 view events over 2 different hours
            'unique_views': 2.0,
            'version_unique_views': 2.0,
            # 4 download events * 3 files
            'downloads': 12.0,
            'version_downloads': 24.0,
            # 4 download events * 3 files over 2 different hours
            'unique_downloads': 2.0,
            'version_unique_downloads': 2.0,
            # 4 download events * 3 files * 10 bytes
            'volume': 120.0,
            'version_volume': 240.0,
        }
Example #12
def recommendationSearch(search):
    s = Search(using=es)
    s = s.index('job_index')
    search['offset'] = int(search['offset'])

    condition = []
    #location
    if 'state' in search:
        qState = Q('match_phrase', state=search['state'])
        condition.append(qState)
    if 'city' in search:
        qCity = Q('match', city=search['city'])
        condition.append(qCity)

    # professional & education background
    if 'pbg' in search or 'degree' in search or 'major' in search:
        qBG = Q('multi_match',
                query=' '.join(str(search.get(k, ''))
                               for k in ('pbg', 'degree', 'major')),
                type='cross_fields',
                fields=['title', 'summary'])
        condition.append(qBG)

    # jobtype
    if 'type' in search:
        qType = Q('match', jobtype=search['type'])
        condition.append(qType)

    # salary
    if 'salary' in search:
        search['salary'] = int(search['salary'])
        qSalary = Q('range', salary={'gte': search['salary']})
        condition.append(qSalary)

    q = Q('bool', should=condition, minimum_should_match=1)
    s = s.query(q)

    s = s[search['offset']:search['offset'] + 10]
    pp = pprint.PrettyPrinter(depth=6)
    pp.pprint(s.to_dict())
    response = s.execute()

    resultlist = []
    print(response.hits.total)
    for hit in response.hits:
        result = {}
        result['id'] = hit.meta.id
        result['score'] = hit.meta.score
        result['title'] = hit['title']
        result['summary'] = hit['summary'][:180]
        result['url'] = 'www.indeed.com' + hit['url']
        result['company'] = hit['company']
        result['location'] = hit['location']
        result['postingdate'] = str(datetime.datetime.fromordinal(hit['date']))
        resultlist.append(result)

    return resultlist
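A hypothetical input for recommendationSearch, for illustration only; every key except 'offset' is optional, and all values here are made up:

criteria = {
    'offset': '0',
    'state': 'California',
    'city': 'San Jose',
    'pbg': 'software engineer',
    'degree': 'MS',
    'major': 'Computer Science',
    'type': 'fulltime',
    'salary': '100000',
}
results = recommendationSearch(criteria)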
Example #13
def test_large_stats(app, db, es, locations, event_queues, minimal_record):
    """Test a larger number of events, aggregations, and results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files
        metadata=minimal_record, n_records=3, n_versions=4, n_files=2,
        event_data={'user_id': '1'},
        # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day)
        start_date=datetime(2018, 3, 1),
        end_date=datetime(2018, 5, 1),
        interval=timedelta(hours=12))

    # Events indices
    # 4 versions * 3 records * 2 files * 122 events -> 2928
    assert search.index('events-stats-file-download').count() == 2928
    # 4 versions * 3 records * 122 events -> 1464
    assert search.index('events-stats-record-view').count() == 1464

    # Aggregations indices
    # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks
    q = search.index('stats-file-download')
    q = q.doc_type('file-download-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records
    q = search.index('stats-record-view')
    q = q.doc_type('record-view-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    # Records index
    for _, record, _ in records:
        doc = (
            RecordsSearch().get_record(record.id)
            .source(include='_stats').execute()[0])
        assert doc['_stats'] == {
            # 122 view events
            'views': 122.0, 'version_views': 488.0,
            # 122 view events, each in a different hour
            'unique_views': 122.0, 'version_unique_views': 122.0,
            # 122 download events * 2 files
            'downloads': 244.0, 'version_downloads': 976.0,
            # 122 download events over 122 different hours
            'unique_downloads': 122.0, 'version_unique_downloads': 122.0,
            # 244 downloads * 10 bytes each
            'volume': 2440.0, 'version_volume': 9760.0,
        }
Example #14
def get_suggest(input):
    if not input:
        return None
    s = Search(using=es)
    s = s.index('imdb')
    s = s.suggest('suggestion', input, completion={'field': 'suggest'})
    s = s.source(False)
    ret = s.execute()
    results = [x['text'] for x in ret.suggest.suggestion[0]['options']]
    return jsonify(result=results)
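The suggest() call above only works if the 'imdb' mapping declares a completion-typed 'suggest' field. A sketch of such a mapping change, assuming an elasticsearch-py client (the original project's actual mapping is unknown):

from elasticsearch import Elasticsearch

es = Elasticsearch()
es.indices.put_mapping(
    index='imdb',
    body={'properties': {'suggest': {'type': 'completion'}}},
)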
Example #15
def test_aggregations_list_bookmarks(script_info, event_queues, es,
                                     aggregated_events):
    """Test "aggregations list-bookmarks" CLI command."""
    search = Search(using=es)
    runner = CliRunner()

    current_search.flush_and_refresh(index='*')
    agg_alias = search.index('stats-file-download')
    assert agg_alias.count() == 31
    assert search.index('bookmark-index').count() == 5
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 31
    assert search.index('stats-file-download-2018-01').count() == 31

    result = runner.invoke(
        stats, ['aggregations', 'list-bookmarks', 'file-download-agg'],
        obj=script_info)
    assert result.exit_code == 0

    bookmarks_query = search.index('bookmark-index')
    bookmarks = [b.date for b in bookmarks_query.scan()]
    assert all(b in result.output for b in bookmarks)
Example #16
def page_detail(id):
    try:
        # search the document based on its metaid
        s = Search(using=es)
        s = s.index('job_index')
        s = s.filter('term', _id=id)
        ret = s.execute()
        job = get_job_detail(ret.hits[0].to_dict(), id)

        return render_template('detail.html', job=job)
    except KeyError:
        return "Problem"
Example #17
    def common_search(self, search_obj: Search, **kwargs):
        assert search_obj
        offset = kwargs.get('offset') or 0
        limit = kwargs.get('limit') or 100
        search_obj = search_obj[offset:offset + limit]
        index = kwargs.get('index')
        if index:
            search_obj = search_obj.index(index)

        raw_result = kwargs.get('raw_result') or False
        attach_id = kwargs.get('attach_id') or False
        with_page_info = kwargs.get('with_page_info') or False

        print("\nES query:", search_obj.to_dict())

        res = search_obj.execute()

        if raw_result is True:
            return res

        data = list()

        for hit in res.hits.hits:
            d = hit['_source'].to_dict()
            if attach_id:
                d['index'] = hit['_index']
                d['doc_id'] = hit['_id']
            data.append(d)
        if with_page_info is True:
            page_info = {
                "total": res.hits.total.value,
                "limit": limit,
                "offset": offset
            }
            return data, page_info
        else:
            return data
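A hypothetical call to common_search; `helper` stands for whatever object carries this method, and 'articles' is a made-up index name:

from elasticsearch_dsl import Search

base = Search().query('match', title='python')
rows, page_info = helper.common_search(
    base, index='articles', offset=0, limit=20,
    attach_id=True, with_page_info=True)
# rows -> list of _source dicts (plus 'index'/'doc_id' from attach_id)
# page_info -> {'total': ..., 'limit': 20, 'offset': 0}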
Example #18
def test_basic_stats(app, db, es, locations, event_queues, minimal_record):
    """Test basic statistics results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files
        metadata=minimal_record, n_records=10, n_versions=2, n_files=3,
        event_data={'user_id': '1'},
        # 4 event timestamps
        start_date=datetime(2018, 1, 1, 13),
        end_date=datetime(2018, 1, 1, 15),
        interval=timedelta(minutes=30))
    # Events indices
    # 2 versions * 10 records * 3 files * 4 events -> 240
    assert search.index('events-stats-file-download').count() == 240
    # 2 versions * 10 records * 4 events -> 80
    assert search.index('events-stats-record-view').count() == 80

    # Aggregations indices
    # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks
    assert search.index('stats-file-download').count() == 32  # 2bm + 30d
    assert search.index('stats-record-view').count() == 32  # 2bm + 30d

    # Records index
    for _, record, _ in records:
        doc = (
            RecordsSearch().get_record(record.id)
            .source(include='_stats').execute()[0])
        assert doc['_stats'] == {
            # 4 view events
            'views': 4.0, 'version_views': 8.0,
            # 4 view events over 2 different hours
            'unique_views': 2.0, 'version_unique_views': 2.0,
            # 4 download events * 3 files
            'downloads': 12.0, 'version_downloads': 24.0,
            # 4 download events * 3 files over 2 different hours
            'unique_downloads': 2.0, 'version_unique_downloads': 2.0,
            # 4 download events * 3 files * 10 bytes
            'volume': 120.0, 'version_volume': 240.0,
        }
Example #19
    def get_entity(self, sport, element):
        search = Search(using=self.es)
        if sport == Sport.SOCCER:
            search = search.index('soccer-entity')
        if sport == Sport.BASKETBALL:
            search = search.index('basketball-entity')
        search = search.query(Match(_id=element[0]))
        response = search.execute()
        if len(response) > 0:
            entity = {'name': response[0]['name']}
            if 'abstract' in response[0]:
                entity['abstract'] = response[0]['abstract']
            else:
                entity['abstract'] = 'None'
            if 'type' in response[0]:
                entity['type'] = response[0]['type']
            else:
                entity['type'] = 'None'
        else:
            entity = {'name': element[0], 'abstract': 'None', 'type': 'None'}
        entity['similarity'] = round(element[1], 2)
        entity['sport'] = sport.value
        return entity
Example #20
def jobdetail(id):
    s = Search(using=es)
    s = s.index('job_index')
    s = s.filter('term', _id=id)
    ret = s.execute()
    hit = ret.hits[0].to_dict()
    job = {}
    job['id'] = id
    job['title'] = hit['title']
    job['summary'] = hit['summary']
    job['url'] = 'www.indeed.com' + hit['url']
    job['company'] = hit['company']
    job['location'] = hit['location']
    if hit['salary'] == '':
        job['salary'] = 'Unknown'
    else:
        job['salary'] = hit['salary']
    job['jobtype'] = hit['jobtype']
    return job
Example #21
    def get_langs_from_unlabeled_tweets(self, **kwargs):

        # TODO: we need to execute this in case the user doesn't have it enabled. I can't find the
        # PUT / twitterfdl2017 / _mapping / tweet
        # {
        #     "properties": {
        #         "lang": {
        #             "type": "text",
        #             "fielddata": true
        #         }
        #     }
        # }

        the_host = "http://" + kwargs["host"] + ":" + kwargs["port"]
        client = connections.create_connection(hosts=[the_host])
        s = Search(using=client, index=kwargs["index"], doc_type="tweet")

        body = {
            "size": 0,
            "aggs": {
                "distinct_lang": {
                    "terms": {
                        "field": "lang",
                        "size": 1000
                    }
                }
            }
        }

        s = Search.from_dict(body)
        s = s.index(kwargs["index"])
        s = s.doc_type("tweet")
        body = s.to_dict()

        t = s.execute()

        distinct_langs = []
        for item in t.aggregations.distinct_lang:
            # print(item.key, item.doc_count)
            distinct_langs.append(item.key)

        return distinct_langs
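A sketch of the mapping change the TODO above refers to: enabling fielddata on the 'lang' text field so the terms aggregation can run. This assumes the `client` created in the example and an elasticsearch-py 6.x API where mapping types such as 'tweet' are still valid:

client.indices.put_mapping(
    index='twitterfdl2017',
    doc_type='tweet',
    body={'properties': {'lang': {'type': 'text', 'fielddata': True}}},
)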
Example #22
def test_aggregations_delete(script_info, event_queues, es, aggregated_events):
    """Test "aggregations delete" CLI command."""
    search = Search(using=es)
    runner = CliRunner()

    current_search.flush_and_refresh(index='*')
    agg_alias = search.index('stats-file-download')
    assert agg_alias.count() == 31
    assert search.index('bookmark-index').count() == 5
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 31
    assert search.index('stats-file-download-2018-01').count() == 31

    result = runner.invoke(
        stats,
        ['aggregations', 'delete', 'file-download-agg',
         '--start-date=2018-01-01', '--end-date=2018-01-10', '--yes'],
        obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')
    agg_alias = search.index('stats-file-download')

    assert agg_alias.count() == 21
    assert search.index('bookmark-index').count() == 4
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 21
    assert search.index('stats-file-download-2018-01').count() == 21

    # Delete all aggregations
    result = runner.invoke(stats, ['aggregations', 'delete', '--yes'],
                           obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')
    agg_alias = search.index('stats-file-download')
    assert agg_alias.count() == 0
    assert agg_alias.doc_type('file-download-agg-bookmark').count() == 0
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 0
    assert search.index('stats-file-download-2018-01').count() == 0
Example #23
def index():
    page = 1
    sort = request.args.get('sort')
    search = request.args.get('search')
    logger.debug(request.args)
    s = Search(using=es)
    s = s.index('imdb')
    s = s.source(includes=['title', 'poster', '_id'])
    s = s[(page - 1) * PAGESIZE:page * PAGESIZE]
    if search:
        s = s.query(
            Q('multi_match',
              query=search,
              fields=['title', 'summary', 'casts'])).extra(size=8)
    if sort:
        s = s.sort(sort)
    ret = s.execute()
    logger.debug(ret.hits)
    movies = get_movies(ret.hits)
    genres = get_genre_agg()
    return render_template('review.html', movies=movies, genres=genres)
Example #24
    def doSearch(self, body):
        try:
            client = connections.create_connection(hosts=[settings.ES_URL])
            s = Search(using=client,
                       index=settings.ES_INDEX_NAME,
                       doc_type=settings.ES_INDEX_TYPE)
            s = Search.from_dict(body)
            s = s.index(settings.ES_INDEX_NAME)
            s = s.doc_type(settings.ES_INDEX_TYPE)

            # hightlight the following fields in the search result
            s = s.highlight('title')
            s = s.highlight('description')
            s = s.highlight('data_time')
            s = s.highlight('source')

            body = s.to_dict()
            response = s.execute()
        except Exception:
            return None

        return response
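A hedged sketch of consuming the highlighted response that doSearch returns; the field names follow the highlight() calls above, while `searcher` and the query body are made up:

response = searcher.doSearch(
    {'query': {'match': {'title': 'budget'}}})
if response is not None:
    for hit in response:
        # hit.meta.highlight holds per-field lists of highlighted fragments
        highlight = getattr(hit.meta, 'highlight', None)
        if highlight and 'title' in highlight.to_dict():
            print(highlight.title[0])  # first highlighted fragment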
Example #25
def companySearch(search):
    s = Search(using=es)
    search['offset'] = int(search['offset'])
    s = s.index('job_index')
    s = s.query('match_phrase', company=search['company'])
    s = s[search['offset']:search['offset'] + 10]
    response = s.execute()

    resultlist = []
    print(response.hits.total)
    for hit in response.hits:
        result = {}
        result['id'] = hit.meta.id
        result['score'] = hit.meta.score
        result['title'] = hit['title']
        result['summary'] = hit['summary'][:180]
        result['url'] = 'www.indeed.com' + hit['url']
        result['company'] = hit['company']
        result['location'] = hit['location']
        result['postingdate'] = str(datetime.datetime.fromordinal(hit['date']))
        resultlist.append(result)

    return resultlist
Example #26
def index(genre=None):
    # Get index first page items
    page = 1
    if request.args.get('page'):
        page = int(request.args.get('page'))
    genre = request.args.get('genre')
    sort = request.args.get('sort')
    search = request.args.get('search')
    print(genre)
    print(page)
    print(search)
    print(sort)
    s = Search(using=es)
    s = s.index('imdb')
    s = s.source(includes=['vote', 'title', 'poster', '_id'])
    s = s.query(Q('match_all'))
    if genre:
        s = s.query('bool', filter=[Q('term', genres=genre)])
    if sort:
        s = s.sort(sort)
    if search:
        s = s.query(
            Q('multi_match',
              query=search,
              fields=['title', 'summary', 'casts', 'creators',
                      'genres'])).extra(size=8)
    s = s[(page - 1) * PAGE_SIZE:page * PAGE_SIZE]
    ret = s.execute()
    logger.debug(ret)
    movies = get_movies(ret.hits)
    max_page = int(ret.hits.total / PAGE_SIZE) + 1
    return get_list_result(movies,
                           max_page,
                           page=page,
                           genre=genre,
                           sort=sort,
                           search=search)
Example #27
    def set_search(self, search: Search) -> Search:
        search_helper = SearchHelper(self.dataset.type)

        search = search.index(self.dataset.index).filter(
            search_helper.query_lang_term(self._lang.value))
        for cooccur_word in self.cooccur_words:
            search = search.filter(
                search_helper.query_text_tokens_term(cooccur_word))

        if self.dataset.type == DatasetType.NASTY:
            if self.search_filter:
                search = search.filter(
                    search_helper.query_nasty_filter_term(
                        self.search_filter.name))
            if self.search_query:
                search = search.filter(
                    search_helper.query_nasty_query_term(self.search_query))
            if self.user_verified:
                search = search.filter(
                    search_helper.query_nasty_user_verified_term(
                        self.user_verified))

        elif (self.dataset.type == DatasetType.NEWS_CSV
              or self.dataset.type == DatasetType.MAXQDA_CODED_NEWS_CSV):
            if self.url_netloc:
                search = search.filter(
                    search_helper.query_news_csv_url_netloc_term(
                        self.url_netloc))

        if (self.dataset.type == DatasetType.MAXQDA_CODED_NASTY
                or self.dataset.type == DatasetType.MAXQDA_CODED_NEWS_CSV):
            search = search.filter(
                search_helper.query_maxqda_coded_code_identifier_terms(
                    list(self.code_identifier)))

        return search
Example #28
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (field_data['namespace'],
                                  field_data['in_database_name'])

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, str) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (operator_wildcards[param.operator] %
                                    param.value)
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            self._add_second_level_aggs(
                param,
                search.aggs,
                facets_size,
                histogram_intervals,
            )

        # Create sub-aggregations.
        for key in params:
            if not key.startswith('_aggs.'):
                continue

            fields = key.split('.')[1:]

            if fields[0] not in self.all_fields:
                continue

            base_bucket = self._get_fields_agg(fields[0], facets_size)
            sub_bucket = base_bucket

            for field in fields[1:]:
                # For each field, make a bucket, then include that bucket in
                # the latest one, and then make that new bucket the latest.
                if field in self.all_fields:
                    tmp_bucket = self._get_fields_agg(field, facets_size)
                    sub_bucket.bucket(field, tmp_bucket)
                    sub_bucket = tmp_bucket

            for value in params[key]:
                self._add_second_level_aggs(
                    value,
                    sub_bucket,
                    facets_size,
                    histogram_intervals,
                )

            search.aggs.bucket(fields[0], base_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            key = '_histogram.%s' % f
            if params.get(key):
                histogram_bucket = self._get_histogram_agg(
                    f, histogram_intervals)

                for param in params[key]:
                    self._add_second_level_aggs(
                        param,
                        histogram_bucket,
                        facets_size,
                        histogram_intervals,
                    )

                search.aggs.bucket('histogram_%s' % f, histogram_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
Example #29
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, str) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    if value.startswith('_histogram.'):
                        # This is a histogram aggregation we want to run,
                        # not a terms aggregation.
                        field_name = value[len('_histogram.'):]
                        if field_name not in self.histogram_fields:
                            continue

                        histogram_type = (
                            self.all_fields[field_name]['query_type'] == 'date'
                            and 'date_histogram' or 'histogram'
                        )
                        sig_bucket.bucket(
                            'histogram_%s' % field_name,
                            histogram_type,
                            field=self.get_field_name(field_name),
                            interval=histogram_intervals[field_name],
                        )
                    else:
                        sig_bucket.bucket(
                            value,
                            'terms',
                            field=self.get_field_name(value),
                            size=facets_size,
                        )

            search.aggs.bucket('signature', sig_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            if params.get('_histogram.%s' % f):
                histogram_type = (
                    self.all_fields[f]['query_type'] == 'date'
                    and 'date_histogram' or 'histogram'
                )
                date_bucket = A(
                    histogram_type,
                    field=self.get_field_name(f),
                    interval=histogram_intervals[f],
                )
                for param in params['_histogram.%s' % f]:
                    for value in param.value:
                        if not value:
                            continue

                        field_name = self.get_field_name(value)
                        val_bucket = A(
                            'terms',
                            field=field_name,
                            size=facets_size,
                        )
                        date_bucket.bucket(value, val_bucket)

                search.aggs.bucket('histogram_%s' % f, date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
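The retry loop above is worth seeing in isolation. Below is a minimal, self-contained sketch of the same retry-on-missing-index pattern, assuming elasticsearch-dsl; the regex is a stand-in for the source's BAD_INDEX_REGEX, whose exact pattern is not shown here.
import re

from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Search

MISSING_INDEX_RE = re.compile(r'no such index \[(.+?)\]')  # assumption


def execute_pruning_missing_indices(client, indices, query):
    search = Search(using=client, index=indices).query(query)
    while indices:
        try:
            return search.execute()
        except NotFoundError as e:
            match = MISSING_INDEX_RE.search(str(e.error))
            if not match or match.group(1) not in indices:
                raise  # not a missing-index error we can recover from
            indices.remove(match.group(1))
            # Calling .index() with no argument clears the index list,
            # so the pruned list fully replaces the old one.
            search = search.index().index(*indices)
    return None  # every candidate index was missing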
Example #30
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get("_fields"):
            raise MissingArgumentError("_fields")
        self.all_fields = kwargs["_fields"]

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params["date"])

        if "%" in self.context.get_index_template():
            # If the index template is date-centric, remove indices before the retention
            # policy because they're not valid to search through and probably don't
            # exist
            policy = datetime.timedelta(
                weeks=self.context.get_retention_policy())
            template = self.context.get_index_template()
            indices = prune_invalid_indices(indices, policy, template)

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.context.get_doctype(),
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith("_"):
                    # By default, all param values are turned into lists,
                    # even when they can only ever have one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == "_results_offset":
                        results_from = param.value[0]
                    elif param.name == "_results_number":
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                "_results_number",
                                msg=("_results_number cannot be greater "
                                     "than 1,000"),
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                "_results_number",
                                msg="_results_number cannot be negative",
                            )
                    elif param.name == "_facets_size":
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because if the query is covering a lot of different
                        # things you can get a really really large query
                        # which can hog resources excessively.
                        # Downloading, as an example, 100k facets (and 0 hits)
                        # when there is plenty of data yields an 11MB JSON
                        # file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                "_facets_size greater than 10,000")

                    for f in self.histogram_fields:
                        if param.name == "_histogram_interval.%s" % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ("date", "datetime"):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == "enum":
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == "str" and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    "~": "*%s*",  # contains
                    "^": "%s*",  # starts with
                    "$": "*%s",  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    ">": "gt",
                    "<": "lt",
                    ">=": "gte",
                    "<=": "lte"
                }

                args = {}
                filter_type = "term"
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, str) or " " not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = "query"
                            args = Q(
                                "simple_query_string",
                                query=param.value[0],
                                fields=[name],
                                default_operator="and",
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = "terms"
                        filter_value = param.value
                elif param.operator == "=":
                    # is exactly
                    if field_data["has_full_version"]:
                        name = "%s.full" % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = "range"
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == "__null__":
                    filter_type = "missing"
                    args["field"] = name
                elif param.operator == "__true__":
                    filter_type = "term"
                    filter_value = True
                elif param.operator == "@":
                    filter_type = "regexp"
                    if field_data["has_full_version"]:
                        name = "%s.full" % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = "query"

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data["has_full_version"]:
                        name = "%s.full" % name

                    q_args = {}
                    q_args[name] = operator_wildcards[
                        param.operator] % param.value
                    query = Q("wildcard", **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == "range":
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F("bool", must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not aliases for example.
        self.request_columns = []
        for param in params["_columns"]:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params["_sort"]:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith("-"):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = "-" + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(params, search, facets_size,
                                      histogram_intervals)

        # Query and compute results.
        hits = []

        if params["_return_query"][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {"query": search.to_dict(), "indices": indices}

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, "aggregations", {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, "_shards", {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                errors.append({
                    "type": "missing_index",
                    "index": missing_index
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error)[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass

                # Re-raise the original exception
                raise

        if shards and shards.failed:
            # Some shards failed. We want to explain what happened in the
            # results, so the client can decide what to do.
            failed_indices = defaultdict(int)
            for failure in shards.failures:
                failed_indices[failure.index] += 1

            for index, shards_count in failed_indices.items():
                errors.append({
                    "type": "shards",
                    "index": index,
                    "shards_count": shards_count
                })

        return {
            "hits": hits,
            "total": total,
            "facets": aggregations,
            "errors": errors
        }
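For reference, a minimal sketch of the terms-plus-date-histogram aggregation shape that _create_aggregations builds in the example above, assuming elasticsearch-dsl; the index and field names here are illustrative only.
from elasticsearch_dsl import A, Search

search = Search(index='crash-reports')  # hypothetical index name
sig_bucket = A('terms', field='signature', size=50)
# Older Elasticsearch releases accept 'interval'; 7.x+ prefer
# 'calendar_interval' for date histograms.
sig_bucket.bucket('histogram_date', 'date_histogram',
                  field='date', interval='1d')
search.aggs.bucket('signature', sig_bucket)
print(search.to_dict())  # inspect the aggregation body before sending it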
Example #31
0
def test_events_process(script_info, event_queues, es_with_templates):
    """Test "events process" CLI command."""
    es = es_with_templates
    search = Search(using=es)
    runner = CliRunner()

    # Invalid argument
    result = runner.invoke(
        stats, ['events', 'process', 'invalid-event-type', '--eager'],
        obj=script_info)
    assert result.exit_code == 2
    assert 'Invalid event type(s):' in result.output

    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1, 10), (2018, 1, 1, 12), (2018, 1, 1, 14)]
    ])
    current_stats.publish('record-view', [
        _create_record_view_event(date)
        for date in [(2018, 1, 1, 10), (2018, 1, 1, 12), (2018, 1, 1, 14)]
    ])

    result = runner.invoke(stats,
                           ['events', 'process', 'file-download', '--eager'],
                           obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')

    assert search.index('events-stats-file-download-2018-01-01').count() == 3
    assert search.index('events-stats-file-download').count() == 3
    assert not es.indices.exists('events-stats-record-view-2018-01-01')
    assert not es.indices.exists_alias(name='events-stats-record-view')

    result = runner.invoke(stats,
                           ['events', 'process', 'record-view', '--eager'],
                           obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')
    assert search.index('events-stats-file-download-2018-01-01').count() == 3
    assert search.index('events-stats-file-download').count() == 3
    assert search.index('events-stats-record-view-2018-01-01').count() == 3
    assert search.index('events-stats-record-view').count() == 3

    # Create some more events
    current_stats.publish('file-download',
                          [_create_file_download_event((2018, 2, 1, 12))])
    current_stats.publish('record-view',
                          [_create_record_view_event((2018, 2, 1, 10))])

    # Process all event types via a celery task
    result = runner.invoke(stats, ['events', 'process'], obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')
    assert search.index('events-stats-file-download-2018-01-01').count() == 3
    assert search.index('events-stats-file-download-2018-02-01').count() == 1
    assert search.index('events-stats-file-download').count() == 4
    assert search.index('events-stats-record-view-2018-01-01').count() == 3
    assert search.index('events-stats-record-view-2018-02-01').count() == 1
    assert search.index('events-stats-record-view').count() == 4
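The assertions above lean on one idiom: Search.index() returns a retargeted clone, so a single Search object can count a daily index and then its rolling alias. A small sketch, assuming elasticsearch-dsl, a reachable cluster, and the index names used in the test:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

es = Elasticsearch()
search = Search(using=es)

# Count documents in one daily index, then through the rolling alias;
# the alias should cover at least the daily index.
daily = search.index('events-stats-file-download-2018-01-01').count()
total = search.index('events-stats-file-download').count()
assert daily <= total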
Example #32
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, str) or ' ' not in val:
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't restrict on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot return it' % value
                    )

                if not field_['is_returned']:
                    # Returning this field is not allowed.
                    raise BadArgumentError(
                        value,
                        msg='Field "%s" is not allowed to be returned' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't sort on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot sort on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=self.config.facets_max_number
                )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
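The F filter class used in this example predates elasticsearch-dsl 2.0, which folded filters into Q objects passed to Search.filter(). A rough modern equivalent of combining the per-field sub-filters, with illustrative field names:
from elasticsearch_dsl import Q, Search

sub_filters = Q('terms', product=['firefox', 'thunderbird'])
sub_filters &= Q('range', date={'gte': '2018-01-01'})
search = Search(index='crashes').filter(sub_filters)  # hypothetical index
print(search.to_dict())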
Example #33
0
def generalSearch(search):
    global sortedresult
    print(search)
    search['offset'] = int(search['offset'])
    if search['sort_by_date'] and search['offset'] > 0 and len(
            sortedresult) > 0:
        print(sortedresult)
        return sortedresult[search['offset']:search['offset'] + 10]
    s = Search(using=es)
    s = s.index('job_index')
    s = s.query(Q('match_all'))

    # title
    if 'jobtitle' in search:
        s = s.query('multi_match',
                    query=search['jobtitle'],
                    type='cross_fields',
                    fields=['title', 'summary'],
                    operator='and')

    # job description
    if 'description' in search or 'jobtitle' in search:
        summary = ""
        if 'jobtitle' in search:
            summary += search["jobtitle"]
            if 'description' in search:
                summary += " " + search['description']
        else:
            summary = search['description']
        s = s.query('match', summary=summary)

    # company
    if 'company' in search:
        s = s.query('match', company=search['company'])

    # location
    if 'state' in search:
        s = s.query('match_phrase', state=search['state'])
    if 'city' in search:
        s = s.query('match', city=search['city'])

    # jobtype
    if 'type' in search:
        s = s.query('match', jobtype=search['type'])

    # salary
    if 'salary' in search:
        search['salary'] = int(search['salary'])
        s = s.query('range', salary={'gte': search['salary']})

    # date
    if 'date' in search:
        days = re.findall(r"(\d+)", search['date'])[0]
        days = int(days)
        today = datetime.datetime.now().toordinal()
        s = s.query('range', date={'gte': today - days})

    pp = pprint.PrettyPrinter(depth=6)
    pp.pprint(s.to_dict())

    if search['sort_by_date']:
        s = s[0:3000]
        response = s.execute()
        resultlist = []
        print(response.hits.total)
        print(len(response.hits))
        for hit in response.hits:
            result = {}
            result['id'] = hit.meta.id
            result['score'] = hit.meta.score
            result['title'] = hit['title']
            result['summary'] = hit['summary'][:180]
            result['url'] = 'www.indeed.com' + hit['url']
            result['company'] = hit['company']
            result['location'] = hit['location']
            result['postingdate'] = str(
                datetime.datetime.fromordinal(hit['date']))
            resultlist.append(result)
        sortedresult = sorted(resultlist,
                              key=lambda d: d['postingdate'],
                              reverse=True)
        return sortedresult[search['offset']:search['offset'] + 10]
    else:
        s = s[search['offset']:search['offset'] + 10]
        response = s.execute()

        resultlist = []
        print(response.hits.total)
        for hit in response.hits:
            result = {}
            result['id'] = hit.meta.id
            result['score'] = hit.meta.score
            result['title'] = hit['title']
            result['summary'] = hit['summary'][:180]
            result['url'] = 'www.indeed.com' + hit['url']
            result['company'] = hit['company']
            result['location'] = hit['location']
            result['postingdate'] = str(
                datetime.datetime.fromordinal(hit['date']))
            resultlist.append(result)

        return resultlist
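generalSearch pages by date by pulling up to 3000 hits and sorting them client-side; Elasticsearch can sort and paginate server-side instead. A minimal sketch, assuming the same job_index mapping with an ordinal 'date' field:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q, Search

es = Elasticsearch()
s = (Search(using=es)
     .index('job_index')
     .query(Q('match', company='acme'))
     .sort('-date'))  # newest postings first
s = s[0:10]  # first page of 10 hits
for hit in s.execute():
    print(hit.meta.id, hit.title)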
Example #34
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they can only ever have one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=(
                                    '_results_number cannot be greater '
                                    'than 1,000'
                                )
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative'
                            )
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because if the query is covering a lot of different
                        # things you can get a really really large query
                        # which can hog resources excessively.
                        # Downloading, as an example, 100k facets (and 0 hits)
                        # when there is plenty of data yields an 11MB JSON
                        # file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000'
                            )

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, str) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not aliases for example.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(
                params,
                search,
                facets_size,
                histogram_intervals
            )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error
                    )[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass
                raise
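A compact, standalone version of the operator-to-wildcard translation these examples repeat, assuming elasticsearch-dsl's Q helper; the fallback to an exact term query is an illustrative choice, not part of the original.
from elasticsearch_dsl import Q

OPERATOR_WILDCARDS = {
    '~': '*%s*',  # contains
    '^': '%s*',   # starts with
    '$': '*%s',   # ends with
}


def wildcard_query(field, operator, value):
    # Fall back to an exact term query for operators without wildcards.
    if operator in OPERATOR_WILDCARDS:
        return Q('wildcard', **{field: OPERATOR_WILDCARDS[operator] % value})
    return Q('term', **{field: value})


print(wildcard_query('signature.full', '~', 'OOM').to_dict())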
Example #35
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they can only ever have one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=('_results_number cannot be greater '
                                     'than 1,000'))
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because if the query is covering a lot of different
                        # things you can get a really really large query
                        # which can hog resources excessively.
                        # Downloading, as an example, 100k facets (and 0 hits)
                        # when there is plenty of data yields an 11MB JSON
                        # file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000')

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, str) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (operator_wildcards[param.operator] %
                                    param.value)
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not aliases for example.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(params, search, facets_size,
                                      histogram_intervals)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
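                # .count() issues a separate count request with the same
                # query, so it is the total number of matching documents,
                # not just the size of the current page.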

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    indices.remove(missing_index)
                else:
                    # Wait, what? An error caused by an index that was not
                    # in the request? That should never happen, but if it
                    # does, we want to know about it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: calling .index() with no argument first clears the
                    # search's index list; without that step, the removed
                    # index would still be part of the query.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error)[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # The error message is not an Elasticsearch parse
                    # exception, so there is no bad input to report.
                    pass
                raise
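
The retry loop above is the heart of this example: the list of date-based
indices is computed optimistically, so any index that turns out to be missing
from the cluster is pruned and the search is re-run. A minimal standalone
sketch of that pattern, assuming a hypothetical BAD_INDEX_REGEX that captures
the missing index name from the error message (the real format depends on the
Elasticsearch version):

import re

from elasticsearch import Elasticsearch
from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Q, Search

# Hypothetical pattern; real clusters may word the error differently.
BAD_INDEX_REGEX = re.compile(r'no such index \[([^\]]+)\]')


def search_with_index_pruning(client, indices, query):
    """Run `query` against `indices`, dropping any index that does
    not exist until the search succeeds or no index is left."""
    indices = list(indices)
    while indices:
        search = Search(using=client, index=indices).query(query)
        try:
            return list(search.execute())
        except NotFoundError as e:
            match = BAD_INDEX_REGEX.search(str(e.error))
            if not match or match.group(1) not in indices:
                raise  # an unrelated failure, do not swallow it
            indices.remove(match.group(1))
    return []  # every candidate index was missing


# Usage sketch:
# client = Elasticsearch('http://localhost:9200')
# hits = search_with_index_pruning(
#     client, ['logs-2018.01', 'logs-2018.02'], Q('match', product='Firefox'))
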
Example #36
0
            #                                 folder in folders]
        elif query in search_query_list:
            setattr(
                test_model,
                count_query_map[search_query_list.index(query)],
                format_es_hits_total(res.hits.total))
        else:
            print(query in search_query_list)
    recent_article_count = sorted(date_count_res, key=lambda k: k.date)
    recent_publish_count = sorted(date_published_res, key=lambda k: k.date)
    today_article_count = recent_article_count[-1].count
    today_publish_count = recent_publish_count[-1].count
    stats_model = TeamArticleStatsModel(
        today_article_count=today_article_count,
        today_publish_count=today_publish_count,
        recent_publish_count=recent_publish_count,
        recent_article_count=recent_article_count,
        total_article_count=total_article_count,
        folder_article_count=folder_article_count)

    print(test_model)
    return stats_model


if __name__ == '__main__':
    query = Search(
        using=Elasticsearch("http://elasticsearch.kube-system:9200"))
    res = query.index('bees_articles_stcn').execute()
    print(isinstance(res.hits.total, AttrDict))
    print(get_article_status_count_by_es())
    print(get_team_article_stats_from_es(2, 7))
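
Example #36 relies on a format_es_hits_total helper that the fragment does not
show. Its role can be inferred from the isinstance(res.hits.total, AttrDict)
check above: Elasticsearch 7 reports hits.total as an object such as
{'value': 10, 'relation': 'eq'}, while older versions report a bare integer.
A plausible sketch of such a helper, assuming only those two shapes (the real
implementation may differ):

from elasticsearch_dsl.utils import AttrDict


def format_es_hits_total(total):
    """Normalize hits.total to a plain int across Elasticsearch versions."""
    # ES >= 7: {'value': N, 'relation': 'eq'}, surfaced as an AttrDict
    # by elasticsearch_dsl.
    if isinstance(total, (AttrDict, dict)):
        return int(total['value'])
    # ES < 7: already a bare integer.
    return int(total)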