def get_entities_by_sport_and_query(self, sport, query):
    search = Search(using=self.es)
    search = search[0:5]
    if sport == Sport.SOCCER:
        search = search.index('soccer-entity')
    elif sport == Sport.BASKETBALL:
        search = search.index('basketball-entity')
    if query:
        query = '*{}*'.format(query)
        search = search.query(
            QueryString(query=query, fields=['name^5', 'abstract']))
    hits = []
    for hit in search.execute():
        id = hit.meta['id']
        hit = hit.to_dict()
        entity = {'id': id, 'name': hit['name']}
        if 'abstract' in hit:
            entity['abstract'] = hit['abstract']
        else:
            entity['abstract'] = 'None'
        if 'type' in hit:
            entity['type'] = hit['type']
        else:
            entity['type'] = 'None'
        hits.append(entity)
    return hits
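# Hedged usage sketch (not from the original source): `es_client` and
# `find_soccer_entities` are made-up names; the 'soccer-entity' index and the
# 'name'/'abstract' fields follow the helper above. It shows the same
# wildcard QueryString search built standalone, limited to the top five hits.
from elasticsearch_dsl import Search
from elasticsearch_dsl.query import QueryString

def find_soccer_entities(es_client, text):
    search = Search(using=es_client).index('soccer-entity')[0:5]
    if text:
        search = search.query(
            QueryString(query='*{}*'.format(text),
                        fields=['name^5', 'abstract']))
    return [hit.to_dict() for hit in search.execute()]

# Example call, assuming an existing Elasticsearch client object:
# entities = find_soccer_entities(es_client, 'united')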
def test_basic_stats(app, db, es, locations, event_queues, minimal_record): """Test basic statistics results.""" search = Search(using=es) records = create_stats_fixtures( # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files metadata=minimal_record, n_records=10, n_versions=2, n_files=3, event_data={'user_id': '1'}, # 4 event timestamps start_date=datetime(2018, 1, 1, 13), end_date=datetime(2018, 1, 1, 15), interval=timedelta(minutes=30)) # Events indices prefix = app.config['SEARCH_INDEX_PREFIX'] # 2 versions * 10 records * 3 files * 4 events -> 240 assert search.index(prefix + 'events-stats-file-download').count() == 240 # 2 versions * 10 records * 4 events -> 80 assert search.index(prefix + 'events-stats-record-view').count() == 80 # Aggregations indices # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks # 30d assert search.index(prefix + 'stats-file-download').count() == 30 # 30d assert search.index(prefix + 'stats-record-view').count() == 30 # 2bm + 2bm assert search.index(prefix + 'stats-bookmarks').count() == 4 # Records index for _, record, _ in records: doc = \ current_search_client.get( index=build_alias_name('records'), id=str(record.id), params={'_source_includes': '_stats'} ) assert doc['_source']['_stats'] == { # 4 view events 'views': 4.0, 'version_views': 8.0, # 4 view events over 2 different hours 'unique_views': 2.0, 'version_unique_views': 2.0, # 4 download events * 3 files 'downloads': 12.0, 'version_downloads': 24.0, # 4 download events * 3 files over 2 different hours 'unique_downloads': 2.0, 'version_unique_downloads': 2.0, # 4 download events * 3 files * 10 bytes 'volume': 120.0, 'version_volume': 240.0, }
def test_large_stats(app, db, es, locations, event_queues, minimal_record): """Test a larger number of events, aggregations, and results.""" search = Search(using=es) records = create_stats_fixtures( # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files metadata=minimal_record, n_records=3, n_versions=4, n_files=2, event_data={'user_id': '1'}, # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day) start_date=datetime(2018, 3, 1), end_date=datetime(2018, 5, 1), interval=timedelta(hours=12)) # Events indices prefix = app.config['SEARCH_INDEX_PREFIX'] # 4 versions * 3 records * 2 files * 122 events -> 2928 assert search.index(prefix + 'events-stats-file-download').count() == 2928 # 4 versions * 3 records * 122 events -> 1464 assert search.index(prefix + 'events-stats-record-view').count() == 1464 # Aggregations indices # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks q = search.index(prefix + 'stats-file-download') q = q.doc_type('file-download-day-aggregation') assert q.count() == 915 # 61 days * 15 records q = search.index(prefix + 'stats-record-view') q = q.doc_type('record-view-day-aggregation') assert q.count() == 915 # 61 days * 15 records # Records index for _, record, _ in records: doc = \ current_search_client.get( index=build_alias_name('records'), id=str(record.id), params={'_source_includes': '_stats'} ) assert doc['_source']['_stats'] == { # 4 view events 'views': 122.0, 'version_views': 488.0, # 4 view events over 2 different hours 'unique_views': 122.0, 'version_unique_views': 122.0, # 4 download events * 3 files 'downloads': 244.0, 'version_downloads': 976.0, # 4 download events * 3 files over 2 different hours 'unique_downloads': 122.0, 'version_unique_downloads': 122.0, # 4 download events * 3 files * 10 bytes 'volume': 2440.0, 'version_volume': 9760.0, }
def test_aggregations_process(script_info, event_queues, es, indexed_events): """Test "aggregations process" CLI command.""" search = Search(using=es) runner = CliRunner() # Invalid argument result = runner.invoke( stats, ['aggregations', 'process', 'invalid-aggr-type', '--eager'], obj=script_info) assert result.exit_code == 2 assert 'Invalid aggregation type(s):' in result.output result = runner.invoke(stats, [ 'aggregations', 'process', 'file-download-agg', '--start-date=2018-01-01', '--end-date=2018-01-10', '--eager' ], obj=script_info) assert result.exit_code == 0 agg_alias = search.index('stats-file-download') es.indices.refresh(index='*') assert agg_alias.count() == 10 assert agg_alias.doc_type('file-download-agg-bookmark').count() == 0 assert agg_alias.doc_type('file-download-day-aggregation').count() == 10 assert search.index('stats-file-download-2018-01').count() == 10 # Run again over same period, but update the bookmark result = runner.invoke(stats, [ 'aggregations', 'process', 'file-download-agg', '--start-date=2018-01-01', '--end-date=2018-01-10', '--eager', '--update-bookmark' ], obj=script_info) assert result.exit_code == 0 es.indices.refresh(index='*') assert agg_alias.count() == 12 assert agg_alias.doc_type('file-download-agg-bookmark').count() == 2 assert agg_alias.doc_type('file-download-day-aggregation').count() == 10 assert search.index('stats-file-download-2018-01').count() == 12 # Run over all the events via celery task result = runner.invoke( stats, ['aggregations', 'process', 'file-download-agg', '--update-bookmark'], obj=script_info) assert result.exit_code == 0 es.indices.refresh(index='*') assert agg_alias.count() == 54 assert agg_alias.doc_type('file-download-agg-bookmark').count() == 8 assert agg_alias.doc_type('file-download-day-aggregation').count() == 46 assert search.index('stats-file-download-2018-01').count() == 36 assert search.index('stats-file-download-2018-02').count() == 18
def test_large_stats(app, db, es, locations, event_queues, minimal_record): """Test record page view event import.""" search = Search(using=es) records = create_stats_fixtures( # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files metadata=minimal_record, n_records=3, n_versions=4, n_files=2, event_data={'user_id': '1'}, # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day) start_date=datetime(2018, 3, 1), end_date=datetime(2018, 5, 1), interval=timedelta(hours=12)) # Events indices # 4 versions * 3 records * 2 files * 122 events -> 2928 assert search.index('events-stats-file-download').count() == 2928 # 4 versions * 3 records * 122 events -> 1464 assert search.index('events-stats-record-view').count() == 1464 # Aggregations indices # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks q = search.index('stats-file-download') q = q.doc_type('file-download-day-aggregation') assert q.count() == 915 # 61 days * 15 records q = search.index('stats-record-view') q = q.doc_type('record-view-day-aggregation') assert q.count() == 915 # 61 days * 15 records # Reords index for _, record, _ in records: doc = (RecordsSearch().get_record( record.id).source(include='_stats').execute()[0]) assert doc['_stats'] == { # 4 view events 'views': 122.0, 'version_views': 488.0, # 4 view events over 2 different hours 'unique_views': 122.0, 'version_unique_views': 122.0, # 4 download events * 3 files 'downloads': 244.0, 'version_downloads': 976.0, # 4 download events * 3 files over 2 different hours 'unique_downloads': 122.0, 'version_unique_downloads': 122.0, # 4 download events * 3 files * 10 bytes 'volume': 2440.0, 'version_volume': 9760.0, }
def movie_page(mid):
    s = Search(using=es)
    s = s.index('imdb')
    s = s.filter('term', _id=mid)
    ret = s.execute()
    return render_template('single.html',
                           movie=get_movie_detail(ret.hits[0].to_dict()))
def get_genre_agg():
    s = Search(using=es)
    s = s.index('imdb')
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    # logger.debug('genre agg is %s', json.dumps(ret.aggs.to_dict(), indent=2))
    return [x['key'] for x in ret.aggs.to_dict()['genres']['buckets']]
def search(self, doc_type, query=""):
    """
    Execute search query and retrieve results

    :param doc_type: Type in ElasticSearch
    :param query: search query
    :return: list with results
    """
    results = []
    if isinstance(query, str) and type(doc_type) == DocTypeMeta:
        q = Q("multi_match", query=query.lower(), fields=["title"])
        s = Search()
        s = s.using(self.client)
        s = s.index(self.index_name)
        s = s.doc_type(doc_type)
        s = s.query(q)
        print("search query: " + str(s.to_dict()))
        response = s.execute()
        for resp in response:
            results.append(resp)
    return results
def get_genre_agg():
    s = Search(using=es)
    s = s.index('imdb')
    s.aggs.bucket('genres', A('terms', field='genres'))
    ret = s.execute()
    return [
        x['key']
        for x in ret.to_dict()['aggregations']['genres']['buckets']
    ]
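# Hedged alternative sketch (not from the original sources): elasticsearch-dsl
# also exposes aggregation results as attributes on the response object, so
# the bucket keys can be read without converting to a dict. The `es` client
# and the 'imdb' index / 'genres' field are taken from the snippets above.
from elasticsearch_dsl import A, Search

def get_genre_agg_via_attrs(es):
    s = Search(using=es).index('imdb').extra(size=0)  # aggregations only, no hits
    s.aggs.bucket('genres', A('terms', field='genres'))
    response = s.execute()
    return [bucket.key for bucket in response.aggregations.genres.buckets]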
def test_failing_processors(app, es, event_queues, caplog): """Test events that raise an exception when processed.""" search = Search(using=es) current_queues.declare() current_stats.publish('file-download', [ _create_file_download_event(date) for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)] ]) def _raises_on_second_call(doc): if _raises_on_second_call.calls == 1: _raises_on_second_call.calls += 1 raise Exception('mocked-exception') _raises_on_second_call.calls += 1 return doc _raises_on_second_call.calls = 0 queue = current_queues.queues['stats-file-download'] indexer = EventsIndexer(queue, preprocessors=[_raises_on_second_call]) current_search.flush_and_refresh(index='*') assert get_queue_size('stats-file-download') == 4 assert not es.indices.exists('events-stats-file-download-2018-01-01') assert not es.indices.exists('events-stats-file-download-2018-01-02') assert not es.indices.exists('events-stats-file-download-2018-01-03') assert not es.indices.exists('events-stats-file-download-2018-01-04') assert not es.indices.exists_alias(name='events-stats-file-download') with caplog.at_level(logging.ERROR): indexer.run() # 2nd event raises exception and is dropped # Check that the error was logged error_logs = [r for r in caplog.records if r.levelno == logging.ERROR] assert len(error_logs) == 1 assert error_logs[0].msg == 'Error while processing event' assert error_logs[0].exc_info[1].args[0] == 'mocked-exception' current_search.flush_and_refresh(index='*') assert get_queue_size('stats-file-download') == 0 assert search.index('events-stats-file-download').count() == 3 assert search.index('events-stats-file-download-2018-01-01').count() == 1 assert not es.indices.exists('events-stats-file-download-2018-01-02') assert search.index('events-stats-file-download-2018-01-03').count() == 1 assert search.index('events-stats-file-download-2018-01-04').count() == 1
def test_basic_stats(app, db, es, locations, event_queues, minimal_record): """Test basic statistics results.""" search = Search(using=es) records = create_stats_fixtures( # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files metadata=minimal_record, n_records=10, n_versions=2, n_files=3, event_data={'user_id': '1'}, # 4 event timestamps start_date=datetime(2018, 1, 1, 13), end_date=datetime(2018, 1, 1, 15), interval=timedelta(minutes=30)) # Events indices # 2 versions * 10 records * 3 files * 4 events -> 240 assert search.index('events-stats-file-download').count() == 240 # 2 versions * 10 records * 4 events -> 80 assert search.index('events-stats-record-view').count() == 80 # Aggregations indices # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks assert search.index('stats-file-download').count() == 32 # 2bm + 30d assert search.index('stats-record-view').count() == 32 # 2bm + 30d # Reords index for _, record, _ in records: doc = (RecordsSearch().get_record( record.id).source(include='_stats').execute()[0]) assert doc['_stats'] == { # 4 view events 'views': 4.0, 'version_views': 8.0, # 4 view events over 2 different hours 'unique_views': 2.0, 'version_unique_views': 2.0, # 4 download events * 3 files 'downloads': 12.0, 'version_downloads': 24.0, # 4 download events * 3 files over 2 different hours 'unique_downloads': 2.0, 'version_unique_downloads': 2.0, # 4 download events * 3 files * 10 bytes 'volume': 120.0, 'version_volume': 240.0, }
def recommendationSearch(search):
    s = Search(using=es)
    s = s.index('job_index')
    search['offset'] = int(search['offset'])
    condition = []
    # location
    if 'state' in search:
        qState = Q('match_phrase', state=search['state'])
        condition.append(qState)
    if 'city' in search:
        qCity = Q('match', city=search['city'])
        condition.append(qCity)
    # professional & education background
    if 'pbg' in search or 'degree' in search or 'major' in search:
        qBG = Q('multi_match',
                query=search['pbg'] + ' ' + str(search['degree']) + ' ' +
                str(search['major']),
                type='cross_fields',
                fields=['title', 'summary'])
        condition.append(qBG)
    # jobtype
    if 'type' in search:
        qType = Q('match', jobtype=search['type'])
        condition.append(qType)
    # salary
    if 'salary' in search:
        search['salary'] = int(search['salary'])
        qSalary = Q('range', salary={'gte': search['salary']})
        condition.append(qSalary)
    q = Q('bool', should=condition, minimum_should_match=1)
    s = s.query(q)
    s = s[search['offset']:search['offset'] + 10]
    pp = pprint.PrettyPrinter(depth=6)
    pp.pprint(s.to_dict())
    response = s.execute()
    resultlist = []
    print(response.hits.total)
    for hit in response.hits:
        result = {}
        result['id'] = hit.meta.id
        result['score'] = hit.meta.score
        result['title'] = hit['title']
        result['summary'] = hit['summary'][:180]
        result['url'] = 'www.indeed.com' + hit['url']
        result['company'] = hit['company']
        result['location'] = hit['location']
        result['postingdate'] = str(datetime.datetime.fromordinal(hit['date']))
        resultlist.append(result)
    return resultlist
def test_large_stats(app, db, es, locations, event_queues, minimal_record): """Test record page view event import.""" search = Search(using=es) records = create_stats_fixtures( # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files metadata=minimal_record, n_records=3, n_versions=4, n_files=2, event_data={'user_id': '1'}, # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day) start_date=datetime(2018, 3, 1), end_date=datetime(2018, 5, 1), interval=timedelta(hours=12)) # Events indices # 4 versions * 3 records * 2 files * 122 events -> 2928 assert search.index('events-stats-file-download').count() == 2928 # 4 versions * 3 records * 122 events -> 1464 assert search.index('events-stats-record-view').count() == 1464 # Aggregations indices # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks q = search.index('stats-file-download') q = q.doc_type('file-download-day-aggregation') assert q.count() == 915 # 61 days * 15 records q = search.index('stats-record-view') q = q.doc_type('record-view-day-aggregation') assert q.count() == 915 # 61 days * 15 records # Reords index for _, record, _ in records: doc = ( RecordsSearch().get_record(record.id) .source(include='_stats').execute()[0]) assert doc['_stats'] == { # 4 view events 'views': 122.0, 'version_views': 488.0, # 4 view events over 2 different hours 'unique_views': 122.0, 'version_unique_views': 122.0, # 4 download events * 3 files 'downloads': 244.0, 'version_downloads': 976.0, # 4 download events * 3 files over 2 different hours 'unique_downloads': 122.0, 'version_unique_downloads': 122.0, # 4 download events * 3 files * 10 bytes 'volume': 2440.0, 'version_volume': 9760.0, }
def get_suggest(input):
    if not input:
        return None
    s = Search(using=es)
    s = s.index('imdb')
    s = s.suggest('suggestion', input, completion={'field': 'suggest'})
    s = s.source(False)
    ret = s.execute()
    results = [x['text'] for x in ret.suggest.suggestion[0]['options']]
    return jsonify(result=results)
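# Hedged sketch (assumptions: the `es` client and an 'imdb' index whose
# mapping contains a `suggest` field of type `completion`, as implied above).
# It shows the shape of a completion-suggester response: one entry per
# suggest text, each carrying an `options` list of completions.
from elasticsearch_dsl import Search

def autocomplete(es, prefix, size=5):
    s = Search(using=es).index('imdb').source(False)
    s = s.suggest('suggestion', prefix,
                  completion={'field': 'suggest', 'size': size})
    response = s.execute()
    return [option.text for option in response.suggest.suggestion[0].options]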
def test_aggregations_list_bookmarks(script_info, event_queues, es,
                                     aggregated_events):
    """Test "aggregations list-bookmarks" CLI command."""
    search = Search(using=es)
    runner = CliRunner()

    current_search.flush_and_refresh(index='*')
    agg_alias = search.index('stats-file-download')

    assert agg_alias.count() == 31
    assert search.index('bookmark-index').count() == 5
    assert agg_alias.doc_type('file-download-day-aggregation').count() == 31
    assert search.index('stats-file-download-2018-01').count() == 31

    result = runner.invoke(
        stats, ['aggregations', 'list-bookmarks', 'file-download-agg'],
        obj=script_info)
    assert result.exit_code == 0

    bookmarks_query = search.index('bookmark-index')
    bookmarks = [b.date for b in bookmarks_query.scan()]
    assert all(b in result.output for b in bookmarks)
def page_detail(id):
    try:
        # search the document based on its meta id
        s = Search(using=es)
        s = s.index('job_index')
        s = s.filter('term', _id=id)
        ret = s.execute()
        job = get_job_detail(ret.hits[0].to_dict(), id)
        return render_template('detail.html', job=job)
    except KeyError:
        return "Problem"
def common_search(self, search_obj: Search, **kwargs): assert search_obj if kwargs.get('offset'): offset = kwargs.get('offset') else: offset = 0 if kwargs.get('limit'): limit = kwargs.get('limit') else: limit = 100 search_obj = search_obj[offset:offset + limit] if kwargs.get('index'): index = kwargs.get('index') search_obj = search_obj.index(index) if kwargs.get('raw_result') is not None: raw_result = kwargs.get('raw_result') else: raw_result = False if kwargs.get('attach_id'): attach_id = kwargs.get('attach_id') else: attach_id = False if kwargs.get('with_page_info') is not None: with_page_info = kwargs.get('with_page_info') else: with_page_info = False print("\nES query:", search_obj.to_dict()) res = search_obj.execute() if raw_result is True: return res data = list() for hit in res.hits.hits: d = hit['_source'].to_dict() if attach_id: d['index'] = hit['_index'] d['doc_id'] = hit['_id'] data.append(d) if with_page_info is True: page_info = { "total": res.hits.total.value, "limit": limit, "offset": offset } return data, page_info else: return data
def test_basic_stats(app, db, es, locations, event_queues, minimal_record): """Test basic statistics results.""" search = Search(using=es) records = create_stats_fixtures( # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files metadata=minimal_record, n_records=10, n_versions=2, n_files=3, event_data={'user_id': '1'}, # 4 event timestamps start_date=datetime(2018, 1, 1, 13), end_date=datetime(2018, 1, 1, 15), interval=timedelta(minutes=30)) # Events indices # 2 versions * 10 records * 3 files * 4 events -> 240 assert search.index('events-stats-file-download').count() == 240 # 2 versions * 10 records * 4 events -> 80 assert search.index('events-stats-record-view').count() == 80 # Aggregations indices # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks assert search.index('stats-file-download').count() == 32 # 2bm + 30d assert search.index('stats-record-view').count() == 32 # 2bm + 30d # Reords index for _, record, _ in records: doc = ( RecordsSearch().get_record(record.id) .source(include='_stats').execute()[0]) assert doc['_stats'] == { # 4 view events 'views': 4.0, 'version_views': 8.0, # 4 view events over 2 different hours 'unique_views': 2.0, 'version_unique_views': 2.0, # 4 download events * 3 files 'downloads': 12.0, 'version_downloads': 24.0, # 4 download events * 3 files over 2 different hours 'unique_downloads': 2.0, 'version_unique_downloads': 2.0, # 4 download events * 3 files * 10 bytes 'volume': 120.0, 'version_volume': 240.0, }
def get_entity(self, sport, element):
    search = Search(using=self.es)
    if sport == Sport.SOCCER:
        search = search.index('soccer-entity')
    if sport == Sport.BASKETBALL:
        search = search.index('basketball-entity')
    search = search.query(Match(_id=element[0]))
    response = search.execute()
    if len(response) > 0:
        entity = {'name': response[0]['name']}
        if 'abstract' in response[0]:
            entity['abstract'] = response[0]['abstract']
        else:
            entity['abstract'] = 'None'
        if 'type' in response[0]:
            entity['type'] = response[0]['type']
        else:
            entity['type'] = 'None'
    else:
        entity = {'name': element[0], 'abstract': 'None', 'type': 'None'}
    entity['similarity'] = round(element[1], 2)
    entity['sport'] = sport.value
    return entity
def jobdetail(id):
    s = Search(using=es)
    s = s.index('job_index')
    s = s.filter('term', _id=id)
    ret = s.execute()
    hit = ret.hits[0].to_dict()
    job = {}
    job['id'] = id
    job['title'] = hit['title']
    job['summary'] = hit['summary']
    job['url'] = 'www.indeed.com' + hit['url']
    job['company'] = hit['company']
    job['location'] = hit['location']
    if hit['salary'] == '':
        job['salary'] = 'Unknown'
    else:
        job['salary'] = hit['salary']
    job['jobtype'] = hit['jobtype']
    return job
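# Hedged sketch (the `es` client and 'job_index' are taken from the snippets
# above; `get_job_hit` is a made-up name): fetching a document by identifier
# can also be expressed with an `ids` query instead of a `term` filter on
# `_id`, which avoids relying on `_id` being filterable as a regular field.
from elasticsearch_dsl import Q, Search

def get_job_hit(es, job_id):
    s = Search(using=es).index('job_index').query(Q('ids', values=[job_id]))
    response = s.execute()
    return response.hits[0].to_dict() if response.hits else None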
def get_langs_from_unlabeled_tweets(self, **kwargs):
    # TODO: we need to execute this in case the user doesn't have it enabled.
    # I can't find the
    # PUT /twitterfdl2017/_mapping/tweet
    # {
    #     "properties": {
    #         "lang": {
    #             "type": "text",
    #             "fielddata": true
    #         }
    #     }
    # }
    the_host = "http://" + kwargs["host"] + ":" + kwargs["port"]
    client = connections.create_connection(hosts=[the_host])

    s = Search(using=client, index=kwargs["index"], doc_type="tweet")
    body = {
        "size": 0,
        "aggs": {
            "distinct_lang": {
                "terms": {
                    "field": "lang",
                    "size": 1000
                }
            }
        }
    }
    s = Search.from_dict(body)
    s = s.index(kwargs["index"])
    s = s.doc_type("tweet")
    body = s.to_dict()
    t = s.execute()

    distinct_langs = []
    for item in t.aggregations.distinct_lang:
        # print(item.key, item.doc_count)
        distinct_langs.append(item.key)

    return distinct_langs
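# Hedged alternative sketch (the `host`, `port`, and `index` names follow the
# kwargs used above; `distinct_langs` is a made-up function name): the same
# "distinct languages" terms aggregation can be built with the DSL's
# aggregation API instead of a hand-written body dict, which keeps the
# connection attached to the Search object.
from elasticsearch_dsl import A, Search, connections

def distinct_langs(host, port, index):
    client = connections.create_connection(hosts=["http://%s:%s" % (host, port)])
    s = Search(using=client, index=index).extra(size=0)  # no hits, only aggs
    s.aggs.bucket('distinct_lang', A('terms', field='lang', size=1000))
    response = s.execute()
    return [bucket.key
            for bucket in response.aggregations.distinct_lang.buckets]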
def test_aggregations_delete(script_info, event_queues, es, aggregated_events): """Test "aggregations process" CLI command.""" search = Search(using=es) runner = CliRunner() current_search.flush_and_refresh(index='*') agg_alias = search.index('stats-file-download') assert agg_alias.count() == 31 assert search.index('bookmark-index').count() == 5 assert agg_alias.doc_type('file-download-day-aggregation').count() == 31 assert search.index('stats-file-download-2018-01').count() == 31 result = runner.invoke(stats, [ 'aggregations', 'delete', 'file-download-agg', '--start-date=2018-01-01', '--end-date=2018-01-10', '--yes' ], obj=script_info) assert result.exit_code == 0 current_search.flush_and_refresh(index='*') agg_alias = search.index('stats-file-download') assert agg_alias.count() == 21 assert search.index('bookmark-index').count() == 4 assert agg_alias.doc_type('file-download-day-aggregation').count() == 21 assert search.index('stats-file-download-2018-01').count() == 21 # Delete all aggregations result = runner.invoke(stats, ['aggregations', 'delete', '--yes'], obj=script_info) assert result.exit_code == 0 current_search.flush_and_refresh(index='*') agg_alias = search.index('stats-file-download') assert agg_alias.count() == 0 assert agg_alias.doc_type('file-download-agg-bookmark').count() == 0 assert agg_alias.doc_type('file-download-day-aggregation').count() == 0 assert search.index('stats-file-download-2018-01').count() == 0
def index():
    page = 1
    sort = request.args.get('sort')
    search = request.args.get('search')
    logger.debug(request.args)

    s = Search(using=es)
    s = s.index('imdb')
    s = s.source(includes=['title', 'poster', '_id'])
    s = s[(page - 1) * PAGESIZE:page * PAGESIZE]
    if search:
        s = s.query(
            Q('multi_match',
              query=search,
              fields=['title', 'summary', 'casts'])).extra(size=8)
    if sort:
        s = s.sort(sort)
    ret = s.execute()
    logger.debug(ret.hits)
    movies = get_movies(ret.hits)
    genres = get_genre_agg()
    return render_template('review.html', movies=movies, genres=genres)
def doSearch(self, body):
    try:
        client = connections.create_connection(hosts=[settings.ES_URL])
        s = Search(using=client,
                   index=settings.ES_INDEX_NAME,
                   doc_type=settings.ES_INDEX_TYPE)
        s = Search.from_dict(body)
        s = s.index(settings.ES_INDEX_NAME)
        s = s.doc_type(settings.ES_INDEX_TYPE)
        # highlight the following fields in the search result
        s = s.highlight('title')
        s = s.highlight('description')
        s = s.highlight('data_time')
        s = s.highlight('source')
        body = s.to_dict()
        response = s.execute()
    except Exception:
        return None
    return response
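# Hedged follow-up sketch (not from the original source): `settings` and its
# ES_URL/ES_INDEX_NAME values are assumed from the snippet above, and
# `titles_with_highlights` is a made-up helper. It shows one way the matched
# fragments produced by .highlight() can be read back from each hit's metadata.
from elasticsearch_dsl import Q, Search, connections

def titles_with_highlights(text):
    client = connections.create_connection(hosts=[settings.ES_URL])
    s = (Search(using=client)
         .index(settings.ES_INDEX_NAME)
         .query(Q('match', title=text))
         .highlight('title'))
    results = []
    for hit in s.execute():
        fragments = getattr(hit.meta, 'highlight', None)
        results.append({
            'title': hit.title,
            'highlighted': list(fragments.title) if fragments else [],
        })
    return results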
def companySearch(search):
    s = Search(using=es)
    search['offset'] = int(search['offset'])
    s = s.index('job_index')
    s = s.query('match_phrase', company=search['company'])
    s = s[search['offset']:search['offset'] + 10]
    response = s.execute()

    resultlist = []
    print(response.hits.total)
    for hit in response.hits:
        result = {}
        result['id'] = hit.meta.id
        result['score'] = hit.meta.score
        result['title'] = hit['title']
        result['summary'] = hit['summary'][:180]
        result['url'] = 'www.indeed.com' + hit['url']
        result['company'] = hit['company']
        result['location'] = hit['location']
        result['postingdate'] = str(datetime.datetime.fromordinal(hit['date']))
        resultlist.append(result)
    return resultlist
def index(genre=None):
    # Get index first page items
    page = 1
    if request.args.get('page'):
        page = int(request.args.get('page'))
    genre = request.args.get('genre')
    sort = request.args.get('sort')
    search = request.args.get('search')
    print(genre)
    print(page)
    print(search)
    print(sort)

    s = Search(using=es)
    s = s.index('imdb')
    s = s.source(includes=['vote', 'title', 'poster', '_id'])
    s = s.query(Q('match_all'))
    if genre:
        s = s.query('bool', filter=[Q('term', genres=genre)])
    if sort:
        s = s.sort(sort)
    if search:
        s = s.query(
            Q('multi_match',
              query=search,
              fields=['title', 'summary', 'casts', 'creators',
                      'genres'])).extra(size=8)
    s = s[(page - 1) * PAGE_SIZE:page * PAGE_SIZE]
    ret = s.execute()
    logger.debug(ret)
    movies, max_page = get_movies(ret.hits), int(ret.hits.total / PAGE_SIZE) + 1
    return get_list_result(movies,
                           max_page,
                           page=page,
                           genre=genre,
                           sort=sort,
                           search=search)
def set_search(self, search: Search) -> Search:
    search_helper = SearchHelper(self.dataset.type)
    search = search.index(self.dataset.index).filter(
        search_helper.query_lang_term(self._lang.value))

    for cooccur_word in self.cooccur_words:
        search = search.filter(
            search_helper.query_text_tokens_term(cooccur_word))

    if self.dataset.type == DatasetType.NASTY:
        if self.search_filter:
            search = search.filter(
                search_helper.query_nasty_filter_term(
                    self.search_filter.name))
        if self.search_query:
            search = search.filter(
                search_helper.query_nasty_query_term(self.search_query))
        if self.user_verified:
            search = search.filter(
                search_helper.query_nasty_user_verified_term(
                    self.user_verified))
    elif (self.dataset.type == DatasetType.NEWS_CSV
          or self.dataset.type == DatasetType.MAXQDA_CODED_NEWS_CSV):
        if self.url_netloc:
            search = search.filter(
                search_helper.query_news_csv_url_netloc_term(
                    self.url_netloc))

    if (self.dataset.type == DatasetType.MAXQDA_CODED_NASTY
            or self.dataset.type == DatasetType.MAXQDA_CODED_NEWS_CSV):
        search = search.filter(
            search_helper.query_maxqda_coded_code_identifier_terms(
                list(self.code_identifier)))

    return search
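# Hedged illustration (not part of the original class): the SearchHelper
# methods above each return an elasticsearch-dsl query object, and repeated
# .filter() calls combine them into the filter clause of a single bool query.
# A minimal standalone equivalent with plain term queries; the 'tweets' index
# and the 'lang'/'text_tokens' field names are assumptions for illustration.
from elasticsearch_dsl import Q, Search

search = (Search()
          .index('tweets')
          .filter(Q('term', lang='en'))
          .filter(Q('term', text_tokens='climate')))
# search.to_dict() now contains:
# {'query': {'bool': {'filter': [{'term': {'lang': 'en'}},
#                                {'term': {'text_tokens': 'climate'}}]}}}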
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get('_fields'): raise MissingArgumentError('_fields') self.all_fields = kwargs['_fields'] self._build_fields() # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] if results_number > 1000: raise BadArgumentError('_results_number too large') elif param.name == '_facets_size': facets_size = param.value[0] for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % (field_data['namespace'], field_data['in_database_name']) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. 
if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = (operator_wildcards[param.operator] % param.value) query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value, full=False) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: self._add_second_level_aggs( param, search.aggs, facets_size, histogram_intervals, ) # Create sub-aggregations. for key in params: if not key.startswith('_aggs.'): continue fields = key.split('.')[1:] if fields[0] not in self.all_fields: continue base_bucket = self._get_fields_agg(fields[0], facets_size) sub_bucket = base_bucket for field in fields[1:]: # For each field, make a bucket, then include that bucket in # the latest one, and then make that new bucket the latest. if field in self.all_fields: tmp_bucket = self._get_fields_agg(field, facets_size) sub_bucket.bucket(field, tmp_bucket) sub_bucket = tmp_bucket for value in params[key]: self._add_second_level_aggs( value, sub_bucket, facets_size, histogram_intervals, ) search.aggs.bucket(fields[0], base_bucket) # Create histograms. for f in self.histogram_fields: key = '_histogram.%s' % f if params.get(key): histogram_bucket = self._get_histogram_agg( f, histogram_intervals) for param in params[key]: self._add_second_level_aggs( param, histogram_bucket, facets_size, histogram_intervals, ) search.aggs.bucket('histogram_%s' % f, histogram_bucket) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! 
except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get('_fields'): raise MissingArgumentError('_fields') self.all_fields = kwargs['_fields'] self._build_fields() # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] if results_number > 1000: raise BadArgumentError('_results_number too large') elif param.name == '_facets_size': facets_size = param.value[0] for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % ( field_data['namespace'], field_data['in_database_name'] ) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. 
if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = ( operator_wildcards[param.operator] % param.value ) query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value, full=False) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: if not value: continue field_name = self.get_field_name(value) search.aggs.bucket( value, 'terms', field=field_name, size=facets_size, ) # Create signature aggregations. if params.get('_aggs.signature'): sig_bucket = A( 'terms', field=self.get_field_name('signature'), size=facets_size, ) for param in params['_aggs.signature']: for value in param.value: if not value: continue if value.startswith('_histogram.'): # This is a histogram aggregation we want to run, # not a terms aggregation. field_name = value[len('_histogram.'):] if field_name not in self.histogram_fields: continue histogram_type = ( self.all_fields[field_name]['query_type'] == 'date' and 'date_histogram' or 'histogram' ) sig_bucket.bucket( 'histogram_%s' % field_name, histogram_type, field=self.get_field_name(field_name), interval=histogram_intervals[field_name], ) else: sig_bucket.bucket( value, 'terms', field=self.get_field_name(value), size=facets_size, ) search.aggs.bucket('signature', sig_bucket) # Create histograms. for f in self.histogram_fields: if params.get('_histogram.%s' % f): histogram_type = ( self.all_fields[f]['query_type'] == 'date' and 'date_histogram' or 'histogram' ) date_bucket = A( histogram_type, field=self.get_field_name(f), interval=histogram_intervals[f], ) for param in params['_histogram.%s' % f]: for value in param.value: if not value: continue field_name = self.get_field_name(value) val_bucket = A( 'terms', field=field_name, size=facets_size, ) date_bucket.bucket(value, val_bucket) search.aggs.bucket('histogram_%s' % f, date_bucket) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. 
However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get("_fields"): raise MissingArgumentError("_fields") self.all_fields = kwargs["_fields"] # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params["date"]) if "%" in self.context.get_index_template(): # If the index template is date-centric, remove indices before the retention # policy because they're not valid to search through and probably don't # exist policy = datetime.timedelta( weeks=self.context.get_retention_policy()) template = self.context.get_index_template() indices = prune_invalid_indices(indices, policy, template) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.context.get_doctype(), ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith("_"): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == "_results_offset": results_from = param.value[0] elif param.name == "_results_number": results_number = param.value[0] if results_number > 1000: raise BadArgumentError( "_results_number", msg=("_results_number cannot be greater " "than 1,000"), ) if results_number < 0: raise BadArgumentError( "_results_number", msg="_results_number cannot be negative", ) elif param.name == "_facets_size": facets_size = param.value[0] # Why cap it? # Because if the query is covering a lot of different # things you can get a really really large query # which can hog resources excessively. # Downloading, as an example, 100k facets (and 0 hits) # when there is plenty of data yields a 11MB JSON # file. if facets_size > 10000: raise BadArgumentError( "_facets_size greater than 10,000") for f in self.histogram_fields: if param.name == "_histogram_interval.%s" % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = self.get_full_field_name(field_data) if param.data_type in ("date", "datetime"): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == "enum": param.value = [x.lower() for x in param.value] elif param.data_type == "str" and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { "~": "*%s*", # contains "^": "%s*", # starts with "$": "*%s", # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { ">": "gt", "<": "lt", ">=": "gte", "<=": "lte" } args = {} filter_type = "term" filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, str) or " " not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. 
filter_type = "query" args = Q( "simple_query_string", query=param.value[0], fields=[name], default_operator="and", ).to_dict() else: # There are several terms, this is a terms filter. filter_type = "terms" filter_value = param.value elif param.operator == "=": # is exactly if field_data["has_full_version"]: name = "%s.full" % name filter_value = param.value elif param.operator in operator_range: filter_type = "range" filter_value = { operator_range[param.operator]: param.value } elif param.operator == "__null__": filter_type = "missing" args["field"] = name elif param.operator == "__true__": filter_type = "term" filter_value = True elif param.operator == "@": filter_type = "regexp" if field_data["has_full_version"]: name = "%s.full" % name filter_value = param.value elif param.operator in operator_wildcards: filter_type = "query" # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. if field_data["has_full_version"]: name = "%s.full" % name q_args = {} q_args[name] = operator_wildcards[ param.operator] % param.value query = Q("wildcard", **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == "range": sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F("bool", must=filters)) # Restricting returned fields. fields = [] # We keep track of the requested columns in order to make sure we # return those column names and not aliases for example. self.request_columns = [] for param in params["_columns"]: for value in param.value: if not value: continue self.request_columns.append(value) field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params["_sort"]: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product then descending version. desc = False if value.startswith("-"): desc = True value = value[1:] field_name = self.get_field_name(value) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = "-" + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. if facets_size: self._create_aggregations(params, search, facets_size, histogram_intervals) # Query and compute results. hits = [] if params["_return_query"][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return {"query": search.to_dict(), "indices": indices} errors = [] # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. 
while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = getattr(results, "aggregations", {}) if aggregations: aggregations = self.format_aggregations(aggregations) shards = getattr(results, "_shards", {}) break # Yay! Results! except NotFoundError as e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise errors.append({ "type": "missing_index", "index": missing_index }) if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} shards = None break except RequestError as exception: # Try to handle it gracefully if we can find out what # input was bad and caused the exception. try: bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall( exception.error)[-1] # Loop over the original parameters to try to figure # out which *key* had the bad input. for key, value in kwargs.items(): if value == bad_input: raise BadArgumentError(key) except IndexError: # Not an ElasticsearchParseException exception pass # Re-raise the original exception raise if shards and shards.failed: # Some shards failed. We want to explain what happened in the # results, so the client can decide what to do. failed_indices = defaultdict(int) for failure in shards.failures: failed_indices[failure.index] += 1 for index, shards_count in failed_indices.items(): errors.append({ "type": "shards", "index": index, "shards_count": shards_count }) return { "hits": hits, "total": total, "facets": aggregations, "errors": errors }
def test_events_process(script_info, event_queues, es_with_templates): """Test "events process" CLI command.""" es = es_with_templates search = Search(using=es) runner = CliRunner() # Invalid argument result = runner.invoke( stats, ['events', 'process', 'invalid-event-type', '--eager'], obj=script_info) assert result.exit_code == 2 assert 'Invalid event type(s):' in result.output current_stats.publish('file-download', [ _create_file_download_event(date) for date in [(2018, 1, 1, 10), (2018, 1, 1, 12), (2018, 1, 1, 14)] ]) current_stats.publish('record-view', [ _create_record_view_event(date) for date in [(2018, 1, 1, 10), (2018, 1, 1, 12), (2018, 1, 1, 14)] ]) result = runner.invoke(stats, ['events', 'process', 'file-download', '--eager'], obj=script_info) assert result.exit_code == 0 current_search.flush_and_refresh(index='*') assert search.index('events-stats-file-download-2018-01-01').count() == 3 assert search.index('events-stats-file-download').count() == 3 assert not es.indices.exists('events-stats-record-view-2018-01-01') assert not es.indices.exists_alias(name='events-stats-record-view') result = runner.invoke(stats, ['events', 'process', 'record-view', '--eager'], obj=script_info) assert result.exit_code == 0 current_search.flush_and_refresh(index='*') assert search.index('events-stats-file-download-2018-01-01').count() == 3 assert search.index('events-stats-file-download').count() == 3 assert search.index('events-stats-record-view-2018-01-01').count() == 3 assert search.index('events-stats-record-view').count() == 3 # Create some more events current_stats.publish('file-download', [_create_file_download_event((2018, 2, 1, 12))]) current_stats.publish('record-view', [_create_record_view_event((2018, 2, 1, 10))]) # Process all event types via a celery task result = runner.invoke(stats, ['events', 'process'], obj=script_info) assert result.exit_code == 0 current_search.flush_and_refresh(index='*') assert search.index('events-stats-file-download-2018-01-01').count() == 3 assert search.index('events-stats-file-download-2018-02-01').count() == 1 assert search.index('events-stats-file-download').count() == 4 assert search.index('events-stats-record-view-2018-01-01').count() == 3 assert search.index('events-stats-record-view-2018-02-01').count() == 1 assert search.index('events-stats-record-view').count() == 4
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = None for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = '%s.%s' % ( field_data['namespace'], field_data['in_database_name'] ) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ( isinstance(val, basestring) and ' ' not in val ): filter_value = val # If the term contains white spaces, we want to perform # a phrase query. Thus we do nothing here and let this # value be handled later. else: filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator == '>': # greater than filter_type = 'range' filter_value = { 'gt': param.value } elif param.operator == '<': # lower than filter_type = 'range' filter_value = { 'lt': param.value } elif param.operator == '>=': # greater than or equal to filter_type = 'range' filter_value = { 'gte': param.value } elif param.operator == '<=': # lower than or equal to filter_type = 'range' filter_value = { 'lte': param.value } elif param.operator == '__null__': # is null filter_type = 'missing' args['field'] = name if filter_value is not None: args[name] = filter_value if args: if param.operator_not: new_filter = ~F(filter_type, **args) else: new_filter = F(filter_type, **args) if sub_filters is None: sub_filters = new_filter elif param.data_type == 'enum': sub_filters |= new_filter else: sub_filters &= new_filter continue # These use a wildcard and thus need to be in a query # instead of a filter. operator_wildcards = { '~': '*%s*', # contains '$': '%s*', # starts with '^': '*%s' # ends with } if param.operator in operator_wildcards: if field_data['has_full_version']: name = '%s.full' % name query_type = 'wildcard' args[name] = ( operator_wildcards[param.operator] % param.value ) elif not param.operator: # This is a phrase that was passed down. query_type = 'simple_query_string' args['query'] = param.value[0] args['fields'] = [name] args['default_operator'] = 'and' if args: query = Q(query_type, **args) if param.operator_not: query = ~query search = search.query(query) else: # If we reach this point, that means the operator is # not supported, and we should raise an error about that. 
raise NotImplementedError( 'Operator %s is not supported' % param.operator ) if filters is None: filters = sub_filters elif sub_filters is not None: filters &= sub_filters search = search.filter(filters) # Restricting returned fields. fields = [] for param in params['_columns']: for value in param.value: if not value: continue try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't restrict on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot return it' % value ) if not field_['is_returned']: # Returning this field is not allowed. raise BadArgumentError( value, msg='Field "%s" is not allowed to be returned' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product and descending version. desc = False if value.startswith('-'): desc = True value = value[1:] try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't sort on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot sort on it' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. for param in params['_facets']: for value in param.value: try: field_ = self.all_fields[value] except KeyError: # That is not a known field, we can't facet on it. raise BadArgumentError( value, msg='Unknown field "%s", cannot facet on it' % value ) field_name = '%s.%s' % ( field_['namespace'], field_['in_database_name'] ) if field_['has_full_version']: # If the param has a full version, that means what matters # is the full string, and not its individual terms. field_name += '.full' search.aggs.bucket( value, 'terms', field=field_name, size=self.config.facets_max_number ) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = self.format_aggregations(results.aggregations) break # Yay! Results! except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise if indices: # Update the list of indices and try again. 
# Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} break
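# A standalone sketch of the retry-on-missing-index pattern used in the method
# above: run the query, and when Elasticsearch reports an unknown index, drop it
# from the list and try again until results arrive or no index is left. The helper
# name, the error-message regex and the assumption that NotFoundError's text names
# the missing index are illustrative; the real code relies on BAD_INDEX_REGEX.
import re

from elasticsearch.exceptions import NotFoundError
from elasticsearch_dsl import Search

MISSING_INDEX_REGEX = re.compile(r'no such index \[([^\]]+)\]')  # assumed error format


def execute_with_index_fallback(client, indices, query):
    """Run the query, dropping any index Elasticsearch reports as missing."""
    indices = list(indices)
    while indices:
        search = Search(using=client, index=indices).query(query)
        try:
            return search.execute()
        except NotFoundError as exc:
            match = MISSING_INDEX_REGEX.search(str(exc))
            if not match or match.group(1) not in indices:
                # The failure was not caused by a missing index we asked for.
                raise
            indices.remove(match.group(1))
    return None  # no valid index left; the caller treats this as an empty result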
def generalSearch(search):
    global sortedresult
    print(search)
    search['offset'] = int(search['offset'])
    # Serve follow-up pages of a date-sorted search from the cached result list.
    if search['sort_by_date'] and search['offset'] > 0 and len(sortedresult) > 0:
        print(sortedresult)
        return sortedresult[search['offset']:search['offset'] + 10]

    s = Search(using=es)
    s = s.index('job_index')
    s = s.query(Q('match_all'))

    # Title.
    if 'jobtitle' in search:
        s = s.query('multi_match', query=search['jobtitle'],
                    type='cross_fields', fields=['title', 'summary'],
                    operator='and')

    # Job description: match the summary against the title and/or description.
    if 'description' in search or 'jobtitle' in search:
        summary = ''
        if 'jobtitle' in search:
            summary += search['jobtitle']
        if 'description' in search:
            summary += ' ' + search['description']
        s = s.query('match', summary=summary.strip())

    # Company.
    if 'company' in search:
        s = s.query('match', company=search['company'])

    # Location.
    if 'state' in search:
        s = s.query('match_phrase', state=search['state'])
    if 'city' in search:
        s = s.query('match', city=search['city'])

    # Job type.
    if 'type' in search:
        s = s.query('match', jobtype=search['type'])

    # Salary.
    if 'salary' in search:
        search['salary'] = int(search['salary'])
        s = s.query('range', salary={'gte': search['salary']})

    # Date: "posted within the last N days".
    if 'date' in search:
        days = int(re.findall(r"(\d+)", search['date'])[0])
        today = datetime.datetime.now().toordinal()
        s = s.query('range', date={'gte': today - days})

    pp = pprint.PrettyPrinter(depth=6)
    pp.pprint(s.to_dict())

    def format_hit(hit):
        """Turn an elasticsearch-dsl hit into the result dict the caller expects."""
        return {
            'id': hit.meta.id,
            'score': hit.meta.score,
            'title': hit['title'],
            'summary': hit['summary'][:180],
            'url': 'www.indeed.com' + hit['url'],
            'company': hit['company'],
            'location': hit['location'],
            'postingdate': str(datetime.datetime.fromordinal(hit['date'])),
        }

    if search['sort_by_date']:
        # Fetch a large window, sort it by posting date, cache it, and page from the cache.
        s = s[0:3000]
        response = s.execute()
        print(response.hits.total)
        print(len(response.hits))
        resultlist = [format_hit(hit) for hit in response.hits]
        sortedresult = sorted(resultlist, key=lambda d: d['postingdate'],
                              reverse=True)
        return sortedresult[search['offset']:search['offset'] + 10]
    else:
        # Let Elasticsearch paginate by relevance.
        s = s[search['offset']:search['offset'] + 10]
        response = s.execute()
        print(response.hits.total)
        return [format_hit(hit) for hit in response.hits]
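# A trimmed sketch of the query-building pattern in generalSearch above: chained
# s.query() calls are AND-ed together into one bool query. The index and field
# names follow the function; the helper name and parameters are assumptions.
import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q, Search


def build_job_search(client, jobtitle=None, company=None, min_salary=None,
                     max_age_days=None):
    s = Search(using=client).index('job_index').query(Q('match_all'))
    if jobtitle:
        s = s.query('multi_match', query=jobtitle, type='cross_fields',
                    fields=['title', 'summary'], operator='and')
    if company:
        s = s.query('match', company=company)
    if min_salary is not None:
        s = s.query('range', salary={'gte': int(min_salary)})
    if max_age_days is not None:
        today = datetime.datetime.now().toordinal()
        s = s.query('range', date={'gte': today - int(max_age_days)})
    return s


# Usage sketch:
# response = build_job_search(Elasticsearch(), jobtitle='data engineer').execute()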
def get(self, **kwargs): """Return a list of results and aggregations based on parameters. The list of accepted parameters (with types and default values) is in the database and can be accessed with the super_search_fields service. """ # Require that the list of fields be passed. if not kwargs.get('_fields'): raise MissingArgumentError('_fields') self.all_fields = kwargs['_fields'] # Filter parameters and raise potential errors. params = self.get_parameters(**kwargs) # Find the indices to use to optimize the elasticsearch query. indices = self.get_indices(params['date']) # Create and configure the search object. search = Search( using=self.get_connection(), index=indices, doc_type=self.config.elasticsearch.elasticsearch_doctype, ) # Create filters. filters = [] histogram_intervals = {} for field, sub_params in params.items(): sub_filters = None for param in sub_params: if param.name.startswith('_'): # By default, all param values are turned into lists, # even when they have and can have only one value. # For those we know there can only be one value, # so we just extract it from the made-up list. if param.name == '_results_offset': results_from = param.value[0] elif param.name == '_results_number': results_number = param.value[0] if results_number > 1000: raise BadArgumentError( '_results_number', msg=( '_results_number cannot be greater ' 'than 1,000' ) ) if results_number < 0: raise BadArgumentError( '_results_number', msg='_results_number cannot be negative' ) elif param.name == '_facets_size': facets_size = param.value[0] # Why cap it? # Because if the query is covering a lot of different # things you can get a really really large query # which can hog resources excessively. # Downloading, as an example, 100k facets (and 0 hits) # when there is plenty of data yields a 11MB JSON # file. if facets_size > 10000: raise BadArgumentError( '_facets_size greater than 10,000' ) for f in self.histogram_fields: if param.name == '_histogram_interval.%s' % f: histogram_intervals[f] = param.value[0] # Don't use meta parameters in the query. continue field_data = self.all_fields[param.name] name = self.get_full_field_name(field_data) if param.data_type in ('date', 'datetime'): param.value = datetimeutil.date_to_string(param.value) elif param.data_type == 'enum': param.value = [x.lower() for x in param.value] elif param.data_type == 'str' and not param.operator: param.value = [x.lower() for x in param.value] # Operators needing wildcards, and the associated value # transformation with said wildcards. operator_wildcards = { '~': '*%s*', # contains '^': '%s*', # starts with '$': '*%s' # ends with } # Operators needing ranges, and the associated Elasticsearch # comparison operator. operator_range = { '>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte', } args = {} filter_type = 'term' filter_value = None if not param.operator: # contains one of the terms if len(param.value) == 1: val = param.value[0] if not isinstance(val, basestring) or ' ' not in val: # There's only one term and no white space, this # is a simple term filter. filter_value = val else: # If the term contains white spaces, we want to # perform a phrase query. filter_type = 'query' args = Q( 'simple_query_string', query=param.value[0], fields=[name], default_operator='and', ).to_dict() else: # There are several terms, this is a terms filter. 
filter_type = 'terms' filter_value = param.value elif param.operator == '=': # is exactly if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_range: filter_type = 'range' filter_value = { operator_range[param.operator]: param.value } elif param.operator == '__null__': filter_type = 'missing' args['field'] = name elif param.operator == '__true__': filter_type = 'term' filter_value = True elif param.operator == '@': filter_type = 'regexp' if field_data['has_full_version']: name = '%s.full' % name filter_value = param.value elif param.operator in operator_wildcards: filter_type = 'query' # Wildcard operations are better applied to a non-analyzed # field (called "full") if there is one. if field_data['has_full_version']: name = '%s.full' % name q_args = {} q_args[name] = ( operator_wildcards[param.operator] % param.value ) query = Q('wildcard', **q_args) args = query.to_dict() if filter_value is not None: args[name] = filter_value if args: new_filter = F(filter_type, **args) if param.operator_not: new_filter = ~new_filter if sub_filters is None: sub_filters = new_filter elif filter_type == 'range': sub_filters &= new_filter else: sub_filters |= new_filter continue if sub_filters is not None: filters.append(sub_filters) search = search.filter(F('bool', must=filters)) # Restricting returned fields. fields = [] # We keep track of the requested columns in order to make sure we # return those column names and not aliases for example. self.request_columns = [] for param in params['_columns']: for value in param.value: if not value: continue self.request_columns.append(value) field_name = self.get_field_name(value, full=False) fields.append(field_name) search = search.fields(fields) # Sorting. sort_fields = [] for param in params['_sort']: for value in param.value: if not value: continue # Values starting with a '-' are sorted in descending order. # In order to retrieve the database name of the field, we # must first remove the '-' part and add it back later. # Example: given ['product', '-version'], the results will be # sorted by ascending product then descending version. desc = False if value.startswith('-'): desc = True value = value[1:] field_name = self.get_field_name(value) if desc: # The underlying library understands that '-' means # sorting in descending order. field_name = '-' + field_name sort_fields.append(field_name) search = search.sort(*sort_fields) # Pagination. results_to = results_from + results_number search = search[results_from:results_to] # Create facets. if facets_size: self._create_aggregations( params, search, facets_size, histogram_intervals ) # Query and compute results. hits = [] if params['_return_query'][0].value[0]: # Return only the JSON query that would be sent to elasticsearch. return { 'query': search.to_dict(), 'indices': indices, } errors = [] # We call elasticsearch with a computed list of indices, based on # the date range. However, if that list contains indices that do not # exist in elasticsearch, an error will be raised. We thus want to # remove all failing indices until we either have a valid list, or # an empty list in which case we return no result. while True: try: results = search.execute() for hit in results: hits.append(self.format_fields(hit.to_dict())) total = search.count() aggregations = getattr(results, 'aggregations', {}) if aggregations: aggregations = self.format_aggregations(aggregations) shards = getattr(results, '_shards', {}) break # Yay! Results! 
except NotFoundError, e: missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0] if missing_index in indices: del indices[indices.index(missing_index)] else: # Wait what? An error caused by an index that was not # in the request? That should never happen, but in case # it does, better know it. raise errors.append({ 'type': 'missing_index', 'index': missing_index, }) if indices: # Update the list of indices and try again. # Note: we need to first empty the list of indices before # updating it, otherwise the removed indices never get # actually removed. search = search.index().index(*indices) else: # There is no index left in the list, return an empty # result. hits = [] total = 0 aggregations = {} shards = None break except RequestError as exception: # Try to handle it gracefully if we can find out what # input was bad and caused the exception. try: bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall( exception.error )[-1] # Loop over the original parameters to try to figure # out which *key* had the bad input. for key, value in kwargs.items(): if value == bad_input: raise BadArgumentError(key) except IndexError: # Not an ElasticsearchParseException exception pass raise
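# A compact sketch of the operator-to-filter mapping used in the method above,
# written with elasticsearch-dsl's Q objects (the code above uses the older F
# helper). The operator tables mirror the method; the helper name and the choice
# of a plain term filter for '=' are assumptions for illustration.
from elasticsearch_dsl import Q

OPERATOR_RANGE = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}
OPERATOR_WILDCARDS = {'~': '*%s*', '^': '%s*', '$': '*%s'}


def build_filter(name, operator, value, negate=False):
    """Translate one (field, operator, value) triple into a Q clause."""
    if operator in OPERATOR_RANGE:
        clause = Q('range', **{name: {OPERATOR_RANGE[operator]: value}})
    elif operator in OPERATOR_WILDCARDS:
        clause = Q('wildcard', **{name: OPERATOR_WILDCARDS[operator] % value})
    elif operator == '=':
        clause = Q('term', **{name: value})
    else:
        raise NotImplementedError('Operator %s is not supported' % operator)
    return ~clause if negate else clause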
# folder in folders] elif query in search_query_list: test_model.__setattr__( count_query_map[search_query_list.index(query)], format_es_hits_total(res.hits.total)) else: print(query in search_query_list) recent_article_count = sorted(date_count_res, key=lambda k: k.date) recent_publish_count = sorted(date_published_res, key=lambda k: k.date) today_article_count = recent_article_count[-1].count today_publish_count = recent_publish_count[-1].count stats_model = TeamArticleStatsModel( today_article_count=today_article_count, today_publish_count=today_publish_count, recent_publish_count=recent_publish_count, recent_article_count=recent_article_count, total_article_count=total_article_count, folder_article_count=folder_article_count) print(test_model) return stats_model if __name__ == '__main__': query = Search( using=Elasticsearch("http://elasticsearch.kube-system:9200")) res = query.index('bees_articles_stcn').execute() print(isinstance(res.hits.total, AttrDict)) print(get_article_status_count_by_es()) print(get_team_article_stats_from_es(2, 7))
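# A hedged sketch of how the per-day article counts above could be produced with
# a date_histogram aggregation. The index name 'bees_articles_stcn' and host URL
# come from the snippet; the helper name and the 'publish_date' field are assumptions.
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


def daily_article_counts(client):
    """Return (day, count) pairs for the articles index."""
    s = Search(using=client).index('bees_articles_stcn')[:0]  # size 0: aggregations only
    s.aggs.bucket('per_day', 'date_histogram', field='publish_date',
                  calendar_interval='day')  # ES 7+; older clusters use interval='day'
    response = s.execute()
    return [(b.key_as_string, b.doc_count)
            for b in response.aggregations.per_day.buckets]


# Usage sketch:
# daily_article_counts(Elasticsearch("http://elasticsearch.kube-system:9200"))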