def test_synchro(self):
    """
    Test that the data is synchronized between the SQL database
    and Elasticsearch.
    """
    random.seed(99)  # use a seed to get deterministic random numbers
    departement = random.choice(dpt.DEPARTEMENTS)
    offices = Office.query.filter(Office.departement == departement).limit(100)

    # If we get no results here, assume that tests are running locally,
    # i.e. on a dev machine which currently only has data for "Metz".
    # TODO: clearly identify which Elasticsearch we are talking to.
    if offices.count() == 0:
        offices = Office.query.filter(Office.departement == 57).limit(100)

    self.assertTrue(offices.count() > 0)

    es = Elasticsearch()
    scores = {office.siret: office.score for office in offices}
    body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "siret": list(scores.keys()),
                    },
                },
            },
        },
    }
    res = es.search(index=settings.ES_INDEX, doc_type="office", body=body)
    for office in res['hits']['hits']:
        index_score = office["_source"]["score"]
        siret = office["_source"]["siret"]
        self.assertEqual(index_score, scores[siret])
def is_elasticsearch_alive():
    try:
        es = Elasticsearch()
        es.ping()
        return True
    # pylint: disable=W0703
    except Exception as e:
        logger.exception(e)
        return False
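
# Hedged usage sketch (not part of the original module): is_elasticsearch_alive()
# can guard integration tests so they are skipped, rather than erroring out, when
# no Elasticsearch instance is reachable. The test class below is hypothetical
# and only illustrates the guard pattern.
import unittest

@unittest.skipUnless(is_elasticsearch_alive(), "Elasticsearch is not reachable")
class ExampleElasticsearchGuardTest(unittest.TestCase):
    def test_ping(self):
        # Re-check connectivity; only runs when the guard above passed.
        self.assertTrue(is_elasticsearch_alive())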
def _get_offices_from_es(self, query) -> Dict:
    es = Elasticsearch()
    logger.debug("Elastic Search request: %s", query)
    res: Dict = es.search(index=settings.ES_INDEX, doc_type="office", body=query)
    return res
def _count_offices_from_es(json_body):
    es = Elasticsearch()
    res = es.count(index=settings.ES_INDEX, doc_type="office", body=json_body)
    return res["count"]
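
# Hedged example (an assumption, not taken from the original code): both helpers
# above take a raw Elasticsearch body. A minimal body using the ES 1.7 "filtered"
# syntax seen elsewhere in this module could look like this; the `departement`
# field and the value 57 are only illustrative.
EXAMPLE_OFFICE_COUNT_BODY = {
    "query": {
        "filtered": {
            "filter": {
                "term": {
                    "departement": 57,
                },
            },
        },
    },
}
# Typical call: _count_offices_from_es(EXAMPLE_OFFICE_COUNT_BODY)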
def get_offices_from_es_and_db(json_body, sort, rome_codes, hiring_type):
    """
    Fetch offices first from Elasticsearch, then from the database.

    Returns a tuple of (offices, office_count, aggregations), where `offices`
    is a list of results as Office instances (with some extra attributes only
    available in Elasticsearch), `office_count` is the number of results as an
    integer, and `aggregations` holds the Elasticsearch aggregations, if any.

    `sort` is needed to find back the distance between each office and the
    search location, to store it and display it later on the frontend or in
    the API result.

    `rome_codes` and `hiring_type` are needed in the case of a multi-ROME
    search, to find back which rome_code actually matched each office.
    """
    if sort not in sorting.SORT_FILTERS:
        # This should never happen.
        # An API request would have already raised an InvalidFetcherArgument
        # exception, and a frontend request would have fallen back to the
        # default sorting.
        raise ValueError("unknown sorting: %s" % sort)

    es = Elasticsearch()
    logger.debug("Elastic Search request: %s", json_body)
    res = es.search(index=settings.ES_INDEX, doc_type="office", body=json_body)

    office_count = res['hits']['total']
    offices = []
    siret_list = [office["_source"]["siret"] for office in res['hits']['hits']]

    if siret_list:
        office_objects = Office.query.filter(Office.siret.in_(siret_list))
        office_dict = {obj.siret: obj for obj in office_objects}

        for siret in siret_list:
            try:
                office = office_dict[siret]
            except KeyError:
                # ES and DB out of sync: siret is in ES but not in DB - this should never happen.
                logger.error(
                    "ES and DB out of sync: siret %s is in ES but not in DB - this should never happen",
                    siret)
                raise
            if office.has_city():
                offices.append(office)
            else:
                logger.info("office siret %s does not have a city, ignoring...", siret)

    # FIXME it's not great to add new properties to an existing object. It
    # would be better to wrap the office objects in a new OfficeResult
    # class that would add new properties related to the query.
    es_offices_by_siret = {
        item['_source']['siret']: item for item in res['hits']['hits']
    }

    # FIXME These hardcoded values are ugly, unfortunately it is not so easy
    # to make them DRY. For the corresponding code see method
    # build_json_body_elastic_search().
    distance_sort_index = {
        sorting.SORT_FILTER_DISTANCE: 0,
        sorting.SORT_FILTER_SCORE: 2,
    }[sort]
    sort_fields_total = {
        sorting.SORT_FILTER_DISTANCE: 1,  # (distance_sort)
        sorting.SORT_FILTER_SCORE: 3,  # (boosted_romes_sort, randomized_score_sort, distance_sort)
    }[sort]

    for position, office in enumerate(offices, start=1):
        # Get the corresponding item from the Elasticsearch results.
        es_office = es_offices_by_siret[office.siret]

        if len(es_office["sort"]) != sort_fields_total:
            raise ValueError("Incorrect number of sorting fields in ES response.")

        # Add an extra `distance` attribute rounded to one digit.
        office.distance = round(es_office["sort"][distance_sort_index], 1)
        # position is later used in labonneboite/web/static/js/results.js
        office.position = position

        if len(rome_codes) > 1:
            # Identify which rome_code actually matched this office.
            keyname = get_score_for_rome_field_name(hiring_type, rome_codes[0]).split('.')[0]
            all_scores = es_office['_source'][keyname]
            scores_of_searched_romes = {
                rome: all_scores[rome] for rome in rome_codes if rome in all_scores
            }
            # https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
            rome_with_highest_score = max(scores_of_searched_romes, key=scores_of_searched_romes.get)
            # Store it as an extra attribute.
            office.matched_rome = rome_with_highest_score
            rome_code_for_contact_mode = rome_with_highest_score
        else:
            rome_code_for_contact_mode = rome_codes[0]

        # Set the boost flag.
        office.boost = False
        boosted_rome_keyname = get_boosted_rome_field_name(hiring_type, rome_codes[0]).split('.')[0]
        if boosted_rome_keyname in es_office['_source']:
            boost_romes = es_office['_source'][boosted_rome_keyname]
            romes_intersection = set(rome_codes).intersection(boost_romes)
            office.boost = bool(romes_intersection)

        # Set the contact mode.
        office.contact_mode = util.get_contact_mode_for_rome_and_office(
            rome_code_for_contact_mode, office)

    try:
        aggregations = res['aggregations']
    except KeyError:
        aggregations = []

    return offices, office_count, aggregations
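
# Hedged usage sketch: how a caller might consume the triple returned above.
# In the real application the body comes from build_json_body_elastic_search();
# the keyword values below, including the 'dpae' hiring type string, are
# illustrative assumptions only.
def _example_fetch_offices(json_body):
    offices, office_count, aggregations = get_offices_from_es_and_db(
        json_body,
        sort=sorting.SORT_FILTER_DISTANCE,
        rome_codes=['M1805'],  # single-ROME search: no matched_rome attribute is set
        hiring_type='dpae',  # hypothetical value, only passed through to field name helpers
    )
    logger.debug("got %s offices out of %s total", len(offices), office_count)
    return offices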
def build_job_label_suggestions(term, size=autocomplete.MAX_JOBS):
    es = Elasticsearch()

    body = {
        "_source": ["ogr_description", "rome_description", "rome_code"],
        "query": {
            "match": {
                # Query for multiple words or multiple parts of words across multiple fields.
                # Based on https://qbox.io/blog/an-introduction-to-ngrams-in-elasticsearch
                "_all": unidecode.unidecode(term),
            }
        },
        "aggs": {
            "by_rome_code": {
                "terms": {
                    "field": "rome_code",
                    "size": 0,
                    # Note: a maximum of 550 buckets will be fetched, as we have 550 unique ROME codes.
                    # FIXME `order` cannot work without a computed `max_score`, see the `max_score` comment below.
                    # Order results by the sub-aggregation named 'max_score':
                    # "order": {"max_score": "desc"},
                },
                "aggs": {
                    # Only 1 result per ROME code: include only 1 top hit on each bucket in the results.
                    # Another way of saying this is that for all OGRs matching a given ROME, we only
                    # keep the most relevant OGR.
                    "by_top_hit": {"top_hits": {"size": 1}},
                    # FIXME `max_score` below does not work with Elasticsearch 1.7.
                    # Fixed in Elasticsearch 2.0+:
                    # https://github.com/elastic/elasticsearch/issues/10091#issuecomment-193676966
                    # FTR @vermeer made another attempt to find a workaround as of Feb 2018, and failed.
                    # The only way out is to upgrade to Elasticsearch 2.0+.
                    # Set the max score among all members of this bucket:
                    # "max_score": {"max": {"lang": "expression", "script": "_score"}},
                },
            },
        },
        "size": 0,
    }

    res = es.search(index=settings.ES_INDEX, doc_type="ogr", body=body)

    suggestions = []

    # Since ordering cannot be done easily through Elasticsearch 1.7
    # (`max_score` not working), we do it in Python at this time.
    results = res['aggregations']['by_rome_code']['buckets']
    results.sort(key=lambda e: e['by_top_hit']['hits']['max_score'], reverse=True)

    for hit in results:
        if len(suggestions) < size:
            hit = hit['by_top_hit']['hits']['hits'][0]
            source = hit['_source']
            highlight = hit.get('highlight', {})
            try:
                rome_description = highlight['rome_description.autocomplete'][0]
            except KeyError:
                rome_description = source['rome_description']
            try:
                ogr_description = highlight['ogr_description.autocomplete'][0]
            except KeyError:
                ogr_description = source['ogr_description']
            label = "%s (%s, ...)" % (rome_description, ogr_description)
            value = "%s (%s, ...)" % (source["rome_description"], source["ogr_description"])
            score = round(hit['_score'], 1)
            suggestions.append({
                'id': source['rome_code'],
                'label': label,
                'value': value,
                'occupation': slugify(source['rome_description'].lower()),
                'score': score,
            })
        else:
            break

    return suggestions
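
# Hedged usage sketch: an autocomplete endpoint would typically call the
# function above with the raw user input. 'boulanger' is an illustrative term.
def _example_job_autocomplete():
    for suggestion in build_job_label_suggestions('boulanger'):
        logger.debug("%s %s (score %s)",
                     suggestion['id'], suggestion['label'], suggestion['score'])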
def build_location_suggestions(term):
    term = term.title()
    es = Elasticsearch()

    zipcode_match = [{
        "prefix": {
            "zipcode": term,
        },
    }]

    city_match = [{
        "match": {
            "city_name.autocomplete": {
                "query": term,
            },
        },
    }, {
        "match": {
            "city_name.stemmed": {
                "query": term,
                "boost": 1,
            },
        },
    }, {
        "match_phrase_prefix": {
            "city_name.stemmed": {
                "query": term,
            },
        },
    }]

    filters = zipcode_match
    # Only match on city names when the term is not purely numeric
    # (i.e. not a zipcode prefix).
    try:
        int(term)
    except ValueError:
        filters.extend(city_match)

    body = {
        "query": {
            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html
            "function_score": {
                "query": {
                    "bool": {
                        "should": filters,
                    },
                },
                # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-field-value-factor
                "field_value_factor": {
                    "field": "population",
                    "modifier": "log1p",
                },
            },
        },
        "size": autocomplete.MAX_LOCATIONS,
    }

    res = es.search(index=settings.ES_INDEX, doc_type="location", body=body)

    suggestions = []
    first_score = None

    for hit in res['hits']['hits']:
        if not first_score:
            first_score = hit['_score']
        source = hit['_source']
        if source['zipcode']:  # and hit['_score'] > 0.1 * first_score:
            city_name = source['city_name'].replace('"', '')
            label = '%s (%s)' % (city_name, source['zipcode'])
            city = {
                'city': source['slug'],
                'zipcode': source['zipcode'],
                'label': label,
                'latitude': source['location']['lat'],
                'longitude': source['location']['lon'],
            }
            suggestions.append(city)

    return suggestions
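
# Hedged usage sketch: location autocomplete accepts either a city name or a
# zipcode prefix, since numeric terms skip the city_name clauses above.
# Both example terms are illustrative.
def _example_location_autocomplete():
    for term in ('Metz', '57'):
        for city in build_location_suggestions(term):
            logger.debug("%s -> %s (%s, %s)",
                         term, city['label'], city['latitude'], city['longitude'])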