Example #1
0
    def test_synchro(self):
        """
        Check that office scores agree between the SQL database and Elasticsearch.

        Up to 100 offices of a pseudo-randomly chosen departement are read from
        the database, looked up by siret in Elasticsearch, and the score of each
        returned hit is compared with the score stored in the database.

        NOTE(review): the search body sets no `size`, so only the hits that
        Elasticsearch returns by default are compared - confirm this is intended.
        """
        # A fixed seed makes the departement choice reproducible across runs.
        random.seed(99)
        chosen_departement = random.choice(dpt.DEPARTEMENTS)
        offices = Office.query.filter(Office.departement == chosen_departement).limit(100)

        # An empty result most likely means the tests run locally, i.e. on a
        # dev machine which currently only has data for "Metz" (departement 57).
        # TODO: clearly identify which Elasticsearch we are talking to.
        if not offices.count():
            offices = Office.query.filter(Office.departement == 57).limit(100)

        self.assertTrue(offices.count() > 0)

        es = Elasticsearch()

        # Database scores, indexed by siret.
        db_scores = {}
        for office in offices:
            db_scores[office.siret] = office.score

        siret_filter = {"terms": {"siret": list(db_scores.keys())}}
        body = {"query": {"filtered": {"filter": siret_filter}}}
        response = es.search(index=settings.ES_INDEX, doc_type="office", body=body)

        for hit in response['hits']['hits']:
            source = hit["_source"]
            self.assertEqual(source["score"], db_scores[source["siret"]])
Example #2
0
def is_elasticsearch_alive():
    """
    Tell whether the default Elasticsearch cluster answers a ping.

    Returns True only when the ping succeeds. Returns False when the ping
    reports a failure or when an error occurs while contacting the cluster.
    """
    try:
        es = Elasticsearch()
        # Depending on the client version, `ping()` reports an unreachable
        # cluster either by raising or by returning False, so its result
        # must be honoured rather than discarded.
        return bool(es.ping())
    except Exception:
        # Best-effort check: any failure means "not alive". `Exception` (and
        # not a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
        return False
Example #3
0
def is_elasticsearch_alive():
    """
    Tell whether the default Elasticsearch cluster answers a ping.

    Returns True only when the ping succeeds. Returns False when the ping
    reports a failure, or when an error occurs while contacting the cluster
    (in which case the error is logged with its traceback).
    """
    try:
        es = Elasticsearch()
        # Depending on the client version, `ping()` reports an unreachable
        # cluster either by raising or by returning False, so its result
        # must be honoured rather than discarded.
        return bool(es.ping())
    # Deliberately broad: this is a best-effort health check.
    # pylint: disable=W0703
    except Exception as e:
        logger.exception(e)
        return False
 def _get_offices_from_es(self, query) -> Dict:
     """
     Run `query` against the "office" documents of the configured index.

     The query is logged at debug level and the search response is
     returned as is.
     """
     es_client = Elasticsearch()
     logger.debug("Elastic Search request : %s", query)
     return es_client.search(
         index=settings.ES_INDEX,
         doc_type="office",
         body=query,
     )
 def _count_offices_from_es(json_body):
     """
     Return the "count" value reported by Elasticsearch for the "office"
     documents of the configured index matching `json_body`.
     """
     count_response = Elasticsearch().count(
         index=settings.ES_INDEX,
         doc_type="office",
         body=json_body,
     )
     return count_response["count"]
Example #6
0
def get_offices_from_es_and_db(json_body, sort, rome_codes, hiring_type):
    """
    Fetch offices first from Elasticsearch, then from the database.

    Returns a tuple of (offices, office_count, aggregations), where `offices` is a
    list of results as Office instances (with some extra attributes only available
    in Elasticsearch), `office_count` an integer of the results number, and
    `aggregations` the `aggregations` section of the Elasticsearch response
    (an empty list when the response has none).

    `sort` is needed to retrieve the distance between each office and the search location,
    to store it and display it later on the frontend or in the API result.

    `rome_codes` and `hiring_type` are needed in the case of multi rome search, to
    determine for each office which rome_code actually did match.

    Raises ValueError if `sort` is not a known sort filter or if an Elasticsearch
    hit does not carry the expected number of sort values, and KeyError if a
    siret returned by Elasticsearch cannot be found in the database.
    """
    if sort not in sorting.SORT_FILTERS:
        # This should never happen.
        # An API request would have already raised a InvalidFetcherArgument exception,
        # and a Frontend request would have fallbacked to default sorting.
        raise ValueError("unknown sorting : %s" % sort)

    es = Elasticsearch()
    logger.debug("Elastic Search request : %s", json_body)
    res = es.search(index=settings.ES_INDEX, doc_type="office", body=json_body)

    office_count = res['hits']['total']
    offices = []
    siret_list = [office["_source"]["siret"] for office in res['hits']['hits']]

    if siret_list:

        office_objects = Office.query.filter(Office.siret.in_(siret_list))
        office_dict = {obj.siret: obj for obj in office_objects}

        # Iterate over `siret_list` (not over the DB rows) so that the
        # Elasticsearch ordering of the results is preserved.
        for siret in siret_list:
            try:
                office = office_dict[siret]
            except KeyError:
                # ES and DB out of sync: siret is in ES but not in DB - this should never happen
                logger.error(
                    "ES and DB out of sync: siret %s is in ES but not in DB - this should never happen",
                    siret)
                raise
            if office.has_city():
                offices.append(office)
            else:
                # Use the module logger, like every other log call in this function.
                logger.info("office siret %s does not have city, ignoring...",
                            siret)

    # FIXME it's not great to add new properties to an existing object. It
    # would be better to wrap the office objects in a new OfficeResult
    # class that would add new properties related to the query.
    es_offices_by_siret = {
        item['_source']['siret']: item
        for item in res['hits']['hits']
    }
    # FIXME These hardcoded values are soooooo ugly, unfortunately it is not so
    # easy to make it DNRY. For the corresponding code see method build_json_body_elastic_search().
    # Index of the distance value inside each hit's `sort` array.
    distance_sort_index = {
        sorting.SORT_FILTER_DISTANCE: 0,
        sorting.SORT_FILTER_SCORE: 2,
    }[sort]
    # Expected length of each hit's `sort` array.
    sort_fields_total = {
        sorting.SORT_FILTER_DISTANCE: 1,  # (distance_sort)
        sorting.SORT_FILTER_SCORE:
        3,  # (boosted_romes_sort, randomized_score_sort, distance_sort)
    }[sort]
    for position, office in enumerate(offices, start=1):
        # Get the corresponding item from the Elasticsearch results.
        es_office = es_offices_by_siret[office.siret]

        if len(es_office["sort"]) != sort_fields_total:
            raise ValueError(
                "Incorrect number of sorting fields in ES response.")
        # Add an extra `distance` attribute with one digit.
        office.distance = round(es_office["sort"][distance_sort_index], 1)
        # position is later used in labonneboite/web/static/js/results.js
        office.position = position

        if len(rome_codes) > 1:
            # Identify which rome_code actually matched this office.
            keyname = get_score_for_rome_field_name(
                hiring_type, rome_codes[0]).split('.')[0]
            all_scores = es_office['_source'][keyname]
            scores_of_searched_romes = {
                rome: all_scores[rome]
                for rome in rome_codes
                if rome in all_scores
            }
            # https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
            rome_with_highest_score = max(scores_of_searched_romes,
                                          key=scores_of_searched_romes.get)
            # Store it as an extra attribute.
            office.matched_rome = rome_with_highest_score
            rome_code_for_contact_mode = rome_with_highest_score
        else:
            rome_code_for_contact_mode = rome_codes[0]

        # Set boost flag: True when at least one searched rome is boosted for this office.
        office.boost = False
        boosted_rome_keyname = get_boosted_rome_field_name(
            hiring_type, rome_codes[0]).split('.')[0]
        if boosted_rome_keyname in es_office['_source']:
            boost_romes = es_office['_source'][boosted_rome_keyname]
            romes_intersection = set(rome_codes).intersection(boost_romes)
            office.boost = bool(romes_intersection)

        # Set contact mode
        office.contact_mode = util.get_contact_mode_for_rome_and_office(
            rome_code_for_contact_mode, office)

    # The response only has an `aggregations` key when the query asked for some.
    aggregations = res.get('aggregations', [])

    return offices, office_count, aggregations
Example #7
0
def build_job_label_suggestions(term, size=autocomplete.MAX_JOBS):
    """
    Suggest job labels matching `term`, with one suggestion per ROME code.

    Returns a list of at most `size` dicts with the keys `id`, `label`,
    `value`, `occupation` and `score`, ordered by decreasing `max_score`
    of the top hit of each ROME code bucket.
    """

    def _highlighted_or_plain(highlight, source, field):
        # Prefer the highlighted variant of `field`, fall back to the raw value.
        try:
            return highlight[field + '.autocomplete'][0]
        except KeyError:
            return source[field]

    es = Elasticsearch()

    # Query for multiple words or multiple parts of words across multiple fields.
    # Based on https://qbox.io/blog/an-introduction-to-ngrams-in-elasticsearch
    query = {
        "match": {
            "_all": unidecode.unidecode(term),
        }
    }

    # Note: a maximum of 550 buckets will be fetched, as we have 550 unique ROME codes.
    # FIXME ordering the buckets in Elasticsearch (`"order": {"max_score": "desc"}`)
    # cannot work without a computed `max_score` sub-aggregation, see below.
    rome_code_terms = {
        "field": "rome_code",
        "size": 0,
    }

    # Only 1 result per rome code: include only 1 top hit on each bucket in the results.
    # In other words, for all OGR matching a given ROME, only the most relevant OGR is kept.
    #
    # FIXME a `max_score` sub-aggregation
    # (`{"max": {"lang": "expression", "script": "_score"}}`) does not work with
    # Elasticsearch 1.7. Fixed in elasticsearch 2.0+:
    # https://github.com/elastic/elasticsearch/issues/10091#issuecomment-193676966
    # FTR @vermeer made another try to find a workaround as of Feb 2018, and failed.
    # The only way out is to upgrade to elasticsearch 2.0+
    bucket_aggs = {
        "by_top_hit": {"top_hits": {"size": 1}},
    }

    body = {
        "_source": ["ogr_description", "rome_description", "rome_code"],
        "query": query,
        "aggs": {
            "by_rome_code": {
                "terms": rome_code_terms,
                "aggs": bucket_aggs,
            },
        },
        "size": 0,
    }

    res = es.search(index=settings.ES_INDEX, doc_type="ogr", body=body)

    # Since ordering cannot be done easily through Elasticsearch 1.7 (`max_score` not working),
    # it is done in Python at this time.
    def _bucket_max_score(bucket):
        return bucket['by_top_hit']['hits']['max_score']

    buckets = sorted(
        res['aggregations']['by_rome_code']['buckets'],
        key=_bucket_max_score,
        reverse=True,
    )

    suggestions = []
    for bucket in buckets:
        # Stop as soon as enough suggestions have been collected.
        if len(suggestions) >= size:
            break

        top_hit = bucket['by_top_hit']['hits']['hits'][0]
        source = top_hit['_source']
        highlight = top_hit.get('highlight', {})

        rome_description = _highlighted_or_plain(highlight, source, 'rome_description')
        ogr_description = _highlighted_or_plain(highlight, source, 'ogr_description')

        label = "%s (%s, ...)" % (rome_description, ogr_description)
        value = "%s (%s, ...)" % (source["rome_description"], source["ogr_description"])
        score = round(top_hit['_score'], 1)

        suggestion = {
            'id': source['rome_code'],
            'label': label,
            'value': value,
            'occupation': slugify(source['rome_description'].lower()),
            'score': score,
        }
        suggestions.append(suggestion)

    return suggestions
Example #8
0
def build_location_suggestions(term):
    """
    Suggest locations matching `term`, given as a zipcode prefix or a city name.

    Returns a list of dicts with the keys `city`, `zipcode`, `label`,
    `latitude` and `longitude`, in the order returned by Elasticsearch.
    At most `autocomplete.MAX_LOCATIONS` hits are requested, and hits
    without a zipcode are left out.
    """
    term = term.title()
    es = Elasticsearch()

    # The zipcode prefix clause is always part of the query.
    filters = [{
        "prefix": {
            "zipcode": term
        }
    }]

    # City name clauses are only added when the term is not an integer,
    # i.e. when it cannot be a (partial) zipcode.
    try:
        int(term)
    except ValueError:
        filters.extend([{
            "match": {
                "city_name.autocomplete": {
                    "query": term,
                }
            }}, {
            "match": {
                "city_name.stemmed": {
                    "query": term,
                    "boost": 1,
                }
            }}, {
            "match_phrase_prefix": {
                "city_name.stemmed": {
                    "query": term,
                }
            }}])

    body = {
        "query": {
            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html
            "function_score": {
                "query": {
                    "bool": {
                        "should": filters,
                    },
                },
                # The score is weighted by the `population` field.
                # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-field-value-factor
                "field_value_factor": {
                    "field": "population",
                    "modifier": "log1p",
                }
            },
        },
        "size": autocomplete.MAX_LOCATIONS,
    }
    res = es.search(index=settings.ES_INDEX, doc_type="location", body=body)

    suggestions = []

    # Every hit with a zipcode is kept: no score threshold is applied.
    for hit in res['hits']['hits']:
        source = hit['_source']
        if not source['zipcode']:
            continue
        city_name = source['city_name'].replace('"', '')
        label = '%s (%s)' % (city_name, source['zipcode'])
        suggestions.append({
            'city': source['slug'],
            'zipcode': source['zipcode'],
            'label': label,
            'latitude': source['location']['lat'],
            'longitude': source['location']['lon'],
        })
    return suggestions