def switch_es_index():
    """
    Context manager that will ensure that some code will operate on a new ES
    index. This new index will then be associated to the reference alias and
    the old index(es) will be dropped.

    Usage:
        with switch_es_index():
            # Here, all code will run on the new index
            run_some_code()
        # Here, the old indexes no longer exist and the reference alias points
        # to the new index

    NOTE(review): this is a generator (it yields once), so it is presumably
    wrapped with `contextlib.contextmanager` where it is defined — the
    decorator is not visible in this chunk; confirm.
    """
    # Find current index names (there may be one, zero or more).
    alias_name = settings.ES_INDEX
    try:
        old_index_names = list(es.Elasticsearch().indices.get_alias(
            settings.ES_INDEX).keys())
    except NotFoundError:
        old_index_names = []

    # Activate new index: all code inside the `with` block sees it via
    # `settings.ES_INDEX`.
    new_index_name = es.get_new_index_name()
    settings.ES_INDEX = new_index_name

    # Create new index.
    es.create_index(new_index_name)

    try:
        yield
    except BaseException:
        # Was a bare `except:`; `BaseException` keeps the exact same semantics
        # (SystemExit/KeyboardInterrupt included) while avoiding the bare-except
        # anti-pattern. Delete the newly created index, then propagate.
        es.drop_index(new_index_name)
        raise
    finally:
        # Set back alias name, whether the block succeeded or not.
        settings.ES_INDEX = alias_name

    # Switch alias (only reached on success).
    # TODO this should be done in one shot with a function in es.py module
    es.add_alias_to_index(new_index_name)
    for old_index_name in old_index_names:
        es.Elasticsearch().indices.delete_alias(index=old_index_name,
                                                name=alias_name)

    # Delete old index(es).
    for old_index_name in old_index_names:
        es.drop_index(old_index_name)
def update_offices_geolocations():
    """
    Remove or add extra geolocations to offices.

    New geolocations are entered into the system through the
    `OfficeAdminExtraGeoLocation` table. Each matching office is updated both
    in the database (`has_multi_geolocations` flag) and in ElasticSearch
    (`locations` field of its document).
    """
    for extra_geolocation in db_session.query(OfficeAdminExtraGeoLocation).all():
        office = Office.query.filter_by(siret=extra_geolocation.siret).first()
        if office:
            locations = []
            # Compare against None explicitly: 0.0 is a legitimate coordinate
            # (France crosses the Greenwich meridian, i.e. longitude 0) and the
            # previous truthiness test (`if office.y and office.x`) wrongly
            # dropped such positions.
            if office.y is not None and office.x is not None:
                locations.append({'lat': office.y, 'lon': office.x})
            if not extra_geolocation.is_outdated():
                locations.extend(
                    extra_geolocation.geolocations_as_lat_lon_properties())
                office.has_multi_geolocations = True
            else:
                office.has_multi_geolocations = False
            # Apply changes in DB.
            office.save()
            # Apply changes in ElasticSearch; ignore documents that are
            # missing from the index (404).
            body = {'doc': {'locations': locations}}
            es.Elasticsearch().update(
                index=settings.ES_INDEX,
                doc_type=es.OFFICE_TYPE,
                id=office.siret,
                body=body,
                params={'ignore': 404},
            )
def remove_offices():
    """
    Remove offices (overload the data provided by the importer).
    """
    # When returning multiple rows, the SQLAlchemy Query class can only give
    # them out as tuples, so each row is unpacked right in the loop header.
    for (siret,) in db_session.query(OfficeAdminRemove.siret).all():
        # Apply changes in ElasticSearch; a missing document (404) is fine.
        try:
            es.Elasticsearch().delete(index=settings.ES_INDEX,
                                      doc_type=es.OFFICE_TYPE,
                                      id=siret)
        except TransportError as exc:
            if exc.status_code != 404:
                raise

        # Apply changes in DB.
        office = Office.query.filter_by(siret=siret).first()
        if not office:
            continue
        try:
            office.delete()
        except OperationalError:
            # Retry once in case of deadlock error.
            time.sleep(10)
            office.delete()
        # Delete the current PDF.
        pdf_util.delete_file(office)
def test_update_office_boost_flag_all_romes_alternance(self):
    """
    Test `update_offices` boosted flag is present when all romes are boosted
    """
    office_to_update = OfficeAdminUpdate(sirets='00000000000009',
                                         name='Office 9',
                                         boost_alternance=True)
    office_to_update.save()

    script.update_offices()
    es.Elasticsearch().indices.flush()

    params = self.add_security_params({
        'commune_id': self.positions['nantes']['commune_id'],
        'rome_codes': 'D1211',
        'user': '******',
        'contract': 'alternance'
    })
    # `test_request_context` must be *called* to obtain the context manager
    # (the sibling alternance test does so); `with self.test_request_context:`
    # entered the bound method itself.
    with self.test_request_context():
        rv = self.app.get(self.url_for("api.company_list", **params))
        self.assertEqual(rv.status_code, 200, msg=rv.data)
        data_list = json.loads(rv.data.decode())
        self.assertEqual(len(data_list['companies']), 2)
        # 00000000000009 is boosted and is the first result
        self.assertTrue(data_list['companies'][0]['boosted'])
        # 00000000000008 is not boosted and is the second result
        self.assertFalse(data_list['companies'][1]['boosted'])
def setUp(self):
    """
    Build a pristine MySQL schema and ElasticSearch index for each test.
    """
    # Refuse to run against anything but the test environment: these tests
    # drop and recreate the whole database.
    if env.get_current_env() != env.ENV_TEST:
        raise ValueError("Running database tests, but not in test mode. You"
                         " most certainly don't want to do that. Set the"
                         " `LBB_ENV=test` environment variable.")

    # Disable elasticsearch logging.
    for noisy_logger in ('elasticsearch', 'main'):
        logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)

    # Create MySQL tables.
    delete_db()
    init_db()

    # Create ES index, after double-checking we target a test index.
    self.assertIn('test', settings.ES_INDEX)
    self.es = es.Elasticsearch()
    es.drop_and_create_index()

    return super(DatabaseTest, self).setUp()
def test_update_office_boost_flag_specific_romes_alternance(self):
    """
    Test `update_offices` boosted flag is present
    """
    office_to_update = OfficeAdminUpdate(
        sirets='00000000000008',
        name='Office 8',
        boost_alternance=True,
        romes_alternance_to_boost="D1211",  # Boost score only for this ROME.
    )
    office_to_update.save(commit=True)

    # `update_offices` takes no argument (the sibling test calls it bare);
    # passing `OfficeAdminUpdate` here raised a TypeError.
    script.update_offices()
    es.Elasticsearch().indices.flush()

    params = self.add_security_params({
        'commune_id': self.positions['nantes']['commune_id'],
        'rome_codes': 'D1211',
        'user': '******',
        'contract': 'alternance'
    })
    with self.test_request_context():
        rv = self.app.get(self.url_for("api.company_list", **params))
        self.assertEqual(rv.status_code, 200)
        data_list = json.loads(rv.data.decode())
        self.assertEqual(len(data_list['companies']), 2)
        # 00000000000008 should be boosted and be the first result
        self.assertEqual(data_list['companies'][0]['siret'], '00000000000008')
        self.assertTrue(data_list['companies'][0]['boosted'])
        # 00000000000009 should not be boosted and be the second result
        self.assertFalse(data_list['companies'][1]['boosted'])
def add_offices():
    """
    Add offices (complete the data provided by the importer).
    """
    for office_to_add in db_session.query(OfficeAdminAdd).all():
        existing = Office.query.filter_by(siret=office_to_add.siret).first()
        # Only create a new office if it does not already exist.
        # This guarantees that the importer data will always have precedence.
        if existing:
            continue

        # The `headcount` field of an `OfficeAdminAdd` instance has a `code`
        # attribute; fall back to the raw value otherwise.
        headcount = getattr(office_to_add.headcount, 'code',
                            office_to_add.headcount)

        # Create the new office in DB.
        new_office = Office()
        # Use `inspect` because `Office` columns are named distinctly from
        # attributes.
        for column_name in list(inspect(Office).columns.keys()):
            if column_name == 'headcount':
                setattr(new_office, column_name, headcount)
                continue
            try:
                value = getattr(office_to_add, column_name)
            except AttributeError:
                # Some fields are not shared between `Office` and
                # `OfficeAdminAdd`.
                continue
            setattr(new_office, column_name, value)
        db_session.add(new_office)
        db_session.commit()

        # Create the new office in ES.
        doc = get_office_as_es_doc(office_to_add)
        es.Elasticsearch().create(index=settings.ES_INDEX,
                                  doc_type=es.OFFICE_TYPE,
                                  id=office_to_add.siret,
                                  body=doc)
def update_offices():
    """
    Update offices (overload the data provided by the importer).

    For each `OfficeAdminUpdate` row and each SIRET it targets, the matching
    `Office` is patched in the database, its ES document is partially updated
    (in two requests: one resetting the per-ROME score fields, one
    repopulating them), and its cached PDF is deleted.
    """
    # Good engineering eliminates users being able to do the wrong thing as
    # much as possible. But since it is possible to store multiple SIRETs,
    # there is no longer any constraint of uniqueness on a SIRET. As a result,
    # it shouldn't but there may be `n` entries in `OfficeAdminUpdate` for the
    # same SIRET. We order the query by creation date ASC so that the most
    # recent changes take priority over any older ones.
    es_client = es.Elasticsearch()  # hoisted: one client for every update below
    for office_to_update in db_session.query(OfficeAdminUpdate).order_by(
            asc(OfficeAdminUpdate.date_created)).all():
        for siret in OfficeAdminUpdate.as_list(office_to_update.sirets):
            office = Office.query.filter_by(siret=siret).first()
            if not office:
                continue

            # Apply changes in DB. Empty-string sentinels clear a field when
            # the corresponding `remove_*` flag is set; otherwise a falsy new
            # value leaves the old one in place.
            office.company_name = office_to_update.new_company_name or office.company_name
            office.office_name = office_to_update.new_office_name or office.office_name
            office.email = '' if office_to_update.remove_email else (
                office_to_update.new_email or office.email)
            office.tel = '' if office_to_update.remove_phone else (
                office_to_update.new_phone or office.tel)
            office.website = '' if office_to_update.remove_website else (
                office_to_update.new_website or office.website)
            office.email_alternance = office_to_update.email_alternance
            office.phone_alternance = office_to_update.phone_alternance
            office.website_alternance = office_to_update.website_alternance
            # Note: we need to handle when score and score_alternance = 0,
            # hence the explicit `is not None` tests.
            office.score = office_to_update.score if office_to_update.score is not None else office.score
            office.score_alternance = office_to_update.score_alternance if office_to_update.score_alternance is not None else office.score_alternance
            office.social_network = office_to_update.social_network
            office.contact_mode = office_to_update.contact_mode
            office.save()

            # Apply changes in ElasticSearch.
            body = {
                'doc': {
                    'email': office.email,
                    'phone': office.tel,
                    'website': office.website,
                    'flag_alternance': 1 if office.flag_alternance else 0
                }
            }

            scores_by_rome, scores_alternance_by_rome, boosted_romes, boosted_alternance_romes = get_scores_by_rome_and_boosted_romes(
                office, office_to_update)
            if scores_by_rome:
                body['doc']['scores_by_rome'] = scores_by_rome
                body['doc']['boosted_romes'] = boosted_romes
            if scores_alternance_by_rome:
                body['doc']['scores_alternance_by_rome'] = scores_alternance_by_rome
                body['doc']['boosted_alternance_romes'] = boosted_alternance_romes

            # The update API makes partial updates: existing `scalar` fields
            # are overwritten, but `objects` fields are merged together.
            # https://www.elastic.co/guide/en/elasticsearch/guide/1.x/partial-updates.html
            # However `scores_by_rome` and `boosted_romes` need to be
            # overwritten because they may change over time.
            # To do this, we perform 2 requests: the first one resets
            # `scores_by_rome` and `boosted_romes` and the second one
            # populates them. (A dead `delete_body = {'doc': {}}` assignment
            # that was immediately overwritten has been removed.)
            delete_body = {
                'doc': {
                    'scores_by_rome': None,
                    'boosted_romes': None,
                    'scores_alternance_by_rome': None,
                    'boosted_alternance_romes': None
                }
            }

            # Unfortunately these cannot easily be bulked :-(
            # The reason is there is no way to tell bulk to ignore missing
            # documents (404) for a partial update. Tried it and failed it on
            # Oct 2017 @vermeer.
            es_client.update(index=settings.ES_INDEX,
                             doc_type=es.OFFICE_TYPE,
                             id=siret,
                             body=delete_body,
                             params={'ignore': 404})
            es_client.update(index=settings.ES_INDEX,
                             doc_type=es.OFFICE_TYPE,
                             id=siret,
                             body=body,
                             params={'ignore': 404})

            # Delete the current PDF thus it will be regenerated at the next
            # download attempt.
            pdf_util.delete_file(office)