def switch_es_index():
    """
    Context manager that will ensure that some code will operate on a new ES
    index. This new index will then be associated to the reference alias and
    the old index(es) will be dropped.

    Usage:
        with switch_es_index():
            # Here, all code will run on the new index
            run_some_code()
        # Here, the old indexes no longer exist and the reference alias points
        # to the new index

    NOTE(review): this is a generator (it yields once), so it is presumably
    wrapped with `contextlib.contextmanager` where it is defined — the
    decorator is not visible in this chunk; confirm.
    """
    # Find current index names (there may be one, zero or more).
    alias_name = settings.ES_INDEX
    try:
        old_index_names = list(es.Elasticsearch().indices.get_alias(
            settings.ES_INDEX).keys())
    except NotFoundError:
        old_index_names = []

    # Activate new index: all code inside the `with` block sees it via
    # `settings.ES_INDEX`.
    new_index_name = es.get_new_index_name()
    settings.ES_INDEX = new_index_name

    # Create new index.
    es.create_index(new_index_name)

    try:
        yield
    except BaseException:
        # Was a bare `except:`; `BaseException` keeps the exact same semantics
        # (SystemExit/KeyboardInterrupt included) while avoiding the bare-except
        # anti-pattern. Delete the newly created index, then propagate.
        es.drop_index(new_index_name)
        raise
    finally:
        # Set back alias name, whether the block succeeded or not.
        settings.ES_INDEX = alias_name

    # Switch alias (only reached on success).
    # TODO this should be done in one shot with a function in es.py module
    es.add_alias_to_index(new_index_name)
    for old_index_name in old_index_names:
        es.Elasticsearch().indices.delete_alias(index=old_index_name,
                                                name=alias_name)

    # Delete old index(es).
    for old_index_name in old_index_names:
        es.drop_index(old_index_name)
def update_offices_geolocations():
    """
    Remove or add extra geolocations to offices.

    New geolocations are entered into the system through the
    `OfficeAdminExtraGeoLocation` table. Each matching office is updated both
    in the database (`has_multi_geolocations` flag) and in ElasticSearch
    (`locations` field of its document).
    """
    for extra_geolocation in db_session.query(OfficeAdminExtraGeoLocation).all():
        office = Office.query.filter_by(siret=extra_geolocation.siret).first()
        if office:
            locations = []
            # Compare against None explicitly: 0.0 is a legitimate coordinate
            # (France crosses the Greenwich meridian, i.e. longitude 0) and the
            # previous truthiness test (`if office.y and office.x`) wrongly
            # dropped such positions.
            if office.y is not None and office.x is not None:
                locations.append({'lat': office.y, 'lon': office.x})
            if not extra_geolocation.is_outdated():
                locations.extend(
                    extra_geolocation.geolocations_as_lat_lon_properties())
                office.has_multi_geolocations = True
            else:
                office.has_multi_geolocations = False
            # Apply changes in DB.
            office.save()
            # Apply changes in ElasticSearch; ignore documents that are
            # missing from the index (404).
            body = {'doc': {'locations': locations}}
            es.Elasticsearch().update(
                index=settings.ES_INDEX,
                doc_type=es.OFFICE_TYPE,
                id=office.siret,
                body=body,
                params={'ignore': 404},
            )
def remove_offices():
    """
    Remove offices (overload the data provided by the importer).
    """
    # When returning multiple rows, the SQLAlchemy Query class can only give
    # them out as tuples, so each row is unpacked right in the loop header.
    for (siret,) in db_session.query(OfficeAdminRemove.siret).all():
        # Apply changes in ElasticSearch; a missing document (404) is fine.
        try:
            es.Elasticsearch().delete(index=settings.ES_INDEX,
                                      doc_type=es.OFFICE_TYPE,
                                      id=siret)
        except TransportError as exc:
            if exc.status_code != 404:
                raise

        # Apply changes in DB.
        office = Office.query.filter_by(siret=siret).first()
        if not office:
            continue
        try:
            office.delete()
        except OperationalError:
            # Retry once in case of deadlock error.
            time.sleep(10)
            office.delete()
        # Delete the current PDF.
        pdf_util.delete_file(office)
def test_update_office_boost_flag_all_romes_alternance(self):
    """
    Test `update_offices` boosted flag is present when all romes are boosted
    """
    office_to_update = OfficeAdminUpdate(sirets='00000000000009',
                                         name='Office 9',
                                         boost_alternance=True)
    office_to_update.save()

    script.update_offices()
    es.Elasticsearch().indices.flush()

    params = self.add_security_params({
        'commune_id': self.positions['nantes']['commune_id'],
        'rome_codes': 'D1211',
        'user': '******',
        'contract': 'alternance'
    })
    # `test_request_context` must be *called* to obtain the context manager
    # (the sibling alternance test does so); `with self.test_request_context:`
    # entered the bound method itself.
    with self.test_request_context():
        rv = self.app.get(self.url_for("api.company_list", **params))
        self.assertEqual(rv.status_code, 200, msg=rv.data)
        data_list = json.loads(rv.data.decode())
        self.assertEqual(len(data_list['companies']), 2)
        # 00000000000009 is boosted and is the first result
        self.assertTrue(data_list['companies'][0]['boosted'])
        # 00000000000008 is not boosted and is the second result
        self.assertFalse(data_list['companies'][1]['boosted'])
def setUp(self):
    """
    Build a pristine MySQL schema and ElasticSearch index for each test.
    """
    # Refuse to run against anything but the test environment: these tests
    # drop and recreate the whole database.
    if env.get_current_env() != env.ENV_TEST:
        raise ValueError("Running database tests, but not in test mode. You"
                         " most certainly don't want to do that. Set the"
                         " `LBB_ENV=test` environment variable.")

    # Disable elasticsearch logging.
    for noisy_logger in ('elasticsearch', 'main'):
        logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)

    # Create MySQL tables.
    delete_db()
    init_db()

    # Create ES index, after double-checking we target a test index.
    self.assertIn('test', settings.ES_INDEX)
    self.es = es.Elasticsearch()
    es.drop_and_create_index()

    return super(DatabaseTest, self).setUp()
def test_update_office_boost_flag_specific_romes_alternance(self):
    """
    Test `update_offices` boosted flag is present
    """
    office_to_update = OfficeAdminUpdate(
        sirets='00000000000008',
        name='Office 8',
        boost_alternance=True,
        romes_alternance_to_boost="D1211",  # Boost score only for this ROME.
    )
    office_to_update.save(commit=True)

    # `update_offices` takes no argument (the sibling test calls it bare);
    # passing `OfficeAdminUpdate` here raised a TypeError.
    script.update_offices()
    es.Elasticsearch().indices.flush()

    params = self.add_security_params({
        'commune_id': self.positions['nantes']['commune_id'],
        'rome_codes': 'D1211',
        'user': '******',
        'contract': 'alternance'
    })
    with self.test_request_context():
        rv = self.app.get(self.url_for("api.company_list", **params))
        self.assertEqual(rv.status_code, 200)
        data_list = json.loads(rv.data.decode())
        self.assertEqual(len(data_list['companies']), 2)
        # 00000000000008 should be boosted and be the first result
        self.assertEqual(data_list['companies'][0]['siret'], '00000000000008')
        self.assertTrue(data_list['companies'][0]['boosted'])
        # 00000000000009 should not be boosted and be the second result
        self.assertFalse(data_list['companies'][1]['boosted'])
def add_offices():
    """
    Add offices (complete the data provided by the importer).
    """
    for office_to_add in db_session.query(OfficeAdminAdd).all():
        existing = Office.query.filter_by(siret=office_to_add.siret).first()
        # Only create a new office if it does not already exist.
        # This guarantees that the importer data will always have precedence.
        if existing:
            continue

        # The `headcount` field of an `OfficeAdminAdd` instance has a `code`
        # attribute; fall back to the raw value otherwise.
        headcount = getattr(office_to_add.headcount, 'code',
                            office_to_add.headcount)

        # Create the new office in DB.
        new_office = Office()
        # Use `inspect` because `Office` columns are named distinctly from
        # attributes.
        for column_name in list(inspect(Office).columns.keys()):
            if column_name == 'headcount':
                setattr(new_office, column_name, headcount)
                continue
            try:
                value = getattr(office_to_add, column_name)
            except AttributeError:
                # Some fields are not shared between `Office` and
                # `OfficeAdminAdd`.
                continue
            setattr(new_office, column_name, value)
        db_session.add(new_office)
        db_session.commit()

        # Create the new office in ES.
        doc = get_office_as_es_doc(office_to_add)
        es.Elasticsearch().create(index=settings.ES_INDEX,
                                  doc_type=es.OFFICE_TYPE,
                                  id=office_to_add.siret,
                                  body=doc)
def update_offices():
    """
    Update offices (overload the data provided by the importer).

    For each `OfficeAdminUpdate` row and each SIRET it targets, the matching
    `Office` is patched in the database, its ES document is partially updated
    (in two requests: one resetting the per-ROME score fields, one
    repopulating them), and its cached PDF is deleted.
    """
    # Good engineering eliminates users being able to do the wrong thing as
    # much as possible. But since it is possible to store multiple SIRETs,
    # there is no longer any constraint of uniqueness on a SIRET. As a result,
    # it shouldn't but there may be `n` entries in `OfficeAdminUpdate` for the
    # same SIRET. We order the query by creation date ASC so that the most
    # recent changes take priority over any older ones.
    es_client = es.Elasticsearch()  # hoisted: one client for every update below
    for office_to_update in db_session.query(OfficeAdminUpdate).order_by(
            asc(OfficeAdminUpdate.date_created)).all():
        for siret in OfficeAdminUpdate.as_list(office_to_update.sirets):
            office = Office.query.filter_by(siret=siret).first()
            if not office:
                continue

            # Apply changes in DB. Empty-string sentinels clear a field when
            # the corresponding `remove_*` flag is set; otherwise a falsy new
            # value leaves the old one in place.
            office.company_name = office_to_update.new_company_name or office.company_name
            office.office_name = office_to_update.new_office_name or office.office_name
            office.email = '' if office_to_update.remove_email else (
                office_to_update.new_email or office.email)
            office.tel = '' if office_to_update.remove_phone else (
                office_to_update.new_phone or office.tel)
            office.website = '' if office_to_update.remove_website else (
                office_to_update.new_website or office.website)
            office.email_alternance = office_to_update.email_alternance
            office.phone_alternance = office_to_update.phone_alternance
            office.website_alternance = office_to_update.website_alternance
            # Note: we need to handle when score and score_alternance = 0,
            # hence the explicit `is not None` tests.
            office.score = office_to_update.score if office_to_update.score is not None else office.score
            office.score_alternance = office_to_update.score_alternance if office_to_update.score_alternance is not None else office.score_alternance
            office.social_network = office_to_update.social_network
            office.contact_mode = office_to_update.contact_mode
            office.save()

            # Apply changes in ElasticSearch.
            body = {
                'doc': {
                    'email': office.email,
                    'phone': office.tel,
                    'website': office.website,
                    'flag_alternance': 1 if office.flag_alternance else 0
                }
            }

            scores_by_rome, scores_alternance_by_rome, boosted_romes, boosted_alternance_romes = get_scores_by_rome_and_boosted_romes(
                office, office_to_update)
            if scores_by_rome:
                body['doc']['scores_by_rome'] = scores_by_rome
                body['doc']['boosted_romes'] = boosted_romes
            if scores_alternance_by_rome:
                body['doc']['scores_alternance_by_rome'] = scores_alternance_by_rome
                body['doc']['boosted_alternance_romes'] = boosted_alternance_romes

            # The update API makes partial updates: existing `scalar` fields
            # are overwritten, but `objects` fields are merged together.
            # https://www.elastic.co/guide/en/elasticsearch/guide/1.x/partial-updates.html
            # However `scores_by_rome` and `boosted_romes` need to be
            # overwritten because they may change over time.
            # To do this, we perform 2 requests: the first one resets
            # `scores_by_rome` and `boosted_romes` and the second one
            # populates them. (A dead `delete_body = {'doc': {}}` assignment
            # that was immediately overwritten has been removed.)
            delete_body = {
                'doc': {
                    'scores_by_rome': None,
                    'boosted_romes': None,
                    'scores_alternance_by_rome': None,
                    'boosted_alternance_romes': None
                }
            }

            # Unfortunately these cannot easily be bulked :-(
            # The reason is there is no way to tell bulk to ignore missing
            # documents (404) for a partial update. Tried it and failed it on
            # Oct 2017 @vermeer.
            es_client.update(index=settings.ES_INDEX,
                             doc_type=es.OFFICE_TYPE,
                             id=siret,
                             body=delete_body,
                             params={'ignore': 404})
            es_client.update(index=settings.ES_INDEX,
                             doc_type=es.OFFICE_TYPE,
                             id=siret,
                             body=body,
                             params={'ignore': 404})

            # Delete the current PDF thus it will be regenerated at the next
            # download attempt.
            pdf_util.delete_file(office)