def deduplicate_users():
    """
    Merge `User` rows that share the same `external_id`.

    For each external_id having more than one User row, keep the first row
    returned by the group-by query as the "original" user, copy any favorite
    offices from the duplicates onto it, then delete the duplicates and their
    `UserSocialAuth` rows.

    NOTE(review): `sa` is not imported inside this function — presumably
    `import sqlalchemy as sa` exists at module level; verify.
    NOTE(review): no `db_session.commit()` is visible after the deletes —
    confirm the caller commits, otherwise the deletions may be lost.
    """
    # We import the app to initialize the social models
    import labonneboite.web.app  # pylint: disable=unused-import,unused-variable
    from labonneboite.common.database import db_session
    from labonneboite.common.models import auth
    from labonneboite.common.models.user_favorite_offices import UserFavoriteOffice

    # Iterate on duplicated users: one representative row per external_id
    # that appears more than once.
    for user in auth.User.query.group_by('external_id').having(
            sa.func.count(auth.User.external_id) > 1):
        duplicate_user_ids = []
        favorite_count = 0
        # Create favorites, if necessary: re-attach each duplicate's
        # favorites to the original user. `get_or_create` makes this
        # idempotent (no duplicate favorite rows).
        for duplicate_user in auth.User.query.filter(
                auth.User.external_id == user.external_id,
                auth.User.id != user.id):
            duplicate_user_ids.append(duplicate_user.id)
            for favorite in duplicate_user.favorite_offices:
                _, created = UserFavoriteOffice.get_or_create(
                    user_id=user.id, office_siret=favorite.office_siret)
                if created:
                    favorite_count += 1
        print(
            "Removing {} duplicates for user #{} ({} favorite added to original user)"
            .format(len(duplicate_user_ids), user.id, favorite_count))
        # Remove duplicate social user rows first (they hold a FK to User).
        db_session.query(auth.UserSocialAuth).filter(
            auth.UserSocialAuth.user_id.in_(duplicate_user_ids)).delete(
                synchronize_session=False)
        # Remove duplicate user rows.
        auth.User.query.filter(auth.User.id.in_(duplicate_user_ids)).delete(
            synchronize_session=False)
def test_delete_user_account(self):
    """
    A logged-in user can delete his account and is then logged out via PEAM.
    """
    url = self.url_for('user.account_delete')

    with self.test_request_context():
        self.login(self.user)

        # The confirmation page is reachable while logged in.
        response = self.app.get(url)
        self.assertEqual(response.status_code, 200)

        # Posting the confirmation triggers the deletion.
        response = self.app.post(url, data={'confirm_deletion': 1})

        # A redirect to the PEAM logout endpoint is expected, carrying the
        # user's `id_token`.
        self.assertEqual(response.status_code, 302)
        self.assertIn(settings.PEAM_AUTH_BASE_URL, response.location)
        self.assertIn(self.user_social_auth.extra_data['id_token'], response.location)

        # The user and all of his related rows are gone from the DB.
        self.assertEqual(db_session.query(User).count(), 0)
        self.assertEqual(db_session.query(UserFavoriteOffice).count(), 0)
        self.assertEqual(db_session.query(UserSocialAuth).count(), 0)

        # Being anonymous again, the protected page is no longer accessible.
        response = self.app.get(url)
        self.assertEqual(response.status_code, 401)
def setUp(self):
    """
    Populate the DB with the fixtures these tests rely on: one user, two
    offices, one social auth entry and two favorites.
    """
    super(UserAccountTest, self).setUp()

    self.user = User(email='*****@*****.**', gender='male',
                     first_name='John', last_name='Doe')
    db_session.add(self.user)
    db_session.flush()

    # Both offices share everything except their SIRET.
    shared_office_fields = dict(
        departement='57',
        company_name='1',
        headcount='5',
        city_code='57070',
        zipcode='57070',
        naf='4646Z',
        score=90,
        x=6.166667,
        y=49.133333,
    )
    self.office1 = Office(siret='00000000000001', **shared_office_fields)
    self.office2 = Office(siret='00000000000002', **shared_office_fields)
    db_session.add_all([self.office1, self.office2])
    db_session.flush()

    self.user_social_auth = UserSocialAuth(
        provider=PEAMOpenIdConnect.name,
        extra_data={'id_token': 'fake'},
        user_id=self.user.id,
    )
    self.fav1 = UserFavoriteOffice(user_id=self.user.id,
                                   office_siret=self.office1.siret)
    self.fav2 = UserFavoriteOffice(user_id=self.user.id,
                                   office_siret=self.office2.siret)
    db_session.add_all([self.user_social_auth, self.fav1, self.fav2])
    db_session.flush()
    db_session.commit()

    # Sanity-check the fixture set.
    self.assertEqual(db_session.query(User).count(), 1)
    self.assertEqual(db_session.query(Office).count(), 2)
    self.assertEqual(db_session.query(UserFavoriteOffice).count(), 2)
    self.assertEqual(db_session.query(UserSocialAuth).count(), 1)
def test_office_admin_add(self):
    """
    An admin user can create an `OfficeAdminAdd` entry through the admin
    views, then delete it again.
    """
    form = {
        "siret": "78548035101646",
        "company_name": "SUPERMARCHES MATCH",
        "office_name": "SUPERMARCHES MATCH",
        "naf": "4711D",
        "street_number": "45",
        "street_name": "AVENUE ANDRE MALRAUX",
        "city_code": "57463",
        "zipcode": "57000",
        "email": "*****@*****.**",
        "tel": "0387787878",
        "website": "http://www.supermarchesmatch.fr",
        "flag_alternance": 0,
        "flag_junior": 0,
        "flag_senior": 0,
        "flag_handicap": 0,
        "departement": "57",
        "headcount": "12",
        "score": 90,
        "score_alternance": 75,
        "x": 6.17952,
        "y": 49.1044,
        "reason": "Demande de mise en avant",
    }

    with self.test_request_context():
        # Create an admin user.
        self.user = User(email='*****@*****.**', gender='male',
                         first_name='John', last_name='Doe', active=True,
                         is_admin=True)
        db_session.add(self.user)
        db_session.flush()
        user_social_auth = UserSocialAuth(
            provider=PEAMOpenIdConnect.name,
            extra_data={'id_token': 'fake'},
            user_id=self.user.id,
        )
        db_session.add(user_social_auth)
        db_session.commit()

        # Login as user admin
        self.user = db_session.query(User).filter_by(id=self.user.id).first()
        self.assertEqual(db_session.query(User).count(), 1)
        self.login(self.user)

        # Create OfficeAdminAdd
        self.assertEqual(0, OfficeAdminAdd.query.filter_by(id=1).count())
        self.app.post(url_for('officeadminadd.create_view'), data=form)
        self.assertEqual(1, OfficeAdminAdd.query.filter_by(id=1).count())

        # Delete OfficeAdminAdd
        self.app.post(url_for('officeadminadd.delete_view'), data={'id': 1})
        # Bug fix: the original asserted on `OfficeAdminRemove` here
        # (copy-paste from the sibling test), which trivially passed and
        # never verified the deletion. Assert on `OfficeAdminAdd`.
        self.assertEqual(0, OfficeAdminAdd.query.filter_by(id=1).count())
def test_office_admin_remove(self):
    """
    An admin user can create an `OfficeAdminRemove` entry through the admin
    views, then delete it again.
    """
    payload = {
        'siret': '01234567891234',
        'name': 'Test company',
        'reason': 'N/A',
        'initiative': 'office',
    }

    with self.test_request_context():
        # Create an admin user and his social auth entry.
        self.user = User(email='*****@*****.**', gender='male',
                         first_name='John', last_name='Doe', active=True,
                         is_admin=True)
        db_session.add(self.user)
        db_session.flush()
        social_auth = UserSocialAuth(
            provider=PEAMOpenIdConnect.name,
            extra_data={'id_token': 'fake'},
            user_id=self.user.id,
        )
        db_session.add(social_auth)
        db_session.commit()

        # Log in as this admin.
        self.user = db_session.query(User).filter_by(id=self.user.id).first()
        self.assertEqual(db_session.query(User).count(), 1)
        self.login(self.user)

        # The entry does not exist yet; create it through the admin view.
        self.assertEqual(
            0,
            OfficeAdminRemove.query.filter_by(siret='01234567891234').count())
        self.app.post(url_for('officeadminremove.create_view'), data=payload)
        self.assertEqual(
            1,
            OfficeAdminRemove.query.filter_by(siret='01234567891234').count())

        # Delete it through the admin view and check it is gone.
        self.app.post(url_for('officeadminremove.delete_view'), data={'id': 1})
        self.assertEqual(0, OfficeAdminRemove.query.filter_by(id=1).count())
def get_or_create(cls, defaults=None, **kwargs):
    """
    Fetch the instance matching `kwargs`, creating it if necessary.

    `defaults` supplies extra attributes used only at creation time.
    Returns an `(instance, created)` tuple where `created` tells whether
    a new row was inserted.
    """
    try:
        return db_session.query(cls).filter_by(**kwargs).one(), False
    except NoResultFound:
        # Not found: build a new instance, folding `defaults` into the
        # creation attributes (note: `kwargs` is deliberately mutated so
        # the fallback lookup below also matches on the defaults).
        if defaults:
            kwargs.update(defaults)
        new_instance = cls(**kwargs)
        try:
            db_session.add(new_instance)
            db_session.commit()
        except IntegrityError:
            # Lost a race with a concurrent insert: fetch the winner.
            db_session.rollback()
            return db_session.query(cls).filter_by(**kwargs).one(), True
        return new_instance, True
def get_user_social_auth(user_id, provider=PEAMOpenIdConnect.name):
    """
    Return the most recent `UserSocialAuth` row for `user_id` and
    `provider`, or None if there is none.
    """
    matching = db_session.query(UserSocialAuth).filter_by(
        user_id=user_id, provider=provider)
    # Highest id == latest row.
    return matching.order_by(desc(UserSocialAuth.id)).first()
def remove_offices(): """ Remove offices (overload the data provided by the importer). """ # When returning multiple rows, the SQLAlchemy Query class can only give them out as tuples. # We need to unpack them explicitly. offices_to_remove = [ siret for (siret, ) in db_session.query(OfficeAdminRemove.siret).all() ] for siret in offices_to_remove: # Apply changes in ElasticSearch. try: es.Elasticsearch().delete(index=settings.ES_INDEX, doc_type=es.OFFICE_TYPE, id=siret) except TransportError as e: if e.status_code != 404: raise # Apply changes in DB. office = Office.query.filter_by(siret=siret).first() if office: try: office.delete() except OperationalError: # retry once in case of deadlock error time.sleep(10) office.delete() # Delete the current PDF. pdf_util.delete_file(office)
def update_offices_geolocations():
    """
    Remove or add extra geolocations to offices.
    New geolocations are entered into the system through the
    `OfficeAdminExtraGeoLocation` table.

    For each entry, the office's location list is rebuilt from scratch
    (its own x/y plus the non-outdated extra geolocations) and pushed
    both to the DB flag `has_multi_geolocations` and to the ES document.
    """
    for extra_geolocation in db_session.query(
            OfficeAdminExtraGeoLocation).all():
        office = Office.query.filter_by(siret=extra_geolocation.siret).first()
        if office:
            locations = []
            # The office's own coordinates always come first, when present.
            if office.y and office.x:
                locations.append({'lat': office.y, 'lon': office.x})
            if not extra_geolocation.is_outdated():
                locations.extend(
                    extra_geolocation.geolocations_as_lat_lon_properties())
                office.has_multi_geolocations = True
            else:
                # Outdated entry: fall back to the single native location.
                office.has_multi_geolocations = False
            # Apply changes in DB.
            office.save()
            # Apply changes in ElasticSearch. `ignore: 404` tolerates
            # offices absent from the index.
            body = {'doc': {'locations': locations}}
            es.Elasticsearch().update(
                index=settings.ES_INDEX,
                doc_type=es.OFFICE_TYPE,
                id=office.siret,
                body=body,
                params={'ignore': 404},
            )
def logout(user_social_auth=None):
    """
    Log a user out.

    Param `user_social_auth`: a `UserSocialAuth` instance. `None` most of the time,
    except when a user is coming from the `user.account_delete` view. This param
    is intended to be passed when the view is called directly as a Python function,
    i.e. not with a `redirect()`.
    """
    if not current_user.is_authenticated:
        return redirect(url_for('root.home'))

    logged_with_peam = session.get(
        'social_auth_last_login_backend') == PEAMOpenIdConnect.name

    if logged_with_peam:
        if not user_social_auth:
            user_social_auth = get_user_social_auth(current_user.id)
        if user_social_auth:
            # Captured now because the row is deleted just below, and the
            # token is still needed for the PEAM logout redirect at the end.
            id_token = user_social_auth.extra_data['id_token']

    # Force delete PEAMU token.
    db_session.query(UserSocialAuth).filter_by(user_id=current_user.id).delete()
    db_session.commit()

    # Log the user out and destroy the LBB session.
    activity.log('deconnexion')
    logout_user()

    # Clean the session: drop Python Social Auth info because it isn't done by `logout_user`.
    if 'social_auth_last_login_backend' in session:
        # Some backends have a `backend-name_state` stored in session as required by e.g. Oauth2.
        social_auth_state_key = '%s_state' % session['social_auth_last_login_backend']
        if social_auth_state_key in session:
            session.pop(social_auth_state_key)
        session.pop('social_auth_last_login_backend')

    # Log the user out from PEAM and destroy the PEAM session.
    # (`id_token` is guaranteed to be bound here: this branch requires both
    # `logged_with_peam` and a non-None `user_social_auth`.)
    if logged_with_peam and user_social_auth:
        params = {
            'id_token_hint': id_token,
            'redirect_uri': url_for('auth.logout_from_peam_callback', _external=True),
        }
        peam_logout_url = '%s/compte/deconnexion?%s' % (
            settings.PEAM_AUTH_BASE_URL, urlencode(params))
        # After this redirect, the user will be redirected to the LBB website `logout_from_peam_callback` route.
        return redirect(peam_logout_url)

    return redirect(url_for('root.home'))
def create_offices_for_departement(departement):
    """
    Populate the `office` type in ElasticSearch with offices having given departement.

    Offices passing either the LBB or the LBA score threshold are converted
    to ES documents and bulk-indexed; progress counters and timing stats are
    updated along the way.
    """
    actions = []

    logger.info("STARTED indexing offices for departement=%s ...", departement)

    # For LBB we apply two thresholds to show an office:
    # 1) its global all-rome-included score should be at least SCORE_REDUCING_MINIMUM_THRESHOLD
    # 2) its score adapted to requested rome should be at least SCORE_FOR_ROME_MINIMUM
    # For LBA we only apply the second threshold (SCORE_ALTERNANCE_FOR_ROME_MINIMUM)
    # and no longer apply the all-rome-included score threshold, in order to include
    # more relevant smaller companies.
    all_offices = db_session.query(Office).filter(
        and_(
            Office.departement == departement,
            or_(
                Office.score >= importer_settings.SCORE_REDUCING_MINIMUM_THRESHOLD,
                # Fetching offices with score lower than SCORE_ALTERNANCE_FOR_ROME_MINIMUM
                # would be a waste of resources as score-for-rome will always be less or
                # equal to all-rome-included score.
                Office.score_alternance >= scoring_util.SCORE_ALTERNANCE_FOR_ROME_MINIMUM,
            ),
        )).all()

    for office in all_offices:
        st.increment_office_count()
        es_doc = get_office_as_es_doc(office)
        # Only index offices that ended up with at least one per-rome score
        # (LBB or LBA); others would never be returned by a search anyway.
        office_is_reachable = ('scores_by_rome' in es_doc) or ('scores_alternance_by_rome' in es_doc)
        if office_is_reachable:
            st.increment_indexed_office_count()
            actions.append({
                '_op_type': 'index',
                '_index': settings.ES_INDEX,
                '_type': es.OFFICE_TYPE,
                '_id': office.siret,
                '_source': es_doc,
            })

    bulk_actions(actions)

    completed_jobs_counter.increment()
    logger.info(
        "COMPLETED indexing offices for departement=%s (%s of %s jobs completed)",
        departement,
        completed_jobs_counter.value,
        len(dpt.DEPARTEMENTS),
    )

    display_performance_stats(departement)
def user_favs_as_sirets(cls, user):
    """
    Return the user's favorite offices as a list of SIRET strings.

    Useful to check if an office is already in the favorites of a user.
    Anonymous users have no favorites, hence an empty list.
    """
    if user.is_anonymous:
        return []
    favorites = db_session.query(cls).filter_by(user_id=user.id)
    return [favorite.office_siret for favorite in favorites]
def get_user_social_auth(user_id):
    """
    Return the latest `UserSocialAuth` instance for the given `user_id`,
    or None when the user has none.
    """
    rows_for_user = db_session.query(UserSocialAuth).filter_by(user_id=user_id)
    # Highest id == most recently created.
    return rows_for_user.order_by(desc(UserSocialAuth.id)).first()
def test_get_user_social_auth(self):
    """
    `get_user_social_auth()` returns the social auth row of a user.
    """
    user = User(email='*****@*****.**', gender='male',
                first_name='John', last_name='Doe')
    db_session.add(user)
    db_session.flush()

    expected = UserSocialAuth(provider=PEAMOpenIdConnect.name,
                              extra_data=None,
                              user_id=user.id)
    db_session.add(expected)
    db_session.flush()
    db_session.commit()

    # Fixture sanity checks.
    self.assertEqual(db_session.query(User).count(), 1)
    self.assertEqual(db_session.query(UserSocialAuth).count(), 1)

    found = get_user_social_auth(user.id)
    self.assertEqual(found.id, expected.id)
def account_delete():
    """
    Ask for a confirmation, then delete the current user account and all of
    its information.

    GET renders the confirmation page; a validated POST deletes the
    `UserSocialAuth` row and the `User` row (favorites cascade), then hands
    off to `logout()` so the PEAM session is terminated too.
    """
    form = UserAccountDeleteForm(request.form)
    if request.method == 'POST' and form.validate():

        # Store the current `UserSocialAuth` instance in memory because it will be deleted
        # but it will also be needed later to properly logout the user from PEAM.
        user_social_auth = get_user_social_auth(current_user.id)

        # Now we can safely delete the current `UserSocialAuth` instance.
        # We have to delete it because it has a foreign key to the User table.
        # We don't need to deal with the other tables of Social Auth, see:
        # https://python-social-auth.readthedocs.io/en/latest/storage.html
        db_session.query(UserSocialAuth).filter_by(
            user_id=current_user.id).delete()

        # Delete the current user.
        # The user's favorites will be deleted at the same time because of the `ondelete='CASCADE'`
        # on the `user_id` field of the `UserFavoriteOffice` model.
        db_session.query(User).filter_by(id=current_user.id).delete()

        db_session.commit()

        message = "La suppression de votre compte a bien été effectuée."
        flash(message, 'warning')

        # Return the `logout` view directly. It allows us to pass the full
        # `user_social_auth` object as a parameter.
        return logout(user_social_auth=user_social_auth)

    context = {
        'form': form,
    }
    return render_template('user/account_delete.html', **context)
def user_favs_as_csv(cls, user):
    """
    Return the user's favorite offices serialized as CSV text.

    The header row is always present; anonymous users get the header only.
    """
    header_row = cls.as_csv_header_row()
    if user.is_anonymous:
        return header_row
    favorites = db_session.query(cls).filter_by(user_id=user.id)
    data_rows = "\r\n".join(favorite.as_csv_row() for favorite in favorites)
    return "%s\r\n%s" % (header_row, data_rows)
def setUp(self, *args, **kwargs):
    """
    Create a test user plus the social auth entry required by `self.logout`.
    """
    super(AdminTest, self).setUp(*args, **kwargs)

    self.user = User(email='*****@*****.**', gender='male',
                     first_name='John', last_name='Doe')
    db_session.add(self.user)
    db_session.flush()

    # `self.logout` reads the `extra_data` attribute of the social auth
    # entry, so one must exist for the user.
    social_auth = UserSocialAuth(
        provider=PEAMOpenIdConnect.name,
        extra_data={'id_token': 'fake'},
        user_id=self.user.id,
    )
    db_session.add(social_auth)
    db_session.commit()

    self.assertEqual(db_session.query(User).count(), 1)
def test_clean(self):
    """
    `OfficeAdminExtraGeoLocation.clean()` runs automatically when saving.
    """
    db_session.add(OfficeAdminExtraGeoLocation(
        siret="38524664000176",
        codes="75110\n\n\n\n\n\n\n57616",
        reason="Paris 10 + Metz Saint Julien",
    ))
    db_session.commit()

    # Read the row back: `clean()` should have normalized it on save.
    saved = db_session.query(OfficeAdminExtraGeoLocation).first()

    # The blank lines between codes have been collapsed away.
    self.assertEqual(saved.codes, '57616\n75110')
    # Each code has been resolved to its lat/lon coordinates.
    self.assertEqual(
        saved.geolocations,
        '[[49.135208952059884, 6.207906756168173], [48.8815994262695, 2.36229991912841]]'
    )
def after_check(self):
    """
    Sanity-check the extracted raw offices: the expected number of distinct
    departements must be present, and each departement must hold a minimum
    number of companies.
    """
    departements_query = db_session.query(
        RawOffice.departement.distinct().label("departement"))
    departements = [row.departement for row in departements_query.all()]

    if len(departements) != settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES:
        msg = "wrong number of departements : %s instead of expected %s" % (
            len(departements),
            settings.DISTINCT_DEPARTEMENTS_HAVING_OFFICES
        )
        raise Exception(msg)

    # FIXME parallelize for better performance
    for departement in departements:
        count = RawOffice.query.filter_by(departement=departement).count()
        logger.info("number of companies in departement %s : %i", departement, count)
        if count < settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT:
            msg = "too few companies in departement : %s instead of expected %s" % (
                count,
                settings.MINIMUM_OFFICES_TO_BE_EXTRACTED_PER_DEPARTEMENT
            )
            raise Exception(msg)
def add_offices():
    """
    Add offices (complete the data provided by the importer).

    Each `OfficeAdminAdd` row whose SIRET is not already present in `Office`
    is copied into a new `Office` row and indexed in ElasticSearch. Existing
    offices are left untouched so importer data keeps precedence.
    """
    for office_to_add in db_session.query(OfficeAdminAdd).all():
        office = Office.query.filter_by(siret=office_to_add.siret).first()

        # Only create a new office if it does not already exist.
        # This guarantees that the importer data will always have precedence.
        if not office:

            # The `headcount` field of an `OfficeAdminAdd` instance has a `code` attribute.
            if hasattr(office_to_add.headcount, 'code'):
                headcount = office_to_add.headcount.code
            else:
                headcount = office_to_add.headcount

            # Create the new office in DB.
            new_office = Office()
            # Use `inspect` because `Office` columns are named distinctly from attributes.
            for field_name in list(inspect(Office).columns.keys()):
                try:
                    value = getattr(office_to_add, field_name)
                except AttributeError:
                    # Some fields are not shared between `Office` and `OfficeAdminAdd`.
                    continue
                if field_name == 'headcount':
                    value = headcount
                setattr(new_office, field_name, value)
            db_session.add(new_office)
            db_session.commit()

            # Create the new office in ES. The ES document is built from
            # `office_to_add` (not `new_office`), which carries the same data.
            doc = get_office_as_es_doc(office_to_add)
            es.Elasticsearch().create(index=settings.ES_INDEX,
                                      doc_type=es.OFFICE_TYPE,
                                      id=office_to_add.siret,
                                      body=doc)
def run_task(self):
    """
    Import alternance (lba) contracts from `self.input_filename` into the
    hirings table, batching inserts and recording `DpaeStatistics` at the end.

    Returns True when at least one batch of statements was executed.
    """
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date
    # e.g. '20200803ExtractApp'
    # will match 20200803
    date_string = self.input_filename.split('/')[-1][0:8]
    try:
        self.last_historical_data_date_in_file = datetime.strptime(date_string, "%Y%m%d")
    except ValueError:
        raise Exception("couldn't find a date pattern in filename. filename should be \
like 20200803ExtractApp.csv")
    count = 0
    statements = []
    something_new = False
    # NOTE(review): table name interpolated via %; HIRING_TABLE is a trusted
    # setting, not user input.
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec,
            date_insertion
            )
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_alternance_contracts = 0
    imported_alternance_contracts_distribution = {}
    not_imported_alternance_contracts = 0
    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date))\
        .filter(Hiring.contract_type == self.contract_type).first()[0]
    logger.info("will now extract all alternance contracts with hiring_date between %s and %s",
                last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip()  # FIXME detect column positions from header
        if b"SIRET" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 10000:
                # Flush the pending batch every 10000 lines.
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:  # NOTE(review): bare except — logs then re-raises, so nothing is swallowed.
                    logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
                    statements = []
                    raise
            try:
                siret, hiring_date, departement = parse_alternance_line(line)
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue
            except InvalidSiretException:
                error_message = traceback.format_exc()
                logger.info("invalid siret met at row: %i", count)
                logger.info(error_message)
                self.invalid_siret_errors += 1
                continue
            except InvalidZipCodeException:
                logger.info("invalid zip code met at row: %i", count)
                self.invalid_zipcode_errors += 1
                continue

            # This part of code is useless :
            # The data used has a lot of late contracts inputs
            # So we have to insert ALL the contracts from different dates
            # alternance_contract_should_be_imported = (
            #     hiring_date > last_historical_data_date_in_db
            #     and hiring_date <= self.last_historical_data_date_in_file
            # )

            # Unlike the DPAE import, late (older-dated) contracts are kept:
            # only contracts dated after the file's own date are excluded.
            if hiring_date <= self.last_historical_data_date_in_file:
                statement = (
                    siret,
                    hiring_date,
                    self.contract_type,
                    departement,
                    None,  # contract_duration
                    None,  # iiann
                    None,  # tranche_age
                    None,  # handicap_label
                    None,  # duree_pec
                    date_insertion
                )
                statements.append(statement)
                imported_alternance_contracts += 1

                # Track a year/month/day histogram of the imported contracts.
                if hiring_date.year not in imported_alternance_contracts_distribution:
                    imported_alternance_contracts_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_alternance_contracts_distribution[hiring_date.year]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month] = {}
                if hiring_date.day not in imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month]:
                    imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] = 0
                imported_alternance_contracts_distribution[hiring_date.year][hiring_date.month][hiring_date.day] += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:  # NOTE(review): bare except — logs then re-raises.
            logger.error("error in executing statement into hirings table: %s", sys.exc_info()[1])
            raise

        logger.info(f"Types de contrats à importer : {self.contract_name}")
        logger.info(f"processed {count} lba_contracts...")
        logger.info(f"imported lba_contracts: {imported_alternance_contracts}")
        logger.info(f"not imported lba_contracts: {not_imported_alternance_contracts}")
        logger.info(f"zipcode errors: {self.invalid_zipcode_errors}")
        logger.info(f"invalid_row errors: {self.invalid_row_errors}")
        logger.info(f"invalid siret errors: {self.invalid_siret_errors}")
        # if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
        #     raise IOError('too many zipcode errors')
        # if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
        #     raise IOError('too many invalid_row errors')
        con.commit()
        cur.close()
        con.close()

    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type
        )
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB : OK")
    except OperationalError:
        # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
        # So we insert it directly via an SQL query
        # This job has been broken for more than a year, only way to fix it :
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime('%Y-%m-%d %H:%M:%S')
        # NOTE(review): values are internally generated timestamps, not user
        # input, but a parameterized query would still be safer here.
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB : OK")

    logger.info("finished importing dpae...")
    return something_new
def update_offices():
    """
    Update offices (overload the data provided by the importer).

    For each `OfficeAdminUpdate` row (oldest first, so the most recent
    changes win), apply the overrides to every listed SIRET both in the
    `Office` table and in the ElasticSearch index, then drop the office's
    cached PDF so it gets regenerated.
    """
    # Good engineering eliminates users being able to do the wrong thing as much as possible.
    # But since it is possible to store multiple SIRETs, there is no longer any constraint of uniqueness
    # on a SIRET. As a result, it shouldn't but there may be `n` entries in `OfficeAdminUpdate`
    # for the same SIRET. We order the query by creation date ASC so that the most recent changes take
    # priority over any older ones.
    for office_to_update in db_session.query(OfficeAdminUpdate).order_by(
            asc(OfficeAdminUpdate.date_created)).all():

        for siret in OfficeAdminUpdate.as_list(office_to_update.sirets):
            office = Office.query.filter_by(siret=siret).first()
            if office:
                # Apply changes in DB. Empty overrides fall back to the
                # current value; `remove_*` flags force an empty string.
                office.company_name = office_to_update.new_company_name or office.company_name
                office.office_name = office_to_update.new_office_name or office.office_name
                office.email = '' if office_to_update.remove_email else (
                    office_to_update.new_email or office.email)
                office.tel = '' if office_to_update.remove_phone else (
                    office_to_update.new_phone or office.tel)
                office.website = '' if office_to_update.remove_website else (
                    office_to_update.new_website or office.website)
                office.email_alternance = office_to_update.email_alternance
                office.phone_alternance = office_to_update.phone_alternance
                office.website_alternance = office_to_update.website_alternance
                # Note : we need to handle when score and score_alternance = 0,
                # hence the explicit `is not None` checks (0 is falsy but valid).
                office.score = office_to_update.score if office_to_update.score is not None else office.score
                office.score_alternance = office_to_update.score_alternance if office_to_update.score_alternance is not None else office.score_alternance
                office.social_network = office_to_update.social_network
                office.contact_mode = office_to_update.contact_mode
                office.save()

                # Apply changes in ElasticSearch.
                body = {
                    'doc': {
                        'email': office.email,
                        'phone': office.tel,
                        'website': office.website,
                        'flag_alternance': 1 if office.flag_alternance else 0
                    }
                }

                scores_by_rome, scores_alternance_by_rome, boosted_romes, boosted_alternance_romes = get_scores_by_rome_and_boosted_romes(
                    office, office_to_update)
                if scores_by_rome:
                    body['doc']['scores_by_rome'] = scores_by_rome
                    body['doc']['boosted_romes'] = boosted_romes
                if scores_alternance_by_rome:
                    body['doc']['scores_alternance_by_rome'] = scores_alternance_by_rome
                    body['doc']['boosted_alternance_romes'] = boosted_alternance_romes

                # The update API makes partial updates: existing `scalar` fields are overwritten,
                # but `objects` fields are merged together.
                # https://www.elastic.co/guide/en/elasticsearch/guide/1.x/partial-updates.html
                # However `scores_by_rome` and `boosted_romes` need to be overwritten because they
                # may change over time.
                # To do this, we perform 2 requests: the first one resets `scores_by_rome` and
                # `boosted_romes` and the second one populates them.
                # (A dead `delete_body = {'doc': {}}` assignment that was
                # immediately overwritten has been removed.)
                delete_body = {
                    'doc': {
                        'scores_by_rome': None,
                        'boosted_romes': None,
                        'scores_alternance_by_rome': None,
                        'boosted_alternance_romes': None
                    }
                }

                # Unfortunately these cannot easily be bulked :-(
                # The reason is there is no way to tell bulk to ignore missing documents (404)
                # for a partial update. Tried it and failed it on Oct 2017 @vermeer.
                es.Elasticsearch().update(index=settings.ES_INDEX,
                                          doc_type=es.OFFICE_TYPE,
                                          id=siret,
                                          body=delete_body,
                                          params={'ignore': 404})
                es.Elasticsearch().update(index=settings.ES_INDEX,
                                          doc_type=es.OFFICE_TYPE,
                                          id=siret,
                                          body=body,
                                          params={'ignore': 404})

                # Delete the current PDF thus it will be regenerated at the next download attempt.
                pdf_util.delete_file(office)
def run_task(self):
    """
    Import DPAE contracts from `self.input_filename` into the hirings table.

    Only new contracts (dated after the most recent hiring already in DB and
    no later than the file's own date) of kept types (CDI, long-enough CDD)
    are inserted, in batches. The imported count is cross-checked against the
    DB and `DpaeStatistics` is recorded at the end.

    Returns True when at least one batch of statements was executed.
    """
    date_insertion = datetime.now()
    logger.info("extracting %s ", self.input_filename)
    # this pattern matches the first date
    # e.g. 'lbb_xdpdpae_delta_201611102200.bz2'
    # will match 2018-09-12
    date_pattern = r'.*_(\d\d\d\d\d\d\d\d)\d\d\d\d'
    # We keep only the date in the file name, ex: 20190910 = 10th september 2019
    date_match = re.match(date_pattern, self.input_filename)
    if date_match:
        date_part = date_match.groups()[0]
        self.last_historical_data_date_in_file = datetime.strptime(
            date_part, "%Y%m%d")
        logger.debug("identified last_historical_data_date_in_file=%s",
                     self.last_historical_data_date_in_file)
    else:
        raise Exception(
            "couldn't find a date pattern in filename. filename should be \
like lbb_xdpdpae_delta_YYYYMMDDHHMM.csv")

    count = 0
    statements = []
    something_new = False
    # NOTE(review): table name interpolated via %; HIRING_TABLE is a trusted
    # setting, not user input.
    query = """
        INSERT into %s(
            siret,
            hiring_date,
            contract_type,
            departement,
            contract_duration,
            iiann,
            tranche_age,
            handicap_label,
            duree_pec,
            date_insertion
            )
        values(%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)
    """ % settings.HIRING_TABLE
    imported_dpae = 0
    imported_dpae_distribution = {}
    not_imported_dpae = 0
    last_historical_data_date_in_db = db_session.query(func.max(Hiring.hiring_date)) \
        .filter(Hiring.contract_type.in_((Hiring.CONTRACT_TYPE_CDI,
                                          Hiring.CONTRACT_TYPE_CDD,
                                          Hiring.CONTRACT_TYPE_CTT))).first()[0]
    if last_historical_data_date_in_db is None:
        # First run ever: fall back to the project-wide default start date.
        last_historical_data_date_in_db = DEFAULT_DATETIME_DPAE
    logger.info(
        "will now extract all dpae with hiring_date between %s and %s",
        last_historical_data_date_in_db, self.last_historical_data_date_in_file)

    with import_util.get_reader(self.input_filename) as myfile:
        con, cur = import_util.create_cursor()
        header_line = myfile.readline().strip(
        )  # FIXME detect column positions from header
        if b"siret" not in header_line:
            logger.debug(header_line)
            raise Exception("wrong header line")
        for line in myfile:
            line = line.decode()
            count += 1
            if not count % 100000:
                # Flush the pending batch every 100000 lines.
                logger.debug("reading line %i", count)
                try:
                    try:
                        cur.executemany(query, statements)
                    except OperationalError:
                        # retry once in case of deadlock error
                        time.sleep(10)
                        cur.executemany(query, statements)
                    statements = []
                    con.commit()
                    something_new = True
                except:  # NOTE(review): bare except — logs then re-raises, so nothing is swallowed.
                    logger.error(
                        "error in executing statement into dpae table: %s",
                        sys.exc_info()[1])
                    statements = []
                    raise
            try:
                siret, hiring_date, _, contract_type, departement, contract_duration, \
                    iiann, tranche_age, handicap_label, duree_pec = parse_dpae_line(line)
            except ValueError:
                self.zipcode_errors += 1
                continue
            except InvalidRowException:
                logger.info("invalid_row met at row: %i", count)
                self.invalid_row_errors += 1
                continue

            dpae_should_be_imported = (
                hiring_date > last_historical_data_date_in_db
                and hiring_date <= self.last_historical_data_date_in_file
                # For DPAE contracts we only keep all CDI, only long enough CDD (at least 31 days)
                # and we ignore CTT.
                and (contract_type == Hiring.CONTRACT_TYPE_CDI or
                     (contract_type == Hiring.CONTRACT_TYPE_CDD
                      and contract_duration is not None
                      and contract_duration > 31)))

            if dpae_should_be_imported:
                statement = (siret, hiring_date, contract_type, departement,
                             contract_duration, iiann, tranche_age,
                             handicap_label, duree_pec, date_insertion)
                statements.append(statement)
                imported_dpae += 1

                # Track a year/month/day histogram of the imported contracts.
                if hiring_date.year not in imported_dpae_distribution:
                    imported_dpae_distribution[hiring_date.year] = {}
                if hiring_date.month not in imported_dpae_distribution[
                        hiring_date.year]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month] = {}
                if hiring_date.day not in imported_dpae_distribution[
                        hiring_date.year][hiring_date.month]:
                    imported_dpae_distribution[hiring_date.year][
                        hiring_date.month][hiring_date.day] = 0
                imported_dpae_distribution[hiring_date.year][
                    hiring_date.month][hiring_date.day] += 1
            else:
                not_imported_dpae += 1

        # run remaining statements
        try:
            cur.executemany(query, statements)
            something_new = True
        except:  # NOTE(review): bare except — logs then re-raises.
            logger.error("error in executing statement into dpae table: %s",
                         sys.exc_info()[1])
            raise

        logger.info("processed %i dpae...", count)
        logger.info("imported dpae: %i", imported_dpae)
        logger.info("not imported dpae: %i", not_imported_dpae)
        logger.info("zipcode errors: %i", self.zipcode_errors)
        logger.info("invalid_row errors: %i", self.invalid_row_errors)
        if self.zipcode_errors > settings.MAXIMUM_ZIPCODE_ERRORS:
            raise IOError('too many zipcode errors')
        if self.invalid_row_errors > settings.MAXIMUM_INVALID_ROWS:
            raise IOError('too many invalid_row errors')

        # Cross-check: the rows now in DB for the covered date range must
        # match exactly the number of rows imported from the file.
        logger.info("verifying good number of dpae imported.")
        query = "select count(*) from hirings h where hiring_date > %s and hiring_date <= %s and h.contract_type in (1,2,3)"
        cur.execute(query, [
            last_historical_data_date_in_db,
            self.last_historical_data_date_in_file
        ])
        res = cur.fetchone()
        if res[0] != imported_dpae:
            raise DoublonException(
                f"Too many DPAE ({res[0]}) in DB compared to DPAE file ({imported_dpae})."
            )
        logger.info("verifying number of DPAE: OK.")
        con.commit()
        cur.close()
        con.close()

    try:
        statistics = DpaeStatistics(
            last_import=datetime.now(),
            most_recent_data_date=self.last_historical_data_date_in_file,
            file_type=self.file_type)
        db_session.add(statistics)
        db_session.commit()
        logger.info("First way to insert DPAE statistics in DB : OK")
    except OperationalError:
        # For an obscure reason, the DpaeStatistics way to insert does not work on the bonaparte server
        # So we insert it directly via an SQL query
        # This job has been broken for more than a year, only way to fix it :
        db_session.rollback()
        last_import_date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        most_recent_date = self.last_historical_data_date_in_file.strftime(
            '%Y-%m-%d %H:%M:%S')
        # NOTE(review): values are internally generated timestamps, not user
        # input, but a parameterized query would still be safer here.
        query = f"insert into dpae_statistics (last_import, most_recent_data_date, file_type) values ('{last_import_date}','{most_recent_date}','{self.file_type}')"
        con, cur = import_util.create_cursor()
        cur.execute(query)
        con.commit()
        cur.close()
        con.close()
        logger.info("Second way to insert DPAE statistics in DB : OK")

    logger.info("finished importing dpae...")
    return something_new
def test_admin_access(self):
    """
    Test admin access permissions.

    For every admin URL, check the four states: anonymous, logged-in
    non-admin, logged-in admin, and logged-in inactive admin. Only the
    active admin gets a 200; the others get a 404 (presumably 404 rather
    than 403 to hide the admin interface — confirm against the admin
    view config).
    """
    admin_urls = [
        self.url_for('admin.index'),
        self.url_for('users.index_view'),
        self.url_for('officeadminadd.index_view'),
        self.url_for('officeadminremove.index_view'),
        self.url_for('officeadminupdate.index_view'),
        self.url_for('officeadminextrageolocation.index_view'),
    ]
    with self.test_request_context():
        for url in admin_urls:
            # Access should be denied when a user is not logged in.
            db_session.query(User).update({
                User.active: True,
                User.is_admin: False
            })
            db_session.commit()
            # Re-fetch the user so the flags just written are visible.
            self.user = db_session.query(User).filter_by(
                id=self.user.id).first()
            self.assertTrue(self.user.active)
            self.assertFalse(self.user.is_admin)
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 404)
            self.login(self.user)
            # Access should be denied when a user is logged in but is not an admin.
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 404)
            # Access should be granted when a user is logged in and is admin.
            db_session.query(User).update({
                User.active: True,
                User.is_admin: True
            })
            db_session.commit()
            self.user = db_session.query(User).filter_by(
                id=self.user.id).first()
            self.assertTrue(self.user.active)
            self.assertTrue(self.user.is_admin)
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 200)
            # Access should be denied when a user is not active.
            db_session.query(User).update({
                User.active: False,
                User.is_admin: True
            })
            db_session.commit()
            self.user = db_session.query(User).filter_by(
                id=self.user.id).first()
            self.assertFalse(self.user.active)
            self.assertTrue(self.user.is_admin)
            rv = self.app.get(url)
            self.assertEqual(rv.status_code, 404)
            # Log out so the next URL starts from the anonymous state.
            self.logout()