Beispiel #1
0
 def get_score_for_rome_code(self, rome_code, hiring_type=None):
     """
     Return this office's score adjusted to ``rome_code`` and its NAF code.

     The raw score is ``self.score`` when the hiring type is
     ``hiring_type_util.DPAE`` and ``self.score_alternance`` otherwise.
     A falsy ``hiring_type`` falls back to ``hiring_type_util.DEFAULT``.

     Raises:
         ValueError: if the hiring type is not in ``hiring_type_util.VALUES``.
     """
     effective_type = hiring_type if hiring_type else hiring_type_util.DEFAULT
     if effective_type not in hiring_type_util.VALUES:
         raise ValueError("Unknown hiring_type")

     if effective_type == hiring_type_util.DPAE:
         base_score = self.score
     else:
         base_score = self.score_alternance

     return scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
         score=base_score,
         rome_code=rome_code,
         naf_code=self.naf,
     )
Beispiel #2
0
 def get_stars_for_rome_code(self, rome_code):
     """
     Convert the office score (0 to 100) into a number of stars (0.0 to 5.0).

     The general all-jobs-included score (``self.score``) is first adjusted
     to the given ``rome_code`` and to the office NAF code (``self.naf``),
     then scaled linearly so that 100 points are worth 5 stars.

     Raises:
         Exception: if the resulting number of stars is outside [0.0, 5.0].
     """
     score = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
         score=self.score, rome_code=rome_code, naf_code=self.naf)
     # 100 points <=> 5 stars, i.e. one star every 20 points.
     stars_num = score / (100 / 5)
     if stars_num < 0.0 or stars_num > 5.0:
         raise Exception(
             "unexpected stars_num value %s for siret %s and rome_code %s"
             % (stars_num, self.siret, rome_code))
     return stars_num
Beispiel #3
0
    def get_docs(self):
        """
        Return a deep copy of ``DOCS`` enriched with per-ROME scores.

        For every document, the ROME codes mapped to its NAF code in
        ``mapping_util.MANUAL_NAF_ROME_MAPPING`` are looked up and two optional
        keys are added:

        - ``scores_by_rome``: adjusted ``score`` for each ROME code;
        - ``scores_alternance_by_rome``: adjusted ``score_alternance`` for each
          ROME code, keeping only strictly positive values.

        A key is only set when its dict is not empty.
        """
        enriched_docs = copy.deepcopy(DOCS)
        adjust = scoring_util.get_score_adjusted_to_rome_code_and_naf_code

        for doc in enriched_docs:
            naf_code = doc['naf']
            romes = list(mapping_util.MANUAL_NAF_ROME_MAPPING[naf_code].keys())

            # FIXME this duplicates logic from create_index and should
            # eventually be shared with it.
            base_score = doc['score']
            hiring_scores = {
                rome: adjust(score=base_score, rome_code=rome, naf_code=naf_code)
                for rome in romes
            }
            if hiring_scores:
                doc['scores_by_rome'] = hiring_scores

            # FIXME same duplication with create_index as above.
            base_score_alternance = doc['score_alternance']
            alternance_scores = {}
            for rome in romes:
                adjusted = adjust(
                    score=base_score_alternance,
                    rome_code=rome,
                    naf_code=naf_code,
                )
                # Dirty fix until the logic is shared with create_index:
                # non-positive alternance scores are left out.
                if adjusted > 0:
                    alternance_scores[rome] = adjusted
            if alternance_scores:
                doc['scores_alternance_by_rome'] = alternance_scores

        return enriched_docs
Beispiel #4
0
def get_scores_by_rome_and_boosted_romes(office, office_to_update=None):
    """
    Compute the per-ROME scores and the boosted ROME codes of an office.

    Every NAF code of the office (its own one, plus those listed in
    ``office_to_update.nafs_to_add`` when ``office_to_update`` is given) is
    mapped to ROME codes, and a score adjusted to each (ROME, NAF) pair is
    computed, once for DPAE (``office.score``) and once for alternance
    (``office.score_alternance``). A ROME code is kept when its adjusted score
    reaches the minimum for that ROME or when the ROME is boosted. When a ROME
    is reachable through several NAF codes, the highest score wins.

    Returns:
        A 4-tuple ``(scores_by_rome, scores_alternance_by_rome, boosted_romes,
        boosted_alternance_romes)``. The first two map ROME codes to scores;
        the last two map boosted ROME codes to ``True``.
    """
    adjust_score = scoring_util.get_score_adjusted_to_rome_code_and_naf_code

    ## 0 - Collect every NAF code attached to the office

    naf_codes = [office.naf]
    if office_to_update:
        # NAF codes manually added to this company.
        naf_codes += office_to_update.as_list(office_to_update.nafs_to_add)

    dpae_scores = {}
    alternance_scores = {}
    # elasticsearch does not understand sets, hence dicts of 'key => True'.
    dpae_boosted = {}
    alternance_boosted = {}

    is_pse_office = False
    if PSE_STUDY_IS_ENABLED:
        pse_sirets = load_siret_to_remove()
        is_pse_office = office.siret in pse_sirets

    for naf_code in naf_codes:
        try:
            romes_for_naf = mapping_util.get_romes_for_naf(naf_code)
        except KeyError:
            # Some NAF codes have no matching ROME at all: nothing to score.
            continue

        ## 1 - DPAE

        if office_to_update:
            dpae_to_boost = office_to_update.as_list(
                office_to_update.romes_to_boost)
            dpae_to_remove = office_to_update.as_list(
                office_to_update.romes_to_remove)
        else:
            dpae_to_boost = []
            dpae_to_remove = []

        # ROME codes of the NAF, plus the boosted ones (possibly unrelated to
        # the NAF), minus the unwanted ones.
        dpae_candidates = set(romes_for_naf).union(set(dpae_to_boost))
        dpae_romes = dpae_candidates - set(dpae_to_remove)

        for rome in dpae_romes:
            if office_to_update and office_to_update.boost:
                # An empty romes_to_boost means "boost every ROME code";
                # otherwise only the listed ones are boosted.
                if not office_to_update.romes_to_boost or rome in dpae_to_boost:
                    dpae_boosted[rome] = True

            dpae_score = adjust_score(
                score=office.score, rome_code=rome, naf_code=naf_code)

            # Minimum score for this ROME code (metiers en tension).
            dpae_minimum = scoring_util.get_score_minimum_for_rome(rome)

            if not (dpae_score >= dpae_minimum or rome in dpae_boosted):
                continue

            if is_pse_office:
                # 0 as a special score for PSE companies ensures they always
                # show up *last* in the results, even after results
                # randomization.
                dpae_score = 0

            if rome not in dpae_scores:
                dpae_scores[rome] = dpae_score
                st.increment_office_score_for_rome_count()
            elif dpae_score > dpae_scores[rome]:
                # Already computed for another NAF: keep the highest score.
                dpae_scores[rome] = dpae_score

        ## 2 - Alternance

        if office_to_update:
            alternance_to_boost = office_to_update.as_list(
                office_to_update.romes_alternance_to_boost)
            alternance_to_remove = office_to_update.as_list(
                office_to_update.romes_alternance_to_remove)
        else:
            alternance_to_boost = []
            alternance_to_remove = []

        alternance_candidates = set(romes_for_naf).union(
            set(alternance_to_boost))
        alternance_romes = alternance_candidates - set(alternance_to_remove)

        for rome in alternance_romes:
            if office_to_update and office_to_update.boost_alternance:
                # Same rule as for DPAE: empty list boosts every ROME code.
                if (not office_to_update.romes_alternance_to_boost
                        or rome in alternance_to_boost):
                    alternance_boosted[rome] = True

            alternance_score = adjust_score(
                score=office.score_alternance,
                rome_code=rome,
                naf_code=naf_code,
            )

            # Minimum score for this ROME code (metiers en tension).
            alternance_minimum = scoring_util.get_score_minimum_for_rome(
                rome, alternance=True)

            if not (alternance_score >= alternance_minimum
                    or rome in alternance_boosted):
                continue

            if rome not in alternance_scores:
                alternance_scores[rome] = alternance_score
                st.increment_office_score_alternance_for_rome_count()
            elif alternance_score > alternance_scores[rome]:
                # Already computed for another NAF: keep the highest score.
                alternance_scores[rome] = alternance_score

    return dpae_scores, alternance_scores, dpae_boosted, alternance_boosted
Beispiel #5
0
    def setUp(self, *args, **kwargs):
        """
        Index a fixed set of test offices in Elasticsearch and mirror them
        as ``Office`` rows in MariaDB/MySQL.

        Each document gets its per-ROME scores (``scores_by_rome`` and
        ``scores_alternance_by_rome``) before being indexed under its siret.
        """
        super(ApiBaseTest, self).setUp(*args, **kwargs)

        def office_doc(naf, siret, score, score_alternance, headcount, city,
                       name, flag_alternance=0, flag_pmsmp=0,
                       company_name=None):
            """Build one test document located in ``self.positions[city]``."""
            place = self.positions[city]
            doc = {'naf': naf, 'siret': siret}
            if company_name is not None:
                doc['company_name'] = company_name
            doc['score'] = score
            doc['score_alternance'] = score_alternance
            doc['headcount'] = headcount
            doc['locations'] = place['coords']
            doc['name'] = name
            doc['flag_alternance'] = flag_alternance
            doc['flag_pmsmp'] = flag_pmsmp
            doc['department'] = place['zip_code'][0:2]
            return doc

        # Test data to insert into Elasticsearch. Positional columns are:
        # naf, siret, score, score_alternance, headcount, city, name.
        docs = [
            # NAF 7320Z maps to ROME D1405.
            office_doc('7320Z', '00000000000001', 68, 18, 11,
                       'bayonville_sur_mad', 'Office 1',
                       company_name='Raison sociale 1'),
            office_doc('7320Z', '00000000000002', 69, 18, 31,
                       'bayonville_sur_mad', 'Office 2',
                       company_name='Raison sociale 2'),
            office_doc('7320Z', '00000000000003', 70, 18, 31,
                       'bayonville_sur_mad', 'Office 3'),
            office_doc('7320Z', '00000000000004', 71, 18, 31,
                       'caen', 'Office 4'),
            # NAF 9511Z maps to ROME M1801.
            office_doc('9511Z', '00000000000005', 71, 18, 31,
                       'caen', 'Office 5'),
            # For NAF filter (both NAF codes map to ROME D1508).
            office_doc('4711C', '00000000000006', 75, 18, 31,
                       'metz', 'Office 6'),
            office_doc('5610C', '00000000000007', 70, 18, 50,
                       'metz', 'Office 7'),
            # For result sort (both NAF codes map to ROME D1211).
            office_doc('9103Z', '00000000000008', 75, 51, 50,
                       'nantes', 'Office 8'),
            office_doc('7010Z', '00000000000009', 99, 51, 50,
                       'reze', 'Office 9'),  # City close to Nantes
            # For contract filter (NAF 4669A maps to ROME D1213).
            office_doc('4669A', '00000000000010', 78, 0, 34,
                       'lille', 'Office 10'),
            office_doc('4669A', '00000000000011', 82, 80, 65,
                       'lille', 'Office 11', flag_alternance=1),
            # For headcount filter (both NAF codes map to ROME M1202).
            office_doc('7022Z', '00000000000012', 82, 18, 11,
                       'toulouse', 'Office 12'),
            office_doc('7010Z', '00000000000013', 82, 18, 22,
                       'toulouse', 'Office 13'),
            # For headcount_text (NAF 4648Z maps to ROME B1603);
            # headcount 53 has headcount_text '10 000 salariés et plus'.
            office_doc('4648Z', '00000000000014', 80, 18, 53,
                       'pau', 'Office 14'),
            # For flag_alternance in response (NAF 4648Z maps to ROME B1603).
            office_doc('4648Z', '00000000000015', 80, 18, 53,
                       'poitiers', 'Office 15'),
            office_doc('4648Z', '00000000000016', 70, 80, 53,
                       'poitiers', 'Office 16', flag_alternance=1),
            # For filter_by_department and filter_by_flag_pmsmp
            # (NAF 5229A maps to ROME N1202).
            office_doc('5229A', '00000000000017', 90, 18, 53,
                       'paris', 'Office 17', flag_pmsmp=1),
            office_doc('5229A', '00000000000018', 78, 18, 53,
                       'neuilly-sur-seine', 'Office 18'),
            # For filters in response tests
            # (NAF 4910Z and 4920Z map to ROME N4403).
            office_doc('4910Z', '00000000000019', 76, 18, 1,
                       'toulon', 'Office 19'),
            office_doc('4910Z', '00000000000020', 90, 18, 3,
                       'toulon', 'Office 20'),
            office_doc('4920Z', '00000000000021', 43, 18, 53,
                       'toulon', 'Office 21', flag_alternance=1),
            # For distance filter => between 10-30km (15km of Toulon).
            office_doc('4910Z', '00000000000023', 89, 18, 31,
                       'hyeres', 'Office 23'),
            # For distance filter => between 30-50km (35km of Toulon).
            office_doc('4910Z', '00000000000024', 30, 18, 12,
                       'aubagne', 'Office 24'),
            # For distance between 50-100km (60km of Toulon).
            office_doc('4910Z', '00000000000025', 82, 18, 11,
                       'draguignan', 'Office 25'),
            # For distance filter => between 100-3000km (500km of Toulon).
            office_doc('4910Z', '00000000000026', 67, 18, 51,
                       'limoges', 'Office 26'),
        ]

        adjust = scoring_util.get_score_adjusted_to_rome_code_and_naf_code
        for doc in docs:
            # Build scores for relevant ROME codes.
            naf_code = doc['naf']
            romes = list(mapping_util.MANUAL_NAF_ROME_MAPPING[naf_code].keys())

            # FIXME this duplicates logic from create_index and should
            # eventually be shared with it.
            base_score = doc['score']
            hiring_scores = {
                rome: adjust(score=base_score, rome_code=rome, naf_code=naf_code)
                for rome in romes
            }
            if hiring_scores:
                doc['scores_by_rome'] = hiring_scores

            # FIXME same duplication with create_index as above.
            base_score_alternance = doc['score_alternance']
            alternance_scores = {}
            for rome in romes:
                adjusted = adjust(
                    score=base_score_alternance,
                    rome_code=rome,
                    naf_code=naf_code,
                )
                # Dirty fix until the logic is shared with create_index:
                # non-positive alternance scores are left out.
                if adjusted > 0:
                    alternance_scores[rome] = adjusted
            if alternance_scores:
                doc['scores_alternance_by_rome'] = alternance_scores

            # Just like in other environments, the id is the siret.
            self.es.index(
                index=settings.ES_INDEX,
                doc_type=es.OFFICE_TYPE,
                id=doc['siret'],
                body=doc,
            )

        # Needed for ES to register our new documents, flaky test otherwise.
        self.es.indices.flush(index=settings.ES_INDEX)

        # Create related Office instances into MariaDB/MySQL.
        for doc in docs:
            # Find the known position sharing the document's coordinates, to
            # set the right `commune_id` and `zipcode`.
            matching_place = next(
                (self.positions[key] for key in self.positions
                 if doc['locations'] == self.positions[key]['coords']),
                None,
            )
            if matching_place is None:
                commune_id = None
                zip_code = None
            else:
                commune_id = matching_place['commune_id']
                zip_code = matching_place['zip_code']

            if not commune_id:
                raise ValueError(
                    "Cannot create an entry in Office with a city absent from self.positions."
                )

            office = Office(
                office_name=doc['name'],
                siret=doc['siret'],
                score=doc['score'],
                score_alternance=doc['score_alternance'],
                naf=doc['naf'],
                city_code=commune_id,
                zipcode=zip_code,
                email='*****@*****.**',
                departement=zip_code[:2],
                company_name=doc.get('company_name', ''),
                flag_alternance=doc['flag_alternance'],
                flag_pmsmp=doc['flag_pmsmp'],
                headcount=doc['headcount'],
                x=doc['locations'][0]['lon'],
                y=doc['locations'][0]['lat'],
            )
            office.save()

        # There should be as many entries in MariaDB/MySQL as in
        # Elasticsearch, except for one more in ES (the fake document).
        match_all_query = {'query': {'match_all': {}}}
        es_count = self.es.count(
            index=settings.ES_INDEX,
            doc_type=es.OFFICE_TYPE,
            body=match_all_query,
        )
        self.assertEqual(Office.query.count() + 1, es_count['count'])
    def setUp(self, *args, **kwargs):
        """
        Recreate the Elasticsearch test index, index five test offices and
        mirror them as ``Office`` rows in MariaDB/MySQL.

        The index mapping declares one ``score_for_rome_<ROME>`` integer field
        per ROME code found in ``settings.ROME_DESCRIPTIONS``; each document is
        given the matching adjusted scores before being indexed.
        """
        super(ApiBaseTest, self).setUp(*args, **kwargs)

        # Delete index (a missing index is not an error).
        self.es.indices.delete(index=self.ES_TEST_INDEX, ignore=[404])

        # Create new index.
        request_body = {
            "mappings": {
                "office": {
                    "properties": {
                        "naf": {
                            "type": "string",
                            "index": "not_analyzed"
                        },
                        "siret": {
                            "type": "string",
                            "index": "not_analyzed"
                        },
                        "name": {
                            "type": "string",
                            "index": "not_analyzed"
                        },
                        "score": {
                            "type": "integer",
                            "index": "not_analyzed"
                        },
                        "headcount": {
                            "type": "integer",
                            "index": "not_analyzed"
                        },
                        "location": {
                            "type": "geo_point",
                        }
                    }
                }
            }
        }

        # One integer score field per known ROME code.
        properties = request_body["mappings"]["office"]["properties"]
        for rome_code in settings.ROME_DESCRIPTIONS:
            properties["score_for_rome_%s" % rome_code] = {
                "type": "integer",
                "index": "not_analyzed"
            }
        self.es.indices.create(index=self.ES_TEST_INDEX, body=request_body)

        # Insert test data into Elasticsearch.
        docs = [
            {
                'naf': u'7320Z',  # Map to ROME D1405.
                'siret': u'00000000000001',
                'score': 68,
                'headcount': 11,
                'location': self.positions['bayonville_sur_mad']['location'],
                'name': u'Office 1',
            },
            {
                'naf': u'7320Z',  # Map to ROME D1405.
                'siret': u'00000000000002',
                'score': 69,
                'headcount': 31,
                'location': self.positions['bayonville_sur_mad']['location'],
                'name': u'Office 2',
            },
            {
                'naf': u'7320Z',  # Map to ROME D1405.
                'siret': u'00000000000003',
                'score': 70,
                'headcount': 31,
                'location': self.positions['bayonville_sur_mad']['location'],
                'name': u'Office 3',
            },
            {
                'naf': u'7320Z',  # Map to ROME D1405.
                'siret': u'00000000000004',
                'score': 71,
                'headcount': 31,
                'location': self.positions['caen']['location'],
                'name': u'Office 4',
            },
            {
                'naf': u'9511Z',  # Map to ROME M1801.
                'siret': u'00000000000005',
                'score': 71,
                'headcount': 31,
                'location': self.positions['caen']['location'],
                'name': u'Office 5',
            },
        ]
        for i, doc in enumerate(docs, start=1):
            # Build scores for relevant ROME codes.
            naf = doc['naf']
            score = doc['score']

            for rome_code in mapping_util.MANUAL_NAF_ROME_MAPPING[naf]:
                doc['score_for_rome_%s' % rome_code] = (
                    scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=score, rome_code=rome_code, naf_code=naf))

            # Documents are indexed under their 1-based position in `docs`.
            self.es.index(index=self.ES_TEST_INDEX,
                          doc_type=self.ES_OFFICE_TYPE,
                          id=i,
                          body=doc)

        # Make the new documents visible to searches right away. An explicit
        # refresh replaces the former fixed one-second sleep, which was slow
        # and could still leave the tests flaky.
        self.es.indices.refresh(index=self.ES_TEST_INDEX)

        # Create related Office instances into MariaDB/MySQL.
        for doc in docs:

            # Set the right `commune_id` and `zipcode` depending on the location.
            commune_id = None
            zip_code = None
            for position in self.positions:
                if doc['location'] == self.positions[position]['location']:
                    commune_id = self.positions[position]['commune_id']
                    zip_code = self.positions[position]['zip_code']
                    break

            if not commune_id:
                raise ValueError(
                    "Cannot create an entry in Office with a city absent from self.positions."
                )

            office = Office(
                company_name=doc['name'],
                siret=doc['siret'],
                score=doc['score'],
                naf=doc['naf'],
                city_code=commune_id,
                zipcode=zip_code,
                email=u'*****@*****.**',
                departement=zip_code[:2],
                x=doc['location']['lon'],
                y=doc['location']['lat'],
            )
            office.save()

        # There should be as many entries in MariaDB/MySQL as test documents.
        # `assertEqual` is used because the `assertEquals` alias is deprecated
        # and removed in Python 3.12.
        self.assertEqual(Office.query.count(), len(docs))
def compute_effective_and_predicted_hirings():
    """
    Fill in effective/predicted hirings for every importer cycle not computed yet.

    For each `PerfImporterCycleInfos` row whose `computed` flag is false (in
    the "development" and "test" environments: all of them; otherwise only
    those whose `prediction_end_date` is in the past), this function:

    1. loads the related `PerfPredictionAndEffectiveHirings` rows into a
       DataFrame,
    2. counts, per siret, the rows of the `hirings` table falling within the
       prediction period, once for CDD/CDI contracts (lbb) and once for
       APR/CP contracts (lba),
    3. converts both stored scores into predicted hirings,
    4. flags each row as `is_a_bonne_boite` / `is_a_bonne_alternance` when its
       score adjusted to at least one ROME code of its NAF reaches the
       corresponding threshold, and counts such rows per (naf, rome),
    5. bulk-updates the rows, stores one `PerfDivisionPerRome` per (naf, rome)
       and marks the importer cycle as computed.

    Reads the `LBB_ENV` environment variable (raises `KeyError` if it is not
    set while there are pending cycles). Returns nothing.
    """
    # Number of pending row updates accumulated before they are flushed to
    # the database and committed.
    commit_batch_size = 100000

    logger.info("\n Start : Computing effective hirings")

    importer_cycles_infos = PerfImporterCycleInfos.query.filter(
        PerfImporterCycleInfos.computed == False).all()  # noqa: E712 (SQL expression)

    # Development and test environments compute every pending cycle; other
    # environments wait until the prediction period is over.
    importer_cycles_infos_to_compute = [
        ici for ici in importer_cycles_infos
        if os.environ["LBB_ENV"] in ["development", "test"]
        or ici.prediction_end_date < datetime.now()
    ]

    logger.info(
        f"Importer cycles infos which have not been computed yet : {[i.file_name for i in importer_cycles_infos_to_compute]}"
    )

    for ici in importer_cycles_infos_to_compute:
        # Loaded again for every cycle; presumably so that the counters
        # incremented below start from their initial values - TODO confirm.
        perf_division_per_rome_dict = load_perf_division_per_rome_dict()

        naf_not_founds = set()
        nb_companies_with_naf_not_found = 0

        logger.info(
            f"Start computing for importer cycle infos : {ici._id} - {ici.file_name}"
        )

        ppaeh = PerfPredictionAndEffectiveHirings.query.filter(
            PerfPredictionAndEffectiveHirings.importer_cycle_infos_id ==
            ici._id)
        columns_companies = [
            "_id", "siret", "naf", "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score"
        ]
        # Column-oriented copy of the rows (to build the DataFrame) plus an
        # index of the ORM objects by primary key (to flag them later).
        dict_df_companies = {col: [] for col in columns_companies}
        dict_ppaeh = {}
        for perf in ppaeh:
            dict_ppaeh[perf._id] = perf
            for col in columns_companies:
                dict_df_companies[col].append(getattr(perf, col))
        del ppaeh
        df_companies_list = pd.DataFrame(data=dict_df_companies)

        logger.info(f"Nb offices to compute : {len(df_companies_list)}")

        # The interpolated values are dates read from the importer cycle row
        # and contract type constants of `Hiring`.
        query_hirings_lbb = f"SELECT siret, count(*) as lbb_nb_effective_hirings \
                FROM hirings\
                WHERE hiring_date >= '{ici.prediction_start_date}'\
                and hiring_date <= '{ici.prediction_end_date}'\
                and (contract_type={Hiring.CONTRACT_TYPE_CDD} or contract_type={Hiring.CONTRACT_TYPE_CDI})\
                GROUP BY siret;"

        query_hirings_lba = f"SELECT siret, count(*) as lba_nb_effective_hirings \
                FROM hirings\
                WHERE hiring_date >= '{ici.prediction_start_date}'\
                and hiring_date <= '{ici.prediction_end_date}'\
                and (contract_type={Hiring.CONTRACT_TYPE_APR} or contract_type={Hiring.CONTRACT_TYPE_CP})\
                GROUP BY siret;"

        engine = import_util.create_sqlalchemy_engine()
        try:
            df_hirings_lbb = pd.read_sql_query(query_hirings_lbb, engine)
            logger.info(
                f"Nb offices found in hirings for lbb : {len(df_hirings_lbb)}")

            df_hirings_lba = pd.read_sql_query(query_hirings_lba, engine)
            logger.info(
                f"Nb offices found in hirings for lba: {len(df_hirings_lba)}")
        finally:
            # Release the database resource even when a query fails.
            # NOTE(review): `close()` is a SQLAlchemy connection method, plain
            # engines expose `dispose()` - confirm what
            # `create_sqlalchemy_engine` returns.
            engine.close()

        # Left joins: every office is kept, including those without any
        # hiring over the period (their counts are NaN until `fillna`).
        df_merge_hirings_tmp = pd.merge(df_companies_list,
                                        df_hirings_lbb,
                                        how='left',
                                        on="siret")
        df_merged = pd.merge(df_merge_hirings_tmp,
                             df_hirings_lba,
                             how='left',
                             on="siret")

        # Compute the predicted hirings from the score
        df_merged["lbb_nb_predicted_hirings"] = df_merged[
            "lbb_nb_predicted_hirings_score"].apply(
                scoring_util.get_hirings_from_score)
        df_merged["lba_nb_predicted_hirings"] = df_merged[
            "lba_nb_predicted_hirings_score"].apply(
                scoring_util.get_hirings_from_score)

        df_merged = df_merged.fillna(0)

        # The order of these columns defines the layout of each row unpacked
        # in the loop below; keep both in sync.
        cols_we_want_to_keep = [
            "_id",
            "siret",
            "naf",
            "lbb_nb_effective_hirings",
            "lba_nb_effective_hirings",
            "lbb_nb_predicted_hirings",
            "lba_nb_predicted_hirings",
            "lbb_nb_predicted_hirings_score",
            "lba_nb_predicted_hirings_score",
        ]

        df_merged = df_merged[cols_we_want_to_keep]

        values_to_update = df_merged.values.tolist()
        count = 0

        updated_ppaeh = []
        for row in values_to_update:
            (row_id, _siret, naf,
             lbb_nb_effective_hirings, lba_nb_effective_hirings,
             lbb_nb_predicted_hirings, lba_nb_predicted_hirings,
             lbb_nb_predicted_hirings_score,
             lba_nb_predicted_hirings_score) = row

            pred_effective_hirings = dict_ppaeh[row_id]
            updated_values = {
                "_id": row_id,
                "lbb_nb_effective_hirings": lbb_nb_effective_hirings,
                "lba_nb_effective_hirings": lba_nb_effective_hirings,
                "lbb_nb_predicted_hirings": lbb_nb_predicted_hirings,
                "lba_nb_predicted_hirings": lba_nb_predicted_hirings,
            }
            is_a_bonne_boite = False
            is_a_bonne_alternance = False

            if naf in perf_division_per_rome_dict:
                # `values` is the counter dict stored in
                # `perf_division_per_rome_dict[naf][rome_code]`, so it is
                # updated in place.
                for rome_code, values in perf_division_per_rome_dict[
                        naf].items():
                    score_lbb = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lbb_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lbb >= values["threshold_lbb"]:
                        values["nb_bonne_boites_lbb"] += 1
                        is_a_bonne_boite = True

                    score_lba = scoring_util.get_score_adjusted_to_rome_code_and_naf_code(
                        score=lba_nb_predicted_hirings_score,
                        rome_code=rome_code,
                        naf_code=naf)
                    if score_lba >= values["threshold_lba"]:
                        values["nb_bonne_boites_lba"] += 1
                        is_a_bonne_alternance = True
            else:
                naf_not_founds.add(naf)
                nb_companies_with_naf_not_found += 1

            pred_effective_hirings.is_a_bonne_boite = is_a_bonne_boite
            pred_effective_hirings.is_a_bonne_alternance = is_a_bonne_alternance
            updated_values["is_a_bonne_boite"] = is_a_bonne_boite
            updated_values["is_a_bonne_alternance"] = is_a_bonne_alternance

            updated_ppaeh.append(updated_values)
            count += 1
            # Flush and commit every `commit_batch_size` rows so the list of
            # pending updates stays bounded.
            if len(updated_ppaeh) >= commit_batch_size:
                logger.info(f"{count} companies have been treated")
                db_session.bulk_update_mappings(
                    PerfPredictionAndEffectiveHirings, updated_ppaeh)
                db_session.commit()
                updated_ppaeh = []

        # Commit for the remaining rows
        db_session.bulk_update_mappings(PerfPredictionAndEffectiveHirings,
                                        updated_ppaeh)
        db_session.commit()
        updated_ppaeh = []

        logger.info(
            f"Number of naf not found in the mapping rome naf for this importer cycle : {len(naf_not_founds)}"
        )
        logger.info(
            f"List of naf not found in the mapping rome naf for this importer cycle : {naf_not_founds}"
        )
        logger.info(
            f"Number of companies with naf not found in the mapping rome naf for this importer cycle : {nb_companies_with_naf_not_found}"
        )
        logger.info(f"Number of total companies : {count}")

        # Store the thresholds and the counters of every (naf, rome) pair
        # for this importer cycle.
        for naf_code, romes_list in perf_division_per_rome_dict.items():
            for rome_code, values in romes_list.items():
                division_per_rome = PerfDivisionPerRome(
                    importer_cycle_infos_id=ici._id,
                    naf=naf_code,
                    rome=rome_code,
                    threshold_lbb=values["threshold_lbb"],
                    threshold_lba=values["threshold_lba"],
                    nb_bonne_boites_lbb=values["nb_bonne_boites_lbb"],
                    nb_bonne_boites_lba=values["nb_bonne_boites_lba"],
                )
                db_session.add(division_per_rome)

        db_session.commit()

        # Flag the cycle last, once all of its results have been committed.
        ici.computed = True
        db_session.add(ici)
        db_session.commit()
Beispiel #8
0
def get_true_score(row):
    """
    Return the score of `row` adjusted to its ROME code and NAF code.

    `row` is any mapping-like object providing the 'total_score', 'rome' and
    'codenaf' keys.
    """
    adjust_score = scoring_util.get_score_adjusted_to_rome_code_and_naf_code
    total_score = row['total_score']
    rome = row['rome']
    naf = row['codenaf']
    return adjust_score(score=total_score, rome_code=rome, naf_code=naf)