Example #1
    def test_importer_main(self, mongo_mock):
        """Test of basic usage of the importer_main function."""
        mongo_mock.return_value = mock.MagicMock()
        mongo.importer_main(_my_importer_func,
                            'my-collection',
                            ['foo', '--arg1', 'Value of arg1'],
                            flag_values=gflags.FlagValues())

        import_in_collection = mongo_mock.return_value.import_in_collection
        self.assertTrue(import_in_collection.called)
        call_args = import_in_collection.call_args[0]
        self.assertEqual([{'arg1': 'Value of arg1', 'dummy': 2}], call_args[0])
        self.assertEqual('my-collection', call_args[1])
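
The tests above reference _my_importer_func, which is not shown on this page; a minimal sketch consistent with the assertions, assuming command-line flags are forwarded to the importer function as keyword arguments:

    def _my_importer_func(arg1):
        """A dummy importer function used by the tests."""
        return [{'arg1': arg1, 'dummy': 2}]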
Example #2
    def test_importer_main_no_args_but_default(self, mongo_mock):
        """Test the importer_main without args but with default value."""
        def import_func(arg1='default value'):
            """Foo."""
            return [{'dummy': 2, 'arg1': arg1}]

        mongo_mock.return_value = mock.MagicMock()
        mongo.importer_main(import_func,
                            'my-collection', ['foo'],
                            flag_values=gflags.FlagValues())
        import_in_collection = mongo_mock.return_value.import_in_collection
        self.assertTrue(import_in_collection.called)
        call_args = import_in_collection.call_args[0]
        self.assertEqual([{'arg1': 'default value', 'dummy': 2}], call_args[0])
Example #3
 def test_importer_main_with_output_file(self, mongo_mock):
     """Test that data gets written to file instead of DB when file given."""
     out_path = tempfile.mktemp()
     mongo.importer_main(
         _my_importer_func, 'my-collection',
         ['', '--to_json', out_path, '--arg1', 'arg1 test value'],
         flag_values=gflags.FlagValues())
     import_in_collection = mongo_mock.return_value.import_in_collection
     self.assertFalse(import_in_collection.called)
     with open(out_path) as json_file:
         json_content = json_file.read()
         self.assertEqual(
             [{'arg1': 'arg1 test value', 'dummy': 2}],
             json.loads(json_content))
         self.assertTrue(json_content.endswith('\n'))
Example #4
    def test_importer_main_with_input_file(self, pymongo_mock):
        """Test that the import_func doesn't get called with an input file."""
        mock_importer_func = mock.MagicMock(spec=_my_importer_func)

        def importer_func():
            """Foo."""
            mock_importer_func()

        client = mongomock.MongoClient('mongodb://mongo-url/test')
        pymongo_mock.MongoClient.return_value = client
        testdata_dir = path.join(path.dirname(__file__), 'testdata')
        json_path = path.join(testdata_dir, 'import_dummy_data.json')
        mongo.importer_main(importer_func,
                            'my_collection', ['', '--from_json', json_path],
                            flag_values=gflags.FlagValues())
        self.assertFalse(mock_importer_func.called)
        self.assertEqual(1, len(list(client.test.my_collection.find())))
Example #5
    def test_importer_filter_ids(self, mongo_mock):
        """Test of the filter_ids flag."""

        def richer_importer_func():
            """An importer with many outputs."""
            return list({'_id': 'foo-%02d' % i, 'value': i} for i in range(20))

        mongo_mock.return_value = mock.MagicMock()
        mongo.importer_main(
            richer_importer_func, 'my-collection',
            ['foo', '--filter_ids', 'foo-.2'],
            flag_values=gflags.FlagValues())

        import_in_collection = mongo_mock.return_value.import_in_collection
        self.assertTrue(import_in_collection.called)
        call_args = import_in_collection.call_args[0]
        self.assertEqual(
            [{'_id': 'foo-02', 'value': 2}, {'_id': 'foo-12', 'value': 12}],
            call_args[0])
        self.assertEqual('my-collection', call_args[1])
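
The --filter_ids flag appears to be interpreted as a regular expression matched against each document's _id; a minimal sketch of that behavior, assuming re.match semantics:

    import re

    documents = [{'_id': 'foo-%02d' % i, 'value': i} for i in range(20)]
    id_filter = re.compile('foo-.2')
    kept = [doc for doc in documents if id_filter.match(doc['_id'])]
    # kept == [{'_id': 'foo-02', 'value': 2}, {'_id': 'foo-12', 'value': 12}]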
Example #6
    # Keep only job groups with more than 10 offers per département per month.
    top_departements_per_month = \
        top_departements_per_month[top_departements_per_month.offers > 10]

    # Nest the job group records inside each group.
    def _create_jobgroups(jobs):
        return jobs[['name', 'romeId', 'offers']].to_dict(orient='records')

    romes_per_dep_month = top_departements_per_month.groupby(
        ['departementId', 'creationMonth', 'departementSeasonalOffers'])\
        .apply(_create_jobgroups)\
        .to_frame('jobGroups')\
        .reset_index()\
        .rename(columns={'creationMonth': '_id'})

    def _create_month_stats(jobs):
        return jobs[['departementId', 'jobGroups', 'departementSeasonalOffers']]\
            .to_dict(orient='records')

    monthly_data = romes_per_dep_month\
        .groupby('_id')\
        .apply(_create_month_stats)\
        .to_frame('departementStats')\
        .reset_index()

    return monthly_data.to_dict(orient='records')
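
The function above applies the same groupby/apply/to_frame/reset_index pattern twice to build nested records; a minimal sketch of one level of the pattern on toy data:

    import pandas

    frame = pandas.DataFrame({
        'month': ['01', '01', '02'],
        'romeId': ['A1101', 'B1201', 'A1101'],
        'offers': [12, 15, 11],
    })

    def _records(rows):
        return rows[['romeId', 'offers']].to_dict(orient='records')

    nested = (frame.groupby('month').apply(_records)
              .to_frame('jobGroups').reset_index())
    # Each row of `nested` holds a month and the list of its job group dicts.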


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'seasonal_jobbing')  # pragma: no cover
Example #7
    missions['isAvailableEverywhere'] = (
        missions.JobId.map(all_post_codes) == _EVERYWHERE_POSTCODES)
    if missions.isAvailableEverywhere.any():
        everywhere_missions = missions[missions.isAvailableEverywhere].drop_duplicates('JobId')
        country_wide_missions = [
            {'_id': '', 'missions': _get_random_missions_picker(5)(everywhere_missions)},
        ]
    else:
        country_wide_missions = []

    # TODO(pascal): Add some missions per city as well.

    departement_missions = missions[~missions.isAvailableEverywhere]\
        .groupby('departement').apply(_get_random_missions_picker(5))
    return country_wide_missions + [
        {'_id': departement_id, 'missions': missions}
        for departement_id, missions in departement_missions.iteritems()]


def _get_random_missions_picker(num_missions):
    def _pick_random_missions(missions):
        if len(missions) > num_missions:
            samples = missions.sample(num_missions)
        else:
            samples = missions
        return samples[['associationName', 'title', 'link', 'description']].to_dict('records')
    return _pick_random_missions


if __name__ == '__main__':
    mongo.importer_main(get_missions_dicts, 'volunteering_missions')  # pragma: no cover
Example #8
def download_and_count():
    """Import the # of job offers available per job group and dept in MongoDB.

    Returns:
        Recent job offers count as a LocalJobStats JSON-proto compatible dict.
    """
    counts = collections.defaultdict(int)
    for job_offer in _iterate_job_offers():
        local_id = '%s:%s' % (job_offer['DEPARTEMENT_CODE'],
                              job_offer['ROME_PROFESSION_CARD_CODE'])
        counts[local_id] += 1
    return [{
        '_id': local_id,
        'numAvailableJobOffers': count
    } for local_id, count in counts.items()]


def _iterate_job_offers():
    client = emploi_store.Client(
        client_id=os.getenv('EMPLOI_STORE_CLIENT_ID'),
        client_secret=os.getenv('EMPLOI_STORE_CLIENT_SECRET'))
    package = client.get_package('offres')
    resource = package.get_resource(name="Offres d'emploi")
    return resource.records(
        fields=['DEPARTEMENT_CODE', 'ROME_PROFESSION_CARD_CODE'])


if __name__ == '__main__':
    mongo.importer_main(  # pragma: no-cover
        download_and_count, 'recent_job_offers')
Example #9
def validate(values, proto_class):
    """Validate that the values have the right format.

    Args:
        values: an iterable of dicts with the JSON values of the proto. They
            may have an additional "_id" field that will be ignored.
        proto_class: the Python class of the proto that should be contained in
            the values.
    Returns:
        The input values, for chainability.
    Raises:
        ValueError: if one of the values doesn't have the right format.
    """
    for value in values:
        proto = proto_class()
        _id = value.pop('_id', None)
        # Enforce Proto schema.
        try:
            json_format.Parse(json.dumps(value), proto)
        except json_format.ParseError as error:
            raise ValueError('Error while parsing:\n%s\n%s' %
                             (json.dumps(value, indent=2), error))
        if _id is not None:
            value['_id'] = _id
    return values
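
A hedged usage sketch of validate; the user_pb2 module and its UserProfile message are hypothetical stand-ins for whichever generated proto the caller imports:

    from bob_emploi.frontend.api import user_pb2  # hypothetical module path

    checked = validate(
        [{'_id': 'some-id', 'name': 'Test'}],  # hypothetical values
        user_pb2.UserProfile)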


if __name__ == '__main__':
    mongo.importer_main(airtable2dicts, 'test')  # pragma: no cover
Example #10
    file3 = pandas.read_csv(file3_path)
    file3.columns = ['postcodes', 'postcode']

    city_stats = cleaned_data.french_city_stats(data_folder)
    # Drop Paris (75056), Marseille (13055) and Lyon (69123), which are split
    # into arrondissements.
    city_stats = city_stats[
        ~city_stats.city_id.isin(['13055', '75056', '69123'])]
    postcode_to_range_mapping = {}
    for zip_codes in city_stats.zipCode:
        for zip_code in zip_codes.split('-'):
            postcode_to_range_mapping[zip_code] = zip_codes

    job_groups = cleaned_data.rome_job_groups(data_folder)
    rome_ids = job_groups.reset_index()
    rome_ids['merge_id'] = 1
    rome_ids = rome_ids[['code_rome', 'merge_id']]
    rome_ids.columns = ['rome_id', 'merge_id']
    file3['merge_id'] = 1
    outer_product = pandas.merge(file3, rome_ids, how='outer', on=['merge_id'])

    massaged_file3 = outer_product[['rome_id', 'postcode']]
    data_zones = pandas.concat([data_zones, massaged_file3])

    padded_postcodes = data_zones.postcode.astype(str).str.pad(5, 'left', '0')
    data_zones['postcodes'] = padded_postcodes.map(postcode_to_range_mapping)
    data_zones.drop_duplicates(['rome_id', 'postcodes'], inplace=True)
    return data_zones


if __name__ == "__main__":
    mongo.importer_main(csv2dicts, 'unverified_data_zones')  # pragma: no cover
Example #11
    for proto_name, airtable_name in _AIRTABLE_ASSET_TO_PROTO_FIELD.items():
        value = airtable_fields.get(airtable_name)
        if value:
            try:
                assets[proto_name] = _assert_markdown_list(value)
            except ValueError as error:
                errors.append(
                    ValueError('The field %s is not formatted correctly: %s' %
                               (airtable_name, error)))
    if errors:
        raise ValueError('The job %s has %d errors:\n%s' %
                         (airtable_fields.get('code_rome'), len(errors),
                          '\n'.join(str(error) for error in errors)))
    return airtable_fields['code_rome'], assets


def _assert_markdown_list(value):
    lines = value.strip().split('\n')
    if not lines:
        return ''
    for line in lines:
        if not _MARKDOWN_LIST_LINE_REGEXP.match(line):
            raise ValueError(
                'Each line should start with a * and an upper-case letter, '
                'found: %s' % line)
    return '\n'.join(lines)
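
_MARKDOWN_LIST_LINE_REGEXP is defined outside this snippet; judging from the error message above, a plausible sketch of it would be:

    import re

    # Assumption: a markdown bullet, a space, then an upper-case letter
    # (the real definition may also allow accented capitals).
    _MARKDOWN_LIST_LINE_REGEXP = re.compile(r'\* [A-Z]')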


if __name__ == "__main__":
    mongo.importer_main(make_dicts, 'job_group_info')  # pragma: no cover
Example #12
            - departement_id: the ID of the département or None if the group
              covers multiple départements.
            - region_id: the ID of the région or None if the group covers
              multiple régions.
        The DataFrame has only one row indexed with <city_id>:<code_rome>.
    """
    if len(job_seekers) < _MINIMUM_GROUP_SIZE:
        return None
    estimation = {
        'days': int(job_seekers.duration.median()),
    }
    departement_ids = job_seekers.departement_id.unique()
    region_ids = job_seekers.region_id.unique()
    code_rome = job_seekers.iloc[0]['code_rome']
    group_index = job_seekers.iloc[0]['city_id'] + ':' + code_rome
    return pandas.DataFrame(
        {
            'city_id': [job_seekers.iloc[0]['city_id']],
            'city_name': [job_seekers.iloc[0]['city_name']],
            'code_rome': [code_rome],
            'duration': [estimation],
            'departement_id':
            [departement_ids[0] if len(departement_ids) == 1 else None],
            'region_id': [region_ids[0] if len(region_ids) == 1 else None],
        },
        index=[group_index])


if __name__ == "__main__":
    mongo.importer_main(fhs2dicts, 'fhs_local_diagnosis')  # pragma: no cover
Example #13
def _workup_to_proto(event, departements):
    if event.get('address', '').strip().lower() == 'en ligne':
        geo_filters = []
    else:
        close_departements = departements[
            (departements.max_latitude + _LAT_BUFFER >= event['latitude'])
            & (departements.min_latitude - _LAT_BUFFER <= event['latitude']) &
            (departements.max_longitude + _LNG_BUFFER >= event['longitude']) &
            (departements.min_longitude - _LNG_BUFFER <= event['longitude'])]
        if close_departements.empty:
            raise ValueError('Event is not close to any French département:\n%s'
                             % event)
        geo_filters = [
            'for-departement(%s)' %
            ','.join(sorted(close_departements.departement_id))
        ]

    # TODO(pascal): Add better filters for reorientation.
    return {
        '_id': event['id'],
        'filters': geo_filters,
        'link': _WORKUP_EVENT_URL % event['slug'],
        'organiser': event['organiser'],
        'startDate': event['date'],
        'title': event['title'],
    }


if __name__ == '__main__':
    mongo.importer_main(events2dicts, 'events')  # pragma: no cover
Example #14
from bob_emploi.lib import mongo


def csv2dicts(stats_filename):
    """Prepare cities for upload to MongoDB.

    Args:
        stats_filename: path to a file containing stats about cities.

    Returns:
        A list of JSON-like dicts compatible with the geo_pb2.FrenchCity
        proto.
    """
    city_stats = pandas.read_csv(stats_filename,
                                 sep=',',
                                 header=None,
                                 usecols=[10, 19, 20],
                                 names=['_id', 'longitude', 'latitude'],
                                 dtype={
                                     '_id': str,
                                     'latitude': float,
                                     'longitude': float
                                 })
    # dropna() returns a new DataFrame, so assign the result back.
    city_stats = city_stats.dropna()
    return city_stats.to_dict(orient='records')


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'cities')  # pragma: no cover
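
A hedged usage sketch, invoking this importer the way the unit tests at the top of this page do; the gflags import and the CSV file name are assumptions:

    import gflags

    mongo.importer_main(
        csv2dicts, 'cities',
        ['', '--stats_filename', 'france_cities.csv'],  # hypothetical file
        flag_values=gflags.FlagValues())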
Example #15
    }


def _update_salaries(imt):
    old_imt = imt.loc['imt']
    new_salaries = imt.loc['updated_salaries']
    if not isinstance(new_salaries, dict):
        return old_imt
    return dict(old_imt, **new_salaries)


def finalize_salary_estimation(estimation):
    """Finalize the data for a SalaryEstimation proto.

    Args:
        estimation: a dict with min/max/medianSalary. This dict will be
            modified.

    Returns:
        The input dict with additional fields to be displayed.
    """
    estimation['shortText'] = '{} - {}'.format(
        locale.format('%d', estimation['minSalary'], grouping=True),
        locale.format('%d', estimation['maxSalary'], grouping=True))
    estimation['unit'] = 'ANNUAL_GROSS_SALARY'
    return estimation


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'local_diagnosis')  # pragma: no cover
Example #16
    def _get_jobs(self, count_threshold):
        jobs = self._get_sorted_requirements(_RequirementKind.job, count_threshold)
        for job_id, count, unused_percent_required in jobs:
            yield {
                'percentSuggested': round(100 * count / self.num_offers),
                'codeOgr': job_id,
            }


def csv2dicts(job_offers_csv, colnames_txt):
    """Import the requirement from job offers grouped by Job Group in MongoDB.

    Args:
        job_offers_csv: Path of the csv containing the data.
        colnames_txt: Path to a file containing the name of the CSV's columns.
    Returns:
        Requirements as a JobRequirements JSON-proto compatible dict.
    """
    job_groups = collections.defaultdict(_RequirementsCollector)
    for job_offer in job_offers.iterate(
            job_offers_csv, colnames_txt, _REQUIRED_FIELDS):
        job_groups[job_offer.rome_profession_card_code].collect(job_offer)
    return [
        dict(job_groups[job_group_id].get_proto_dict(), _id=job_group_id)
        for job_group_id in sorted(job_groups)]


if __name__ == "__main__":
    mongo.importer_main(csv2dicts, 'job_requirements')  # pragma: no cover
Example #17
    required_fields = {
        _CITY_CODE_FIELD, _JOB_GROUP_CODE_FIELD, _LATITUDE_CODE_FIELD,
        _LONGITUDE_CODE_FIELD, _CITY_NAME_CODE_FIELD}

    offers_rows = job_offers.iterate(offers_file, colnames, required_fields)

    city_data = _list_hiring_cities(offers_rows, min_creation_date, data_folder)

    # Computing the threshold per job group.
    job_group_threshold = collections.defaultdict(float)
    for job_group, offers in city_data.offers_per_job_group.items():
        job_group_threshold[job_group] = math.pow(offers, 0.6) / 40
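    # For instance, a job group with 10,000 offers nationwide gets a threshold
    # of math.pow(10000, 0.6) / 40 ≈ 6.28, so only cities with at least seven
    # offers in that group are kept (the comparison below is strict).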

    job_group_to_kept_cities = collections.defaultdict(list)
    for job_group, city_ids in city_data.job_group_to_city_ids.items():
        kept_cities = []
        for city_id, offer_count in city_ids.items():
            if offer_count > job_group_threshold[job_group]:
                kept_cities.append({'city': city_data.city_info[city_id], 'offers': offer_count})

        job_group_to_kept_cities[job_group] = sorted(
            kept_cities, key=lambda k: k['offers'], reverse=True)

    return [
        {'_id': job_group_id, 'hiringCities': job_group_weighted_cities}
        for job_group_id, job_group_weighted_cities in job_group_to_kept_cities.items()]


if __name__ == '__main__':
    mongo.importer_main(extract_offers_per_cities, 'hiring_cities')  # pragma: no cover
Example #18
    by_region = city_count.set_index(group_cols)
    by_region['region_count'] = region_count
    city_count = by_region.reset_index()

    # Compute country counts for each city.
    country_count = recent_offers.groupby('rome_id').id_offre.count()
    by_country = city_count.set_index('rome_id')
    by_country['country_count'] = country_count
    city_count = by_country.reset_index()

    for row in city_count.itertuples():
        res.append({
            '_id': row.rome_id + ':c' + row.city_code,
            'city': {
                'cityId': row.city_code,
                'name': row.city_name,
                'departementId': row.departement_code,
                'departementName': row.departement_name,
                'regionId': row.region_code,
                'regionName': row.region_name,
            },
            'cityCount': int(row.city_count),
            'departementCount': int(row.departement_count),
            'regionCount': int(row.region_count),
            'countryCount': int(row.country_count),
        })
    return res

if __name__ == "__main__":
    mongo.importer_main(csv2dicts, 'job_offers')  # pragma: no cover
Example #19
"""Importer for e-Territoire URLs into MongoDB."""
import requests

from bob_emploi.lib import mongo


def get_cities_dicts():
    """Download e-Territoire URLs from their website and prepare them.

    Returns:
        For each city (keyed by INSEE ID), a deep-link URL.
    """
    response = requests.get('http://www.eterritoire.fr/webservice/listeCommunes.php')
    response.raise_for_status()
    urls = response.json()

    return [{'_id': u['idinsee'], 'path': u['url']} for u in urls]


if __name__ == '__main__':
    mongo.importer_main(get_cities_dicts, 'eterritoire_links')  # pragma: no cover
Example #20
    ]]
    samples.rename(columns={
        'target_job': 'codeOgr',
        'target_job_name': 'name',
        'target_job_masculine_name': 'masculineName',
        'target_job_feminine_name': 'feminineName',
    }, inplace=True)

    return {
        'jobGroup': {
            'romeId': jobs.target_job_group.iloc[0],
            'name': jobs.target_job_group_name.iloc[0],
            'samples': samples.to_dict('records'),
        }
    }


def _sample_jobs(num_samples):
    def _sampling(jobs):
        if len(jobs.index) > num_samples:
            jobs = jobs.sample(n=num_samples)
        jobs = jobs[['codeOgr', 'name', 'masculineName', 'feminineName']]
        return jobs.to_dict('records')

    return _sampling


if __name__ == '__main__':
    mongo.importer_main(csv2dicts, 'similar_jobs')  # pragma: no cover
Example #21
from bob_emploi.lib import mongo

API_KEY = os.getenv('AIRTABLE_API_KEY')


def airtable2dicts(base_id, table, view=None):
    """Import the users email from Airtable.

    Args:
        base_id: the ID of your Airtable app.
        table: the name of the table to import.
        view: optional - the name of the view to import.
    Returns:
        an iterable of dicts with the JSON values of the proto.
    """
    if not API_KEY:
        raise ValueError(
            'No API key found. Create an airtable API key at '
            'https://airtable.com/account and set it in the AIRTABLE_API_KEY '
            'env var.')
    client = airtable.Airtable(base_id, API_KEY)
    records = client.iterate(table, view=view)

    return [{'_id': r.get('fields', {}).get('email', '')} for r in records]


if __name__ == '__main__':
    mongo.importer_main(airtable2dicts,
                        'show_unverified_data_users')  # pragma: no cover