Esempio n. 1
0
def validate_config(config, check_with_data):
    """
    Check that the config passed as argument is a valid configuration.

    The validation is a sequence of single-line ``assert`` statements. When
    one of them fails, or when a required key is missing, the source text of
    the offending line is returned so that the caller can report it.

    NOTE: the checks rely on ``assert`` and are therefore all skipped when
    Python runs with ``-O``.

    :param config: A config dictionary to check.
    :param check_with_data: Whether we should use the available OpenData to
        check the config values.
    :return: ``True`` if the configuration is valid, otherwise the source
        line (as a string) of the check that failed.
    """
    def _check_constraints_bounds(bounds):
        """
        Check the bounds for numeric constraints.

        ``bounds`` must be a pair ``(lower, upper)`` where each item is either
        ``None`` or a non-negative number and, when both are set, ``upper`` is
        strictly greater than ``lower``.
        """
        # Reject non-sequences (e.g. ``None``) as a failed check rather than
        # letting ``len()`` raise an uncaught TypeError.
        assert isinstance(bounds, (list, tuple))
        assert len(bounds) == 2
        assert all(
            x is None or
            (
                isinstance(x, (float, int)) and
                x >= 0
            )
            for x in bounds
        )
        if bounds[0] is not None and bounds[1] is not None:
            assert bounds[1] > bounds[0]

    try:
        # Note: The traceback fetching code only handles single line asserts.
        # Then, we disable line-too-long pylint check and E501 flake8 checks
        # and use long lines whenever needed, in order to have the full assert
        # message in the log output.
        # pylint: disable=locally-disabled,line-too-long

        assert config["passes"] in [0, 1, 2, 3]
        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501

        assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501
        # ``None`` is accepted by the check above, so it must not reach
        # ``os.path.isdir`` (which raises TypeError on ``None``).
        assert config["data_directory"] is None or os.path.isdir(config["data_directory"])  # noqa: E501
        assert isinstance(config["search_index"], str)
        assert config["modules_path"] is None or isinstance(config["modules_path"], str)  # noqa: E501

        assert config["database"] is None or isinstance(config["database"], str)  # noqa: E501

        assert isinstance(config["port"], int)
        assert isinstance(config["host"], str)
        assert config["webserver"] is None or isinstance(config["webserver"], str)  # noqa: E501
        assert config["backends"] is None or isinstance(config["backends"], list)  # noqa: E501

        assert isinstance(config["send_email"], bool)
        assert config["smtp_server"] is None or isinstance(config["smtp_server"], str)  # noqa: E501
        assert config["smtp_port"] is None or isinstance(config["smtp_port"], int)  # noqa: E501
        assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)

        # Ensure constraints are ok
        assert config["constraints"]
        for constraint in config["constraints"].values():
            assert "type" in constraint
            assert isinstance(constraint["type"], str)
            assert constraint["type"].upper() in ["RENT", "SALE", "SHARING"]

            assert "minimum_pictures" in constraint
            assert isinstance(constraint["minimum_pictures"], int)
            assert constraint["minimum_pictures"] >= 0

            assert "house_types" in constraint
            assert constraint["house_types"]
            for house_type in constraint["house_types"]:
                assert house_type.upper() in ["APART", "HOUSE", "PARKING", "LAND", "OTHER", "UNKNOWN"]  # noqa: E501

            assert "postal_codes" in constraint
            assert constraint["postal_codes"]
            if check_with_data:
                # Every postal code of the constraint must exist in the
                # loaded OpenData.
                opendata_postal_codes = [
                    x.postal_code
                    for x in data.load_data(PostalCode, constraint, config)
                ]
                for postal_code in constraint["postal_codes"]:
                    assert postal_code in opendata_postal_codes  # noqa: E501

            assert "area" in constraint
            _check_constraints_bounds(constraint["area"])

            assert "cost" in constraint
            _check_constraints_bounds(constraint["cost"])

            assert "rooms" in constraint
            _check_constraints_bounds(constraint["rooms"])

            assert "bedrooms" in constraint
            _check_constraints_bounds(constraint["bedrooms"])

            assert "time_to" in constraint
            assert isinstance(constraint["time_to"], dict)
            for name, item in constraint["time_to"].items():
                assert isinstance(name, str)
                assert "gps" in item
                assert isinstance(item["gps"], list)
                assert len(item["gps"]) == 2
                assert "time" in item
                _check_constraints_bounds(item["time"])

        return True
    except (AssertionError, KeyError):
        # Report the source text of the innermost failing line (last frame of
        # the traceback, last field of that frame).
        _, _, exc_traceback = sys.exc_info()
        return traceback.extract_tb(exc_traceback)[-1][-1]
Esempio n. 2
0
def guess_stations(flats_list, constraint, config):
    """
    Try to match the station field with a list of available stations nearby.

    For every flat having a ``station`` field, the station names are fuzzy
    matched against the loaded public transport data, then only the stations
    lying within ``config['max_distance_housing_station']`` of the flat postal
    code are kept. The result is stored in
    ``flat["flatisfy"]["matched_stations"]``.

    :param flats_list: A list of flats dict. Each flat must have an ``id`` and
        a ``flatisfy`` key.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.

    :return: An updated list of flats dict with guessed nearby stations (the
        flats are updated in place).
    """
    distance_threshold = config['max_distance_housing_station']
    opendata = {
        "postal_codes": data.load_data(PostalCode, constraint, config),
        "stations": data.load_data(PublicTransport, constraint, config)
    }

    # Loop-invariant lookups, built once instead of once per flat / station.
    station_names = []
    stations_by_name = {}
    for station_object in opendata["stations"]:
        station_names.append(station_object.name)
        # Multiple stations with the same name exist in a city, hence the
        # list of stations objects for a given name.
        stations_by_name.setdefault(station_object.name,
                                    []).append(station_object)
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        # Keep the first entry seen for a given postal code.
        gps_by_postal_code.setdefault(entry.postal_code,
                                      (entry.lat, entry.lng))

    for flat in flats_list:
        flat_station = flat.get("station", None)

        if not flat_station:
            # Skip everything if empty station
            LOGGER.info(
                "No stations field for flat %s, skipping stations lookup.",
                flat["id"])
            continue

        # Weboob modules can return several stations in a comma-separated list.
        flat_stations = flat_station.split(',')
        # But some stations containing a comma exist, so let's add the initial
        # value to the list of stations to check if there was one.
        if len(flat_stations) > 1:
            flat_stations.append(flat_station)

        matched_stations = []
        for tentative_station in flat_stations:
            matched_stations += fuzzy_match(
                tentative_station, station_names,
                limit=10,
                threshold=50)

        # Keep only one occurrence of each station
        matched_stations = list(set(matched_stations))

        # Filter out the stations that are obviously too far and not well
        # guessed
        good_matched_stations = []
        postal_code = flat["flatisfy"].get("postal_code", None)
        if not postal_code:
            LOGGER.info(
                "No postal code for flat %s, skipping stations detection.",
                flat["id"])
        elif postal_code not in gps_by_postal_code:
            # Without coordinates for the postal code no distance can be
            # computed; skip instead of failing on the lookup.
            LOGGER.warning(
                ("Postal code %s of flat %s is not in the opendata, "
                 "skipping stations detection."), postal_code, flat["id"])
        else:
            # There is a postal code, check that the matched stations are
            # close to it
            postal_code_gps = gps_by_postal_code[postal_code]
            for station in matched_stations:
                # ``station`` is a (name, confidence) match
                for station_data in stations_by_name.get(station[0], []):
                    distance = tools.distance(
                        (station_data.lat, station_data.lng), postal_code_gps)
                    if distance < distance_threshold:
                        # If at least one of the coordinates for a given
                        # station is close enough, that's ok and we can add
                        # the station
                        good_matched_stations.append({
                            "key": station[0],
                            "name": station_data.name,
                            "confidence": station[1],
                            "gps": (station_data.lat, station_data.lng)
                        })
                        break
                    LOGGER.info(
                        ("Station %s is too far from flat %s (%dm > %dm), "
                         "discarding this station."), station[0], flat["id"],
                        int(distance), int(distance_threshold))

        if not good_matched_stations:
            # No stations found, log it and continue with next housing
            LOGGER.info("No stations found for flat %s, matching %s.",
                        flat["id"], flat["station"])
            continue

        LOGGER.info("Found stations for flat %s: %s (matching %s).",
                    flat["id"], ", ".join(x["name"]
                                          for x in good_matched_stations),
                    flat["station"])

        # If some stations were already filled in and the result is different,
        # display some warning to the user
        if "matched_stations" in flat["flatisfy"]:
            # Do a set comparison, as ordering is not important
            previous_names = {
                station["name"]
                for station in flat["flatisfy"]["matched_stations"]
            }
            new_names = {station["name"] for station in good_matched_stations}
            if previous_names != new_names:
                LOGGER.warning(
                    "Replacing previously fetched stations for flat %s. Found "
                    "stations differ from the previously found ones.",
                    flat["id"])

        flat["flatisfy"]["matched_stations"] = good_matched_stations

    return flats_list
Esempio n. 3
0
def guess_postal_code(flats_list,
                      constraint,
                      config,
                      distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    A five-digit postal code found in the ``location`` field is used when it
    exists in the loaded postal codes data. Otherwise the location is fuzzy
    matched against city names. The guessed postal code is stored in
    ``flat["flatisfy"]["postal_code"]``.

    :param flats_list: A list of flats dict. Each flat must have an ``id`` and
        a ``flatisfy`` key.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.

    :return: An updated list of flats dict with guessed postal code (the
        flats are updated in place).
    """
    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}

    # Loop-invariant lookups, built once instead of once per flat.
    city_names = []
    postal_codes_by_city = {}
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        city_names.append(entry.name)
        postal_codes_by_city.setdefault(entry.name,
                                        []).append(entry.postal_code)
        # Keep the first entry seen for a given postal code.
        gps_by_postal_code.setdefault(entry.postal_code,
                                      (entry.lat, entry.lng))
    constraint_postal_codes = set(constraint["postal_codes"])

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(("No location field for flat %s, skipping postal "
                         "code lookup."), flat["id"])
            continue

        postal_code = None
        # Try to find a postal code directly. Plain conditions are used
        # (rather than asserts) so that this still works under ``python -O``.
        direct_match = re.search(r"[0-9]{5}", location)
        if (direct_match is not None
                and direct_match.group(0) in gps_by_postal_code):
            # The postal code is within the db
            postal_code = direct_match.group(0)
            LOGGER.info("Found postal code in location field for flat %s: %s.",
                        flat["id"], postal_code)

        # If not found, try to find a city
        if not postal_code:
            # Find all fuzzy-matching cities
            matched_cities = fuzzy_match(location, city_names, limit=None)
            # Find associated postal codes
            matched_postal_codes = []
            for matched_city_name, _ in matched_cities or []:
                matched_postal_codes.extend(
                    postal_codes_by_city.get(matched_city_name, []))
            # If there are some matched postal codes which are also in
            # config, use them preferentially. This avoids incorrectly
            # ignoring some flats in cities with multiple postal codes,
            # see #110. The first one in match order is taken so that the
            # choice is reproducible from one run to the next.
            preferred_postal_codes = [
                x for x in matched_postal_codes
                if x in constraint_postal_codes
            ]
            # Otherwise, simply take the first matched postal code.
            candidates = preferred_postal_codes or matched_postal_codes
            if candidates:
                postal_code = candidates[0]
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."), flat["id"], postal_code)

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes missing from the opendata have no
            # coordinates and cannot take part in the distance check.
            distances = [
                tools.distance(found_gps, gps_by_postal_code[x])
                for x in constraint["postal_codes"]
                if found_gps is not None and x in gps_by_postal_code
            ]
            if not distances:
                LOGGER.warning(
                    ("Cannot check the distance of postal code %s for flat "
                     "%s to the constraint postal codes, keeping it as is."),
                    postal_code, flat["id"])
            else:
                distance = min(distances)
                if distance > distance_threshold:
                    LOGGER.info(
                        ("Postal code %s found for flat %s is off-constraints "
                         "(distance is %dm > %dm). Let's consider it is an "
                         "artifact match and keep the post without this postal "
                         "code."), postal_code, flat["id"], int(distance),
                        int(distance_threshold))
                    postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"])
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
Esempio n. 4
0
def guess_postal_code(flats_list,
                      constraint,
                      config,
                      distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    A five-digit postal code found in the ``location`` field is used when it
    exists in the loaded postal codes data. Otherwise the location is fuzzy
    matched against city names and the postal code of the best matching city
    is used. The guessed postal code is stored in
    ``flat["flatisfy"]["postal_code"]``.

    :param flats_list: A list of flats dict. Each flat must have an ``id`` and
        a ``flatisfy`` key.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.

    :return: An updated list of flats dict with guessed postal code (the
        flats are updated in place).
    """
    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}

    # Loop-invariant lookups, built once instead of once per flat.
    # When several entries share a city name, the last one is kept.
    cities = {x.name: x for x in opendata["postal_codes"]}
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        # Keep the first entry seen for a given postal code.
        gps_by_postal_code.setdefault(entry.postal_code,
                                      (entry.lat, entry.lng))

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            # Skip everything if empty location
            LOGGER.info(("No location field for flat %s, skipping postal "
                         "code lookup."), flat["id"])
            continue

        postal_code = None
        # Try to find a postal code directly. Plain conditions are used
        # (rather than asserts) so that this still works under ``python -O``.
        direct_match = re.search(r"[0-9]{5}", location)
        if (direct_match is not None
                and direct_match.group(0) in gps_by_postal_code):
            # The postal code is within the db
            postal_code = direct_match.group(0)
            LOGGER.info("Found postal code in location field for flat %s: %s.",
                        flat["id"], postal_code)

        # If not found, try to find a city
        if not postal_code:
            matched_city = fuzzy_match(location, cities.keys(), limit=1)
            if matched_city:
                # Store the matching postal code; the first item of the best
                # match is the city name.
                matched_city_name = matched_city[0][0]
                postal_code = cities[matched_city_name].postal_code
                LOGGER.info(
                    ("Found postal code in location field through city lookup "
                     "for flat %s: %s."), flat["id"], postal_code)

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Constraint postal codes missing from the opendata have no
            # coordinates and cannot take part in the distance check.
            distances = [
                tools.distance(found_gps, gps_by_postal_code[x])
                for x in constraint["postal_codes"]
                if found_gps is not None and x in gps_by_postal_code
            ]
            if not distances:
                LOGGER.warning(
                    ("Cannot check the distance of postal code %s for flat "
                     "%s to the constraint postal codes, keeping it as is."),
                    postal_code, flat["id"])
            else:
                distance = min(distances)
                if distance > distance_threshold:
                    LOGGER.info(
                        ("Postal code %s found for flat %s is off-constraints "
                         "(distance is %dm > %dm). Let's consider it is an "
                         "artifact match and keep the post without this postal "
                         "code."), postal_code, flat["id"], int(distance),
                        int(distance_threshold))
                    postal_code = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code, postal_code, flat["id"])
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

    return flats_list
Esempio n. 5
0
def validate_config(config, check_with_data):
    """
    Check that the config passed as argument is a valid configuration.

    The validation is a sequence of single-line ``assert`` statements. When
    one of them fails, or when a required key is missing, the source text of
    the offending line is returned so that the caller can report it.

    NOTE: the checks rely on ``assert`` and are therefore all skipped when
    Python runs with ``-O``.

    :param config: A config dictionary to check.
    :param check_with_data: Whether we should use the available OpenData to
        check the config values.
    :return: ``True`` if the configuration is valid, otherwise the source
        line (as a string) of the check that failed.
    """
    def _check_constraints_bounds(bounds):
        """
        Check the bounds for numeric constraints.

        ``bounds`` must be a two-item list where each item is either ``None``
        or a non-negative number and, when both are set, the second one is
        strictly greater than the first one.
        """
        assert isinstance(bounds, list)
        assert len(bounds) == 2
        assert all(
            x is None or
            (
                isinstance(x, (float, int)) and
                x >= 0
            )
            for x in bounds
        )
        if bounds[0] is not None and bounds[1] is not None:
            assert bounds[1] > bounds[0]

    try:
        # Note: The traceback fetching code only handles single line asserts.
        # Then, we disable line-too-long pylint check and E501 flake8 checks
        # and use long lines whenever needed, in order to have the full assert
        # message in the log output.
        # pylint: disable=locally-disabled,line-too-long

        assert config["passes"] in [0, 1, 2, 3]
        assert config["max_entries"] is None or (isinstance(config["max_entries"], int) and config["max_entries"] > 0)  # noqa: E501

        assert config["data_directory"] is None or isinstance(config["data_directory"], str)  # noqa: E501
        # ``None`` is accepted by the check above, so it must not reach
        # ``os.path.isdir`` (which raises TypeError on ``None``).
        assert config["data_directory"] is None or os.path.isdir(config["data_directory"])  # noqa: E501
        assert isinstance(config["search_index"], str)
        assert config["modules_path"] is None or isinstance(config["modules_path"], str)  # noqa: E501

        assert config["database"] is None or isinstance(config["database"], str)  # noqa: E501

        assert isinstance(config["port"], int)
        assert isinstance(config["host"], str)
        assert config["webserver"] is None or isinstance(config["webserver"], str)  # noqa: E501
        assert config["backends"] is None or isinstance(config["backends"], list)  # noqa: E501

        assert isinstance(config["send_email"], bool)
        assert config["smtp_server"] is None or isinstance(config["smtp_server"], str)  # noqa: E501
        assert config["smtp_port"] is None or isinstance(config["smtp_port"], int)  # noqa: E501
        assert config["smtp_username"] is None or isinstance(config["smtp_username"], str)  # noqa: E501
        assert config["smtp_password"] is None or isinstance(config["smtp_password"], str)  # noqa: E501
        assert config["smtp_to"] is None or isinstance(config["smtp_to"], list)

        assert isinstance(config["store_personal_data"], bool)
        assert isinstance(config["max_distance_housing_station"], (int, float))
        assert isinstance(config["duplicate_threshold"], int)
        assert isinstance(config["duplicate_image_hash_threshold"], int)

        # API keys
        assert config["navitia_api_key"] is None or isinstance(config["navitia_api_key"], str)  # noqa: E501
        assert config["mapbox_api_key"] is None or isinstance(config["mapbox_api_key"], str)  # noqa: E501

        # Ensure constraints are ok
        assert config["constraints"]
        for constraint in config["constraints"].values():
            assert "type" in constraint
            assert isinstance(constraint["type"], str)
            assert constraint["type"].upper() in POSTS_TYPES.__members__

            assert "minimum_nb_photos" in constraint
            # The type and range are only checked for truthy values.
            if constraint["minimum_nb_photos"]:
                assert isinstance(constraint["minimum_nb_photos"], int)
                assert constraint["minimum_nb_photos"] >= 0

            assert "description_should_contain" in constraint
            assert isinstance(constraint["description_should_contain"], list)
            if constraint["description_should_contain"]:
                for term in constraint["description_should_contain"]:
                    assert isinstance(term, str)

            assert "description_should_not_contain" in constraint
            assert isinstance(constraint["description_should_not_contain"],
                              list)
            if constraint["description_should_not_contain"]:
                for term in constraint["description_should_not_contain"]:
                    assert isinstance(term, str)

            assert "house_types" in constraint
            assert constraint["house_types"]
            for house_type in constraint["house_types"]:
                assert house_type.upper() in HOUSE_TYPES.__members__

            assert "postal_codes" in constraint
            assert constraint["postal_codes"]
            assert all(isinstance(x, str) for x in constraint["postal_codes"])
            if check_with_data:
                # Ensure data is built into db
                data.preprocess_data(config, force=False)
                # Check postal codes
                opendata_postal_codes = [
                    x.postal_code
                    for x in data.load_data(PostalCode, constraint, config)
                ]
                for postal_code in constraint["postal_codes"]:
                    assert postal_code in opendata_postal_codes  # noqa: E501

            assert "area" in constraint
            _check_constraints_bounds(constraint["area"])

            assert "cost" in constraint
            _check_constraints_bounds(constraint["cost"])

            assert "rooms" in constraint
            _check_constraints_bounds(constraint["rooms"])

            assert "bedrooms" in constraint
            _check_constraints_bounds(constraint["bedrooms"])

            assert "time_to" in constraint
            assert isinstance(constraint["time_to"], dict)
            for name, item in constraint["time_to"].items():
                assert isinstance(name, str)
                assert "gps" in item
                assert isinstance(item["gps"], list)
                assert len(item["gps"]) == 2
                assert "time" in item
                _check_constraints_bounds(item["time"])
                if "mode" in item:
                    # Lookup used as a check: presumably ``TimeToModes`` is an
                    # enum and an unknown name raises the KeyError handled
                    # below -- TODO confirm.
                    TimeToModes[item["mode"]]

        return True
    except (AssertionError, KeyError):
        # Report the source text of the innermost failing line (last frame of
        # the traceback, last field of that frame).
        _, _, exc_traceback = sys.exc_info()
        return traceback.extract_tb(exc_traceback)[-1][-1]
Esempio n. 6
0
def guess_postal_code(flats_list, constraint, config, distance_threshold=20000):
    """
    Try to guess the postal code from the location of the flats.

    The ``location`` field of the flat is used, falling back on the
    ``full_address`` item of its ``address`` field. A five-digit postal code
    found in that text is used when it exists in the loaded postal codes data,
    then ``guess_location_position`` provides the postal code, INSEE code and
    position. Results are stored under ``flat["flatisfy"]`` (``postal_code``,
    ``insee_code`` and ``position`` keys).

    :param flats_list: A list of flats dict. Each flat must have an ``id`` and
        a ``flatisfy`` key.
    :param constraint: The constraint that the ``flats_list`` should satisfy.
    :param config: A config dict.
    :param distance_threshold: Maximum distance in meters between the
        constraint postal codes (from config) and the one found by this
        function, to avoid bad fuzzy matching. Can be ``None`` to disable
        thresholding.

    :return: An updated list of flats dict with guessed postal code (the
        flats are updated in place).
    """
    opendata = {"postal_codes": data.load_data(PostalCode, constraint, config)}

    # Loop-invariant lookup, built once instead of once per flat.
    gps_by_postal_code = {}
    for entry in opendata["postal_codes"]:
        # Keep the first entry seen for a given postal code.
        gps_by_postal_code.setdefault(entry.postal_code, (entry.lat, entry.lng))

    for flat in flats_list:
        location = flat.get("location", None)
        if not location:
            addr = flat.get("address", None)
            if addr:
                # A missing "full_address" is handled as an empty location
                location = addr.get("full_address")
        if not location:
            # Skip everything if empty location
            LOGGER.info(
                ("No location field for flat %s, skipping postal code lookup. (%s)"),
                flat["id"],
                flat.get("address"),
            )
            continue

        postal_code = None
        insee_code = None
        position = None

        # Try to find a postal code directly. Plain conditions are used
        # (rather than asserts) so that this still works under ``python -O``.
        direct_match = re.search(r"[0-9]{5}", location)
        if direct_match is not None and direct_match.group(0) in gps_by_postal_code:
            # The postal code is within the db
            postal_code = direct_match.group(0)
            LOGGER.debug(
                "Found postal code directly in location field for flat %s: %s.",
                flat["id"],
                postal_code,
            )

        # Then fetch position (and postal_code if it couldn't be found earlier)
        cities = opendata["postal_codes"]
        if postal_code:
            cities = [x for x in cities if x.postal_code == postal_code]
        (postal_code, insee_code, position) = guess_location_position(
            location, cities, constraint, postal_code is not None
        )

        # Check that postal code is not too far from the ones listed in config,
        # limit bad fuzzy matching
        if postal_code and distance_threshold:
            found_gps = gps_by_postal_code.get(postal_code)
            # Postal codes missing from the opendata have no coordinates and
            # cannot take part in the distance check.
            distances = [
                tools.distance(found_gps, gps_by_postal_code[x])
                for x in constraint["postal_codes"]
                if found_gps is not None and x in gps_by_postal_code
            ]
            if not distances:
                LOGGER.warning(
                    (
                        "Cannot check the distance of postal code %s for flat "
                        "%s to the constraint postal codes, keeping it as is."
                    ),
                    postal_code,
                    flat["id"],
                )
            else:
                distance = min(distances)
                if distance > distance_threshold:
                    LOGGER.info(
                        (
                            "Postal code %s found for flat %s @ %s is off-constraints "
                            "(distance is %dm > %dm). Let's consider it is an "
                            "artifact match and keep the post without this postal "
                            "code."
                        ),
                        postal_code,
                        flat["id"],
                        location,
                        int(distance),
                        int(distance_threshold),
                    )
                    # NOTE(review): insee_code is kept (and stored below) even
                    # though the postal code is rejected -- confirm intended.
                    postal_code = None
                    position = None

        # Store it
        if postal_code:
            existing_postal_code = flat["flatisfy"].get("postal_code", None)
            if existing_postal_code and existing_postal_code != postal_code:
                LOGGER.warning(
                    "Replacing previous postal code %s by %s for flat %s.",
                    existing_postal_code,
                    postal_code,
                    flat["id"],
                )
            flat["flatisfy"]["postal_code"] = postal_code
        else:
            LOGGER.info("No postal code found for flat %s.", flat["id"])

        if insee_code:
            flat["flatisfy"]["insee_code"] = insee_code

        if position:
            flat["flatisfy"]["position"] = position
        LOGGER.debug(
            "found postal_code=%s insee_code=%s position=%s for flat %s (%s).",
            postal_code,
            insee_code,
            position,
            flat["id"],
            location,
        )

    return flats_list