Example #1
0
File: matcher.py Project: 3taps/geo
    def calc_parents(self, location, parents_by_level):
        """ Recursively calculate the given location's parents.

            The parameters are as follows:

                'location'

                    A dictionary with the details of this location, as returned
                    by a previous call to locationCache.get().  Only its
                    'parents' entry (the location codes of its direct parents)
                    is read here.

                'parents_by_level'

                    A dictionary mapping level numbers to a set of location
                    codes at that level.  Entries are looked up by indexing, so
                    a plain dictionary must already hold a set for every level
                    a parent can have.

            We add this location's parents to 'parents_by_level', and then
            recursively process each parent so that its parents get added to
            'parents_by_level' as well.  Nothing is returned; the results are
            accumulated in 'parents_by_level'.
        """
        # Look each parent up once and reuse the record for both the
        # bookkeeping and the recursion, rather than fetching it twice.

        parents = [(parent_code, locationCache.get(parent_code))
                   for parent_code in location["parents"]]

        # Record all of the direct parents first...

        for parent_code, parent in parents:
            parents_by_level[parent["level"]].add(parent_code)

        # ...then walk further up the hierarchy from each of them.

        for _, parent in parents:
            self.calc_parents(parent, parents_by_level)
Example #2
0
File: matcher.py Project: 3taps/geo
    def location_matched(self, field, location, log):
        """ Record a matched location for a field and infer what we can from it.

            The parameters are as follows:

                'field'

                    The name of the field the location was calculated for.

                'location'

                    A dictionary describing the matched location.  Only its
                    'code' entry is read here.

                'log'

                    A list of strings to append debugging messages to, or None
                    if no debugging output is wanted.

            The field is marked as known and its set of possible values is
            narrowed to the matched location code.  We then collect every
            ancestor of the matched location, grouped by level, and narrow the
            possible values of each field in LOCATION_FIELDS to the ancestors
            found at that field's level.

            We return an (inferred_locs, error) tuple:

                'inferred_locs'

                    A dictionary mapping field names to a dictionary with
                    'code' and 'name' entries, one for each not-yet-known field
                    which was narrowed down to exactly one possible location.
                    Those fields are marked as known as well.

                'error'

                    None on success.  If narrowing a field leaves it with no
                    possible values at all, we stop immediately and return
                    ({}, "Inconsistent location values").
        """
        verbose = log is not None
        matched_code = location["code"]

        self.known_fields.add(field)

        # Narrow this field down to the matched code.  If the matched code
        # wasn't among the values previously considered possible, nothing is
        # left and we have to report an inconsistency.

        values_before = list(self.possible_values[field])
        self.update_possible_values(field, {matched_code})

        if len(self.possible_values[field]) == 0:
            if verbose:
                s = self.display_list(
                    values_before,
                    "possible value for this field is:",
                    "possible values for this field are:",
                )
                log.append("Oops!  We set the '" + field + "' field to "
                           + matched_code + ", but the " + s + ".")
            return ({}, "Inconsistent location values")

        # Gather the parents (and parents-of-parents, etc) of the matched
        # location, bucketed by level number.

        parents_by_level = {level: set() for level in range(1, 8)}
        matched = locationCache.get(code=matched_code)
        self.calc_parents(matched, parents_by_level)

        # Apply the ancestors found at each level to the matching field.

        inferred_locs = {}

        for level_field, level in LOCATION_FIELDS:
            ancestors = parents_by_level[level]
            if len(ancestors) == 0:
                continue  # Nothing learned about this level.

            values_before = list(self.possible_values[level_field])
            self.update_possible_values(level_field, ancestors)
            num_remaining = len(self.possible_values[level_field])

            if num_remaining == 0:
                # The ancestors contradict what we already knew about this
                # field -> give up.
                if verbose:
                    s1 = self.display_list(
                        list(ancestors), "must have the value", "can have the value"
                    )
                    s2 = self.display_list(
                        values_before,
                        "only possible value for this field is:",
                        "possible values for this field are:",
                    )
                    log.append("Oops!  We inferred that the '" + level_field
                               + "' field " + s1 + ", but the " + s2 + ".")
                return ({}, "Inconsistent location values")

            if num_remaining != 1 or level_field in self.known_fields:
                continue  # Still ambiguous, or nothing new to report.

            # Exactly one candidate remains for a field we didn't know yet ->
            # treat it as inferred.

            loc_code = list(self.possible_values[level_field])[0]
            inferred = locationCache.get(code=loc_code)

            inferred_locs[level_field] = {"code": loc_code, "name": inferred["name"]}
            self.known_fields.add(level_field)

        return (inferred_locs, None)
Example #3
0
File: matcher.py Project: 3taps/geo
def matchStructuredLocation(source_code, loc_data, location, verbose):
    """ Attempt to match against the various parts of a structured location.

        The parameters are as follows:

            'source_code'

                The 3taps code for the data source we are processing data for.
                If this is None, source filters on location names are not
                applied.

            'loc_data'

                A copy of the structured location data.  This should be a
                dictionary with some or all of the following entries:

                    "country"
                    "state"
                    "metro"
                    "region"
                    "county"
                    "city"
                    "locality"
                    "zipCode"

            'location'

                The location we are processing.  This will be the dictionary
                returned back to the caller for this posting.  When 'verbose'
                is true it must already contain a 'log' list.

            'verbose'

                If True, add entries to location['log'] describing the progress
                of the match attempt.

        We add entries to the 'location' dictionary for the given location
        data and source.  Fields we could not resolve get a dictionary with a
        single 'error' entry ("No such ZIP code", "No such name", "Ambiguous
        location name", or the error reported by the inferrer).  Nothing is
        returned.
    """
    log = location["log"] if verbose else None

    # Prepare to calculate the location for each of the supplied fields.  The
    # 'calculated_fields' dictionary contains an entry for each field that
    # we've explicitly calculated, while the 'inferred_fields' dictionary
    # contains an entry for each field that we've inferred.  Both map a field
    # name to a dictionary with 'code' and 'name' entries (or, for a
    # calculated field which failed, an 'error' entry).

    calculated_fields = {}
    inferred_fields = {}

    # Prepare to infer additional locations based on the values we calculate.

    inferrer = LocationInferrer()

    # Start by dealing with the ZIP code field, if we've been given one.  We
    # have to deal with ZIP codes specially, as there are no "names" for a ZIP
    # code.

    if "zipCode" in loc_data:
        try:
            loc = Location.objects.get(code="USA-" + str(loc_data["zipCode"]))
            zip_result = {"code": loc.code, "name": loc.name}
        except Location.DoesNotExist:
            zip_result = {"error": "No such ZIP code"}

        calculated_fields["zipCode"] = zip_result

        # Use the 'in' operator rather than dict.has_key(), which no longer
        # exists in Python 3.

        if "error" not in zip_result:
            if verbose:
                log.append("Assigning ZIP code: " + zip_result["code"])

            inferred_locs, error = inferrer.location_matched("zipCode", zip_result, log)
            if error is not None:
                # Oops -- the ZIP code itself yielded an inconsistency.  Reject
                # the ZIP code value.
                if verbose:
                    log.append("Inferrer failed, error = " + error)
                calculated_fields["zipCode"] = {"error": error}

            # Remember the inferred locations, if any.

            for inferred_field, loc in inferred_locs.items():
                if verbose:
                    log.append("Inferring " + inferred_field + " = " + loc["code"])
                inferred_fields[inferred_field] = loc

    # Now find the list of possible locations for each of the fields we've been
    # given (except for the ZIP code field).  If a field's name can't be found,
    # we update the 'location' dictionary to indicate the failure and leave
    # that field with an empty list of possible locations.
    #
    # 'possible_locs' maps each field name to a list of possible locations
    # that the supplied field value can map to.  Each list item is a
    # dictionary with the following entries:
    #
    #     'code'
    #
    #         The location code to use if this location is selected.
    #
    #     'name'
    #
    #         The name of the location to use if this location is selected.
    #
    #     'filter'
    #
    #         A dictionary mapping field names to location codes.
    #
    #     'active'
    #
    #         True iff this possible location hasn't been rejected.

    # Maps each filter field name to the location-name entry which holds it.

    filter_keys = (
        ("country", "countryFilter"),
        ("state", "stateFilter"),
        ("metro", "metroFilter"),
        ("region", "regionFilter"),
        ("county", "countyFilter"),
        ("city", "cityFilter"),
    )

    possible_locs = {}

    for field, level in LOCATION_FIELDS:
        if field not in loc_data:
            continue

        possible_locs[field] = []

        name_id = nameCache.search(level, loc_data[field])
        if name_id is None:
            location[field] = {"error": "No such name"}
            continue

        for loc_name in locationNameCache.filterByNameID(name_id):

            # If this location name uses a source filter and we've been given a
            # source, only include the location name if the source is correct.

            if source_code is not None and loc_name["sourceFilter"] is not None:
                if loc_name["sourceFilter"] != source_code:
                    continue

            # Collect the filter values (if any) that this location name
            # depends on.

            name_filter = {}
            for filter_field, filter_key in filter_keys:
                if loc_name[filter_key] is not None:
                    name_filter[filter_field] = loc_name[filter_key]

            if verbose:
                filter_values = [key + "=" + value
                                 for key, value in sorted(name_filter.items())]
                if filter_values:
                    filter_str = "filter: " + ", ".join(filter_values)
                else:
                    filter_str = "no filter"

                log.append(
                    "Considering location "
                    + loc_name["loc_code"]
                    + " as a possible meaning of "
                    + field
                    + " name '"
                    + loc_data[field]
                    + "' with "
                    + filter_str
                )

            # Add this location name to the list of possible locations for this
            # name.

            loc = locationCache.get(loc_name["loc_code"])

            possible_locs[field].append(
                {"code": loc_name["loc_code"], "name": loc["name"], "filter": name_filter, "active": True}
            )

    # Reject location names which depend on filter values we can't calculate
    # directly.  If, for example, a city name depends on a metro name, and we
    # haven't been given a metro name, then we reject that possible location
    # name because we'll never be able to match against it.

    for field, level in LOCATION_FIELDS:
        if field not in loc_data:
            continue

        # Find the lowest level field that we've got a name for, starting one
        # level up from the current field's level.

        lowest_index = None
        for i in range(len(LOCATION_FIELDS) - 1, -1, -1):
            aField, aLevel = LOCATION_FIELDS[i]
            if aLevel >= level:
                continue
            if aField in loc_data:
                lowest_index = i
                break

        if lowest_index is None:
            continue  # No higher-level name supplied -> nothing is rejected.

        # Any filter on a field listed after that one can't be satisfied.  Note
        # that a possibility is logged once for every offending filter field.

        for possibility in possible_locs[field]:
            for filter_field, _ in LOCATION_FIELDS[lowest_index + 1:]:
                if filter_field in possibility["filter"]:
                    if verbose:
                        log.append(
                            "Rejecting "
                            + possibility["code"]
                            + " as a possible "
                            + field
                            + " value because it has a filter"
                            + " value we can't calculate."
                        )
                    possibility["active"] = False

    # Display a summary of the possible locations for each field, if we're
    # running in verbose mode.

    if verbose:
        log.append("")
        log.append("Possible field values:")
        for field, level in LOCATION_FIELDS:
            if field not in loc_data:
                continue

            possible_values = []
            for possibility in possible_locs[field]:
                if not possibility["active"]:
                    continue

                if len(possibility["filter"]) > 0:
                    pairs = [key + "=" + value
                             for key, value in possibility["filter"].items()]
                    suffix = " (" + ",".join(pairs) + ")"
                else:
                    suffix = ""

                possible_values.append(possibility["code"] + suffix)

            log.append("   " + field + ":" + ", ".join(possible_values))
        log.append("")

    # Now repeatedly go through the list of fields, attempting to calculate the
    # location to use for each field.  Keep going for as long as we make
    # progress.

    progress_made = True
    while progress_made:

        progress_made = False  # Until a field reports progress in this pass.

        for field, level in LOCATION_FIELDS:
            if field not in loc_data:
                continue  # Not supplied.
            if field in calculated_fields:
                continue  # Already calculated.

            # NOTE(review): 'calculated_fields' is handed to
            # calc_loc_for_field(), which is presumably where a successful
            # result gets recorded -- this function only stores the failure
            # case itself.  Confirm against calc_loc_for_field().

            progress, results = calc_loc_for_field(field, possible_locs[field], calculated_fields, log)

            if not progress:
                continue

            progress_made = True

            if results is None:
                continue

            # We've successfully identified the location to use for this
            # field.

            if verbose:
                log.append("Identified " + results["code"] + " as the " + field + " value.")

            # Use the newly-calculated location to infer other location
            # values, if possible.

            inferred_locs, error = inferrer.location_matched(field, results, log)

            if error is not None:
                # Oops -- setting the field to this value resulted in an
                # inconsistency.  Reject this field value.
                if verbose:
                    log.append("Inferrer failed, error = " + error)
                calculated_fields[field] = {"error": error}

            # Remember the inferred locations, if any.

            for inferred_field, loc in inferred_locs.items():
                if verbose:
                    log.append("Inferring " + inferred_field + " = " + loc["code"])
                inferred_fields[inferred_field] = loc

    # Finally, update 'location' with the results of our calculation and
    # inference.  Inferred values never replace an entry which is already
    # present, while calculated values always win.

    for field, inferred in inferred_fields.items():
        if field not in location:
            location[field] = inferred

    for field, calculated in calculated_fields.items():
        location[field] = calculated

    # Every calculated or inferred field now has an entry in 'location', so
    # any supplied field which is still missing could not be resolved at all.

    for field, level in LOCATION_FIELDS:
        if field in loc_data and field not in location:
            if verbose:
                log.append(
                    "Oops!  Failed to calculate a "
                    + field
                    + " value for the name '"
                    + loc_data[field]
                    + "'.  Treating this field as ambiguous."
                )
            location[field] = {"error": "Ambiguous location name"}