Exemple #1
0
def matchStructuredLocation(source_code, loc_data, location, verbose):
    """ Attempt to match against the various parts of a structured location.

        The parameters are as follows:

            'source_code'

                The 3taps code for the data source we are processing data for,
                or None.  When a source is given, location names whose source
                filter names a different source are ignored.

            'loc_data'

                A copy of the structured location data.  This should be a
                dictionary with some or all of the following entries:

                    "country"
                    "state"
                    "metro"
                    "region"
                    "county"
                    "city"
                    "locality"
                    "zipCode"

            'location'

                The location we are processing.  This will be the dictionary
                returned back to the caller for this posting.  It is updated in
                place.

            'verbose'

                If True, add entries to location['log'] describing the progress
                of the match attempt.  location['log'] must already exist in
                that case.

        We add entries to the 'location' dictionary for the given location
        data and source.  Each entry maps a field name either to a location
        dictionary (with at least a 'code' entry) or to a dictionary of the
        form {"error": message}.  The error messages produced directly by this
        function are "No such ZIP code", "No such name" and "Ambiguous location
        name"; errors reported by the inferrer are stored verbatim.

        Nothing is returned; all results are communicated through 'location'.
    """
    log = location["log"] if verbose else None

    # 'calculated_fields' holds the fields we've explicitly calculated, while
    # 'inferred_fields' holds the fields we've inferred from them.  Both map a
    # field name to a dictionary with 'code' and 'name' entries (or, for
    # calculated fields which failed, an 'error' entry).

    calculated_fields = {}
    inferred_fields = {}

    # Prepare to infer additional locations based on the values we calculate.

    inferrer = LocationInferrer()

    # Start by dealing with the ZIP code field, if we've been given one.  We
    # have to deal with ZIP codes specially, as there are no "names" for a ZIP
    # code -- the location is looked up directly by its code.

    if "zipCode" in loc_data:
        try:
            loc = Location.objects.get(code="USA-" + str(loc_data["zipCode"]))
        except Location.DoesNotExist:
            zipCode = {"error": "No such ZIP code"}
        else:
            zipCode = {"code": loc.code, "name": loc.name}

        calculated_fields["zipCode"] = zipCode

        if "error" not in zipCode:
            if verbose:
                log.append("Assigning ZIP code: " + zipCode["code"])
            _infer_from_match(inferrer, "zipCode", zipCode, calculated_fields, inferred_fields, log, verbose)

    # Now find the list of possible locations for each of the fields we've been
    # given (except for the ZIP code field).  If a field's name can't be found,
    # we update the 'location' dictionary to indicate the failure; the field is
    # left with an empty list of possibilities.
    #
    # 'possible_locs' maps each field name to a list of possible locations that
    # the supplied field value can map to.  Each list item is a dictionary with
    # the following entries:
    #
    #     'code'    The location code to use if this location is selected.
    #     'name'    The name of the location to use if it is selected.
    #     'filter'  A dictionary mapping field names to location codes.
    #     'active'  True iff this possible location hasn't been rejected.

    possible_locs = {}

    for field, level in LOCATION_FIELDS:
        if field not in loc_data:
            continue

        possible_locs[field] = []

        name_id = nameCache.search(level, loc_data[field])
        if name_id is None:
            location[field] = {"error": "No such name"}
            continue

        for loc_name in locationNameCache.filterByNameID(name_id):

            # If this location name uses a source filter and we've been given a
            # source, only include the location name if the source is correct.

            if source_code is not None and loc_name["sourceFilter"] is not None:
                if loc_name["sourceFilter"] != source_code:
                    continue

            name_filter = _build_name_filter(loc_name)

            if verbose:
                filter_values = [key + "=" + value for key, value in sorted(name_filter.items())]
                if filter_values:
                    filter_str = "filter: " + ", ".join(filter_values)
                else:
                    filter_str = "no filter"

                log.append(
                    "Considering location "
                    + loc_name["loc_code"]
                    + " as a possible meaning of "
                    + field
                    + " name '"
                    + loc_data[field]
                    + "' with "
                    + filter_str
                )

            loc = locationCache.get(loc_name["loc_code"])

            possible_locs[field].append(
                {"code": loc_name["loc_code"], "name": loc["name"], "filter": name_filter, "active": True}
            )

    # Reject location names which depend on filter values we can't calculate
    # directly.  If, for example, a city name depends on a metro name, and we
    # haven't been given a metro name, then we reject that possible location
    # name because we'll never be able to match against it.

    for field, level in LOCATION_FIELDS:
        if field not in loc_data:
            continue

        # Find the lowest level field that we've got a name for, starting one
        # level up from the current field's level.  We scan LOCATION_FIELDS
        # backwards so the first hit is the lowest such field.

        lowest_index = None
        for i in range(len(LOCATION_FIELDS) - 1, -1, -1):
            other_field, other_level = LOCATION_FIELDS[i]
            if other_level < level and other_field in loc_data:
                lowest_index = i
                break

        if lowest_index is None:
            continue  # No higher-level field was supplied; nothing to reject.

        # Any filter on a field listed after the lowest supplied field can't be
        # satisfied.  One log entry is written per offending filter field.

        for candidate in possible_locs[field]:
            for filter_field, filter_level in LOCATION_FIELDS[lowest_index + 1:]:
                if filter_field in candidate["filter"]:
                    if verbose:
                        log.append(
                            "Rejecting "
                            + candidate["code"]
                            + " as a possible "
                            + field
                            + " value because it has a filter"
                            + " value we can't calculate."
                        )
                    candidate["active"] = False

    # Display a summary of the possible locations for each field, if we're
    # running in verbose mode.

    if verbose:
        log.append("")
        log.append("Possible field values:")
        for field, level in LOCATION_FIELDS:
            if field not in loc_data:
                continue

            possible_values = []
            for possibility in possible_locs[field]:
                if not possibility["active"]:
                    continue

                if possibility["filter"]:
                    pairs = [key + "=" + value for key, value in possibility["filter"].items()]
                    suffix = " (" + ",".join(pairs) + ")"
                else:
                    suffix = ""

                possible_values.append(possibility["code"] + suffix)

            log.append("   " + field + ":" + ", ".join(possible_values))
        log.append("")

    # Now repeatedly go through the list of fields, attempting to calculate the
    # location to use for each field.  Keep going for as long as we make
    # progress; a pass with no progress on any remaining field ends the loop.
    #
    # NOTE(review): this function never stores 'results' in 'calculated_fields'
    # itself, so calc_loc_for_field() is presumably responsible for recording
    # the chosen location there -- confirm against its implementation.

    progress_made = True
    while progress_made:
        progress_made = False

        for field, level in LOCATION_FIELDS:
            if field not in loc_data:
                continue  # Not supplied.
            if field in calculated_fields:
                continue  # Already calculated.

            progress, results = calc_loc_for_field(field, possible_locs[field], calculated_fields, log)

            if not progress:
                continue

            progress_made = True

            if results is not None:
                # We've successfully identified the location to use for this
                # field.  Use it to infer other location values, if possible.

                if verbose:
                    log.append("Identified " + results["code"] + " as the " + field + " value.")

                _infer_from_match(inferrer, field, results, calculated_fields, inferred_fields, log, verbose)

    # Finally, update 'location' with the results of our calculation and
    # inference.  Inferred values never overwrite an existing entry, while
    # calculated values always do.

    for field, loc in inferred_fields.items():
        if field not in location:
            location[field] = loc

    location.update(calculated_fields)

    # Any supplied field that still has no entry is ambiguous.  (Every
    # calculated or inferred field is in 'location' by now, so checking
    # 'location' alone is sufficient.)

    for field, level in LOCATION_FIELDS:
        if field in loc_data and field not in location:
            if verbose:
                log.append(
                    "Oops!  Failed to calculate a "
                    + field
                    + " value for the name '"
                    + loc_data[field]
                    + "'.  Treating this field as ambiguous."
                )
            location[field] = {"error": "Ambiguous location name"}


def _build_name_filter(loc_name):
    """ Return the filter dictionary for a cached location name.

        'loc_name' is a dictionary with "countryFilter", "stateFilter",
        "metroFilter", "regionFilter", "countyFilter" and "cityFilter" entries.
        The result maps the field name (without the "Filter" suffix) to the
        entry's value, for every entry which is not None.
    """
    name_filter = {}
    for filter_field in ("country", "state", "metro", "region", "county", "city"):
        filter_code = loc_name[filter_field + "Filter"]
        if filter_code is not None:
            name_filter[filter_field] = filter_code
    return name_filter


def _infer_from_match(inferrer, field, matched_loc, calculated_fields, inferred_fields, log, verbose):
    """ Tell the inferrer about a matched field and record what it infers.

        If the inferrer reports an error, the match is inconsistent with what
        we already know, so calculated_fields[field] is replaced with an
        {"error": ...} entry.  Whatever locations the inferrer returns are
        stored into 'inferred_fields' either way.  Progress is appended to
        'log' when 'verbose' is true.
    """
    inferred_locs, error = inferrer.location_matched(field, matched_loc, log)

    if error is not None:
        # Oops -- setting the field to this value resulted in an
        # inconsistency.  Reject this field value.
        if verbose:
            log.append("Inferrer failed, error = " + error)
        calculated_fields[field] = {"error": error}

    # Remember the inferred locations, if any.

    for inferred_field, loc in inferred_locs.items():
        if verbose:
            log.append("Inferring " + inferred_field + " = " + loc["code"])
        inferred_fields[inferred_field] = loc
Exemple #2
0
def parseUnstructuredLocation(location, source, log):
    """ Attempt to parse an unstructured location.

        The parameters are as follows:

            'location'

                The text of the location to parse, as a string.

            'source'

                The 3taps source code for the data source, or None if no source
                was specified.

            'log'

                If we are in verbose mode, this will be a list of strings that
                we can append our debugging information onto.  If we are not in
                verbose mode, this will be set to None.

        We attempt to parse the given text, identifying either a structured
        location or a geographic coordinate.

        As we parse the location, verbose logging information will be appended
        to the 'log' list, if it isn't None.

        Upon completion, we return a (type, value) tuple, where 'type' is one
        of the following:

            "structured"
            "coordinate"
            "unknown"

        The meaning of 'value' depends on the calculated type:

            * For unstructured locations which were parsed into structured
              locations, 'value' will be a dictionary with a combination of the
              following fields, containing the various parts of the location we
              identified:

                  "country"
                  "state"
                  "metro"
                  "region"
                  "county"
                  "city"
                  "locality"
                  "zipCode"

              The "zipCode" entry, if present, is an integer; the other entries
              are location names.

            * For geographic coordinates, 'value' will be a dictionary with
              "lat" and "long" entries, holding the parsed coordinate value as
              floating point numbers.

            * For unknown locations, 'value' will be None.
    """
    # Start by splitting the string into numbers and non-numbers.  Each item
    # in 'parts' is a [type, str] pair, where 'type' is NUMBER or STRING and
    # 'str' is the text of that part of the location.

    parts = []
    pos = 0
    for match in NUMBER_SPLITTER.finditer(location):
        if match.start() > pos:
            parts.append([STRING, location[pos:match.start()]])
        parts.append([NUMBER, match.group()])
        pos = match.end()
    if pos < len(location):
        # Keep whatever text follows the last number, even a single character.
        parts.append([STRING, location[pos:]])

    # If we have two numbers separated by a zero or one character string
    # delimiter, and both numbers are floating point values in the range -180
    # to +180, assume we've got a lat/long coordinate.

    for first, middle, last in zip(parts, parts[1:], parts[2:]):
        if not (first[0] == NUMBER and middle[0] == STRING and last[0] == NUMBER):
            continue

        num1, delimiter, num2 = first[1], middle[1], last[1]
        if len(delimiter.strip()) >= 2:
            continue
        if "." not in num1 or "." not in num2:
            continue

        try:
            n1 = float(num1)
            n2 = float(num2)
        except ValueError:
            # The splitter matched something that isn't a valid number, so it
            # can't be a coordinate.
            continue

        if -180 <= n1 <= 180 and -180 <= n2 <= 180:
            # Success!  We've found a lat/long coordinate.
            return ('coordinate', {'lat'  : n1,
                                   'long' : n2})

    # If we get here, we need to identify the various parts of a structured
    # location.  Start with an empty dictionary.

    structuredLoc = {}

    # If the last number in the string consists of a positive integer number
    # with at least four digits, see if it matches one of our existing ZIP
    # codes.  If so, assume that the number is a ZIP code.

    last_num = None
    for part_type, part_text in reversed(parts):
        if part_type == NUMBER:
            last_num = part_text
            break

    if (last_num is not None and "-" not in last_num
                             and "." not in last_num
                             and len(last_num) >= 4):
        try:
            Location.objects.get(code="USA-" + last_num)
        except Location.DoesNotExist:
            pass # Not a known ZIP code.
        else:
            # Test against None rather than truthiness, so that the message
            # isn't lost when the log is still empty.
            if log is not None:
                log.append("Found ZIP code: " + last_num)
            # NOTE(review): int() drops any leading zeros, so a code that was
            # looked up as "USA-0xxxx" is returned as a shorter number --
            # confirm that consumers of 'zipCode' cope with this.
            structuredLoc['zipCode'] = int(last_num)

    # We now need to identify locations in the supplied string by name.  Start
    # by tidying the text (helpers.tidy_name), then split it into individual
    # words.

    words = helpers.tidy_name(location).split()

    # Iterate over every contiguous combination of words, seeing if those words
    # define a known location name.  If we find a known name, we remember that
    # name and the levels at which that name can appear.
    #
    # 'known_names' maps each location name to a dictionary with the following
    # entries:
    #
    #    'start_index'   The index into "words" where this name started.
    #    'levels'        A dictionary mapping a level number to a list of
    #                    LocationName objects for that level.

    known_names = {}
    index = 0

    while index < len(words):
        found = _longest_known_name(words, index)
        if found is None:
            index = index + 1 # Keep trying from the next word.
            continue

        name, length, level_matches = found

        levels = {}
        for level_num, name_id in level_matches:
            levels[level_num] = list(LocationName.objects.filter(name=name_id))

        known_names[name] = {'start_index' : index,
                             'levels'      : levels}

        index = index + length # Skip the words we've just consumed.

    # If we're in verbose mode, tell the caller which names we've identified.

    if log is not None:
        log.append("The following possible names were found:")
        for name in sorted(known_names.keys()):
            for loc_names in known_names[name]['levels'].values():
                for loc_name in loc_names:
                    log.append("    '" + name + "' could be a " +
                               loc_name.location.level.name + " entry for " +
                               loc_name.location.name)

    # Starting at the highest level and working down, choose from the available
    # names at the level.  If there is exactly one name at a given level, we
    # use that name; if there are multiple possible names for a given level, we
    # check the LocationName filters to see if any can be knocked out by the
    # values we've already calculated, hopefully resulting in just one
    # remaining name.  If there are still multiple possible names for a level,
    # we don't set the name at all.

    used_names    = set() # Set of names which we've stored into structuredLoc.
    used_loc_name = {}    # Maps level number to LocationName for that level.

    for level_num in range(1, 8):
        level_name = level_num_to_name(level_num)

        # Collect the possible LocationNames at this level.
        #
        # NOTE(review): 'known_names' is keyed by the upper-cased text, while
        # 'used_names' holds loc_name.name.name; the "don't reuse" check only
        # works if the stored names use the same form -- confirm.

        candidates = []
        for name, info in known_names.items():
            if name in used_names:
                continue # Don't use the same name twice.
            candidates.extend(info['levels'].get(level_num, []))

        if not candidates:
            continue # No names at this level.

        if len(candidates) == 1:
            # There's no ambiguity -> use the one possible name.
            qualifier = "only possible "
        else:
            # We have multiple possible names -> see if we can't filter out any
            # names based on the information we've calculated thus far.  We
            # walk the list backwards so entries can be deleted as we go.
            qualifier = "only remaining "
            for i in range(len(candidates) - 1, -1, -1):
                loc_name = candidates[i]
                if _loc_name_filters_match(loc_name, source, used_loc_name):
                    continue
                if log is not None:
                    log.append("Excluding " + str(loc_name.location) +
                               " as a possible " + loc_name.name.level.name +
                               " because the filter values don't match.")
                del candidates[i]

        if len(candidates) == 1:
            loc_name = candidates[0]
            if log is not None:
                log.append("Choosing " + str(loc_name.location) + " as the " +
                           qualifier + loc_name.name.level.name)
            structuredLoc[level_name] = loc_name.name.name
            used_loc_name[level_num]  = loc_name
            used_names.add(loc_name.name.name)
        elif log is not None:
            # Tell the caller the bad news.
            if not candidates:
                log.append("Unable to choose a " +
                           level_name.capitalize() + " entry because all" +
                           " the possible locations were filtered out.")
            else:
                # At least two candidates remain here, so joining all but the
                # last with commas also covers the two-name case.
                loc_names = [candidate.location.name
                             for candidate in candidates]
                choices = ", ".join(loc_names[:-1]) + " and " + loc_names[-1]
                log.append("Unable to choose between " + choices +
                           " for the " + level_name.capitalize() + " entry.")

    # If we couldn't identify anything, tell the caller the bad news.

    if not structuredLoc:
        return ("unknown", None)

    # Finally, return the structured location back to the caller.

    return ("structured", structuredLoc)


def _longest_known_name(words, start):
    """ Find the longest known location name starting at words[start].

        We try progressively shorter runs of words, upper-cased and joined by
        single spaces, looking each one up in the name cache at levels 1 to 7.

        Returns a (name, length, level_matches) tuple for the first (longest)
        run which is known at one or more levels, where 'length' is the number
        of words used and 'level_matches' is a list of (level_num, name_id)
        tuples.  Returns None if no run starting at 'start' is a known name.
    """
    for length in range(len(words) - start, 0, -1):
        name = " ".join(words[start:start + length]).upper()

        level_matches = []
        for level_num in range(1, 8):
            name_id = nameCache.search(level_num, name)
            if name_id is not None:
                level_matches.append((level_num, name_id))

        if level_matches:
            return (name, length, level_matches)

    return None


def _loc_name_filters_match(loc_name, source, used_loc_name):
    """ Return True iff a LocationName's filters agree with what we know.

        'source' is the source code we were given (or None), and
        'used_loc_name' maps a level number to the LocationName already chosen
        for that level.  A filter only rejects the name when both the filter
        and the value it is compared against are available.
    """
    if loc_name.sourceFilter is not None and source is not None:
        if loc_name.sourceFilter.code != source:
            return False

    level_filters = ((1, "countryFilter"),
                     (2, "stateFilter"),
                     (3, "metroFilter"),
                     (4, "regionFilter"),
                     (5, "countyFilter"),
                     (6, "cityFilter"))

    for level_num, filter_attr in level_filters:
        required_loc = getattr(loc_name, filter_attr)
        if required_loc is not None and level_num in used_loc_name:
            if required_loc != used_loc_name[level_num].location:
                return False

    return True