Beispiel #1
0
  def testPermTagSeq(self):   # - - - - - - - - - - - - - - - - - - - - - - - -
    """Test 'perm_tag_sequence' routine.

    Each entry in self.tag_lists is (input tag list, expected permutation
    count, expected permutation list); check both count and content.
    """

    for test_case in self.tag_lists:

      input_tags    = test_case[0]
      num_expected  = test_case[1]
      expected_list = test_case[2]

      perms = mymath.perm_tag_sequence(input_tags)

      # The number of generated permutations must match the expected count
      #
      assert len(perms) == num_expected, \
             '"perm_tag_sequence" returns wrong number of permutations with '+\
             'list: '+str(input_tags)+' (should be: '+str(num_expected)+ \
             '): '+str(len(perms))

      # Every generated permutation must equal its expected counterpart
      #
      for (i, perm) in enumerate(perms):
        assert perm == expected_list[i], \
               '"perm_tag_sequence" returns wrong permutation: '+str(perm)+ \
               ', should be: '+str(expected_list[i])
Beispiel #2
0
    def testPermTagSeq(
            self):  # - - - - - - - - - - - - - - - - - - - - - - - -
        """Test 'perm_tag_sequence' routine.

        Each entry in self.tag_lists is (input tag list, expected
        permutation count, expected permutation list); check both the
        count and the content of the generated permutations.
        """

        for case in self.tag_lists:

            permutations = mymath.perm_tag_sequence(case[0])

            # Number of permutations must match the expected count
            #
            assert len(permutations) == case[1], (
                '"perm_tag_sequence" returns wrong number of permutations '
                'with list: %s (should be: %s): %s'
                % (str(case[0]), str(case[1]), str(len(permutations))))

            # Each permutation must equal its expected counterpart
            #
            idx = 0
            while idx < len(permutations):
                assert permutations[idx] == case[2][idx], (
                    '"perm_tag_sequence" returns wrong permutation: %s'
                    ', should be: %s'
                    % (str(permutations[idx]), str(case[2][idx])))
                idx += 1
Beispiel #3
0
def get_geoloc_hmm(word_list, tag_list):
    """Process input using a HMM to extract geocode and locality output fields.

    USAGE:
      geoloc_dict = get_geoloc_hmm(word_list, tag_list)

    ARGUMENTS:
      word_list  List of words as produced with clean_tag_locality()
      tag_list   Corresponding list of tags as produced with
                 clean_tag_locality()

    DESCRIPTION:
      The routine returns a dictionary with the parsed and extracted output
      fields for both the locality and geocode components. A Hidden Markov
      Model (HMM) is used for this task.

      The dictionary returned can contain the following key words:
      - wayfare_number
      - wayfare_name
      - wayfare_qualifier
      - wayfare_type
      - unit_number
      - unit_type
      - property_name
      - institution_name
      - institution_type
      - postaddress_number
      - postaddress_type
      - locality_name
      - locality_qualifier
      - postcode
      - territory
      - country
      - geoloc_hmm_proba (the probability returned by the Viterbi algorithm
                          for the most likely HMM state sequence)
    """

    # Maps each HMM observation state onto its output field name
    #
    state_field = {
        "wfnu": "wayfare_number",
        "wfna1": "wayfare_name",
        "wfna2": "wayfare_name",
        "wfna3": "wayfare_name",
        "wfql": "wayfare_qualifier",
        "wfty": "wayfare_type",
        "unnu": "unit_number",
        "unty": "unit_type",
        "prna1": "property_name",
        "prna2": "property_name",
        "inna1": "institution_name",
        "inna2": "institution_name",
        "inty": "institution_type",
        "panu": "postaddress_number",
        "paty": "postaddress_type",
        "loc1": "locality_name",
        "loc2": "locality_name",
        "locql": "locality_qualifier",
        "pc": "postcode",
        "ter1": "territory",
        "ter2": "territory",
        "cntr1": "country",
        "cntr2": "country",
    }

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    msg = ["  Input tag sequence: " + str(tag_list), "  Output tag sequences:"]
    for t in tag_list_seq:
        msg.append("    " + str(t))
    inout.log_message(msg, "v2")

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = config.geoloc_hmm.viterbi(t)
        if prob > max_prob:
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob

        inout.log_message("  Probability " + str(prob) + "  for sequence " + str(t), "v2")

    inout.log_message(
        ["  Best observation sequence: " + str(best_obs_seq), "          with tag sequence: " + str(best_tag_list)],
        "v2",
    )

    # Now process the observation sequence and add elements into dictionary - - -
    #
    tag_list_len = len(tag_list)
    norm_max_prob = max_prob / float(tag_list_len)  # Normalise max. probability
    geoloc_dict = {"geoloc_hmm_proba": [str(norm_max_prob)]}

    list_len = len(word_list)
    for i in range(list_len):  # Loop over words and states
        w = word_list[i]
        s = best_obs_seq[i]

        #  Do not output commas, vertical bars and hyphens  - - - - - - - - - - - -
        #
        if w in ["|", ",", "-", "/"]:
            continue

        field = state_field.get(s)
        if field is None:  # Should never happen
            msg = [
                "This should never happen!",
                "  Tag: " + str(s),
                "  Word: " + w,
                "  Word list: " + str(word_list),
                "  tag list:  " + str(tag_list),
            ]
            inout.log_message(msg, "warn")
        else:
            geoloc_dict.setdefault(field, []).append(w)

    # Check if concatenated locality, territory and country words are in  - - - -
    # the lookup-table, and if so replace them with their corrected form
    #
    for field in ["locality_name", "territory", "country"]:
        words = geoloc_dict.get(field)
        if words is not None and len(words) > 1:  # Contains more than one word
            words_tuple = tuple(words)  # Make it a tuple
            if words_tuple in config.geoloc_lookup_dict:
                geoloc_dict[field] = [config.geoloc_lookup_dict[words_tuple][0]]

    # Finally do some tests on the output fields  - - - - - - - - - - - - - - - -
    #
    # Check if a value list has more than three elements, if so print out
    #
    for field, values in geoloc_dict.items():
        if len(values) > 3:
            inout.log_message(
                "Geocode/locality output field " + str(field) + " contains more than three elements: " + str(values),
                "warn",
            )

    # Check if 'number' elements only contain (alpha-) numerical values - - - - -
    # and also check how many numbers in an element
    #
    for field, max_count, count_msg, digit_msg in [
        ("wayfare_number", 2, "More than two wayfare numbers: ",
         "Wayfare number element contains no digits: "),
        ("unit_number", 1, "More than one unit numbers: ",
         "Unit number element contains no digits: "),
        ("postaddress_number", 1, "More than one postaddress numbers: ",
         "Postaddress number element contains no digits: "),
    ]:
        v = geoloc_dict.get(field)
        if v is None:
            continue
        if len(v) > max_count:
            inout.log_message(count_msg + str(v), "warn")
        for element in v:
            if element.isalpha():  # Element contains only letters
                inout.log_message(digit_msg + str(v), "warn")
                break  # Exit for loop

    # Check if 'type' elements contain one word only  - - - - - - - - - - - - - -
    # if it's a known type word (type code must appear in the lookup entry)
    #
    for field, max_count, type_code, count_msg, unknown_msg in [
        ("wayfare_type", 1, "WT", "More than one wayfare type: ",
         "Wayfare type word is not known: "),
        ("unit_type", 1, "UT", "More than one unit type: ",
         "Unit type word is not known: "),
        ("institution_type", 1, "IT", "More than one institution type: ",
         "Institution type word is not known: "),
        ("postaddress_type", 2, "PA", "More than two postaddress type: ",
         "Postaddress type word is not known: "),
    ]:
        v = geoloc_dict.get(field)
        if v is None:
            continue
        if len(v) > max_count:
            inout.log_message(count_msg + str(v), "warn")
        for element in v:
            key = tuple(element.split("_"))  # Make it a tuple
            if (key not in config.geoloc_lookup_dict) or (
                config.geoloc_lookup_dict[key][1].find(type_code) < 0
            ):
                inout.log_message(unknown_msg + str(v), "warn")
                break  # Exit for loop

    # Check if 'qualifier' elements only contain known qualifier words  - - - - -
    #
    for field, unknown_msg in [
        ("wayfare_qualifier", "Wayfare qualifier word is not known: "),
        ("locality_qualifier", "Locality qualifier word is not known: "),
    ]:
        v = geoloc_dict.get(field)
        if v is None:
            continue
        for element in v:
            key = (element,)
            if (key not in config.geoloc_lookup_dict) or (
                config.geoloc_lookup_dict[key][1].find("LQ") < 0
            ):
                inout.log_message(unknown_msg + str(v), "warn")
                break  # Exit for loop

    return geoloc_dict
Beispiel #4
0
def get_address_hmm(word_list, tag_list, address_hmm, tag_lookup_table,
                    record_id, fields_str):
  """Process the input using a HMM to extract address output fields.

  USAGE:
    address_dict = get_address_hmm(word_list, tag_list, address_hmm,
                                   tag_lookup_table)

  ARGUMENTS:
    word_list         List of words as produces with tag_address_component()
    tag_list          Corresponding list of tags as produces with
                      tag_address_component()
    address_hmm       A reference to the address hidden Markov model
    tag_lookup_table  A tagging look-up table as defined in 'lookup.py'
    record_id         A string identifying the current record
    fields_str        A string representation of the input fields

  DESCRIPTION:
    The routine returns a dictionary with the parsed and extracted output
    fields for the address component. A Hidden Markov Model (HMM) is used for
    this task.

    The dictionary returned can contain the following key words:
      wayfare_number
      wayfare_name
      wayfare_qualifier
      wayfare_type
      unit_number
      unit_type
      property_name
      institution_name
      institution_type    
      postaddress_number
      postaddress_type
      locality_name
      locality_qualifier
      postcode
      territory
      country
      address_hmm_prob (the probability returned by the Viterbi algorithm for
                        the most likely HMM state seqence)
  """

  # First, create all permutations of the input tag sequence
  #
  tag_list_seq = mymath.perm_tag_sequence(tag_list)

  # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - -
  # and keep the one with highest probability
  #
  max_prob = -1.0
  best_obs_seq   = []
  best_tag_list  = []

  for t in tag_list_seq:
    [obs_seq, prob] = address_hmm.viterbi(t)
    if (prob > max_prob):
       best_obs_seq  = obs_seq
       best_tag_list = t
       max_prob = prob

    print '3:%s  Sequence: %s has Viterbi probability: %f' % \
          (record_id, str(t), prob)

  print '2:%s  Best observation sequence: %s with tag sequence: %s' % \
        (record_id, str(best_obs_seq), str(best_tag_list))

  # Now process the observation sequence and add elements into dictionary - - -
  #
  if (len(tag_list) != len(word_list)):
    print 'error:%s Length of word list and tag list differs: %s, %s%s' % \
          (record_id, str(word_list), str(tag_list), fields_str)
    raise Exception

  list_len = len(tag_list)

  if (list_len == 0):
    print 'warning:%s Empty tag list returned from HMM %s' % \
          (record_id, fields_str)
    return {}  # Return an empty dictionary if not output fields given

  # norm_max_prob = max_prob / float(list_len)  # Normalise max. probability
  address_dict = {'address_hmm_prob':[str(max_prob)]}

  for i in range(list_len):  # Loop over words and states
    w = word_list[i]
    s = best_obs_seq[i]

    #  Do not output commas, vertical bars and hyphens  - - - - - - - - - - - -
    #
    if (w in ['|', ',', '-','/']):
      pass

    elif (s == 'wfnu'):  # Wayfare number - - - - - - - - - - - - - - - - - - -
      v = address_dict.get('wayfare_number',[])
      v.append(w)
      address_dict.update({'wayfare_number':v})

    elif (s in ['wfna1','wfna2','wfna3']):  # Wayfare name  - - - - - - - - - -
      v = address_dict.get('wayfare_name',[])
      v.append(w)
      address_dict.update({'wayfare_name':v})

    elif (s == 'wfql'):  # Wayfare qualifier  - - - - - - - - - - - - - - - - -
      v = address_dict.get('wayfare_qualifier',[])
      v.append(w)
      address_dict.update({'wayfare_qualifier':v})

    elif (s == 'wfty'):  # Wayfare type - - - - - - - - - - - - - - - - - - - -
      v = address_dict.get('wayfare_type',[])
      v.append(w)
      address_dict.update({'wayfare_type':v})

    elif (s == 'unnu'):  # Unit number  - - - - - - - - - - - - - - - - - - - -
      v = address_dict.get('unit_number',[])
      v.append(w)
      address_dict.update({'unit_number':v})

    elif (s == 'unty'):  # Unit type  - - - - - - - - - - - - - - - - - - - - -
      v = address_dict.get('unit_type',[])
      v.append(w)
      address_dict.update({'unit_type':v})

    elif (s in ['prna1','prna2']):  # Property name - - - - - - - - - - - - - -
      v = address_dict.get('property_name',[])
      v.append(w)
      address_dict.update({'property_name':v})

    elif (s in ['inna1','inna2']):  # Institution name  - - - - - - - - - - - -
      v = address_dict.get('institution_name',[])
      v.append(w)
      address_dict.update({'institution_name':v})

    elif (s == 'inty'):  # Institution type - - - - - - - - - - - - - - - - - -
      v = address_dict.get('institution_type',[])
      v.append(w)
      address_dict.update({'institution_type':v})

    elif (s == 'panu'):  # Postal address number  - - - - - - - - - - - - - - -
      v = address_dict.get('postaddress_number',[])
      v.append(w)
      address_dict.update({'postaddress_number':v})

    elif (s == 'paty'):  # Postal address type  - - - - - - - - - - - - - - - -
      v = address_dict.get('postaddress_type',[])
      v.append(w)
      address_dict.update({'postaddress_type':v})

    elif (s in ['loc1','loc2']):  # Locality name - - - - - - - - - - - - - - -
      v = address_dict.get('locality_name',[])
      v.append(w)
      address_dict.update({'locality_name':v})

    elif (s == 'locql'):  # Locality qualifier  - - - - - - - - - - - - - - - -
      v = address_dict.get('locality_qualifier',[])
      v.append(w)
      address_dict.update({'locality_qualifier':v})

    elif (s == 'pc'):  # Postcode - - - - - - - - - - - - - - - - - - - - - - -
      v = address_dict.get('postcode',[])
      v.append(w)
      address_dict.update({'postcode':v})

    elif (s in ['ter1','ter2']):  # Territory - - - - - - - - - - - - - - - - -
      v = address_dict.get('territory',[])
      v.append(w)
      address_dict.update({'territory':v})

    elif (s in ['cntr1','cntr2']):  # Country - - - - - - - - - - - - - - - - -
      v = address_dict.get('country',[])
      v.append(w)
      address_dict.update({'country':v})

    else:  # Should never happen
      print 'warning:%s This should never happen! ' % (record_id) + \
            ' Tag: %s, word: %s, word list: %s, tag list: %s%s' % \
            (str(s), w, str(word_list), str(tag_list),fields_str)

  # Check if concatenated locality and territory words are in lookup-table  - -
  #
  if (address_dict.has_key('locality_name')):
    loc = address_dict['locality_name']
    if (len(loc) > 1):  # Locality contains more than one word
      loc_tuple = tuple(loc)  # Make it a tuple
      if (tag_lookup_table.has_key(loc_tuple)):
         new_loc = tag_lookup_table[loc_tuple][0]
         address_dict.update({'locality_name':[new_loc]})

  if (address_dict.has_key('territory')):
    terr = address_dict['territory']
    if (len(terr) > 1):  # Territory contains more than one word
      terr_tuple = tuple(terr)  # Make it a tuple
      if (tag_lookup_table.has_key(terr_tuple)):
         new_terr = tag_lookup_table[terr_tuple][0]
         address_dict.update({'territory':[new_terr]})

  if (address_dict.has_key('country')):
    cntr = address_dict['country']
    if (len(cntr) > 1):  # Country contains more than one word
      cntr_tuple = tuple(cntr)  # Make it a tuple
      if (tag_lookup_table.has_key(cntr_tuple)):
         new_cntr = tag_lookup_table[cntr_tuple][0]
         address_dict.update({'country':[new_cntr]})

  # Finally do some tests on the output fields  - - - - - - - - - - - - - - - -
  #
  address_items = address_dict.items()

  # Check if a value list has more than three elements, if so print out
  #
  for i in address_items:
    if (len(i[1]) > 3):
      print 'warning:%s Output field "%s" contains' % (record_id, str(i[0]))+ \
            ' more than three elements: %s%s' % (str(i[1]), fields_str)

  # Check if 'number' elements only contain (alpha-) numerical values - - - - -
  # and also check how many numbers in an element
  #
  if (address_dict.has_key('wayfare_number')): # Check how many wayfare numbers
    v = address_dict['wayfare_number']
    if (len(v) > 2):
      print 'warning:%s More than two wayfare numbers: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      if (i.isalpha()):  # Element contains only letters
        print 'warning:%s Wayfare number contains no ' % (record_id) + \
              'digits: %s%s' % (str(v), fields_str)
        break  # Exit for loop

  if (address_dict.has_key('unit_number')):  # Check how many unit numbers
    v = address_dict['unit_number']
    if (len(v) > 1):
      print 'warning:%s More than one unit number: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      if (i.isalpha()):  # Element contains only letters
        print 'warning:%s Unit number contains no ' % (record_id) + \
              'digits: %s%s' % (str(v), fields_str)
        break  # Exit for loop

  if (address_dict.has_key('postaddress_number')): # Check postaddress numbers
    v = address_dict['postaddress_number']
    if (len(v) > 1):
      print 'warning:%s More than one post-address number: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      if (i.isalpha()):  # Element contains only letters
        print 'warning:%s Post-address number contains no ' % (record_id) + \
              'digits: %s%s' % (str(v), fields_str)
        break  # Exit for loop

  # Check if 'type' elements contain one word only  - - - - - - - - - - - - - -
  # if it's a known type word
  #
  if (address_dict.has_key('wayfare_type')):  # Check wayfare type
    v = address_dict['wayfare_type']
    if (len(v) > 1):
      print 'warning:%s More than one wayfare type: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      i = i.split('_')
      i = tuple(i)  # Make it a tuple
      if (not tag_lookup_table.has_key((i))) or \
         (tag_lookup_table.has_key((i)) and 
          (tag_lookup_table[(i)][1].find('WT') < 0)):
        print 'warning:%s Wayfare type word is not known: %s%s' % \
              (record_id, str(v), fields_str)
        break  # Exit for loop

  if (address_dict.has_key('unit_type')):  # Check unit type
    v = address_dict['unit_type']
    if (len(v) > 1):
      print 'warning:%s More than one unit type: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      i = i.split('_')
      i = tuple(i)  # Make it a tuple
      if (not tag_lookup_table.has_key((i))) or \
         (tag_lookup_table.has_key((i)) and \
          (tag_lookup_table[(i)][1].find('UT') < 0)):
        print 'warning:%s Unit type word is not known: %s%s' % \
              (record_id, str(v), fields_str)
        break  # Exit for loop

  if (address_dict.has_key('institution_type')):  # Check institution type
    v = address_dict['institution_type']
    if (len(v) > 1):
      print 'warning:%s More than one institution type: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      i = i.split('_')
      i = tuple(i)  # Make it a tuple
      if (not tag_lookup_table.has_key((i))) or \
         (tag_lookup_table.has_key((i)) and \
          (tag_lookup_table[(i)][1].find('IT') < 0)):
        print 'warning:%s Institution type word is not known: %s%s' % \
              (record_id, str(v), fields_str)
        break  # Exit for loop

  if (address_dict.has_key('postaddress_type')):  # Check postaddress type
    v = address_dict['postaddress_type']
    if (len(v) > 2):
      print 'warning:%s More than two post-address type: %s%s' % \
            (record_id, str(v), fields_str)
    for i in v:
      i = i.split('_')
      i = tuple(i)  # Make it a tuple
      if (not tag_lookup_table.has_key((i))) or \
         (tag_lookup_table.has_key((i)) and \
          (tag_lookup_table[(i)][1].find('PA') < 0)):
        print 'warning:%s Post-address type word is not known: %s%s' % \
              (record_id, str(v), fields_str)
        break  # Exit for loop

  # Check if 'qualifier' elements only contain known qualifier words  - - - - -
  #
  if (address_dict.has_key('wayfare_qualifier')):  # Check wayfare qualifier
    v = address_dict['wayfare_qualifier']
    for i in v:
      if (not tag_lookup_table.has_key((i,))) or \
         (tag_lookup_table.has_key((i,)) and \
          (tag_lookup_table[(i,)][1].find('LQ') < 0)):
        print 'warning:%s Wayfare qualifier word is not known: %s%s' % \
              (record_id, str(v), fields_str)
        break  # Exit for loop

  if (address_dict.has_key('locality_qualifier')):  # Check locality qualifier
    v = address_dict['locality_qualifier']
    for i in v:
      if (not tag_lookup_table.has_key((i,))) or \
         (tag_lookup_table.has_key((i,)) and \
          (tag_lookup_table[(i,)][1].find('LQ') < 0)):
        print 'warning:%s Locality qualifier word is not known: %s%s' % \
              (record_id, str(v), fields_str)
        break  # Exit for loop

  return address_dict
Beispiel #5
0
def tagdata():
  """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.

    Command line arguments are taken from 'config.options' in this order:
      [0]   Tagging mode: "name" or "locality"
      [1]   Output training file name
      [2]   Start of block with training records (line number in input file)
      [3]   End of block with training records
      [4]   Number of training records to select randomly from the block
      [5:]  Optional arguments: -nowarn, -v1, -v2, -l [log_file],
            -hmm hmm_file, -retag tagged_file, -freqs freqs_file
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  if (len(config.options) < 5):  # Five options plus the project module = six
    print '***** Error: %s needs at least six arguments:'% (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Tagging mode: "name" or "locality"'
    print '*****        - Output training file name'
    print '*****        - Start of block with training records'
    print '*****        - End of block with training records'
    print '*****        - Number of training records'
    print '*****          plus options'
    raise Exception()

  # Note: options[1] is the output training file name (see assignment below);
  # checking options[2] here would wrongly compare against the first record
  # number instead of the output file name.
  #
  if (config.in_file_name == config.options[1]):
    print '***** Error: Input and output files must differ'
    print '*****        Input file name:          ', config.in_file_name
    print '*****        Output training file name:', config.options[1]
    raise Exception()

  first_rec = int(config.options[2])
  last_rec  = int(config.options[3])
  num_rec   = int(config.options[4])
  in_file_name = config.in_file_name
  out_file_name = config.options[1]

  # Check record number values  - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (int(first_rec) >= int(last_rec)) or \
     ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
    print '***** Error: Illegal values for training records block:'
    print '*****        - Start of block with training records:', first_rec
    print '*****        - End of block with training records:  ', last_rec
    print '*****        - Number of training records:          ', num_rec
    raise Exception()

  rec_range = last_rec-first_rec-1  # Range of records in input file

  # Open input file and check number of available records - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if (last_rec > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for last training records:', last_rec
    print '*****        File only contains',line_count, 'lines/records'
    raise Exception()

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','loc','l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '*****        Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0     # Default: No verbose output
  config.logging = 0     # Default: No logging into a file
  hmm_file_name  = None  # Default: Do not use HMM to standardise training
                         #          records
  retag_file_name = None # Default: Do not retag an existing training file
  config.nowarn  = 0     # Deactivate no warning flag (print/log warning
                         # messages)
  freqs_file_name = None # Default: Do not write frequencies, no -freqs option

  if (len(config.options) > 5):
    options = config.options[5:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed -'l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: '+config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############'+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# 'pyTagData.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# Input file name:  "+in_file_name+os.linesep)
        f_log.write("# Output file name: "+out_file_name+os.linesep)
        f_log.write("# Tagging mode:     "+tag_mode+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-hmm'):
        hmm_file_name = options[1]  # Get file name of the HMM to use
        if (hmm_file_name == out_file_name):
          print '***** Error: HMM file name is the same as output file name!'
          raise Exception()

        try:
          f_in = open(hmm_file_name,'r')  # Test if file is available
        except:
          print '***** Error: Cannot open HMM file specified in "-hmm"',
          print 'option:', hmm_file_name
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed '-hmm' option and file name

      elif (options[0] == '-retag'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-retag" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()

        retag_file_name = options[1]  # Get file name of the already-tagged
                                      # file to re-process
        if (retag_file_name == out_file_name):
          print '***** Error: Retag file name is the same as output file name!'
          raise Exception()
        elif (retag_file_name == in_file_name):
          print '***** Error: Retag file name is the same as input file name!'
          raise Exception()
        elif (retag_file_name == hmm_file_name):
          print '***** Error: Retag file name is the same as HMM file name!'
          raise Exception()

        try:
          f_in = open(retag_file_name,'r')  # Test if file is available

          # Now gather record numbers and previous tags/states, as well as the
          # original header information. Use a simple state machine to do this.
          # States: -1 = reading header lines, 0 = between records,
          #          1 = inside a record (after its '# <number> (...):' line).
          #
          tagged_recs  = {}
          cleaned_recs = {}
          original_header_lines = []
          state = -1  # Header lines state
          prevline = ''

          for line in f_in.xreadlines():  # Read training file and process it
            line = line.strip()

            if (state == -1) and (len(line) == 0):  # End of header lines
              state = 0
              prevline = line
              continue

            if (state == -1) and (len(line) > 0) and (line[0] == "#"):
              original_header_lines.append("# " + line)
              prevline = line
              continue
            sline = line.split(' ')

            # A record header looks like: '# <rec_number> (<count>):'
            #
            if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
               and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
              try:
                rec = int(sline[1])  # Original record number
                tagged_recs[rec]  = None
                cleaned_recs[rec] = None
                state = 1
              except:
                pass
              prevline = line
              continue

            if (state == 1) and (len(line) > 0) and (line[0] != '#'):
              tagged_recs[rec]  = line
              cleaned_recs[rec] = prevline
              state = 0
              prevline = line
              continue

            if (state == 1) and (len(line) > 0):
              prevline = line
              continue

          f_in.close()
          tagged_recs_keys = tagged_recs.keys()

          num_rec = len(tagged_recs_keys)  # Override specified numbers
          first_rec = 0
          last_rec = line_count

        except:
          print '***** Error: Cannot open tagged training file specified',
          print 'in "-retag" option:', retag_file_name
          raise IOError()

        options = options[2:]  # Remove processed '-retag' option and file name

      elif (options[0][:5] == '-freq'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-freqs" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()

        freqs_file_name = options[1]  # File name to write the frequencies to
        if (freqs_file_name == out_file_name):
          print '***** Error: Frequency file name is the same as output',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == in_file_name):
          print '***** Error: Frequency file name is the same as input',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == hmm_file_name):
          print '***** Error: Frequency file name is the same as HMM',
          print 'file name!'
          raise Exception()

        options = options[2:]  # Remove processed '-freqs' option and file name
        try:  # Check if file writing is possible
          freqs_out = open(freqs_file_name,'w')
          freqs_out.close()
        except:
          print '***** Error: Cannot write to frequency output file specified',
          print 'in "-freqs" option:', freqs_file_name
          raise IOError()

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # If specified initalise and load Hidden Markov Model (HMM) - - - - - - - - -
  #
  if (hmm_file_name != None):
    myhmm = simplehmm.hmm([],[])  # Create new empty HMM object
    myhmm.load_hmm(hmm_file_name)
    myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

  # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_out = open(out_file_name,'w')
  except:
    inout.log_message('Cannot open output file: '+out_file_name,'err')
    raise IOError()

  f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
              " Version 0.1'"+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Created '+time.ctime(time.time())+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Input file name:  '+in_file_name+os.linesep)
  f_out.write('# Output file name: '+out_file_name+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Parameters:'+os.linesep)
  f_out.write('# - Start of block with training records: '+str(first_rec)+ \
              os.linesep)
  f_out.write('# - End of block with training records:   '+str(last_rec)+ \
              os.linesep)
  f_out.write('# - Number of training records:           '+str(num_rec)+ \
              os.linesep)
  if (hmm_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                os.linesep)
  if (retag_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                os.linesep)
    f_out.write("#   Header lines from original training file follow:" + \
                os.linesep)
    for header_line in original_header_lines:
      f_out.write(header_line + os.linesep)
  if (freqs_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Tag/state pattern frequencies written to file '" + \
                freqs_file_name + os.linesep)
  f_out.write('#'+'-'*70+os.linesep)
  f_out.write(os.linesep)

  rec_count    = 0        # Number of selected records
  num_rec_left = num_rec  # Number of records to be selected left
  rec_selected = {}       # Dictionary of all record numbers that were selected
  seq_freqs = {}          # Dict to hold examples of tag/state patterns

  unchanged_loop_cnt = 0       # Counter of how many loops have been done
                               # without new records being selected
  prev_num_rec_left = num_rec  # Number of records left in the previous
                               # interation

  # Due to the random nature of selecting records, and because sometimes  - - -
  # a selected component can be empty (and is thus not used for training)
  # more than one iteration over the input data set is carried out. In each
  # iteration, records are selected randomly.
  #
  while (rec_count < num_rec):  # Loop until 'num_rec' records selected

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
      f_in = open(in_file_name,'r')
    except:
      inout.log_message('Cannot open input file: '+in_file_name,'err')
      raise IOError()

    line_read = 0  # Number of read lines

    # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
      for i in range(first_rec):
        f_in.readline()

    while (rec_count < num_rec) and (line_read <= (last_rec-first_rec)):
      line = f_in.readline()

      # In retag mode take exactly the previously tagged records, otherwise
      # select records randomly (probability grows as fewer records are left).
      #
      if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
         ((retag_file_name == None) and \
          (num_rec_left >= random.randrange(0,rec_range,1))):

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line

        line = line.lower()  # Make all characters lower case

        inout.log_message(['Record number: '+str(line_read+first_rec)],'v1')
        config.curr_line_no = line_read+first_rec  # Store current line number

        # Process line and extract content into components (name, geocode, etc)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
           inout.process_line(line)

        # Select component and process it - - - - - - - - - - - - - - - - - - -
        #
        if (tag_mode == 'name'):
          if (type(name_comp) == types.ListType):
            component = name_comp[0].strip()+' '+name_comp[1].strip()
          else:
            component = name_comp.strip()
        else:  # Locality component
          component = geocode_comp.strip()+' '+locality_comp.strip()

        if (component != '') and \
           (not rec_selected.has_key((line_read+first_rec))):

          if (tag_mode == 'name'):
            inout.log_message('  Name component: |'+component+'|','v1')

            component = name.clean_name_component(component)
            [word_list, tag_list] = name.tag_name_component(component)

          else:  # Locality component
            inout.log_message('  Locality component: |'+component+'|','v1')

            component = locality.clean_geoloc_component(component)
            [word_list, tag_list] = locality.tag_geoloc_component(component)

          if (tag_list != []):  # Only process non-empty tag lists

            # Append record number into dictionary of processed records
            #
            rec_selected.update({(line_read+first_rec):(line_read+first_rec)})

            # Create all permutation sequences of this tag list - - - - - - - -
            #
            tag_seq = mymath.perm_tag_sequence(tag_list)

            inout.log_message(['  Word list: '+str(word_list), \
                               '  Tag list: '+str(tag_list), \
                               '  Tag sequences:'],'v2')

            # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
            #
            if (hmm_file_name != None):

              state_seq  = []    # List containing computed HMM state sequences
              max_prob   = -1.0  # maximal probability for a sequence
              max_seq_no = -1    # Number of the seq. with the max. probablity

              # Now give tag sequences to the HMMs to compute state sequences
              #
              i = 0
              for t in tag_seq:
                [obs_seq, prob] = myhmm.viterbi(t)
                state_seq.append(obs_seq)
                if (prob > max_prob):
                  max_prob = prob
                  max_seq_no = i
                i += 1

            # Write original component and resulting tag sequences to output
            #
            f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                        '): |'+component+'|'+os.linesep) # Commented original
            num_len = len(str(line_read+first_rec))+len(str(rec_count))+6

            f_out.write('#'+num_len*' '+'|'+' '.join(word_list)+'|'+os.linesep)

            for i in range(len(tag_seq)):
              # Convert each tag sequence into a string for file output
              #
              seq_string = '  '

              if (hmm_file_name != None) and (i != max_seq_no):
                seq_string = '# ' # Comment sequences with not max. probability

              for j in range(len(tag_seq[i])):

                if (hmm_file_name != None):
                  seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                               state_seq[i][j]+','
                else:
                  seq_string = seq_string+' '+tag_seq[i][j]+':,'

              f_out.write(seq_string[:-1]+os.linesep)  # Write without , at end
              inout.log_message('    '+seq_string[:-1],'v2')

            if (hmm_file_name != None):
              f_out.write('# Maximum Viterbi probability: %0.5f'% \
                          (max_prob) + os.linesep)
              inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                (max_prob), 'v2')

            if (retag_file_name != None) and (tagged_recs[line_read] != None):
              if (tagged_recs[line_read].strip() != seq_string[:-1].strip()):
                f_out.write("# Note: ***** Changed *****" + os.linesep)
                inout.log_message('                      Note:' + \
                                  ' ***** Changed *****','v2')
                f_out.write('# Was: ' + tagged_recs[line_read]+os.linesep)
                            # Write commented original tag sequence
                inout.log_message('Original tag sequence: '+ \
                                  tagged_recs[line_read],'v2')

            f_out.write(os.linesep)  # Write an empty line
            inout.log_message('','v1')  # Print empty lines between records

            if (hmm_file_name != None):
              seq_key = seq_string[:-1]  # Add sequence to dictionary
              if (seq_freqs.has_key(seq_key)):
                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                          max_prob])
              else:
                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                      max_prob]]

            rec_count += 1

            # Print process indicator message
            #
            if (config.proc_ind >= 0) and (rec_count > 0):
              if (rec_count % config.proc_ind == 0):
                print 'Processed line', rec_count, 'of', num_rec

      line_read += 1

    f_in.close()

    num_rec_left = num_rec - rec_count

    if (prev_num_rec_left == num_rec_left):  # No new records selected
      unchanged_loop_cnt += 1
    prev_num_rec_left = num_rec_left  # Set to current value

    if (unchanged_loop_cnt > 5):  # Do five loops maximal without selecting
                                  # new records
      config.curr_line_no = -1  # Set to illegal/empty values, as warning is
      config.curr_line    = ''  # not related to the current input line
      inout.log_message(['Can not select more than '+str(rec_count)+ \
                         ' records for training.', \
                         'This is probably due to empty input components.', \
                         'Please reduce value of "num_rec" or increase ' + \
                         'range','between "first_rec" and "last_rec".'],'warn')
      break

    if (num_rec_left < 10):  # Only 10 records left to select
      num_rec_left = num_rec+1  # Set to more than 100% probablity
    elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
      num_rec_left = int(num_rec / 100.0)  # Set to 1%

  f_out.close()

  # If specified, save Viterbi frequencies to a file  - - - - - - - - - - - - -
  #
  if (freqs_file_name != None):
    freqs_out = open(freqs_file_name,'w')  # Open frequency file for writing
    freqs_out.write('# Frequency listing of tag/state patterns written by')
    freqs_out.write('"pyTagData.py - Version 0.1"'+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write('# Created '+time.ctime(time.time())+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write("# Input file name:  "+in_file_name+os.linesep)
    freqs_out.write("# Output file name: "+out_file_name+os.linesep)
    freqs_out.write(os.linesep)
    freqs_out.write('# Parameters:'+os.linesep)
    freqs_out.write('# - Start of block with training records: '+ \
                    str(first_rec)+os.linesep)
    freqs_out.write('# - End of block with training records:   '+ \
                    str(last_rec)+os.linesep)
    freqs_out.write('# - Number of training records:           '+ \
                    str(num_rec)+os.linesep)
    if (hmm_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                      "' for standardisation"+os.linesep)
    if (retag_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                      "'"+os.linesep)
    freqs_out.write('#'+'-'*70+os.linesep)
    freqs_out.write(os.linesep)

    sorted_seq_freqs = []  # Now sort sequences according to their fruequencies
    for key in seq_freqs.keys():
      sorted_seq_freqs.append((len(seq_freqs[key]),key))
    sorted_seq_freqs.sort()

    for skey in sorted_seq_freqs:
      key = skey[1]
      freqs_out.write('# Pattern: '+str(key)+os.linesep)
      freqs_out.write('# Frequency: '+str(skey[0])+os.linesep)
      examples = seq_freqs[key]
      freqs_out.write('# Maximum Viterbi probability: '+ \
                      str(examples[0][1])+os.linesep)
      freqs_out.write('# Examples: '+os.linesep)
      for example in examples:
        freqs_out.write('#    '+str(example[0])+os.linesep)
      freqs_out.write(str(key)+os.linesep)
      freqs_out.write(os.linesep)
    freqs_out.close()

  inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                    str(rec_count)+' lines', 'End.'],'v1')
Beispiel #6
0
def get_address_hmm(word_list, tag_list, address_hmm, tag_lookup_table,
                    record_id, fields_str):
    """Process the input using a HMM to extract address output fields.

  USAGE:
    address_dict = get_address_hmm(word_list, tag_list, address_hmm,
                                   tag_lookup_table)

  ARGUMENTS:
    word_list         List of words as produces with tag_address_component()
    tag_list          Corresponding list of tags as produces with
                      tag_address_component()
    address_hmm       A reference to the address hidden Markov model
    tag_lookup_table  A tagging look-up table as defined in 'lookup.py'
    record_id         A string identifying the current record
    fields_str        A string representation of the input fields

  DESCRIPTION:
    The routine returns a dictionary with the parsed and extracted output
    fields for the address component. A Hidden Markov Model (HMM) is used for
    this task.

    The dictionary returned can contain the following key words:
      wayfare_number
      wayfare_name
      wayfare_qualifier
      wayfare_type
      unit_number
      unit_type
      property_name
      institution_name
      institution_type    
      postaddress_number
      postaddress_type
      locality_name
      locality_qualifier
      postcode
      territory
      country
      address_hmm_prob (the probability returned by the Viterbi algorithm for
                        the most likely HMM state seqence)
  """

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = address_hmm.viterbi(t)
        if (prob > max_prob):
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob

        print '3:%s  Sequence: %s has Viterbi probability: %f' % \
              (record_id, str(t), prob)

    print '2:%s  Best observation sequence: %s with tag sequence: %s' % \
          (record_id, str(best_obs_seq), str(best_tag_list))

    # Now process the observation sequence and add elements into dictionary - - -
    #
    if (len(tag_list) != len(word_list)):
        print 'error:%s Length of word list and tag list differs: %s, %s%s' % \
              (record_id, str(word_list), str(tag_list), fields_str)
        raise Exception

    list_len = len(tag_list)

    if (list_len == 0):
        print 'warning:%s Empty tag list returned from HMM %s' % \
              (record_id, fields_str)
        return {}  # Return an empty dictionary if not output fields given

    # norm_max_prob = max_prob / float(list_len)  # Normalise max. probability
    address_dict = {'address_hmm_prob': [str(max_prob)]}

    for i in range(list_len):  # Loop over words and states
        w = word_list[i]
        s = best_obs_seq[i]

        #  Do not output commas, vertical bars and hyphens  - - - - - - - - - - - -
        #
        if (w in ['|', ',', '-', '/']):
            pass

        elif (s == 'wfnu'
              ):  # Wayfare number - - - - - - - - - - - - - - - - - - -
            v = address_dict.get('wayfare_number', [])
            v.append(w)
            address_dict.update({'wayfare_number': v})

        elif (s in ['wfna1', 'wfna2',
                    'wfna3']):  # Wayfare name  - - - - - - - - - -
            v = address_dict.get('wayfare_name', [])
            v.append(w)
            address_dict.update({'wayfare_name': v})

        elif (s == 'wfql'
              ):  # Wayfare qualifier  - - - - - - - - - - - - - - - - -
            v = address_dict.get('wayfare_qualifier', [])
            v.append(w)
            address_dict.update({'wayfare_qualifier': v})

        elif (s == 'wfty'
              ):  # Wayfare type - - - - - - - - - - - - - - - - - - - -
            v = address_dict.get('wayfare_type', [])
            v.append(w)
            address_dict.update({'wayfare_type': v})

        elif (s == 'unnu'
              ):  # Unit number  - - - - - - - - - - - - - - - - - - - -
            v = address_dict.get('unit_number', [])
            v.append(w)
            address_dict.update({'unit_number': v})

        elif (s == 'unty'
              ):  # Unit type  - - - - - - - - - - - - - - - - - - - - -
            v = address_dict.get('unit_type', [])
            v.append(w)
            address_dict.update({'unit_type': v})

        elif (s in ['prna1',
                    'prna2']):  # Property name - - - - - - - - - - - - - -
            v = address_dict.get('property_name', [])
            v.append(w)
            address_dict.update({'property_name': v})

        elif (s in ['inna1',
                    'inna2']):  # Institution name  - - - - - - - - - - - -
            v = address_dict.get('institution_name', [])
            v.append(w)
            address_dict.update({'institution_name': v})

        elif (s == 'inty'
              ):  # Institution type - - - - - - - - - - - - - - - - - -
            v = address_dict.get('institution_type', [])
            v.append(w)
            address_dict.update({'institution_type': v})

        elif (s == 'panu'
              ):  # Postal address number  - - - - - - - - - - - - - - -
            v = address_dict.get('postaddress_number', [])
            v.append(w)
            address_dict.update({'postaddress_number': v})

        elif (s == 'paty'
              ):  # Postal address type  - - - - - - - - - - - - - - - -
            v = address_dict.get('postaddress_type', [])
            v.append(w)
            address_dict.update({'postaddress_type': v})

        elif (s in ['loc1',
                    'loc2']):  # Locality name - - - - - - - - - - - - - - -
            v = address_dict.get('locality_name', [])
            v.append(w)
            address_dict.update({'locality_name': v})

        elif (s == 'locql'
              ):  # Locality qualifier  - - - - - - - - - - - - - - - -
            v = address_dict.get('locality_qualifier', [])
            v.append(w)
            address_dict.update({'locality_qualifier': v})

        elif (s == 'pc'
              ):  # Postcode - - - - - - - - - - - - - - - - - - - - - - -
            v = address_dict.get('postcode', [])
            v.append(w)
            address_dict.update({'postcode': v})

        elif (s in ['ter1',
                    'ter2']):  # Territory - - - - - - - - - - - - - - - - -
            v = address_dict.get('territory', [])
            v.append(w)
            address_dict.update({'territory': v})

        elif (s in ['cntr1',
                    'cntr2']):  # Country - - - - - - - - - - - - - - - - -
            v = address_dict.get('country', [])
            v.append(w)
            address_dict.update({'country': v})

        else:  # Should never happen
            print 'warning:%s This should never happen! ' % (record_id) + \
                  ' Tag: %s, word: %s, word list: %s, tag list: %s%s' % \
                  (str(s), w, str(word_list), str(tag_list),fields_str)

    # Check if concatenated locality and territory words are in lookup-table  - -
    #
    if (address_dict.has_key('locality_name')):
        loc = address_dict['locality_name']
        if (len(loc) > 1):  # Locality contains more than one word
            loc_tuple = tuple(loc)  # Make it a tuple
            if (tag_lookup_table.has_key(loc_tuple)):
                new_loc = tag_lookup_table[loc_tuple][0]
                address_dict.update({'locality_name': [new_loc]})

    if (address_dict.has_key('territory')):
        terr = address_dict['territory']
        if (len(terr) > 1):  # Territory contains more than one word
            terr_tuple = tuple(terr)  # Make it a tuple
            if (tag_lookup_table.has_key(terr_tuple)):
                new_terr = tag_lookup_table[terr_tuple][0]
                address_dict.update({'territory': [new_terr]})

    if (address_dict.has_key('country')):
        cntr = address_dict['country']
        if (len(cntr) > 1):  # Country contains more than one word
            cntr_tuple = tuple(cntr)  # Make it a tuple
            if (tag_lookup_table.has_key(cntr_tuple)):
                new_cntr = tag_lookup_table[cntr_tuple][0]
                address_dict.update({'country': [new_cntr]})

    # Finally do some tests on the output fields  - - - - - - - - - - - - - - - -
    #
    address_items = address_dict.items()

    # Check if a value list has more than three elements, if so print out
    #
    for i in address_items:
        if (len(i[1]) > 3):
            print 'warning:%s Output field "%s" contains' % (record_id, str(i[0]))+ \
                  ' more than three elements: %s%s' % (str(i[1]), fields_str)

    # Check if 'number' elements only contain (alpha-) numerical values - - - - -
    # and also check how many numbers in an element
    #
    if (address_dict.has_key('wayfare_number')
        ):  # Check how many wayfare numbers
        v = address_dict['wayfare_number']
        if (len(v) > 2):
            print 'warning:%s More than two wayfare numbers: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            if (i.isalpha()):  # Element contains only letters
                print 'warning:%s Wayfare number contains no ' % (record_id) + \
                      'digits: %s%s' % (str(v), fields_str)
                break  # Exit for loop

    if (address_dict.has_key('unit_number')):  # Check how many unit numbers
        v = address_dict['unit_number']
        if (len(v) > 1):
            print 'warning:%s More than one unit number: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            if (i.isalpha()):  # Element contains only letters
                print 'warning:%s Unit number contains no ' % (record_id) + \
                      'digits: %s%s' % (str(v), fields_str)
                break  # Exit for loop

    if (address_dict.has_key('postaddress_number')
        ):  # Check postaddress numbers
        v = address_dict['postaddress_number']
        if (len(v) > 1):
            print 'warning:%s More than one post-address number: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            if (i.isalpha()):  # Element contains only letters
                print 'warning:%s Post-address number contains no ' % (record_id) + \
                      'digits: %s%s' % (str(v), fields_str)
                break  # Exit for loop

    # Check if 'type' elements contain one word only  - - - - - - - - - - - - - -
    # if it's a known type word
    #
    if (address_dict.has_key('wayfare_type')):  # Check wayfare type
        v = address_dict['wayfare_type']
        if (len(v) > 1):
            print 'warning:%s More than one wayfare type: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            i = i.split('_')
            i = tuple(i)  # Make it a tuple
            if (not tag_lookup_table.has_key((i))) or \
               (tag_lookup_table.has_key((i)) and
                (tag_lookup_table[(i)][1].find('WT') < 0)):
                print 'warning:%s Wayfare type word is not known: %s%s' % \
                      (record_id, str(v), fields_str)
                break  # Exit for loop

    if (address_dict.has_key('unit_type')):  # Check unit type
        v = address_dict['unit_type']
        if (len(v) > 1):
            print 'warning:%s More than one unit type: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            i = i.split('_')
            i = tuple(i)  # Make it a tuple
            if (not tag_lookup_table.has_key((i))) or \
               (tag_lookup_table.has_key((i)) and \
                (tag_lookup_table[(i)][1].find('UT') < 0)):
                print 'warning:%s Unit type word is not known: %s%s' % \
                      (record_id, str(v), fields_str)
                break  # Exit for loop

    if (address_dict.has_key('institution_type')):  # Check institution type
        v = address_dict['institution_type']
        if (len(v) > 1):
            print 'warning:%s More than one institution type: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            i = i.split('_')
            i = tuple(i)  # Make it a tuple
            if (not tag_lookup_table.has_key((i))) or \
               (tag_lookup_table.has_key((i)) and \
                (tag_lookup_table[(i)][1].find('IT') < 0)):
                print 'warning:%s Institution type word is not known: %s%s' % \
                      (record_id, str(v), fields_str)
                break  # Exit for loop

    if (address_dict.has_key('postaddress_type')):  # Check postaddress type
        v = address_dict['postaddress_type']
        if (len(v) > 2):
            print 'warning:%s More than two post-address type: %s%s' % \
                  (record_id, str(v), fields_str)
        for i in v:
            i = i.split('_')
            i = tuple(i)  # Make it a tuple
            if (not tag_lookup_table.has_key((i))) or \
               (tag_lookup_table.has_key((i)) and \
                (tag_lookup_table[(i)][1].find('PA') < 0)):
                print 'warning:%s Post-address type word is not known: %s%s' % \
                      (record_id, str(v), fields_str)
                break  # Exit for loop

    # Check if 'qualifier' elements only contain known qualifier words  - - - - -
    #
    if (address_dict.has_key('wayfare_qualifier')):  # Check wayfare qualifier
        v = address_dict['wayfare_qualifier']
        for i in v:
            if (not tag_lookup_table.has_key((i,))) or \
               (tag_lookup_table.has_key((i,)) and \
                (tag_lookup_table[(i,)][1].find('LQ') < 0)):
                print 'warning:%s Wayfare qualifier word is not known: %s%s' % \
                      (record_id, str(v), fields_str)
                break  # Exit for loop

    if (address_dict.has_key('locality_qualifier')
        ):  # Check locality qualifier
        v = address_dict['locality_qualifier']
        for i in v:
            if (not tag_lookup_table.has_key((i,))) or \
               (tag_lookup_table.has_key((i,)) and \
                (tag_lookup_table[(i,)][1].find('LQ') < 0)):
                print 'warning:%s Locality qualifier word is not known: %s%s' % \
                      (record_id, str(v), fields_str)
                break  # Exit for loop

    return address_dict
Beispiel #7
0
# Maps each HMM state name to the geocode/locality output field that words
# observed in that state are collected into.  Any state not listed here is
# unexpected and triggers a warning.
_GEOLOC_STATE_FIELD = {
    'wfnu':  'wayfare_number',
    'wfna1': 'wayfare_name',  'wfna2': 'wayfare_name', 'wfna3': 'wayfare_name',
    'wfql':  'wayfare_qualifier',
    'wfty':  'wayfare_type',
    'unnu':  'unit_number',
    'unty':  'unit_type',
    'prna1': 'property_name', 'prna2': 'property_name',
    'inna1': 'institution_name', 'inna2': 'institution_name',
    'inty':  'institution_type',
    'panu':  'postaddress_number',
    'paty':  'postaddress_type',
    'loc1':  'locality_name', 'loc2': 'locality_name',
    'locql': 'locality_qualifier',
    'pc':    'postcode',
    'ter1':  'territory', 'ter2': 'territory',
    'cntr1': 'country',   'cntr2': 'country',
}

# Separator words that are never copied into an output field.
_GEOLOC_SKIP_WORDS = ['|', ',', '-', '/']


def _check_number_field(geoloc_dict, field, max_num, too_many_msg,
                        no_digit_msg):
    """Validate a 'number' output field in geoloc_dict (if present).

  Logs a warning if the field holds more than max_num elements, and a warning
  (once) if any element consists of letters only, i.e. contains no digits.
  """
    if (field in geoloc_dict):
        v = geoloc_dict[field]
        if (len(v) > max_num):
            inout.log_message(too_many_msg + str(v), 'warn')
        for element in v:
            if (element.isalpha()):  # Element contains only letters
                inout.log_message(no_digit_msg + str(v), 'warn')
                break  # One warning per field is enough

def _check_type_field(geoloc_dict, field, max_num, too_many_msg, tag_code,
                      unknown_msg):
    """Validate a 'type' output field in geoloc_dict (if present).

  Logs a warning if the field holds more than max_num elements, and a warning
  (once) if an element - split at '_' into a word tuple - is not in the
  lookup-table with the given tag code (e.g. 'WT' for wayfare types).
  """
    if (field in geoloc_dict):
        v = geoloc_dict[field]
        if (len(v) > max_num):
            inout.log_message(too_many_msg + str(v), 'warn')
        for element in v:
            key = tuple(element.split('_'))  # Lookup-table keys are tuples
            entry = config.geoloc_lookup_dict.get(key)
            if (entry is None) or (entry[1].find(tag_code) < 0):
                inout.log_message(unknown_msg + str(v), 'warn')
                break  # One warning per field is enough

def _check_qualifier_field(geoloc_dict, field, unknown_msg):
    """Validate a 'qualifier' output field in geoloc_dict (if present).

  Logs a warning (once) if an element is not in the lookup-table as a
  one-word tuple carrying the 'LQ' (locality qualifier) tag.
  """
    if (field in geoloc_dict):
        v = geoloc_dict[field]
        for element in v:
            entry = config.geoloc_lookup_dict.get((element, ))
            if (entry is None) or (entry[1].find('LQ') < 0):
                inout.log_message(unknown_msg + str(v), 'warn')
                break  # One warning per field is enough

def get_geoloc_hmm(word_list, tag_list):
    """Process input using a HMM to extract geocode and locality output fields.

  USAGE:
    geoloc_dict = get_geoloc_hmm(word_list, tag_list)

  ARGUMENTS:
    word_list  List of words as produced with clean_tag_locality()
    tag_list   Corresponding list of tags as produced with
               clean_tag_locality()

  DESCRIPTION:
    The routine returns a dictionary with the parsed and extracted output
    fields for both the locality and geocode components. A Hidden Markov Model
    (HMM) is used for this task.

    The dictionary returned can contain the following key words:
    - wayfare_number
    - wayfare_name
    - wayfare_qualifier
    - wayfare_type
    - unit_number
    - unit_type
    - property_name
    - institution_name
    - institution_type
    - postaddress_number
    - postaddress_type
    - locality_name
    - locality_qualifier
    - postcode
    - territory
    - country
    - geoloc_hmm_proba (the probability returned by the Viterbi algorithm for
                        the most likely HMM state sequence)
  """

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    msg = ['  Input tag sequence: ' + str(tag_list), '  Output tag sequences:']
    for t in tag_list_seq:
        msg.append('    ' + str(t))
    inout.log_message(msg, 'v2')

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = config.geoloc_hmm.viterbi(t)
        if (prob > max_prob):
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob

        inout.log_message(
            '  Probability ' + str(prob) + '  for sequence ' + str(t), 'v2')

    inout.log_message([
        '  Best observation sequence: ' + str(best_obs_seq),
        '          with tag sequence: ' + str(best_tag_list)
    ], 'v2')

    # Normalise the maximum probability with the input sequence length  - - - - -
    #
    norm_max_prob = max_prob / float(len(tag_list))
    geoloc_dict = {'geoloc_hmm_proba': [str(norm_max_prob)]}

    # Append each word to the output field selected by its HMM state - - - - - -
    #
    for i in range(len(word_list)):  # Loop over words and states
        w = word_list[i]
        s = best_obs_seq[i]

        if (w in _GEOLOC_SKIP_WORDS):  # Never output separator characters
            continue

        field = _GEOLOC_STATE_FIELD.get(s)
        if (field is None):  # Should never happen
            msg = ['This should never happen!', '  Tag: '+str(s), '  Word: '+w, \
                   '  Word list: '+str(word_list), \
                   '  tag list:  '+str(tag_list)]
            inout.log_message(msg, 'warn')
        else:
            geoloc_dict.setdefault(field, []).append(w)

    # Check if concatenated locality and territory words are in lookup-table  - -
    # and if so replace them with their canonical (corrected) single value
    #
    for field in ['locality_name', 'territory', 'country']:
        value = geoloc_dict.get(field, [])
        if (len(value) > 1):  # Field contains more than one word
            value_tuple = tuple(value)  # Lookup-table keys are tuples
            if (value_tuple in config.geoloc_lookup_dict):
                geoloc_dict[field] = [config.geoloc_lookup_dict[value_tuple][0]]

    # Finally do some tests on the output fields  - - - - - - - - - - - - - - - -
    #
    # Check if a value list has more than three elements, if so print out
    #
    for (field, value) in geoloc_dict.items():
        if (len(value) > 3):
            inout.log_message('Geocode/locality output field '+ str(field)+ \
                    ' contains more than three elements: '+str(value),'warn')

    # Check if 'number' elements only contain (alpha-) numerical values - - - - -
    # and also check how many numbers in an element
    #
    _check_number_field(geoloc_dict, 'wayfare_number', 2,
                        'More than two wayfare numbers: ',
                        'Wayfare number element contains no digits: ')
    _check_number_field(geoloc_dict, 'unit_number', 1,
                        'More than one unit numbers: ',
                        'Unit number element contains no digits: ')
    _check_number_field(geoloc_dict, 'postaddress_number', 1,
                        'More than one postaddress numbers: ',
                        'Postaddress number element contains no digits: ')

    # Check if 'type' elements contain one word only  - - - - - - - - - - - - - -
    # if it's a known type word
    #
    _check_type_field(geoloc_dict, 'wayfare_type', 1,
                      'More than one wayfare type: ', 'WT',
                      'Wayfare type word is not known: ')
    _check_type_field(geoloc_dict, 'unit_type', 1,
                      'More than one unit type: ', 'UT',
                      'Unit type word is not known: ')
    _check_type_field(geoloc_dict, 'institution_type', 1,
                      'More than one institution type: ', 'IT',
                      'Institution type word is not known: ')
    _check_type_field(geoloc_dict, 'postaddress_type', 2,
                      'More than two postaddress type: ', 'PA',
                      'Postaddress type word is not known: ')

    # Check if 'qualifier' elements only contain known qualifier words  - - - - -
    #
    _check_qualifier_field(geoloc_dict, 'wayfare_qualifier',
                           'Wayfare qualifier word is not known: ')
    _check_qualifier_field(geoloc_dict, 'locality_qualifier',
                           'Locality qualifier word is not known: ')

    return geoloc_dict
Beispiel #8
0
def tagdata():
    """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 5):
        print '***** Error: %s needs at least six arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Tagging mode: "name" or "locality"'
        print '*****        - Output training file name'
        print '*****        - Start of block with training records'
        print '*****        - End of block with training records'
        print '*****        - Number of training records'
        print '*****          plus options'
        raise Exception()

    if (config.in_file_name == config.options[2]):
        print '***** Error: Input and output files must differ'
        print '*****        Input file name:          ', config.in_file_name
        print '*****        Output training file name:', config.options[2]
        raise Exception()

    first_rec = int(config.options[2])
    last_rec = int(config.options[3])
    num_rec = int(config.options[4])
    in_file_name = config.in_file_name
    out_file_name = config.options[1]

    # Check record number values  - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (int(first_rec) >= int(last_rec)) or \
       ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
        print '***** Error: Illegal values for training records block:'
        print '*****        - Start of block with training records:', first_rec
        print '*****        - End of block with training records:  ', last_rec
        print '*****        - Number of training records:          ', num_rec
        raise Exception()

    rec_range = last_rec - first_rec - 1  # Range of records in input file

    # Open input file and check number of available records - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0
    for line in f_in.xreadlines():
        line_count += 1
    f_in.close()

    if (last_rec > line_count):  # Illegal value for last record
        print '***** Error: Illegal values for last training records:', last_rec
        print '*****        File only contains', line_count, 'lines/records'
        raise Exception()

    # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
    #
    tag_mode = config.options[0]
    if (tag_mode in ['name', 'na', 'n']):
        tag_mode = 'name'
    elif (tag_mode in ['locality', 'localty', 'loc', 'l']):
        tag_mode = 'loc'
    else:
        print '***** Error: Illegal tagging mode:', tag_mode
        print '*****        Must be either "name" or "locality"'
        raise Exception()

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    hmm_file_name = None  # Default: Do not use HMM to standardise training
    #          records
    retag_file_name = None  # Default: Do not retag an existing training file
    config.nowarn = 0  # Deactivate no warning flag (print/log warning
    # messages)
    freqs_file_name = None  # Default: Do not write frequencies, no -freqs option

    if (len(config.options) > 5):
        options = config.options[5:]
        while (options != []):  # Do a loop processing all options

            if (options[0] == '-nowarn'):
                config.nowarn = 1  # Activate no warning flag
                options = options[1:]  # Remove processed '-nowarn' option

            elif (options[0] == '-v1'):
                config.verbose = 1  # Set to verbose output level 1
                options = options[1:]  # Remove processed '-v1' option

            elif (options[0] == '-v2'):
                config.verbose = 2  # Set to verbose output level 2
                options = options[1:]  # Remove processed '-v2' option

            elif (options[0] == '-l'):
                config.logging = 1
                if (len(options) > 1):
                    if (options[1][0] !=
                            '-'):  # Not another option, must be a file name
                        config.log_file = options[1]  # Get name of log file
                        options = options[1:]  # Remove file_name
                options = options[1:]  # Remove processed -'l' option only

                try:
                    f_log = open(config.log_file,
                                 'a')  # Test if file is appendable
                except:
                    print '***** Error ********************',
                    print '***** Cannot write to log file: ' + config.log_file
                    raise IOError()

                # Write (append) header to log file
                #
                f_log.write(os.linesep)
                f_log.write(
                    '##################################################')
                f_log.write('############' + os.linesep)
                f_log.write('#' + os.linesep)
                f_log.write(
                    "# 'pyTagData.py - Version 0.1' process started at: ")
                f_log.write(time.ctime(time.time()) + os.linesep)
                f_log.write('#' + os.linesep)
                f_log.write("# Input file name:  " + in_file_name + os.linesep)
                f_log.write("# Output file name: " + out_file_name +
                            os.linesep)
                f_log.write("# Tagging mode:     " + tag_mode + os.linesep)
                f_log.write(os.linesep)
                f_log.close()

            elif (options[0] == '-hmm'):
                hmm_file_name = options[1]  # Get file name of the HMM to use
                if (hmm_file_name == out_file_name):
                    print '***** Error: HMM file name is the same as output file name!'
                    raise Exception()

                try:
                    f_in = open(hmm_file_name,
                                'r')  # Test if file is available
                except:
                    print '***** Error: Cannot open HMM file specified in "-hmm"',
                    print 'option:', hmm_file_name
                    raise IOError()
                f_in.close()
                options = options[
                    2:]  # Remove processed '-hmm' option and file name

            elif (options[0] == '-retag'):
                if (hmm_file_name == None) and ('-hmm' not in options):
                    print '***** Error: "-retag" option can only be used together with',
                    print '"-hmm" option (which is not given).'
                    raise Exception()

                retag_file_name = options[
                    1]  # Get file name of the already-tagged
                # file to re-process
                if (retag_file_name == out_file_name):
                    print '***** Error: Retag file name is the same as output file name!'
                    raise Exception()
                elif (retag_file_name == in_file_name):
                    print '***** Error: Retag file name is the same as input file name!'
                    raise Exception()
                elif (retag_file_name == hmm_file_name):
                    print '***** Error: Retag file name is the same as HMM file name!'
                    raise Exception()

                try:
                    f_in = open(retag_file_name,
                                'r')  # Test if file is available

                    # Now gather record numbers and previous tags/states, as well as the
                    # original header information. Use a simple state machine to do this.
                    #
                    tagged_recs = {}
                    cleaned_recs = {}
                    original_header_lines = []
                    state = -1  # Header lines state
                    prevline = ''

                    for line in f_in.xreadlines(
                    ):  # Read training file and process it
                        line = line.strip()

                        if (state == -1) and (len(line)
                                              == 0):  # End of header lines
                            state = 0
                            prevline = line
                            continue

                        if (state == -1) and (len(line) > 0) and (line[0]
                                                                  == "#"):
                            original_header_lines.append("# " + line)
                            prevline = line
                            continue
                        sline = line.split(' ')

                        if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
                           and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
                            try:
                                rec = int(sline[1])  # Original record number
                                tagged_recs[rec] = None
                                cleaned_recs[rec] = None
                                state = 1
                            except:
                                pass
                            prevline = line
                            continue

                        if (state
                                == 1) and (len(line) > 0) and (line[0] != '#'):
                            tagged_recs[rec] = line
                            cleaned_recs[rec] = prevline
                            state = 0
                            prevline = line
                            continue

                        if (state == 1) and (len(line) > 0):
                            prevline = line
                            continue

                    f_in.close()
                    tagged_recs_keys = tagged_recs.keys()

                    num_rec = len(
                        tagged_recs_keys)  # Override specified numbers
                    first_rec = 0
                    last_rec = line_count

                except:
                    print '***** Error: Cannot open tagged training file specified',
                    print 'in "-retag" option:', retag_file_name
                    raise IOError()

                options = options[
                    2:]  # Remove processed '-retag' option and file name

            elif (options[0][:5] == '-freq'):
                if (hmm_file_name == None) and ('-hmm' not in options):
                    print '***** Error: "-feqs" option can only be used together with',
                    print '"-hmm" option (which is not given).'
                    raise Exception()

                freqs_file_name = options[
                    1]  # File name to write the frequencies to
                if (freqs_file_name == out_file_name):
                    print '***** Error: Frequency file name is the same as output',
                    print 'file name!'
                    raise Exception()
                elif (freqs_file_name == in_file_name):
                    print '***** Error: Frequency file name is the same as input',
                    print 'file name!'
                    raise Exception()
                elif (freqs_file_name == hmm_file_name):
                    print '***** Error: Frequency file name is the same as HMM',
                    print 'file name!'
                    raise Exception()

                options = options[
                    2:]  # Remove processed '-freqs' option and file name
                try:  # Check if file writing is possible
                    freqs_out = open(freqs_file_name, 'w')
                    freqs_out.close()
                except:
                    print '***** Error: Cannot write to frequency output file specified',
                    print 'in "-freqs" option:', freqs_file_name
                    raise IOError()

            else:
                print '***** Error: Illegal option:', options[0]
                raise Exception()

    # If specified initalise and load Hidden Markov Model (HMM) - - - - - - - - -
    #
    if (hmm_file_name != None):
        myhmm = simplehmm.hmm([], [])  # Create new empty HMM object
        myhmm.load_hmm(hmm_file_name)  # Populate states/observations from file
        myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

    # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
    #
    # The header records provenance (input/output file names, record block,
    # HMM / retag / freqs settings) as '#' comment lines so the training file
    # is self-describing.
    try:
        f_out = open(out_file_name, 'w')
    except:
        inout.log_message('Cannot open output file: ' + out_file_name, 'err')
        raise IOError()

    f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
                " Version 0.1'"+os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Input file name:  ' + in_file_name + os.linesep)
    f_out.write('# Output file name: ' + out_file_name + os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Parameters:' + os.linesep)
    f_out.write('# - Start of block with training records: '+str(first_rec)+ \
                os.linesep)
    f_out.write('# - End of block with training records:   '+str(last_rec)+ \
                os.linesep)
    f_out.write('# - Number of training records:           '+str(num_rec)+ \
                os.linesep)
    if (hmm_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                    os.linesep)
    if (retag_file_name != None):
        # In retag mode, the original file's header lines are preserved
        # (collected earlier, outside this view) so history is not lost.
        f_out.write('#' + os.linesep)
        f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                    os.linesep)
        f_out.write("#   Header lines from original training file follow:" + \
                    os.linesep)
        for header_line in original_header_lines:
            f_out.write(header_line + os.linesep)
    if (freqs_file_name != None):
        f_out.write('#' + os.linesep)
        # NOTE(review): the closing quote after the file name is missing in
        # this written header line (opening "'" is never closed).
        f_out.write("# - Tag/state pattern frequencies written to file '" + \
                    freqs_file_name + os.linesep)
    f_out.write('#' + '-' * 70 + os.linesep)
    f_out.write(os.linesep)

    # State for the random record-selection loop below.
    rec_count = 0  # Number of selected records
    num_rec_left = num_rec  # Number of records to be selected left
    rec_selected = {}  # Dictionary of all record numbers that were selected
    seq_freqs = {}  # Dict to hold examples of tag/state patterns

    unchanged_loop_cnt = 0  # Counter of how many loops have been done
    # without new records being selected
    prev_num_rec_left = num_rec  # Number of records left in the previous
    # iteration

    # Due to the random nature of selecting records, and because sometimes  - - -
    # a selected component can be empty (and is thus not used for training)
    # more than one iteration over the input data set is carried out. In each
    # iteration, records are selected randomly.
    #
    #
    while (rec_count < num_rec):  # Loop until 'num_rec' records selected

        # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        # The input file is re-opened (and re-scanned) on every outer pass,
        # since random selection may not pick enough records in one pass.
        try:
            f_in = open(in_file_name, 'r')
        except:
            inout.log_message('Cannot open input file: ' + in_file_name, 'err')
            raise IOError()

        line_read = 0  # Number of read lines

        # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
        #
        if (first_rec > 0):
            for i in range(first_rec):
                f_in.readline()

        # Inner loop: one line per iteration, inclusive of 'last_rec'.
        while (rec_count < num_rec) and (line_read <= (last_rec - first_rec)):
            line = f_in.readline()

            # Select this line if: retag mode and the line was tagged in the
            # original training file, OR normal mode and a random draw over
            # 'rec_range' falls below the number of records still needed
            # (selection probability shrinks as fewer records are left).
            # 'tagged_recs_keys' and 'rec_range' are set earlier in the file,
            # outside this view.
            if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
               ((retag_file_name == None) and \
                (num_rec_left >= random.randrange(0,rec_range,1))):

                line = line.strip()  # Remove line separators
                config.curr_line = line  # Make a copy of the unprocessed current line

                line = line.lower()  # Make all characters lower case

                inout.log_message(
                    ['Record number: ' + str(line_read + first_rec)], 'v1')
                config.curr_line_no = line_read + first_rec  # Store current line number

                # Process line and extract content into components (name, geocode, etc)
                #
                [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
                   inout.process_line(line)

                # Select component and process it - - - - - - - - - - - - - - - - - - -
                #
                if (tag_mode == 'name'):
                    # A name component may be a [given, surname] pair (list)
                    # or a single string.
                    if (type(name_comp) == types.ListType):
                        component = name_comp[0].strip(
                        ) + ' ' + name_comp[1].strip()
                    else:
                        component = name_comp.strip()
                else:  # Locality component
                    component = geocode_comp.strip(
                    ) + ' ' + locality_comp.strip()

                # Skip empty components and records already selected in an
                # earlier pass (keyed by absolute line number).
                if (component != '') and \
                   (not rec_selected.has_key((line_read+first_rec))):

                    if (tag_mode == 'name'):
                        inout.log_message(
                            '  Name component: |' + component + '|', 'v1')

                        component = name.clean_name_component(component)
                        [word_list,
                         tag_list] = name.tag_name_component(component)

                    else:  # Locality component
                        inout.log_message(
                            '  Locality component: |' + component + '|', 'v1')

                        component = locality.clean_geoloc_component(component)
                        [word_list,
                         tag_list] = locality.tag_geoloc_component(component)

                    if (tag_list != []):  # Only process non-empty tag lists

                        # Append record number into dictionary of processed records
                        #
                        rec_selected.update({
                            (line_read + first_rec): (line_read + first_rec)
                        })

                        # Create all permutation sequences of this tag list - - - - - - - -
                        #
                        tag_seq = mymath.perm_tag_sequence(tag_list)

                        inout.log_message(['  Word list: '+str(word_list), \
                                           '  Tag list: '+str(tag_list), \
                                           '  Tag sequences:'],'v2')

                        # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
                        #
                        # Run Viterbi on every tag-sequence permutation and
                        # remember which one has the highest probability; only
                        # that one is written uncommented below.
                        if (hmm_file_name != None):

                            state_seq = [
                            ]  # List containing computed HMM state sequences
                            max_prob = -1.0  # maximal probability for a sequence
                            max_seq_no = -1  # Number of the seq. with the max. probablity

                            # Now give tag sequences to the HMMs to compute state sequences
                            #
                            i = 0
                            for t in tag_seq:
                                [obs_seq, prob] = myhmm.viterbi(t)
                                state_seq.append(obs_seq)
                                if (prob > max_prob):
                                    max_prob = prob
                                    max_seq_no = i
                                i += 1

                        # Write original component and resulting tag sequences to output
                        #
                        f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                                    '): |'+component+'|'+os.linesep) # Commented original
                        # Padding so the word list lines up under the component.
                        num_len = len(str(line_read + first_rec)) + len(
                            str(rec_count)) + 6

                        f_out.write('#' + num_len * ' ' + '|' +
                                    ' '.join(word_list) + '|' + os.linesep)

                        for i in range(len(tag_seq)):
                            # Convert each tag sequence into a string for file output
                            #
                            seq_string = '  '

                            if (hmm_file_name != None) and (i != max_seq_no):
                                seq_string = '# '  # Comment sequences with not max. probability

                            # Each entry is written as 'tag:state,' (state left
                            # empty when no HMM is used, for manual tagging).
                            for j in range(len(tag_seq[i])):

                                if (hmm_file_name != None):
                                    seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                                                 state_seq[i][j]+','
                                else:
                                    seq_string = seq_string + ' ' + tag_seq[i][
                                        j] + ':,'

                            f_out.write(seq_string[:-1] +
                                        os.linesep)  # Write without , at end
                            inout.log_message('    ' + seq_string[:-1], 'v2')

                        if (hmm_file_name != None):
                            f_out.write('# Maximum Viterbi probability: %0.5f'% \
                                        (max_prob) + os.linesep)
                            inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                              (max_prob), 'v2')

                        # In retag mode, note when the newly computed sequence
                        # differs from the one in the original training file.
                        # NOTE(review): 'seq_string' here is whatever the LAST
                        # iteration of the loop above left behind — presumably
                        # intended to be the maximum-probability sequence;
                        # verify against the original training-file format.
                        if (retag_file_name !=
                                None) and (tagged_recs[line_read] != None):
                            if (tagged_recs[line_read].strip() !=
                                    seq_string[:-1].strip()):
                                f_out.write("# Note: ***** Changed *****" +
                                            os.linesep)
                                inout.log_message('                      Note:' + \
                                                  ' ***** Changed *****','v2')
                                f_out.write('# Was: ' +
                                            tagged_recs[line_read] +
                                            os.linesep)
                                # Write commented original tag sequence
                                inout.log_message('Original tag sequence: '+ \
                                                  tagged_recs[line_read],'v2')

                        f_out.write(os.linesep)  # Write an empty line
                        inout.log_message(
                            '', 'v1')  # Print empty lines between records

                        # Collect examples per tag/state pattern for the
                        # optional frequency file ('-freqs' option).
                        if (hmm_file_name != None):
                            seq_key = seq_string[:
                                                 -1]  # Add sequence to dictionary
                            if (seq_freqs.has_key(seq_key)):
                                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                                          max_prob])
                            else:
                                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                                      max_prob]]

                        rec_count += 1

                        # Print process indicator message
                        #
                        if (config.proc_ind >= 0) and (rec_count > 0):
                            if (rec_count % config.proc_ind == 0):
                                print 'Processed line', rec_count, 'of', num_rec

            line_read += 1

        f_in.close()

        num_rec_left = num_rec - rec_count

        # Stagnation guard: if five consecutive passes select no new record
        # (e.g. all remaining components are empty), give up with a warning
        # rather than looping forever.
        if (prev_num_rec_left == num_rec_left):  # No new records selected
            unchanged_loop_cnt += 1
        prev_num_rec_left = num_rec_left  # Set to current value

        if (unchanged_loop_cnt > 5):  # Do five loops maximal without selecting
            # new records
            config.curr_line_no = -1  # Set to illegal/empty values, as warning is
            config.curr_line = ''  # not related to the current input line
            inout.log_message(['Can not select more than '+str(rec_count)+ \
                               ' records for training.', \
                               'This is probably due to empty input components.', \
                               'Please reduce value of "num_rec" or increase ' + \
                               'range','between "first_rec" and "last_rec".'],'warn')
            break

        # Boost selection probability for the next pass when only a few
        # records are still missing, so the tail does not take many passes.
        if (num_rec_left < 10):  # Only 10 records left to select
            num_rec_left = num_rec + 1  # Set to more than 100% probablity
        elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
            num_rec_left = int(num_rec / 100.0)  # Set to 1%

    f_out.close()

    # If specified, save Viterbi frequencies to a file  - - - - - - - - - - - - -
    #
    # Writes each distinct tag/state pattern with its frequency, its maximum
    # Viterbi probability and example word lists, sorted by ascending
    # frequency. Only populated when a HMM was used ('seq_freqs' is filled in
    # the main loop above).
    if (freqs_file_name != None):
        freqs_out = open(freqs_file_name,
                         'w')  # Open frequency file for writing
        # NOTE(review): no separator between the next two writes — the header
        # comes out as '...written by"pyTagData.py...' on one line.
        freqs_out.write('# Frequency listing of tag/state patterns written by')
        freqs_out.write('"pyTagData.py - Version 0.1"' + os.linesep)
        freqs_out.write('#' + os.linesep)
        freqs_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
        freqs_out.write('#' + os.linesep)
        freqs_out.write("# Input file name:  " + in_file_name + os.linesep)
        freqs_out.write("# Output file name: " + out_file_name + os.linesep)
        freqs_out.write(os.linesep)
        freqs_out.write('# Parameters:' + os.linesep)
        freqs_out.write('# - Start of block with training records: '+ \
                        str(first_rec)+os.linesep)
        freqs_out.write('# - End of block with training records:   '+ \
                        str(last_rec)+os.linesep)
        freqs_out.write('# - Number of training records:           '+ \
                        str(num_rec)+os.linesep)
        if (hmm_file_name != None):
            freqs_out.write('#' + os.linesep)
            freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                            "' for standardisation"+os.linesep)
        if (retag_file_name != None):
            freqs_out.write('#' + os.linesep)
            freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                            "'"+os.linesep)
        freqs_out.write('#' + '-' * 70 + os.linesep)
        freqs_out.write(os.linesep)

        sorted_seq_freqs = [
        ]  # Now sort sequences according to their fruequencies
        for key in seq_freqs.keys():
            sorted_seq_freqs.append((len(seq_freqs[key]), key))
        sorted_seq_freqs.sort()  # Ascending by (frequency, pattern)

        for skey in sorted_seq_freqs:
            key = skey[1]
            freqs_out.write('# Pattern: ' + str(key) + os.linesep)
            freqs_out.write('# Frequency: ' + str(skey[0]) + os.linesep)
            examples = seq_freqs[key]
            # All examples of one pattern share the same Viterbi probability
            # field; the first entry's value is reported here.
            freqs_out.write('# Maximum Viterbi probability: '+ \
                            str(examples[0][1])+os.linesep)
            freqs_out.write('# Examples: ' + os.linesep)
            for example in examples:
                freqs_out.write('#    ' + str(example[0]) + os.linesep)
            freqs_out.write(str(key) + os.linesep)
            freqs_out.write(os.linesep)
        freqs_out.close()

    # Final summary; 'line_read' holds the count from the last pass over the
    # input file.
    inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                      str(rec_count)+' lines', 'End.'],'v1')