Example #1
0
def tag_geoloc_component(geoloc_str):
    """Tag a geocode locality input component string and make a list.

    USAGE:
      [word_list, tag_list] = tag_geoloc_component(loc_str)

    ARGUMENTS:
      geoloc_str  A string containing the geocode and/or locality components

    DESCRIPTION:
      This routine splits the input string at whitespace into a list of
      elements (words, numbers and separators) and assigns a tag to each of
      them. A 'greedy tagger' is applied, which checks sequences of list
      elements in the lookup table 'config.geoloc_lookup_dict' (longer
      sequences first, at most 'config.geoloc_dict_seq_len' elements) and
      replaces them with the string and tag from the lookup table if found.
      A found sequence whose replacement string is empty is dropped from the
      output.

      Elements not found in the lookup table are tagged as follows:
        N4  a number made of exactly four digits
        NU  any other number
        AN  an alpha-numeric element (letters and digits mixed)
        HY  a hyphen '-'
        CO  a comma ','
        VB  a vertical bar '|'
        UN  anything else (unknown)

      The routine returns two lists: words and their tags
    """

    # First, split input string into elements at spaces - - - - - - - - - - - - -
    #
    remaining = geoloc_str.split()  # Elements that still have to be tagged
    inout.log_message("  Initial word list: " + str(remaining), "v2")

    lookup = config.geoloc_lookup_dict
    max_seq_len = config.geoloc_dict_seq_len
    separator_tags = {"-": "HY", ",": "CO", "|": "VB"}

    tag_list = []  # The initially empty list of tags
    word_list = []  # The initially empty list of words

    while remaining:  # As long as not all elements have been processed
        key = tuple(remaining[:max_seq_len])  # Longest candidate sequence
        entry = []  # Stays empty if no sequence is found in the lookup table

        while key:  # Shorten the key until it is found or becomes empty
            if key in lookup:  # ('in' works for Python 2 and 3, has_key() not)
                entry = lookup[key]
                break
            key = key[:-1]  # Remove last element in key

        if entry != []:  # A value has been found in the dictionary
            num_used = len(key)  # Length of found sequence

            if entry[0] != "":  # Empty replacement: drop the sequence silently
                word_list.append(entry[0])  # Append corrected word (or sequence)
                tag_list.append(entry[1])  # Append tag or tags

        else:  # No value has been found in the lookup dictionary, try other tags
            word = remaining[0]  # Only the first element is processed
            num_used = 1

            if word.isdigit():  # Element is a number
                if len(word) == 4:
                    tag = "N4"
                else:
                    tag = "NU"
            elif (not word.isalpha()) and word.isalnum():  # Alpha-numeric
                tag = "AN"
            else:  # A separator (hyphen, comma, vertical bar) or unknown element
                tag = separator_tags.get(word, "UN")

            word_list.append(word)
            tag_list.append(tag)

        # Finally remove the processed elements from the remaining element list
        #
        remaining = remaining[num_used:]

    return [word_list, tag_list]
Example #2
0
# Check if definition of input components is correct with file types
#
# The component definitions are copied into a real list because entries are
# appended to it below ('input_component' is presumably a dictionary, whose
# values() is only a list under Python 2 and a non-appendable view under
# Python 3).
#
input_values = list(input_component.values())
input_len = -1  # Length of the input (either in number of fields (CSV and TAB
                # files) or in characters (COL files)

output_keys = output_field.keys() # Check if 'original_input' is in output
                                  # fields, and if so check for correctness
for k in output_keys:
  if (k[:14] == 'original_input'):
    v = k[14:].strip()  # Text following 'original_input', e.g. '[3]'
    if (v != ''):  # There is a field or column range given
      if v.startswith('[') and v.endswith(']'):
        v = v[1:-1]  # Remove brackets
      else:
        inout.log_message('Wrong input component definition: '+str(k) + \
                          ' for "original_input" output field','err')
        raise Exception()
      # startswith/endswith are used (instead of indexing) so that an empty
      # definition like 'original_input[]' does not raise an IndexError here
      #
      if v.startswith('(') and v.endswith(')'):  # It's a tuple
        v = v[1:-1]  # Remove tuple brackets
      try:
        v = [int(x) for x in v.split(',')]  # Make a list of integers
      except ValueError:  # Empty or non-numeric field number or column value
        inout.log_message('Wrong input component value: '+str(k) + \
                          ' for "original_input" output field','err')
        raise Exception()
      if (len(v) == 1):  # One integer only, must be a field number
        input_values.append(v)  # Append 'original_input' field number
      elif (len(v) == 2):  # Two integers, must be a column range
        input_values.append([(v[0],v[1])])  # Append as a tuple
      else:
        inout.log_message('Wrong input component value: '+str(k) + \
                          ' for "original_input" output field','err')
        raise Exception()
Example #3
0
# Output field in which a word is stored, keyed by the HMM state assigned to it
#
_GEOLOC_STATE_FIELD = {
    "wfnu": "wayfare_number",
    "wfna1": "wayfare_name",
    "wfna2": "wayfare_name",
    "wfna3": "wayfare_name",
    "wfql": "wayfare_qualifier",
    "wfty": "wayfare_type",
    "unnu": "unit_number",
    "unty": "unit_type",
    "prna1": "property_name",
    "prna2": "property_name",
    "inna1": "institution_name",
    "inna2": "institution_name",
    "inty": "institution_type",
    "panu": "postaddress_number",
    "paty": "postaddress_type",
    "loc1": "locality_name",
    "loc2": "locality_name",
    "locql": "locality_qualifier",
    "pc": "postcode",
    "ter1": "territory",
    "ter2": "territory",
    "cntr1": "country",
    "cntr2": "country",
}


def _geoloc_word_has_tag(key, tag):
    """Return True if tuple 'key' is in the lookup table with a tag string containing 'tag'."""
    lookup = config.geoloc_lookup_dict
    return (key in lookup) and (lookup[key][1].find(tag) >= 0)


def _check_geoloc_numbers(geoloc_dict, field, max_count, count_msg, alpha_msg):
    """Warn if a 'number' field has more than 'max_count' values or a letters-only value."""
    if field not in geoloc_dict:
        return
    values = geoloc_dict[field]
    if len(values) > max_count:
        inout.log_message(count_msg + str(values), "warn")
    for value in values:
        if value.isalpha():  # Element contains only letters
            inout.log_message(alpha_msg + str(values), "warn")
            break  # One warning per field is enough


def _check_geoloc_types(geoloc_dict, field, max_count, tag, count_msg, unknown_msg):
    """Warn if a 'type' field has more than 'max_count' values or a value not tagged 'tag'."""
    if field not in geoloc_dict:
        return
    values = geoloc_dict[field]
    if len(values) > max_count:
        inout.log_message(count_msg + str(values), "warn")
    for value in values:
        # Multi-word values are joined with '_', lookup keys are word tuples
        if not _geoloc_word_has_tag(tuple(value.split("_")), tag):
            inout.log_message(unknown_msg + str(values), "warn")
            break  # One warning per field is enough


def _check_geoloc_qualifiers(geoloc_dict, field, unknown_msg):
    """Warn if a 'qualifier' field contains a word not tagged 'LQ' in the lookup table."""
    if field not in geoloc_dict:
        return
    values = geoloc_dict[field]
    for value in values:
        if not _geoloc_word_has_tag((value,), "LQ"):
            inout.log_message(unknown_msg + str(values), "warn")
            break  # One warning per field is enough


def get_geoloc_hmm(word_list, tag_list):
    """Process input using a HMM to extract geocode and locality output fields.

    USAGE:
      geoloc_dict = get_geoloc_hmm(word_list, tag_list)

    ARGUMENTS:
      word_list  List of words as produced by the tagging step (e.g.
                 tag_geoloc_component())
      tag_list   Corresponding list of tags as produced by the tagging step

    DESCRIPTION:
      The routine returns a dictionary with the parsed and extracted output
      fields for both the locality and geocode components. A Hidden Markov
      Model (HMM) is used for this task: all tag sequences created by
      'mymath.perm_tag_sequence()' are given to the Viterbi algorithm of
      'config.geoloc_hmm' and the state sequence with the highest probability
      is used to assign each word to an output field. Separators ('|', ',',
      '-' and '/') are not written to the output.

      Each value in the dictionary is a list of words. The dictionary returned
      can contain the following key words:
      - wayfare_number
      - wayfare_name
      - wayfare_qualifier
      - wayfare_type
      - unit_number
      - unit_type
      - property_name
      - institution_name
      - institution_type
      - postaddress_number
      - postaddress_type
      - locality_name
      - locality_qualifier
      - postcode
      - territory
      - country
      - geoloc_hmm_proba (a one-element list with the probability, as a
                          string, returned by the Viterbi algorithm for the
                          most likely HMM state sequence, divided by the
                          number of tags)

      Multi-word locality names, territories and countries are replaced by
      their lookup-table entry if the word sequence is found there. Finally,
      suspicious output values are reported as warnings via
      'inout.log_message()'; they do not change the returned dictionary.

      Note that the probability is normalised by the length of 'tag_list', so
      an empty tag list results in a ZeroDivisionError.
    """

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    msg = ["  Input tag sequence: " + str(tag_list), "  Output tag sequences:"]
    for t in tag_list_seq:
        msg.append("    " + str(t))
    inout.log_message(msg, "v2")

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = config.geoloc_hmm.viterbi(t)
        if prob > max_prob:
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob

        inout.log_message("  Probability " + str(prob) + "  for sequence " + str(t), "v2")

    inout.log_message(
        ["  Best observation sequence: " + str(best_obs_seq), "          with tag sequence: " + str(best_tag_list)],
        "v2",
    )

    # Now process the observation sequence and add elements into dictionary - - -
    #
    norm_max_prob = max_prob / float(len(tag_list))  # Normalise max. probability
    geoloc_dict = {"geoloc_hmm_proba": [str(norm_max_prob)]}

    for i, w in enumerate(word_list):  # Loop over words and states
        s = best_obs_seq[i]

        #  Do not output commas, vertical bars and hyphens  - - - - - - - - - - - -
        #
        if w in ["|", ",", "-", "/"]:
            continue

        field = _GEOLOC_STATE_FIELD.get(s)
        if field is not None:
            geoloc_dict.setdefault(field, []).append(w)

        else:  # Should never happen
            msg = [
                "This should never happen!",
                "  Tag: " + str(s),
                "  Word: " + w,
                "  Word list: " + str(word_list),
                "  tag list:  " + str(tag_list),
            ]
            inout.log_message(msg, "warn")

    # Check if concatenated locality and territory words are in lookup-table  - -
    #
    lookup = config.geoloc_lookup_dict
    for field in ("locality_name", "territory", "country"):
        if field in geoloc_dict:
            words = geoloc_dict[field]
            if len(words) > 1:  # Field contains more than one word
                words_tuple = tuple(words)  # Lookup keys are word tuples
                if words_tuple in lookup:
                    geoloc_dict[field] = [lookup[words_tuple][0]]

    # Finally do some tests on the output fields  - - - - - - - - - - - - - - - -
    #
    # Check if a value list has more than three elements, if so print out
    #
    for field, values in geoloc_dict.items():
        if len(values) > 3:
            inout.log_message(
                "Geocode/locality output field " + str(field) + " contains more than three elements: " + str(values),
                "warn",
            )

    # Check if 'number' elements only contain (alpha-) numerical values - - - - -
    # and also check how many numbers in an element
    #
    _check_geoloc_numbers(
        geoloc_dict,
        "wayfare_number",
        2,
        "More than two wayfare numbers: ",
        "Wayfare number element contains no digits: ",
    )
    _check_geoloc_numbers(
        geoloc_dict,
        "unit_number",
        1,
        "More than one unit numbers: ",
        "Unit number element contains no digits: ",
    )
    _check_geoloc_numbers(
        geoloc_dict,
        "postaddress_number",
        1,
        "More than one postaddress numbers: ",
        "Postaddress number element contains no digits: ",
    )

    # Check if 'type' elements contain one word only  - - - - - - - - - - - - - -
    # if it's a known type word
    #
    _check_geoloc_types(
        geoloc_dict,
        "wayfare_type",
        1,
        "WT",
        "More than one wayfare type: ",
        "Wayfare type word is not known: ",
    )
    _check_geoloc_types(
        geoloc_dict,
        "unit_type",
        1,
        "UT",
        "More than one unit type: ",
        "Unit type word is not known: ",
    )
    _check_geoloc_types(
        geoloc_dict,
        "institution_type",
        1,
        "IT",
        "More than one institution type: ",
        "Institution type word is not known: ",
    )
    _check_geoloc_types(
        geoloc_dict,
        "postaddress_type",
        2,
        "PA",
        "More than two postaddress type: ",
        "Postaddress type word is not known: ",
    )

    # Check if 'qualifier' elements only contain known qualifier words  - - - - -
    # (both wayfare and locality qualifiers are checked against the 'LQ' tag)
    #
    _check_geoloc_qualifiers(geoloc_dict, "wayfare_qualifier", "Wayfare qualifier word is not known: ")
    _check_geoloc_qualifiers(geoloc_dict, "locality_qualifier", "Locality qualifier word is not known: ")

    return geoloc_dict
Example #4
0
def trainhmm():
  """Main routine, open file, read lines, train HMM and save it to file.

  USAGE:
    trainhmm()

  ARGUMENTS:
    None (all arguments are taken from the list 'config.options'):
    - config.options[0]   Tagging mode: "name" or "locality"
    - config.options[1]   Input training file name
    - config.options[2]   HMM output file name
    - config.options[3:]  Options: '-nowarn', '-v1', '-v2',
                          '-l [log_file_name]' and '-s smoothing_method'

  DESCRIPTION:
    Main routine, see description of module above.

    Each line in the training file that is not empty and does not start with
    a '#' is a training record, made of comma separated 'tag:state' pairs.
    All states and tags must be known in the HMM configuration of the
    selected tagging mode. The HMM is trained with all records (using the
    selected smoothing method, if any), printed and saved into the HMM file.

    An Exception is raised if an argument, option or training record is
    illegal, and an IOError if the log file or input file cannot be opened.
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  # Note: All 'print' calls use one single argument only, in this form they
  #       produce the same output as Python 2 statement and Python 3 function
  #
  if (len(config.options) < 3):
    print('***** Error: %s needs at least four arguments:' % (sys.argv[0],))
    print('*****        - Name of the project module')
    print('*****        - Tagging mode: "name" or "locality"')
    print('*****        - Input training file name')
    print('*****        - HMM output file name')
    print('*****          plus options')
    raise Exception()

  in_file_name  = config.options[1]
  hmm_file_name = config.options[2]

  if (in_file_name == hmm_file_name):
    print('*** Error: Input and output files must differ')
    print('***        Input training file name: %s' % (in_file_name,))
    print('***        HMM output file name:     %s' % (hmm_file_name,))
    raise Exception()

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','lolty','loc','l']):
    tag_mode = 'loc'
  else:
    print('***** Error: Illegal tagging mode: %s' % (tag_mode,))
    print('*****        Must be either "name" or "locality"')
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0     # Default: No verbose output
  config.logging = 0     # Default: No logging into a file
  smoothing      = None  # Default: No smoothing
  config.nowarn  = 0     # Deactivate no warning flag (print/log warning
                         # messages)

  options = config.options[3:]
  while options:  # Do a loop processing all options

    if (options[0] == '-nowarn'):
      config.nowarn = 1  # Activate no warning flag
      options = options[1:]  # Remove processed '-nowarn' option

    elif (options[0] == '-v1'):
      config.verbose = 1  # Set to verbose output level 1
      options = options[1:]  # Remove processed '-v1' option

    elif (options[0] == '-v2'):
      config.verbose = 2  # Set to verbose output level 2
      options = options[1:]  # Remove processed '-v2' option

    elif (options[0] == '-l'):
      config.logging = 1
      if (len(options) > 1) and (not options[1].startswith('-')):
        # Not another option, must be a file name
        config.log_file = options[1]  # Get name of log file
        options = options[1:]  # Remove file_name
      options = options[1:]  # Remove processed '-l' option

      try:
        f_log = open(config.log_file,'a')  # Test if file is appendable
      except (IOError, OSError):
        print('***** Error ******************** ' + \
              '***** Cannot write to log file: '+config.log_file)
        raise IOError()

      # Write (append) header to log file, make sure file is always closed
      #
      try:
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write("############"+os.linesep)
        f_log.write("#"+os.linesep)
        f_log.write("# 'pyTrainHMM.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write("#"+os.linesep)
        f_log.write("# Input file name: "+in_file_name+os.linesep)
        f_log.write("# HMM file name:   "+hmm_file_name+os.linesep)
        f_log.write(os.linesep)
      finally:
        f_log.close()

    elif (options[0] == '-s'):
      if (len(options) < 2):  # The smoothing method is missing
        print("*** Error: Missing value for 'smoothing' argument")
        print("***        Possible are: 'laplace' or 'absdiscount'")
        raise Exception()

      smoothing = options[1]  # Smoothing method, can be abbreviated
      if (smoothing in ['l','la','lap','laplac','laplace']):
        smoothing = 'laplace'
      elif (smoothing in ['a','ad','abs','absd','absdis','absdisc',\
             'absdiscount']):
        smoothing = 'absdiscount'
      else:  # Illegal value
        print("*** Error: Illegal value for 'smoothing' argument: %s" % \
              (smoothing,))
        print("***        Possible are: 'laplace' or 'absdiscount'")
        raise Exception()

      options = options[2:]  # Remove processed option and its value

    else:
      print('*** Error: Illegal option: %s' % (options[0],))
      raise Exception()

  # Get HMM states and observations from configuration module - - - - - - - - -
  #
  if (tag_mode == 'name'):
    state_list = config.name_hmm_states
    obser_list = config.name_hmm_obser

  else:
    state_list = config.geoloc_hmm_states
    obser_list = config.geoloc_hmm_obser

  # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except (IOError, OSError):
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0  # Counter for lines read
  rec_count  = 0  # Counter for training records read

  train_list = []  # List of training sequences (lists of (state,tag) tuples),
                   # extracted from training data

  # Read lines, discard comment lines and process training data lines - - - - -
  # (the input file is closed even if an illegal record raises an exception)
  #
  try:
    for line in f_in:

      if (line[0] != '#') and (line.strip() != ''):
        # Line must contain a training record

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line

        line_list = line.split(',')  # Split into a list of elements
        line_data = []  # Training data list for one training record

        inout.log_message(['Record number: '+str(rec_count)],'v1')
        config.curr_line_no = line_count  # Store current line number

        for elem in line_list:
          [k,v] = elem.split(':')  # Split into key and value
          tag = k.strip()
          state = v.strip()
          line_data.append((state,tag))

          if (state not in state_list):
            msg = ['Illegal state name in training record: '+state, \
                   'Line: '+str(line_count)+', record: '+str(rec_count), \
                   'Possible values: '+str(state_list)]
            inout.log_message(msg,'err')
            raise Exception()

          if (tag not in obser_list):
            msg = ['Illegal observation (tag) name in training record: '+tag, \
                   'Line: '+str(line_count)+', record: '+str(rec_count), \
                   'Possible values: '+str(obser_list)]
            inout.log_message(msg,'err')
            raise Exception()

        inout.log_message('  Training record '+str(rec_count)+':'+ \
                          str(line_data),'v1')

        train_list.append(line_data)

        rec_count += 1
        inout.log_message('','v1')  # Print empty lines between records

      line_count += 1

  finally:  # Close input file  - - - - - - - - - - - - - - - - - - - - - - - -
    f_in.close()

  inout.log_message('','v1')  # Print empty lines between records

  # Initalise HMM and train it with training data - - - - - - - - - - - - - - -
  #
  myhmm = simplehmm.hmm(state_list, obser_list)

  myhmm.train(train_list,smoothing)
  myhmm.print_hmm()

  # Save trained HMM  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  myhmm.save_hmm(hmm_file_name)

  inout.log_message(['Read '+str(line_count)+' lines, processed '+ \
                    str(rec_count)+' training records', 'End.'],'v1')
Example #5
0
def tagdata():
  """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  if (len(config.options) < 5):
    print '***** Error: %s needs at least six arguments:'% (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Tagging mode: "name" or "locality"'
    print '*****        - Output training file name'
    print '*****        - Start of block with training records'
    print '*****        - End of block with training records'
    print '*****        - Number of training records'
    print '*****          plus options'
    raise Exception()

  if (config.in_file_name == config.options[2]):
    print '***** Error: Input and output files must differ'
    print '*****        Input file name:          ', config.in_file_name
    print '*****        Output training file name:', config.options[2]
    raise Exception()

  first_rec = int(config.options[2])
  last_rec  = int(config.options[3])
  num_rec   = int(config.options[4])
  in_file_name = config.in_file_name
  out_file_name = config.options[1]

  # Check record number values  - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (int(first_rec) >= int(last_rec)) or \
     ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
    print '***** Error: Illegal values for training records block:'
    print '*****        - Start of block with training records:', first_rec
    print '*****        - End of block with training records:  ', last_rec
    print '*****        - Number of training records:          ', num_rec
    raise Exception()

  rec_range = last_rec-first_rec-1  # Range of records in input file

  # Open input file and check number of available records - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if (last_rec > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for last training records:', last_rec
    print '*****        File only contains',line_count, 'lines/records'
    raise Exception()

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','loc','l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '*****        Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0     # Default: No verbose output
  config.logging = 0     # Default: No logging into a file
  hmm_file_name  = None  # Default: Do not use HMM to standardise training
                         #          records
  retag_file_name = None # Default: Do not retag an existing training file
  config.nowarn  = 0     # Deactivate no warning flag (print/log warning
                         # messages)
  freqs_file_name = None # Default: Do not write frequencies, no -freqs option

  if (len(config.options) > 5):
    options = config.options[5:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed -'l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: '+config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############'+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# 'pyTagData.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# Input file name:  "+in_file_name+os.linesep)
        f_log.write("# Output file name: "+out_file_name+os.linesep)
        f_log.write("# Tagging mode:     "+tag_mode+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-hmm'):
        hmm_file_name = options[1]  # Get file name of the HMM to use
        if (hmm_file_name == out_file_name):
          print '***** Error: HMM file name is the same as output file name!'
          raise Exception()

        try:
          f_in = open(hmm_file_name,'r')  # Test if file is available
        except:
          print '***** Error: Cannot open HMM file specified in "-hmm"',
          print 'option:', hmm_file_name
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed '-hmm' option and file name

      elif (options[0] == '-retag'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-retag" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()

        retag_file_name = options[1]  # Get file name of the already-tagged
                                      # file to re-process
        if (retag_file_name == out_file_name):
          print '***** Error: Retag file name is the same as output file name!'
          raise Exception()
        elif (retag_file_name == in_file_name):
          print '***** Error: Retag file name is the same as input file name!'
          raise Exception()
        elif (retag_file_name == hmm_file_name):
          print '***** Error: Retag file name is the same as HMM file name!'
          raise Exception()

        try:
          f_in = open(retag_file_name,'r')  # Test if file is available

          # Now gather record numbers and previous tags/states, as well as the
          # original header information. Use a simple state machine to do this.
          #
          tagged_recs  = {}
          cleaned_recs = {}
          original_header_lines = []
          state = -1  # Header lines state
          prevline = ''

          for line in f_in.xreadlines():  # Read training file and process it
            line = line.strip()

            if (state == -1) and (len(line) == 0):  # End of header lines
              state = 0
              prevline = line
              continue

            if (state == -1) and (len(line) > 0) and (line[0] == "#"):
              original_header_lines.append("# " + line)
              prevline = line
              continue
            sline = line.split(' ')

            if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
               and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
              try:	
                rec = int(sline[1])  # Original record number 
                tagged_recs[rec]  = None
                cleaned_recs[rec] = None
                state = 1
              except:
                pass
              prevline = line
              continue

            if (state == 1) and (len(line) > 0) and (line[0] != '#'):
              tagged_recs[rec]  = line
              cleaned_recs[rec] = prevline
              state = 0
              prevline = line
              continue

            if (state == 1) and (len(line) > 0):
              prevline = line
              continue

          f_in.close()
          tagged_recs_keys = tagged_recs.keys()

          num_rec = len(tagged_recs_keys)  # Override specified numbers
          first_rec = 0
          last_rec = line_count

        except:
          print '***** Error: Cannot open tagged training file specified',
          print 'in "-retag" option:', retag_file_name
          raise IOError()

        options = options[2:]  # Remove processed '-retag' option and file name

      elif (options[0][:5] == '-freq'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-feqs" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()

        freqs_file_name = options[1]  # File name to write the frequencies to
        if (freqs_file_name == out_file_name):
          print '***** Error: Frequency file name is the same as output',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == in_file_name):
          print '***** Error: Frequency file name is the same as input',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == hmm_file_name):
          print '***** Error: Frequency file name is the same as HMM',
          print 'file name!'
          raise Exception()

        options = options[2:]  # Remove processed '-freqs' option and file name
        try:  # Check if file writing is possible
          freqs_out = open(freqs_file_name,'w')
	  freqs_out.close()
        except:
          print '***** Error: Cannot write to frequency output file specified',
          print 'in "-freqs" option:', freqs_file_name
          raise IOError()

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # If specified, initialise and load Hidden Markov Model (HMM) - - - - - - - -
  #
  if (hmm_file_name != None):
    myhmm = simplehmm.hmm([],[])  # Create new empty HMM object
    myhmm.load_hmm(hmm_file_name)
    myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

  # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_out = open(out_file_name,'w')
  except:
    inout.log_message('Cannot open output file: '+out_file_name,'err')
    raise IOError()

  f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
              " Version 0.1'"+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Created '+time.ctime(time.time())+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Input file name:  '+in_file_name+os.linesep)
  f_out.write('# Output file name: '+out_file_name+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Parameters:'+os.linesep)
  f_out.write('# - Start of block with training records: '+str(first_rec)+ \
              os.linesep)
  f_out.write('# - End of block with training records:   '+str(last_rec)+ \
              os.linesep)
  f_out.write('# - Number of training records:           '+str(num_rec)+ \
              os.linesep)
  if (hmm_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                os.linesep)
  if (retag_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                os.linesep)
    f_out.write("#   Header lines from original training file follow:" + \
                os.linesep)
    for header_line in original_header_lines:
	    f_out.write(header_line + os.linesep)
  if (freqs_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Tag/state pattern frequencies written to file '" + \
                freqs_file_name + os.linesep)
  f_out.write('#'+'-'*70+os.linesep)
  f_out.write(os.linesep)

  rec_count    = 0        # Number of selected records
  num_rec_left = num_rec  # Number of records to be selected left
  rec_selected = {}       # Dictionary of all record numbers that were selected
  seq_freqs = {}          # Dict to hold examples of tag/state patterns

  unchanged_loop_cnt = 0       # Counter of how many loops have been done
                               # without new records being selected
  prev_num_rec_left = num_rec  # Number of records left in the previous
                               # iteration

  # Due to the random nature of selecting records, and because sometimes  - - -
  # a selected component can be empty (and is thus not used for training)
  # more than one iteration over the input data set is carried out. In each 
  # iteration, records are selected randomly.
  #
  while (rec_count < num_rec):  # Loop until 'num_rec' records selected

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
      f_in = open(in_file_name,'r')
    except:
      inout.log_message('Cannot open input file: '+in_file_name,'err')
      raise IOError()

    line_read = 0  # Number of read lines

    # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
      for i in range(first_rec):
        f_in.readline()

    while (rec_count < num_rec) and (line_read <= (last_rec-first_rec)):
      line = f_in.readline()

      if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
         ((retag_file_name == None) and \
          (num_rec_left >= random.randrange(0,rec_range,1))):

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line

        line = line.lower()  # Make all characters lower case

        inout.log_message(['Record number: '+str(line_read+first_rec)],'v1')
        config.curr_line_no = line_read+first_rec  # Store current line number

        # Process line and extract content into components (name, geocode, etc)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
           inout.process_line(line)

        # Select component and process it - - - - - - - - - - - - - - - - - - -
        #
        if (tag_mode == 'name'):
          if (type(name_comp) == types.ListType):
            component = name_comp[0].strip()+' '+name_comp[1].strip()
          else:
            component = name_comp.strip()
        else:  # Locality component
          component = geocode_comp.strip()+' '+locality_comp.strip()

        if (component != '') and \
           (not rec_selected.has_key((line_read+first_rec))):

          if (tag_mode == 'name'):
            inout.log_message('  Name component: |'+component+'|','v1')

            component = name.clean_name_component(component)
            [word_list, tag_list] = name.tag_name_component(component)
 
          else:  # Locality component
            inout.log_message('  Locality component: |'+component+'|','v1')

            component = locality.clean_geoloc_component(component)
            [word_list, tag_list] = locality.tag_geoloc_component(component)

          if (tag_list != []):  # Only process non-empty tag lists

            # Append record number into dictionary of processed records
            #
            rec_selected.update({(line_read+first_rec):(line_read+first_rec)})

            # Create all permutation sequences of this tag list - - - - - - - -
            #
            tag_seq = mymath.perm_tag_sequence(tag_list)

            inout.log_message(['  Word list: '+str(word_list), \
                               '  Tag list: '+str(tag_list), \
                               '  Tag sequences:'],'v2')

            # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
            #
            if (hmm_file_name != None):

              state_seq  = []    # List containing computed HMM state sequences
              max_prob   = -1.0  # maximal probability for a sequence
              max_seq_no = -1    # Number of the seq. with the max. probablity

              # Now give tag sequences to the HMMs to compute state sequences
              #
              i = 0
              for t in tag_seq:
                [obs_seq, prob] = myhmm.viterbi(t)
                state_seq.append(obs_seq)
                if (prob > max_prob):
                  max_prob = prob
                  max_seq_no = i
                i += 1

            # Write original component and resulting tag sequences to output
            #
            f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                        '): |'+component+'|'+os.linesep) # Commented original
            num_len = len(str(line_read+first_rec))+len(str(rec_count))+6

            f_out.write('#'+num_len*' '+'|'+' '.join(word_list)+'|'+os.linesep)

            for i in range(len(tag_seq)):
              # Convert each tag sequence into a string for file output
              #
              seq_string = '  '

              if (hmm_file_name != None) and (i != max_seq_no):
                seq_string = '# ' # Comment sequences with not max. probability

              for j in range(len(tag_seq[i])):

                if (hmm_file_name != None):
                  seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                               state_seq[i][j]+','
                else:
                  seq_string = seq_string+' '+tag_seq[i][j]+':,'

              f_out.write(seq_string[:-1]+os.linesep)  # Write without , at end
              inout.log_message('    '+seq_string[:-1],'v2')

            if (hmm_file_name != None):
              f_out.write('# Maximum Viterbi probability: %0.5f'% \
                          (max_prob) + os.linesep)
              inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                (max_prob), 'v2')

            if (retag_file_name != None) and (tagged_recs[line_read] != None):
              if (tagged_recs[line_read].strip() != seq_string[:-1].strip()):
                f_out.write("# Note: ***** Changed *****" + os.linesep)
                inout.log_message('                      Note:' + \
                                  ' ***** Changed *****','v2')
                f_out.write('# Was: ' + tagged_recs[line_read]+os.linesep)
                            # Write commented original tag sequence
                inout.log_message('Original tag sequence: '+ \
                                  tagged_recs[line_read],'v2')

            f_out.write(os.linesep)  # Write an empty line
            inout.log_message('','v1')  # Print empty lines between records

            if (hmm_file_name != None):
              seq_key = seq_string[:-1]  # Add sequence to dictionary
              if (seq_freqs.has_key(seq_key)):
                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                          max_prob])
              else:
                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                      max_prob]]

            rec_count += 1

            # Print process indicator message
            #
            if (config.proc_ind >= 0) and (rec_count > 0):
              if (rec_count % config.proc_ind == 0):
                print 'Processed line', rec_count, 'of', num_rec

      line_read += 1

    f_in.close()

    num_rec_left = num_rec - rec_count

    if (prev_num_rec_left == num_rec_left):  # No new records selected
      unchanged_loop_cnt += 1
    prev_num_rec_left = num_rec_left  # Set to current value

    if (unchanged_loop_cnt > 5):  # Do five loops maximal without selecting
                                  # new records
      config.curr_line_no = -1  # Set to illegal/empty values, as warning is
      config.curr_line    = ''  # not related to the current input line
      inout.log_message(['Can not select more than '+str(rec_count)+ \
                         ' records for training.', \
                         'This is probably due to empty input components.', \
                         'Please reduce value of "num_rec" or increase ' + \
                         'range','between "first_rec" and "last_rec".'],'warn')
      break

    if (num_rec_left < 10):  # Only 10 records left to select
      num_rec_left = num_rec+1  # Set to more than 100% probablity
    elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
      num_rec_left = int(num_rec / 100.0)  # Set to 1%

  f_out.close()

  # If specified, save Viterbi frequencies to a file  - - - - - - - - - - - - -
  #
  if (freqs_file_name != None):
    freqs_out = open(freqs_file_name,'w')  # Open frequency file for writing
    freqs_out.write('# Frequency listing of tag/state patterns written by')
    freqs_out.write('"pyTagData.py - Version 0.1"'+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write('# Created '+time.ctime(time.time())+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write("# Input file name:  "+in_file_name+os.linesep)
    freqs_out.write("# Output file name: "+out_file_name+os.linesep)
    freqs_out.write(os.linesep)
    freqs_out.write('# Parameters:'+os.linesep)
    freqs_out.write('# - Start of block with training records: '+ \
                    str(first_rec)+os.linesep)
    freqs_out.write('# - End of block with training records:   '+ \
                    str(last_rec)+os.linesep)
    freqs_out.write('# - Number of training records:           '+ \
                    str(num_rec)+os.linesep)
    if (hmm_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                      "' for standardisation"+os.linesep)
    if (retag_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                      "'"+os.linesep)
    freqs_out.write('#'+'-'*70+os.linesep)
    freqs_out.write(os.linesep)

    sorted_seq_freqs = []  # Now sort sequences according to their fruequencies
    for key in seq_freqs.keys():
      sorted_seq_freqs.append((len(seq_freqs[key]),key))
    sorted_seq_freqs.sort()

    for skey in sorted_seq_freqs:
      key = skey[1]
      freqs_out.write('# Pattern: '+str(key)+os.linesep)
      freqs_out.write('# Frequency: '+str(skey[0])+os.linesep)
      examples = seq_freqs[key]
      freqs_out.write('# Maximum Viterbi probability: '+ \
                      str(examples[0][1])+os.linesep)
      freqs_out.write('# Examples: '+os.linesep)
      for example in examples:
        freqs_out.write('#    '+str(example[0])+os.linesep)
      freqs_out.write(str(key)+os.linesep)
      freqs_out.write(os.linesep)
    freqs_out.close()

  inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                    str(rec_count)+' lines', 'End.'],'v1')
Example #6
0
# HMM state -> name of the output field that collects the words assigned to
# that state. Several states (e.g. 'wfna1'..'wfna3') feed the same field.
#
_GEOLOC_STATE_FIELDS = {
    'wfnu': 'wayfare_number',
    'wfna1': 'wayfare_name',
    'wfna2': 'wayfare_name',
    'wfna3': 'wayfare_name',
    'wfql': 'wayfare_qualifier',
    'wfty': 'wayfare_type',
    'unnu': 'unit_number',
    'unty': 'unit_type',
    'prna1': 'property_name',
    'prna2': 'property_name',
    'inna1': 'institution_name',
    'inna2': 'institution_name',
    'inty': 'institution_type',
    'panu': 'postaddress_number',
    'paty': 'postaddress_type',
    'loc1': 'locality_name',
    'loc2': 'locality_name',
    'locql': 'locality_qualifier',
    'pc': 'postcode',
    'ter1': 'territory',
    'ter2': 'territory',
    'cntr1': 'country',
    'cntr2': 'country',
}

# Separator words that are never copied into an output field.
#
_GEOLOC_SEPARATORS = ('|', ',', '-', '/')

# Fields whose multi-word value is replaced by the lookup-table word when the
# word sequence is a key of the lookup table.
#
_GEOLOC_MULTIWORD_FIELDS = ('locality_name', 'territory', 'country')

# (field, maximum element count, count warning, 'letters only' warning)
#
_GEOLOC_NUMBER_CHECKS = (
    ('wayfare_number', 2, 'More than two wayfare numbers: ',
     'Wayfare number element contains no digits: '),
    ('unit_number', 1, 'More than one unit numbers: ',
     'Unit number element contains no digits: '),
    ('postaddress_number', 1, 'More than one postaddress numbers: ',
     'Postaddress number element contains no digits: '),
)

# (field, required lookup tag, maximum element count, count warning,
#  'unknown word' warning)
#
_GEOLOC_TYPE_CHECKS = (
    ('wayfare_type', 'WT', 1, 'More than one wayfare type: ',
     'Wayfare type word is not known: '),
    ('unit_type', 'UT', 1, 'More than one unit type: ',
     'Unit type word is not known: '),
    ('institution_type', 'IT', 1, 'More than one institution type: ',
     'Institution type word is not known: '),
    ('postaddress_type', 'PA', 2, 'More than two postaddress type: ',
     'Postaddress type word is not known: '),
)

# (field, 'unknown word' warning); both are checked against the 'LQ' tag.
#
_GEOLOC_QUALIFIER_CHECKS = (
    ('wayfare_qualifier', 'Wayfare qualifier word is not known: '),
    ('locality_qualifier', 'Locality qualifier word is not known: '),
)


def _geoloc_word_has_tag(lookup_key, tag):
    """Return True if 'lookup_key' is in the geoloc lookup table and the tag
    string of its entry (element 1) contains 'tag'."""
    if lookup_key not in config.geoloc_lookup_dict:
        return False
    return config.geoloc_lookup_dict[lookup_key][1].find(tag) >= 0


def _geoloc_join_known_sequence(geoloc_dict, field):
    """Replace a multi-word 'field' value by the lookup-table word (element 0
    of the entry) if the word sequence is a key of the lookup table."""
    if field not in geoloc_dict:
        return
    words = geoloc_dict[field]
    if len(words) > 1:  # Only sequences of more than one word are looked up
        seq_key = tuple(words)
        if seq_key in config.geoloc_lookup_dict:
            geoloc_dict[field] = [config.geoloc_lookup_dict[seq_key][0]]


def _geoloc_warn_too_many(geoloc_dict, field, max_count, count_msg):
    """Log a warning if 'field' holds more than 'max_count' elements."""
    if field in geoloc_dict and len(geoloc_dict[field]) > max_count:
        inout.log_message(count_msg + str(geoloc_dict[field]), 'warn')


def _geoloc_warn_no_digits(geoloc_dict, field, alpha_msg):
    """Log one warning if any element of 'field' consists of letters only."""
    values = geoloc_dict.get(field, [])
    for value in values:
        if value.isalpha():
            inout.log_message(alpha_msg + str(values), 'warn')
            break  # One warning per field is enough


def _geoloc_warn_unknown_words(geoloc_dict, field, tag, unknown_msg,
                               split_words):
    """Log one warning if a word of 'field' is not listed with 'tag'.

    With 'split_words' set, each word is split at underscores to form a
    multi-word lookup key; otherwise the word itself is the one-element key.
    """
    values = geoloc_dict.get(field, [])
    for word in values:
        if split_words:
            lookup_key = tuple(word.split('_'))
        else:
            lookup_key = (word,)
        if not _geoloc_word_has_tag(lookup_key, tag):
            inout.log_message(unknown_msg + str(values), 'warn')
            break  # One warning per field is enough


def get_geoloc_hmm(word_list, tag_list):
    """Process input using a HMM to extract geocode and locality output fields.

    USAGE:
      geoloc_dict = get_geoloc_hmm(word_list, tag_list)

    ARGUMENTS:
      word_list  List of words as produced with clean_tag_locality()
      tag_list   Corresponding list of tags as produced with
                 clean_tag_locality()

    DESCRIPTION:
      All permutations of the tag sequence (from mymath.perm_tag_sequence())
      are given to the Viterbi algorithm of 'config.geoloc_hmm' and the state
      sequence with the highest probability is kept. Each word is then
      appended to the output field that belongs to its HMM state; the
      separator words '|', ',', '-' and '/' are dropped. A state that is not
      known only results in a logged warning.

      Multi-word locality, territory and country values are replaced by the
      corresponding word from 'config.geoloc_lookup_dict' if the word
      sequence is listed there. Finally a number of sanity checks are run on
      the output fields; they only log warnings and never modify the result.

      The dictionary returned maps field names to lists of words and can
      contain the following key words:
      - wayfare_number
      - wayfare_name
      - wayfare_qualifier
      - wayfare_type
      - unit_number
      - unit_type
      - property_name
      - institution_name
      - institution_type
      - postaddress_number
      - postaddress_type
      - locality_name
      - locality_qualifier
      - postcode
      - territory
      - country
      - geoloc_hmm_proba (a one-element list with the string form of the
                          probability returned by the Viterbi algorithm for
                          the most likely HMM state sequence, divided by the
                          length of 'tag_list')

    RAISES:
      ZeroDivisionError  If 'tag_list' is empty (the probability is
                         normalised by its length).
      IndexError         If 'word_list' is longer than the best state
                         sequence returned by the HMM.
    """

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    msg = ['  Input tag sequence: ' + str(tag_list), '  Output tag sequences:']
    for t in tag_list_seq:
        msg.append('    ' + str(t))
    inout.log_message(msg, 'v2')

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = config.geoloc_hmm.viterbi(t)
        if (prob > max_prob):
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob

        inout.log_message(
            '  Probability ' + str(prob) + '  for sequence ' + str(t), 'v2')

    inout.log_message([
        '  Best observation sequence: ' + str(best_obs_seq),
        '          with tag sequence: ' + str(best_tag_list)
    ], 'v2')

    # Now process the observation sequence and add elements into dictionary - -
    #
    norm_max_prob = max_prob / float(len(tag_list))  # Normalise max. prob.
    geoloc_dict = {'geoloc_hmm_proba': [str(norm_max_prob)]}

    for i in range(len(word_list)):  # Loop over words and states
        w = word_list[i]
        s = best_obs_seq[i]

        # Do not output commas, vertical bars, hyphens and slashes
        #
        if (w in _GEOLOC_SEPARATORS):
            continue

        field = _GEOLOC_STATE_FIELDS.get(s)
        if (field is None):  # Should never happen
            msg = ['This should never happen!', '  Tag: '+str(s),
                   '  Word: '+w,
                   '  Word list: '+str(word_list),
                   '  tag list:  '+str(tag_list)]
            inout.log_message(msg, 'warn')
        else:
            geoloc_dict.setdefault(field, []).append(w)

    # Check if concatenated locality, territory and country words are in  - - -
    # the lookup-table
    #
    for field in _GEOLOC_MULTIWORD_FIELDS:
        _geoloc_join_known_sequence(geoloc_dict, field)

    # Finally do some tests on the output fields  - - - - - - - - - - - - - - -
    #
    # Check if a value list has more than three elements, if so print out
    #
    for (field, values) in geoloc_dict.items():
        if (len(values) > 3):
            inout.log_message('Geocode/locality output field ' + str(field) +
                              ' contains more than three elements: ' +
                              str(values), 'warn')

    # Check if 'number' elements only contain (alpha-) numerical values - - - -
    # and also check how many numbers in an element
    #
    for (field, max_count, count_msg, alpha_msg) in _GEOLOC_NUMBER_CHECKS:
        _geoloc_warn_too_many(geoloc_dict, field, max_count, count_msg)
        _geoloc_warn_no_digits(geoloc_dict, field, alpha_msg)

    # Check if 'type' elements contain one word only  - - - - - - - - - - - - -
    # if it's a known type word
    #
    for (field, tag, max_count, count_msg, unknown_msg) in \
            _GEOLOC_TYPE_CHECKS:
        _geoloc_warn_too_many(geoloc_dict, field, max_count, count_msg)
        _geoloc_warn_unknown_words(geoloc_dict, field, tag, unknown_msg, True)

    # Check if 'qualifier' elements only contain known qualifier words  - - - -
    #
    for (field, unknown_msg) in _GEOLOC_QUALIFIER_CHECKS:
        _geoloc_warn_unknown_words(geoloc_dict, field, 'LQ', unknown_msg,
                                   False)

    return geoloc_dict
Example #7
0
def tag_geoloc_component(geoloc_str):
    """Tag a geocode locality input component string and make a list.

  USAGE:
    [word_list, tag_list] = tag_geoloc_component(loc_str)

  ARGUMENTS:
    geoloc_str  A string containing the geocode and/or locality components

  DESCRIPTION:
    This routine splits the input string at whitespace into a list of
    elements (words, numbers and separators). Each element of this list is
    assigned one or more tags. A 'greedy tagger' is applied, which checks
    sequences of list elements in the lookup table
    'config.geoloc_lookup_dict' (longer sequences first, starting with
    'config.geoloc_dict_seq_len' elements) and replaces them with the string
    and tag from the lookup table if found. A lookup entry with an empty
    replacement string removes the matched elements without producing
    output.

    Elements not found in the lookup table are tagged as follows:
      'N4'  Number with exactly four digits
      'NU'  Any other number
      'AN'  Alpha-numeric element (letters and digits mixed)
      'HY'  Hyphen
      'CO'  Comma
      'VB'  Vertical bar
      'UN'  Unknown element

    The routine returns two lists: words and their tags
  """

    # First, split input string into elements at spaces - - - - - - - - - - - - -
    #
    org_list = geoloc_str.split()  # The original list from the input string
    inout.log_message('  Initial word list: ' + str(org_list), 'v2')

    tag_list = []  # The initially empty list of tags
    word_list = []  # The initially empty list of words

    while org_list:  # As long as not all elements have been processed
        # Start with the longest sequence that can be a key in the lookup table
        #
        tmp_key = tuple(org_list[:config.geoloc_dict_seq_len])
        tmp_val = []  # Start with empty value (meaning: nothing found)

        while tmp_key:  # As long as key not empty and not found in lookup
            if tmp_key in config.geoloc_lookup_dict:
                tmp_val = config.geoloc_lookup_dict[tmp_key]
                break
            tmp_key = tmp_key[:-1]  # Remove last element in key

        if (tmp_val != []):  # A value has been found in the dictionary
            tmp_len = len(tmp_key)  # Length of found sequence

            if (tmp_val[0] != ''):  # It's not an empty value
                word_list.append(tmp_val[0])  # Append corrected word(s)
                tag_list.append(tmp_val[1])  # Append tag or tags

        else:  # No value has been found in the lookup dictionary, try other tags

            tmp_val = org_list[0]  # Value is first element in the original list
            tmp_len = 1

            if tmp_val.isdigit():  # Element is a number
                if (len(tmp_val) == 4):
                    tmp_tag = 'N4'
                else:
                    tmp_tag = 'NU'

            elif (not tmp_val.isalpha()) and tmp_val.isalnum():  # Alpha-numeric
                tmp_tag = 'AN'

            elif (tmp_val == '-'):  # Element is a hyphen
                tmp_tag = 'HY'

            elif (tmp_val == ','):  # Element is a comma
                tmp_tag = 'CO'

            elif (tmp_val == '|'):  # Element is a vertical bar
                tmp_tag = 'VB'

            else:  # An unknown element
                tmp_tag = 'UN'

            word_list.append(tmp_val)
            tag_list.append(tmp_tag)

        # Finally remove the processed elements from the original element list
        #
        org_list = org_list[tmp_len:]  # Remove processed elements

    return [word_list, tag_list]
Example #8
0
def tagdata():
    """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None. All settings are read from the 'config' module:
      config.in_file_name  Name of the input data file
      config.options[0]    Tagging mode: "name" or "locality"
      config.options[1]    Output training file name
      config.options[2]    Start of block with training records
      config.options[3]    End of block with training records
      config.options[4]    Number of training records
      config.options[5:]   Optional arguments:
                             -nowarn         Activate the no warning flag
                             -v1 / -v2       Verbose output level 1 or 2
                             -l [file]       Activate logging (into file)
                             -hmm file       HMM file used to standardise
                             -retag file     Re-process a tagged training
                                             file (needs '-hmm')
                             -freqs file     Write tag/state pattern
                                             frequencies (needs '-hmm')

  DESCRIPTION:
    Records are selected from the block of lines between the start and end
    record numbers of the input file (randomly, or - with '-retag' - as
    listed in an existing training file). The selected component (name or
    geocode/locality) of each record is cleaned and tagged, and all tag
    sequences are written to the output training file. If a HMM is given,
    the Viterbi state sequence is written for each tag sequence, and all
    sequences but the one with the maximum probability are commented out.

  EXCEPTIONS:
    Exception  Illegal or inconsistent arguments and options
    IOError    A file can not be opened for reading or writing
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 5):
        print '***** Error: %s needs at least six arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Tagging mode: "name" or "locality"'
        print '*****        - Output training file name'
        print '*****        - Start of block with training records'
        print '*****        - End of block with training records'
        print '*****        - Number of training records'
        print '*****          plus options'
        raise Exception()

    in_file_name = config.in_file_name
    out_file_name = config.options[1]

    # The output file name is option 1 (option 2 is the start record number)
    #
    if (in_file_name == out_file_name):
        print '***** Error: Input and output files must differ'
        print '*****        Input file name:          ', in_file_name
        print '*****        Output training file name:', out_file_name
        raise Exception()

    first_rec = int(config.options[2])
    last_rec = int(config.options[3])
    num_rec = int(config.options[4])

    # Check record number values  - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec >= last_rec) or \
       ((num_rec - 1) > (last_rec - first_rec)):
        print '***** Error: Illegal values for training records block:'
        print '*****        - Start of block with training records:', first_rec
        print '*****        - End of block with training records:  ', last_rec
        print '*****        - Number of training records:          ', num_rec
        raise Exception()

    # Range of records in input file, used as upper bound for the random
    # selection below. Must be at least 1, as random.randrange() raises an
    # exception for an empty range.
    #
    rec_range = max(last_rec - first_rec - 1, 1)

    # Open input file and check number of available records - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0
    for line in f_in:
        line_count += 1
    f_in.close()

    if (last_rec > line_count):  # Illegal value for last record
        print '***** Error: Illegal values for last training records:', last_rec
        print '*****        File only contains', line_count, 'lines/records'
        raise Exception()

    # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
    #
    tag_mode = config.options[0]
    if (tag_mode in ['name', 'na', 'n']):
        tag_mode = 'name'
    elif (tag_mode in ['locality', 'localty', 'loc', 'l']):
        tag_mode = 'loc'
    else:
        print '***** Error: Illegal tagging mode:', tag_mode
        print '*****        Must be either "name" or "locality"'
        raise Exception()

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    hmm_file_name = None  # Default: Do not use HMM to standardise training
    #          records
    retag_file_name = None  # Default: Do not retag an existing training file
    config.nowarn = 0  # Deactivate no warning flag (print/log warning
    # messages)
    freqs_file_name = None  # Default: Do not write frequencies, no -freqs option

    if (len(config.options) > 5):
        options = config.options[5:]
        while (options != []):  # Do a loop processing all options

            if (options[0] == '-nowarn'):
                config.nowarn = 1  # Activate no warning flag
                options = options[1:]  # Remove processed '-nowarn' option

            elif (options[0] == '-v1'):
                config.verbose = 1  # Set to verbose output level 1
                options = options[1:]  # Remove processed '-v1' option

            elif (options[0] == '-v2'):
                config.verbose = 2  # Set to verbose output level 2
                options = options[1:]  # Remove processed '-v2' option

            elif (options[0] == '-l'):
                config.logging = 1
                if (len(options) > 1):
                    if (options[1][0] != '-'):  # Not another option, must be
                        # a file name
                        config.log_file = options[1]  # Get name of log file
                        options = options[1:]  # Remove file_name
                options = options[1:]  # Remove processed -'l' option only

                try:
                    f_log = open(config.log_file,
                                 'a')  # Test if file is appendable
                except:
                    print '***** Error ********************',
                    print '***** Cannot write to log file: ' + config.log_file
                    raise IOError()

                # Write (append) header to log file
                #
                f_log.write(os.linesep)
                f_log.write(
                    '##################################################')
                f_log.write('############' + os.linesep)
                f_log.write('#' + os.linesep)
                f_log.write(
                    "# 'pyTagData.py - Version 0.1' process started at: ")
                f_log.write(time.ctime(time.time()) + os.linesep)
                f_log.write('#' + os.linesep)
                f_log.write("# Input file name:  " + in_file_name + os.linesep)
                f_log.write("# Output file name: " + out_file_name +
                            os.linesep)
                f_log.write("# Tagging mode:     " + tag_mode + os.linesep)
                f_log.write(os.linesep)
                f_log.close()

            elif (options[0] == '-hmm'):
                hmm_file_name = options[1]  # Get file name of the HMM to use
                if (hmm_file_name == out_file_name):
                    print '***** Error: HMM file name is the same as output file name!'
                    raise Exception()

                try:
                    f_in = open(hmm_file_name,
                                'r')  # Test if file is available
                except:
                    print '***** Error: Cannot open HMM file specified in "-hmm"',
                    print 'option:', hmm_file_name
                    raise IOError()
                f_in.close()
                options = options[2:]  # Remove processed '-hmm' option and
                # file name

            elif (options[0] == '-retag'):
                # The '-hmm' option may be given before or after '-retag'
                #
                if (hmm_file_name == None) and ('-hmm' not in options):
                    print '***** Error: "-retag" option can only be used together with',
                    print '"-hmm" option (which is not given).'
                    raise Exception()

                retag_file_name = options[1]  # Get file name of the
                # already-tagged file to re-process
                if (retag_file_name == out_file_name):
                    print '***** Error: Retag file name is the same as output file name!'
                    raise Exception()
                elif (retag_file_name == in_file_name):
                    print '***** Error: Retag file name is the same as input file name!'
                    raise Exception()
                elif (retag_file_name == hmm_file_name):
                    print '***** Error: Retag file name is the same as HMM file name!'
                    raise Exception()

                try:
                    f_in = open(retag_file_name,
                                'r')  # Test if file is available

                    # Now gather record numbers and previous tags/states, as well as the
                    # original header information. Use a simple state machine to do this.
                    #
                    # States: -1  Reading the header lines (up to first empty line)
                    #          0  Waiting for the next record header line
                    #          1  Record header found, waiting for its first
                    #             un-commented tag sequence line
                    #
                    tagged_recs = {}  # Record number -> previous tag sequence
                    cleaned_recs = {}  # Record number -> line before the tags
                    original_header_lines = []
                    state = -1  # Header lines state
                    prevline = ''

                    for line in f_in:  # Read training file and process it
                        line = line.strip()

                        if (state == -1) and (len(line) == 0):  # End of header
                            state = 0
                            prevline = line
                            continue

                        if (state == -1) and (len(line) > 0) and \
                           (line[0] == "#"):
                            original_header_lines.append("# " + line)
                            prevline = line
                            continue
                        sline = line.split(' ')

                        # A record header line has the form:
                        #   '# <record number> (<count>): |<component>|'
                        #
                        if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
                           and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
                            try:
                                rec = int(sline[1])  # Original record number
                                tagged_recs[rec] = None
                                cleaned_recs[rec] = None
                                state = 1
                            except:
                                pass  # Not a record number, so no header line
                            prevline = line
                            continue

                        if (state == 1) and (len(line) > 0) and \
                           (line[0] != '#'):
                            tagged_recs[rec] = line
                            cleaned_recs[rec] = prevline
                            state = 0
                            prevline = line
                            continue

                        if (state == 1) and (len(line) > 0):
                            prevline = line
                            continue

                    f_in.close()

                    num_rec = len(tagged_recs)  # Override specified numbers
                    first_rec = 0
                    last_rec = line_count

                except:
                    print '***** Error: Cannot open tagged training file specified',
                    print 'in "-retag" option:', retag_file_name
                    raise IOError()

                options = options[2:]  # Remove processed '-retag' option and
                # file name

            elif (options[0][:5] == '-freq'):
                # The '-hmm' option may be given before or after '-freqs'
                #
                if (hmm_file_name == None) and ('-hmm' not in options):
                    print '***** Error: "-freqs" option can only be used together with',
                    print '"-hmm" option (which is not given).'
                    raise Exception()

                freqs_file_name = options[1]  # File name to write the
                # frequencies to
                if (freqs_file_name == out_file_name):
                    print '***** Error: Frequency file name is the same as output',
                    print 'file name!'
                    raise Exception()
                elif (freqs_file_name == in_file_name):
                    print '***** Error: Frequency file name is the same as input',
                    print 'file name!'
                    raise Exception()
                elif (freqs_file_name == hmm_file_name):
                    print '***** Error: Frequency file name is the same as HMM',
                    print 'file name!'
                    raise Exception()

                options = options[2:]  # Remove processed '-freqs' option and
                # file name
                try:  # Check if file writing is possible
                    freqs_out = open(freqs_file_name, 'w')
                    freqs_out.close()
                except:
                    print '***** Error: Cannot write to frequency output file specified',
                    print 'in "-freqs" option:', freqs_file_name
                    raise IOError()

            else:
                print '***** Error: Illegal option:', options[0]
                raise Exception()

    # If specified initialise and load Hidden Markov Model (HMM)  - - - - - - - -
    #
    if (hmm_file_name != None):
        myhmm = simplehmm.hmm([], [])  # Create new empty HMM object
        myhmm.load_hmm(hmm_file_name)
        myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

    # Open output file and write header - - - - - - - - - - - - - - - - - - - - -
    #
    try:
        f_out = open(out_file_name, 'w')
    except:
        inout.log_message('Cannot open output file: ' + out_file_name, 'err')
        raise IOError()

    f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
                " Version 0.1'"+os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Input file name:  ' + in_file_name + os.linesep)
    f_out.write('# Output file name: ' + out_file_name + os.linesep)
    f_out.write('#' + os.linesep)
    f_out.write('# Parameters:' + os.linesep)
    f_out.write('# - Start of block with training records: '+str(first_rec)+ \
                os.linesep)
    f_out.write('# - End of block with training records:   '+str(last_rec)+ \
                os.linesep)
    f_out.write('# - Number of training records:           '+str(num_rec)+ \
                os.linesep)
    if (hmm_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \
                    os.linesep)
    if (retag_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                    os.linesep)
        f_out.write("#   Header lines from original training file follow:" + \
                    os.linesep)
        for header_line in original_header_lines:
            f_out.write(header_line + os.linesep)
    if (freqs_file_name != None):
        f_out.write('#' + os.linesep)
        f_out.write("# - Tag/state pattern frequencies written to file '" + \
                    freqs_file_name + "'" + os.linesep)
    f_out.write('#' + '-' * 70 + os.linesep)
    f_out.write(os.linesep)

    rec_count = 0  # Number of selected records
    num_rec_left = num_rec  # Number of records to be selected left
    rec_selected = {}  # Dictionary of all record numbers that were selected
    seq_freqs = {}  # Dict to hold examples of tag/state patterns
    line_read = 0  # Number of read lines (defined here for the final log
    # message in case no iteration is done at all)

    unchanged_loop_cnt = 0  # Counter of how many loops have been done
    # without new records being selected
    prev_num_rec_left = num_rec  # Number of records left in the previous
    # iteration

    # Due to the random nature of selecting records, and because sometimes  - - -
    # a selected component can be empty (and is thus not used for training)
    # more than one iteration over the input data set is carried out. In each
    # iteration, records are selected randomly.
    #
    while (rec_count < num_rec):  # Loop until 'num_rec' records selected

        # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        try:
            f_in = open(in_file_name, 'r')
        except:
            inout.log_message('Cannot open input file: ' + in_file_name, 'err')
            raise IOError()

        line_read = 0  # Number of read lines

        # Skip to start of training block - - - - - - - - - - - - - - - - - - - - -
        #
        if (first_rec > 0):
            for i in range(first_rec):
                f_in.readline()

        while (rec_count < num_rec) and (line_read <= (last_rec - first_rec)):
            line = f_in.readline()

            # When re-tagging, take exactly the records listed in the training
            # file, otherwise select randomly (the more records are still
            # needed, the more likely a record is selected)
            #
            if ((retag_file_name != None) and (line_read in tagged_recs)) or \
               ((retag_file_name == None) and \
                (num_rec_left >= random.randrange(0,rec_range,1))):

                line = line.strip()  # Remove line separators
                config.curr_line = line  # Make a copy of the unprocessed current line

                line = line.lower()  # Make all characters lower case

                inout.log_message(
                    ['Record number: ' + str(line_read + first_rec)], 'v1')
                config.curr_line_no = line_read + first_rec  # Store current line number

                # Process line and extract content into components (name, geocode, etc)
                #
                [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
                   inout.process_line(line)

                # Select component and process it - - - - - - - - - - - - - - - - - - -
                #
                if (tag_mode == 'name'):
                    if isinstance(name_comp, list):  # Two name components
                        component = name_comp[0].strip() + ' ' + \
                                    name_comp[1].strip()
                    else:
                        component = name_comp.strip()
                else:  # Locality component
                    component = geocode_comp.strip() + ' ' + \
                                locality_comp.strip()

                if (component != '') and \
                   ((line_read + first_rec) not in rec_selected):

                    if (tag_mode == 'name'):
                        inout.log_message(
                            '  Name component: |' + component + '|', 'v1')

                        component = name.clean_name_component(component)
                        [word_list,
                         tag_list] = name.tag_name_component(component)

                    else:  # Locality component
                        inout.log_message(
                            '  Locality component: |' + component + '|', 'v1')

                        component = locality.clean_geoloc_component(component)
                        [word_list,
                         tag_list] = locality.tag_geoloc_component(component)

                    if (tag_list != []):  # Only process non-empty tag lists

                        # Append record number into dictionary of processed records
                        #
                        rec_selected[line_read + first_rec] = \
                            line_read + first_rec

                        # Create all permutation sequences of this tag list - - - - - - - -
                        #
                        tag_seq = mymath.perm_tag_sequence(tag_list)

                        inout.log_message(['  Word list: '+str(word_list), \
                                           '  Tag list: '+str(tag_list), \
                                           '  Tag sequences:'],'v2')

                        # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - -
                        #
                        if (hmm_file_name != None):

                            state_seq = []  # List containing computed HMM
                            # state sequences
                            max_prob = -1.0  # Maximal probability for a sequence
                            max_seq_no = -1  # Number of the seq. with the max. probability

                            # Now give tag sequences to the HMMs to compute state sequences
                            #
                            i = 0
                            for t in tag_seq:
                                [obs_seq, prob] = myhmm.viterbi(t)
                                state_seq.append(obs_seq)
                                if (prob > max_prob):
                                    max_prob = prob
                                    max_seq_no = i
                                i += 1

                        # Write original component and resulting tag sequences to output
                        #
                        f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                                    '): |'+component+'|'+os.linesep) # Commented original
                        num_len = len(str(line_read + first_rec)) + len(
                            str(rec_count)) + 6

                        f_out.write('#' + num_len * ' ' + '|' +
                                    ' '.join(word_list) + '|' + os.linesep)

                        for i in range(len(tag_seq)):
                            # Convert each tag sequence into a string for file output
                            #
                            seq_string = '  '

                            if (hmm_file_name != None) and (i != max_seq_no):
                                seq_string = '# '  # Comment sequences with not max. probability

                            for j in range(len(tag_seq[i])):

                                if (hmm_file_name != None):
                                    seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                                                 state_seq[i][j]+','
                                else:
                                    seq_string = seq_string + ' ' + \
                                                 tag_seq[i][j] + ':,'

                            f_out.write(seq_string[:-1] +
                                        os.linesep)  # Write without , at end
                            inout.log_message('    ' + seq_string[:-1], 'v2')

                        if (hmm_file_name != None):
                            f_out.write('# Maximum Viterbi probability: %0.5f'% \
                                        (max_prob) + os.linesep)
                            inout.log_message('Maximum Viterbi probability: %0.5f'% \
                                              (max_prob), 'v2')

                        # NOTE(review): 'seq_string' holds the LAST tag sequence
                        # written above, which is not necessarily the one with
                        # the maximum probability - confirm that this is
                        # intended for the comparison and the frequency key
                        # below.
                        #
                        if (retag_file_name != None) and \
                           (tagged_recs[line_read] != None):
                            if (tagged_recs[line_read].strip() !=
                                    seq_string[:-1].strip()):
                                f_out.write("# Note: ***** Changed *****" +
                                            os.linesep)
                                inout.log_message('                      Note:' + \
                                                  ' ***** Changed *****','v2')
                                f_out.write('# Was: ' +
                                            tagged_recs[line_read] +
                                            os.linesep)
                                # Write commented original tag sequence
                                inout.log_message('Original tag sequence: '+ \
                                                  tagged_recs[line_read],'v2')

                        f_out.write(os.linesep)  # Write an empty line
                        inout.log_message(
                            '', 'v1')  # Print empty lines between records

                        if (hmm_file_name != None):
                            seq_key = seq_string[:-1]  # Add sequence to dictionary
                            if (seq_key in seq_freqs):
                                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                                          max_prob])
                            else:
                                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                                      max_prob]]

                        rec_count += 1

                        # Print process indicator message (only for a positive
                        # indicator value, as it is used as a divisor)
                        #
                        if (config.proc_ind > 0) and (rec_count > 0):
                            if (rec_count % config.proc_ind == 0):
                                print 'Processed line', rec_count, 'of', num_rec

            line_read += 1

        f_in.close()

        num_rec_left = num_rec - rec_count

        if (prev_num_rec_left == num_rec_left):  # No new records selected
            unchanged_loop_cnt += 1
        prev_num_rec_left = num_rec_left  # Set to current value

        if (unchanged_loop_cnt > 5):  # Do five loops maximal without selecting
            # new records
            config.curr_line_no = -1  # Set to illegal/empty values, as warning is
            config.curr_line = ''  # not related to the current input line
            inout.log_message(['Can not select more than '+str(rec_count)+ \
                               ' records for training.', \
                               'This is probably due to empty input components.', \
                               'Please reduce value of "num_rec" or increase ' + \
                               'range','between "first_rec" and "last_rec".'],'warn')
            break

        if (num_rec_left < 10):  # Only 10 records left to select
            num_rec_left = num_rec + 1  # Set to more than 100% probability
        elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
            num_rec_left = int(num_rec / 100.0)  # Set to 1%

    f_out.close()

    # If specified, save Viterbi frequencies to a file  - - - - - - - - - - - - -
    #
    if (freqs_file_name != None):
        freqs_out = open(freqs_file_name,
                         'w')  # Open frequency file for writing
        freqs_out.write('# Frequency listing of tag/state patterns written by')
        freqs_out.write('"pyTagData.py - Version 0.1"' + os.linesep)
        freqs_out.write('#' + os.linesep)
        freqs_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
        freqs_out.write('#' + os.linesep)
        freqs_out.write("# Input file name:  " + in_file_name + os.linesep)
        freqs_out.write("# Output file name: " + out_file_name + os.linesep)
        freqs_out.write(os.linesep)
        freqs_out.write('# Parameters:' + os.linesep)
        freqs_out.write('# - Start of block with training records: '+ \
                        str(first_rec)+os.linesep)
        freqs_out.write('# - End of block with training records:   '+ \
                        str(last_rec)+os.linesep)
        freqs_out.write('# - Number of training records:           '+ \
                        str(num_rec)+os.linesep)
        if (hmm_file_name != None):
            freqs_out.write('#' + os.linesep)
            freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                            "' for standardisation"+os.linesep)
        if (retag_file_name != None):
            freqs_out.write('#' + os.linesep)
            freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                            "'"+os.linesep)
        freqs_out.write('#' + '-' * 70 + os.linesep)
        freqs_out.write(os.linesep)

        # Now sort sequences according to their frequencies (ascending, the
        # number of collected examples of a pattern is its frequency)
        #
        sorted_seq_freqs = []
        for key in seq_freqs:
            sorted_seq_freqs.append((len(seq_freqs[key]), key))
        sorted_seq_freqs.sort()

        for skey in sorted_seq_freqs:
            key = skey[1]
            freqs_out.write('# Pattern: ' + str(key) + os.linesep)
            freqs_out.write('# Frequency: ' + str(skey[0]) + os.linesep)
            examples = seq_freqs[key]
            freqs_out.write('# Maximum Viterbi probability: '+ \
                            str(examples[0][1])+os.linesep)
            freqs_out.write('# Examples: ' + os.linesep)
            for example in examples:
                freqs_out.write('#    ' + str(example[0]) + os.linesep)
            freqs_out.write(str(key) + os.linesep)
            freqs_out.write(os.linesep)
        freqs_out.close()

    inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                      str(rec_count)+' lines', 'End.'],'v1')
Example #9
0
def trainhmm():
    """Main routine, open file, read lines, train HMM and save it to file.

  USAGE:
    trainhmm()

  ARGUMENTS:
    None (all settings are taken from 'config.options', see below)

  DESCRIPTION:
    Reads a file with training records, trains a hidden Markov model (HMM)
    with them and saves the trained HMM into a file.

    'config.options' is expected to contain:
      [0]   Tagging mode: "name" or "locality" (abbreviations are accepted)
      [1]   Input training file name
      [2]   HMM output file name
      [3:]  Optional flags:
              -nowarn          Activate the no warning flag
              -v1 / -v2        Verbose output level 1 or 2
              -l [file_name]   Log into a file (optionally give its name)
              -s method        Smoothing: 'laplace' or 'absdiscount'

    Lines in the training file that start with '#' and empty lines are
    skipped. Every other line is one training record, made of comma
    separated 'tag:state' pairs.

  RAISES:
    Exception   for illegal arguments, options, states or tags
    ValueError  for a training record element that is not a 'tag:state' pair
    IOError     if the log file or the input file cannot be opened
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 3):
        print '***** Error: %s needs at least four arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Tagging mode: "name" or "locality"'
        print '*****        - Input training file name'
        print '*****        - HMM output file name'
        print '*****          plus options'
        raise Exception()

    in_file_name = config.options[1]
    hmm_file_name = config.options[2]

    if (in_file_name == hmm_file_name):
        print '*** Error: Input and output files must differ'
        print '***        Input training file name:', in_file_name
        print '***        HMM output file name:    ', hmm_file_name
        raise Exception()

    # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - -
    #
    tag_mode = config.options[0]
    if (tag_mode in ['name', 'na', 'n']):
        tag_mode = 'name'
    elif (tag_mode in ['locality', 'lolty', 'loc', 'l']):
        tag_mode = 'loc'
    else:
        print '***** Error: Illegal tagging mode:', tag_mode
        print '*****        Must be either "name" or "locality"'
        raise Exception()

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    smoothing = None  # Default: No smoothing
    config.nowarn = 0  # Deactivate no warning flag (print/log warning
    # messages)

    options = config.options[3:]
    while options:  # Do a loop processing all options

        if (options[0] == '-nowarn'):
            config.nowarn = 1  # Activate no warning flag
            options = options[1:]  # Remove processed '-nowarn' option

        elif (options[0] == '-v1'):
            config.verbose = 1  # Set to verbose output level 1
            options = options[1:]  # Remove processed '-v1' option

        elif (options[0] == '-v2'):
            config.verbose = 2  # Set to verbose output level 2
            options = options[1:]  # Remove processed '-v2' option

        elif (options[0] == '-l'):
            config.logging = 1
            # A following element that is not another option is the file name.
            # Slicing (instead of indexing) keeps an empty string from raising.
            if (len(options) > 1) and (options[1][:1] != '-'):
                config.log_file = options[1]  # Get name of log file
                options = options[1:]  # Remove file_name
            options = options[1:]  # Remove processed -'l' option only

            try:
                f_log = open(config.log_file, 'a')  # Test if file is appendable
            except (IOError, OSError):
                print '***** Error ********************',
                print '***** Cannot write to log file: ' + config.log_file
                raise IOError()

            # Write (append) header to log file
            #
            f_log.write(os.linesep)
            f_log.write('##################################################')
            f_log.write("############" + os.linesep)
            f_log.write("#" + os.linesep)
            f_log.write("# 'pyTrainHMM.py - Version 0.1' process started at: ")
            f_log.write(time.ctime(time.time()) + os.linesep)
            f_log.write("#" + os.linesep)
            f_log.write("# Input file name: " + in_file_name + os.linesep)
            f_log.write("# HMM file name:   " + hmm_file_name + os.linesep)
            f_log.write(os.linesep)
            f_log.close()

        elif (options[0] == '-s'):
            if (len(options) < 2):  # The smoothing method is mandatory
                print "*** Error: Option '-s' needs a 'smoothing' argument"
                print "***        Possible are: 'laplace' or 'absdiscount'"
                raise Exception()

            smoothing = options[1]
            if (smoothing in ['l', 'la', 'lap', 'laplac', 'laplace']):
                smoothing = 'laplace'
            elif (smoothing in ['a', 'ad', 'abs', 'absd', 'absdis', 'absdisc',
                                'absdiscount']):
                smoothing = 'absdiscount'
            else:  # Illegal value
                print "*** Error: Illegal value for 'smoothing' argument:", smoothing
                print "***        Possible are: 'laplace' or 'absdiscount'"
                raise Exception()

            options = options[2:]  # Remove processed option and its value

        else:
            print '*** Error: Illegal option:', options[0]
            raise Exception()

    # Get HMM states and observations from configuration module - - - - - - - - -
    #
    if (tag_mode == 'name'):
        state_list = config.name_hmm_states
        obser_list = config.name_hmm_obser
    else:
        state_list = config.geoloc_hmm_states
        obser_list = config.geoloc_hmm_obser

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except (IOError, OSError):
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0  # Counter for lines read
    rec_count = 0  # Counter for training records read

    # Read lines, discard comment lines and process training data lines - - - - -
    #
    train_list = []  # List of training sequences, one list of (state, tag)
    # tuples per training record

    try:  # Make sure the input file is closed even if a record is rejected
        for line in f_in:

            if (line[0] != '#') and (line.strip() != ''):
                # Line must contain a training record

                line = line.strip()  # Remove line separators
                config.curr_line = line  # Make a copy of the current line

                inout.log_message(['Record number: ' + str(rec_count)], 'v1')
                config.curr_line_no = line_count  # Store current line number

                position = 'Line: ' + str(line_count) + ', record: ' + \
                           str(rec_count)

                line_data = []  # Training data list for one training record

                for elem in line.split(','):
                    pair = elem.split(':')  # Split into key and value
                    if (len(pair) != 2):
                        msg = ['Illegal element in training record: ' + elem,
                               position,
                               'Expected format: tag:state']
                        inout.log_message(msg, 'err')
                        raise ValueError(msg[0])

                    tag = pair[0].strip()
                    state = pair[1].strip()
                    line_data.append((state, tag))

                    if (state not in state_list):
                        msg = ['Illegal state name in training record: ' + state,
                               position,
                               'Possible values: ' + str(state_list)]
                        inout.log_message(msg, 'err')
                        raise Exception()

                    if (tag not in obser_list):
                        msg = ['Illegal observation (tag) name in training record: ' + tag,
                               position,
                               'Possible values: ' + str(obser_list)]
                        inout.log_message(msg, 'err')
                        raise Exception()

                inout.log_message('  Training record ' + str(rec_count) + ':' +
                                  str(line_data), 'v1')

                train_list.append(line_data)

                rec_count += 1
                inout.log_message('', 'v1')  # Print empty lines between records

            line_count += 1

    finally:
        # Close input file  - - - - - - - - - - - - - - - - - - - - - - - - - - -
        #
        f_in.close()

    inout.log_message('', 'v1')  # Print empty lines between records

    # Initalise HMM and train it with training data - - - - - - - - - - - - - - -
    #
    myhmm = simplehmm.hmm(state_list, obser_list)

    myhmm.train(train_list, smoothing)
    myhmm.print_hmm()

    # Save trained HMM  - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    myhmm.save_hmm(hmm_file_name)

    inout.log_message(['Read ' + str(line_count) + ' lines, processed ' +
                       str(rec_count) + ' training records', 'End.'], 'v1')
Example #10
0
# Validate every 'original_input' output field and add the field number or
# column range it names to the list of input component values
#
input_values = input_component.values()
input_len = -1  # Length of the input: number of fields for CSV and TAB files,
# number of characters for COL files

output_keys = output_field.keys()
for k in output_keys:
    if (k[:14] != 'original_input'):
        continue  # Some other output field, nothing to check

    v = k[14:].strip()
    if (v == ''):
        continue  # No field or column range given

    # The definition has to be enclosed in square brackets
    #
    if not ((v[0] == '[') and (v[-1] == ']')):
        inout.log_message('Wrong input component definition: '+str(k) + \
                          ' for "original_input" output field','err')
        raise Exception()
    v = v[1:-1]  # Remove brackets

    if (v[0] == '(') and (v[-1] == ')'):  # It's a tuple
        v = v[1:-1]  # Remove tuple brackets

    v = [int(number) for number in v.split(',')]  # Make a list of integers

    if (len(v) == 2):  # Two integers, must be a column range
        input_values.append([(v[0], v[1])])  # Append as a tuple
    elif (len(v) == 1):  # One integer only, must be a field number
        input_values.append(v)  # Append 'original_input' field number
    else:
        inout.log_message('Wrong input component value: '+str(k) + \
                          ' for "original_input" output field','err')
        raise Exception()
Example #11
0
def standard():
  """Main routine, open file, read lines, standardise them and write into file.

  USAGE:
    standard()

  ARGUMENTS:
    None (all settings are taken from the 'config' module, see below)

  DESCRIPTION:
    Reads 'num_rec' records, starting at record 'first_rec', from the input
    file 'config.in_file_name', standardises the name, geocode/locality and
    date components of each record and writes one standardised line per
    record into 'config.out_file_name'.

    'config.options' is expected to contain:
      [0]   Number of the first record to be processed
      [1]   Number of records to be processed
      [2:]  Optional flags:
              -nowarn          Activate the no warning flag
              -v1 / -v2        Verbose output level 1 or 2
              -l [file_name]   Log into a file (optionally give its name)
              -h               Write a header line with the output field names
              -hmm-name file   Standardise names with the HMM loaded from file
              -hmm-loc file    Standardise geocode/locality with the HMM
                               loaded from file

  RAISES:
    Exception  for illegal arguments or options, for an illegal record range
               and for standardisation methods that are not available
    IOError    if the log file, a HMM file, the input file or the output
               file cannot be opened
  """

  # Process command line arguments and check for correctness  - - - - - - - - -
  #
  if (len(config.options) < 2):
    print '***** Error: %s needs at least three arguments:'% (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Number of the first record to be processed'
    print '*****        - Number of records to be processed'
    print '*****          plus options'
    raise Exception()

  first_rec = int(config.options[0])
  num_rec   = int(config.options[1])
  in_file_name = config.in_file_name
  out_file_name = config.out_file_name

  # Check for optional arguments and process if any - - - - - - - - - - - - - -
  #
  config.verbose = 0  # Default: No verbose output
  config.logging = 0  # Default: No logging into a file
  write_header   = 0  # Write header (output field names) to output file
                      # (default: Don't)
  config.nowarn  = 0  # Deactivate no warning flag (print/log warning messages)

  options = config.options[2:]
  while options:  # Do a loop processing all options

    if (options[0] == '-nowarn'):
      config.nowarn = 1  # Activate no warning flag
      options = options[1:]  # Remove processed '-nowarn' option

    elif (options[0] == '-v1'):
      config.verbose = 1  # Set to verbose output level 1
      options = options[1:]  # Remove processed '-v1' option

    elif (options[0] == '-v2'):
      config.verbose = 2  # Set to verbose output level 2
      options = options[1:]  # Remove processed '-v2' option

    elif (options[0] == '-l'):
      config.logging = 1
      # A following element that is not another option is the file name.
      # Slicing (instead of indexing) keeps an empty string from raising.
      if (len(options) > 1) and (options[1][:1] != '-'):
        config.log_file = options[1]  # Get name of log file
        options = options[1:]  # Remove file_name
      options = options[1:]  # Remove processed -'l' option only

      try:
        f_log = open(config.log_file,'a')  # Test if file is appendable
      except (IOError, OSError):
        print '***** Error ********************',
        print '***** Cannot write to log file:', config.log_file
        raise IOError()

      # Write (append) header to log file
      #
      f_log.write(os.linesep)
      f_log.write('##################################################')
      f_log.write("############"+os.linesep)
      f_log.write("#"+os.linesep)
      f_log.write("# 'pyStandard.py - Version 0.1' process started at: ")
      f_log.write(time.ctime(time.time())+os.linesep)
      f_log.write("#"+os.linesep)
      f_log.write("# Input file name:  "+in_file_name+os.linesep)
      f_log.write("# Output file name: "+out_file_name+os.linesep)
      f_log.write(os.linesep)
      f_log.close()

    elif (options[0] == '-h'):
      write_header = 1
      options = options[1:]  # Remove processed -'h' option

    elif (options[0] == '-hmm-name'):
      if (len(options) < 2):  # The HMM file name is mandatory
        print '***** Error: Option "-hmm-name" needs a HMM file name'
        raise Exception()

      hmm_name_file = options[1]  # Get file name of the name HMM to use
      try:
        f_in = open(hmm_name_file,'r')  # Test if file is available
      except (IOError, OSError):
        print '***** Error ********************',
        print '***** Cannot open HMM file in "-hmm-name" option:',
        print hmm_name_file
        raise IOError()

      f_in.close()
      options = options[2:]  # Remove processed option and file name
      config.name_standard_method = 'hmm'
      config.name_hmm_file_name = hmm_name_file
      config.name_hmm = simplehmm.hmm([],[])  # Create new empty HMM object
      config.name_hmm.load_hmm(config.name_hmm_file_name)

    elif (options[0] == '-hmm-loc'):
      if (len(options) < 2):  # The HMM file name is mandatory
        print '***** Error: Option "-hmm-loc" needs a HMM file name'
        raise Exception()

      hmm_loc_file = options[1]  # Get file name of the locality HMM to use
      try:
        f_in = open(hmm_loc_file,'r')  # Test if file is available
      except (IOError, OSError):
        print '***** Error ********************',
        print '***** Cannot open HMM file in "-hmm-loc" option:',
        print hmm_loc_file
        raise IOError()

      f_in.close()
      options = options[2:]  # Remove processed option and file name
      # Assignment (the original compared with '==', which had no effect)
      config.geoloc_standard_method = 'hmm'
      config.geoloc_hmm_file_name = hmm_loc_file
      config.geoloc_hmm = simplehmm.hmm([],[])  # Create new HMM object
      config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name)

    else:
      print '***** Error: Illegal option:', options[0]
      raise Exception()

  # Open input file and check number of available records - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except (IOError, OSError):
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0
  for line in f_in:
    line_count += 1
  f_in.close()

  if ((first_rec+num_rec) > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for number of records to process:',
    print num_rec, ', with start record:', first_rec
    print '*****        File only contains',line_count, 'lines/records'
    raise Exception()

  # Open files  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except (IOError, OSError):
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  try:
    f_out = open(out_file_name,'w')
  except (IOError, OSError):
    f_in.close()  # Do not leave the input file open
    inout.log_message('Cannot open output file: '+out_file_name,'err')
    raise IOError()

  try:  # Make sure both files are closed even if a record cannot be processed

    # Write header (name of output fields) into output file - - - - - - - - - -
    #
    if (write_header == 1):
      header_dict = {}
      for n in config.output_field_names:
        header_dict[n] = n  # Dictionary where values are field names

      header_line = inout.compose_line(header_dict,header=1)
      f_out.write(header_line+os.linesep)

    # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    for i in range(first_rec):
      f_in.readline()

    # Read lines, process them and write into output files  - - - - - - - - - -
    #
    line_read = 0  # Number of read lines

    while (line_read < num_rec):  # Loop until 'num_rec' records processed
      line = f_in.readline()

      # Print process indicator message (only if activated; a value of zero
      # must not be used as a divisor)
      #
      if (config.proc_ind > 0) and (line_read > 0) and \
         (line_read % config.proc_ind == 0):
        print 'Processed line', line_read, 'of', num_rec

      line = line.strip()  # Remove line separators
      config.curr_line = line  # Make a copy of the unprocessed current line

      line = line.lower()  # Make all characters lower case

      inout.log_message(['Record '+str(line_read+first_rec)],'v1')
      config.curr_line_no = line_read+first_rec  # Store current line number

      # Process line and extract content into components (name, geocode, etc.)
      #
      [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
             inout.process_line(line)

      # Make a local empty working copy of the output field dictionary  - - - -
      #
      output_fields = config.output_field.copy()
      for k in output_fields.keys():
        output_fields[k] = ''  # Set all fields to an empty string

      # Standardise name component  - - - - - - - - - - - - - - - - - - - - - -
      #
      if isinstance(name_comp, list):  # Givenname and surname separate

        givenname_comp = name_comp[0].strip()
        surname_comp   = name_comp[1].strip()

        if (givenname_comp != ''):  # There is a givenname  - - - - - - - - - -

          inout.log_message('  Givenname component: |'+givenname_comp+'|','v1')

          givenname_comp = name.clean_name_component(givenname_comp)
          [name_list, tag_list] = name.tag_name_component(givenname_comp)
          output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                          tag_list)
          [name_list, tag_list, output_fields['title']] = \
                                           name.get_title(name_list, tag_list)

          [output_fields['givenname'], output_fields['alt_givenname']] = \
                         name.get_name_component(name_list, tag_list, 'gname')

        if (surname_comp != ''):  # There is a surname  - - - - - - - - - - - -

          inout.log_message('  Surname component: |'+surname_comp+'|','v1')

          surname_comp = name.clean_name_component(surname_comp)
          [name_list, tag_list] = name.tag_name_component(surname_comp)
          [output_fields['surname'], output_fields['alt_surname']] = \
                          name.get_name_component(name_list, tag_list, 'sname')

      elif (name_comp.strip() != ''):  # Given- and surname both in one field -

        inout.log_message('  Name component: |'+name_comp+'|','v1')

        name_comp = name.clean_name_component(name_comp)
        [name_list, tag_list] = name.tag_name_component(name_comp)

        output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                        tag_list)

        [name_list, tag_list, output_fields['title']] = \
                                          name.get_title(name_list, tag_list)

        if (config.name_standard_method == 'rules'):
          name_dict = name.get_names_rules(name_list, tag_list, 'gname')

        elif (config.name_standard_method == 'hmm'):
          name_dict = name.get_names_hmm(name_list, tag_list)

        else:
          inout.log_message('Illegal name standardisation method: '+ \
                            config.name_standard_method,'err')
          raise Exception()

        for (field,value) in name_dict.items(): # Assign to output dictionary
          output_fields[field] = value

      # Standardise geocode and locality components using HMM - - - - - - - - -
      #
      if (config.geoloc_standard_method == 'hmm') and \
         ((geocode_comp.strip() != '') or (locality_comp.strip() != '')):

        geoloc_comp = geocode_comp.strip()+' '+locality_comp.strip()
        inout.log_message('  Geocode and locality component: |'+ \
                          geoloc_comp+'|','v1')

        geoloc_comp = locality.clean_geoloc_component(geoloc_comp)
        [geoloc_words, geoloc_tags] = \
                                locality.tag_geoloc_component(geoloc_comp)

        if (geoloc_words != []):  # Component not empty, do HMM standardisation

          geoloc_dict = locality.get_geoloc_hmm(geoloc_words,geoloc_tags)

          for (field,value) in geoloc_dict.items(): # Assign to output dict.
            output_fields[field] = value

      # Standardise geocode component using rules - - - - - - - - - - - - - - -
      #
      elif (config.geoloc_standard_method == 'rules') and \
           (geocode_comp.strip() != ''):
        inout.log_message('  Geocode component: |'+geocode_comp+'|','v1')

        ### TO BE DONE
        inout.log_message('Rules based standardisation for geocode is ' + \
                          'not implemented yet','err')
        raise Exception()

      # Standardise locality component using rules  - - - - - - - - - - - - - -
      #
      elif (config.geoloc_standard_method == 'rules') and \
           (locality_comp.strip() != ''):
        inout.log_message('  Locality component: |'+locality_comp+'|','v1')

        ### TO BE FINALISED
        inout.log_message('Rules based standardisation for locality is ' + \
                          'not implemented yet','err')
        raise Exception()

# Draft of the rules based locality standardisation (not active yet):
#
#      locality_comp = locality.clean_geoloc_component(locality_comp)
#      [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp)
#
#      [terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags)
#      if (terr != ''):
#        output_fields['territory'] = terr
#
#      [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2)
#      if (pc != ''):
#        output_fields['postcode'] = pc
#
#      [loc_name, loc_quali, loc_words4, loc_tags4] = \
#         locality.get_localityname_qualifier(loc_words3, loc_tags3)
#      if (loc_name != ''):
#        output_fields['locality_name'] = loc_name
#      if (loc_quali != ''):
#        output_fields['locality_quali'] = loc_quali
#
#      if (loc_words4 != []):  # Not all words are standardised yet
#        print '  # Remaining word list:', loc_words4  ###### TEST
#        print '  # Remaining tag list: ', loc_tags4   ###### TEST

      # Standardise date strings  - - - - - - - - - - - - - - - - - - - - - - -
      # (a value of -1 returned by the date parser leaves the field empty)
      #
      if (date1_comp != ''):
        inout.log_message('  Date1 component: |'+date1_comp+'|','v1')

        [day1,month1,year1,status1] = date.parse_datestr(date1_comp)
        if (day1 != -1):
          output_fields['day1'] = str(day1)
        if (month1 != -1):
          output_fields['month1'] = str(month1)
        if (year1 != -1):
          output_fields['year1'] = str(year1)

      if (date2_comp != ''):
        inout.log_message('  Date2 component: |'+date2_comp+'|','v1')

        [day2,month2,year2,status2] = date.parse_datestr(date2_comp)
        if (day2 != -1):
          output_fields['day2'] = str(day2)
        if (month2 != -1):
          output_fields['month2'] = str(month2)
        if (year2 != -1):
          output_fields['year2'] = str(year2)

      # Create log message of output fields - - - - - - - - - - - - - - - - - -
      #
      msg = ['  Standardised record output fields:']
      for (field,value) in output_fields.items():
        if (value != '') and (value != []):
          msg.append('    '+field+':'+str(value))
      inout.log_message(msg,'v1')

      # Save standardised record into output field
      #
      out_line = inout.compose_line(output_fields)
      f_out.write(out_line+os.linesep)

      # Increment line counter and go to beginning of loop  - - - - - - - - - -
      #
      line_read += 1

      inout.log_message('','v1')  # Print empty lines between records

  finally:
    # Close files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    f_in.close()
    f_out.close()

  msg = ['','Number of warnings: '+str(config.num_warning), \
         'Number of corrected word spillings: '+str(config.num_word_spills)]
  inout.log_message(msg,'v1')

  print msg[1]
  print msg[2]

  inout.log_message('End.','v1')
Example #12
0
def standard():
    """Main routine, open file, read lines, standardise them and write into file.

  USAGE:
    standard()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

    # Process command line arguments and check for correctness  - - - - - - - - -
    #
    if (len(config.options) < 2):
        print '***** Error: %s needs at least three arguments:' % (sys.argv[0])
        print '*****        - Name of the project module'
        print '*****        - Number of the first record to be processed'
        print '*****        - Number of records to be processed'
        print '*****          plus options'
        raise Exception()

    first_rec = int(config.options[0])
    num_rec = int(config.options[1])
    in_file_name = config.in_file_name
    out_file_name = config.out_file_name

    # Check for optional arguments and process if any - - - - - - - - - - - - - -
    #
    config.verbose = 0  # Default: No verbose output
    config.logging = 0  # Default: No logging into a file
    write_header = 0  # Write header (output field names) to output file
    # (default: Don't)
    config.nowarn = 0  # Deactivate no warning flag (print/log warning messages)

    if (len(config.options) > 2):
        options = config.options[2:]
        while (options != []):  # Do a loop processing all options

            if (options[0] == '-nowarn'):
                config.nowarn = 1  # Activate no warning flag
                options = options[1:]  # Remove processed '-nowarn' option

            elif (options[0] == '-v1'):
                config.verbose = 1  # Set to verbose output level 1
                options = options[1:]  # Remove processed '-v1' option

            elif (options[0] == '-v2'):
                config.verbose = 2  # Set to verbose output level 2
                options = options[1:]  # Remove processed '-v2' option

            elif (options[0] == '-l'):
                config.logging = 1
                if (len(options) > 1):
                    if (options[1][0] !=
                            '-'):  # Not another option, must be a file name
                        config.log_file = options[1]  # Get name of log file
                        options = options[1:]  # Remove file_name
                options = options[1:]  # Remove processed -'l' option only

                try:
                    f_log = open(config.log_file,
                                 'a')  # Test if file is appendable
                except:
                    print '***** Error ********************',
                    print '***** Cannot write to log file:', config.log_file
                    raise IOError()

                # Write (append) header to log file
                #
                f_log.write(os.linesep)
                f_log.write(
                    '##################################################')
                f_log.write("############" + os.linesep)
                f_log.write("#" + os.linesep)
                f_log.write(
                    "# 'pyStandard.py - Version 0.1' process started at: ")
                f_log.write(time.ctime(time.time()) + os.linesep)
                f_log.write("#" + os.linesep)
                f_log.write("# Input file name:  " + in_file_name + os.linesep)
                f_log.write("# Output file name: " + out_file_name +
                            os.linesep)
                f_log.write(os.linesep)
                f_log.close()

            elif (options[0] == '-h'):
                write_header = 1
                options = options[1:]  # Remove processed -'h' option

            elif (options[0] == '-hmm-name'):
                hmm_name_file = options[
                    1]  # Get file name of the name HMM to use
                try:
                    f_in = open(hmm_name_file,
                                'r')  # Test if file is available
                except:
                    print '***** Error ********************',
                    print '***** Cannot open HMM file in "-hmm-name" option:',
                    print hmm_name_file
                    raise IOError()

                f_in.close()
                options = options[2:]  # Remove processed option and file name
                config.name_standard_method = 'hmm'
                config.name_hmm_file_name = hmm_name_file
                config.name_hmm = simplehmm.hmm(
                    [], [])  # Create new empty HMM object
                config.name_hmm.load_hmm(config.name_hmm_file_name)

            elif (options[0] == '-hmm-loc'):
                hmm_loc_file = options[
                    1]  # Get file name of the locality HMM to use
                try:
                    f_in = open(hmm_loc_file, 'r')  # Test if file is available
                except:
                    print '***** Error ********************',
                    print '***** Cannot open HMM file in "-hmm-loc" option:',
                    print hmm_loc_file
                    raise IOError()
                f_in.close()
                options = options[2:]  # Remove processed option and file name
                config.geoloc_standard_method == 'hmm'
                config.geoloc_hmm_file_name = hmm_loc_file
                config.geoloc_hmm = simplehmm.hmm([],
                                                  [])  # Create new HMM object
                config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name)

            else:
                print '***** Error: Illegal option:', options[0]
                raise Exception()

    # Open input file and check number of available records - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    line_count = 0
    for line in f_in.xreadlines():
        line_count += 1
    f_in.close()

    if ((first_rec + num_rec) > line_count):  # Illegal value for last record
        print '***** Error: Illegal values for number of records to process:',
        print num__rec, ', with start record:', start_rec
        print '*****        File only contains', line_count, 'lines/records'
        raise Exception()

    # Open files  - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
        f_in = open(in_file_name, 'r')
    except:
        inout.log_message('Cannot open input file: ' + in_file_name, 'err')
        raise IOError()

    try:
        f_out = open(out_file_name, 'w')
    except:
        inout.log_message('Cannot open output file: ' + out_file_name, 'err')
        raise IOError()

    # Write header (name of output fields) into output file - - - - - - - - - - -
    #
    if (write_header == 1):
        header_dict = {}
        for n in config.output_field_names:
            header_dict.update({n:
                                n})  # Dictionary where values are field names

        header_line = inout.compose_line(header_dict, header=1)
        f_out.write(header_line + os.linesep)

    # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
        for i in range(first_rec):
            f_in.readline()

    # Read lines, process them and write into output files  - - - - - - - - - - -
    #
    line_read = 0  # Number of read lines

    while (line_read < num_rec):  # Loop until 'num_rec' records processed
        line = f_in.readline()

        # Print process indicator message
        #
        if (config.proc_ind >= 0) and (line_read >
                                       0):  # Only print if activated
            if (line_read % config.proc_ind == 0):
                print 'Processed line', line_read, 'of', num_rec

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line

        line = line.lower()  # Make all characters lower case

        inout.log_message(['Record ' + str(line_read + first_rec)], 'v1')
        config.curr_line_no = line_read + first_rec  # Store current line number

        # Process line and extract content into components (name, geocode, etc.)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
               inout.process_line(line)

        # Make a local empty working copy of the output field dictionary  - - - - -
        #
        output_fields = config.output_field.copy()
        output_fields_keys = output_fields.keys()
        for k in output_fields_keys:
            output_fields[k] = ''  # Set all fields to an empty string

        # Standardise name component  - - - - - - - - - - - - - - - - - - - - - - -
        #
        if (type(name_comp) == types.ListType
            ):  # Givenname and surname separate

            givenname_comp = name_comp[0].strip()
            surname_comp = name_comp[1].strip()

            if (givenname_comp !=
                    ''):  # There is a givenname  - - - - - - - - - - -

                inout.log_message(
                    '  Givenname component: |' + givenname_comp + '|', 'v1')

                givenname_comp = name.clean_name_component(givenname_comp)
                [name_list, tag_list] = name.tag_name_component(givenname_comp)
                output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                                tag_list)
                [name_list, tag_list, output_fields['title']] = \
                                                 name.get_title(name_list, tag_list)

                [output_fields['givenname'], output_fields['alt_givenname']] = \
                               name.get_name_component(name_list, tag_list, 'gname')

            if (surname_comp !=
                    ''):  # There is a surname  - - - - - - - - - - - - -

                inout.log_message(
                    '  Surname component: |' + surname_comp + '|', 'v1')

                surname_comp = name.clean_name_component(surname_comp)
                [name_list, tag_list] = name.tag_name_component(surname_comp)
                [output_fields['surname'], output_fields['alt_surname']] = \
                                name.get_name_component(name_list, tag_list, 'sname')

        elif (name_comp.strip() !=
              ''):  # Given- and surname both in one field - -

            inout.log_message('  Name component: |' + name_comp + '|', 'v1')

            name_comp = name.clean_name_component(name_comp)
            [name_list, tag_list] = name.tag_name_component(name_comp)

            output_fields['gender_guess'] = name.get_gender_guess(
                name_list, tag_list)

            [name_list, tag_list, output_fields['title']] = \
                                              name.get_title(name_list, tag_list)

            if (config.name_standard_method == 'rules'):
                name_dict = name.get_names_rules(name_list, tag_list, 'gname')

            elif (config.name_standard_method == 'hmm'):
                name_dict = name.get_names_hmm(name_list, tag_list)

            else:
                inout.log_message('Illegal name standardisation method:'+ \
                                  config.name_standard_method,'err')
                raise Exception()

            for (field,
                 value) in name_dict.items():  # Assign to output dictionary
                output_fields[field] = value

        # Standardise geocode and locality components using HMM - - - - - - - - - -
        #
        if (config.geoloc_standard_method == 'hmm') and \
           ((geocode_comp.strip() != '') or (locality_comp.strip() != '')):

            geoloc_comp = geocode_comp.strip() + ' ' + locality_comp.strip()
            inout.log_message('  Geocode and locality component: |'+geoloc_comp+'|',\
                              'v1')

            geoloc_comp = locality.clean_geoloc_component(geoloc_comp)
            [geoloc_words,
             geoloc_tags] = locality.tag_geoloc_component(geoloc_comp)

            if (geoloc_words !=
                []):  # Component not empty, do HMM standardisation

                geoloc_dict = locality.get_geoloc_hmm(geoloc_words,
                                                      geoloc_tags)

                for (field, value
                     ) in geoloc_dict.items():  # Assign to output dictionary
                    output_fields[field] = value

        # Standardise geocode component using rules - - - - - - - - - - - - - - - -
        #
        elif (config.geoloc_standard_method == 'rules') and \
             (geocode_comp.strip() != ''):
            inout.log_message('  Geocode component: |' + geocode_comp + '|',
                              'v1')

            ### TO BE DONE
            inout.log_message('Rules based standardisation for geocode is' + \
                              'not implemented yet','err')
            raise Exception()

        # Standardise locality component using rules  - - - - - - - - - - - - - - -
        #
        elif (config.geoloc_standard_method == 'rules') and \
             (locality_comp.strip() != ''):
            inout.log_message('  Locality component: |' + locality_comp + '|',
                              'v1')

            ### TO BE FINALISED
            inout.log_message('Rules based standardisation for locality is' + \
                              'not implemented yet','err')
            raise Exception()

#      locality_comp = locality.clean_geoloc_component(locality_comp)
#      [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp)
#
#      [terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags)
#      if (terr != ''):
#        output_fields['territory'] = terr
#
#      [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2)
#      if (pc != ''):
#        output_fields['postcode'] = pc
#
#      [loc_name, loc_quali, loc_words4, loc_tags4] = \
#         locality.get_localityname_qualifier(loc_words3, loc_tags3)
#      if (loc_name != ''):
#        output_fields['locality_name'] = loc_name
#      if (loc_quali != ''):
#        output_fields['locality_quali'] = loc_quali
#
#      if (loc_words4 != []):  # Not all words are standardised yet
#        print '  # Remaining word list:', loc_words4  ###### TEST
#        print '  # Remaining tag list: ', loc_tags4   ###### TEST

# Standardise date strings  - - - - - - - - - - - - - - - - - - - - - - - -
#
        if (date1_comp != ''):
            inout.log_message('  Date1 component: |' + date1_comp + '|', 'v1')

            [day1, month1, year1, status1] = date.parse_datestr(date1_comp)
            if (day1 != -1):
                output_fields['day1'] = str(day1)
            if (month1 != -1):
                output_fields['month1'] = str(month1)
            if (year1 != -1):
                output_fields['year1'] = str(year1)

        if (date2_comp != ''):
            inout.log_message('  Date2 component: |' + date2_comp + '|', 'v1')

            [day2, month2, year2, status2] = date.parse_datestr(date2_comp)
            if (day2 != -1):
                output_fields['day2'] = str(day2)
            if (month2 != -1):
                output_fields['month2'] = str(month2)
            if (year2 != -1):
                output_fields['year2'] = str(year2)

        # Create log message of output fields - - - - - - - - - - - - - - - - - - -
        #
        msg = ['  Standardised record output fields:']
        for (field, value) in output_fields.items():
            if (value != '') and (value != []):
                msg.append('    ' + field + ':' + str(value))
        inout.log_message(msg, 'v1')

        # Save standardised record into output field
        #
        out_line = inout.compose_line(output_fields)
        f_out.write(out_line + os.linesep)

        # Increment line counter and go to beginning of loop  - - - - - - - - - - -
        #
        line_read += 1

        inout.log_message('', 'v1')  # Print empty lines between records

    # Close files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    f_in.close()
    f_out.close()

    msg = ['','Number of warnings: '+str(config.num_warning), \
           'Number of corrected word spillings: '+str(config.num_word_spills)]
    inout.log_message(msg, 'v1')

    print msg[1]
    print msg[2]

    inout.log_message('End.', 'v1')