def testPermTagSeq(self):  # - - - - - - - - - - - - - - - - - - - - - - - -
    """Test the 'perm_tag_sequence' routine.

    Each entry of self.tag_lists is a triple: the input tag list, the
    expected number of permutations, and the expected permutation list.
    """
    for case in self.tag_lists:
        perms = mymath.perm_tag_sequence(case[0])
        # First check the number of returned permutations
        assert len(perms) == case[1], \
            '"perm_tag_sequence" returns wrong number of permutations with ' + \
            'list: ' + str(case[0]) + ' (should be: ' + str(case[1]) + '): ' + \
            str(len(perms))
        # Then check each permutation against the expected one
        for idx, perm in enumerate(perms):
            assert perm == case[2][idx], \
                '"perm_tag_sequence" returns wrong permutation: ' + str(perm) + \
                ', should be: ' + str(case[2][idx])
def testPermTagSeq(self):  # - - - - - - - - - - - - - - - - - - - - - - - -
    """Test 'perm_tag_sequence' routine.

    self.tag_lists holds test triples (input tags, expected permutation
    count, expected permutations); every triple is checked in turn.
    """
    for test_case in self.tag_lists:
        input_tags = test_case[0]
        expected_count = test_case[1]
        expected_perms = test_case[2]
        result = mymath.perm_tag_sequence(input_tags)
        assert len(result) == expected_count, \
            '"perm_tag_sequence" returns wrong number of permutations with ' + \
            'list: ' + str(input_tags) + ' (should be: ' + str(expected_count) + \
            '): ' + str(len(result))
        for pos in range(len(result)):
            assert result[pos] == expected_perms[pos], \
                '"perm_tag_sequence" returns wrong permutation: ' + \
                str(result[pos]) + ', should be: ' + str(expected_perms[pos])
def get_geoloc_hmm(word_list, tag_list):
    """Process input using a HMM to extract geocode and locality output fields.

    USAGE:
      geoloc_dict = get_geoloc_hmm(word_list, tag_list)

    ARGUMENTS:
      word_list  List of words as produced with clean_tag_locality()
      tag_list   Corresponding list of tags as produced with
                 clean_tag_locality()

    DESCRIPTION:
      The routine returns a dictionary with the parsed and extracted output
      fields for both the locality and geocode components. A Hidden Markov
      Model (HMM) is used for this task.

      The dictionary returned can contain the following key words:
      - wayfare_number
      - wayfare_name
      - wayfare_qualifier
      - wayfare_type
      - unit_number
      - unit_type
      - property_name
      - institution_name
      - institution_type
      - postaddress_number
      - postaddress_type
      - locality_name
      - locality_qualifier
      - postcode
      - territory
      - country
      - geoloc_hmm_proba (the probability returned by the Viterbi algorithm
                          for the most likely HMM state sequence)
    """

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    msg = [" Input tag sequence: " + str(tag_list), " Output tag sequences:"]
    for t in tag_list_seq:
        msg.append(" " + str(t))
    inout.log_message(msg, "v2")

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = config.geoloc_hmm.viterbi(t)
        if prob > max_prob:
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob
        inout.log_message(" Probability " + str(prob) + " for sequence " +
                          str(t), "v2")

    inout.log_message(
        [" Best observation sequence: " + str(best_obs_seq),
         " with tag sequence: " + str(best_tag_list)],
        "v2",
    )

    # Guard against an empty input: normalising the probability below would
    # otherwise divide by zero (the sibling get_address_hmm() also returns an
    # empty dictionary in this situation)
    #
    tag_list_len = len(tag_list)
    if tag_list_len == 0:
        inout.log_message("Empty tag list given to geocode/locality HMM",
                          "warn")
        return {}

    norm_max_prob = max_prob / float(tag_list_len)  # Normalise max. probability

    geoloc_dict = {"geoloc_hmm_proba": [str(norm_max_prob)]}

    # Mapping of HMM states to output field names; several states feed the
    # same output field (e.g. multi-word names)
    #
    state_to_field = {
        "wfnu": "wayfare_number",
        "wfna1": "wayfare_name",
        "wfna2": "wayfare_name",
        "wfna3": "wayfare_name",
        "wfql": "wayfare_qualifier",
        "wfty": "wayfare_type",
        "unnu": "unit_number",
        "unty": "unit_type",
        "prna1": "property_name",
        "prna2": "property_name",
        "inna1": "institution_name",
        "inna2": "institution_name",
        "inty": "institution_type",
        "panu": "postaddress_number",
        "paty": "postaddress_type",
        "loc1": "locality_name",
        "loc2": "locality_name",
        "locql": "locality_qualifier",
        "pc": "postcode",
        "ter1": "territory",
        "ter2": "territory",
        "cntr1": "country",
        "cntr2": "country",
    }

    # Now process the observation sequence and add elements into dictionary
    #
    list_len = len(word_list)
    for i in range(list_len):  # Loop over words and states
        w = word_list[i]
        s = best_obs_seq[i]

        # Do not output commas, vertical bars, hyphens and slashes
        #
        if w in ["|", ",", "-", "/"]:
            continue

        field = state_to_field.get(s)
        if field is not None:
            geoloc_dict.setdefault(field, []).append(w)
        else:  # Should never happen: unknown HMM state
            msg = [
                "This should never happen!",
                " Tag: " + str(s),
                " Word: " + w,
                " Word list: " + str(word_list),
                " tag list: " + str(tag_list),
            ]
            inout.log_message(msg, "warn")

    # Check if concatenated locality, territory and country words are in the
    # lookup-table; if so replace them with the table's canonical form
    #
    for field in ["locality_name", "territory", "country"]:
        words = geoloc_dict.get(field)
        if words is not None and len(words) > 1:  # More than one word
            words_tuple = tuple(words)  # Make it a tuple
            if words_tuple in config.geoloc_lookup_dict:
                geoloc_dict[field] = [config.geoloc_lookup_dict[words_tuple][0]]

    # Finally do some tests on the output fields - - - - - - - - - - - - - -
    #
    # Check if a value list has more than three elements, if so print out
    #
    for field, values in geoloc_dict.items():
        if len(values) > 3:
            inout.log_message(
                "Geocode/locality output field " + str(field) +
                " contains more than three elements: " + str(values),
                "warn",
            )

    # Check if 'number' elements only contain (alpha-) numerical values - - -
    # and also check how many numbers in an element
    #
    number_checks = [
        ("wayfare_number", 2, "More than two wayfare numbers: ",
         "Wayfare number element contains no digits: "),
        ("unit_number", 1, "More than one unit numbers: ",
         "Unit number element contains no digits: "),
        ("postaddress_number", 1, "More than one postaddress numbers: ",
         "Postaddress number element contains no digits: "),
    ]
    for field, max_len, count_msg, alpha_msg in number_checks:
        if field in geoloc_dict:
            v = geoloc_dict[field]
            if len(v) > max_len:
                inout.log_message(count_msg + str(v), "warn")
            for element in v:
                if element.isalpha():  # Element contains only letters
                    inout.log_message(alpha_msg + str(v), "warn")
                    break  # Exit for loop

    # Check if 'type' elements contain one word only - - - - - - - - - - - -
    # and if it's a known type word (the tag code must appear in the
    # lookup-table entry's second field)
    #
    type_checks = [
        ("wayfare_type", 1, "WT", "More than one wayfare type: ",
         "Wayfare type word is not known: "),
        ("unit_type", 1, "UT", "More than one unit type: ",
         "Unit type word is not known: "),
        ("institution_type", 1, "IT", "More than one institution type: ",
         "Institution type word is not known: "),
        ("postaddress_type", 2, "PA", "More than two postaddress type: ",
         "Postaddress type word is not known: "),
    ]
    for field, max_len, tag_code, count_msg, unknown_msg in type_checks:
        if field in geoloc_dict:
            v = geoloc_dict[field]
            if len(v) > max_len:
                inout.log_message(count_msg + str(v), "warn")
            for element in v:
                key = tuple(element.split("_"))  # Make it a tuple
                if (key not in config.geoloc_lookup_dict) or \
                   (config.geoloc_lookup_dict[key][1].find(tag_code) < 0):
                    inout.log_message(unknown_msg + str(v), "warn")
                    break  # Exit for loop

    # Check if 'qualifier' elements only contain known qualifier words - - -
    #
    qualifier_checks = [
        ("wayfare_qualifier", "Wayfare qualifier word is not known: "),
        ("locality_qualifier", "Locality qualifier word is not known: "),
    ]
    for field, unknown_msg in qualifier_checks:
        if field in geoloc_dict:
            v = geoloc_dict[field]
            for element in v:
                key = (element,)
                if (key not in config.geoloc_lookup_dict) or \
                   (config.geoloc_lookup_dict[key][1].find("LQ") < 0):
                    inout.log_message(unknown_msg + str(v), "warn")
                    break  # Exit for loop

    return geoloc_dict
def get_address_hmm(word_list, tag_list, address_hmm, tag_lookup_table,
                    record_id, fields_str):
    """Process the input using a HMM to extract address output fields.

    USAGE:
      address_dict = get_address_hmm(word_list, tag_list, address_hmm,
                                     tag_lookup_table, record_id, fields_str)

    ARGUMENTS:
      word_list         List of words as produced with
                        tag_address_component()
      tag_list          Corresponding list of tags as produced with
                        tag_address_component()
      address_hmm       A reference to the address hidden Markov model
      tag_lookup_table  A tagging look-up table as defined in 'lookup.py'
      record_id         A string identifying the current record
      fields_str        A string representation of the input fields

    DESCRIPTION:
      The routine returns a dictionary with the parsed and extracted output
      fields for the address component. A Hidden Markov Model (HMM) is used
      for this task.

      The dictionary returned can contain the following key words:
        wayfare_number, wayfare_name, wayfare_qualifier, wayfare_type,
        unit_number, unit_type, property_name, institution_name,
        institution_type, postaddress_number, postaddress_type,
        locality_name, locality_qualifier, postcode, territory, country,
        address_hmm_prob (the probability returned by the Viterbi algorithm
                          for the most likely HMM state sequence)
    """

    # First, create all permutations of the input tag sequence
    #
    tag_list_seq = mymath.perm_tag_sequence(tag_list)

    # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - -
    # and keep the one with highest probability
    #
    max_prob = -1.0
    best_obs_seq = []
    best_tag_list = []

    for t in tag_list_seq:
        [obs_seq, prob] = address_hmm.viterbi(t)
        if prob > max_prob:
            best_obs_seq = obs_seq
            best_tag_list = t
            max_prob = prob
        print('3:%s Sequence: %s has Viterbi probability: %f' %
              (record_id, str(t), prob))

    print('2:%s Best observation sequence: %s with tag sequence: %s' %
          (record_id, str(best_obs_seq), str(best_tag_list)))

    # Now process the observation sequence and add elements into dictionary
    #
    if len(tag_list) != len(word_list):
        print('error:%s Length of word list and tag list differs: %s, %s%s' %
              (record_id, str(word_list), str(tag_list), fields_str))
        raise Exception('Length of word list and tag list differs')

    list_len = len(tag_list)
    if list_len == 0:
        print('warning:%s Empty tag list returned from HMM %s' %
              (record_id, fields_str))
        return {}  # Return an empty dictionary if no output fields given

    # NOTE(review): the original code computed a normalised probability
    # (max_prob / list_len) but then stored the raw max_prob; the raw value
    # is kept here to preserve behaviour. Confirm whether the normalised
    # value was intended -- the sibling get_geoloc_hmm() stores the
    # normalised probability.
    address_dict = {'address_hmm_prob': [str(max_prob)]}

    # Mapping of HMM states to output field names; several states feed the
    # same output field (e.g. multi-word names)
    #
    state_to_field = {
        'wfnu': 'wayfare_number',
        'wfna1': 'wayfare_name',
        'wfna2': 'wayfare_name',
        'wfna3': 'wayfare_name',
        'wfql': 'wayfare_qualifier',
        'wfty': 'wayfare_type',
        'unnu': 'unit_number',
        'unty': 'unit_type',
        'prna1': 'property_name',
        'prna2': 'property_name',
        'inna1': 'institution_name',
        'inna2': 'institution_name',
        'inty': 'institution_type',
        'panu': 'postaddress_number',
        'paty': 'postaddress_type',
        'loc1': 'locality_name',
        'loc2': 'locality_name',
        'locql': 'locality_qualifier',
        'pc': 'postcode',
        'ter1': 'territory',
        'ter2': 'territory',
        'cntr1': 'country',
        'cntr2': 'country',
    }

    for i in range(list_len):  # Loop over words and states
        w = word_list[i]
        s = best_obs_seq[i]

        # Do not output commas, vertical bars, hyphens and slashes
        #
        if w in ['|', ',', '-', '/']:
            continue

        field = state_to_field.get(s)
        if field is not None:
            address_dict.setdefault(field, []).append(w)
        else:  # Should never happen: unknown HMM state
            print('warning:%s This should never happen! ' % (record_id) +
                  ' Tag: %s, word: %s, word list: %s, tag list: %s%s' %
                  (str(s), w, str(word_list), str(tag_list), fields_str))

    # Check if concatenated locality, territory and country words are in the
    # lookup-table; if so replace them with the table's canonical form
    #
    for field in ['locality_name', 'territory', 'country']:
        words = address_dict.get(field)
        if words is not None and len(words) > 1:  # More than one word
            words_tuple = tuple(words)  # Make it a tuple
            if words_tuple in tag_lookup_table:
                address_dict[field] = [tag_lookup_table[words_tuple][0]]

    # Finally do some tests on the output fields - - - - - - - - - - - - - -
    #
    # Check if a value list has more than three elements, if so print out
    #
    for field, values in address_dict.items():
        if len(values) > 3:
            print('warning:%s Output field "%s" contains' %
                  (record_id, str(field)) +
                  ' more than three elements: %s%s' %
                  (str(values), fields_str))

    # Check if 'number' elements only contain (alpha-) numerical values - - -
    # and also check how many numbers in an element
    #
    number_checks = [
        ('wayfare_number', 2,
         'warning:%s More than two wayfare numbers: %s%s',
         'warning:%s Wayfare number contains no digits: %s%s'),
        ('unit_number', 1,
         'warning:%s More than one unit number: %s%s',
         'warning:%s Unit number contains no digits: %s%s'),
        ('postaddress_number', 1,
         'warning:%s More than one post-address number: %s%s',
         'warning:%s Post-address number contains no digits: %s%s'),
    ]
    for field, max_len, count_fmt, alpha_fmt in number_checks:
        if field in address_dict:
            v = address_dict[field]
            if len(v) > max_len:
                print(count_fmt % (record_id, str(v), fields_str))
            for element in v:
                if element.isalpha():  # Element contains only letters
                    print(alpha_fmt % (record_id, str(v), fields_str))
                    break  # Exit for loop

    # Check if 'type' elements contain one word only - - - - - - - - - - - -
    # and if it's a known type word (the tag code must appear in the
    # lookup-table entry's second field)
    #
    type_checks = [
        ('wayfare_type', 1, 'WT',
         'warning:%s More than one wayfare type: %s%s',
         'warning:%s Wayfare type word is not known: %s%s'),
        ('unit_type', 1, 'UT',
         'warning:%s More than one unit type: %s%s',
         'warning:%s Unit type word is not known: %s%s'),
        ('institution_type', 1, 'IT',
         'warning:%s More than one institution type: %s%s',
         'warning:%s Institution type word is not known: %s%s'),
        ('postaddress_type', 2, 'PA',
         'warning:%s More than two post-address type: %s%s',
         'warning:%s Post-address type word is not known: %s%s'),
    ]
    for field, max_len, tag_code, count_fmt, unknown_fmt in type_checks:
        if field in address_dict:
            v = address_dict[field]
            if len(v) > max_len:
                print(count_fmt % (record_id, str(v), fields_str))
            for element in v:
                key = tuple(element.split('_'))  # Make it a tuple
                if (key not in tag_lookup_table) or \
                   (tag_lookup_table[key][1].find(tag_code) < 0):
                    print(unknown_fmt % (record_id, str(v), fields_str))
                    break  # Exit for loop

    # Check if 'qualifier' elements only contain known qualifier words - - -
    #
    qualifier_checks = [
        ('wayfare_qualifier',
         'warning:%s Wayfare qualifier word is not known: %s%s'),
        ('locality_qualifier',
         'warning:%s Locality qualifier word is not known: %s%s'),
    ]
    for field, unknown_fmt in qualifier_checks:
        if field in address_dict:
            v = address_dict[field]
            for element in v:
                key = (element,)
                if (key not in tag_lookup_table) or \
                   (tag_lookup_table[key][1].find('LQ') < 0):
                    print(unknown_fmt % (record_id, str(v), fields_str))
                    break  # Exit for loop

    return address_dict
def tagdata(): """Main routine, open file, read lines, tag data records, write to out-file. USAGE: tagdata() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 5): print '***** Error: %s needs at least six arguments:'% (sys.argv[0]) print '***** - Name of the project module' print '***** - Tagging mode: "name" or "locality"' print '***** - Output training file name' print '***** - Start of block with training records' print '***** - End of block with training records' print '***** - Number of training records' print '***** plus options' raise Exception() if (config.in_file_name == config.options[2]): print '***** Error: Input and output files must differ' print '***** Input file name: ', config.in_file_name print '***** Output training file name:', config.options[2] raise Exception() first_rec = int(config.options[2]) last_rec = int(config.options[3]) num_rec = int(config.options[4]) in_file_name = config.in_file_name out_file_name = config.options[1] # Check record number values - - - - - - - - - - - - - - - - - - - - - - - - # if (int(first_rec) >= int(last_rec)) or \ ((int(num_rec)-1) > (int(last_rec)-int(first_rec))): print '***** Error: Illegal values for training records block:' print '***** - Start of block with training records:', first_rec print '***** - End of block with training records: ', last_rec print '***** - Number of training records: ', num_rec raise Exception() rec_range = last_rec-first_rec-1 # Range of records in input file # Open input file and check number of available records - - - - - - - - - - - # try: f_in = open(in_file_name,'r') except: inout.log_message('Cannot open input file: '+in_file_name,'err') raise IOError() line_count = 0 for line in f_in.xreadlines(): line_count += 1 f_in.close() if (last_rec > line_count): # Illegal value for last record print '***** Error: Illegal values for last training 
records:', last_rec print '***** File only contains',line_count, 'lines/records' raise Exception() # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - - # tag_mode = config.options[0] if (tag_mode in ['name','na','n']): tag_mode = 'name' elif (tag_mode in ['locality','localty','loc','l']): tag_mode = 'loc' else: print '***** Error: Illegal tagging mode:', tag_mode print '***** Must be either "name" or "locality"' raise Exception() # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file hmm_file_name = None # Default: Do not use HMM to standardise training # records retag_file_name = None # Default: Do not retag an existing training file config.nowarn = 0 # Deactivate no warning flag (print/log warning # messages) freqs_file_name = None # Default: Do not write frequencies, no -freqs option if (len(config.options) > 5): options = config.options[5:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file,'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot write to log file: '+config.log_file raise IOError() # Write (append) header to log file # 
f_log.write(os.linesep) f_log.write('##################################################') f_log.write('############'+os.linesep) f_log.write('#'+os.linesep) f_log.write("# 'pyTagData.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time())+os.linesep) f_log.write('#'+os.linesep) f_log.write("# Input file name: "+in_file_name+os.linesep) f_log.write("# Output file name: "+out_file_name+os.linesep) f_log.write("# Tagging mode: "+tag_mode+os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-hmm'): hmm_file_name = options[1] # Get file name of the HMM to use if (hmm_file_name == out_file_name): print '***** Error: HMM file name is the same as output file name!' raise Exception() try: f_in = open(hmm_file_name,'r') # Test if file is available except: print '***** Error: Cannot open HMM file specified in "-hmm"', print 'option:', hmm_file_name raise IOError() f_in.close() options = options[2:] # Remove processed '-hmm' option and file name elif (options[0] == '-retag'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-retag" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() retag_file_name = options[1] # Get file name of the already-tagged # file to re-process if (retag_file_name == out_file_name): print '***** Error: Retag file name is the same as output file name!' raise Exception() elif (retag_file_name == in_file_name): print '***** Error: Retag file name is the same as input file name!' raise Exception() elif (retag_file_name == hmm_file_name): print '***** Error: Retag file name is the same as HMM file name!' raise Exception() try: f_in = open(retag_file_name,'r') # Test if file is available # Now gather record numbers and previous tags/states, as well as the # original header information. Use a simple state machine to do this. 
# tagged_recs = {} cleaned_recs = {} original_header_lines = [] state = -1 # Header lines state prevline = '' for line in f_in.xreadlines(): # Read training file and process it line = line.strip() if (state == -1) and (len(line) == 0): # End of header lines state = 0 prevline = line continue if (state == -1) and (len(line) > 0) and (line[0] == "#"): original_header_lines.append("# " + line) prevline = line continue sline = line.split(' ') if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \ and (sline[2][0] == '(') and (sline[2][-2:] == '):'): try: rec = int(sline[1]) # Original record number tagged_recs[rec] = None cleaned_recs[rec] = None state = 1 except: pass prevline = line continue if (state == 1) and (len(line) > 0) and (line[0] != '#'): tagged_recs[rec] = line cleaned_recs[rec] = prevline state = 0 prevline = line continue if (state == 1) and (len(line) > 0): prevline = line continue f_in.close() tagged_recs_keys = tagged_recs.keys() num_rec = len(tagged_recs_keys) # Override specified numbers first_rec = 0 last_rec = line_count except: print '***** Error: Cannot open tagged training file specified', print 'in "-retag" option:', retag_file_name raise IOError() options = options[2:] # Remove processed '-retag' option and file name elif (options[0][:5] == '-freq'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-feqs" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() freqs_file_name = options[1] # File name to write the frequencies to if (freqs_file_name == out_file_name): print '***** Error: Frequency file name is the same as output', print 'file name!' raise Exception() elif (freqs_file_name == in_file_name): print '***** Error: Frequency file name is the same as input', print 'file name!' raise Exception() elif (freqs_file_name == hmm_file_name): print '***** Error: Frequency file name is the same as HMM', print 'file name!' 
raise Exception() options = options[2:] # Remove processed '-freqs' option and file name try: # Check if file writing is possible freqs_out = open(freqs_file_name,'w') freqs_out.close() except: print '***** Error: Cannot write to frequency output file specified', print 'in "-freqs" option:', freqs_file_name raise IOError() else: print '***** Error: Illegal option:', options[0] raise Exception() # If specified initalise and load Hidden Markov Model (HMM) - - - - - - - - - # if (hmm_file_name != None): myhmm = simplehmm.hmm([],[]) # Create new empty HMM object myhmm.load_hmm(hmm_file_name) myhmm.print_hmm() # Print HMM (according to verbose and logging level) # Open output file and write header - - - - - - - - - - - - - - - - - - - - - # try: f_out = open(out_file_name,'w') except: inout.log_message('Cannot open output file: '+out_file_name,'err') raise IOError() f_out.write("# Tagged training data written by 'pyTagData.py -"+ \ " Version 0.1'"+os.linesep) f_out.write('#'+os.linesep) f_out.write('# Created '+time.ctime(time.time())+os.linesep) f_out.write('#'+os.linesep) f_out.write('# Input file name: '+in_file_name+os.linesep) f_out.write('# Output file name: '+out_file_name+os.linesep) f_out.write('#'+os.linesep) f_out.write('# Parameters:'+os.linesep) f_out.write('# - Start of block with training records: '+str(first_rec)+ \ os.linesep) f_out.write('# - End of block with training records: '+str(last_rec)+ \ os.linesep) f_out.write('# - Number of training records: '+str(num_rec)+ \ os.linesep) if (hmm_file_name != None): f_out.write('#'+os.linesep) f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \ os.linesep) if (retag_file_name != None): f_out.write('#'+os.linesep) f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \ os.linesep) f_out.write("# Header lines from original training file follow:" + \ os.linesep) for header_line in original_header_lines: f_out.write(header_line + os.linesep) if (freqs_file_name != None): 
f_out.write('#'+os.linesep) f_out.write("# - Tag/state pattern frequencies written to file '" + \ freqs_file_name + os.linesep) f_out.write('#'+'-'*70+os.linesep) f_out.write(os.linesep) rec_count = 0 # Number of selected records num_rec_left = num_rec # Number of records to be selected left rec_selected = {} # Dictionary of all record numbers that were selected seq_freqs = {} # Dict to hold examples of tag/state patterns unchanged_loop_cnt = 0 # Counter of how many loops have been done # without new records being selected prev_num_rec_left = num_rec # Number of records left in the previous # interation # Due to the random nature of selecting records, and because sometimes - - - # a selected component can be empty (and is thus not used for training) # more than one iteration over the input data set is carried out. In each # iteration, records are selected randomly. # while (rec_count < num_rec): # Loop until 'num_rec' records selected # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name,'r') except: inout.log_message('Cannot open input file: '+in_file_name,'err') raise IOError() line_read = 0 # Number of read lines # Skip to start of training block - - - - - - - - - - - - - - - - - - - - - # if (first_rec > 0): for i in range(first_rec): f_in.readline() while (rec_count < num_rec) and (line_read <= (last_rec-first_rec)): line = f_in.readline() if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \ ((retag_file_name == None) and \ (num_rec_left >= random.randrange(0,rec_range,1))): line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line = line.lower() # Make all characters lower case inout.log_message(['Record number: '+str(line_read+first_rec)],'v1') config.curr_line_no = line_read+first_rec # Store current line number # Process line and extract content into components (name, geocode, etc) # [name_comp, geocode_comp, locality_comp, 
date1_comp, date2_comp] = \ inout.process_line(line) # Select component and process it - - - - - - - - - - - - - - - - - - - # if (tag_mode == 'name'): if (type(name_comp) == types.ListType): component = name_comp[0].strip()+' '+name_comp[1].strip() else: component = name_comp.strip() else: # Locality component component = geocode_comp.strip()+' '+locality_comp.strip() if (component != '') and \ (not rec_selected.has_key((line_read+first_rec))): if (tag_mode == 'name'): inout.log_message(' Name component: |'+component+'|','v1') component = name.clean_name_component(component) [word_list, tag_list] = name.tag_name_component(component) else: # Locality component inout.log_message(' Locality component: |'+component+'|','v1') component = locality.clean_geoloc_component(component) [word_list, tag_list] = locality.tag_geoloc_component(component) if (tag_list != []): # Only process non-empty tag lists # Append record number into dictionary of processed records # rec_selected.update({(line_read+first_rec):(line_read+first_rec)}) # Create all permutation sequences of this tag list - - - - - - - - # tag_seq = mymath.perm_tag_sequence(tag_list) inout.log_message([' Word list: '+str(word_list), \ ' Tag list: '+str(tag_list), \ ' Tag sequences:'],'v2') # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - - # if (hmm_file_name != None): state_seq = [] # List containing computed HMM state sequences max_prob = -1.0 # maximal probability for a sequence max_seq_no = -1 # Number of the seq. with the max. 
probablity # Now give tag sequences to the HMMs to compute state sequences # i = 0 for t in tag_seq: [obs_seq, prob] = myhmm.viterbi(t) state_seq.append(obs_seq) if (prob > max_prob): max_prob = prob max_seq_no = i i += 1 # Write original component and resulting tag sequences to output # f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \ '): |'+component+'|'+os.linesep) # Commented original num_len = len(str(line_read+first_rec))+len(str(rec_count))+6 f_out.write('#'+num_len*' '+'|'+' '.join(word_list)+'|'+os.linesep) for i in range(len(tag_seq)): # Convert each tag sequence into a string for file output # seq_string = ' ' if (hmm_file_name != None) and (i != max_seq_no): seq_string = '# ' # Comment sequences with not max. probability for j in range(len(tag_seq[i])): if (hmm_file_name != None): seq_string = seq_string+' '+tag_seq[i][j]+':'+ \ state_seq[i][j]+',' else: seq_string = seq_string+' '+tag_seq[i][j]+':,' f_out.write(seq_string[:-1]+os.linesep) # Write without , at end inout.log_message(' '+seq_string[:-1],'v2') if (hmm_file_name != None): f_out.write('# Maximum Viterbi probability: %0.5f'% \ (max_prob) + os.linesep) inout.log_message('Maximum Viterbi probability: %0.5f'% \ (max_prob), 'v2') if (retag_file_name != None) and (tagged_recs[line_read] != None): if (tagged_recs[line_read].strip() != seq_string[:-1].strip()): f_out.write("# Note: ***** Changed *****" + os.linesep) inout.log_message(' Note:' + \ ' ***** Changed *****','v2') f_out.write('# Was: ' + tagged_recs[line_read]+os.linesep) # Write commented original tag sequence inout.log_message('Original tag sequence: '+ \ tagged_recs[line_read],'v2') f_out.write(os.linesep) # Write an empty line inout.log_message('','v1') # Print empty lines between records if (hmm_file_name != None): seq_key = seq_string[:-1] # Add sequence to dictionary if (seq_freqs.has_key(seq_key)): seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \ max_prob]) else: seq_freqs[seq_key] = [['|'+' 
'.join(word_list)+'|', \ max_prob]] rec_count += 1 # Print process indicator message # if (config.proc_ind >= 0) and (rec_count > 0): if (rec_count % config.proc_ind == 0): print 'Processed line', rec_count, 'of', num_rec line_read += 1 f_in.close() num_rec_left = num_rec - rec_count if (prev_num_rec_left == num_rec_left): # No new records selected unchanged_loop_cnt += 1 prev_num_rec_left = num_rec_left # Set to current value if (unchanged_loop_cnt > 5): # Do five loops maximal without selecting # new records config.curr_line_no = -1 # Set to illegal/empty values, as warning is config.curr_line = '' # not related to the current input line inout.log_message(['Can not select more than '+str(rec_count)+ \ ' records for training.', \ 'This is probably due to empty input components.', \ 'Please reduce value of "num_rec" or increase ' + \ 'range','between "first_rec" and "last_rec".'],'warn') break if (num_rec_left < 10): # Only 10 records left to select num_rec_left = num_rec+1 # Set to more than 100% probablity elif (num_rec_left < (num_rec / 100.0)): # Less than 1% records left num_rec_left = int(num_rec / 100.0) # Set to 1% f_out.close() # If specified, save Viterbi frequencies to a file - - - - - - - - - - - - - # if (freqs_file_name != None): freqs_out = open(freqs_file_name,'w') # Open frequency file for writing freqs_out.write('# Frequency listing of tag/state patterns written by') freqs_out.write('"pyTagData.py - Version 0.1"'+os.linesep) freqs_out.write('#'+os.linesep) freqs_out.write('# Created '+time.ctime(time.time())+os.linesep) freqs_out.write('#'+os.linesep) freqs_out.write("# Input file name: "+in_file_name+os.linesep) freqs_out.write("# Output file name: "+out_file_name+os.linesep) freqs_out.write(os.linesep) freqs_out.write('# Parameters:'+os.linesep) freqs_out.write('# - Start of block with training records: '+ \ str(first_rec)+os.linesep) freqs_out.write('# - End of block with training records: '+ \ str(last_rec)+os.linesep) freqs_out.write('# - 
Number of training records: '+ \ str(num_rec)+os.linesep) if (hmm_file_name != None): freqs_out.write('#'+os.linesep) freqs_out.write("# - Using HMM file '"+hmm_file_name+ \ "' for standardisation"+os.linesep) if (retag_file_name != None): freqs_out.write('#'+os.linesep) freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \ "'"+os.linesep) freqs_out.write('#'+'-'*70+os.linesep) freqs_out.write(os.linesep) sorted_seq_freqs = [] # Now sort sequences according to their fruequencies for key in seq_freqs.keys(): sorted_seq_freqs.append((len(seq_freqs[key]),key)) sorted_seq_freqs.sort() for skey in sorted_seq_freqs: key = skey[1] freqs_out.write('# Pattern: '+str(key)+os.linesep) freqs_out.write('# Frequency: '+str(skey[0])+os.linesep) examples = seq_freqs[key] freqs_out.write('# Maximum Viterbi probability: '+ \ str(examples[0][1])+os.linesep) freqs_out.write('# Examples: '+os.linesep) for example in examples: freqs_out.write('# '+str(example[0])+os.linesep) freqs_out.write(str(key)+os.linesep) freqs_out.write(os.linesep) freqs_out.close() inout.log_message(['Read '+str(line_read)+' lines, processed '+ \ str(rec_count)+' lines', 'End.'],'v1')
def get_address_hmm(word_list, tag_list, address_hmm, tag_lookup_table, record_id, fields_str): """Process the input using a HMM to extract address output fields. USAGE: address_dict = get_address_hmm(word_list, tag_list, address_hmm, tag_lookup_table) ARGUMENTS: word_list List of words as produces with tag_address_component() tag_list Corresponding list of tags as produces with tag_address_component() address_hmm A reference to the address hidden Markov model tag_lookup_table A tagging look-up table as defined in 'lookup.py' record_id A string identifying the current record fields_str A string representation of the input fields DESCRIPTION: The routine returns a dictionary with the parsed and extracted output fields for the address component. A Hidden Markov Model (HMM) is used for this task. The dictionary returned can contain the following key words: wayfare_number wayfare_name wayfare_qualifier wayfare_type unit_number unit_type property_name institution_name institution_type postaddress_number postaddress_type locality_name locality_qualifier postcode territory country address_hmm_prob (the probability returned by the Viterbi algorithm for the most likely HMM state seqence) """ # First, create all permutations of the input tag sequence # tag_list_seq = mymath.perm_tag_sequence(tag_list) # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - - # and keep the one with highest probability # max_prob = -1.0 best_obs_seq = [] best_tag_list = [] for t in tag_list_seq: [obs_seq, prob] = address_hmm.viterbi(t) if (prob > max_prob): best_obs_seq = obs_seq best_tag_list = t max_prob = prob print '3:%s Sequence: %s has Viterbi probability: %f' % \ (record_id, str(t), prob) print '2:%s Best observation sequence: %s with tag sequence: %s' % \ (record_id, str(best_obs_seq), str(best_tag_list)) # Now process the observation sequence and add elements into dictionary - - - # if (len(tag_list) != len(word_list)): print 'error:%s Length of word list and tag 
list differs: %s, %s%s' % \ (record_id, str(word_list), str(tag_list), fields_str) raise Exception list_len = len(tag_list) if (list_len == 0): print 'warning:%s Empty tag list returned from HMM %s' % \ (record_id, fields_str) return {} # Return an empty dictionary if not output fields given # norm_max_prob = max_prob / float(list_len) # Normalise max. probability address_dict = {'address_hmm_prob': [str(max_prob)]} for i in range(list_len): # Loop over words and states w = word_list[i] s = best_obs_seq[i] # Do not output commas, vertical bars and hyphens - - - - - - - - - - - - # if (w in ['|', ',', '-', '/']): pass elif (s == 'wfnu' ): # Wayfare number - - - - - - - - - - - - - - - - - - - v = address_dict.get('wayfare_number', []) v.append(w) address_dict.update({'wayfare_number': v}) elif (s in ['wfna1', 'wfna2', 'wfna3']): # Wayfare name - - - - - - - - - - v = address_dict.get('wayfare_name', []) v.append(w) address_dict.update({'wayfare_name': v}) elif (s == 'wfql' ): # Wayfare qualifier - - - - - - - - - - - - - - - - - v = address_dict.get('wayfare_qualifier', []) v.append(w) address_dict.update({'wayfare_qualifier': v}) elif (s == 'wfty' ): # Wayfare type - - - - - - - - - - - - - - - - - - - - v = address_dict.get('wayfare_type', []) v.append(w) address_dict.update({'wayfare_type': v}) elif (s == 'unnu' ): # Unit number - - - - - - - - - - - - - - - - - - - - v = address_dict.get('unit_number', []) v.append(w) address_dict.update({'unit_number': v}) elif (s == 'unty' ): # Unit type - - - - - - - - - - - - - - - - - - - - - v = address_dict.get('unit_type', []) v.append(w) address_dict.update({'unit_type': v}) elif (s in ['prna1', 'prna2']): # Property name - - - - - - - - - - - - - - v = address_dict.get('property_name', []) v.append(w) address_dict.update({'property_name': v}) elif (s in ['inna1', 'inna2']): # Institution name - - - - - - - - - - - - v = address_dict.get('institution_name', []) v.append(w) address_dict.update({'institution_name': v}) 
elif (s == 'inty' ): # Institution type - - - - - - - - - - - - - - - - - - v = address_dict.get('institution_type', []) v.append(w) address_dict.update({'institution_type': v}) elif (s == 'panu' ): # Postal address number - - - - - - - - - - - - - - - v = address_dict.get('postaddress_number', []) v.append(w) address_dict.update({'postaddress_number': v}) elif (s == 'paty' ): # Postal address type - - - - - - - - - - - - - - - - v = address_dict.get('postaddress_type', []) v.append(w) address_dict.update({'postaddress_type': v}) elif (s in ['loc1', 'loc2']): # Locality name - - - - - - - - - - - - - - - v = address_dict.get('locality_name', []) v.append(w) address_dict.update({'locality_name': v}) elif (s == 'locql' ): # Locality qualifier - - - - - - - - - - - - - - - - v = address_dict.get('locality_qualifier', []) v.append(w) address_dict.update({'locality_qualifier': v}) elif (s == 'pc' ): # Postcode - - - - - - - - - - - - - - - - - - - - - - - v = address_dict.get('postcode', []) v.append(w) address_dict.update({'postcode': v}) elif (s in ['ter1', 'ter2']): # Territory - - - - - - - - - - - - - - - - - v = address_dict.get('territory', []) v.append(w) address_dict.update({'territory': v}) elif (s in ['cntr1', 'cntr2']): # Country - - - - - - - - - - - - - - - - - v = address_dict.get('country', []) v.append(w) address_dict.update({'country': v}) else: # Should never happen print 'warning:%s This should never happen! 
' % (record_id) + \ ' Tag: %s, word: %s, word list: %s, tag list: %s%s' % \ (str(s), w, str(word_list), str(tag_list),fields_str) # Check if concatenated locality and territory words are in lookup-table - - # if (address_dict.has_key('locality_name')): loc = address_dict['locality_name'] if (len(loc) > 1): # Locality contains more than one word loc_tuple = tuple(loc) # Make it a tuple if (tag_lookup_table.has_key(loc_tuple)): new_loc = tag_lookup_table[loc_tuple][0] address_dict.update({'locality_name': [new_loc]}) if (address_dict.has_key('territory')): terr = address_dict['territory'] if (len(terr) > 1): # Territory contains more than one word terr_tuple = tuple(terr) # Make it a tuple if (tag_lookup_table.has_key(terr_tuple)): new_terr = tag_lookup_table[terr_tuple][0] address_dict.update({'territory': [new_terr]}) if (address_dict.has_key('country')): cntr = address_dict['country'] if (len(cntr) > 1): # Country contains more than one word cntr_tuple = tuple(cntr) # Make it a tuple if (tag_lookup_table.has_key(cntr_tuple)): new_cntr = tag_lookup_table[cntr_tuple][0] address_dict.update({'country': [new_cntr]}) # Finally do some tests on the output fields - - - - - - - - - - - - - - - - # address_items = address_dict.items() # Check if a value list has more than three elements, if so print out # for i in address_items: if (len(i[1]) > 3): print 'warning:%s Output field "%s" contains' % (record_id, str(i[0]))+ \ ' more than three elements: %s%s' % (str(i[1]), fields_str) # Check if 'number' elements only contain (alpha-) numerical values - - - - - # and also check how many numbers in an element # if (address_dict.has_key('wayfare_number') ): # Check how many wayfare numbers v = address_dict['wayfare_number'] if (len(v) > 2): print 'warning:%s More than two wayfare numbers: %s%s' % \ (record_id, str(v), fields_str) for i in v: if (i.isalpha()): # Element contains only letters print 'warning:%s Wayfare number contains no ' % (record_id) + \ 'digits: %s%s' % (str(v), 
fields_str) break # Exit for loop if (address_dict.has_key('unit_number')): # Check how many unit numbers v = address_dict['unit_number'] if (len(v) > 1): print 'warning:%s More than one unit number: %s%s' % \ (record_id, str(v), fields_str) for i in v: if (i.isalpha()): # Element contains only letters print 'warning:%s Unit number contains no ' % (record_id) + \ 'digits: %s%s' % (str(v), fields_str) break # Exit for loop if (address_dict.has_key('postaddress_number') ): # Check postaddress numbers v = address_dict['postaddress_number'] if (len(v) > 1): print 'warning:%s More than one post-address number: %s%s' % \ (record_id, str(v), fields_str) for i in v: if (i.isalpha()): # Element contains only letters print 'warning:%s Post-address number contains no ' % (record_id) + \ 'digits: %s%s' % (str(v), fields_str) break # Exit for loop # Check if 'type' elements contain one word only - - - - - - - - - - - - - - # if it's a known type word # if (address_dict.has_key('wayfare_type')): # Check wayfare type v = address_dict['wayfare_type'] if (len(v) > 1): print 'warning:%s More than one wayfare type: %s%s' % \ (record_id, str(v), fields_str) for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not tag_lookup_table.has_key((i))) or \ (tag_lookup_table.has_key((i)) and (tag_lookup_table[(i)][1].find('WT') < 0)): print 'warning:%s Wayfare type word is not known: %s%s' % \ (record_id, str(v), fields_str) break # Exit for loop if (address_dict.has_key('unit_type')): # Check unit type v = address_dict['unit_type'] if (len(v) > 1): print 'warning:%s More than one unit type: %s%s' % \ (record_id, str(v), fields_str) for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not tag_lookup_table.has_key((i))) or \ (tag_lookup_table.has_key((i)) and \ (tag_lookup_table[(i)][1].find('UT') < 0)): print 'warning:%s Unit type word is not known: %s%s' % \ (record_id, str(v), fields_str) break # Exit for loop if (address_dict.has_key('institution_type')): # Check 
institution type v = address_dict['institution_type'] if (len(v) > 1): print 'warning:%s More than one institution type: %s%s' % \ (record_id, str(v), fields_str) for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not tag_lookup_table.has_key((i))) or \ (tag_lookup_table.has_key((i)) and \ (tag_lookup_table[(i)][1].find('IT') < 0)): print 'warning:%s Institution type word is not known: %s%s' % \ (record_id, str(v), fields_str) break # Exit for loop if (address_dict.has_key('postaddress_type')): # Check postaddress type v = address_dict['postaddress_type'] if (len(v) > 2): print 'warning:%s More than two post-address type: %s%s' % \ (record_id, str(v), fields_str) for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not tag_lookup_table.has_key((i))) or \ (tag_lookup_table.has_key((i)) and \ (tag_lookup_table[(i)][1].find('PA') < 0)): print 'warning:%s Post-address type word is not known: %s%s' % \ (record_id, str(v), fields_str) break # Exit for loop # Check if 'qualifier' elements only contain known qualifier words - - - - - # if (address_dict.has_key('wayfare_qualifier')): # Check wayfare qualifier v = address_dict['wayfare_qualifier'] for i in v: if (not tag_lookup_table.has_key((i,))) or \ (tag_lookup_table.has_key((i,)) and \ (tag_lookup_table[(i,)][1].find('LQ') < 0)): print 'warning:%s Wayfare qualifier word is not known: %s%s' % \ (record_id, str(v), fields_str) break # Exit for loop if (address_dict.has_key('locality_qualifier') ): # Check locality qualifier v = address_dict['locality_qualifier'] for i in v: if (not tag_lookup_table.has_key((i,))) or \ (tag_lookup_table.has_key((i,)) and \ (tag_lookup_table[(i,)][1].find('LQ') < 0)): print 'warning:%s Locality qualifier word is not known: %s%s' % \ (record_id, str(v), fields_str) break # Exit for loop return address_dict
def get_geoloc_hmm(word_list, tag_list):
  """Process input using a HMM to extract geocode and locality output fields.

  USAGE:
    geoloc_dict = get_geoloc_hmm(word_list, tag_list)

  ARGUMENTS:
    word_list  List of words as produced with clean_tag_locality()
    tag_list   Corresponding list of tags as produced with
               clean_tag_locality()

  DESCRIPTION:
    The routine returns a dictionary with the parsed and extracted output
    fields for both the locality and geocode components.  A Hidden Markov
    Model (HMM) is used for this task.

    The dictionary returned can contain the following key words (each with a
    list of words as value):
    - wayfare_number, wayfare_name, wayfare_qualifier, wayfare_type
    - unit_number, unit_type
    - property_name, institution_name, institution_type
    - postaddress_number, postaddress_type
    - locality_name, locality_qualifier, postcode, territory, country
    - geoloc_hmm_proba (the probability returned by the Viterbi algorithm
      for the most likely HMM state sequence, normalised by the tag list
      length)

    If the given tag list is empty an empty dictionary is returned (this
    previously raised a ZeroDivisionError when normalising the probability).
  """

  # Mapping from HMM observation states to output dictionary fields - - - - -
  # (replaces a long copy-paste if/elif chain; one entry per HMM state)
  #
  state_field_map = {'wfnu':'wayfare_number',
                     'wfna1':'wayfare_name', 'wfna2':'wayfare_name',
                     'wfna3':'wayfare_name',
                     'wfql':'wayfare_qualifier',
                     'wfty':'wayfare_type',
                     'unnu':'unit_number',
                     'unty':'unit_type',
                     'prna1':'property_name', 'prna2':'property_name',
                     'inna1':'institution_name', 'inna2':'institution_name',
                     'inty':'institution_type',
                     'panu':'postaddress_number',
                     'paty':'postaddress_type',
                     'loc1':'locality_name', 'loc2':'locality_name',
                     'locql':'locality_qualifier',
                     'pc':'postcode',
                     'ter1':'territory', 'ter2':'territory',
                     'cntr1':'country', 'cntr2':'country'}

  # First, create all permutations of the input tag sequence
  #
  tag_list_seq = mymath.perm_tag_sequence(tag_list)

  msg = ['  Input tag sequence: '+str(tag_list), '  Output tag sequences:']
  for t in tag_list_seq:
    msg.append('    '+str(t))
  inout.log_message(msg, 'v2')

  # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - -
  # and keep the one with highest probability
  #
  max_prob      = -1.0
  best_obs_seq  = []
  best_tag_list = []

  for t in tag_list_seq:
    [obs_seq, prob] = config.geoloc_hmm.viterbi(t)  # Get observation sequence

    if (prob > max_prob):
      best_obs_seq  = obs_seq
      best_tag_list = t
      max_prob      = prob

    inout.log_message('  Probability '+str(prob)+' for sequence '+str(t), \
                      'v2')

  inout.log_message(['  Best observation sequence: '+str(best_obs_seq), \
                     '  with tag sequence: '+str(best_tag_list)], 'v2')

  # Now process the observation sequence and add elements into dictionary - -
  #
  tag_list_len = len(tag_list)

  # Guard against an empty tag list: without it the normalisation below
  # raises a ZeroDivisionError (the address routine already has this guard)
  #
  if (tag_list_len == 0):
    inout.log_message('Empty tag list - no geocode/locality fields extracted',
                      'warn')
    return {}

  norm_max_prob = max_prob / float(tag_list_len)  # Normalise max. probability

  geoloc_dict = {'geoloc_hmm_proba': [str(norm_max_prob)]}

  list_len = len(word_list)
  for i in range(list_len):  # Loop over words and states
    w = word_list[i]
    s = best_obs_seq[i]

    if (w in ['|', ',', '-', '/']):
      pass  # Do not output commas, vertical bars, hyphens and slashes

    elif (s in state_field_map):  # Append word to the state's output field
      field = state_field_map[s]
      v = geoloc_dict.get(field, [])
      v.append(w)
      geoloc_dict[field] = v

    else:  # Should never happen
      msg = ['This should never happen!', '  Tag: '+str(s), '  Word: '+w, \
             '  Word list: '+str(word_list), \
             '  tag list: '+str(tag_list)]
      inout.log_message(msg, 'warn')

  # Check if concatenated locality, territory and country words are in - - -
  # the lookup-table, and if so replace them with the corrected entry
  #
  for field in ['locality_name', 'territory', 'country']:
    words = geoloc_dict.get(field, [])
    if (len(words) > 1):  # Field contains more than one word
      words_tuple = tuple(words)  # Make it a tuple for the table lookup
      if (words_tuple in config.geoloc_lookup_dict):
        geoloc_dict[field] = [config.geoloc_lookup_dict[words_tuple][0]]

  # Finally do some tests on the output fields - - - - - - - - - - - - - - -
  #
  # Check if a value list has more than three elements, if so print out
  #
  for (field, values) in geoloc_dict.items():
    if (len(values) > 3):
      inout.log_message('Geocode/locality output field '+str(field)+ \
                        ' contains more than three elements: '+str(values), \
                        'warn')

  # Check if 'number' elements only contain (alpha-) numerical values - - - -
  # and also check how many numbers in an element
  #
  for (field, max_num, many_msg, label) in \
      [('wayfare_number', 2, 'More than two wayfare numbers',
        'Wayfare number'),
       ('unit_number', 1, 'More than one unit numbers', 'Unit number'),
       ('postaddress_number', 1, 'More than one postaddress numbers',
        'Postaddress number')]:
    if (field in geoloc_dict):
      v = geoloc_dict[field]
      if (len(v) > max_num):
        inout.log_message(many_msg+': '+str(v), 'warn')
      for i in v:
        if (i.isalpha()):  # Element contains only letters
          inout.log_message(label+' element contains no digits: '+str(v), \
                            'warn')
          break  # Exit for loop

  # Check if 'type' elements contain one word only, and if it's a known - - -
  # type word according to the geocode/locality look-up table
  #
  for (field, max_num, many_msg, unknown_msg, type_tag) in \
      [('wayfare_type', 1, 'More than one wayfare type',
        'Wayfare type word is not known', 'WT'),
       ('unit_type', 1, 'More than one unit type',
        'Unit type word is not known', 'UT'),
       ('institution_type', 1, 'More than one institution type',
        'Institution type word is not known', 'IT'),
       ('postaddress_type', 2, 'More than two postaddress type',
        'Postaddress type word is not known', 'PA')]:
    if (field in geoloc_dict):
      v = geoloc_dict[field]
      if (len(v) > max_num):
        inout.log_message(many_msg+': '+str(v), 'warn')
      for i in v:
        i = tuple(i.split('_'))  # Make it a tuple for the table lookup
        if (i not in config.geoloc_lookup_dict) or \
           (config.geoloc_lookup_dict[i][1].find(type_tag) < 0):
          inout.log_message(unknown_msg+': '+str(v), 'warn')
          break  # Exit for loop

  # Check if 'qualifier' elements only contain known qualifier words - - - -
  #
  for (field, unknown_msg) in \
      [('wayfare_qualifier', 'Wayfare qualifier word is not known'),
       ('locality_qualifier', 'Locality qualifier word is not known')]:
    if (field in geoloc_dict):
      v = geoloc_dict[field]
      for i in v:
        if ((i,) not in config.geoloc_lookup_dict) or \
           (config.geoloc_lookup_dict[(i,)][1].find('LQ') < 0):
          inout.log_message(unknown_msg+': '+str(v), 'warn')
          break  # Exit for loop

  return geoloc_dict
def tagdata(): """Main routine, open file, read lines, tag data records, write to out-file. USAGE: tagdata() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 5): print '***** Error: %s needs at least six arguments:' % (sys.argv[0]) print '***** - Name of the project module' print '***** - Tagging mode: "name" or "locality"' print '***** - Output training file name' print '***** - Start of block with training records' print '***** - End of block with training records' print '***** - Number of training records' print '***** plus options' raise Exception() if (config.in_file_name == config.options[2]): print '***** Error: Input and output files must differ' print '***** Input file name: ', config.in_file_name print '***** Output training file name:', config.options[2] raise Exception() first_rec = int(config.options[2]) last_rec = int(config.options[3]) num_rec = int(config.options[4]) in_file_name = config.in_file_name out_file_name = config.options[1] # Check record number values - - - - - - - - - - - - - - - - - - - - - - - - # if (int(first_rec) >= int(last_rec)) or \ ((int(num_rec)-1) > (int(last_rec)-int(first_rec))): print '***** Error: Illegal values for training records block:' print '***** - Start of block with training records:', first_rec print '***** - End of block with training records: ', last_rec print '***** - Number of training records: ', num_rec raise Exception() rec_range = last_rec - first_rec - 1 # Range of records in input file # Open input file and check number of available records - - - - - - - - - - - # try: f_in = open(in_file_name, 'r') except: inout.log_message('Cannot open input file: ' + in_file_name, 'err') raise IOError() line_count = 0 for line in f_in.xreadlines(): line_count += 1 f_in.close() if (last_rec > line_count): # Illegal value for last record print '***** Error: Illegal values for last 
training records:', last_rec print '***** File only contains', line_count, 'lines/records' raise Exception() # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - - # tag_mode = config.options[0] if (tag_mode in ['name', 'na', 'n']): tag_mode = 'name' elif (tag_mode in ['locality', 'localty', 'loc', 'l']): tag_mode = 'loc' else: print '***** Error: Illegal tagging mode:', tag_mode print '***** Must be either "name" or "locality"' raise Exception() # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file hmm_file_name = None # Default: Do not use HMM to standardise training # records retag_file_name = None # Default: Do not retag an existing training file config.nowarn = 0 # Deactivate no warning flag (print/log warning # messages) freqs_file_name = None # Default: Do not write frequencies, no -freqs option if (len(config.options) > 5): options = config.options[5:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if (options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file, 'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot write to log file: ' + config.log_file raise IOError() # Write (append) header to log file 
# f_log.write(os.linesep) f_log.write( '##################################################') f_log.write('############' + os.linesep) f_log.write('#' + os.linesep) f_log.write( "# 'pyTagData.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time()) + os.linesep) f_log.write('#' + os.linesep) f_log.write("# Input file name: " + in_file_name + os.linesep) f_log.write("# Output file name: " + out_file_name + os.linesep) f_log.write("# Tagging mode: " + tag_mode + os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-hmm'): hmm_file_name = options[1] # Get file name of the HMM to use if (hmm_file_name == out_file_name): print '***** Error: HMM file name is the same as output file name!' raise Exception() try: f_in = open(hmm_file_name, 'r') # Test if file is available except: print '***** Error: Cannot open HMM file specified in "-hmm"', print 'option:', hmm_file_name raise IOError() f_in.close() options = options[ 2:] # Remove processed '-hmm' option and file name elif (options[0] == '-retag'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-retag" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() retag_file_name = options[ 1] # Get file name of the already-tagged # file to re-process if (retag_file_name == out_file_name): print '***** Error: Retag file name is the same as output file name!' raise Exception() elif (retag_file_name == in_file_name): print '***** Error: Retag file name is the same as input file name!' raise Exception() elif (retag_file_name == hmm_file_name): print '***** Error: Retag file name is the same as HMM file name!' raise Exception() try: f_in = open(retag_file_name, 'r') # Test if file is available # Now gather record numbers and previous tags/states, as well as the # original header information. Use a simple state machine to do this. 
# tagged_recs = {} cleaned_recs = {} original_header_lines = [] state = -1 # Header lines state prevline = '' for line in f_in.xreadlines( ): # Read training file and process it line = line.strip() if (state == -1) and (len(line) == 0): # End of header lines state = 0 prevline = line continue if (state == -1) and (len(line) > 0) and (line[0] == "#"): original_header_lines.append("# " + line) prevline = line continue sline = line.split(' ') if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \ and (sline[2][0] == '(') and (sline[2][-2:] == '):'): try: rec = int(sline[1]) # Original record number tagged_recs[rec] = None cleaned_recs[rec] = None state = 1 except: pass prevline = line continue if (state == 1) and (len(line) > 0) and (line[0] != '#'): tagged_recs[rec] = line cleaned_recs[rec] = prevline state = 0 prevline = line continue if (state == 1) and (len(line) > 0): prevline = line continue f_in.close() tagged_recs_keys = tagged_recs.keys() num_rec = len( tagged_recs_keys) # Override specified numbers first_rec = 0 last_rec = line_count except: print '***** Error: Cannot open tagged training file specified', print 'in "-retag" option:', retag_file_name raise IOError() options = options[ 2:] # Remove processed '-retag' option and file name elif (options[0][:5] == '-freq'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-feqs" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() freqs_file_name = options[ 1] # File name to write the frequencies to if (freqs_file_name == out_file_name): print '***** Error: Frequency file name is the same as output', print 'file name!' raise Exception() elif (freqs_file_name == in_file_name): print '***** Error: Frequency file name is the same as input', print 'file name!' raise Exception() elif (freqs_file_name == hmm_file_name): print '***** Error: Frequency file name is the same as HMM', print 'file name!' 
raise Exception() options = options[ 2:] # Remove processed '-freqs' option and file name try: # Check if file writing is possible freqs_out = open(freqs_file_name, 'w') freqs_out.close() except: print '***** Error: Cannot write to frequency output file specified', print 'in "-freqs" option:', freqs_file_name raise IOError() else: print '***** Error: Illegal option:', options[0] raise Exception() # If specified initalise and load Hidden Markov Model (HMM) - - - - - - - - - # if (hmm_file_name != None): myhmm = simplehmm.hmm([], []) # Create new empty HMM object myhmm.load_hmm(hmm_file_name) myhmm.print_hmm() # Print HMM (according to verbose and logging level) # Open output file and write header - - - - - - - - - - - - - - - - - - - - - # try: f_out = open(out_file_name, 'w') except: inout.log_message('Cannot open output file: ' + out_file_name, 'err') raise IOError() f_out.write("# Tagged training data written by 'pyTagData.py -"+ \ " Version 0.1'"+os.linesep) f_out.write('#' + os.linesep) f_out.write('# Created ' + time.ctime(time.time()) + os.linesep) f_out.write('#' + os.linesep) f_out.write('# Input file name: ' + in_file_name + os.linesep) f_out.write('# Output file name: ' + out_file_name + os.linesep) f_out.write('#' + os.linesep) f_out.write('# Parameters:' + os.linesep) f_out.write('# - Start of block with training records: '+str(first_rec)+ \ os.linesep) f_out.write('# - End of block with training records: '+str(last_rec)+ \ os.linesep) f_out.write('# - Number of training records: '+str(num_rec)+ \ os.linesep) if (hmm_file_name != None): f_out.write('#' + os.linesep) f_out.write("# - Using HMM file '"+hmm_file_name+"' for standardisation"+ \ os.linesep) if (retag_file_name != None): f_out.write('#' + os.linesep) f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \ os.linesep) f_out.write("# Header lines from original training file follow:" + \ os.linesep) for header_line in original_header_lines: f_out.write(header_line + os.linesep) if 
(freqs_file_name != None): f_out.write('#' + os.linesep) f_out.write("# - Tag/state pattern frequencies written to file '" + \ freqs_file_name + os.linesep) f_out.write('#' + '-' * 70 + os.linesep) f_out.write(os.linesep) rec_count = 0 # Number of selected records num_rec_left = num_rec # Number of records to be selected left rec_selected = {} # Dictionary of all record numbers that were selected seq_freqs = {} # Dict to hold examples of tag/state patterns unchanged_loop_cnt = 0 # Counter of how many loops have been done # without new records being selected prev_num_rec_left = num_rec # Number of records left in the previous # interation # Due to the random nature of selecting records, and because sometimes - - - # a selected component can be empty (and is thus not used for training) # more than one iteration over the input data set is carried out. In each # iteration, records are selected randomly. # while (rec_count < num_rec): # Loop until 'num_rec' records selected # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # try: f_in = open(in_file_name, 'r') except: inout.log_message('Cannot open input file: ' + in_file_name, 'err') raise IOError() line_read = 0 # Number of read lines # Skip to start of training block - - - - - - - - - - - - - - - - - - - - - # if (first_rec > 0): for i in range(first_rec): f_in.readline() while (rec_count < num_rec) and (line_read <= (last_rec - first_rec)): line = f_in.readline() if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \ ((retag_file_name == None) and \ (num_rec_left >= random.randrange(0,rec_range,1))): line = line.strip() # Remove line separators config.curr_line = line # Make a copy of the unprocessed current line line = line.lower() # Make all characters lower case inout.log_message( ['Record number: ' + str(line_read + first_rec)], 'v1') config.curr_line_no = line_read + first_rec # Store current line number # Process line and extract content into components (name, 
geocode, etc) # [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \ inout.process_line(line) # Select component and process it - - - - - - - - - - - - - - - - - - - # if (tag_mode == 'name'): if (type(name_comp) == types.ListType): component = name_comp[0].strip( ) + ' ' + name_comp[1].strip() else: component = name_comp.strip() else: # Locality component component = geocode_comp.strip( ) + ' ' + locality_comp.strip() if (component != '') and \ (not rec_selected.has_key((line_read+first_rec))): if (tag_mode == 'name'): inout.log_message( ' Name component: |' + component + '|', 'v1') component = name.clean_name_component(component) [word_list, tag_list] = name.tag_name_component(component) else: # Locality component inout.log_message( ' Locality component: |' + component + '|', 'v1') component = locality.clean_geoloc_component(component) [word_list, tag_list] = locality.tag_geoloc_component(component) if (tag_list != []): # Only process non-empty tag lists # Append record number into dictionary of processed records # rec_selected.update({ (line_read + first_rec): (line_read + first_rec) }) # Create all permutation sequences of this tag list - - - - - - - - # tag_seq = mymath.perm_tag_sequence(tag_list) inout.log_message([' Word list: '+str(word_list), \ ' Tag list: '+str(tag_list), \ ' Tag sequences:'],'v2') # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - - - # if (hmm_file_name != None): state_seq = [ ] # List containing computed HMM state sequences max_prob = -1.0 # maximal probability for a sequence max_seq_no = -1 # Number of the seq. with the max. 
probablity # Now give tag sequences to the HMMs to compute state sequences # i = 0 for t in tag_seq: [obs_seq, prob] = myhmm.viterbi(t) state_seq.append(obs_seq) if (prob > max_prob): max_prob = prob max_seq_no = i i += 1 # Write original component and resulting tag sequences to output # f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \ '): |'+component+'|'+os.linesep) # Commented original num_len = len(str(line_read + first_rec)) + len( str(rec_count)) + 6 f_out.write('#' + num_len * ' ' + '|' + ' '.join(word_list) + '|' + os.linesep) for i in range(len(tag_seq)): # Convert each tag sequence into a string for file output # seq_string = ' ' if (hmm_file_name != None) and (i != max_seq_no): seq_string = '# ' # Comment sequences with not max. probability for j in range(len(tag_seq[i])): if (hmm_file_name != None): seq_string = seq_string+' '+tag_seq[i][j]+':'+ \ state_seq[i][j]+',' else: seq_string = seq_string + ' ' + tag_seq[i][ j] + ':,' f_out.write(seq_string[:-1] + os.linesep) # Write without , at end inout.log_message(' ' + seq_string[:-1], 'v2') if (hmm_file_name != None): f_out.write('# Maximum Viterbi probability: %0.5f'% \ (max_prob) + os.linesep) inout.log_message('Maximum Viterbi probability: %0.5f'% \ (max_prob), 'v2') if (retag_file_name != None) and (tagged_recs[line_read] != None): if (tagged_recs[line_read].strip() != seq_string[:-1].strip()): f_out.write("# Note: ***** Changed *****" + os.linesep) inout.log_message(' Note:' + \ ' ***** Changed *****','v2') f_out.write('# Was: ' + tagged_recs[line_read] + os.linesep) # Write commented original tag sequence inout.log_message('Original tag sequence: '+ \ tagged_recs[line_read],'v2') f_out.write(os.linesep) # Write an empty line inout.log_message( '', 'v1') # Print empty lines between records if (hmm_file_name != None): seq_key = seq_string[: -1] # Add sequence to dictionary if (seq_freqs.has_key(seq_key)): seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \ max_prob]) else: 
seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \ max_prob]] rec_count += 1 # Print process indicator message # if (config.proc_ind >= 0) and (rec_count > 0): if (rec_count % config.proc_ind == 0): print 'Processed line', rec_count, 'of', num_rec line_read += 1 f_in.close() num_rec_left = num_rec - rec_count if (prev_num_rec_left == num_rec_left): # No new records selected unchanged_loop_cnt += 1 prev_num_rec_left = num_rec_left # Set to current value if (unchanged_loop_cnt > 5): # Do five loops maximal without selecting # new records config.curr_line_no = -1 # Set to illegal/empty values, as warning is config.curr_line = '' # not related to the current input line inout.log_message(['Can not select more than '+str(rec_count)+ \ ' records for training.', \ 'This is probably due to empty input components.', \ 'Please reduce value of "num_rec" or increase ' + \ 'range','between "first_rec" and "last_rec".'],'warn') break if (num_rec_left < 10): # Only 10 records left to select num_rec_left = num_rec + 1 # Set to more than 100% probablity elif (num_rec_left < (num_rec / 100.0)): # Less than 1% records left num_rec_left = int(num_rec / 100.0) # Set to 1% f_out.close() # If specified, save Viterbi frequencies to a file - - - - - - - - - - - - - # if (freqs_file_name != None): freqs_out = open(freqs_file_name, 'w') # Open frequency file for writing freqs_out.write('# Frequency listing of tag/state patterns written by') freqs_out.write('"pyTagData.py - Version 0.1"' + os.linesep) freqs_out.write('#' + os.linesep) freqs_out.write('# Created ' + time.ctime(time.time()) + os.linesep) freqs_out.write('#' + os.linesep) freqs_out.write("# Input file name: " + in_file_name + os.linesep) freqs_out.write("# Output file name: " + out_file_name + os.linesep) freqs_out.write(os.linesep) freqs_out.write('# Parameters:' + os.linesep) freqs_out.write('# - Start of block with training records: '+ \ str(first_rec)+os.linesep) freqs_out.write('# - End of block with training records: '+ 
\ str(last_rec)+os.linesep) freqs_out.write('# - Number of training records: '+ \ str(num_rec)+os.linesep) if (hmm_file_name != None): freqs_out.write('#' + os.linesep) freqs_out.write("# - Using HMM file '"+hmm_file_name+ \ "' for standardisation"+os.linesep) if (retag_file_name != None): freqs_out.write('#' + os.linesep) freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \ "'"+os.linesep) freqs_out.write('#' + '-' * 70 + os.linesep) freqs_out.write(os.linesep) sorted_seq_freqs = [ ] # Now sort sequences according to their fruequencies for key in seq_freqs.keys(): sorted_seq_freqs.append((len(seq_freqs[key]), key)) sorted_seq_freqs.sort() for skey in sorted_seq_freqs: key = skey[1] freqs_out.write('# Pattern: ' + str(key) + os.linesep) freqs_out.write('# Frequency: ' + str(skey[0]) + os.linesep) examples = seq_freqs[key] freqs_out.write('# Maximum Viterbi probability: '+ \ str(examples[0][1])+os.linesep) freqs_out.write('# Examples: ' + os.linesep) for example in examples: freqs_out.write('# ' + str(example[0]) + os.linesep) freqs_out.write(str(key) + os.linesep) freqs_out.write(os.linesep) freqs_out.close() inout.log_message(['Read '+str(line_read)+' lines, processed '+ \ str(rec_count)+' lines', 'End.'],'v1')