def tag_geoloc_component(geoloc_str):
  """Tag a geocode locality input component string and make a list.

  USAGE:
    [word_list, tag_list] = tag_geoloc_component(loc_str)

  ARGUMENTS:
    geoloc_str  A string containing the geocode and/or locality components

  DESCRIPTION:
    This routine cleans the input string and extracts words, numbers and
    separators into a list. Each element of this list is assigned one or
    more tags. A 'greedy tagger' is applied, which checks sequences of list
    elements in the geocode/locality lookup table (longer sequences first)
    and replaces them with the string and tag from the lookup-table if found.

    The routine returns two lists: words and their tags
  """

  # First, split input string into elements at spaces - - - - - - - - - - - -
  #
  org_list = geoloc_str.split()  # The original list from the input string

  inout.log_message('  Initial word list: '+str(org_list),'v2')

  tag_list  = []  # The initially empty list of tags
  word_list = []  # The initially empty list of words

  while (org_list != []):  # As long as not all elements have been processed
    tmp_list = org_list[:config.geoloc_dict_seq_len]  # Extract longest
                                                      # sub-list
    tmp_val = []  # Start with empty value
    tmp_key = tuple(tmp_list)

    while (tmp_key != ()):  # As long as key not empty and not found in lookup
      if (config.geoloc_lookup_dict.has_key(tmp_key)):
        tmp_val = config.geoloc_lookup_dict[tmp_key]
        break
      tmp_key = tmp_key[:-1]  # Remove last element in key

    if (tmp_val != []):  # A value has been found in the dictionary
      tmp_len = len(tmp_key)  # Length of found sequence

      if (tmp_val[0] != ''):  # It's not an empty value
        word_list.append(tmp_val[0])  # Append corrected word (or sequence)
        tag_list.append(tmp_val[1])   # Append tag or tags

    else:  # No value has been found in the lookup dictionary, try other tags

      tmp_val = org_list[0]  # Value is first element in the original list
      tmp_len = 1

      if (tmp_val.isdigit()):  # Element is a number
        word_list.append(tmp_val)
        if (len(tmp_val) == 4):
          tag_list.append('N4')
        else:
          tag_list.append('NU')

      elif (not tmp_val.isalpha()) and tmp_val.isalnum():  # Alpha-numeric
        word_list.append(tmp_val)
        tag_list.append('AN')

      elif (tmp_val == '-'):  # Element is a hyphen
        word_list.append(tmp_val)
        tag_list.append('HY')

      elif (tmp_val == ','):  # Element is a comma
        word_list.append(tmp_val)
        tag_list.append('CO')

      elif (tmp_val == '|'):  # Element is a vertical bar
        word_list.append(tmp_val)
        tag_list.append('VB')

      else:  # An unknown element
        word_list.append(tmp_val)
        tag_list.append('UN')

    # Finally remove the processed elements from the original element list
    #
    org_list = org_list[tmp_len:]  # Remove processed elements

  return [word_list, tag_list]
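# Example (an illustrative sketch, not part of the original module): how the
# greedy tagger consumes a lookup table. The two 'config' attributes set
# below and the lookup-table tag strings ('LN', 'TR') are assumptions made
# up for this demo only; in the real module they come from the project
# configuration.
#
#   import config
#   config.geoloc_dict_seq_len = 3
#   config.geoloc_lookup_dict  = {('sydney',):             ['sydney','LN'],
#                                 ('new','south','wales'): ['nsw','TR']}
#
#   print tag_geoloc_component('2006 sydney new south wales')
#   # -> [['2006', 'sydney', 'nsw'], ['N4', 'LN', 'TR']]
#   # '2006' gets the four-digit number tag 'N4'; the three-word sequence
#   # 'new south wales' is matched first (longest sequence) and replaced by
#   # its lookup-table entry 'nsw' in a single step.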
# Check if definition of input components is correct with file types
#
input_values = input_component.values()

input_len = -1  # Length of the input (either in number of fields (CSV and
                # TAB files) or in characters (COL files)

output_keys = output_field.keys()  # Check if 'original_input' is in output
                                   # fields, and if so check for correctness
for k in output_keys:
  if (k[:14] == 'original_input'):
    v = k[14:].strip()
    if (v != ''):  # There is a field or column range given
      if (v[0] == '[') and (v[-1] == ']'):
        v = v[1:-1]  # Remove brackets
      else:
        inout.log_message('Wrong input component definition: '+str(k)+ \
                          ' for "original_input" output field','err')
        raise Exception()
      if (v[0] == '(') and (v[-1] == ')'):  # It's a tuple
        v = v[1:-1]  # Remove tuple brackets
      v = v.split(',')  # Make a list
      for i in range(len(v)):
        v[i] = int(v[i])  # Make integers
      if (len(v) == 1):  # One integer only, must be a field number
        input_values.append(v)  # Append 'original_input' field number
      elif (len(v) == 2):  # Two integers, must be a column range
        input_values.append([(v[0],v[1])])  # Append as a tuple
      else:
        inout.log_message('Wrong input component value: '+str(k)+ \
                          ' for "original_input" output field','err')
        raise Exception()
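# Example (an illustrative sketch, not part of the original code): output
# field keys that pass the check above. The concrete keys are made up here;
# 'original_input' may optionally be followed by a bracketed field number or
# a bracketed (start,end) column-range tuple.
#
#   'original_input'            ->  no field/column restriction given
#   'original_input [5]'        ->  input_values gets [5]        (field number)
#   'original_input [(10,30)]'  ->  input_values gets [(10,30)]  (column range)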
def get_geoloc_hmm(word_list, tag_list):
  """Process input using a HMM to extract geocode and locality output fields.

  USAGE:
    geoloc_dict = get_geoloc_hmm(word_list, tag_list)

  ARGUMENTS:
    word_list  List of words as produced by clean_tag_locality()
    tag_list   Corresponding list of tags as produced by
               clean_tag_locality()

  DESCRIPTION:
    The routine returns a dictionary with the parsed and extracted output
    fields for both the locality and geocode components. A Hidden Markov
    Model (HMM) is used for this task.

    The dictionary returned can contain the following key words:
    - wayfare_number
    - wayfare_name
    - wayfare_qualifier
    - wayfare_type
    - unit_number
    - unit_type
    - property_name
    - institution_name
    - institution_type
    - postaddress_number
    - postaddress_type
    - locality_name
    - locality_qualifier
    - postcode
    - territory
    - country
    - geoloc_hmm_proba (the probability returned by the Viterbi algorithm
                        for the most likely HMM state sequence)
  """

  # First, create all permutations of the input tag sequence
  #
  tag_list_seq = mymath.perm_tag_sequence(tag_list)

  msg = ['  Input tag sequence: '+str(tag_list), '  Output tag sequences:']
  for t in tag_list_seq:
    msg.append('    '+str(t))
  inout.log_message(msg,'v2')

  # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - -
  # and keep the one with highest probability
  #
  max_prob      = -1.0
  best_obs_seq  = []
  best_tag_list = []

  for t in tag_list_seq:
    [obs_seq, prob] = config.geoloc_hmm.viterbi(t)
    if (prob > max_prob):
      best_obs_seq  = obs_seq
      best_tag_list = t
      max_prob      = prob

    inout.log_message('  Probability '+str(prob)+' for sequence '+str(t), \
                      'v2')

  inout.log_message(['  Best observation sequence: '+str(best_obs_seq), \
                     '          with tag sequence: '+str(best_tag_list)], \
                    'v2')

  # Now process the observation sequence and add elements into dictionary - -
  #
  tag_list_len  = len(tag_list)
  norm_max_prob = max_prob / float(tag_list_len)  # Normalise max. probability
  geoloc_dict   = {'geoloc_hmm_proba': [str(norm_max_prob)]}

  list_len = len(word_list)
  for i in range(list_len):  # Loop over words and states
    w = word_list[i]
    s = best_obs_seq[i]

    # Do not output commas, vertical bars and hyphens - - - - - - - - - - - -
    #
    if (w in ['|', ',', '-', '/']):
      pass

    elif (s == 'wfnu'):  # Wayfare number - - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('wayfare_number', [])
      v.append(w)
      geoloc_dict.update({'wayfare_number': v})

    elif (s in ['wfna1','wfna2','wfna3']):  # Wayfare name - - - - - - - - - -
      v = geoloc_dict.get('wayfare_name', [])
      v.append(w)
      geoloc_dict.update({'wayfare_name': v})

    elif (s == 'wfql'):  # Wayfare qualifier - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('wayfare_qualifier', [])
      v.append(w)
      geoloc_dict.update({'wayfare_qualifier': v})

    elif (s == 'wfty'):  # Wayfare type - - - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('wayfare_type', [])
      v.append(w)
      geoloc_dict.update({'wayfare_type': v})

    elif (s == 'unnu'):  # Unit number - - - - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('unit_number', [])
      v.append(w)
      geoloc_dict.update({'unit_number': v})

    elif (s == 'unty'):  # Unit type - - - - - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('unit_type', [])
      v.append(w)
      geoloc_dict.update({'unit_type': v})

    elif (s in ['prna1','prna2']):  # Property name - - - - - - - - - - - - -
      v = geoloc_dict.get('property_name', [])
      v.append(w)
      geoloc_dict.update({'property_name': v})

    elif (s in ['inna1','inna2']):  # Institution name - - - - - - - - - - - -
      v = geoloc_dict.get('institution_name', [])
      v.append(w)
      geoloc_dict.update({'institution_name': v})

    elif (s == 'inty'):  # Institution type - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('institution_type', [])
      v.append(w)
      geoloc_dict.update({'institution_type': v})

    elif (s == 'panu'):  # Postal address number - - - - - - - - - - - - - - -
      v = geoloc_dict.get('postaddress_number', [])
      v.append(w)
      geoloc_dict.update({'postaddress_number': v})

    elif (s == 'paty'):  # Postal address type - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('postaddress_type', [])
      v.append(w)
      geoloc_dict.update({'postaddress_type': v})

    elif (s in ['loc1','loc2']):  # Locality name - - - - - - - - - - - - - -
      v = geoloc_dict.get('locality_name', [])
      v.append(w)
      geoloc_dict.update({'locality_name': v})

    elif (s == 'locql'):  # Locality qualifier - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('locality_qualifier', [])
      v.append(w)
      geoloc_dict.update({'locality_qualifier': v})

    elif (s == 'pc'):  # Postcode - - - - - - - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('postcode', [])
      v.append(w)
      geoloc_dict.update({'postcode': v})

    elif (s in ['ter1','ter2']):  # Territory - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('territory', [])
      v.append(w)
      geoloc_dict.update({'territory': v})

    elif (s in ['cntr1','cntr2']):  # Country - - - - - - - - - - - - - - - -
      v = geoloc_dict.get('country', [])
      v.append(w)
      geoloc_dict.update({'country': v})

    else:  # Should never happen
      msg = ['This should never happen!', '  Tag: '+str(s), '  Word: '+w, \
             '  Word list: '+str(word_list), '  Tag list: '+str(tag_list)]
      inout.log_message(msg,'warn')

  # Check if concatenated locality and territory words are in lookup-table - -
  #
  if (geoloc_dict.has_key('locality_name')):
    loc = geoloc_dict['locality_name']
    if (len(loc) > 1):  # Locality contains more than one word
      loc_tuple = tuple(loc)  # Make it a tuple
      if (config.geoloc_lookup_dict.has_key(loc_tuple)):
        new_loc = config.geoloc_lookup_dict[loc_tuple][0]
        geoloc_dict.update({'locality_name': [new_loc]})

  if (geoloc_dict.has_key('territory')):
    terr = geoloc_dict['territory']
    if (len(terr) > 1):  # Territory contains more than one word
      terr_tuple = tuple(terr)  # Make it a tuple
      if (config.geoloc_lookup_dict.has_key(terr_tuple)):
        new_terr = config.geoloc_lookup_dict[terr_tuple][0]
        geoloc_dict.update({'territory': [new_terr]})

  if (geoloc_dict.has_key('country')):
    cntr = geoloc_dict['country']
    if (len(cntr) > 1):  # Country contains more than one word
      cntr_tuple = tuple(cntr)  # Make it a tuple
      if (config.geoloc_lookup_dict.has_key(cntr_tuple)):
        new_cntr = config.geoloc_lookup_dict[cntr_tuple][0]
        geoloc_dict.update({'country': [new_cntr]})

  # Finally do some tests on the output fields - - - - - - - - - - - - - - - -
  #
  geoloc_items = geoloc_dict.items()

  # Check if a value list has more than three elements, if so print out
  #
  for i in geoloc_items:
    if (len(i[1]) > 3):
      inout.log_message('Geocode/locality output field '+str(i[0])+ \
                        ' contains more than three elements: '+str(i[1]), \
                        'warn')

  # Check if 'number' elements only contain (alpha-) numerical values - - - -
  # and also check how many numbers in an element
  #
  if (geoloc_dict.has_key('wayfare_number')):  # Check how many wayfare
                                               # numbers
    v = geoloc_dict['wayfare_number']
    if (len(v) > 2):
      inout.log_message('More than two wayfare numbers: '+str(v),'warn')
    for i in v:
      if (i.isalpha()):  # Element contains only letters
        inout.log_message('Wayfare number element contains no digits: '+ \
                          str(v),'warn')
        break  # Exit for loop

  if (geoloc_dict.has_key('unit_number')):  # Check how many unit numbers
    v = geoloc_dict['unit_number']
    if (len(v) > 1):
      inout.log_message('More than one unit number: '+str(v),'warn')
    for i in v:
      if (i.isalpha()):  # Element contains only letters
        inout.log_message('Unit number element contains no digits: '+ \
                          str(v),'warn')
        break  # Exit for loop

  if (geoloc_dict.has_key('postaddress_number')):  # Check postaddress
                                                   # numbers
    v = geoloc_dict['postaddress_number']
    if (len(v) > 1):
      inout.log_message('More than one postaddress number: '+str(v),'warn')
    for i in v:
      if (i.isalpha()):  # Element contains only letters
        inout.log_message('Postaddress number element contains no digits: '+ \
                          str(v),'warn')
        break  # Exit for loop

  # Check if 'type' elements contain one word only - - - - - - - - - - - - - -
  # and if it's a known type word
  #
  if (geoloc_dict.has_key('wayfare_type')):  # Check wayfare type
    v = geoloc_dict['wayfare_type']
    if (len(v) > 1):
      inout.log_message('More than one wayfare type: '+str(v),'warn')
    for i in v:
      i = tuple(i.split('_'))  # Make it a tuple
      if (not config.geoloc_lookup_dict.has_key(i)) or \
         (config.geoloc_lookup_dict.has_key(i) and \
          (config.geoloc_lookup_dict[i][1].find('WT') < 0)):
        inout.log_message('Wayfare type word is not known: '+str(v),'warn')
        break  # Exit for loop

  if (geoloc_dict.has_key('unit_type')):  # Check unit type
    v = geoloc_dict['unit_type']
    if (len(v) > 1):
      inout.log_message('More than one unit type: '+str(v),'warn')
    for i in v:
      i = tuple(i.split('_'))  # Make it a tuple
      if (not config.geoloc_lookup_dict.has_key(i)) or \
         (config.geoloc_lookup_dict.has_key(i) and \
          (config.geoloc_lookup_dict[i][1].find('UT') < 0)):
        inout.log_message('Unit type word is not known: '+str(v),'warn')
        break  # Exit for loop

  if (geoloc_dict.has_key('institution_type')):  # Check institution type
    v = geoloc_dict['institution_type']
    if (len(v) > 1):
      inout.log_message('More than one institution type: '+str(v),'warn')
    for i in v:
      i = tuple(i.split('_'))  # Make it a tuple
      if (not config.geoloc_lookup_dict.has_key(i)) or \
         (config.geoloc_lookup_dict.has_key(i) and \
          (config.geoloc_lookup_dict[i][1].find('IT') < 0)):
        inout.log_message('Institution type word is not known: '+str(v), \
                          'warn')
        break  # Exit for loop

  if (geoloc_dict.has_key('postaddress_type')):  # Check postaddress type
    v = geoloc_dict['postaddress_type']
    if (len(v) > 2):
      inout.log_message('More than two postaddress types: '+str(v),'warn')
    for i in v:
      i = tuple(i.split('_'))  # Make it a tuple
      if (not config.geoloc_lookup_dict.has_key(i)) or \
         (config.geoloc_lookup_dict.has_key(i) and \
          (config.geoloc_lookup_dict[i][1].find('PA') < 0)):
        inout.log_message('Postaddress type word is not known: '+str(v), \
                          'warn')
        break  # Exit for loop

  # Check if 'qualifier' elements only contain known qualifier words - - - - -
  #
  if (geoloc_dict.has_key('wayfare_qualifier')):  # Check wayfare qualifier
    v = geoloc_dict['wayfare_qualifier']
    for i in v:
      if (not config.geoloc_lookup_dict.has_key((i,))) or \
         (config.geoloc_lookup_dict.has_key((i,)) and \
          (config.geoloc_lookup_dict[(i,)][1].find('LQ') < 0)):
        inout.log_message('Wayfare qualifier word is not known: '+str(v), \
                          'warn')
        break  # Exit for loop

  if (geoloc_dict.has_key('locality_qualifier')):  # Check locality qualifier
    v = geoloc_dict['locality_qualifier']
    for i in v:
      if (not config.geoloc_lookup_dict.has_key((i,))) or \
         (config.geoloc_lookup_dict.has_key((i,)) and \
          (config.geoloc_lookup_dict[(i,)][1].find('LQ') < 0)):
        inout.log_message('Locality qualifier word is not known: '+str(v), \
                          'warn')
        break  # Exit for loop

  return geoloc_dict
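# Example (a hedged sketch, not part of the original module): how the two
# locality routines are typically combined. It assumes the project
# configuration has been loaded, so that config.geoloc_lookup_dict and
# config.geoloc_hmm (a simplehmm.hmm object with a trained model) exist.
# The input string and the resulting output fields are illustrative only.
#
#   [word_list, tag_list] = tag_geoloc_component('42 miller street sydney 2000')
#   geoloc_dict = get_geoloc_hmm(word_list, tag_list)
#
#   # geoloc_dict could then look like:
#   # {'wayfare_number': ['42'],     'wayfare_name':     ['miller'],
#   #  'wayfare_type':   ['street'], 'locality_name':    ['sydney'],
#   #  'postcode':       ['2000'],   'geoloc_hmm_proba': ['0.0123']}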
def trainhmm():
  """Main routine, open file, read lines, train HMM and save it to file.

  USAGE:
    trainhmm()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness - - - - - - - - -
  #
  if (len(config.options) < 3):
    print '***** Error: %s needs at least four arguments:' % (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Tagging mode: "name" or "locality"'
    print '*****        - Input training file name'
    print '*****        - HMM output file name'
    print '*****        plus options'
    raise Exception()

  if (config.options[1] == config.options[2]):
    print '*** Error: Input and output files must differ'
    print '***        Input training file name:', config.options[1]
    print '***        HMM output file name:    ', config.options[2]
    raise Exception()

  in_file_name  = config.options[1]
  hmm_file_name = config.options[2]

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','loc','l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '*****        Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - -
  #
  config.verbose = 0     # Default: No verbose output
  config.logging = 0     # Default: No logging into a file
  smoothing      = None  # Default: No smoothing
  config.nowarn  = 0     # Deactivate no warning flag (print/log warning
                         # messages)

  if (len(config.options) > 3):
    options = config.options[3:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file
                                      # name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed '-l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: '+config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############'+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# 'pyTrainHMM.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write('# Input file name: '+in_file_name+os.linesep)
        f_log.write('# HMM file name:   '+hmm_file_name+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-s'):
        smoothing = options[1]  # Set to do a HMM smoothing
        if (smoothing in ['l','la','lap','laplac','laplace']):
          smoothing = 'laplace'
        elif (smoothing in ['a','ad','abs','absd','absdis','absdisc', \
                            'absdiscount']):
          smoothing = 'absdiscount'
        else:  # Illegal value
          print "*** Error: Illegal value for 'smoothing' argument:", \
                smoothing
          print "***        Possible are: 'laplace' or 'absdiscount'"
          raise Exception()
        options = options[2:]  # Remove processed option

      else:
        print '*** Error: Illegal option:', options[0]
        raise Exception()

  # Get HMM states and observations from configuration module - - - - - - - -
  #
  if (tag_mode == 'name'):
    state_list = config.name_hmm_states
    obser_list = config.name_hmm_obser
  else:
    state_list = config.geoloc_hmm_states
    obser_list = config.geoloc_hmm_obser

  # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0  # Counter for lines read
  rec_count  = 0  # Counter for training records read

  # Read lines, discard comment lines and process training data lines - - - -
  #
  training_data = []  # List of training records
  train_list = []  # List of training sequences (dictionaries), extracted
                   # from training data

  for line in xreadlines.xreadlines(f_in):
    if (line[0] != '#') and (line.strip() != ''):
      # Line must contain a training record
      #
      line = line.strip()  # Remove line separators
      config.curr_line = line  # Make a copy of the unprocessed current line

      line_list = line.split(',')  # Split into a list of elements
      line_data = []  # Training data list for one training record

      inout.log_message(['Record number: '+str(rec_count)],'v1')
      config.curr_line_no = line_count  # Store current line number

      for elem in line_list:
        [k,v] = elem.split(':')  # Split into tag (key) and state (value)
        tag   = k.strip()
        state = v.strip()
        line_data.append((state,tag))

        if (state not in state_list):
          msg = ['Illegal state name in training record: '+state, \
                 'Line: '+str(line_count)+', record: '+str(rec_count), \
                 'Possible values: '+str(state_list)]
          inout.log_message(msg,'err')
          raise Exception()
        if (tag not in obser_list):
          msg = ['Illegal observation (tag) name in training record: '+tag, \
                 'Line: '+str(line_count)+', record: '+str(rec_count), \
                 'Possible values: '+str(obser_list)]
          inout.log_message(msg,'err')
          raise Exception()

      inout.log_message('  Training record '+str(rec_count)+':'+ \
                        str(line_data),'v1')

      train_list.append(line_data)
      rec_count += 1
      inout.log_message('','v1')  # Print empty lines between records

    line_count += 1

  # Close input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  f_in.close()

  inout.log_message('','v1')  # Print empty lines between records

  # Initialise HMM and train it with training data - - - - - - - - - - - - - -
  #
  myhmm = simplehmm.hmm(state_list, obser_list)

  myhmm.train(train_list,smoothing)
  myhmm.print_hmm()

  # Save trained HMM - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  myhmm.save_hmm(hmm_file_name)

  inout.log_message(['Read '+str(line_count)+' lines, processed '+ \
                     str(rec_count)+' training records', 'End.'],'v1')
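# Example (a hedged sketch): the training file format trainhmm() expects.
# Every non-comment, non-empty line is a comma-separated list of 'tag:state'
# pairs; the concrete tags and states below are illustrative and must appear
# in the state/observation lists of the project configuration module.
#
#   # Comment lines start with '#' and are skipped
#   NU:wfnu, UN:wfna1, WT:wfty, LN:loc1, PC:pc
#
# A possible invocation, following the argument list printed by the error
# message above (file names are made up):
#
#   python pyTrainHMM.py project.py locality geoloc-train.csv geoloc.hmm \
#          -s laplace -v1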
def tagdata():
  """Main routine, open file, read lines, tag data records, write to out-file.

  USAGE:
    tagdata()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness - - - - - - - - -
  #
  if (len(config.options) < 5):
    print '***** Error: %s needs at least six arguments:' % (sys.argv[0])
    print '*****        - Name of the project module'
    print '*****        - Tagging mode: "name" or "locality"'
    print '*****        - Output training file name'
    print '*****        - Start of block with training records'
    print '*****        - End of block with training records'
    print '*****        - Number of training records'
    print '*****        plus options'
    raise Exception()

  if (config.in_file_name == config.options[1]):
    print '***** Error: Input and output files must differ'
    print '*****        Input file name:          ', config.in_file_name
    print '*****        Output training file name:', config.options[1]
    raise Exception()

  first_rec = int(config.options[2])
  last_rec  = int(config.options[3])
  num_rec   = int(config.options[4])
  in_file_name  = config.in_file_name
  out_file_name = config.options[1]

  # Check record number values - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (int(first_rec) >= int(last_rec)) or \
     ((int(num_rec)-1) > (int(last_rec)-int(first_rec))):
    print '***** Error: Illegal values for training records block:'
    print '*****        - Start of block with training records:', first_rec
    print '*****        - End of block with training records:  ', last_rec
    print '*****        - Number of training records:          ', num_rec
    raise Exception()

  rec_range = last_rec-first_rec-1  # Range of records in input file

  # Open input file and check number of available records - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name,'r')
  except:
    inout.log_message('Cannot open input file: '+in_file_name,'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if (last_rec > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for last training records:', last_rec
    print '*****        File only contains', line_count, 'lines/records'
    raise Exception()

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name','na','n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality','localty','loc','l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '*****        Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - -
  #
  config.verbose  = 0     # Default: No verbose output
  config.logging  = 0     # Default: No logging into a file
  hmm_file_name   = None  # Default: Do not use HMM to standardise training
                          # records
  retag_file_name = None  # Default: Do not retag an existing training file
  config.nowarn   = 0     # Deactivate no warning flag (print/log warning
                          # messages)
  freqs_file_name = None  # Default: Do not write frequencies, no -freqs
                          # option

  if (len(config.options) > 5):
    options = config.options[5:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1  # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1  # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2  # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file
                                      # name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]  # Remove file_name
        options = options[1:]  # Remove processed '-l' option only

        try:
          f_log = open(config.log_file,'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: '+config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############'+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write("# 'pyTagData.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time())+os.linesep)
        f_log.write('#'+os.linesep)
        f_log.write('# Input file name:  '+in_file_name+os.linesep)
        f_log.write('# Output file name: '+out_file_name+os.linesep)
        f_log.write('# Tagging mode:     '+tag_mode+os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-hmm'):
        hmm_file_name = options[1]  # Get file name of the HMM to use
        if (hmm_file_name == out_file_name):
          print '***** Error: HMM file name is the same as output file name!'
          raise Exception()
        try:
          f_in = open(hmm_file_name,'r')  # Test if file is available
        except:
          print '***** Error: Cannot open HMM file specified in "-hmm"',
          print 'option:', hmm_file_name
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed '-hmm' option and file name

      elif (options[0] == '-retag'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-retag" option can only be used together',
          print 'with "-hmm" option (which is not given).'
          raise Exception()
        retag_file_name = options[1]  # Get file name of the already-tagged
                                      # file to re-process
        if (retag_file_name == out_file_name):
          print '***** Error: Retag file name is the same as output file',
          print 'name!'
          raise Exception()
        elif (retag_file_name == in_file_name):
          print '***** Error: Retag file name is the same as input file',
          print 'name!'
          raise Exception()
        elif (retag_file_name == hmm_file_name):
          print '***** Error: Retag file name is the same as HMM file name!'
          raise Exception()

        try:
          f_in = open(retag_file_name,'r')  # Test if file is available

          # Now gather record numbers and previous tags/states, as well as
          # the original header information. Use a simple state machine to
          # do this.
          #
          tagged_recs  = {}
          cleaned_recs = {}
          original_header_lines = []
          state = -1  # Header lines state
          prevline = ''
          for line in f_in.xreadlines():  # Read training file and process it
            line = line.strip()
            if (state == -1) and (len(line) == 0):  # End of header lines
              state = 0
              prevline = line
              continue
            if (state == -1) and (len(line) > 0) and (line[0] == '#'):
              original_header_lines.append('# '+line)
              prevline = line
              continue
            sline = line.split(' ')
            if (len(sline) > 2) and (len(sline[2]) > 3) and \
               (sline[0] == '#') and (sline[2][0] == '(') and \
               (sline[2][-2:] == '):'):
              try:
                rec = int(sline[1])  # Original record number
                tagged_recs[rec]  = None
                cleaned_recs[rec] = None
                state = 1
              except:
                pass
              prevline = line
              continue
            if (state == 1) and (len(line) > 0) and (line[0] != '#'):
              tagged_recs[rec]  = line
              cleaned_recs[rec] = prevline
              state = 0
              prevline = line
              continue
            if (state == 1) and (len(line) > 0):
              prevline = line
              continue
          f_in.close()

          tagged_recs_keys = tagged_recs.keys()
          num_rec = len(tagged_recs_keys)  # Override specified numbers
          first_rec = 0
          last_rec = line_count
        except:
          print '***** Error: Cannot open tagged training file specified',
          print 'in "-retag" option:', retag_file_name
          raise IOError()
        options = options[2:]  # Remove processed '-retag' option and file
                               # name

      elif (options[0][:5] == '-freq'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-freqs" option can only be used together',
          print 'with "-hmm" option (which is not given).'
          raise Exception()
        freqs_file_name = options[1]  # File name to write the frequencies to
        if (freqs_file_name == out_file_name):
          print '***** Error: Frequency file name is the same as output',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == in_file_name):
          print '***** Error: Frequency file name is the same as input',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == hmm_file_name):
          print '***** Error: Frequency file name is the same as HMM',
          print 'file name!'
          raise Exception()
        options = options[2:]  # Remove processed '-freqs' option and file
                               # name
        try:  # Check if file writing is possible
          freqs_out = open(freqs_file_name,'w')
          freqs_out.close()
        except:
          print '***** Error: Cannot write to frequency output file',
          print 'specified in "-freqs" option:', freqs_file_name
          raise IOError()

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # If specified initialise and load Hidden Markov Model (HMM) - - - - - - - -
  #
  if (hmm_file_name != None):
    myhmm = simplehmm.hmm([],[])  # Create new empty HMM object
    myhmm.load_hmm(hmm_file_name)
    myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

  # Open output file and write header - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_out = open(out_file_name,'w')
  except:
    inout.log_message('Cannot open output file: '+out_file_name,'err')
    raise IOError()

  f_out.write("# Tagged training data written by 'pyTagData.py -"+ \
              " Version 0.1'"+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Created '+time.ctime(time.time())+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Input file name:  '+in_file_name+os.linesep)
  f_out.write('# Output file name: '+out_file_name+os.linesep)
  f_out.write('#'+os.linesep)
  f_out.write('# Parameters:'+os.linesep)
  f_out.write('# - Start of block with training records: '+str(first_rec)+ \
              os.linesep)
  f_out.write('# - End of block with training records:   '+str(last_rec)+ \
              os.linesep)
  f_out.write('# - Number of training records:           '+str(num_rec)+ \
              os.linesep)
  if (hmm_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Using HMM file '"+hmm_file_name+ \
                "' for standardisation"+os.linesep)
  if (retag_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Reprocessing training file '"+retag_file_name+"'"+ \
                os.linesep)
    f_out.write('# Header lines from original training file follow:'+ \
                os.linesep)
    for header_line in original_header_lines:
      f_out.write(header_line+os.linesep)
  if (freqs_file_name != None):
    f_out.write('#'+os.linesep)
    f_out.write("# - Tag/state pattern frequencies written to file '"+ \
                freqs_file_name+"'"+os.linesep)
  f_out.write('#'+'-'*70+os.linesep)
  f_out.write(os.linesep)

  rec_count    = 0        # Number of selected records
  num_rec_left = num_rec  # Number of records to be selected left
  rec_selected = {}       # Dictionary of all record numbers that were
                          # selected
  seq_freqs = {}  # Dict to hold examples of tag/state patterns

  unchanged_loop_cnt = 0  # Counter of how many loops have been done
                          # without new records being selected
  prev_num_rec_left = num_rec  # Number of records left in the previous
                               # iteration

  # Due to the random nature of selecting records, and because sometimes - - -
  # a selected component can be empty (and is thus not used for training)
  # more than one iteration over the input data set is carried out. In each
  # iteration, records are selected randomly.
  #
  while (rec_count < num_rec):  # Loop until 'num_rec' records selected

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
      f_in = open(in_file_name,'r')
    except:
      inout.log_message('Cannot open input file: '+in_file_name,'err')
      raise IOError()

    line_read = 0  # Number of read lines

    # Skip to start of training block - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
      for i in range(first_rec):
        f_in.readline()

    while (rec_count < num_rec) and (line_read <= (last_rec-first_rec)):
      line = f_in.readline()

      if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
         ((retag_file_name == None) and \
          (num_rec_left >= random.randrange(0,rec_range,1))):

        line = line.strip()  # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current
                                 # line
        line = line.lower()  # Make all characters lower case

        inout.log_message(['Record number: '+str(line_read+first_rec)],'v1')
        config.curr_line_no = line_read+first_rec  # Store current line
                                                   # number

        # Process line and extract content into components (name, geocode,
        # etc.)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
          inout.process_line(line)

        # Select component and process it - - - - - - - - - - - - - - - - - -
        #
        if (tag_mode == 'name'):
          if (type(name_comp) == types.ListType):
            component = name_comp[0].strip()+' '+name_comp[1].strip()
          else:
            component = name_comp.strip()
        else:  # Locality component
          component = geocode_comp.strip()+' '+locality_comp.strip()

        if (component != '') and \
           (not rec_selected.has_key((line_read+first_rec))):

          if (tag_mode == 'name'):
            inout.log_message('  Name component: |'+component+'|','v1')
            component = name.clean_name_component(component)
            [word_list, tag_list] = name.tag_name_component(component)
          else:  # Locality component
            inout.log_message('  Locality component: |'+component+'|','v1')
            component = locality.clean_geoloc_component(component)
            [word_list, tag_list] = locality.tag_geoloc_component(component)

          if (tag_list != []):  # Only process non-empty tag lists

            # Append record number into dictionary of processed records
            #
            rec_selected.update({(line_read+first_rec): \
                                 (line_read+first_rec)})

            # Create all permutation sequences of this tag list - - - - - - -
            #
            tag_seq = mymath.perm_tag_sequence(tag_list)

            inout.log_message(['  Word list: '+str(word_list), \
                               '  Tag list:  '+str(tag_list), \
                               '  Tag sequences:'],'v2')

            # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - -
            #
            if (hmm_file_name != None):
              state_seq  = []    # List containing computed HMM state
                                 # sequences
              max_prob   = -1.0  # Maximal probability for a sequence
              max_seq_no = -1    # Number of the seq. with the max.
                                 # probability

              # Now give tag sequences to the HMM to compute state sequences
              #
              i = 0
              for t in tag_seq:
                [obs_seq, prob] = myhmm.viterbi(t)
                state_seq.append(obs_seq)
                if (prob > max_prob):
                  max_prob = prob
                  max_seq_no = i
                i += 1

            # Write original component and resulting tag sequences to output
            #
            f_out.write('# '+str(line_read+first_rec)+' ('+str(rec_count)+ \
                        '): |'+component+'|'+os.linesep)  # Commented
                                                          # original
            num_len = len(str(line_read+first_rec))+len(str(rec_count))+6
            f_out.write('#'+num_len*' '+'|'+' '.join(word_list)+'|'+ \
                        os.linesep)

            for i in range(len(tag_seq)):
              # Convert each tag sequence into a string for file output
              #
              seq_string = '  '
              if (hmm_file_name != None) and (i != max_seq_no):
                seq_string = '# '  # Comment sequences without maximum
                                   # probability
              for j in range(len(tag_seq[i])):
                if (hmm_file_name != None):
                  seq_string = seq_string+' '+tag_seq[i][j]+':'+ \
                               state_seq[i][j]+','
                else:
                  seq_string = seq_string+' '+tag_seq[i][j]+':,'
              f_out.write(seq_string[:-1]+os.linesep)  # Write without ','
                                                       # at end
              inout.log_message('    '+seq_string[:-1],'v2')

            if (hmm_file_name != None):
              f_out.write('# Maximum Viterbi probability: %0.5f' % \
                          (max_prob)+os.linesep)
              inout.log_message('Maximum Viterbi probability: %0.5f' % \
                                (max_prob),'v2')

            if (retag_file_name != None) and \
               (tagged_recs[line_read] != None):
              if (tagged_recs[line_read].strip() != \
                  seq_string[:-1].strip()):
                f_out.write('# Note: ***** Changed *****'+os.linesep)
                inout.log_message('  Note: ***** Changed *****','v2')
                f_out.write('# Was: '+tagged_recs[line_read]+os.linesep)
                # Write commented original tag sequence
                inout.log_message('Original tag sequence: '+ \
                                  tagged_recs[line_read],'v2')

            f_out.write(os.linesep)  # Write an empty line
            inout.log_message('','v1')  # Print empty lines between records

            if (hmm_file_name != None):
              seq_key = seq_string[:-1]  # Add sequence to dictionary
              if (seq_freqs.has_key(seq_key)):
                seq_freqs[seq_key].append(['|'+' '.join(word_list)+'|', \
                                           max_prob])
              else:
                seq_freqs[seq_key] = [['|'+' '.join(word_list)+'|', \
                                       max_prob]]

            rec_count += 1

            # Print process indicator message
            #
            if (config.proc_ind > 0) and (rec_count > 0):
              if (rec_count % config.proc_ind == 0):
                print 'Processed line', rec_count, 'of', num_rec

      line_read += 1

    f_in.close()

    num_rec_left = num_rec - rec_count

    if (prev_num_rec_left == num_rec_left):  # No new records selected
      unchanged_loop_cnt += 1
    prev_num_rec_left = num_rec_left  # Set to current value

    if (unchanged_loop_cnt > 5):  # Do five loops maximal without selecting
                                  # new records
      config.curr_line_no = -1  # Set to illegal/empty values, as warning is
      config.curr_line = ''     # not related to the current input line
      inout.log_message(['Can not select more than '+str(rec_count)+ \
                         ' records for training.', \
                         'This is probably due to empty input components.', \
                         'Please reduce value of "num_rec" or increase '+ \
                         'range', 'between "first_rec" and "last_rec".'], \
                        'warn')
      break

    if (num_rec_left < 10):  # Only 10 records left to select
      num_rec_left = num_rec+1  # Set to more than 100% probability
    elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% records left
      num_rec_left = int(num_rec / 100.0)  # Set to 1%

  f_out.close()

  # If specified, save Viterbi frequencies to a file - - - - - - - - - - - - -
  #
  if (freqs_file_name != None):
    freqs_out = open(freqs_file_name,'w')  # Open frequency file for writing
    freqs_out.write('# Frequency listing of tag/state patterns written by')
    freqs_out.write(' "pyTagData.py - Version 0.1"'+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write('# Created '+time.ctime(time.time())+os.linesep)
    freqs_out.write('#'+os.linesep)
    freqs_out.write('# Input file name:  '+in_file_name+os.linesep)
    freqs_out.write('# Output file name: '+out_file_name+os.linesep)
    freqs_out.write(os.linesep)
    freqs_out.write('# Parameters:'+os.linesep)
    freqs_out.write('# - Start of block with training records: '+ \
                    str(first_rec)+os.linesep)
    freqs_out.write('# - End of block with training records:   '+ \
                    str(last_rec)+os.linesep)
    freqs_out.write('# - Number of training records:           '+ \
                    str(num_rec)+os.linesep)
    if (hmm_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Using HMM file '"+hmm_file_name+ \
                      "' for standardisation"+os.linesep)
    if (retag_file_name != None):
      freqs_out.write('#'+os.linesep)
      freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \
                      "'"+os.linesep)
    freqs_out.write('#'+'-'*70+os.linesep)
    freqs_out.write(os.linesep)

    sorted_seq_freqs = []  # Now sort sequences according to their
                           # frequencies
    for key in seq_freqs.keys():
      sorted_seq_freqs.append((len(seq_freqs[key]),key))
    sorted_seq_freqs.sort()

    for skey in sorted_seq_freqs:
      key = skey[1]
      freqs_out.write('# Pattern:   '+str(key)+os.linesep)
      freqs_out.write('# Frequency: '+str(skey[0])+os.linesep)
      examples = seq_freqs[key]
      freqs_out.write('# Maximum Viterbi probability: '+ \
                      str(examples[0][1])+os.linesep)
      freqs_out.write('# Examples: '+os.linesep)
      for example in examples:
        freqs_out.write('#   '+str(example[0])+os.linesep)
      freqs_out.write(str(key)+os.linesep)
      freqs_out.write(os.linesep)
    freqs_out.close()

  inout.log_message(['Read '+str(line_read)+' lines, processed '+ \
                     str(rec_count)+' lines', 'End.'],'v1')
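# Example (a hedged sketch): a possible invocation of pyTagData.py, following
# the argument order documented in the error messages above; all file names
# and record numbers are illustrative.
#
#   python pyTagData.py project.py locality tagged-train.csv 0 1000 100 \
#          -hmm geoloc.hmm -freqs patterns.txt -v1
#
# This would randomly select 100 of the first 1000 input records, tag their
# geocode/locality components, standardise them with the HMM loaded from
# 'geoloc.hmm', and write tag/state pattern frequencies to 'patterns.txt'.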
def get_geoloc_hmm(word_list, tag_list): """Process input using a HMM to extract geocode and locality output fields. USAGE: geoloc_dict = get_geoloc_hmm(word_list, tag_list) ARGUMENTS: word_list List of words as produces with clean_tag_locality() tag_list Corresponding list of tags as produces with clean_tag_locality() DESCRIPTION: The routine returns a dictionary with the parsed and extracted output fields for both the locality and geocode components. A Hidden Markov Model (HMM) is used for this task. The dictionary returned can contain the following key words: - wayfare_number - wayfare_name - wayfare_qualifier - wayfare_type - unit_number - unit_type - property_name - institution_name - institution_type - postaddress_number - postaddress_type - locality_name - locality_qualifier - postcode - territory - country - geoloc_hmm_proba (the probability returned by the Viterbi algorithm for the most likely HMM state seqence) """ # First, create all permutations of the input tag sequence # tag_list_seq = mymath.perm_tag_sequence(tag_list) msg = [' Input tag sequence: ' + str(tag_list), ' Output tag sequences:'] for t in tag_list_seq: msg.append(' ' + str(t)) inout.log_message(msg, 'v2') # Now give all tag sequences to the HMM - - - - - - - - - - - - - - - - - - - # and keep the one with highest probability # max_prob = -1.0 best_obs_seq = [] best_tag_list = [] for t in tag_list_seq: [obs_seq, prob] = config.geoloc_hmm.viterbi(t) if (prob > max_prob): best_obs_seq = obs_seq best_tag_list = t max_prob = prob inout.log_message( ' Probability ' + str(prob) + ' for sequence ' + str(t), 'v2') inout.log_message([ ' Best observation sequence: ' + str(best_obs_seq), ' with tag sequence: ' + str(best_tag_list) ], 'v2') # Now process the observation sequence and add elements into dictionary - - - # tag_list_len = len(tag_list) norm_max_prob = max_prob / float( tag_list_len) # Normalise max. 
probability geoloc_dict = {'geoloc_hmm_proba': [str(norm_max_prob)]} list_len = len(word_list) for i in range(list_len): # Loop over words and states w = word_list[i] s = best_obs_seq[i] # Do not output commas, vertical bars and hyphens - - - - - - - - - - - - # if (w in ['|', ',', '-', '/']): pass elif (s == 'wfnu' ): # Wayfare number - - - - - - - - - - - - - - - - - - - v = geoloc_dict.get('wayfare_number', []) v.append(w) geoloc_dict.update({'wayfare_number': v}) elif (s in ['wfna1', 'wfna2', 'wfna3']): # Wayfare name - - - - - - - - - - v = geoloc_dict.get('wayfare_name', []) v.append(w) geoloc_dict.update({'wayfare_name': v}) elif (s == 'wfql' ): # Wayfare qualifier - - - - - - - - - - - - - - - - - v = geoloc_dict.get('wayfare_qualifier', []) v.append(w) geoloc_dict.update({'wayfare_qualifier': v}) elif (s == 'wfty' ): # Wayfare type - - - - - - - - - - - - - - - - - - - - v = geoloc_dict.get('wayfare_type', []) v.append(w) geoloc_dict.update({'wayfare_type': v}) elif (s == 'unnu' ): # Unit number - - - - - - - - - - - - - - - - - - - - v = geoloc_dict.get('unit_number', []) v.append(w) geoloc_dict.update({'unit_number': v}) elif (s == 'unty' ): # Unit type - - - - - - - - - - - - - - - - - - - - - v = geoloc_dict.get('unit_type', []) v.append(w) geoloc_dict.update({'unit_type': v}) elif (s in ['prna1', 'prna2']): # Property name - - - - - - - - - - - - - - v = geoloc_dict.get('property_name', []) v.append(w) geoloc_dict.update({'property_name': v}) elif (s in ['inna1', 'inna2']): # Institution name - - - - - - - - - - - - v = geoloc_dict.get('institution_name', []) v.append(w) geoloc_dict.update({'institution_name': v}) elif (s == 'inty' ): # Institution type - - - - - - - - - - - - - - - - - - v = geoloc_dict.get('institution_type', []) v.append(w) geoloc_dict.update({'institution_type': v}) elif (s == 'panu' ): # Postal address number - - - - - - - - - - - - - - - v = geoloc_dict.get('postaddress_number', []) v.append(w) geoloc_dict.update({'postaddress_number': v}) elif (s == 'paty' ): # Postal address type - - - - - - - - - - - - - - - - v = geoloc_dict.get('postaddress_type', []) v.append(w) geoloc_dict.update({'postaddress_type': v}) elif (s in ['loc1', 'loc2']): # Locality name - - - - - - - - - - - - - - - v = geoloc_dict.get('locality_name', []) v.append(w) geoloc_dict.update({'locality_name': v}) elif (s == 'locql' ): # Locality qualifier - - - - - - - - - - - - - - - - v = geoloc_dict.get('locality_qualifier', []) v.append(w) geoloc_dict.update({'locality_qualifier': v}) elif (s == 'pc' ): # Postcode - - - - - - - - - - - - - - - - - - - - - - - v = geoloc_dict.get('postcode', []) v.append(w) geoloc_dict.update({'postcode': v}) elif (s in ['ter1', 'ter2']): # Territory - - - - - - - - - - - - - - - - - v = geoloc_dict.get('territory', []) v.append(w) geoloc_dict.update({'territory': v}) elif (s in ['cntr1', 'cntr2']): # Country - - - - - - - - - - - - - - - - - v = geoloc_dict.get('country', []) v.append(w) geoloc_dict.update({'country': v}) else: # Should never happen msg = ['This should never happen!', ' Tag: '+str(s), ' Word: '+w, \ ' Word list: '+str(word_list), \ ' tag list: '+str(tag_list)] inout.log_message(msg, 'warn') # Check if concatenated locality and territory words are in lookup-table - - # if (geoloc_dict.has_key('locality_name')): loc = geoloc_dict['locality_name'] if (len(loc) > 1): # Locality contains more than one word loc_tuple = tuple(loc) # Make it a tuple if (config.geoloc_lookup_dict.has_key(loc_tuple)): new_loc = 
config.geoloc_lookup_dict[loc_tuple][0] geoloc_dict.update({'locality_name': [new_loc]}) if (geoloc_dict.has_key('territory')): terr = geoloc_dict['territory'] if (len(terr) > 1): # Territory contains more than one word terr_tuple = tuple(terr) # Make it a tuple if (config.geoloc_lookup_dict.has_key(terr_tuple)): new_terr = config.geoloc_lookup_dict[terr_tuple][0] geoloc_dict.update({'territory': [new_terr]}) if (geoloc_dict.has_key('country')): cntr = geoloc_dict['country'] if (len(cntr) > 1): # Country contains more than one word cntr_tuple = tuple(cntr) # Make it a tuple if (config.geoloc_lookup_dict.has_key(cntr_tuple)): new_cntr = config.geoloc_lookup_dict[cntr_tuple][0] geoloc_dict.update({'country': [new_cntr]}) # Finally do some tests on the output fields - - - - - - - - - - - - - - - - # geoloc_items = geoloc_dict.items() # Check if a value list has more than three elements, if so print out # for i in geoloc_items: if (len(i[1]) > 3): inout.log_message('Geocode/locality output field '+ str(i[0])+ \ ' contains more than three elements: '+str(i[1]),'warn') # Check if 'number' elements only contain (alpha-) numerical values - - - - - # and also check how many numbers in an element # if (geoloc_dict.has_key('wayfare_number') ): # Check how many wayfare numbers v = geoloc_dict['wayfare_number'] if (len(v) > 2): inout.log_message('More than two wayfare numbers: ' + str(v), 'warn') for i in v: if (i.isalpha()): # Element contains only letters inout.log_message('Wayfare number element contains no digits: '+ \ str(v),'warn') break # Exit for loop if (geoloc_dict.has_key('unit_number')): # Check how many unit numbers v = geoloc_dict['unit_number'] if (len(v) > 1): inout.log_message('More than one unit numbers: ' + str(v), 'warn') for i in v: if (i.isalpha()): # Element contains only letters inout.log_message('Unit number element contains no digits: '+str(v),\ 'warn') break # Exit for loop if (geoloc_dict.has_key('postaddress_number') ): # Check postaddress numbers v = geoloc_dict['postaddress_number'] if (len(v) > 1): inout.log_message('More than one postaddress numbers: ' + str(v), 'warn') for i in v: if (i.isalpha()): # Element contains only letters inout.log_message('Postaddress number element contains no digits: '+ \ str(v),'warn') break # Exit for loop # Check if 'type' elements contain one word only - - - - - - - - - - - - - - # if it's a known type word # if (geoloc_dict.has_key('wayfare_type')): # Check wayfare type v = geoloc_dict['wayfare_type'] if (len(v) > 1): inout.log_message('More than one wayfare type: ' + str(v), 'warn') for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not config.geoloc_lookup_dict.has_key((i))) or \ (config.geoloc_lookup_dict.has_key((i)) and \ (config.geoloc_lookup_dict[(i)][1].find('WT') < 0)): inout.log_message('Wayfare type word is not known: ' + str(v), 'warn') break # Exit for loop if (geoloc_dict.has_key('unit_type')): # Check unit type v = geoloc_dict['unit_type'] if (len(v) > 1): inout.log_message('More than one unit type: ' + str(v), 'warn') for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not config.geoloc_lookup_dict.has_key((i))) or \ (config.geoloc_lookup_dict.has_key((i)) and \ (config.geoloc_lookup_dict[(i)][1].find('UT') < 0)): inout.log_message('Unit type word is not known: ' + str(v), 'warn') break # Exit for loop if (geoloc_dict.has_key('institution_type')): # Check institution type v = geoloc_dict['institution_type'] if (len(v) > 1): inout.log_message('More than one institution type: ' + str(v), 'warn') 
for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not config.geoloc_lookup_dict.has_key((i))) or \ (config.geoloc_lookup_dict.has_key((i)) and \ (config.geoloc_lookup_dict[(i)][1].find('IT') < 0)): inout.log_message( 'Institution type word is not known: ' + str(v), 'warn') break # Exit for loop if (geoloc_dict.has_key('postaddress_type')): # Check postaddress type v = geoloc_dict['postaddress_type'] if (len(v) > 2): inout.log_message('More than two postaddress type: ' + str(v), 'warn') for i in v: i = i.split('_') i = tuple(i) # Make it a tuple if (not config.geoloc_lookup_dict.has_key((i))) or \ (config.geoloc_lookup_dict.has_key((i)) and \ (config.geoloc_lookup_dict[(i)][1].find('PA') < 0)): inout.log_message( 'Postaddress type word is not known: ' + str(v), 'warn') break # Exit for loop # Check if 'qualifier' elements only contain known qualifier words - - - - - # if (geoloc_dict.has_key('wayfare_qualifier')): # Check wayfare qualifier v = geoloc_dict['wayfare_qualifier'] for i in v: if (not config.geoloc_lookup_dict.has_key((i,))) or \ (config.geoloc_lookup_dict.has_key((i,)) and \ (config.geoloc_lookup_dict[(i,)][1].find('LQ') < 0)): inout.log_message('Wayfare qualifier word is not known: '+str(v), \ 'warn') break # Exit for loop if (geoloc_dict.has_key('locality_qualifier')): # Check locality qualifier v = geoloc_dict['locality_qualifier'] for i in v: if (not config.geoloc_lookup_dict.has_key((i,))) or \ (config.geoloc_lookup_dict.has_key((i,)) and \ (config.geoloc_lookup_dict[(i,)][1].find('LQ') < 0)): inout.log_message('Locality qualifier word is not known: '+str(v), \ 'warn') break # Exit for loop return geoloc_dict
def tag_geoloc_component(geoloc_str): """Tag a geocode locality input component string and make a list. USAGE: [word_list, tag_list] = tag_geoloc_component(loc_str) ARGUMENTS: geoloc_str A string containing the geocode and/or locality components DESCRIPTION: This routines cleans the input string and extracts words, numbers and separators into a list. Each element of this list is assigned one or more tags. A 'greedy tagger' is applied, which cheques sequences of list elements in the name lookup table (longer sequences first) and replaces them with the string and tag from the lookup-table if found. The routine returns two lists: words and their tags """ # First, split input string into elements at spaces - - - - - - - - - - - - - # org_list = geoloc_str.split() # The original list from the input string inout.log_message(' Initial word list: ' + str(org_list), 'v2') tag_list = [] # The initially empty list of tags word_list = [] # The initially empty list of words while (org_list != []): # As long as not all elements have been processed tmp_list = org_list[:config. geoloc_dict_seq_len] # Extract longest sub-list tmp_val = [] # Start with empty value tmp_key = tuple(tmp_list) while (tmp_key != ()): # As long as key not empty and not found in lookup if (config.geoloc_lookup_dict.has_key(tmp_key)): tmp_val = config.geoloc_lookup_dict[tmp_key] break tmp_key = tmp_key[:-1] # Remove last element in key if (tmp_val != []): # A value has been found in the dictionary tmp_len = len(tmp_key) # Length of found sequence if (tmp_val[0] != ''): # it's not an empty value word_list.append( tmp_val[0]) # Append corrected word (or sequence) tag_list.append(tmp_val[1]) # Append tag or tags else: # No value has been found in the lookup dictionary, try other tags tmp_val = org_list[ 0] # Value is first element in the original list tmp_len = 1 if (tmp_val.isdigit()): # Element is a number word_list.append(tmp_val) if (len(tmp_val) == 4): tag_list.append('N4') else: tag_list.append('NU') elif (not tmp_val.isalpha() ) and tmp_val.isalnum(): # Alpha-numeric word_list.append(tmp_val) tag_list.append('AN') elif (tmp_val == '-'): # Element is a hyphen word_list.append(tmp_val) tag_list.append('HY') elif (tmp_val == ','): # Element is a comma word_list.append(tmp_val) tag_list.append('CO') elif (tmp_val == '|'): # Element is a vertical bar word_list.append(tmp_val) tag_list.append('VB') else: # An unknown element word_list.append(tmp_val) tag_list.append('UN') # Finally remove the processed elements from the original element list # org_list = org_list[tmp_len:] # Remove processed elements return [word_list, tag_list]
def tagdata(): """Main routine, open file, read lines, tag data records, write to out-file. USAGE: tagdata() ARGUMENTS: None DESCRIPTION: Main routine, see description of module above. """ # Process command line arguments and check for correctness - - - - - - - - - # if (len(config.options) < 5): print '***** Error: %s needs at least six arguments:' % (sys.argv[0]) print '***** - Name of the project module' print '***** - Tagging mode: "name" or "locality"' print '***** - Output training file name' print '***** - Start of block with training records' print '***** - End of block with training records' print '***** - Number of training records' print '***** plus options' raise Exception() if (config.in_file_name == config.options[2]): print '***** Error: Input and output files must differ' print '***** Input file name: ', config.in_file_name print '***** Output training file name:', config.options[2] raise Exception() first_rec = int(config.options[2]) last_rec = int(config.options[3]) num_rec = int(config.options[4]) in_file_name = config.in_file_name out_file_name = config.options[1] # Check record number values - - - - - - - - - - - - - - - - - - - - - - - - # if (int(first_rec) >= int(last_rec)) or \ ((int(num_rec)-1) > (int(last_rec)-int(first_rec))): print '***** Error: Illegal values for training records block:' print '***** - Start of block with training records:', first_rec print '***** - End of block with training records: ', last_rec print '***** - Number of training records: ', num_rec raise Exception() rec_range = last_rec - first_rec - 1 # Range of records in input file # Open input file and check number of available records - - - - - - - - - - - # try: f_in = open(in_file_name, 'r') except: inout.log_message('Cannot open input file: ' + in_file_name, 'err') raise IOError() line_count = 0 for line in f_in.xreadlines(): line_count += 1 f_in.close() if (last_rec > line_count): # Illegal value for last record print '***** Error: Illegal values for last training records:', last_rec print '***** File only contains', line_count, 'lines/records' raise Exception() # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - - - # tag_mode = config.options[0] if (tag_mode in ['name', 'na', 'n']): tag_mode = 'name' elif (tag_mode in ['locality', 'localty', 'loc', 'l']): tag_mode = 'loc' else: print '***** Error: Illegal tagging mode:', tag_mode print '***** Must be either "name" or "locality"' raise Exception() # Check for optional arguments and process if any - - - - - - - - - - - - - - # config.verbose = 0 # Default: No verbose output config.logging = 0 # Default: No logging into a file hmm_file_name = None # Default: Do not use HMM to standardise training # records retag_file_name = None # Default: Do not retag an existing training file config.nowarn = 0 # Deactivate no warning flag (print/log warning # messages) freqs_file_name = None # Default: Do not write frequencies, no -freqs option if (len(config.options) > 5): options = config.options[5:] while (options != []): # Do a loop processing all options if (options[0] == '-nowarn'): config.nowarn = 1 # Activate no warning flag options = options[1:] # Remove processed '-nowarn' option elif (options[0] == '-v1'): config.verbose = 1 # Set to verbose output level 1 options = options[1:] # Remove processed '-v1' option elif (options[0] == '-v2'): config.verbose = 2 # Set to verbose output level 2 options = options[1:] # Remove processed '-v2' option elif (options[0] == '-l'): config.logging = 1 if (len(options) > 1): if 
(options[1][0] != '-'): # Not another option, must be a file name config.log_file = options[1] # Get name of log file options = options[1:] # Remove file_name options = options[1:] # Remove processed -'l' option only try: f_log = open(config.log_file, 'a') # Test if file is appendable except: print '***** Error ********************', print '***** Cannot write to log file: ' + config.log_file raise IOError() # Write (append) header to log file # f_log.write(os.linesep) f_log.write( '##################################################') f_log.write('############' + os.linesep) f_log.write('#' + os.linesep) f_log.write( "# 'pyTagData.py - Version 0.1' process started at: ") f_log.write(time.ctime(time.time()) + os.linesep) f_log.write('#' + os.linesep) f_log.write("# Input file name: " + in_file_name + os.linesep) f_log.write("# Output file name: " + out_file_name + os.linesep) f_log.write("# Tagging mode: " + tag_mode + os.linesep) f_log.write(os.linesep) f_log.close() elif (options[0] == '-hmm'): hmm_file_name = options[1] # Get file name of the HMM to use if (hmm_file_name == out_file_name): print '***** Error: HMM file name is the same as output file name!' raise Exception() try: f_in = open(hmm_file_name, 'r') # Test if file is available except: print '***** Error: Cannot open HMM file specified in "-hmm"', print 'option:', hmm_file_name raise IOError() f_in.close() options = options[ 2:] # Remove processed '-hmm' option and file name elif (options[0] == '-retag'): if (hmm_file_name == None) and ('-hmm' not in options): print '***** Error: "-retag" option can only be used together with', print '"-hmm" option (which is not given).' raise Exception() retag_file_name = options[ 1] # Get file name of the already-tagged # file to re-process if (retag_file_name == out_file_name): print '***** Error: Retag file name is the same as output file name!' raise Exception() elif (retag_file_name == in_file_name): print '***** Error: Retag file name is the same as input file name!' raise Exception() elif (retag_file_name == hmm_file_name): print '***** Error: Retag file name is the same as HMM file name!' raise Exception() try: f_in = open(retag_file_name, 'r') # Test if file is available # Now gather record numbers and previous tags/states, as well as the # original header information. Use a simple state machine to do this. 
          #
          tagged_recs = {}
          cleaned_recs = {}
          original_header_lines = []
          state = -1  # Header lines state
          prevline = ''

          for line in f_in.xreadlines():  # Read training file and process it
            line = line.strip()
            if (state == -1) and (len(line) == 0):  # End of header lines
              state = 0
              prevline = line
              continue
            if (state == -1) and (len(line) > 0) and (line[0] == '#'):
              original_header_lines.append('# ' + line)
              prevline = line
              continue
            sline = line.split(' ')
            if (len(sline) > 2) and (len(sline[2]) > 3) and (sline[0] == '#') \
               and (sline[2][0] == '(') and (sline[2][-2:] == '):'):
              try:
                rec = int(sline[1])  # Original record number
                tagged_recs[rec] = None
                cleaned_recs[rec] = None
                state = 1
              except:
                pass
              prevline = line
              continue
            if (state == 1) and (len(line) > 0) and (line[0] != '#'):
              tagged_recs[rec] = line
              cleaned_recs[rec] = prevline
              state = 0
              prevline = line
              continue
            if (state == 1) and (len(line) > 0):
              prevline = line
              continue
          f_in.close()

          tagged_recs_keys = tagged_recs.keys()
          num_rec = len(tagged_recs_keys)  # Override specified numbers
          first_rec = 0
          last_rec = line_count

        except:
          print '***** Error: Cannot open tagged training file specified',
          print 'in "-retag" option:', retag_file_name
          raise IOError()
        options = options[2:]  # Remove processed '-retag' option and file name

      elif (options[0][:5] == '-freq'):
        if (hmm_file_name == None) and ('-hmm' not in options):
          print '***** Error: "-freqs" option can only be used together with',
          print '"-hmm" option (which is not given).'
          raise Exception()
        freqs_file_name = options[1]  # File name to write the frequencies to
        if (freqs_file_name == out_file_name):
          print '***** Error: Frequency file name is the same as output',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == in_file_name):
          print '***** Error: Frequency file name is the same as input',
          print 'file name!'
          raise Exception()
        elif (freqs_file_name == hmm_file_name):
          print '***** Error: Frequency file name is the same as HMM',
          print 'file name!'
          raise Exception()
        options = options[2:]  # Remove processed '-freqs' option and file name

        try:  # Check if file writing is possible
          freqs_out = open(freqs_file_name, 'w')
          freqs_out.close()
        except:
          print '***** Error: Cannot write to frequency output file specified',
          print 'in "-freqs" option:', freqs_file_name
          raise IOError()

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # If specified, initialise and load Hidden Markov Model (HMM)  - - - - - - -
  #
  if (hmm_file_name != None):
    myhmm = simplehmm.hmm([], [])  # Create new empty HMM object
    myhmm.load_hmm(hmm_file_name)
    myhmm.print_hmm()  # Print HMM (according to verbose and logging level)

  # Open output file and write header - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_out = open(out_file_name, 'w')
  except:
    inout.log_message('Cannot open output file: ' + out_file_name, 'err')
    raise IOError()

  f_out.write("# Tagged training data written by 'pyTagData.py -" + \
              " Version 0.1'" + os.linesep)
  f_out.write('#' + os.linesep)
  f_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
  f_out.write('#' + os.linesep)
  f_out.write('# Input file name:  ' + in_file_name + os.linesep)
  f_out.write('# Output file name: ' + out_file_name + os.linesep)
  f_out.write('#' + os.linesep)
  f_out.write('# Parameters:' + os.linesep)
  f_out.write('# - Start of block with training records: ' + str(first_rec) + \
              os.linesep)
  f_out.write('# - End of block with training records:   ' + str(last_rec) + \
              os.linesep)
  f_out.write('# - Number of training records:           ' + str(num_rec) + \
              os.linesep)
  if (hmm_file_name != None):
    f_out.write('#' + os.linesep)
    f_out.write("# - Using HMM file '" + hmm_file_name + \
                "' for standardisation" + os.linesep)
  if (retag_file_name != None):
    f_out.write('#' + os.linesep)
    f_out.write("# - Reprocessing training file '" + retag_file_name + "'" + \
                os.linesep)
    f_out.write('# Header lines from original training file follow:' + \
                os.linesep)
    for header_line in original_header_lines:
      f_out.write(header_line + os.linesep)
  if (freqs_file_name != None):
    f_out.write('#' + os.linesep)
    f_out.write("# - Tag/state pattern frequencies written to file '" + \
                freqs_file_name + os.linesep)
  f_out.write('#' + '-' * 70 + os.linesep)
  f_out.write(os.linesep)

  rec_count = 0           # Number of selected records
  num_rec_left = num_rec  # Number of records to be selected left
  rec_selected = {}       # Dictionary of all record numbers that were selected
  seq_freqs = {}          # Dict to hold examples of tag/state patterns
  unchanged_loop_cnt = 0  # Counter of how many loops have been done
                          # without new records being selected
  prev_num_rec_left = num_rec  # Number of records left in the previous
                               # iteration

  # Due to the random nature of selecting records, and because sometimes - - -
  # a selected component can be empty (and is thus not used for training)
  # more than one iteration over the input data set is carried out. In each
  # iteration, records are selected randomly.
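  # A worked example of the selection rule used below (illustrative numbers
  # only): with first_rec=0 and last_rec=1001, rec_range is 1000, and with
  # num_rec=100 a line is initially selected when
  #   num_rec_left >= random.randrange(0, rec_range, 1)
  # holds, i.e. with a probability of roughly 100/1000 = 10%. As records are
  # selected, num_rec_left shrinks and the selection probability drops, which
  # is why the adjustments at the end of the loop raise it again once only a
  # few records remain to be selected.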
  #
  while (rec_count < num_rec):  # Loop until 'num_rec' records selected

    # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    #
    try:
      f_in = open(in_file_name, 'r')
    except:
      inout.log_message('Cannot open input file: ' + in_file_name, 'err')
      raise IOError()

    line_read = 0  # Number of read lines

    # Skip to start of training block - - - - - - - - - - - - - - - - - - - -
    #
    if (first_rec > 0):
      for i in range(first_rec):
        f_in.readline()

    while (rec_count < num_rec) and (line_read <= (last_rec - first_rec)):
      line = f_in.readline()

      if ((retag_file_name != None) and (line_read in tagged_recs_keys)) or \
         ((retag_file_name == None) and \
          (num_rec_left >= random.randrange(0, rec_range, 1))):

        line = line.strip()      # Remove line separators
        config.curr_line = line  # Make a copy of the unprocessed current line
        line = line.lower()      # Make all characters lower case

        inout.log_message(['Record number: ' + str(line_read + first_rec)],
                          'v1')
        config.curr_line_no = line_read + first_rec  # Store current line no.

        # Process line and extract content into components (name, geocode, etc)
        #
        [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
          inout.process_line(line)

        # Select component and process it - - - - - - - - - - - - - - - - - -
        #
        if (tag_mode == 'name'):
          if (type(name_comp) == types.ListType):
            component = name_comp[0].strip() + ' ' + name_comp[1].strip()
          else:
            component = name_comp.strip()
        else:  # Locality component
          component = geocode_comp.strip() + ' ' + locality_comp.strip()

        if (component != '') and \
           (not rec_selected.has_key((line_read + first_rec))):

          if (tag_mode == 'name'):
            inout.log_message('  Name component: |' + component + '|', 'v1')
            component = name.clean_name_component(component)
            [word_list, tag_list] = name.tag_name_component(component)
          else:  # Locality component
            inout.log_message('  Locality component: |' + component + '|',
                              'v1')
            component = locality.clean_geoloc_component(component)
            [word_list, tag_list] = locality.tag_geoloc_component(component)

          if (tag_list != []):  # Only process non-empty tag lists

            # Append record number into dictionary of processed records
            #
            rec_selected.update({(line_read + first_rec): \
                                 (line_read + first_rec)})

            # Create all permutation sequences of this tag list - - - - - - -
            #
            tag_seq = mymath.perm_tag_sequence(tag_list)

            inout.log_message(['  Word list: ' + str(word_list), \
                               '  Tag list:  ' + str(tag_list), \
                               '  Tag sequences:'], 'v2')

            # Do HMM processing - - - - - - - - - - - - - - - - - - - - - - -
            #
            if (hmm_file_name != None):
              state_seq = []   # List containing computed HMM state sequences
              max_prob = -1.0  # Maximal probability for a sequence
              max_seq_no = -1  # Number of the seq. with the max. probability

              # Now give tag sequences to the HMM to compute state sequences
              #
              i = 0
              for t in tag_seq:
                [obs_seq, prob] = myhmm.viterbi(t)
                state_seq.append(obs_seq)
                if (prob > max_prob):
                  max_prob = prob
                  max_seq_no = i
                i += 1

            # Write original component and resulting tag sequences to output
            #
            f_out.write('# ' + str(line_read + first_rec) + ' (' + \
                        str(rec_count) + '): |' + component + '|' + \
                        os.linesep)  # Commented original
            num_len = len(str(line_read + first_rec)) + \
                      len(str(rec_count)) + 6
            f_out.write('#' + num_len * ' ' + '|' + ' '.join(word_list) + \
                        '|' + os.linesep)

            for i in range(len(tag_seq)):

              # Convert each tag sequence into a string for file output
              #
              seq_string = '  '
              if (hmm_file_name != None) and (i != max_seq_no):
                seq_string = '# '  # Comment sequences without max. probability

              for j in range(len(tag_seq[i])):
                if (hmm_file_name != None):
                  seq_string = seq_string + ' ' + tag_seq[i][j] + ':' + \
                               state_seq[i][j] + ','
                else:
                  seq_string = seq_string + ' ' + tag_seq[i][j] + ':,'

              f_out.write(seq_string[:-1] + os.linesep)  # Write without ',' at end
              inout.log_message('    ' + seq_string[:-1], 'v2')

            if (hmm_file_name != None):
              f_out.write('# Maximum Viterbi probability: %0.5f' % \
                          (max_prob) + os.linesep)
              inout.log_message('  Maximum Viterbi probability: %0.5f' % \
                                (max_prob), 'v2')

            if (retag_file_name != None) and (tagged_recs[line_read] != None):
              if (tagged_recs[line_read].strip() != seq_string[:-1].strip()):
                f_out.write('# Note: ***** Changed *****' + os.linesep)
                inout.log_message('  Note: ***** Changed *****', 'v2')
                f_out.write('# Was: ' + tagged_recs[line_read] + \
                            os.linesep)  # Write commented original tag sequence
                inout.log_message('  Original tag sequence: ' + \
                                  tagged_recs[line_read], 'v2')

            f_out.write(os.linesep)      # Write an empty line
            inout.log_message('', 'v1')  # Print empty lines between records

            if (hmm_file_name != None):
              seq_key = seq_string[:-1]  # Add sequence to dictionary
              if (seq_freqs.has_key(seq_key)):
                seq_freqs[seq_key].append(['|' + ' '.join(word_list) + '|', \
                                           max_prob])
              else:
                seq_freqs[seq_key] = [['|' + ' '.join(word_list) + '|', \
                                       max_prob]]

            rec_count += 1

            # Print process indicator message
            #
            if (config.proc_ind >= 0) and (rec_count > 0):
              if (rec_count % config.proc_ind == 0):
                print 'Processed line', rec_count, 'of', num_rec

      line_read += 1

    f_in.close()

    num_rec_left = num_rec - rec_count

    if (prev_num_rec_left == num_rec_left):  # No new records selected
      unchanged_loop_cnt += 1
    prev_num_rec_left = num_rec_left  # Set to current value

    if (unchanged_loop_cnt > 5):  # Do at most five loops without selecting
                                  # new records
      config.curr_line_no = -1  # Set to illegal/empty values, as warning is
      config.curr_line = ''     # not related to the current input line
      inout.log_message(['Can not select more than ' + str(rec_count) + \
                         ' records for training.', \
                         'This is probably due to empty input components.', \
                         'Please reduce value of "num_rec" or increase range', \
                         'between "first_rec" and "last_rec".'], 'warn')
      break

    if (num_rec_left < 10):  # Only 10 records left to select
      num_rec_left = num_rec + 1  # Set to more than 100% probability
    elif (num_rec_left < (num_rec / 100.0)):  # Less than 1% of records left
      num_rec_left = int(num_rec / 100.0)  # Set to 1%

  f_out.close()

  # If specified, save Viterbi frequencies to a file - - - - - - - - - - - - -
  #
  if (freqs_file_name != None):
    freqs_out = open(freqs_file_name, 'w')  # Open frequency file for writing
    freqs_out.write('# Frequency listing of tag/state patterns written by ')
    freqs_out.write('"pyTagData.py - Version 0.1"' + os.linesep)
    freqs_out.write('#' + os.linesep)
    freqs_out.write('# Created ' + time.ctime(time.time()) + os.linesep)
    freqs_out.write('#' + os.linesep)
    freqs_out.write('# Input file name:  ' + in_file_name + os.linesep)
    freqs_out.write('# Output file name: ' + out_file_name + os.linesep)
    freqs_out.write(os.linesep)
    freqs_out.write('# Parameters:' + os.linesep)
    freqs_out.write('# - Start of block with training records: ' + \
                    str(first_rec) + os.linesep)
    freqs_out.write('# - End of block with training records:   ' + \
                    str(last_rec) + os.linesep)
    freqs_out.write('# - Number of training records:           ' + \
                    str(num_rec) + os.linesep)
    if (hmm_file_name != None):
      freqs_out.write('#' + os.linesep)
      freqs_out.write("# - Using HMM file '" + hmm_file_name + \
                      "' for standardisation" + os.linesep)
    if (retag_file_name != None):
      freqs_out.write('#' + os.linesep)
freqs_out.write("# - Reprocessing training file '"+retag_file_name+ \ "'"+os.linesep) freqs_out.write('#' + '-' * 70 + os.linesep) freqs_out.write(os.linesep) sorted_seq_freqs = [ ] # Now sort sequences according to their fruequencies for key in seq_freqs.keys(): sorted_seq_freqs.append((len(seq_freqs[key]), key)) sorted_seq_freqs.sort() for skey in sorted_seq_freqs: key = skey[1] freqs_out.write('# Pattern: ' + str(key) + os.linesep) freqs_out.write('# Frequency: ' + str(skey[0]) + os.linesep) examples = seq_freqs[key] freqs_out.write('# Maximum Viterbi probability: '+ \ str(examples[0][1])+os.linesep) freqs_out.write('# Examples: ' + os.linesep) for example in examples: freqs_out.write('# ' + str(example[0]) + os.linesep) freqs_out.write(str(key) + os.linesep) freqs_out.write(os.linesep) freqs_out.close() inout.log_message(['Read '+str(line_read)+' lines, processed '+ \ str(rec_count)+' lines', 'End.'],'v1')
def trainhmm():
  """Main routine, open file, read lines, train HMM and save it to file.

  USAGE:
    trainhmm()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness - - - - - - - - -
  #
  if (len(config.options) < 3):
    print '***** Error: %s needs at least four arguments:' % (sys.argv[0])
    print '***** - Name of the project module'
    print '***** - Tagging mode: "name" or "locality"'
    print '***** - Input training file name'
    print '***** - HMM output file name'
    print '***** plus options'
    raise Exception()

  if (config.options[1] == config.options[2]):
    print '***** Error: Input and output files must differ'
    print '***** Input training file name:', config.options[1]
    print '***** HMM output file name:    ', config.options[2]
    raise Exception()

  in_file_name = config.options[1]
  hmm_file_name = config.options[2]

  # Get tagging mode/lookup-tables used - - - - - - - - - - - - - - - - - - -
  #
  tag_mode = config.options[0]
  if (tag_mode in ['name', 'na', 'n']):
    tag_mode = 'name'
  elif (tag_mode in ['locality', 'localty', 'loc', 'l']):
    tag_mode = 'loc'
  else:
    print '***** Error: Illegal tagging mode:', tag_mode
    print '***** Must be either "name" or "locality"'
    raise Exception()

  # Check for optional arguments and process if any - - - - - - - - - - - - -
  #
  config.verbose = 0  # Default: No verbose output
  config.logging = 0  # Default: No logging into a file
  smoothing = None    # Default: No smoothing
  config.nowarn = 0   # Deactivate no warning flag (print/log warning
                      # messages)

  if (len(config.options) > 3):
    options = config.options[3:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1      # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1     # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2     # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]         # Remove file_name
        options = options[1:]  # Remove processed '-l' option only

        try:
          f_log = open(config.log_file, 'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file: ' + config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############' + os.linesep)
        f_log.write('#' + os.linesep)
        f_log.write("# 'pyTrainHMM.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time()) + os.linesep)
        f_log.write('#' + os.linesep)
        f_log.write('# Input file name: ' + in_file_name + os.linesep)
        f_log.write('# HMM file name:   ' + hmm_file_name + os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-s'):
        smoothing = options[1]  # Set to do a HMM smoothing
        if (smoothing in ['l', 'la', 'lap', 'laplac', 'laplace']):
          smoothing = 'laplace'
        elif (smoothing in ['a','ad','abs','absd','absdis','absdisc',\
                            'absdiscount']):
          smoothing = 'absdiscount'
        else:  # Illegal value
          print "***** Error: Illegal value for 'smoothing' argument:", \
                smoothing
          print "***** Possible are: 'laplace' or 'absdiscount'"
          raise Exception()
        options = options[2:]  # Remove processed option and its value

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # Get HMM states and observations from configuration module - - - - - - - -
  #
  if (tag_mode == 'name'):
    state_list = config.name_hmm_states
    obser_list = config.name_hmm_obser
  else:
    state_list = config.geoloc_hmm_states
    obser_list = config.geoloc_hmm_obser

  # Open input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name, 'r')
  except:
    inout.log_message('Cannot open input file: ' + in_file_name, 'err')
    raise IOError()

  line_count = 0  # Counter for lines read
  rec_count = 0   # Counter for training records read

  # Read lines, discard comment lines and process training data lines - - - -
  #
  training_data = []  # List of training records
  train_list = []  # List of training sequences (dictionaries), extracted from
                   # training data

  for line in f_in.xreadlines():
    if (line[0] != '#') and (line.strip() != ''):
      # Line must contain a training record
      #
      line = line.strip()      # Remove line separators
      config.curr_line = line  # Make a copy of the unprocessed current line

      line_list = line.split(',')  # Split into a list of elements
      line_data = []  # Training data list for one training record

      inout.log_message(['Record number: ' + str(rec_count)], 'v1')
      config.curr_line_no = line_count  # Store current line number

      for elem in line_list:
        [k, v] = elem.split(':')  # Split into key and value
        tag = k.strip()
        state = v.strip()
        line_data.append((state, tag))

        if (state not in state_list):
          msg = ['Illegal state name in training record: ' + state, \
                 'Line: ' + str(line_count) + ', record: ' + str(rec_count), \
                 'Possible values: ' + str(state_list)]
          inout.log_message(msg, 'err')
          raise Exception()
        if (tag not in obser_list):
          msg = ['Illegal observation (tag) name in training record: ' + tag, \
                 'Line: ' + str(line_count) + ', record: ' + str(rec_count), \
                 'Possible values: ' + str(obser_list)]
          inout.log_message(msg, 'err')
          raise Exception()

      inout.log_message('  Training record ' + str(rec_count) + ':' + \
                        str(line_data), 'v1')
      train_list.append(line_data)
      rec_count += 1
      inout.log_message('', 'v1')  # Print empty lines between records

    line_count += 1

  # Close input file - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  f_in.close()
  inout.log_message('', 'v1')  # Print empty lines between records

  # Initialise HMM and train it with training data - - - - - - - - - - - - - -
  #
  myhmm = simplehmm.hmm(state_list, obser_list)
  myhmm.train(train_list, smoothing)
  myhmm.print_hmm()

  # Save trained HMM - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  myhmm.save_hmm(hmm_file_name)

  inout.log_message(['Read ' + str(line_count) + ' lines, processed ' + \
                     str(rec_count) + ' training records', 'End.'], 'v1')
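# A minimal sketch (not part of the original module; the state, tag and file
# names are made up) of the simplehmm calls used by trainhmm() above, kept
# commented out so the module stays import-safe:
#
#   toy_states = ['locname', 'territory', 'pc']  # Assumed HMM state names
#   toy_obser = ['LN', 'TR', 'PC', 'N4']         # Assumed observation tags
#   toy_train = [[('locname', 'LN'), ('territory', 'TR'), ('pc', 'N4')],
#                [('locname', 'LN'), ('pc', 'PC')]]  # (state, tag) pairs
#   toy_hmm = simplehmm.hmm(toy_states, toy_obser)
#   toy_hmm.train(toy_train, 'laplace')  # Smoothing value as accepted by '-s'
#   toy_hmm.save_hmm('toy.hmm')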
def standard():
  """Main routine, open file, read lines, standardise them and write into file.

  USAGE:
    standard()

  ARGUMENTS:
    None

  DESCRIPTION:
    Main routine, see description of module above.
  """

  # Process command line arguments and check for correctness - - - - - - - - -
  #
  if (len(config.options) < 2):
    print '***** Error: %s needs at least three arguments:' % (sys.argv[0])
    print '***** - Name of the project module'
    print '***** - Number of the first record to be processed'
    print '***** - Number of records to be processed'
    print '***** plus options'
    raise Exception()

  first_rec = int(config.options[0])
  num_rec = int(config.options[1])
  in_file_name = config.in_file_name
  out_file_name = config.out_file_name

  # Check for optional arguments and process if any - - - - - - - - - - - - -
  #
  config.verbose = 0  # Default: No verbose output
  config.logging = 0  # Default: No logging into a file
  write_header = 0    # Write header (output field names) to output file
                      # (default: Don't)
  config.nowarn = 0   # Deactivate no warning flag (print/log warning messages)

  if (len(config.options) > 2):
    options = config.options[2:]
    while (options != []):  # Do a loop processing all options

      if (options[0] == '-nowarn'):
        config.nowarn = 1      # Activate no warning flag
        options = options[1:]  # Remove processed '-nowarn' option

      elif (options[0] == '-v1'):
        config.verbose = 1     # Set to verbose output level 1
        options = options[1:]  # Remove processed '-v1' option

      elif (options[0] == '-v2'):
        config.verbose = 2     # Set to verbose output level 2
        options = options[1:]  # Remove processed '-v2' option

      elif (options[0] == '-l'):
        config.logging = 1
        if (len(options) > 1):
          if (options[1][0] != '-'):  # Not another option, must be a file name
            config.log_file = options[1]  # Get name of log file
            options = options[1:]         # Remove file_name
        options = options[1:]  # Remove processed '-l' option only

        try:
          f_log = open(config.log_file, 'a')  # Test if file is appendable
        except:
          print '***** Error ********************',
          print '***** Cannot write to log file:', config.log_file
          raise IOError()

        # Write (append) header to log file
        #
        f_log.write(os.linesep)
        f_log.write('##################################################')
        f_log.write('############' + os.linesep)
        f_log.write('#' + os.linesep)
        f_log.write("# 'pyStandard.py - Version 0.1' process started at: ")
        f_log.write(time.ctime(time.time()) + os.linesep)
        f_log.write('#' + os.linesep)
        f_log.write('# Input file name:  ' + in_file_name + os.linesep)
        f_log.write('# Output file name: ' + out_file_name + os.linesep)
        f_log.write(os.linesep)
        f_log.close()

      elif (options[0] == '-h'):
        write_header = 1
        options = options[1:]  # Remove processed '-h' option

      elif (options[0] == '-hmm-name'):
        hmm_name_file = options[1]  # Get file name of the name HMM to use
        try:
          f_in = open(hmm_name_file, 'r')  # Test if file is available
        except:
          print '***** Error ********************',
          print '***** Cannot open HMM file in "-hmm-name" option:',
          print hmm_name_file
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed option and file name
        config.name_standard_method = 'hmm'
        config.name_hmm_file_name = hmm_name_file
        config.name_hmm = simplehmm.hmm([], [])  # Create new empty HMM object
        config.name_hmm.load_hmm(config.name_hmm_file_name)

      elif (options[0] == '-hmm-loc'):
        hmm_loc_file = options[1]  # Get file name of the locality HMM to use
        try:
          f_in = open(hmm_loc_file, 'r')  # Test if file is available
        except:
          print '***** Error ********************',
          print '***** Cannot open HMM file in "-hmm-loc" option:',
          print hmm_loc_file
          raise IOError()
        f_in.close()
        options = options[2:]  # Remove processed option and file name
        config.geoloc_standard_method = 'hmm'
        config.geoloc_hmm_file_name = hmm_loc_file
        config.geoloc_hmm = simplehmm.hmm([], [])  # Create new HMM object
        config.geoloc_hmm.load_hmm(config.geoloc_hmm_file_name)

      else:
        print '***** Error: Illegal option:', options[0]
        raise Exception()

  # Open input file and check number of available records - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name, 'r')
  except:
    inout.log_message('Cannot open input file: ' + in_file_name, 'err')
    raise IOError()

  line_count = 0
  for line in f_in.xreadlines():
    line_count += 1
  f_in.close()

  if ((first_rec + num_rec) > line_count):  # Illegal value for last record
    print '***** Error: Illegal values for number of records to process:',
    print num_rec, ', with start record:', first_rec
    print '***** File only contains', line_count, 'lines/records'
    raise Exception()

  # Open files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  try:
    f_in = open(in_file_name, 'r')
  except:
    inout.log_message('Cannot open input file: ' + in_file_name, 'err')
    raise IOError()

  try:
    f_out = open(out_file_name, 'w')
  except:
    inout.log_message('Cannot open output file: ' + out_file_name, 'err')
    raise IOError()

  # Write header (name of output fields) into output file - - - - - - - - - -
  #
  if (write_header == 1):
    header_dict = {}
    for n in config.output_field_names:
      header_dict.update({n: n})  # Dictionary where values are field names
    header_line = inout.compose_line(header_dict, header=1)
    f_out.write(header_line + os.linesep)

  # Skip over records - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  if (first_rec > 0):
    for i in range(first_rec):
      f_in.readline()

  # Read lines, process them and write into output files - - - - - - - - - - -
  #
  line_read = 0  # Number of read lines

  while (line_read < num_rec):  # Loop until 'num_rec' records processed
    line = f_in.readline()

    # Print process indicator message
    #
    if (config.proc_ind >= 0) and (line_read > 0):  # Only print if activated
      if (line_read % config.proc_ind == 0):
        print 'Processed line', line_read, 'of', num_rec

    line = line.strip()      # Remove line separators
    config.curr_line = line  # Make a copy of the unprocessed current line
    line = line.lower()      # Make all characters lower case

    inout.log_message(['Record ' + str(line_read + first_rec)], 'v1')
    config.curr_line_no = line_read + first_rec  # Store current line number
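    # For illustration only (the exact return values are defined by
    # inout.process_line() elsewhere in this system): for an input line such
    # as "miller, peter|2601 canberra act|19610421" the call below might
    # yield something like
    #   name_comp     = 'miller, peter'
    #   geocode_comp  = ''
    #   locality_comp = '2601 canberra act'
    #   date1_comp    = '19610421'
    #   date2_comp    = ''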
    # Process line and extract content into components (name, geocode, etc.)
    #
    [name_comp, geocode_comp, locality_comp, date1_comp, date2_comp] = \
      inout.process_line(line)

    # Make a local empty working copy of the output field dictionary - - - - -
    #
    output_fields = config.output_field.copy()
    output_fields_keys = output_fields.keys()
    for k in output_fields_keys:
      output_fields[k] = ''  # Set all fields to an empty string

    # Standardise name component - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (type(name_comp) == types.ListType):  # Givenname and surname separate

      givenname_comp = name_comp[0].strip()
      surname_comp = name_comp[1].strip()

      if (givenname_comp != ''):  # There is a givenname - - - - - - - - - - -

        inout.log_message('  Givenname component: |' + givenname_comp + '|',
                          'v1')

        givenname_comp = name.clean_name_component(givenname_comp)
        [name_list, tag_list] = name.tag_name_component(givenname_comp)

        output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                                              tag_list)
        [name_list, tag_list, output_fields['title']] = \
          name.get_title(name_list, tag_list)
        [output_fields['givenname'], output_fields['alt_givenname']] = \
          name.get_name_component(name_list, tag_list, 'gname')

      if (surname_comp != ''):  # There is a surname - - - - - - - - - - - - -

        inout.log_message('  Surname component: |' + surname_comp + '|', 'v1')

        surname_comp = name.clean_name_component(surname_comp)
        [name_list, tag_list] = name.tag_name_component(surname_comp)

        [output_fields['surname'], output_fields['alt_surname']] = \
          name.get_name_component(name_list, tag_list, 'sname')

    elif (name_comp.strip() != ''):  # Given- and surname both in one field - -

      inout.log_message('  Name component: |' + name_comp + '|', 'v1')

      name_comp = name.clean_name_component(name_comp)
      [name_list, tag_list] = name.tag_name_component(name_comp)

      output_fields['gender_guess'] = name.get_gender_guess(name_list, \
                                                            tag_list)
      [name_list, tag_list, output_fields['title']] = \
        name.get_title(name_list, tag_list)

      if (config.name_standard_method == 'rules'):
        name_dict = name.get_names_rules(name_list, tag_list, 'gname')
      elif (config.name_standard_method == 'hmm'):
        name_dict = name.get_names_hmm(name_list, tag_list)
      else:
        inout.log_message('Illegal name standardisation method: ' + \
                          config.name_standard_method, 'err')
        raise Exception()

      for (field, value) in name_dict.items():  # Assign to output dictionary
        output_fields[field] = value

    # Standardise geocode and locality components using HMM - - - - - - - - -
    #
    if (config.geoloc_standard_method == 'hmm') and \
       ((geocode_comp.strip() != '') or (locality_comp.strip() != '')):

      geoloc_comp = geocode_comp.strip() + ' ' + locality_comp.strip()
      inout.log_message('  Geocode and locality component: |' + geoloc_comp + \
                        '|', 'v1')

      geoloc_comp = locality.clean_geoloc_component(geoloc_comp)
      [geoloc_words, geoloc_tags] = locality.tag_geoloc_component(geoloc_comp)

      if (geoloc_words != []):  # Component not empty, do HMM standardisation
        geoloc_dict = locality.get_geoloc_hmm(geoloc_words, geoloc_tags)

        for (field, value) in geoloc_dict.items():  # Assign to output dict.
          output_fields[field] = value

    # Standardise geocode component using rules - - - - - - - - - - - - - - -
    #
    elif (config.geoloc_standard_method == 'rules') and \
         (geocode_comp.strip() != ''):

      inout.log_message('  Geocode component: |' + geocode_comp + '|', 'v1')

      ### TO BE DONE
      inout.log_message('Rules based standardisation for geocode is ' + \
                        'not implemented yet', 'err')
      raise Exception()

    # Standardise locality component using rules - - - - - - - - - - - - - - -
    #
    elif (config.geoloc_standard_method == 'rules') and \
         (locality_comp.strip() != ''):

      inout.log_message('  Locality component: |' + locality_comp + '|', 'v1')

      ### TO BE FINALISED
      inout.log_message('Rules based standardisation for locality is ' + \
                        'not implemented yet', 'err')
      raise Exception()

      # locality_comp = locality.clean_geoloc_component(locality_comp)
      # [loc_words, loc_tags] = locality.tag_geoloc_component(locality_comp)
      #
      # [terr,loc_words2,loc_tags2] = locality.get_territory(loc_words,loc_tags)
      # if (terr != ''):
      #   output_fields['territory'] = terr
      #
      # [pc,loc_words3,loc_tags3] = locality.get_postcode(loc_words2,loc_tags2)
      # if (pc != ''):
      #   output_fields['postcode'] = pc
      #
      # [loc_name, loc_quali, loc_words4, loc_tags4] = \
      #   locality.get_localityname_qualifier(loc_words3, loc_tags3)
      # if (loc_name != ''):
      #   output_fields['locality_name'] = loc_name
      # if (loc_quali != ''):
      #   output_fields['locality_quali'] = loc_quali
      #
      # if (loc_words4 != []):  # Not all words are standardised yet
      #   print '  # Remaining word list:', loc_words4  ###### TEST
      #   print '  # Remaining tag list: ', loc_tags4   ###### TEST

    # Standardise date strings - - - - - - - - - - - - - - - - - - - - - - - -
    #
    if (date1_comp != ''):
      inout.log_message('  Date1 component: |' + date1_comp + '|', 'v1')

      [day1, month1, year1, status1] = date.parse_datestr(date1_comp)
      if (day1 != -1):
        output_fields['day1'] = str(day1)
      if (month1 != -1):
        output_fields['month1'] = str(month1)
      if (year1 != -1):
        output_fields['year1'] = str(year1)

    if (date2_comp != ''):
      inout.log_message('  Date2 component: |' + date2_comp + '|', 'v1')

      [day2, month2, year2, status2] = date.parse_datestr(date2_comp)
      if (day2 != -1):
        output_fields['day2'] = str(day2)
      if (month2 != -1):
        output_fields['month2'] = str(month2)
      if (year2 != -1):
        output_fields['year2'] = str(year2)

    # Create log message of output fields - - - - - - - - - - - - - - - - - -
    #
    msg = ['  Standardised record output fields:']
    for (field, value) in output_fields.items():
      if (value != '') and (value != []):
        msg.append('    ' + field + ':' + str(value))
    inout.log_message(msg, 'v1')

    # Save standardised record into output file
    #
    out_line = inout.compose_line(output_fields)
    f_out.write(out_line + os.linesep)

    # Increment line counter and go to beginning of loop - - - - - - - - - - -
    #
    line_read += 1

    inout.log_message('', 'v1')  # Print empty lines between records

  # Close files - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  f_in.close()
  f_out.close()

  msg = ['', 'Number of warnings: ' + str(config.num_warning), \
         'Number of corrected word spillings: ' + str(config.num_word_spills)]
  inout.log_message(msg, 'v1')
  print msg[1]
  print msg[2]

  inout.log_message('End.', 'v1')
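# A hypothetical end-to-end invocation sketch, inferred from the argument
# parsing in tagdata(), trainhmm() and standard() above (the project module
# and data file names are made up for illustration):
#
#   python pyTagData.py  project.py locality tagged.csv 0 1000 100 -v1
#   (manually check and correct the tag/state sequences in tagged.csv)
#   python pyTrainHMM.py project.py locality tagged.csv loc.hmm -s laplace
#   python pyStandard.py project.py 0 1000 -hmm-loc loc.hmm -h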