def __init__(self, **kwargs): """Constructor, set general attributes. """ dict.__init__(self) # Initialise dictionary base type self.description = '' self.created = '' self.modified = '' self.file_names = [] self.default = None # Default return value for non existing keys self.length = None # Number of entries in the look-up table for (keyword, value) in kwargs.items(): if (keyword.startswith('desc')): auxiliary.check_is_string('description', value) self.description = value elif (keyword.startswith('defau')): self.default = value elif (keyword.startswith('creat')): self.created = value elif (keyword.startswith('modif')): self.modified = value else: logging.exception('Illegal constructor argument keyword: "%s"' % \ (str(keyword))) raise Exception
def __init__(self, **kwargs): """Constructor, set general attributes. """ list.__init__(self) # Initialise list base type self.description = '' self.created = '' self.modified = '' self.file_name = '' self.length = None # Number of entries in the correction list for (keyword, value) in kwargs.items(): if (keyword.startswith('desc')): auxiliary.check_is_string('description', value) self.description = value elif (keyword.startswith('creat')): self.created = value elif (keyword.startswith('modif')): self.modified = value else: logging.exception('Illegal constructor argument keyword: "%s"' % \ (str(keyword))) raise Exception # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # logging.info('Initialised correction list "%s"' % (self.description))
def SaveMatchStatusFile(w_vec_dict, match_set, file_name): """Save the matched record identifiers into a CVS file. This function saves the record identifiers of all record pairs that are in the given match set into a CSV file with four columns: - First record identifier - Second record identifier - Summed matching weight from the corresponding weight vector - A unique match identifier (generated in the same way as the ones in the function SaveMatchDataSet below). """ auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict) auxiliary.check_is_set('match_set', match_set) auxiliary.check_is_string('file_name', file_name) match_rec_id_list = list(match_set) # Make a list so it can be sorted match_rec_id_list.sort() if (len(match_set) > 0): num_digit = max(1, int(math.ceil(math.log(len(match_set), 10)))) else: num_digit = 1 mid_count = 1 # Counter for match identifiers # Try to open the file for writing # try: f = open(file_name, 'w') except: logging.exception('Cannot open file "%s" for writing' % (str(file_name))) raise IOError for rec_id_tuple in match_rec_id_list: w_vec = w_vec_dict[rec_id_tuple] w_sum = sum(w_vec) mid_count_str = '%s' % (mid_count) this_mid = 'mid%s' % (mid_count_str.zfill(num_digit)) rec_id1 = rec_id_tuple[0] rec_id2 = rec_id_tuple[1] f.write('%s,%s,%f,%s' % (rec_id1, rec_id2, w_sum, this_mid) + os.linesep) mid_count += 1 f.close()
def SaveMatchStatusFile(w_vec_dict, match_set, file_name): """Save the matched record identifiers into a CVS file. This function saves the record identifiers of all record pairs that are in the given match set into a CSV file with four columns: - First record identifier - Second record identifier - Summed matching weight from the corresponding weight vector - A unique match identifier (generated in the same way as the ones in the function SaveMatchDataSet below). """ auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict) auxiliary.check_is_set('match_set', match_set) auxiliary.check_is_string('file_name', file_name) match_rec_id_list = list(match_set) # Make a list so it can be sorted match_rec_id_list.sort() if (len(match_set) > 0): num_digit = max(1,int(math.ceil(math.log(len(match_set), 10)))) else: num_digit = 1 mid_count = 1 # Counter for match identifiers # Try to open the file for writing # try: f = open(file_name, 'w') except: logging.exception('Cannot open file "%s" for writing' % (str(file_name))) raise IOError for rec_id_tuple in match_rec_id_list: w_vec = w_vec_dict[rec_id_tuple] w_sum = sum(w_vec) mid_count_str = '%s' % (mid_count) this_mid = 'mid%s' % (mid_count_str.zfill(num_digit)) rec_id1 = rec_id_tuple[0] rec_id2 = rec_id_tuple[1] f.write('%s,%s,%f,%s' % (rec_id1, rec_id2, w_sum, this_mid) + os.linesep) mid_count += 1 f.close()
def testIsString(self):  # - - - - - - - - - - - - - - - - - - - - - - - - -
  """Test 'check_is_string' function."""

  assert (auxiliary.check_is_string('TestArgument', 'hello') == None)
  assert (auxiliary.check_is_string('TestArgument', '') == None)
  assert (auxiliary.check_is_string('TestArgument', "123") == None)
  assert (auxiliary.check_is_string('TestArgument', '-1.23') == None)
  assert (auxiliary.check_is_string('TestArgument', "HELlo") == None)
  assert (auxiliary.check_is_string('TestArgument', "'!?!'") == None)
  assert (auxiliary.check_is_string('TestArgument', "[..]") == None)
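# Note: these tests rely on check_is_string() returning None for valid
# string arguments (and, by analogy with its use elsewhere in this code,
# logging and raising an exception for non-string arguments), which is why
# the asserts compare against None rather than testing the result for truth.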
def SaveMatchDataSet(match_set, dataset1, id_field1, new_dataset_name1,
                     dataset2=None, id_field2=None, new_dataset_name2=None):
  """Save the original data set(s) with an additional field (attribute) that
     contains match identifiers.

     This function creates unique match identifiers (one for each matched
     pair of record identifiers in the given match set), and inserts them
     into a new attribute (field) of the data set(s) which will be written.

     If the record identifier field is not one of the fields in the input
     data set, then additionally such a field will be added to the output
     data set (with the name of the record identifier from the input data
     set).

     Currently the output data set(s) to be written will be CSV type data
     sets.

     Match identifiers are of the form 'mid00001', 'mid00002', etc. with the
     number of digits depending upon the total number of matches in the
     match set. If a record is involved in several matches, then the match
     identifiers will be separated by a semi-colon (;).

     Only one new data set will be created for deduplication, and two new
     data sets for linkage. For a deduplication, it is assumed that the
     second data set is set to None.
  """

  auxiliary.check_is_set('match_set', match_set)
  auxiliary.check_is_not_none('dataset1', dataset1)
  auxiliary.check_is_string('id_field1', id_field1)
  auxiliary.check_is_string('new_dataset_name1', new_dataset_name1)

  if (dataset2 != None):  # A linkage, check second set of parameters
    auxiliary.check_is_not_none('dataset2', dataset2)
    auxiliary.check_is_string('id_field2', id_field2)
    auxiliary.check_is_string('new_dataset_name2', new_dataset_name2)
    do_link = True
  else:
    do_link = False

  match_rec_id_list = list(match_set)  # Make a list so it can be sorted
  match_rec_id_list.sort()

  if (len(match_set) > 0):
    num_digit = max(1, int(math.ceil(math.log(len(match_set), 10))))
  else:
    num_digit = 1
  mid_count = 1  # Counter for match identifiers

  # Generate a dictionary with record identifiers as keys and lists of match
  # identifiers as values
  #
  match_id_dict1 = {}  # For first data set
  match_id_dict2 = {}  # For second data set, not required for deduplication

  for rec_id_tuple in match_rec_id_list:
    rec_id1, rec_id2 = rec_id_tuple

    mid_count_str = '%s' % (mid_count)
    this_mid = 'mid%s' % (mid_count_str.zfill(num_digit))

    rec_id1_mid_list = match_id_dict1.get(rec_id1, [])
    rec_id1_mid_list.append(this_mid)
    match_id_dict1[rec_id1] = rec_id1_mid_list

    if (do_link == True):  # Do the same for second data set
      rec_id2_mid_list = match_id_dict2.get(rec_id2, [])
      rec_id2_mid_list.append(this_mid)
      match_id_dict2[rec_id2] = rec_id2_mid_list

    else:  # Same dictionary for deduplication
      rec_id2_mid_list = match_id_dict1.get(rec_id2, [])
      rec_id2_mid_list.append(this_mid)
      match_id_dict1[rec_id2] = rec_id2_mid_list

    mid_count += 1

  # Now initialise new data set(s) for output based on input data set(s) - - -

  # First need to generate field list from input data set
  #
  if (dataset1.dataset_type == 'CSV'):
    new_dataset1_field_list = dataset1.field_list[:]  # Make a copy of list
    last_col_index = new_dataset1_field_list[-1][1] + 1

  elif (dataset1.dataset_type == 'COL'):
    new_dataset1_field_list = []
    col_index = 0
    for (field, col_width) in dataset1.field_list:
      new_dataset1_field_list.append((field, col_index))
      col_index += 1
    last_col_index = col_index

  # Check if the record identifier is not a normal input field (in which case
  # it has to be written into the output data set as well)
  #
  rec_ident_name = dataset1.rec_ident

  add_rec_ident = True
  for (field_name, field_data) in dataset1.field_list:
    if (field_name == rec_ident_name):
      add_rec_ident = False
      break

  if (add_rec_ident == True):  # Put record identifier into first column
    new_dataset1_field_list.append((rec_ident_name, last_col_index))
    last_col_index += 1

  # Append match id field
  #
  new_dataset1_field_list.append((id_field1, last_col_index))

  new_dataset1_description = dataset1.description + ' with match identifiers'

  new_dataset1 = dataset.DataSetCSV(description=new_dataset1_description,
                                    access_mode='write',
                                    rec_ident=dataset1.rec_ident,
                                    header_line=True,
                                    write_header=True,
                                    strip_fields=dataset1.strip_fields,
                                    miss_val=dataset1.miss_val,
                                    field_list=new_dataset1_field_list,
                                    delimiter=dataset1.delimiter,
                                    file_name=new_dataset_name1)

  # Read all records, add match identifiers and write into new data set
  #
  for (rec_id, rec_list) in dataset1.readall():
    if (add_rec_ident == True):  # Add record identifier
      rec_list.append(rec_id)

    mid_list = match_id_dict1.get(rec_id, [])
    mid_str = ';'.join(mid_list)
    rec_list.append(mid_str)

    new_dataset1.write({rec_id: rec_list})

  new_dataset1.finalise()

  if (do_link == True):  # Second data set for linkage only - - - - - - - - -

    if (dataset2.dataset_type == 'CSV'):
      new_dataset2_field_list = dataset2.field_list[:]  # Make a copy of list
      last_col_index = new_dataset2_field_list[-1][1] + 1

    elif (dataset2.dataset_type == 'COL'):
      new_dataset2_field_list = []
      col_index = 0
      for (field, col_width) in dataset2.field_list:
        new_dataset2_field_list.append((field, col_index))
        col_index += 1
      last_col_index = col_index

    # Check if the record identifier is not a normal input field (in which
    # case it has to be written into the output data set as well)
    #
    rec_ident_name = dataset2.rec_ident

    add_rec_ident = True
    for (field_name, field_data) in dataset2.field_list:
      if (field_name == rec_ident_name):
        add_rec_ident = False
        break

    if (add_rec_ident == True):  # Put record identifier into first column
      new_dataset2_field_list.append((rec_ident_name, last_col_index))
      last_col_index += 1

    # Append match id field
    #
    new_dataset2_field_list.append((id_field2, last_col_index))

    new_dataset2_description = dataset2.description + ' with match identifiers'

    new_dataset2 = dataset.DataSetCSV(description=new_dataset2_description,
                                      access_mode='write',
                                      rec_ident=dataset2.rec_ident,
                                      header_line=True,
                                      write_header=True,
                                      strip_fields=dataset2.strip_fields,
                                      miss_val=dataset2.miss_val,
                                      field_list=new_dataset2_field_list,
                                      file_name=new_dataset_name2)

    # Read all records, add match identifiers and write into new data set
    #
    for (rec_id, rec_list) in dataset2.readall():
      if (add_rec_ident == True):  # Add record identifier
        rec_list.append(rec_id)

      mid_list = match_id_dict2.get(rec_id, [])
      mid_str = ';'.join(mid_list)
      rec_list.append(mid_str)

      new_dataset2.write({rec_id: rec_list})

    new_dataset2.finalise()
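# A short sketch of the match identifier format used above (values made up):
# with 250 matches, num_digit = ceil(log10(250)) = 3, so identifiers are
# zero-filled to three digits:
#
#   mid_count_str = '7'
#   'mid%s' % (mid_count_str.zfill(3))  # -> 'mid007'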
def GenerateHistogram(w_vec_dict, bin_width, file_name=None, match_sets=None):
  """Print and/or save a histogram of the weight vectors stored in the given
     dictionary, and according to the match sets (if given). The histogram is
     rotated 90 degrees clockwise, i.e. up to down instead of left to right.

     This function sums up the number of weight vectors with a matching
     weight in a given bin (according to the given bin width).

     If given, the match sets must be a tuple containing three sets, the
     first being a set with matches, the second with non-matches, and the
     third with possible matches, as generated by classifiers in the
     classification.py Febrl module.

     For each bin, the number of weight vectors in this bin is printed as
     well, and if the match sets are given the number of matches, non-matches
     and possible matches in this bin.

     If a file name is given, the output will be written into this text file.

     This function returns a list containing the histogram as text strings.
  """

  MAX_HISTO_WIDTH = 80  # maximum width in characters

  auxiliary.check_is_dictionary('w_vec_dict', w_vec_dict)
  auxiliary.check_is_number('bin_width', bin_width)
  auxiliary.check_is_positive('bin_width', bin_width)
  if (file_name != None):
    auxiliary.check_is_string('file_name', file_name)

  if (match_sets != None):
    auxiliary.check_is_tuple('match_sets', match_sets)
    if (len(match_sets) != 3):
      logging.exception('Match sets must be a tuple containing three sets.')
      raise Exception
    auxiliary.check_is_set('match_sets[0]', match_sets[0])
    auxiliary.check_is_set('match_sets[1]', match_sets[1])
    auxiliary.check_is_set('match_sets[2]', match_sets[2])
    if (len(w_vec_dict) != (len(match_sets[0]) + len(match_sets[1]) + \
        len(match_sets[2]))):
      logging.exception('Length of weight vector dictionary differs from ' + \
                        'summed lengths of match sets.')
      raise Exception

  # Check if weight vector dictionary is empty, if so return empty list
  #
  if (w_vec_dict == {}):
    logging.warn('Empty weight vector dictionary given for histogram ' + \
                 'generation')
    return []

  # Get a random vector dictionary element to get dimensionality of vectors
  #
  (rec_id_tuple, w_vec) = w_vec_dict.popitem()
  v_dim = len(w_vec)
  w_vec_dict[rec_id_tuple] = w_vec  # Put back in

  histo_dict = {}  # A combined histogram dictionary

  if (match_sets != None):  # Also matches, non-matches and possible matches
    match_histo_dict = {}
    non_match_histo_dict = {}
    poss_match_histo_dict = {}

  max_bin_w_count = -1  # Maximal count for one binned weight entry

  # Loop over weight vectors - - - - - - - - - - - - - - - - - - - - - - - - -
  #
  for (rec_id_tuple, w_vec) in w_vec_dict.iteritems():
    w_sum = sum(w_vec)  # Sum all weight vector elements

    binned_w = w_sum - (w_sum % bin_width)

    binned_w_count = histo_dict.get(binned_w, 0) + 1  # Increase count by one
    histo_dict[binned_w] = binned_w_count

    if (binned_w_count > max_bin_w_count):  # Check if new maximum count
      max_bin_w_count = binned_w_count

    if (match_sets != None):
      if (rec_id_tuple in match_sets[0]):
        binned_w_count = match_histo_dict.get(binned_w, 0) + 1
        match_histo_dict[binned_w] = binned_w_count
      elif (rec_id_tuple in match_sets[1]):
        binned_w_count = non_match_histo_dict.get(binned_w, 0) + 1
        non_match_histo_dict[binned_w] = binned_w_count
      else:  # A possible match
        binned_w_count = poss_match_histo_dict.get(binned_w, 0) + 1
        poss_match_histo_dict[binned_w] = binned_w_count

  # Sort histogram according to X axis values - - - - - - - - - - - - - - - -
  #
  x_vals = histo_dict.keys()
  x_vals.sort()

  assert sum(histo_dict.values()) == len(w_vec_dict)

  if (match_sets == None):  # Can use 61 characters for the histogram bars
    scale_factor_y = float(MAX_HISTO_WIDTH - 19) / max_bin_w_count
  elif (len(poss_match_histo_dict) == 0):  # No possible matches
    scale_factor_y = float(MAX_HISTO_WIDTH - 30) / max_bin_w_count
  else:  # All three sets non-empty
    scale_factor_y = float(MAX_HISTO_WIDTH - 41) / max_bin_w_count

  # Generate the histogram as a list of strings - - - - - - - - - - - - - - -
  #
  histo_list = []
  histo_list.append('Weight histogram:')
  histo_list.append('-----------------')

  if (match_sets == None):
    histo_list.append(' Counts | w_sum |')
    histo_list.append('-------------------')
  elif (len(poss_match_histo_dict) == 0):  # No possible matches
    histo_list.append(' Counts |')
    histo_list.append(' Match | Non-Match| w_sum |')
    histo_list.append('------------------------------')
  else:
    histo_list.append(' Counts |')
    histo_list.append(' Match | Non-Match|Poss-Match| w_sum |')
    histo_list.append('-----------------------------------------')

  for x_val in x_vals:
    this_count = histo_dict[x_val]

    if (match_sets == None):
      line_str = '%9d | %5.2f |' % (this_count, x_val)
    elif (len(poss_match_histo_dict) == 0):  # No possible matches
      this_match_count = match_histo_dict.get(x_val, 0)
      this_non_match_count = non_match_histo_dict.get(x_val, 0)
      line_str = '%9d |%9d | %5.2f |' % (this_match_count,
                                         this_non_match_count, x_val)
    else:
      this_match_count = match_histo_dict.get(x_val, 0)
      this_non_match_count = non_match_histo_dict.get(x_val, 0)
      this_poss_match_count = poss_match_histo_dict.get(x_val, 0)
      line_str = '%9d |%9d |%9d | %5.2f |' % (this_match_count,
                                              this_non_match_count,
                                              this_poss_match_count, x_val)

    line_str += '*' * int(this_count * scale_factor_y)
    histo_list.append(line_str)

  histo_list.append('')

  # If a file name is given open it for writing - - - - - - - - - - - - - - -
  #
  if (file_name != None):
    try:
      f = open(file_name, 'w')
    except:
      logging.exception('Cannot open file "%s" for writing' % (str(file_name)))
      raise IOError

    for line in histo_list:
      f.write(line + os.linesep)
    f.close()

    logging.info('Histogram written to file: %s' % (file_name))

  if (match_sets != None):
    print match_histo_dict.items()
    print non_match_histo_dict.items()

  return histo_list
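# A small usage sketch (hypothetical weight vectors):
#
#   w_vec_dict = {('a1', 'b1'): [0.9, 0.95],
#                 ('a2', 'b2'): [0.1, 0.0],
#                 ('a3', 'b3'): [0.2, 0.1]}
#   for line in GenerateHistogram(w_vec_dict, 0.5):
#     print line
#
# With bin width 0.5 the summed weights 1.85, 0.1 and 0.3 fall into the
# bins 1.5, 0.0 and 0.0, so the 0.0 bin shows count 2 and the 1.5 bin
# count 1.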
def LoadWeightVectorFile(file_name):
  """Function to load a weight vector dictionary from a file, assumed to be
     of type CSV (comma separated values), with the first line being a header
     line containing the field comparison names.

     Such files are normally written within the run() method of index
     implementations, see the Febrl module indexing.py.

     The first two columns in each line are assumed to be the two record
     identifiers which (together as a tuple) will become the keys in the
     weight vector dictionary that is returned.

     The function first checks if a gzipped version of the file is available
     (with file ending '.gz' or '.GZ').

     This function returns a list with the field comparison names and a
     weight vector dictionary.
  """

  auxiliary.check_is_string('file_name', file_name)

  if (file_name[-3:] not in ['.gz', '.GZ']):  # Check for gzipped versions
    if (os.access(file_name + '.gz', os.F_OK) == True):
      file_name = file_name + '.gz'
    elif (os.access(file_name + '.GZ', os.F_OK) == True):
      file_name = file_name + '.GZ'

  if (file_name.endswith('.gz')) or (file_name.endswith('.GZ')):
    try:
      in_file = gzip.open(file_name)  # Open gzipped file
    except:
      logging.exception('Cannot open gzipped CSV file "%s" for reading' % \
                        (file_name))
      raise IOError

  else:  # Open normal file for reading
    try:  # Try to open the file in read mode
      in_file = open(file_name)
    except:
      logging.exception('Cannot open CSV file "%s" for reading' % \
                        (file_name))
      raise IOError

  # Initialise the CSV parser - - - - - - - - - - - - - - - - - - - - - - - -
  #
  csv_parser = csv.reader(in_file)

  header_line = csv_parser.next()  # Read header line

  # Generate field names list
  #
  field_names_list = header_line[2:]  # Remove record identifier names

  weight_vec_dict = {}  # Fill weight vector dictionary with data from file

  for line in csv_parser:
    rec_id_tuple = (line[0], line[1])

    if (rec_id_tuple in weight_vec_dict):  # Check for unique record ids
      logging.warn('Record identifier tuple %s already in weight vector ' % \
                   (str(rec_id_tuple)) + 'dictionary')

    w_vec = []
    for w in line[2:]:
      w_vec.append(float(w))

    weight_vec_dict[rec_id_tuple] = w_vec

  in_file.close()

  return [field_names_list, weight_vec_dict]
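# Expected input layout sketch (the comparison field names and record
# identifiers below are made up): the first two header columns name the
# record identifiers, the remaining columns the field comparisons:
#
#   rec_id1,rec_id2,given_name,surname
#   a1,b1,1.0,0.8
#   a2,b2,0.0,0.2
#
# Loading this file returns:
#   [['given_name', 'surname'],
#    {('a1', 'b1'): [1.0, 0.8], ('a2', 'b2'): [0.0, 0.2]}]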
def load(self, file_names): """Load one or more files with words and their frequency counts into the look-up table. See Febrl manual for details on the file format. """ # Check input argument type - - - - - - - - - - - - - - - - - - - - - - - - # if (isinstance(file_names, str)): file_names = [file_names] # Make a list out of a single file name auxiliary.check_is_list('file_names', file_names) i = 0 for file_name in file_names: auxiliary.check_is_string('file_name[%d]' % (i), file_name[i]) i += 1 self.file_names = file_names self.clear() # Remove all items from the look-up table self.sum = 0 # Loop over file names - - - - - - - - - - - - - - - - - - - - - - - - - - # for fn in self.file_names: try: # Open file and read all lines into a list f = open(fn,'r') except: logging.exception('Cannot read from file "%s"' % (fn)) raise IOError file_data = f.readlines() # Read complete file f.close() # Now process all lines - - - - - - - - - - - - - - - - - - - - - - - - - # for line in file_data: l = line.strip() if (len(l) > 0) and (l[0] != '#'): # Not empty line and not comment ll = l.split(',') # Get fields from a line # Check for two columns # if (len(ll) != 2): logging.exception('Illegal file format (not 2 columns) in file' + \ ': "%s" in line: %s"' % (fn, l)) raise Exception key = ll[0].strip().lower() # Make sure it's lower case val = ll[1].strip().lower() try: val = int(val) # Convert the value into an integer except: logging.exception('Illegal value for frequency count: "%s"' % \ (str(val)) + ' in line: "%s"' % (l)) raise Exception if (self.__contains__(key)): val += self.__getitem__(key) # Sum up counts self.__setitem__(key, val) self.sum += val self.length = self.__len__() # Get number of elements in the look-up table # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # logging.info('Loaded frequency look-up table "%s"' % \ (self.description)) logging.info(' From files: %s' % (str(self.file_names))) logging.info(' Number of entries: %i' % (self.length)) logging.info(' Sum of all value: %i' % (self.sum))
def load(self, file_name): """Load one correction list file into a sorted (decreasing length) list. See Febrl manual for details on the file format. """ # Check input argument type and open file - - - - - - - - - - - - - - - - - # auxiliary.check_is_string('file_name', file_name) self.file_name = file_name # Make sure the list is empty, remove all items from the correction list # while (self.__len__() > 0): self.pop() try: # Open file and read all lines into a list f = open(self.file_name, 'r') except: logging.exception('Cannot read from file "%s"' % (str(self.file_name))) raise IOError file_data = f.readlines() # Read complete file f.close() org_list = [] # List of original strings (the ones to be replaced) repl_list = [] # List of replacement strings len_list = [] # List of original string lengths repl = '' # Set inital replacement to nothing # Now process all lines - - - - - - - - - - - - - - - - - - - - - - - - - - # for line in file_data: l = line.strip() # Remove line separators at the end if (len(l) > 0) and (l[0] != '#'): # Not an empty line and not comment ll = l.split(':=') # Separate replacement from values if (len(ll) == 2): # Line contains a replacement - - - - - - - - - - - repl = ll[0].strip().lower() # Make replacement lower and strip if (not ((repl[0] == '"') and (repl[-1] == '"') or \ (repl[0] == "'") and (repl[-1] == "'"))): logging.exception('Replacement string is not properly quoted: '+ \ '"%s" in file: "%s"' % (repl, str(self.file_name))) raise Exception repl = repl[1:-1] # Remove quotes from replacement string v = ll[1].lower() # Get values in a string and make lowercase elif (len(ll) == 1): # Line contains only values - - - - - - - - - - - v = ll[0].lower() # Get values in a string and make lowercase else: # More than one ':=' separator in the line - - - - - - - - - - - logging.exception('Too many ":=" separators in line: "%s"' % (l)) raise Exception # Now process the values and append them to the list - - - - - - - - - vv = v.split(',') # Split values into a list for v in vv: # Loop over all values - - - - - - - - - - - - - - - - org = v.strip() # Get the original string if (org != ''): # Only process non-empty values if (not ((org[0] == '"') and (org[-1] == '"') or \ (org[0] == "'") and (org[-1] == "'"))): logging.exception('Original string is not properly quoted: '+ \ '"%s" in file: "%s"' % (org, str(self.file_name))) raise Exception org = org[1:-1] # Remove quotes from original string if (org != ''): # Only append non-empty values org_list.append(org) repl_list.append(repl) len_list.append(len(org)) tmp_list = map(None,len_list,org_list,repl_list) tmp_list.sort() tmp_list.reverse() for (i,org,repl) in tmp_list: self.append((org,repl)) self.length = self.__len__() # Get number of elements in the look-up table # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # logging.info('Loaded correction list "%s"' % (self.description)) logging.info(' From file: %s' % (str(self.file_name))) logging.info(' Number of entries: %i' % (self.length))
def load(self, file_names): """Load one or more files with entries and their localities into the table. See Febrl manual for details on the file format. """ # Check input argument type - - - - - - - - - - - - - - - - - - - - - - - - # if (isinstance(file_names, str)): file_names = [file_names] # Make a list out of a single file name auxiliary.check_is_list('file_names', file_names) i = 0 for file_name in file_names: auxiliary.check_is_string('file_name[%d]' % (i), file_name[i]) i += 1 self.file_names = file_names self.clear() # Remove all items from the look-up table # Loop over file names - - - - - - - - - - - - - - - - - - - - - - - - - - # for fn in self.file_names: try: # Open file and read all lines into a list f = open(fn,'r') except: logging.exception('Cannot read from file "%s"' % (fn)) raise IOError file_data = f.readlines() # Read complete file f.close() # Now process all lines - - - - - - - - - - - - - - - - - - - - - - - - - # for line in file_data: l = line.strip() if (len(l) > 0) and (l[0] != '#'): # Not empty line and not comment ll = l.split(',') # Get fields from a line # Check for three columns # if (len(ll) != 3): logging.exception('Illegal file format (not 3 columns) in file' + \ ': "%s" in line: %s' % (fn, l)) raise Exception key = ll[0].strip().lower() # Make sure it's lower case long = ll[1].strip() lati = ll[2].strip() # Try to convert into numerical (float) values # try: long = float(long) except: logging.exception('Longitude: "%s" is not a number in line: "%s"' \ % (str(long), l)) raise Exception try: lati = float(lati) except: logging.exception('Lattitude: "%s" is not a number in line: "%s"' \ % (str(lati), l)) raise Exception # And check their values # if (long < -180.0) or (long > 180.0): logging.exception('Illegal value for longitude: '+str(long)) raise Exception if (lati < -90.0) or (lati > 90.0): logging.exception('Illegal value for latitude: '+str(lati)) raise Exception val = [long,lati] # Value for dictionary if (self.__contains__(key)) and (self.__getitem__(key) != val): logging.exception('Key "%s" already in look-up table with ' % \ (str(key)) + 'different value') raise Exception self.__setitem__(key, val) self.length = self.__len__() # Get number of elements in the look-up table # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # logging.info('Loaded geocode look-up table "%s"' % (self.description)) logging.info(' From files: %s' % (str(self.file_names))) logging.info(' Number of entries: %i' % (self.length))
def load(self, file_names): """Load one or more files with word corrections and tags into the look-up table. See Febrl manual for details on the file format. """ # Check input argument type - - - - - - - - - - - - - - - - - - - - - - - - # if (isinstance(file_names, str)): file_names = [file_names] # Make a list out of a single file name auxiliary.check_is_list('file_names', file_names) i = 0 for file_name in file_names: auxiliary.check_is_string('file_name[%d]' % (i), file_name[i]) i += 1 self.file_names = file_names self.clear() # Remove all items from the look-up table self.max_key_length = 0 # Loop over file names - - - - - - - - - - - - - - - - - - - - - - - - - - # for fn in self.file_names: try: # Open file and read all lines into a list f = open(fn,'r') except: logging.exception('Cannot read from file "%s"' % (fn)) raise IOError file_data = f.readlines() # Read complete file f.close() tag = '' # Start with no tag key = '' # Start with an empty key # Now process all lines - - - - - - - - - - - - - - - - - - - - - - - - - # for line in file_data: l = line.strip() # Remove line separators if (len(l) > 0) and (l[0] != '#'): # Not empty line and not comment if (l[:5] == 'tag=<'): # It's a line with a new tag tag = l[5:7] else: # A line with an entry # Make sure a tag is set # if (tag == ''): logging.exception('Missing tag specification in file "%s"' % \ (fn)) raise Exception line_list = l.split(':') # Separate key from values if (len(line_list) > 2): logging.exception('Illegal format in file "%s" in line: %s' % \ (fn, l)) raise Exception if (len(line_list) == 2): # Line contains a key - - - - - - - - - key = line_list[0].strip().lower() # Get and clean key key_list = key.split(' ') # Make a list of key words if (len(key_list) > self.max_key_length): self.max_key_length = len(key_list) # Update maximal key length # Insert key itself into lookup table # dict_val = '_'.join(key_list) dict_key = tuple(key_list) this_tag = tag if (self.__contains__(dict_key)): # Already in lookup table test_item = self.__getitem__(dict_key) test_val = test_item[0] # Value without tag test_tag = test_item[1] if (dict_val != test_val): logging.warn('Key "%s" already in dictionary with ' % \ (str(dict_val)) + 'different value (old value ' + \ 'will be over written with "%s")' % (str(test_val))) if (test_tag.find(this_tag) < 0): # This tag is new this_tag = test_tag+'/'+this_tag # Tag for this entry else: this_tag = test_tag this_val = (dict_val, this_tag) self.__setitem__(dict_key,this_val) # Insert key itself vals = line_list[1].lower() # Get values in this line in a string elif (len(line_list) == 1): # Line contains only values - - - - - vals = line_list[0].lower() # Get values in this line in a string # Porcess all values right of ':' in this line val_list = vals.split(',') # Split values into a list for val in val_list: # Loop over all values - - - - - - - - - - - val_strip = val.strip() if (val_strip != ''): # Only append non-empty values key_list = val_strip.split(' ') # Make a list of key words if (len(key_list) > self.max_key_length): self.max_key_length = len(key_list) # Update maximal key len dict_key = tuple(key_list) this_tag = tag if (self.__contains__(dict_key)): test_item = self.__getitem__(dict_key) test_val = test_item[0] # Value without tag test_tag = test_item[1] if (dict_val != test_val): logging.warn('Key "%s" already in dictionary with ' % \ (str(dict_val)) + 'different value (old value ' + \ 'will be over written with "%s")' % (str(test_val))) if (test_tag.find(this_tag) < 0): # This tag 
is new this_tag = test_tag+'/'+this_tag # Tag for this entry else: this_tag = test_tag this_val = (dict_val, this_tag) self.__setitem__(dict_key,this_val) self.length = self.__len__() # Get number of elements in the look-up table # A log message - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # logging.info('Loaded tag look-up table "%s"' % (self.description)) logging.info(' From files: %s' % (str(self.file_names))) logging.info(' Number of entries: %i' % (self.length)) logging.info(' Maximal key length: %i' % (self.max_key_length))
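# Tag look-up file format sketch (hypothetical content): a 'tag=<XX>' line
# sets the two-character tag for the entries that follow; an entry line has
# an optional key, ':', then comma-separated values that all map to that key:
#
#   tag=<TI>
#   doctor : dr, doc
#
# This maps the keys ('doctor',), ('dr',) and ('doc',) all to the value
# ('doctor', 'TI').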