def __call__(self, **kwargs):
    """
    Executes the model.

    Args:
        kwargs (dict, optional):
            keyword arguments passed to pre(), control() and post()

            silent (bool):
                if True, suppress printing

    Returns:
        (float):
            residuum in the range 0.0..1.0 indicating the error of the
            task, or -1.0 if parallelized with MPI and rank > 0
    """
    # Skip model execution if parallelized with MPI and rank > 0
    if parallel.rank():
        return -1.0

    self.silent = kwargs.get('silent', self.silent)

    self.prolog()
    self.pre(**kwargs)

    res = self.control(**kwargs)

    self.post(**kwargs)
    self.epilog()

    return res
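# Usage sketch for __call__() (illustration only): the surrounding class is
# assumed to provide prolog(), pre(), control(), post() and epilog(); the
# subclass name 'Model' below is hypothetical.
#
#     class Model(Base):
#         def control(self, **kwargs):
#             return 0.0            # residuum: task solved exactly
#
#     model = Model()
#     res = model(silent=True)      # runs prolog/pre/control/post/epilog
#     if res == -1.0:
#         pass                      # MPI rank > 0: execution was skipped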
def write(self, *message):
    """
    Sends output to the console (if not in silent mode) and to the log
    file of the root object. Wraps the print() function, considering the
    indentation of follower objects.

    Args:
        message (list):
            variable arguments to be written; if the last argument is
            None, the line won't be completed (no carriage return)

    Note:
        No separator between printed elements, comparable to
        print(x, sep='')
    """
    silent = self.silent or parallel.rank()

    lf = self._logFile
    if lf is None:
        lf = self.root()._logFile

    # Indent only at the start of a new line
    if self._lineCompleted:
        if not silent:
            print(self.indent(), end='')
        if lf:
            print(self.indent(), end='', file=lf)

    for x in message:
        if x is not None:
            if not silent:
                print(str(x), end='')
            if lf:
                print(str(x), end='', file=lf)

    if not message or message[-1] is not None:
        if not silent:
            print()
        if lf:
            print(file=lf)
        self._lineCompleted = True
    else:
        # Last argument was None: mark the line as incomplete so the next
        # write() call continues it without re-indenting (branch added for
        # consistency with the indentation logic above)
        self._lineCompleted = False
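# Usage sketch for write() (illustrative, assuming 'obj' is an instance of
# the class defining write() above, with indentation and log file set up):
#
#     obj.write('x: ', 1.23)          # prints 'x: 1.23' and ends the line
#     obj.write('loading ', None)     # no newline; next call continues line
#     obj.write('done')               # completes the line started above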
def auction(cost_dict, min_cost, max_cost, row_numbers, col_numbers):
    """Linear sum assignment procedure based on the symmetric auction
       algorithm.

       Re-implementation of a FORTRAN code, taken from:
         http://web.mit.edu/afs/athena.mit.edu/user/d/i/dimitrib/www/auction.txt

       and as described in:
         "Auction Algorithms for Network Flow Problems: A Tutorial
         Introduction", Dimitri P. Bertsekas, Computational Optimization and
         Applications, Vol. 1, pp. 7-66, 1992.

       and other papers by Dimitri P. Bertsekas, see:
         http://www.mit.edu:8001//people/dimitrib/publ.html

       Takes as input a cost dictionary, the minimum and maximum weights in
       this dictionary, and two sorted lists with the row and column numbers.
    """

    # ################## START PARALLEL TEST CODE ############################
    if (SAVE_PARALLEL_TEST_FILES == True):
        tmp_list = cost_dict.keys()
        tmp_list.sort()
        tmp_str = str(min_cost) + ' / ' + str(max_cost) + ', ' + \
                  str(row_numbers) + ' / ' + str(col_numbers) + ':: '
        for k in tmp_list:
            tmp_list2 = cost_dict[k].items()
            tmp_list2.sort()
            tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / '
        f = open('lap-auction-' + str(parallel.rank()) + '-' + \
                 str(parallel.size()), 'a')
        f.write(tmp_str + os.linesep)
        f.close()
    # ################## END PARALLEL TEST CODE ##############################

    i_large = 100000000  # A value larger than max_cost

    num_rows = len(row_numbers)
    num_cols = len(col_numbers)

    if (num_rows != num_cols):
        print 'error:Asymmetric problem given to symmetric "auction" algorithm'
        raise Exception

    # Set parameters for auction algorithm - - - - - - - - - - - - - - - - - -
    #
    if (max_cost > int(i_large / (num_rows + 1))):
        print 'error:Cost range too large to work with integer epsilon'
        raise Exception

    max_cost *= (num_rows + 1)

    beg_eps = max_cost / 5   # Maybe smaller, can even be 1, but not smaller
    end_eps = num_rows / 10  # Must be smaller than beg_eps, can be 1
    if (end_eps < 1):
        end_eps = 1
    elif (end_eps > beg_eps):
        end_eps = beg_eps

    factor = 5  # Must be greater than 1

    start_incr = beg_eps / 10  # May even be 1, but not smaller
    if (start_incr < 1):
        start_incr = 1

    # Initialisation
    #
    eps = beg_eps
    i_small = -i_large
    large_incr = int(i_large / 10)
    thresh = min(int(num_rows / 5), 100)  # Maximal value 100
    incr_factor = 2
    cycles = 1
    average = num_cols
    num_phases = 1

    # Initialise dictionary for prices and dictionary for row assignments
    #
    pcol = {}
    assigned = {}
    for col_num in col_numbers:
        pcol[col_num] = i_small

    # Initialise list of un-assigned rows (all rows at the beginning)
    # (named 'unassigned_rows' so the built-in 'list' is not shadowed)
    #
    unassigned_rows = row_numbers[:]
    no_list = num_rows

    do_phase = True  # Set flag so a first phase is performed

    # Start sub-problem (scaling phase with new epsilon) - - - - - - - - - - -
    #
    while (do_phase == True):

        if (eps == 1):
            thresh = 0

        incr = max(start_incr, eps)  # Increment must not be larger than epsilon

        print '2:  Start of a scaling phase with epsilon: %f' % (eps)

        do_cycle = True  # Set flag so a first cycle is performed
        cycle_count = 0

        while (do_cycle == True):

            # Start forward auction cycle - - - - - - - - - - - - - - - - - -
            #
            no_new_list = 0  # Initialise count of next list of un-assigned rows

            # Cycle through the current list of un-assigned rows
            #
            for i in range(no_list):
                row_num = unassigned_rows[i]
                row_dict = cost_dict[row_num]
                row_list = row_dict.items()
                row_list.sort()
                row_len = len(row_dict)

                # Get first and second column number and cost
                #
                col_num, cost = row_list[0]
                col_num2, cost2 = row_list[1]

                max1 = cost - pcol[col_num]
                max2 = cost2 - pcol[col_num2]

                if (max1 > max2):
                    best_col_num = col_num
                elif (max1 < max2):  # Swap maximum values
                    max1, max2 = max2, max1
                    best_col_num = col_num2
                else:  # Both are the same
                    if (col_num < col_num2):  # Best column number is smallest
                        best_col_num = col_num
                    else:
                        best_col_num = col_num2

                if (row_len > 2):  # Row has more than two elements

                    for c in range(2, row_len):
                        col_num3, cost3 = row_list[c]  # Loop through columns
                        max_tmp = cost3 - pcol[col_num3]

                        if (max_tmp > max2):
                            if (max_tmp > max1):
                                best_col_num = col_num3  # New best column
                                max2 = max1
                                max1 = max_tmp
                            elif (max_tmp == max1) and \
                                 (col_num3 < best_col_num):
                                best_col_num = col_num3  # Smallest column number
                            else:
                                max2 = max_tmp

                # Row bids for best column increasing its price, and gets - - -
                # assigned to the best column, while any row assigned to the
                # best column gets un-assigned
                #
                pcol[best_col_num] = pcol[best_col_num] + max1 - max2 + incr

                old_row = assigned.get(best_col_num, -1)
                assigned[best_col_num] = row_num

                if (old_row >= 0):  # Column was assigned to a row before
                    unassigned_rows[no_new_list] = old_row  # Save into list
                    no_new_list += 1

                cycle_count += 1
                if ((cycle_count % 10000) == 0):
                    print '1:    Finished %i cycles, %i out of %i rows ' % \
                          (cycle_count, no_new_list, num_rows) + 'un-assigned'
                    if (no_new_list > 0):
                        print '1:    Un-assigned rows: %s' % \
                              (str(unassigned_rows[:no_new_list]))

            # Collect statistics
            #
            average = (cycles * average + no_list) / (cycles + 1)
            cycles += 1

            # Check if there are still 'many' un-assigned rows, i.e. if - - - -
            # the number of un-assigned rows is greater than the parameter
            # 'thresh'. If so, replace the current list with the new list
            # and go for another cycle. Otherwise leave the cycle loop: if
            # epsilon > 1 a new scaling phase is started below (epsilon is
            # reduced and the assignment reset to empty), and if epsilon == 1
            # the algorithm terminates.
            # Also increase the minimal bidding increment up to a maximum
            # value of epsilon (this is the adaptive feature)
            #
            incr *= incr_factor
            if (incr > eps):
                incr = eps

            if (no_new_list > thresh):
                no_list = no_new_list
            else:
                do_cycle = False  # Set flag so cycle is left

        # End of sub-problem (scaling phase) - - - - - - - - - - - - - - - - -
        #
        if (eps == 1):
            do_phase = False  # Set flag so phase is left

        else:
            num_phases += 1

            eps = int(eps / factor)
            if (eps > incr):
                eps = int(eps / factor)
            if (eps < 1) or (eps < end_eps):
                eps = 1

            thresh = int(thresh / factor)

            print '1:  End of a scaling phase, new epsilon: %f' % (eps)

            t_min = min(pcol.values() + [i_large])

            for col_num in assigned:  # Un-assign all columns again
                row_num = assigned[col_num]
                if (row_num >= 0):
                    unassigned_rows[no_new_list] = row_num
                    no_new_list += 1
                    assigned[col_num] = -1

            incr = t_min - i_small  # Reset minimum price to i_small

            for col_num in col_numbers:  # Update all prices
                pcol[col_num] = pcol[col_num] - incr

            # Final parameter updates before starting another scaling phase
            #
            no_list = no_new_list

            if (start_incr < eps):
                start_incr *= factor

    return assigned
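# Illustrative call of auction() (hypothetical data): a symmetric 2x2
# problem with integer costs, as expected by the integer-epsilon scaling
# above.
#
#     cost_dict = {1: {1: 9, 2: 3}, 2: {1: 3, 2: 9}}
#     assigned = auction(cost_dict, 3, 9, [1, 2], [1, 2])
#
# The returned dictionary maps each column number to the row number assigned
# to it; here the diagonal carries the larger costs, so the expected result
# is {1: 1, 2: 2}.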
def do_lap(lap_method, results_dict, process_type, threshold):
    """Linear sum assignment procedure.

       This routine calculates a linear assignment for one-to-one matching.
       It does all kinds of pre-processing, including the extraction of
       unique record pairs (which can be removed before the LAP is applied)
       and the extraction of sub-sets which can be solved independently.
       These sub-sets are then given to the chosen LAP routine.

       The routine takes as input a results dictionary, as produced by a
       classifier (see classification.py), and returns a dictionary with the
       assigned record pair numbers as keys (the values are simply set to
       True).

       Possible methods are:
         'auction'

       The process_type attribute must either be set to 'deduplication' or
       to 'linkage' in order to be able to pre-process the classifier data
       prior to the LAP procedure.
    """

    if (lap_method not in ['auction']):
        print 'error:Illegal method for lap method: %s' % (str(lap_method))
        raise Exception

    if (process_type not in ['deduplication', 'linkage']):
        print 'error:Illegal value for attribute "process_type": %s' % \
              (str(process_type))
        raise Exception

    if (results_dict == {}):
        print 'error:Empty results dictionary'
        raise Exception

    lap_start_time = time.time()  # Start timer

    lap_results = {}  # Result dictionary with final record pair matches

    # Make one (or two) dictionaries of all assigned record numbers
    #
    if (process_type == 'deduplication'):
        used_rec_nums = {}
    else:
        used_rec_nums_a = {}
        used_rec_nums_b = {}

    # Make sure the threshold is a number if it is defined
    #
    if (threshold != None):
        if (not (isinstance(threshold, int) or isinstance(threshold, float))):
            print 'error:Threshold is not a number: %s' % (str(threshold))
            raise Exception

    print '1:Start linear assignment procedure using method: %s' % (lap_method)
    print '1:  Original length of results dictionary: %i' % (len(results_dict))

    # Step 1: Filter out record pairs with weight lower than the threshold - -
    #
    if (threshold != None):
        print '1:  Remove record pairs with weight less than: %f' % (threshold)
    else:
        threshold = -999999999999.999  # Make it a very, very small number

    work_dict = {}  # Make an empty working dictionary

    for row_num in results_dict:  # Loop over all record numbers (keys)
        row_dict = results_dict[row_num]  # Get corresponding record dictionary
        new_row_dict = {}  # Start a new record dictionary

        for col_num in row_dict:  # Loop over all records in this dictionary
            weight = row_dict[col_num]
            if (weight >= threshold):
                new_row_dict[col_num] = weight  # Copy to new dictionary

        if (new_row_dict != {}):  # Only insert non-empty dictionaries
            work_dict[row_num] = new_row_dict

    results_len = len(work_dict)  # Save results length (after filtering)

    if (threshold > -999999999999.999):
        print '1:  Length of working dictionary after filtering: %i' % \
              (results_len)

    # Step 2: Remove all matches (record pairs) which are unique - - - - - - -
    # (i.e. which don't have matches with other records)
    #
    row_num_dict = {}  # Count occurrences of record numbers in rows
    col_num_dict = {}  # Count occurrences of record numbers in columns

    for row_num in work_dict:  # First count occurrences of rows and columns

        # Insert a count for the row number
        #
        row_num_dict[row_num] = row_num_dict.get(row_num, 0) + 1

        row_dict = work_dict[row_num]

        for col_num in row_dict:

            # Increase a count for a column number
            #
            col_num_dict[col_num] = col_num_dict.get(col_num, 0) + 1

            if (process_type == 'deduplication'):

                # For deduplication, insert symmetric record numbers as well
                #
                row_num_dict[col_num] = row_num_dict.get(col_num, 0) + 1
                col_num_dict[row_num] = col_num_dict.get(row_num, 0) + 1

    for row_num in work_dict.keys():  # Secondly remove unique rows and columns
        row_dict = work_dict[row_num]  # Get corresponding record dictionary

        if (len(row_dict) == 1):  # Only one record pair for this record
            col_num, weight = row_dict.items()[0]  # The only element in row

            if (row_num_dict[row_num] == 1) and (col_num_dict[col_num] == 1):

                # ################ START TEST CODE ###########################
                if (DO_TESTS == True):
                    if (process_type == 'deduplication'):
                        if (row_num in used_rec_nums):
                            print 'warning:Record number %i already used ' % \
                                  (row_num) + 'for deduplication'
                        if (col_num in used_rec_nums):
                            print 'warning:Record number %i already used ' % \
                                  (col_num) + 'for deduplication'
                    else:
                        if (row_num in used_rec_nums_a):
                            print 'warning:Record number A %i already ' % \
                                  (row_num) + 'used for linkage'
                        if (col_num in used_rec_nums_b):
                            print 'warning:Record number B %i already ' % \
                                  (col_num) + 'used for linkage'
                # ################ END TEST CODE #############################

                lap_results[(row_num, col_num)] = True  # Insert into results
                del work_dict[row_num]  # And delete the record in the results

                if (process_type == 'deduplication'):
                    used_rec_nums[row_num] = True
                    used_rec_nums[col_num] = True
                else:
                    used_rec_nums_a[row_num] = True
                    used_rec_nums_b[col_num] = True

    print '1:  Found and extracted %i unique record ' % (len(lap_results)) + \
          'pairs in results dictionary'
    for rec_pair in lap_results:
        print '3:    %s' % (str(rec_pair))
    print '3:'

    lap_pair_extract_time = time.time() - lap_start_time

    # ################## START PARALLEL TEST CODE ############################
    if (SAVE_PARALLEL_TEST_FILES == True):
        tmp_list = lap_results.items()
        tmp_list.sort()
        f = open('one2one-unique-dedup-' + str(parallel.rank()) + '-' + \
                 str(parallel.size()), 'w')
        for c in tmp_list:
            f.write(str(c) + os.linesep)
        f.close()

        tmp_list = work_dict.keys()
        tmp_list.sort()
        f = open('work-dict-' + str(parallel.rank()) + '-' + \
                 str(parallel.size()), 'w')
        for c in tmp_list:
            cc = work_dict[c].items()
            cc.sort()
            f.write(str(c) + ':: ' + str(cc) + os.linesep)
        f.close()
    # ################## END PARALLEL TEST CODE ##############################

    # ################## START TEST CODE #####################################
    # Test if a record only appears once in the LAP results dictionary
    #
    if (DO_TESTS == True):
        if (process_type == 'deduplication'):
            test_dict = {}
            for (rec_a, rec_b) in lap_results:
                if (rec_a in test_dict):
                    print 'warning:Record %s is already in the test ' % \
                          (str(rec_a)) + 'dictionary'
                else:
                    test_dict[rec_a] = 1
                if (rec_b in test_dict):
                    print 'warning:Record %s is already in the test ' % \
                          (str(rec_b)) + 'dictionary'
                else:
                    test_dict[rec_b] = 1

        else:  # Linkage process
            test_dict_a = {}
            test_dict_b = {}
            for (rec_a, rec_b) in lap_results:
                if (rec_a in test_dict_a):
                    print 'warning:Record %s is already in test ' % \
                          (str(rec_a)) + 'dictionary A'
                else:
                    test_dict_a[rec_a] = 1
                if (rec_b in test_dict_b):
                    print 'warning:Record %s is already in test ' % \
                          (str(rec_b)) + 'dictionary B'
                else:
                    test_dict_b[rec_b] = 1
    # ################## END TEST CODE #######################################

    if (len(work_dict) == 0):  # All record pairs are processed - - - - - - - -
        return lap_results

    print '1:  Remaining number of records in working dictionary: %i' % \
          (len(work_dict)) + ' (down from: %i)' % (results_len)

    # Step 3: Find connected sub-sets in the results dictionary - - - - - - - -
    # (using a breadth-first search over a queue of connected rows)
    #
    visited = {}   # Dictionary which will contain all so far visited rows
    sub_sets = {}  # Dictionary which will contain the sub-sets extracted

    print '1:  Find connected sub-graphs in results dictionary'

    lap_subset_start_time = time.time()

    max_sub_set_length = -1
    num_visited = 0  # Number of rows visited so far
    row_num_done = 0
    work_dict_len = len(work_dict)
    work_dict_rows = work_dict.keys()
    work_dict_rows.sort()

    # Create a column oriented work dictionary - - - - - - - - - - - - - - - -
    #
    col_work_dict = {}

    for row_num in work_dict_rows:  # Loop over all rows
        row_dict = work_dict[row_num]
        for col_num in row_dict:
            col_dict = col_work_dict.get(col_num, {})
            col_dict[row_num] = True  # Only the position is needed, not the
            col_work_dict[col_num] = col_dict  # weight

    for row_num in work_dict_rows:  # Loop over all rows

        if (row_num not in visited):  # This row has not been visited yet

            visited[row_num] = row_num  # Mark visited as 'seeding' row
            num_visited += 1

            print '2:    Create sub-set with seeding record %i' % (row_num)

            process_queue = [row_num]   # Start a new queue of rows to process
            row_sub_set = {row_num: 1}  # Row numbers connected to this row

            while (process_queue != []):  # Process all connected rows

                print '3:      Process queue: %s' % (str(process_queue))

                next_row = process_queue.pop(0)  # Get and remove first row
                row_col_numbers = work_dict[next_row].keys()  # Columns in row

                # For deduplication, also insert the row number into these
                # column numbers
                #
                if (process_type == 'deduplication'):
                    row_col_numbers.append(next_row)

                print '3:      Row %i with column numbers: %s' % \
                      (next_row, str(row_col_numbers))

                # Get the row numbers from all column numbers
                #
                for col_num in row_col_numbers:

                    # Get list of all row numbers in this column
                    #
                    row_num_dict = col_work_dict.get(col_num, {})
                    row_num_list = row_num_dict.keys()

                    if (process_type == 'deduplication') and \
                       (col_num in work_dict) and \
                       (col_num not in row_num_list):
                        row_num_list.append(col_num)

                    print '3:        Column: %i with row numbers: %s' % \
                          (col_num, str(row_num_list))

                    for row_num2 in row_num_list:
                        row_sub_set[row_num2] = 1

                        if (row_num2 not in visited):  # It is a new row
                            process_queue.append(row_num2)
                            print '3:        Appended row number %i to ' % \
                                  (row_num2) + 'process queue'
                            visited[row_num2] = row_num  # Visited by seed row
                            num_visited += 1
                            print '3:        Row %i connected to row %i' % \
                                  (row_num2, row_num)

            sub_sets[row_num] = row_sub_set.keys()  # Only store keys

            if (len(row_sub_set) > max_sub_set_length):
                max_sub_set_length = len(row_sub_set)

            print '3:    Sub-set contains records: %s' % \
                  (str(row_sub_set.keys()))

        row_num_done += 1

        # Now determine timing and print progress report (every 10%) - - - - -
        # (only if more than 100 records in the work dictionary)
        #
        if (work_dict_len >= 100) and \
           (row_num_done % int(work_dict_len / 10) == 0):
            used_time = time.time() - lap_subset_start_time
            perc_done = 100.0 * row_num_done / work_dict_len
            todo_time = (work_dict_len - num_visited) * \
                        (used_time / row_num_done)
            used_time_string = output.time_string(used_time)
            todo_time_string = output.time_string(todo_time)
            print '1:    Processed %.1f%% of records in %s (%i/%i records ' % \
                  (perc_done, used_time_string, num_visited, work_dict_len) + \
                  'visited)'
            print '1:      Estimated %s until finished' % (todo_time_string)

    del col_work_dict  # Delete the column oriented work dictionary

    num_sub_sets = len(sub_sets)  # Get the total number of sub-sets

    lap_subset_total_time = time.time() - lap_subset_start_time
    lap_subset_total_time_string = output.time_string(lap_subset_total_time)

    print '1:  Extracted %i sub-sets in %s' % \
          (num_sub_sets, lap_subset_total_time_string)
    print '1:    Longest sub-set contains %i rows' % (max_sub_set_length)

    # ################## START TEST CODE #####################################
    # Test if all the sub-sets are mutually exclusive, and if the seed rows
    # are in the sub-set record lists
    #
    if (DO_TESTS == True):
        for seed_row in sub_sets:
            row_list = sub_sets[seed_row]
            if (seed_row not in row_list):
                print 'warning:Seed row %s not in sub-set row list: %s' % \
                      (str(seed_row), str(row_list))
            for rec_num in row_list:
                for seed_row2 in sub_sets:
                    row_list2 = sub_sets[seed_row2]
                    if (seed_row != seed_row2):  # Don't test itself
                        if (rec_num in row_list2):
                            print 'warning:Record %s in more than one ' % \
                                  (str(rec_num)) + 'sub-set: %s, %s' % \
                                  (str(row_list), str(row_list2))
    # ################## END TEST CODE #######################################

    # ################## START PARALLEL TEST CODE ############################
    if (SAVE_PARALLEL_TEST_FILES == True):
        tmp_list = sub_sets.keys()
        tmp_list.sort()
        f = open('sub-sets-' + str(parallel.rank()) + '-' + \
                 str(parallel.size()), 'w')
        for s in tmp_list:
            tmp_sub_set = sub_sets[s]
            tmp_sub_set.sort()
            f.write(str(s) + '::' + str(tmp_sub_set) + os.linesep)
        f.close()
    # ################## END PARALLEL TEST CODE ##############################

    # Now loop over all sub-sets - - - - - - - - - - - - - - - - - - - - - - -
    # (pre-process them first before giving them to the actual linear
    # assignment method)
    #
    lap_lap_start_time = time.time()
    lap_comm_time = 0.0

    sub_set_cnt = 0  # A round robin counter, used for parallelism

    sub_set_rows = sub_sets.keys()
    sub_set_rows.sort()  # Needed to make the same on all processes

    for seed_row in sub_set_rows:

        # Distribute sub-sets equally to all processors
        #
        if ((sub_set_cnt % parallel.size()) == parallel.rank()):

            row_list = sub_sets[seed_row]
            row_list.sort()

            print '1:'
            print '1:  Sub-set %i of %i with seed row %i contains %i rows' % \
                  (sub_set_cnt, num_sub_sets, seed_row, len(row_list))
            print '3:    Sub-set rows: %s' % (str(row_list))

            if (len(row_list) == 1):  # Special case: One row only - - - - - -

                max_weight = -99999.9
                max_col = -1

                row_dict = work_dict[row_list[0]]  # The dictionary of this row

                # Find element with largest weight
                #
                for col_num in row_dict:
                    weight = row_dict[col_num]
                    if (weight > max_weight):
                        max_weight = weight
                        max_col = col_num

                # Assignment dictionary is of form col_num:row_num
                #
                tmp_assign_dict = {max_col: row_list[0]}

                print '2:    Special case sub-set with one row only, ' + \
                      'assignment pair: (%i,%i)' % (row_list[0], max_col)

            else:  # General case with more than one row - - - - - - - - - - -

                # Get minimal and maximal weights, and lists with row and
                # column numbers
                #
                min_weight = 999999.9
                max_weight = -999999.9
                col_numbers = {}
                row_col_numbers = {}

                for row_num in row_list:  # Loop over rows in this sub-set
                    row_dict = work_dict[row_num]  # The dictionary of this row
                    row_col_numbers[row_num] = 1

                    for col_num in row_dict:
                        weight = row_dict[col_num]
                        col_numbers[col_num] = 1
                        row_col_numbers[col_num] = 1
                        if (weight < min_weight):
                            min_weight = weight
                        if (weight > max_weight):
                            max_weight = weight

                print '3:    Minimal and maximal weight: %.3f / %.3f' % \
                      (min_weight, max_weight)

                row_numbers = work_dict.keys()
                col_numbers = col_numbers.keys()
                row_numbers.sort()
                col_numbers.sort()
                num_rows = len(row_numbers)
                num_cols = len(col_numbers)

                row_col_numbers = row_col_numbers.keys()
                row_col_numbers.sort()

                # Deal with the special case that there is only one column - -
                # number
                #
                if (num_cols == 1):

                    max_weight = -99999.9
                    max_row = -1

                    col_num = col_numbers[0]  # Get the column number

                    # Find element with largest weight
                    #
                    for row_num in row_list:  # Loop over rows

                        # Get the only weight in this row
                        #
                        row_weight = work_dict[row_num].values()[0]
                        if (row_weight > max_weight):
                            max_weight = row_weight
                            max_row = row_num

                    # Assignment dictionary is of form col_num:row_num
                    #
                    tmp_assign_dict = {col_num: max_row}

                    print '2:    Special case sub-set with one column only, ' \
                          + 'assignment pair: (%i,%i)' % (max_row, col_num)

                else:  # General case with more than one row and column - - - -

                    # Construct the cost dictionary - - - - - - - - - - - - - -
                    #
                    cost_dict = {}
                    dim = len(row_col_numbers)  # Final dimension of the LAP

                    min_cost = -max_weight * (dim + 1)  # Use original weights

                    for row_num in row_list:  # Loop over rows
                        row_dict = work_dict[row_num]

                        # Get the column numbers in this row
                        #
                        col_list = row_dict.keys()
                        col_list.sort()

                        row_cost_dict = cost_dict.get(row_num, {})

                        for col_num in col_list:
                            weight = row_dict[col_num]
                            cost = weight * (dim + 1)  # Use original weights
                            row_cost_dict[col_num] = cost  # Store into row dict

                            # Insert symmetric element as well (if not on the
                            # diagonal)
                            #
                            if (row_num != col_num):
                                row_cost_dict2 = cost_dict.get(col_num, {})

                                if (row_num not in row_cost_dict2):
                                    if (process_type == 'deduplication'):
                                        row_cost_dict2[row_num] = cost
                                    else:  # Linkage process
                                        row_cost_dict2[row_num] = min_cost

                                # And insert a diagonal element if there is
                                # none
                                #
                                if (col_num not in row_cost_dict2):
                                    row_cost_dict2[col_num] = min_cost

                                cost_dict[col_num] = row_cost_dict2

                        # Make sure there is a diagonal element (for
                        # feasibility)
                        #
                        if (row_num not in row_cost_dict):
                            row_cost_dict[row_num] = min_cost

                        # If more than MAX_ROW_ELEMENTS elements in a row only
                        # take the largest (following an idea by William
                        # Winkler)
                        #
                        if (len(row_cost_dict) > MAX_ROW_ELEMENTS):
                            row_col_nums = row_cost_dict.keys()
                            row_weights = row_cost_dict.values()
                            row_elem_list = map(None, row_weights,
                                                row_col_nums)
                            row_elem_list.sort()

                            # Keep the diagonal element
                            #
                            diag_weight = row_cost_dict[row_num]
                            row_cost_dict = {row_num: diag_weight}

                            for (weight, col_num) in \
                                row_elem_list[-MAX_ROW_ELEMENTS:]:
                                row_cost_dict[col_num] = weight

                        # Insert row into cost dictionary
                        #
                        cost_dict[row_num] = row_cost_dict

                    # Get the final row and column numbers - - - - - - - - - -
                    #
                    row_numbers = cost_dict.keys()
                    col_numbers = {}
                    for row_dict in cost_dict.values():
                        col_numbers.update(row_dict)
                    col_numbers = col_numbers.keys()
                    row_numbers.sort()
                    col_numbers.sort()

                    # Check if the numbers of rows and columns are equal - - -
                    #
                    if (len(row_numbers) != len(col_numbers)):
                        print 'error:Different number of rows (%i) and ' % \
                              (len(row_numbers)) + 'columns (%i)' % \
                              (len(col_numbers))
                        raise Exception

                    print '1:    Cost dictionary with %i rows/columns ' % \
                          (len(row_numbers)) + 'given to assignment ' + \
                          'method %s:' % (lap_method)
                    print '2:      Row numbers:    %s' % (str(row_numbers))
                    print '2:      Column numbers: %s' % (str(col_numbers))
                    print '2:      Minimal weight: %.3f' % (min_weight)
                    print '2:      Maximal weight: %.3f' % (max_weight)
                    print '3:      Cost dictionary: %s' % (str(cost_dict))
                    print '3:      Process type: %s' % (process_type)

                    # ############ START PARALLEL TEST CODE ##################
                    if (SAVE_PARALLEL_TEST_FILES == True):
                        tmp_list = cost_dict.keys()
                        tmp_list.sort()
                        tmp_str = str(sub_set_cnt) + ':: ' + \
                                  str(min_weight) + ' / ' + \
                                  str(max_weight) + ', ' + \
                                  str(row_numbers) + ', ' + \
                                  process_type + '::'
                        for k in tmp_list:
                            tmp_list2 = cost_dict[k].items()
                            tmp_list2.sort()
                            tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / '
                        f = open('lap-calling-' + str(parallel.rank()) + \
                                 '-' + str(parallel.size()), 'a')
                        f.write(tmp_str + os.linesep)
                        f.close()
                    # ############ END PARALLEL TEST CODE ####################

                    # Call the LAP method which returns an assignment - - - - -
                    # dictionary
                    #
                    if (lap_method == 'auction'):
                        tmp_assign_dict = auction(cost_dict, min_weight,
                                                  max_weight, row_numbers,
                                                  col_numbers)
                    else:
                        print 'error:LAP method %s not implemented' % \
                              (lap_method)
                        raise Exception

            # If run in parallel, send the temporary assignment dictionary - -
            # to process 0
            #
            if (parallel.rank() > 0):
                tmp_time = time.time()
                parallel.send(tmp_assign_dict, 0)
                lap_comm_time += (time.time() - tmp_time)
                print '1:    Sent assignment dictionary with %i entries ' % \
                      (len(tmp_assign_dict)) + 'to process 0'

        # Only process 0 inserts the temporary assignment dictionary into - - -
        # the results
        #
        if (parallel.rank() == 0):

            # Receive assignment dictionary from another process if necessary
            #
            p = (sub_set_cnt % parallel.size())  # Process to receive from
            if (p != 0):
                tmp_time = time.time()
                tmp_assign_dict = parallel.receive(p)
                lap_comm_time += (time.time() - tmp_time)
                print '1:    Received sub-set %i of %i assignment ' % \
                      (sub_set_cnt, num_sub_sets) + 'dictionary with %i ' % \
                      (len(tmp_assign_dict)) + 'entries from process %i' % (p)

            # Post-process the assignment dictionary - - - - - - - - - - - - -
            #
            assign_pairs = {}

            for rec_num_b in tmp_assign_dict:
                rec_num_a = tmp_assign_dict[rec_num_b]

                # Now check if this record pair is in the original results
                # dictionary
                #
                if (rec_num_a in results_dict):
                    row_dict = results_dict[rec_num_a]
                    if (rec_num_b in row_dict):
                        weight = row_dict[rec_num_b]

                        # Insert into dictionary of potential record pairs
                        #
                        assign_pairs[(rec_num_a, rec_num_b)] = weight

            # ############## START PARALLEL TEST CODE ########################
            if (SAVE_PARALLEL_TEST_FILES == True):
                tmp_list = tmp_assign_dict.items()
                tmp_list.sort()
                tmp_list2 = assign_pairs.items()
                tmp_list2.sort()
                tmp_list3 = sub_sets[seed_row]
                tmp_list3.sort()
                f = open('assignments-' + str(parallel.rank()) + '-' + \
                         str(parallel.size()), 'a')
                f.write(str(sub_set_cnt) + ', ' + str(seed_row) + os.linesep)
                f.write(str(tmp_list) + os.linesep)
                f.write(str(tmp_list2) + os.linesep)
                f.write(str(tmp_list3) + os.linesep)
                f.write(os.linesep)
                f.close()
            # ############## END PARALLEL TEST CODE ##########################

            # Sort the assigned pairs according to their weight
            #
            assign_weights = assign_pairs.values()  # The weights in a list
            assign_rec_pairs = assign_pairs.keys()  # And the record pairs
            assign_pair_list = map(None, assign_weights, assign_rec_pairs)
            assign_pair_list.sort()

            num_assigned_pairs = 0  # Number of assigned pairs for this sub-set
            dedup_check_rec_nums = {}  # Already assigned record numbers

            while (assign_pair_list != []):  # Now check all record pairs

                check_pair = assign_pair_list.pop()  # Largest weight pair
                weight = check_pair[0]
                rec_num_a = check_pair[1][0]
                rec_num_b = check_pair[1][1]
                rec_pair = (rec_num_a, rec_num_b)

                # Now check if a record pair has already been used in an
                # assignment, and for a deduplication process also check if
                # any of the two records has been used in an assignment
                #
                if ((process_type == 'linkage') and
                    (rec_num_a not in used_rec_nums_a) and
                    (rec_num_b not in used_rec_nums_b)) or \
                   ((process_type == 'deduplication') and
                    (rec_num_a not in used_rec_nums) and
                    (rec_num_b not in used_rec_nums)):

                    # For deduplication insert record numbers into the used
                    # record numbers
                    #
                    if (process_type == 'deduplication'):
                        used_rec_nums[rec_num_a] = True
                        used_rec_nums[rec_num_b] = True
                    else:
                        used_rec_nums_a[rec_num_a] = True
                        used_rec_nums_b[rec_num_b] = True

                    if (rec_pair not in lap_results):
                        lap_results[rec_pair] = True
                        num_assigned_pairs += 1
                    else:
                        print 'warning:Record pair (%i,%i) already in LAP ' % \
                              (rec_num_a, rec_num_b) + 'results'

            print '2:    Inserted %i (out of %i) record pairs into LAP ' % \
                  (num_assigned_pairs, len(tmp_assign_dict)) + 'results'

        sub_set_cnt += 1

        # Report progress every 10% (only if more than 100 sub-sets) - - - - -
        #
        if (num_sub_sets >= 100) and \
           (sub_set_cnt % int(num_sub_sets / 10) == 0):
            used_time = time.time() - lap_lap_start_time
            perc_done = 100.0 * sub_set_cnt / num_sub_sets
            sub_set_time = used_time / sub_set_cnt
            todo_time = (num_sub_sets - sub_set_cnt) * sub_set_time
            used_time_string = output.time_string(used_time)
            todo_time_string = output.time_string(todo_time)
            sub_set_time_string = output.time_string(sub_set_time)
            print '1:  Processed %.1f%% (%i/%i) of sub-sets in %s' % \
                  (perc_done, sub_set_cnt, num_sub_sets, used_time_string) + \
                  ' (%s per sub-set)' % (sub_set_time_string)
            print '1:    Estimated %s until finished' % (todo_time_string)

    print '1:  Total number of assignments: %i' % (len(lap_results))
    print '1:  Number of rows in original results dictionary: %i' % \
          (len(results_dict))

    # ################## START TEST CODE #####################################
    # Test if a record only appears once in the LAP results dictionary
    #
    if (DO_TESTS == True) and (parallel.rank() == 0):
        if (process_type == 'deduplication'):
            test_dict = {}
            for (rec_a, rec_b) in lap_results:
                if (rec_a in test_dict):
                    print 'warning:Record %i is already in the test ' % \
                          (rec_a) + 'dictionary, rec_pair: (%i,%i)' % \
                          (rec_a, rec_b)
                else:
                    test_dict[rec_a] = True
                if (rec_b in test_dict):
                    print 'warning:Record %i is already in the test ' % \
                          (rec_b) + 'dictionary, rec_pair: (%i,%i)' % \
                          (rec_a, rec_b)
                else:
                    test_dict[rec_b] = True

        else:  # Linkage process
            test_dict_a = {}
            test_dict_b = {}
            for (rec_a, rec_b) in lap_results:
                if (rec_a in test_dict_a):
                    print 'warning:Record %s is already in test ' % \
                          (str(rec_a)) + 'dictionary A'
                else:
                    test_dict_a[rec_a] = 1
                if (rec_b in test_dict_b):
                    print 'warning:Record %s is already in test ' % \
                          (str(rec_b)) + 'dictionary B'
                else:
                    test_dict_b[rec_b] = 1
    # ################## END TEST CODE #######################################

    lap_stop_time = time.time()
    lap_lap_time = lap_stop_time - lap_lap_start_time
    lap_total_time = lap_stop_time - lap_start_time

    lap_pair_extract_time_string = output.time_string(lap_pair_extract_time)
    lap_subset_total_time_string = output.time_string(lap_subset_total_time)
    lap_lap_time_string = output.time_string(lap_lap_time)
    if (parallel.size() > 1):
        lap_comm_time_string = output.time_string(lap_comm_time)
    lap_total_time_string = output.time_string(lap_total_time)

    print '1:'
    print '1:Finished linear record pair assignment procedure'
    print '1:  Time for extracting unique record pairs: %s' % \
          (lap_pair_extract_time_string)
    print '1:  Time for creating record sub-sets:       %s' % \
          (lap_subset_total_time_string)
    print '1:  Time for linear assignment algorithm:    %s' % \
          (lap_lap_time_string)
    if (parallel.size() > 1):
        print '1:  Time for communication:               %s' % \
              (lap_comm_time_string)
    print '1:  Total time for linear assignment:        %s' % \
          (lap_total_time_string)
    print '1:'

    return lap_results
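# Minimal usage sketch for do_lap() (illustration only; the record numbers
# and weights below are made up, and the module globals referenced above --
# DO_TESTS, SAVE_PARALLEL_TEST_FILES, MAX_ROW_ELEMENTS, parallel, output,
# time, os -- are assumed to be defined as in the full module):
#
#     # Classifier output: row record number -> {column record number: weight}
#     results = {1: {10: 0.9, 11: 0.4},
#                2: {10: 0.5, 11: 0.8},
#                3: {12: 0.7}}
#     matches = do_lap('auction', results, 'linkage', 0.3)
#
# The pair (3, 12) is unique and extracted in Step 2; the auction algorithm
# then resolves the remaining 2x2 sub-set, so the expected keys of 'matches'
# are (1, 10), (2, 11) and (3, 12), each with value True.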
def auction(cost_dict, min_cost, max_cost, row_numbers, col_numbers): """Linear sum assignment procedure based on symetric auction algorithm. Re-implementation of a FORTRAN code, taken from: http://web.mit.edu/afs/athena.mit.edu/user/d/i/dimitrib/www/auction.txt and as described in: "Auction Algorithms for Network Flow Problems: A Tutorial Introduction" Dimitri P. Bertsekas, Computational Optimization and Applications, Vol. 1, pp. 7-66, 1992. and other papers by Dimitri P. Bertsekas, see: http://www.mit.edu:8001//people/dimitrib/publ.html Takes as input a cost dictionary, the minimum and maximum weights in this dictionary, and two sorted lists with the row and column numbers. """ #################### START PARALLEL TEST CODE ############################### if (SAVE_PARALLEL_TEST_FILES == True): tmp_list = cost_dict.keys() tmp_list.sort() tmp_str = str(min_cost) + ' /' + str(max_cost)+', '+ str(row_numbers) + \ ' / ' + str(col_numbers) + ':: ' for k in tmp_list: tmp_list2 = cost_dict[k].items() tmp_list2.sort() tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / ' f = open('lap-auction-'+str(parallel.rank())+'-'+str(parallel.size()),'a') f.write(tmp_str+os.linesep) f.close() #################### END PARALLEL TEST CODE ############################### i_large = 100000000 # A value larger than max_cost auction_results = {} # Result dictionary with final record pair matches num_rows = len(row_numbers) num_cols = len(col_numbers) if (num_rows != num_cols): print 'error:Asymmetric problem given to symmetric "auction" algorithm' raise Exception # Set parameters for auction algorithm - - - - - - - - - - - - - - - - - - - # if (max_cost > int(i_large / (num_rows + 1))): print 'error:Cost range too large to work with integer epsilon' raise Exception max_cost *= (num_rows + 1) beg_eps = max_cost / 5 # Maybe smaller, can even be 1, but not smaller end_eps = num_rows / 10 # Must be smaller than beg_eps, can be 1 if (end_eps < 1): end_eps = 1 elif (end_eps > beg_eps): end_eps = beg_eps factor = 5 # Must be greater than 1 start_incr = beg_eps / 10 # Maybe even be 1, but not smaller if (start_incr < 1): start_incr = 1 # Initialisation # eps = beg_eps i_small = -i_large large_incr = int(i_large / 10) thresh = min(int(num_rows / 5), 100) # Maximal value 100 incr_factor = 2 cycles = 1 average = num_cols num_phases = 1 # Initialise dictionaries for prices and row assignments # pcol = {} assigned = {} for col_num in col_numbers: pcol[col_num] = i_small # assigned[col_num] = -1####### # Initialise list of un-assigned rows (all rows at the beginning) # list = row_numbers[:] no_list = num_rows do_phase = True # Set flag so a first phase is performed # Start sub problem (scaling phase with new epsilon) - - - - - - - - - - - - # while (do_phase == True): if (eps == 1): thresh = 0 incr = max(start_incr, eps) # Increment must not be larger than epsilon print '2: Start of a scaling phase with epsilon: %f' % (eps) do_cycle = True # Set flag so a first cycle is performed cycle_count = 0 while (do_cycle == True): # Start forward auction cycle - - - - - - - - - - - - - - - - - - - - - - # no_new_list = 0 # Initialise count of next list of un-assigned rows # Cycle through the current list of un-assigned rows # for i in range(no_list): row_num = list[i] row_dict = cost_dict[row_num] row_list = row_dict.items() row_list.sort() ################ Maybe not needed ??? 
####### row_len = len(row_dict) # Get first and second column number and cost # col_num, cost = row_list[0] col_num2, cost2 = row_list[1] max1 = cost - pcol[col_num] max2 = cost2 - pcol[col_num2] if (max1 > max2): best_col_num = col_num elif (max1 < max2): # Swap maximum values max1, max2 = max2, max1 best_col_num = col_num2 else: # Both are the same if (col_num < col_num2): # Make sure best column number is smallest best_col_num = col_num else: best_col_num = col_num2 if (row_len > 2): # Row has more than two elements for c in range(2, row_len): col_num3, cost3 = row_list[c] # Loop through cols max_tmp = cost3 - pcol[col_num3] if (max_tmp > max2): if (max_tmp > max1): best_col_num = col_num3 # New best column max2 = max1 max1 = max_tmp elif (max_tmp == max1) and (col_num3 < best_col_num): best_col_num = col_num3 # Best column number is smallest else: max2 = max_tmp # Row bids for best column increasing its price, and gets - - - - - - # assigned to best column, while any row assigned to best # column gets un-asssigned # pcol[best_col_num] = pcol[best_col_num] + max1 - max2 + incr old_row = assigned.get(best_col_num, -1) ######### assigned[best_col_num] = row_num if (old_row >= 0): # Row has been assigned list[no_new_list] = old_row # Save un-assigned row into list no_new_list += 1 cycle_count += 1 if ((cycle_count % 10000) == 0): print '1: Finished %i cycles, %i out of %i rows un-assigned' % \ (cycle_count, no_new_list, num_rows) if (no_new_list > 0): print '1: Un-assigned rows: %s' % (str(list[:no_new_list])) # Collect statistics # average = (cycles * average + no_list) / (cycles+1) cycles += 1 # Check if there are still 'many' unassignedrows, i.e. if the - - - - - - # number of unassignd rows is greater than the parameter 'thresh'. # If not, replace current list with the new list and go for another # cycle. Otherwise, if epsilon > 1, reduce epsilon, reset the # asignment to empty and restart auction. # If epsilon == 1 terminate. # Also increase the minimal bidding increment up to a maximun value # of epsilon (this is the adaptive feature) # incr *= incr_factor if (incr > eps): incr = eps if (no_new_list > thresh): no_list = no_new_list else: do_cycle = False # Set flag so cycle is left # End of sub-problem (scaling phase) - - - - - - - - - - - - - - - - - - - # if (eps == 1): do_phase = False # Set flag so phase is left else: num_phases += 1 eps = int(eps / factor) if (eps > incr): eps = int(eps / factor) if (eps < 1) or (eps < end_eps): eps = 1 thresh = int(thresh / factor) print '1: End of a scaling phase, new epsilon: %3f' % (eps) t_min = min(pcol.values()+[i_large]) col_num_list = assigned.keys() ########## col_num_list.sort() ########## for col_num in assigned: ########### # for col_num in col_num_list: row_num = assigned[col_num] if (row_num >= 0): list[no_new_list] = row_num no_new_list += 1 assigned[col_num] = -1 incr = t_min - i_small # Reset minimum price to i_small for col_num in col_numbers: # Update all prices pcol[col_num] = pcol[col_num] - incr # Final parameter updates before starting another scaling phase # no_list = no_new_list if (start_incr < eps): start_incr *= factor return assigned
def do_lap(lap_method, results_dict, process_type, threshold): """Linear sum assignment procedure. This routine calculates a linear assignments for one-to-one matching. The routine do_lap does all kinds of preprocessing, including the extraction of unique record pairs (which can be removed before the lap is applied) and the extraction of sub-set which can be solved independently. These sub-sets are then given to the chosen lap routine. The routine takes as input a results dictionary, as produced by a classifier (see classification.py), and returns a dictionary with the assigned record pair numbers as keys and the corresponding weight as values. Possible methods are 'auction' The process_type attribute must either be set to 'deduplication' or to 'linkage' in order to be able to preprocess the classifier data prior to the lap procedure. """ if (lap_method not in ['auction']): print 'error:Illegal method for lap method: %s' % (str(lap_method)) raise Exception if (process_type not in ['deduplication', 'linkage']): print 'error:Illegal value for attribute "process_type": %s' %\ (str(process_type)) raise Exception if (results_dict == {}): print 'error:Empty results dictionary' raise Exception lap_start_time = time.time() # Start timer lap_results = {} # Result dictionary with final record pair matches # Make one (or two) disctionary of all assigned rercord numbers # if (process_type == 'deduplication'): used_rec_nums = {} else: used_rec_nums_a = {} used_rec_nums_b = {} # Make sure the threshold is a number if it is defined # if (threshold != None): if (not (isinstance(threshold, int) or isinstance(threshold, float))): print 'error:Threshold is not a number: %s' % (str(threshold)) raise Exception print '1: Start linear assignment procedure using method: %s' % (lap_method) print '1: Original length of results dictionary: %i' % (len(results_dict)) # Step 1: Filter out record pairs with weight lower than the threshold - - - # if (threshold != None): print '1: Remove record pairs with weight less than: %f' % (threshold) else: threshold = -999999999999.999 # Make it a very very small number work_dict = {} # Make an empty working dictionary for row_num in results_dict: # Loop over all record numbers (keys) row_dict = results_dict[row_num] # Get corresponding record dictionary new_row_dict = {} # Start a new record dictionary for col_num in row_dict: # Loop over all records in this dictionary weight = row_dict[col_num] if (weight >= threshold): new_row_dict[col_num] = weight # Copy to new dictionary if (new_row_dict != {}): # Only insert non empty dictionaries work_dict[row_num] = new_row_dict results_len = len(work_dict) # Save results length (after filtering) if (threshold > -999999999999.999): print '1: Length of working dictionary after filtering: %i' % \ (results_len) # Step 2: Remove all matches (record pairs) which are unique - - - - - - - - # (i.e. 
which don't have matches with other records) # row_num_dict = {} # Count occurences of record numbers in rows col_num_dict = {} # Count occurences of record numbers in columns for row_num in work_dict: # First count occurences of rows and columns # Insert a count for the row number # row_num_dict[row_num] = row_num_dict.get(row_num, 0) + 1 row_dict = work_dict[row_num] for col_num in row_dict: # Increase a count for a column number # col_num_dict[col_num] = col_num_dict.get(col_num, 0) + 1 if (process_type == 'deduplication'): # For deduplication, insert symmetric record numbers as well # row_num_dict[col_num] = row_num_dict.get(col_num, 0) + 1 col_num_dict[row_num] = col_num_dict.get(row_num, 0) + 1 for row_num in work_dict.keys(): # Secondly remove unique rows and column row_dict = work_dict[row_num] # Get corresponding record dictionary if (len(row_dict) == 1): # Only one record pair for this record col_num, weight = row_dict.items()[0] # Get the only element in row if (row_num_dict[row_num] == 1) and (col_num_dict[col_num] == 1): #################### START TEST CODE ################################## if (DO_TESTS == True): if (process_type == 'deduplication'): if (row_num in used_rec_nums): print 'warning:Record number %i already used for deduplication' \ % (row_num) if (col_num in used_rec_nums): print 'warning:record number %i already used for deduplication' \ % (col_num) else: if (row_num in used_rec_nums_a): print 'warning:Record number A %i already used for linkage' % \ (row_num) if (col_num in used_rec_nums_b): print 'warning:record number B %i already used for linkage' % \ (col_num) #################### END TEST CODE ################################## lap_results[(row_num,col_num)] = True # Insert into final results del work_dict[row_num] # And delete the record in the results if (process_type == 'deduplication'): used_rec_nums[row_num] = True used_rec_nums[col_num] = True else: used_rec_nums_a[row_num] = True used_rec_nums_b[col_num] = True print '1: Found and extracted %i unique record ' % (len(lap_results)) + \ 'pairs in results dictionary' for rec_pair in lap_results: print '3: %s' % (str(rec_pair)) print '3:' lap_pair_extract_time = time.time() - lap_start_time #################### START PARALLEL TEST CODE ########################### if (SAVE_PARALLEL_TEST_FILES == True): tmp_list = lap_results.items() tmp_list.sort() f = open('one2one-unique-dedup-'+str(parallel.rank())+'-'+ \ str(parallel.size()),'w') for c in tmp_list: f.write(str(c)+os.linesep) f.close() tmp_list = work_dict.keys() tmp_list.sort() f = open('work-dict-'+str(parallel.rank())+'-'+str(parallel.size()),'w') for c in tmp_list: cc = work_dict[c].items() cc.sort() f.write(str(c)+':: '+str(cc)+os.linesep) f.close() #################### END PARALLEL TEST CODE ############################# #################### START TEST CODE ######################################## # Test if a record only appears once in the lap results dictionary # if (DO_TESTS == True): if (process_type == 'deduplication'): test_dict = {} for (rec_a, rec_b) in lap_results: if (test_dict.has_key(rec_a)): print 'warning:Record %s is already in the test dictionary' % \ (str(rec_a)) else: test_dict[rec_a] = 1 if (test_dict.has_key(rec_b)): print 'warning:Record %s is already in the test dictionary' % \ (str(rec_b)) else: test_dict[rec_b] = 1 else: # Linkage process test_dict_a = {} test_dict_b = {} for (rec_a, rec_b) in lap_results: if (test_dict_a.has_key(rec_a)): print 'warning:Record %s is already in test dictionary A' % \ (str(rec_a)) else: 
test_dict_a[rec_a] = 1 if (test_dict_b.has_key(rec_b)): print 'warning:Record %s is already in test dictionary B' % \ (str(rec_b)) else: test_dict_b[rec_b] = 1 #################### END TEST CODE ########################################## if (len(work_dict) == 0): # All record pairs are processed - - - - - - - - - return lap_results print '1: Remaining number of records in working dictionary: %i' % \ (len(work_dict)) + ' (down from: %i)' % (results_len) # Step 3: Find connected sub-sets in the results dictionary - - - - - - - - - # (using depth-first search) # visited = {} # Dictionary which will contain all so far visited rows sub_sets = {} # Dictionary which will contain the sub-sets extracted print '1: Find connected sub-graphs in results dictionary' lap_subset_start_time = time.time() max_sub_set_length = -1 num_visited = 0 # Number of rows visited so far row_num_done = 0 work_dict_len = len(work_dict) work_dict_rows = work_dict.keys() work_dict_rows.sort() # Create a column oriented work dictionary - - - - - - - - - - - - - - - - - # col_work_dict = {} for row_num in work_dict_rows: # Loop over all rows row_dict = work_dict[row_num] for col_num in row_dict: col_dict = col_work_dict.get(col_num,{}) col_dict[row_num] = True # Only position is needed, but not the weight col_work_dict[col_num] = col_dict for row_num in work_dict_rows: # Loop over all rows if (not visited.has_key(row_num)): # This row has not been visited yet visited[row_num] = row_num # Mark visited as 'seeding' row num_visited += 1 print '2: Create sub-set with seeding record %i' % (row_num) process_queue = [row_num] # Start a new queue of rows to process row_sub_set = {row_num:1} # Row numbers connected to this row while (process_queue != []): # Process rows until all connected rows done print '3: Process queue: %s' % (str(process_queue)) next_row = process_queue.pop(0) # Get and remove first row to process row_col_numbers = work_dict[next_row].keys() # Get columns in this row # For deduplication, also insert row number into this column numbers # if (process_type == 'deduplication'): row_col_numbers.append(next_row) print '3: Row %i with column numbers: %s' % \ (next_row, str(row_col_numbers)) # Get the row numbers from all column numbers # for col_num in row_col_numbers: # Get list of all row numbers in this column # row_num_dict = col_work_dict.get(col_num, {}) row_num_list = row_num_dict.keys() if (process_type == 'deduplication') and (col_num in work_dict) and \ (col_num not in row_num_list): row_num_list.append(col_num) print '3: Column: %i with row numbers: %s' % \ (col_num, str(row_num_list)) for row_num2 in row_num_list: row_sub_set[row_num2] = 1 if (not visited.has_key(row_num2)): # Check if it's a new row process_queue.append(row_num2) print '3: Appended row number %i to process queue' % \ (row_num2) visited[row_num2] = row_num # Mark row as visited by seeding row num_visited += 1 print '3: Row %i connected to row %i' % \ (row_num2, row_num) sub_sets[row_num] = row_sub_set.keys() # Only store keys if (len(row_sub_set) > max_sub_set_length): max_sub_set_length = len(row_sub_set) print '3: Sub-set contains records: %s' % \ (str(row_sub_set.keys())) row_num_done += 1 # Now determine timing and print progress report (every 10%) - - - - - - - # (only if more than 100 records in the work dictionary) # if (work_dict_len >= 100) and (row_num_done % int(work_dict_len/10) == 0): used_time = time.time() - lap_subset_start_time perc_done = 100.0 * row_num_done / work_dict_len # todo_time = (work_dict_len - row_num_done) * 
\ ################# # (used_time / row_num_done) todo_time = (work_dict_len - num_visited) * \ (used_time / row_num_done) used_time_string = output.time_string(used_time) todo_time_string = output.time_string(todo_time) print '1: Processed %.1f%% of records in %s (%i/%i records ' % \ (perc_done, used_time_string, num_visited, work_dict_len) + \ 'visited)' print '1: Estimated %s until finished' % (todo_time_string) del col_work_dict # Delete the column oriented work dictionary num_sub_sets = len(sub_sets) # Get the total number of sub-sets lap_subset_total_time = time.time() - lap_subset_start_time lap_subset_total_time_string = output.time_string(lap_subset_total_time) print '1: Extracted %i sub-sets in %s' % \ (num_sub_sets, lap_subset_total_time_string) print '1: Longest sub-set contains %i rows' % (max_sub_set_length) #################### START TEST CODE ######################################## # Test if all the sub-sets are mutually exclusive, and if the seed rows are # in the sub-set record lists # if (DO_TESTS == True): for seed_row in sub_sets: row_list = sub_sets[seed_row] if (seed_row not in row_list): print 'warning:Seed row %s not in sub-set row list: %s' % \ (str(seed_rec), str(row_list)) for rec_num in row_list: for seed_row2 in sub_sets: row_list2 = sub_sets[seed_row2] if (seed_row != seed_row2): # Don't test itself if (rec_num in row_list2): print 'warning:Record %s in more than one sub-set: %s, %s' % \ (str(rec_num), str(row_list), str(row_list2)) #################### END TEST CODE ########################################## #################### START PARALLEL TEST CODE ########################### if (SAVE_PARALLEL_TEST_FILES == True): tmp_list = sub_sets.keys() tmp_list.sort() f = open('sub-sets-'+str(parallel.rank())+'-'+ \ str(parallel.size()),'w') for s in tmp_list: tmp_sub_set = sub_sets[s] tmp_sub_set.sort() f.write(str(s)+'::'+str(tmp_sub_set)+os.linesep) f.close() #################### END PARALLEL TEST CODE ############################# # Now loop over all sub-sets - - - - - - - - - - - - - - - - - - - - - - - - # (pre-process them first before giving them to the actual linear assignment # method) # lap_lap_start_time = time.time() lap_comm_time = 0.0 sub_set_cnt = 0 # A round robin counter, used for parallelism sub_set_rows = sub_sets.keys() sub_set_rows.sort() # Needed to make the same on all processes for seed_row in sub_set_rows: # Distribute sub-sets equally to all processors # if ((sub_set_cnt % parallel.size()) == parallel.rank()): row_list = sub_sets[seed_row] row_list.sort() print '1:' print '1: Sub-set %i of %i with seed row %i contains %i rows' % \ (sub_set_cnt, num_sub_sets, seed_row, len(row_list)) print '3: Sub-set rows: %s' % (str(row_list)) if (len(row_list) == 1): # Special case: One row only - - - - - - - - - max_weight = -99999.9 max_col = -1 row_dict = work_dict[row_list[0]] # Get the dictionary for this row # Find element with largest weight # for col_num in row_dict: weight = row_dict[col_num] if (weight > max_weight): max_weight = weight max_col = col_num # Assignment dictionary is of form col_num:row_num # tmp_assign_dict = {max_col:row_list[0]} # Make record pair dictionary print '2: Special case sub-set with one row only, ' + \ 'assignment pair: (%i,%i)' % (row_list[0], max_col) else: # General case with more than one row - - - - - - - - - - - - - - # Get minimal and maximal weights, and lists with row and column # numbers # min_weight = 999999.9 max_weight = -999999.9 col_numbers = {} row_col_numbers = {} for row_num in row_list: # Loop 
over rows in this sub-set row_dict = work_dict[row_num] # Get the dictionary for this row row_col_numbers[row_num] = 1 for col_num in row_dict: weight = row_dict[col_num] col_numbers[col_num] = 1 row_col_numbers[col_num] = 1 if (weight < min_weight): min_weight = weight if (weight > max_weight): max_weight = weight print '3: Minimal and maximal weight: %.3f / %.3f' % \ (min_weight, max_weight) row_numbers = work_dict.keys() col_numbers = col_numbers.keys() row_numbers.sort() col_numbers.sort() num_rows = len(row_numbers) num_cols = len(col_numbers) row_col_numbers = row_col_numbers.keys() row_col_numbers.sort() #print '1: Row numbers: %s' % (str(row_numbers)) #print '1: Column numbers: %s' % (str(col_numbers)) #print '1: Row/colum numbers: %s' % (str(row_col_numbers)) #print '1: Number of unique weights: %i' % (len(weight_dict)) # Deal with the special case that there is only one column number - - - # if (num_cols == 1): max_weight = -99999.9 max_row = -1 col_num = col_numbers[0] # Get the column number # Find element with largest weight # for row_num in row_list: # Loop over rows # Get only weight in row # row_weight = work_dict[row_num].values()[0] if (row_weight > max_weight): max_weight = row_weight max_row = row_num # Assignment dictionary is of form col_num:row_num # tmp_assign_dict = {col_num:max_row} # Make record pair dictionary print '2: Special case sub-set with one column only, ' + \ 'assignment pair: (%i,%i)' % (max_row, col_num) else: # General case with more than one row and column - - - - - - - - # Construct the cost dictionary - - - - - - - - - - - - - - - - - - - # cost_dict = {} dim = len(row_col_numbers) # Final dimension of the LAP min_cost = -max_weight * (dim + 1) # Use original wieghts for row_num in row_list: # Loop over rows row_dict = work_dict[row_num] # Get the column numbers in this row # col_list = row_dict.keys() col_list.sort() row_cost_dict = cost_dict.get(row_num, {}) for col_num in col_list: weight = row_dict[col_num] cost = weight * (dim + 1) # Use original weights row_cost_dict[col_num] = cost # And store into row dictionary # Insert symmetric element as well (if not on diagonal) # if (row_num != col_num): row_cost_dict2 = cost_dict.get(col_num,{}) if (row_num not in row_cost_dict2): # Only insert if not there if (process_type == 'deduplication'): row_cost_dict2[row_num] = cost # Insert symmetric cost else: # Linkage process row_cost_dict2[row_num] = min_cost # Insert minimal cost # And insert diagonal element if there is none # if (not row_cost_dict2.has_key(col_num)): row_cost_dict2[col_num] = min_cost cost_dict[col_num] = row_cost_dict2 # Make sure there is a diagonal element (for feasibility) # if (not row_cost_dict.has_key(row_num)): row_cost_dict[row_num] = min_cost # If more than MAX_ROW_ELEMENTS elements in row only take the # largest (following an idea by William Winkler) # if (len(row_cost_dict) > MAX_ROW_ELEMENTS): row_col_numbers = row_cost_dict.keys() row_weights = row_cost_dict.values() row_elem_list = map(None, row_weights, row_col_numbers) row_elem_list.sort() diag_weight = row_cost_dict[row_num] # Keep diagonal element row_cost_dict = {row_num:diag_weight} ##print ' ' ##print '****** row_elem_list: %s' % (str(row_elem_list)) ##print ' ' for (weight, col_num) in row_elem_list[-MAX_ROW_ELEMENTS:]: row_cost_dict[col_num] = weight # Insert row into cost dictionary # cost_dict[row_num] = row_cost_dict # Get the final row and column numbers - - - - - - - - - - - - - - - # row_numbers = cost_dict.keys() col_numbers = {} for row_dict in 
cost_dict.values(): col_numbers.update(row_dict) col_numbers = col_numbers.keys() row_numbers.sort() col_numbers.sort() # Check if number of rows and columns are equal - - - - - - - - - - - # if (len(row_numbers) != len(col_numbers)): print 'error:Different number of rows (%i) and columns (%i)' \ % (len(row_numbers), len(col_numbers)) raise Exception print '1: Cost dictionary with %i rows/columns given to ' % \ (len(row_numbers)) + 'assignment method %s:' % (lap_method) print '2: Row numbers: %s' % (str(row_numbers)) print '2: Column numbers: %s' % (str(row_numbers)) print '2: Minimal weight: %3f' % (min_weight) print '2: Maximal weight: %3f' % (max_weight) print '3: Cost dictionary: %s' % (str(cost_dict)) print '3: Process type: %s' % (process_type) #################### START PARALLEL TEST CODE ####################### if (SAVE_PARALLEL_TEST_FILES == True): tmp_list = cost_dict.keys() tmp_list.sort() tmp_str = str(sub_set_cnt)+':: '+ str(min_weight) + ' / ' + \ str(max_weight) + ', ' + str(row_numbers) + ', ' + \ process_type + '::' for k in tmp_list: tmp_list2 = cost_dict[k].items() tmp_list2.sort() tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / ' f = open('lap-calling-'+str(parallel.rank())+'-'+ \ str(parallel.size()),'a') f.write(tmp_str+os.linesep) f.close() #################### END PARALLEL TEST CODE ######################### # Call the lap method which returns an assignment dictionary - - - - # if (lap_method == 'auction'): tmp_assign_dict = auction(cost_dict, min_weight, max_weight, \ row_numbers, col_numbers) else: print 'error:LAP method %s not implemented' % (lap_method) raise Exception # If run in parallel, send temporary assignment dictionary process 0 - - # if (parallel.rank() > 0): tmp_time = time.time() parallel.send(tmp_assign_dict, 0) lap_comm_time += (time.time() - tmp_time) print '1: Sent assignment dictionary with %i entries to process' \ % (len(tmp_assign_dict)) + ' 0' # Only process 0 inserts temporary assignment dictionary into results - - - # if (parallel.rank() == 0): # Receive assignment dictionary from other process if necessary # p = (sub_set_cnt % parallel.size()) # Process number to receive from if (p != 0): tmp_time = time.time() tmp_assign_dict = parallel.receive(p) lap_comm_time += (time.time() - tmp_time) print '1: Received subset %i of %i assignment dictionary with ' % \ (sub_set_cnt, num_sub_sets) + '%i entries from process %i' % \ (len(tmp_assign_dict), p) # Post-process the assignment dictionary - - - - - - - - - - - - - - - - # assign_pairs = {} for rec_num_b in tmp_assign_dict: rec_num_a = tmp_assign_dict[rec_num_b] # Now check if this record pair is in the original results dictionary # if (rec_num_a in results_dict): row_dict = results_dict[rec_num_a] if (rec_num_b in row_dict): weight = row_dict[rec_num_b] # Insert into dictionary of potential record pairs # assign_pairs[(rec_num_a, rec_num_b)] = weight #################### START PARALLEL TEST CODE ########################### if (SAVE_PARALLEL_TEST_FILES == True): tmp_list = tmp_assign_dict.items() tmp_list.sort() tmp_list2 = assign_pairs.items() tmp_list2.sort() tmp_list3 = sub_sets[seed_row] tmp_list3.sort() f = open('assignments-'+str(parallel.rank())+'-'+ \ str(parallel.size()),'a') f.write(str(sub_set_cnt)+', '+str(seed_row)+os.linesep) f.write(str(tmp_list)+os.linesep) f.write(str(tmp_list2)+os.linesep) f.write(str(tmp_list3)+os.linesep) f.write(os.linesep) f.close() #################### END PARALLEL TEST CODE ############################# # Sort the assigned pairs according to their weight # 
      assign_weights = assign_pairs.values()  # Get the weights in a list
      assign_rec_pairs = assign_pairs.keys()  # And the record pairs

      assign_pair_list = map(None, assign_weights, assign_rec_pairs)
      assign_pair_list.sort()

      num_assigned_pairs = 0  # Number of assigned pairs for this sub-set

      dedup_check_rec_nums = {}  # Already assigned record numbers

      while (assign_pair_list != []):  # Now check all record pairs

        check_pair = assign_pair_list.pop()  # Get largest weight record pair

        weight = check_pair[0]
        rec_num_a = check_pair[1][0]
        rec_num_b = check_pair[1][1]
        rec_pair = (rec_num_a, rec_num_b)

        # Check if the record pair has already been used in an assignment,
        # and for a deduplication process also check if any of the two
        # records has been used in an assignment
        #
        if ((process_type == 'linkage') and \
            (rec_num_a not in used_rec_nums_a) and \
            (rec_num_b not in used_rec_nums_b)) or \
           ((process_type == 'deduplication') and \
            (rec_num_a not in used_rec_nums) and \
            (rec_num_b not in used_rec_nums)):

          # For deduplication insert both record numbers into one common
          # dictionary of used record numbers, since each record can only
          # appear once in any assigned pair
          #
          if (process_type == 'deduplication'):
            used_rec_nums[rec_num_a] = True
            used_rec_nums[rec_num_b] = True
          else:  # For linkage keep separate dictionaries per data set
            used_rec_nums_a[rec_num_a] = True
            used_rec_nums_b[rec_num_b] = True

          if (rec_pair not in lap_results):
            lap_results[rec_pair] = True
            num_assigned_pairs += 1
          else:
            print 'warning:Record pair (%i,%i) already in LAP results' % \
                  (rec_num_a, rec_num_b)

      print '2: Inserted %i (out of %i) record pairs into LAP ' % \
            (num_assigned_pairs, len(tmp_assign_dict)) + 'results'

    sub_set_cnt += 1

    # Report progress every 10% (only if more than 100 sub-sets) - - - - - -
    #
    if (num_sub_sets >= 100) and (sub_set_cnt % int(num_sub_sets / 10) == 0):
      used_time = time.time() - lap_lap_start_time
      perc_done = 100.0 * sub_set_cnt / num_sub_sets
      sub_set_time = used_time / sub_set_cnt
      todo_time = (num_sub_sets - sub_set_cnt) * sub_set_time
      used_time_string = output.time_string(used_time)
      todo_time_string = output.time_string(todo_time)
      sub_set_time_string = output.time_string(sub_set_time)
      print '1: Processed %.1f%% (%i/%i) of sub-sets in %s' % \
            (perc_done, sub_set_cnt, num_sub_sets, used_time_string) + \
            ' (%s per sub-set)' % (sub_set_time_string)
      print '1: Estimated %s until finished' % (todo_time_string)

  print '1: Total number of assignments: %i' % (len(lap_results))
  print '1: Number of rows in original results dictionary: %i' % \
        (len(results_dict))

  #################### START TEST CODE ######################################
  # Test that a record only appears once in the lap results dictionary
  #
  if (DO_TESTS == True) and (parallel.rank() == 0):

    if (process_type == 'deduplication'):
      test_dict = {}
      for (rec_a, rec_b) in lap_results:
        if (rec_a in test_dict):
          print 'warning:Record %i is already in the test dictionary' % \
                (rec_a) + ' rec_pair: (%i,%i)' % (rec_a, rec_b)
        else:
          test_dict[rec_a] = True
        if (rec_b in test_dict):
          print 'warning:Record %i is already in the test dictionary' % \
                (rec_b) + ' rec_pair: (%i,%i)' % (rec_a, rec_b)
        else:
          test_dict[rec_b] = True

    else:  # Linkage process
      test_dict_a = {}
      test_dict_b = {}
      for (rec_a, rec_b) in lap_results:
        if (rec_a in test_dict_a):
          print 'warning:Record %s is already in test dictionary A' % \
                (str(rec_a))
        else:
          test_dict_a[rec_a] = True
        if (rec_b in test_dict_b):
          print 'warning:Record %s is already in test dictionary B' % \
                (str(rec_b))
        else:
          test_dict_b[rec_b] = True
  #################### END TEST CODE ########################################

  lap_stop_time = time.time()
  lap_lap_time = lap_stop_time - lap_lap_start_time
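  # Editorial note: lap_lap_time measures only the assignment loop above;
  # lap_total_time (computed below from lap_start_time) additionally covers
  # the earlier pair extraction and sub-set creation phases, which are
  # reported separately.
  #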
  lap_total_time = lap_stop_time - lap_start_time

  lap_pair_extract_time_string = output.time_string(lap_pair_extract_time)
  lap_subset_total_time_string = output.time_string(lap_subset_total_time)
  lap_lap_time_string = output.time_string(lap_lap_time)
  if (parallel.size() > 1):
    lap_comm_time_string = output.time_string(lap_comm_time)
  lap_total_time_string = output.time_string(lap_total_time)

  print '1:'
  print '1: Finished linear record pair assignment procedure'
  print '1: Time for extracting unique record pairs: %s' % \
        (lap_pair_extract_time_string)
  print '1: Time for creating record sub-sets: %s' % \
        (lap_subset_total_time_string)
  print '1: Time for linear assignment algorithm: %s' % \
        (lap_lap_time_string)
  if (parallel.size() > 1):
    print '1: Time for communication: %s' % (lap_comm_time_string)
  print '1: Total time for linear assignment: %s' % \
        (lap_total_time_string)
  print '1:'

  return lap_results
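
# Example (editorial sketch, not part of the original module): a minimal
# symmetric call of the auction() routine defined above. The cost
# dictionary maps each row number to a {column number: weight} dictionary,
# and the returned assignment dictionary maps column numbers back to their
# assigned row numbers:
#
#   cost_dict = {0: {0: 4, 1: 2},
#                1: {0: 3, 1: 5}}
#   assign_dict = auction(cost_dict, 2, 5, [0, 1], [0, 1])
#   # Expected result (total weight maximised): {0: 0, 1: 1}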