Example #1
File: lap.py Project: PpKarOn/febrl
def auction(cost_dict, min_cost, max_cost, row_numbers, col_numbers):
    """Linear sum assignment procedure based on symetric auction algorithm.

     Re-implementation of a FORTRAN code, taken from: 

     http://web.mit.edu/afs/athena.mit.edu/user/d/i/dimitrib/www/auction.txt

     and as described in:

     "Auction Algorithms for Network Flow Problems: A Tutorial Introduction"
     Dimitri P. Bertsekas, Computational Optimization and Applications, Vol.
     1, pp. 7-66, 1992.

     and other papers by Dimitri P. Bertsekas, see:

     http://www.mit.edu:8001//people/dimitrib/publ.html

     Takes as input a cost dictionary, the minimum and maximum weights in this
     dictionary, and two sorted lists with the row and column numbers. The
     cost dictionary is of the form {row_num:{col_num:cost}}, and the returned
     assignment dictionary is of the form {col_num:row_num}.
  """

    #################### START PARALLEL TEST CODE ###############################

    if (SAVE_PARALLEL_TEST_FILES == True):

        tmp_list = cost_dict.keys()
        tmp_list.sort()
        tmp_str = str(min_cost) + ' / ' + str(max_cost) + ', ' + str(row_numbers) + \
                  ' / ' + str(col_numbers) + ':: '
        for k in tmp_list:
            tmp_list2 = cost_dict[k].items()
            tmp_list2.sort()
            tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / '

        f = open('lap-auction-' + str(parallel.rank()) + '-' + \
                 str(parallel.size()), 'a')
        f.write(tmp_str + os.linesep)
        f.close()

        #################### END PARALLEL TEST CODE ###############################

    i_large = 100000000  # A value larger than max_cost

    num_rows = len(row_numbers)
    num_cols = len(col_numbers)

    if (num_rows != num_cols):
        print 'error:Asymmetric problem given to symmetric "auction" algorithm'
        raise Exception

    # Set parameters for auction algorithm  - - - - - - - - - - - - - - - - - - -
    #
    if (max_cost > int(i_large / (num_rows + 1))):
        print 'error:Cost range too large to work with integer epsilon'
        raise Exception

    max_cost *= (num_rows + 1)

    beg_eps = max_cost / 5  # May be smaller, can even be 1, but not less than 1
    end_eps = num_rows / 10  # Must be smaller than beg_eps, can be 1
    if (end_eps < 1):
        end_eps = 1
    elif (end_eps > beg_eps):
        end_eps = beg_eps
    factor = 5  # Must be greater than 1
    start_incr = beg_eps / 10  # May even be 1, but not smaller than 1
    if (start_incr < 1):
        start_incr = 1
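
    # (Background note: in this module the caller, do_lap, passes costs that
    # have already been multiplied by (dim + 1), and max_cost is multiplied
    # by (num_rows + 1) above so that the epsilon parameters are in the same
    # scaled units. A final integer epsilon of 1 in these units is below
    # 1/num_rows in the original cost units, which by standard auction
    # algorithm theory (see the Bertsekas references in the docstring)
    # makes the final eps-optimal assignment exactly optimal for integer
    # costs. Worked example: num_rows = 1000 and max_cost = 50 give a scaled
    # max_cost of 50 * 1001 = 50050, and so beg_eps = 10010, end_eps = 100
    # and start_incr = 1001.)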

    # Initialisation
    #
    eps = beg_eps
    i_small = -i_large
    large_incr = int(i_large / 10)
    thresh = min(int(num_rows / 5), 100)  # Maximal value 100
    incr_factor = 2
    cycles = 1
    average = num_cols
    num_phases = 1

    # Initialise dictionaries for prices and row assignments
    #
    pcol = {}
    assigned = {}

    for col_num in col_numbers:
        pcol[col_num] = i_small

    # Initialise list of un-assigned rows (all rows at the beginning)
    #
    unassigned_rows = row_numbers[:]
    no_list = num_rows

    do_phase = True  # Set flag so a first phase is performed

    # Start sub problem (scaling phase with new epsilon)  - - - - - - - - - - - -
    #
    while (do_phase == True):

        if (eps == 1):
            thresh = 0
        incr = min(start_incr, eps)  # Increment must not be larger than epsilon

        print '2:      Start of a scaling phase with epsilon: %f' % (eps)

        do_cycle = True  # Set flag so a first cycle is performed
        cycle_count = 0

        while (do_cycle == True):

            # Start forward auction cycle - - - - - - - - - - - - - - - - - - - - - -
            #
            no_new_list = 0  # Initialise count of next list of un-assigned rows

            # Cycle through the current list of un-assigned rows
            #
            for i in range(no_list):
                row_num = unassigned_rows[i]
                row_dict = cost_dict[row_num]
                row_list = row_dict.items()
                row_list.sort()  # Sort by column number for deterministic order
                row_len = len(row_dict)

                # Get first and second column number and cost
                #
                col_num, cost = row_list[0]
                col_num2, cost2 = row_list[1]

                max1 = cost - pcol[col_num]
                max2 = cost2 - pcol[col_num2]
                if (max1 > max2):
                    best_col_num = col_num
                elif (max1 < max2):  # Swap maximum values
                    max1, max2 = max2, max1
                    best_col_num = col_num2
                else:  # Both are the same
                    if (col_num < col_num2):  # Best column number is smallest
                        best_col_num = col_num
                    else:
                        best_col_num = col_num2

                if (row_len > 2):  # Row has more than two elements
                    for c in range(2, row_len):
                        col_num3, cost3 = row_list[c]  # Loop through cols

                        max_tmp = cost3 - pcol[col_num3]
                        if (max_tmp > max2):
                            if (max_tmp > max1):
                                best_col_num = col_num3  # New best column
                                max2 = max1
                                max1 = max_tmp
                            elif (max_tmp == max1):  # Tied for best
                                max2 = max_tmp  # Second-best now equals best
                                if (col_num3 < best_col_num):
                                    best_col_num = col_num3  # Smallest column wins
                            else:
                                max2 = max_tmp

                # Row bids for best column, increasing its price, and gets - - - - -
                # assigned to the best column, while any row previously
                # assigned to that column becomes un-assigned. The new price
                # leaves the column's value to the bidder exactly 'incr' below
                # its second-best alternative.
                #
                pcol[best_col_num] = pcol[best_col_num] + max1 - max2 + incr

                old_row = assigned.get(best_col_num, -1)  # -1 if column un-assigned
                assigned[best_col_num] = row_num
                if (old_row >= 0):  # Row has been assigned
                    unassigned_rows[no_new_list] = old_row  # Save un-assigned row
                    no_new_list += 1

            cycle_count += 1
            if ((cycle_count % 10000) == 0):
                print '1:        Finished %i cycles, %i out of %i rows un-assigned' % \
                      (cycle_count, no_new_list, num_rows)
                if (no_new_list > 0):
                    print '1:          Un-assigned rows: %s' % \
                          (str(unassigned_rows[:no_new_list]))

            # Collect statistics
            #
            average = (cycles * average + no_list) / (cycles + 1)
            cycles += 1

            # Check if there are still 'many' un-assigned rows, i.e. if the - - - -
            # number of un-assigned rows is greater than the parameter 'thresh'.
            # If so, replace the current list with the new list and go for
            # another cycle. Otherwise, if epsilon > 1, reduce epsilon, reset
            # the assignment to empty and restart the auction; if epsilon == 1,
            # terminate. Also increase the minimal bidding increment up to a
            # maximum value of epsilon (this is the adaptive feature)
            #
            incr *= incr_factor
            if (incr > eps):
                incr = eps
            if (no_new_list > thresh):
                no_list = no_new_list
            else:
                do_cycle = False  # Set flag so cycle is left

        # End of sub-problem (scaling phase) - - - - - - - - - - - - - - - - - - -
        #
        if (eps == 1):
            do_phase = False  # Set flag so phase is left
        else:
            num_phases += 1
            eps = int(eps / factor)
            if (eps > incr):
                eps = int(eps / factor)
            if (eps < 1) or (eps < end_eps):
                eps = 1
            thresh = int(thresh / factor)

            print '1:        End of a scaling phase, new epsilon: %i' % (eps)

            t_min = min(pcol.values() + [i_large])

            for col_num in assigned:  # Un-assign all rows for the next phase
                row_num = assigned[col_num]
                if (row_num >= 0):
                    unassigned_rows[no_new_list] = row_num
                    no_new_list += 1
                    assigned[col_num] = -1

            incr = t_min - i_small  # Reset minimum price to i_small
            for col_num in col_numbers:  # Update all prices
                pcol[col_num] = pcol[col_num] - incr

            # Final parameter updates before starting another scaling phase
            #
            no_list = no_new_list

            if (start_incr < eps):
                start_incr *= factor

    return assigned
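
A hedged usage sketch (this helper is not part of lap.py; the 3 x 3 cost
dictionary is invented). The problem must be square, every row needs at least
two entries (the code above indexes row_list[1]), and both number lists must
be sorted. Note that do_lap below pre-scales its weights by (dim + 1) before
calling auction; passing raw integer costs, as done here, only guarantees a
total weight within num_rows * epsilon of the optimum.

def auction_example():
    """Illustrative sketch only: a tiny 3 x 3 assignment problem."""

    cost_dict = {0: {0: 7, 1: 2, 2: 5},  # Of the form {row_num:{col_num:cost}}
                 1: {0: 3, 1: 8, 2: 1},
                 2: {0: 4, 1: 6, 2: 9}}

    row_numbers = [0, 1, 2]  # Sorted row numbers
    col_numbers = [0, 1, 2]  # Sorted column numbers
    min_cost = 1  # Minimum cost in cost_dict
    max_cost = 9  # Maximum cost in cost_dict

    assigned = auction(cost_dict, min_cost, max_cost, row_numbers, col_numbers)

    for col_num in assigned:  # Assignment dictionary is of form col_num:row_num
        print 'Row %i assigned to column %i' % (assigned[col_num], col_num)
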
Example #2
File: lap.py Project: PpKarOn/febrl
def do_lap(lap_method, results_dict, process_type, threshold):
    """Linear sum assignment procedure.

     This routine calculates a linear assignment for one-to-one matching.

     The routine do_lap does all kinds of preprocessing, including the
     extraction of unique record pairs (which can be removed before the lap
     is applied) and the extraction of sub-sets which can be solved
     independently. These sub-sets are then given to the chosen lap routine.

     The routine takes as input a results dictionary, as produced by a
     classifier (see classification.py), and returns a dictionary with the
     assigned record pair numbers (as tuples) as keys.

     Possible methods are: 'auction'

     The process_type attribute must either be set to 'deduplication' or to
     'linkage' in order to be able to preprocess the classifier data prior to
     the lap procedure.
  """

    if (lap_method not in ['auction']):
        print 'error:Illegal lap method: %s' % (str(lap_method))
        raise Exception

    if (process_type not in ['deduplication', 'linkage']):
        print 'error:Illegal value for attribute "process_type": %s' %\
              (str(process_type))
        raise Exception

    if (results_dict == {}):
        print 'error:Empty results dictionary'
        raise Exception

    lap_start_time = time.time()  # Start timer

    lap_results = {}  # Result dictionary with final record pair matches

    # Make one (or two) dictionaries of all assigned record numbers
    #
    if (process_type == 'deduplication'):
        used_rec_nums = {}
    else:
        used_rec_nums_a = {}
        used_rec_nums_b = {}

    # Make sure the threshold is a number if it is defined
    #
    if (threshold != None):
        if (not (isinstance(threshold, int) or isinstance(threshold, float))):
            print 'error:Threshold is not a number: %s' % (str(threshold))
            raise Exception

    print '1:  Start linear assignment procedure using method: %s' % \
          (lap_method)
    print '1:    Original length of results dictionary: %i' % \
          (len(results_dict))

    # Step 1: Filter out record pairs with weight lower than the threshold  - - -
    #
    if (threshold != None):
        print '1:    Remove record pairs with weight less than: %f' % \
              (threshold)
    else:
        threshold = -999999999999.999  # Make it a very very small number

    work_dict = {}  # Make an empty working dictionary

    for row_num in results_dict:  # Loop over all record numbers (keys)
        row_dict = results_dict[row_num]  # Get corresponding record dictionary

        new_row_dict = {}  # Start a new record dictionary

        for col_num in row_dict:  # Loop over all records in this dictionary
            weight = row_dict[col_num]

            if (weight >= threshold):
                new_row_dict[col_num] = weight  # Copy to new dictionary

        if (new_row_dict != {}):  # Only insert non empty dictionaries
            work_dict[row_num] = new_row_dict

    results_len = len(work_dict)  # Save results length (after filtering)

    if (threshold > -999999999999.999):
        print '1:    Length of working dictionary after filtering: %i' % \
              (results_len)

    # Step 2: Remove all matches (record pairs) which are unique  - - - - - - - -
    #         (i.e. which don't have matches with other records)
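    #         (e.g. if row 5 matches only column 7, and column 7 appears in
    #         no other row, the pair (5,7) can be assigned immediately; the
    #         record numbers here are purely illustrative)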
    #
    row_num_dict = {}  # Count occurrences of record numbers in rows
    col_num_dict = {}  # Count occurrences of record numbers in columns

    for row_num in work_dict:  # First count occurrences of rows and columns

        # Insert a count for the row number
        #
        row_num_dict[row_num] = row_num_dict.get(row_num, 0) + 1

        row_dict = work_dict[row_num]

        for col_num in row_dict:

            # Increase a count for a column number
            #
            col_num_dict[col_num] = col_num_dict.get(col_num, 0) + 1

            if (process_type == 'deduplication'):

                # For deduplication, insert symmetric record numbers as well
                #
                row_num_dict[col_num] = row_num_dict.get(col_num, 0) + 1
                col_num_dict[row_num] = col_num_dict.get(row_num, 0) + 1

    for row_num in work_dict.keys():  # Second, remove unique rows and columns

        row_dict = work_dict[row_num]  # Get corresponding record dictionary

        if (len(row_dict) == 1):  # Only one record pair for this record

            col_num, weight = row_dict.items()[0]  # Get the only element in row

            if (row_num_dict[row_num] == 1) and (col_num_dict[col_num] == 1):

                #################### START TEST CODE ##################################

                if (DO_TESTS == True):
                    if (process_type == 'deduplication'):
                        if (row_num in used_rec_nums):
                            print 'warning:Record number %i already used for deduplication' \
                                  % (row_num)
                        if (col_num in used_rec_nums):
                            print 'warning:Record number %i already used for deduplication' \
                                  % (col_num)
                    else:
                        if (row_num in used_rec_nums_a):
                            print 'warning:Record number A %i already used for linkage' % \
                                  (row_num)
                        if (col_num in used_rec_nums_b):
                            print 'warning:Record number B %i already used for linkage' % \
                                  (col_num)

                #################### END TEST CODE ##################################

                lap_results[(row_num, col_num)] = True  # Insert into final results
                del work_dict[row_num]  # And delete the record in the results

                if (process_type == 'deduplication'):
                    used_rec_nums[row_num] = True
                    used_rec_nums[col_num] = True
                else:
                    used_rec_nums_a[row_num] = True
                    used_rec_nums_b[col_num] = True

    print '1:    Found and extracted %i unique record ' % (len(lap_results)) + \
          'pairs in results dictionary'

    for rec_pair in lap_results:
        print '3:      %s' % (str(rec_pair))
    print '3:'

    lap_pair_extract_time = time.time() - lap_start_time

    #################### START PARALLEL TEST CODE ###########################

    if (SAVE_PARALLEL_TEST_FILES == True):
        tmp_list = lap_results.items()
        tmp_list.sort()
        f = open('one2one-unique-dedup-'+str(parallel.rank())+'-'+ \
                 str(parallel.size()),'w')
        for c in tmp_list:
            f.write(str(c) + os.linesep)
        f.close()

        tmp_list = work_dict.keys()
        tmp_list.sort()
        f = open('work-dict-' + str(parallel.rank()) + '-' + \
                 str(parallel.size()), 'w')
        for c in tmp_list:
            cc = work_dict[c].items()
            cc.sort()
            f.write(str(c) + ':: ' + str(cc) + os.linesep)
        f.close()

    #################### END PARALLEL TEST CODE #############################

    #################### START TEST CODE ########################################
    # Test if a record only appears once in the lap results dictionary
    #
    if (DO_TESTS == True):
        if (process_type == 'deduplication'):
            test_dict = {}
            for (rec_a, rec_b) in lap_results:
                if (test_dict.has_key(rec_a)):
                    print 'warning:Record %s is already in the test dictionary' % \
                          (str(rec_a))
                else:
                    test_dict[rec_a] = 1
                if (test_dict.has_key(rec_b)):
                    print 'warning:Record %s is already in the test dictionary' % \
                          (str(rec_b))
                else:
                    test_dict[rec_b] = 1

        else:  # Linkage process
            test_dict_a = {}
            test_dict_b = {}
            for (rec_a, rec_b) in lap_results:
                if (test_dict_a.has_key(rec_a)):
                    print 'warning:Record %s is already in test dictionary A' % \
                          (str(rec_a))
                else:
                    test_dict_a[rec_a] = 1
                if (test_dict_b.has_key(rec_b)):
                    print 'warning:Record %s is already in test dictionary B' % \
                          (str(rec_b))
                else:
                    test_dict_b[rec_b] = 1

    #################### END TEST CODE ##########################################

    if (len(work_dict) == 0):  # All record pairs are processed - - - - - - - -
        return lap_results

    print '1:    Remaining number of records in working dictionary: %i' % \
          (len(work_dict)) + ' (down from: %i)' % (results_len)

    # Step 3: Find connected sub-sets in the results dictionary - - - - - - - - -
    #         (using breadth-first search)
    #
    visited = {}  # Dictionary which will contain all so far visited rows
    sub_sets = {}  # Dictionary which will contain the sub-sets extracted

    print '1:    Find connected sub-graphs in results dictionary'

    lap_subset_start_time = time.time()

    max_sub_set_length = -1
    num_visited = 0  # Number of rows visited so far
    row_num_done = 0
    work_dict_len = len(work_dict)

    work_dict_rows = work_dict.keys()
    work_dict_rows.sort()

    # Create a column oriented work dictionary  - - - - - - - - - - - - - - - - -
    #
    col_work_dict = {}

    for row_num in work_dict_rows:  # Loop over all rows
        row_dict = work_dict[row_num]

        for col_num in row_dict:
            col_dict = col_work_dict.get(col_num, {})
            col_dict[row_num] = True  # Only the position is needed, not the weight
            col_work_dict[col_num] = col_dict

    for row_num in work_dict_rows:  # Loop over all rows

        if (not visited.has_key(row_num)):  # This row has not been visited yet

            visited[row_num] = row_num  # Mark visited as 'seeding' row
            num_visited += 1
            print '2:      Create sub-set with seeding record %i' % (row_num)

            process_queue = [row_num]  # Start a new queue of rows to process
            row_sub_set = {row_num: 1}  # Row numbers connected to this row

            while (process_queue != []):  # Process until all connected rows done
                print '3:        Process queue: %s' % (str(process_queue))

                next_row = process_queue.pop(0)  # Get and remove first row
                row_col_numbers = work_dict[next_row].keys()  # Columns in this row

                # For deduplication, also insert the row number into the column
                # numbers
                if (process_type == 'deduplication'):
                    row_col_numbers.append(next_row)

                print '3:          Row %i with column numbers: %s' % \
                      (next_row, str(row_col_numbers))

                # Get the row numbers from all column numbers
                #
                for col_num in row_col_numbers:

                    # Get list of all row numbers in this column
                    #
                    row_num_dict = col_work_dict.get(col_num, {})
                    row_num_list = row_num_dict.keys()

                    if (process_type == 'deduplication') and (col_num in work_dict) and \
                      (col_num not in row_num_list):
                        row_num_list.append(col_num)

                    print '3:          Column: %i with row numbers: %s' % \
                          (col_num, str(row_num_list))

                    for row_num2 in row_num_list:
                        row_sub_set[row_num2] = 1
                        if (not visited.has_key(row_num2)):  # Check if new row
                            process_queue.append(row_num2)
                            print '3:          Appended row number %i to process queue' % \
                                  (row_num2)

                            visited[row_num2] = row_num  # Visited by seeding row
                            num_visited += 1
                            print '3:          Row %i connected to row %i' % \
                                  (row_num2, row_num)

            sub_sets[row_num] = row_sub_set.keys()  # Only store keys

            if (len(row_sub_set) > max_sub_set_length):
                max_sub_set_length = len(row_sub_set)

            print '3:        Sub-set contains records: %s' % \
                  (str(row_sub_set.keys()))

        row_num_done += 1

        # Now determine timing and print progress report (every 10%)  - - - - - - -
        # (only if more than 100 records in the work dictionary)
        #
        if (work_dict_len >= 100) and \
           ((row_num_done % int(work_dict_len / 10)) == 0):
            used_time = time.time() - lap_subset_start_time
            perc_done = 100.0 * row_num_done / work_dict_len
            todo_time = (work_dict_len - num_visited) * \
                        (used_time / row_num_done)

            used_time_string = output.time_string(used_time)
            todo_time_string = output.time_string(todo_time)

            print '1:      Processed %.1f%% of records in %s (%i/%i records ' % \
                  (perc_done, used_time_string, num_visited, work_dict_len) + \
                  'visited)'
            print '1:        Estimated %s until finished' % (todo_time_string)

    del col_work_dict  # Delete the column oriented work dictionary

    num_sub_sets = len(sub_sets)  # Get the total number of sub-sets

    lap_subset_total_time = time.time() - lap_subset_start_time
    lap_subset_total_time_string = output.time_string(lap_subset_total_time)

    print '1:    Extracted %i sub-sets in %s' % \
          (num_sub_sets, lap_subset_total_time_string)
    print '1:      Longest sub-set contains %i rows' % (max_sub_set_length)

    #################### START TEST CODE ########################################
    # Test if all the sub-sets are mutually exclusive, and if the seed rows are
    # in the sub-set record lists
    #
    if (DO_TESTS == True):
        for seed_row in sub_sets:
            row_list = sub_sets[seed_row]
            if (seed_row not in row_list):
                print 'warning:Seed row %s not in sub-set row list: %s' % \
                      (str(seed_row), str(row_list))
            for rec_num in row_list:
                for seed_row2 in sub_sets:
                    row_list2 = sub_sets[seed_row2]
                    if (seed_row != seed_row2):  # Don't test itself
                        if (rec_num in row_list2):
                            print 'warning:Record %s in more than one sub-set: %s, %s' % \
                                  (str(rec_num), str(row_list), str(row_list2))

    #################### END TEST CODE ##########################################

    #################### START PARALLEL TEST CODE ###########################

    if (SAVE_PARALLEL_TEST_FILES == True):
        tmp_list = sub_sets.keys()
        tmp_list.sort()
        f = open('sub-sets-'+str(parallel.rank())+'-'+ \
                 str(parallel.size()),'w')
        for s in tmp_list:
            tmp_sub_set = sub_sets[s]
            tmp_sub_set.sort()

            f.write(str(s) + '::' + str(tmp_sub_set) + os.linesep)
        f.close()

    #################### END PARALLEL TEST CODE #############################

    # Now loop over all sub-sets  - - - - - - - - - - - - - - - - - - - - - - - -
    # (pre-process them first before giving them to the actual linear assignment
    # method)
    #
    lap_lap_start_time = time.time()
    lap_comm_time = 0.0

    sub_set_cnt = 0  # A round robin counter, used for parallelism

    sub_set_rows = sub_sets.keys()
    sub_set_rows.sort()  # Needed to make the same on all processes

    for seed_row in sub_set_rows:

        # Distribute sub-sets equally to all processors
        #
        if ((sub_set_cnt % parallel.size()) == parallel.rank()):

            row_list = sub_sets[seed_row]
            row_list.sort()

            print '1:'
            print '1:    Sub-set %i of %i with seed row %i contains %i rows' % \
                  (sub_set_cnt, num_sub_sets, seed_row, len(row_list))
            print '3:      Sub-set rows:  %s' % (str(row_list))

            if (len(row_list) == 1):  # Special case: One row only - - - - - - - -
                max_weight = -99999.9
                max_col = -1
                row_dict = work_dict[row_list[0]]  # Get the dictionary for this row

                # Find element with largest weight
                #
                for col_num in row_dict:
                    weight = row_dict[col_num]
                    if (weight > max_weight):
                        max_weight = weight
                        max_col = col_num

                # Assignment dictionary is of form col_num:row_num
                #
                tmp_assign_dict = {max_col: row_list[0]}  # Record pair dictionary

                print '2:      Special case sub-set with one row only, ' + \
                      'assignment pair: (%i,%i)' % (row_list[0], max_col)

            else:  # General case with more than one row  - - - - - - - - - - - - - -

                # Get minimal and maximal weights, and lists with row and column
                # numbers
                #
                min_weight = 999999.9
                max_weight = -999999.9
                col_numbers = {}
                row_col_numbers = {}

                for row_num in row_list:  # Loop over rows in this sub-set
                    row_dict = work_dict[row_num]  # Get the dictionary for this row
                    row_col_numbers[row_num] = 1

                    for col_num in row_dict:
                        weight = row_dict[col_num]
                        col_numbers[col_num] = 1
                        row_col_numbers[col_num] = 1

                        if (weight < min_weight):
                            min_weight = weight
                        if (weight > max_weight):
                            max_weight = weight

                print '3:      Minimal and maximal weight: %.3f / %.3f' % \
                      (min_weight, max_weight)

                row_numbers = row_list[:]  # Row numbers in this sub-set
                col_numbers = col_numbers.keys()
                row_numbers.sort()
                col_numbers.sort()
                num_rows = len(row_numbers)
                num_cols = len(col_numbers)

                row_col_numbers = row_col_numbers.keys()
                row_col_numbers.sort()

                # Deal with the special case that there is only one column number - - -
                #
                if (num_cols == 1):
                    max_weight = -99999.9
                    max_row = -1

                    col_num = col_numbers[0]  # Get the column number

                    # Find element with largest weight
                    #
                    for row_num in row_list:  # Loop over rows

                        # Get only weight in row
                        #
                        row_weight = work_dict[row_num].values()[0]

                        if (row_weight > max_weight):
                            max_weight = row_weight
                            max_row = row_num

                    # Assignment dictionary is of form col_num:row_num
                    #
                    tmp_assign_dict = {col_num: max_row}  # Record pair dictionary

                    print '2:      Special case sub-set with one column only, ' + \
                          'assignment pair: (%i,%i)' % (max_row, col_num)

                else:  # General case with more than one row and column - - - - - - - -

                    # Construct the cost dictionary - - - - - - - - - - - - - - - - - - -
                    #
                    cost_dict = {}
                    dim = len(row_col_numbers)  # Final dimension of the LAP

                    min_cost = -max_weight * (dim + 1)  # Use original weights
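                    # (Note: assuming non-negative weights, min_cost lies
                    # below every scaled weight, so the filler elements
                    # inserted below for symmetry and for the diagonal are
                    # only chosen when needed for feasibility.)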

                    for row_num in row_list:  # Loop over rows

                        row_dict = work_dict[row_num]

                        # Get the column numbers in this row
                        #
                        col_list = row_dict.keys()
                        col_list.sort()

                        row_cost_dict = cost_dict.get(row_num, {})

                        for col_num in col_list:
                            weight = row_dict[col_num]

                            cost = weight * (dim + 1)  # Use original weights
                            row_cost_dict[col_num] = cost  # Store into row dictionary

                            # Insert symmetric element as well (if not on diagonal)
                            #
                            if (row_num != col_num):
                                row_cost_dict2 = cost_dict.get(col_num, {})

                                if (row_num not in row_cost_dict2):  # Insert only if not there

                                    if (process_type == 'deduplication'):
                                        row_cost_dict2[row_num] = cost  # Symmetric cost
                                    else:  # Linkage process
                                        row_cost_dict2[row_num] = min_cost  # Minimal cost

                                    # And insert diagonal element if there is none
                                    #
                                    if (not row_cost_dict2.has_key(col_num)):
                                        row_cost_dict2[col_num] = min_cost

                                    cost_dict[col_num] = row_cost_dict2

                        # Make sure there is a diagonal element (for feasibility)
                        #
                        if (not row_cost_dict.has_key(row_num)):
                            row_cost_dict[row_num] = min_cost

                        # If more than MAX_ROW_ELEMENTS elements in row only take the
                        # largest (following an idea by William Winkler)
                        #
                        if (len(row_cost_dict) > MAX_ROW_ELEMENTS):
                            row_col_numbers = row_cost_dict.keys()
                            row_weights = row_cost_dict.values()
                            row_elem_list = map(None, row_weights,
                                                row_col_numbers)
                            row_elem_list.sort()

                            diag_weight = row_cost_dict[row_num]  # Keep diagonal
                            row_cost_dict = {row_num: diag_weight}

                            for (weight, col_num) in row_elem_list[-MAX_ROW_ELEMENTS:]:
                                row_cost_dict[col_num] = weight

                        # Insert row into cost dictionary
                        #
                        cost_dict[row_num] = row_cost_dict

                    # Get the final row and column numbers  - - - - - - - - - - - - - - -
                    #
                    row_numbers = cost_dict.keys()
                    col_numbers = {}
                    for row_dict in cost_dict.values():
                        col_numbers.update(row_dict)

                    col_numbers = col_numbers.keys()
                    row_numbers.sort()
                    col_numbers.sort()

                    # Check if number of rows and columns are equal - - - - - - - - - - -
                    #
                    if (len(row_numbers) != len(col_numbers)):
                        print 'error:Different number of rows (%i) and columns (%i)' \
                              % (len(row_numbers), len(col_numbers))
                        raise Exception

                    print '1:      Cost dictionary with %i rows/columns given to ' % \
                          (len(row_numbers)) + 'assignment method %s:' % (lap_method)
                    print '2:        Row numbers:    %s' % (str(row_numbers))
                    print '2:        Column numbers: %s' % (str(col_numbers))
                    print '2:        Minimal weight: %.3f' % (min_weight)
                    print '2:        Maximal weight: %.3f' % (max_weight)
                    print '3:        Cost dictionary: %s' % (str(cost_dict))
                    print '3:        Process type:    %s' % (process_type)

                    #################### START PARALLEL TEST CODE #######################

                    if (SAVE_PARALLEL_TEST_FILES == True):

                        tmp_list = cost_dict.keys()
                        tmp_list.sort()
                        tmp_str = str(sub_set_cnt)+':: '+ str(min_weight) + ' / ' + \
                                  str(max_weight) + ', ' + str(row_numbers) + ', ' + \
                                  process_type + '::'
                        for k in tmp_list:
                            tmp_list2 = cost_dict[k].items()
                            tmp_list2.sort()
                            tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / '

                        f = open('lap-calling-'+str(parallel.rank())+'-'+ \
                            str(parallel.size()),'a')
                        f.write(tmp_str + os.linesep)
                        f.close()

                    #################### END PARALLEL TEST CODE #########################

                    # Call the lap method which returns an assignment dictionary  - - - -
                    #
                    if (lap_method == 'auction'):
                        tmp_assign_dict = auction(cost_dict, min_weight, max_weight, \
                                                  row_numbers, col_numbers)
                    else:
                        print 'error:LAP method %s not implemented' % (lap_method)
                        raise Exception

            # If run in parallel, send temporary assignment dictionary to process 0
            #
            if (parallel.rank() > 0):
                tmp_time = time.time()
                parallel.send(tmp_assign_dict, 0)
                lap_comm_time += (time.time() - tmp_time)
                print '1:      Sent assignment dictionary with %i entries to process' \
                      % (len(tmp_assign_dict)) + ' 0'

        # Only process 0 inserts temporary assignment dictionary into results - - -
        #
        if (parallel.rank() == 0):

            # Receive assignment dictionary from other process if necessary
            #
            p = (sub_set_cnt % parallel.size())  # Process number to receive from

            if (p != 0):
                tmp_time = time.time()
                tmp_assign_dict = parallel.receive(p)
                lap_comm_time += (time.time() - tmp_time)
                print '1:    Received subset %i of %i assignment dictionary with ' % \
                      (sub_set_cnt, num_sub_sets) + '%i entries from process %i' % \
                      (len(tmp_assign_dict), p)

            # Post-process the assignment dictionary  - - - - - - - - - - - - - - - -
            #
            assign_pairs = {}

            for rec_num_b in tmp_assign_dict:
                rec_num_a = tmp_assign_dict[rec_num_b]

                # Now check if this record pair is in the original results dictionary
                #
                if (rec_num_a in results_dict):
                    row_dict = results_dict[rec_num_a]
                    if (rec_num_b in row_dict):
                        weight = row_dict[rec_num_b]

                        # Insert into dictionary of potential record pairs
                        #
                        assign_pairs[(rec_num_a, rec_num_b)] = weight

            #################### START PARALLEL TEST CODE ###########################

            if (SAVE_PARALLEL_TEST_FILES == True):

                tmp_list = tmp_assign_dict.items()
                tmp_list.sort()
                tmp_list2 = assign_pairs.items()
                tmp_list2.sort()
                tmp_list3 = sub_sets[seed_row]
                tmp_list3.sort()

                f = open('assignments-'+str(parallel.rank())+'-'+ \
                     str(parallel.size()),'a')
                f.write(str(sub_set_cnt) + ', ' + str(seed_row) + os.linesep)
                f.write(str(tmp_list) + os.linesep)
                f.write(str(tmp_list2) + os.linesep)
                f.write(str(tmp_list3) + os.linesep)
                f.write(os.linesep)
                f.close()

            #################### END PARALLEL TEST CODE #############################

            # Sort the assigned pairs according to their weight
            #
            assign_weights = assign_pairs.values()  # Get the weights in a list
            assign_rec_pairs = assign_pairs.keys()  # And the record pairs

            assign_pair_list = map(None, assign_weights, assign_rec_pairs)
            assign_pair_list.sort()

            num_assigned_pairs = 0  # Number of assigned pairs for this sub-set
            dedup_check_rec_nums = {}  # Already assigned record numbers

            while (assign_pair_list != []):  # Now check all record pairs
                check_pair = assign_pair_list.pop()  # Largest weight record pair

                weight = check_pair[0]
                rec_num_a = check_pair[1][0]
                rec_num_b = check_pair[1][1]
                rec_pair = (rec_num_a, rec_num_b)

                # Now check if a record pair has already been used in an assignment
                # and for a deduplication process also check if any of the two
                # records has been used in an assignment
                #
                if ((process_type == 'linkage') and \
                    (rec_num_a not in used_rec_nums_a) and \
                    (rec_num_b not in used_rec_nums_b)) or \
                   ((process_type == 'deduplication') and \
                    (rec_num_a not in used_rec_nums) and \
                    (rec_num_b not in used_rec_nums)):

                    # For deduplication insert record numbers into used record numbers
                    #
                    if (process_type == 'deduplication'):
                        used_rec_nums[rec_num_a] = True
                        used_rec_nums[rec_num_b] = True
                    else:
                        used_rec_nums_a[rec_num_a] = True
                        used_rec_nums_b[rec_num_b] = True

                    if (rec_pair not in lap_results):
                        lap_results[rec_pair] = True
                        num_assigned_pairs += 1
                    else:
                        print 'warning:Record pair (%i,%i) already in LAP results' \
                              % (rec_num_a, rec_num_b)

            print '2:      Inserted %i (out of %i) record pairs into LAP ' % \
                  (num_assigned_pairs, len(tmp_assign_dict)) + 'results'

        sub_set_cnt += 1

        # Report progress every 10% (only if more than 100 sub-sets)  - - - - - - -
        #
        if (num_sub_sets >= 100) and \
           ((sub_set_cnt % int(num_sub_sets / 10)) == 0):
            used_time = time.time() - lap_lap_start_time
            perc_done = 100.0 * sub_set_cnt / num_sub_sets
            sub_set_time = used_time / sub_set_cnt
            todo_time = (num_sub_sets - sub_set_cnt) * sub_set_time

            used_time_string = output.time_string(used_time)
            todo_time_string = output.time_string(todo_time)
            sub_set_time_string = output.time_string(sub_set_time)

            print '1:      Processed %.1f%% (%i/%i) of sub-sets in %s' % \
                    (perc_done, sub_set_cnt, num_sub_sets, used_time_string) + \
                    ' (%s per sub-set)' % (sub_set_time_string)
            print '1:        Estimated %s until finished' % (todo_time_string)

    print '1:  Total number of assignments: %i' % (len(lap_results))
    print '1:    Number of rows in original results dictionary: %i' % \
          (len(results_dict))

    #################### START TEST CODE ########################################
    # Test if a record only appears once in the lap results dictionary
    #
    if (DO_TESTS == True) and (parallel.rank() == 0):
        if (process_type == 'deduplication'):
            test_dict = {}
            for (rec_a, rec_b) in lap_results:
                if (test_dict.has_key(rec_a)):
                    print 'warning:Record %i is already in the test dictionary' % \
                          (rec_a)+' rec_pair: (%i,%i)' % (rec_a, rec_b)
                else:
                    test_dict[rec_a] = True
                if (test_dict.has_key(rec_b)):
                    print 'warning:Record %i is already in the test dictionary' % \
                          (rec_b)+' rec_pair: (%i,%i)' % (rec_a, rec_b)
                else:
                    test_dict[rec_b] = True

        else:  # Linkage process
            test_dict_a = {}
            test_dict_b = {}
            for (rec_a, rec_b) in lap_results:
                if (test_dict_a.has_key(rec_a)):
                    print 'warning:Record %s is already in test dictionary A' % \
                          (str(rec_a))
                else:
                    test_dict_a[rec_a] = 1
                if (test_dict_b.has_key(rec_b)):
                    print 'warning:Record %s is already in test dictionary B' % \
                          (str(rec_b))
                else:
                    test_dict_b[rec_b] = 1

    #################### END TEST CODE ##########################################

    lap_stop_time = time.time()
    lap_lap_time = lap_stop_time - lap_lap_start_time
    lap_total_time = lap_stop_time - lap_start_time

    lap_pair_extract_time_string = output.time_string(lap_pair_extract_time)
    lap_subset_total_time_string = output.time_string(lap_subset_total_time)
    lap_lap_time_string = output.time_string(lap_lap_time)
    if (parallel.size() > 1):
        lap_comm_time_string = output.time_string(lap_comm_time)
    lap_total_time_string = output.time_string(lap_total_time)

    print '1:'
    print '1:  Finished linear record pair assignment procedure'
    print '1:    Time for extracting unique record pairs: %s' % \
          (lap_pair_extract_time_string)
    print '1:    Time for creating record sub-sets:       %s' % \
          (lap_subset_total_time_string)
    print '1:    Time for linear assignment algorithm:    %s' % \
          (lap_lap_time_string)
    if (parallel.size() > 1):
        print '1:    Time for communication:                  %s' % \
              (lap_comm_time_string)
    print '1:    Total time for linear assignment:        %s' % \
          (lap_total_time_string)
    print '1:'

    return lap_results
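
A hedged usage sketch (this helper is not part of lap.py; the record numbers
and weights are invented, and in febrl the results dictionary would come from
a classifier, see classification.py). It assumes a serial run, i.e.
parallel.size() == 1.

def do_lap_example():
    """Illustrative sketch only: one-to-one linkage of three record pairs."""

    results_dict = {0: {0: 0.9, 1: 0.4},  # {rec_num_a:{rec_num_b:weight}}
                    1: {0: 0.3, 1: 0.8},
                    2: {2: 0.7}}  # A unique pair, extracted in step 2

    lap_results = do_lap('auction', results_dict, 'linkage', 0.2)

    for (rec_num_a, rec_num_b) in lap_results:
        print 'Record %i linked to record %i' % (rec_num_a, rec_num_b)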
Example #4
0
File: lap.py Project: PpKarOn/febrl
def do_lap(lap_method, results_dict, process_type, threshold):
  """Linear sum assignment procedure.

     This routine calculates a linear assignments for one-to-one matching.

     The routine do_lap does all kinds of preprocessing, including the
     extraction of unique record pairs (which can be removed before the lap is
     applied) and the extraction of sub-set which can be solved independently.
     These sub-sets are then given to the chosen lap routine.

     The routine takes as input a results dictionary, as produced by a
     classifier (see classification.py), and returns a dictionary with the
     assigned record pair numbers as keys and the corresponding weight as
     values.

     Possible methods are 'auction'

     The process_type attribute must either be set to 'deduplication' or to
     'linkage' in order to be able to preprocess the classifier data prior to
     the lap procedure.
  """

  if (lap_method not in ['auction']):
    print 'error:Illegal method for lap method: %s' % (str(lap_method))
    raise Exception

  if (process_type not in ['deduplication', 'linkage']):
    print 'error:Illegal value for attribute "process_type": %s' %\
          (str(process_type))
    raise Exception

  if (results_dict == {}):
    print 'error:Empty results dictionary'
    raise Exception

  lap_start_time = time.time()  # Start timer

  lap_results = {}  # Result dictionary with final record pair matches

  # Make one (or two) disctionary of all assigned rercord numbers
  #
  if (process_type == 'deduplication'):
    used_rec_nums = {}
  else:
    used_rec_nums_a = {}
    used_rec_nums_b = {}

  # Make sure the threshold is a number if it is defined
  #
  if (threshold != None):
    if (not (isinstance(threshold, int) or isinstance(threshold, float))):
      print 'error:Threshold is not a number: %s' % (str(threshold))
      raise Exception

  print '1:  Start linear assignment procedure using method: %s' % (lap_method)
  print '1:    Original length of results dictionary: %i' % (len(results_dict))

  # Step 1: Filter out record pairs with weight lower than the threshold  - - -
  #
  if (threshold != None):
    print '1:    Remove record pairs with weight less than: %f' % (threshold)
  else:
    threshold = -999999999999.999  # Make it a very very small number

  work_dict = {}  # Make an empty working dictionary

  for row_num in results_dict:  # Loop over all record numbers (keys)
    row_dict = results_dict[row_num]  # Get corresponding record dictionary

    new_row_dict = {}  # Start a new record dictionary

    for col_num in row_dict:  # Loop over all records in this dictionary
      weight = row_dict[col_num]

      if (weight >= threshold):
        new_row_dict[col_num] = weight  # Copy to new dictionary

    if (new_row_dict != {}):  # Only insert non empty dictionaries
      work_dict[row_num] = new_row_dict

  results_len = len(work_dict)  # Save results length (after filtering)

  if (threshold > -999999999999.999):
    print '1:    Length of working dictionary after filtering: %i' % \
          (results_len)

  # Step 2: Remove all matches (record pairs) which are unique  - - - - - - - -
  #         (i.e. which don't have matches with other records)
  #
  row_num_dict = {}  # Count occurences of record numbers in rows
  col_num_dict = {}  # Count occurences of record numbers in columns

  for row_num in work_dict:  # First count occurences of rows and columns

    # Insert a count for the row number
    #
    row_num_dict[row_num] = row_num_dict.get(row_num, 0) + 1

    row_dict = work_dict[row_num]

    for col_num in row_dict:

      # Increase a count for a column number
      #
      col_num_dict[col_num] = col_num_dict.get(col_num, 0) + 1

      if (process_type == 'deduplication'):

        # For deduplication, insert symmetric record numbers as well
        #
        row_num_dict[col_num] = row_num_dict.get(col_num, 0) + 1
        col_num_dict[row_num] = col_num_dict.get(row_num, 0) + 1
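
        # (Added note: e.g. the deduplication pair (3, 9) tallies records 3
        #  and 9 in both the row and the column counts, so a pair is only
        #  'unique' if neither record occurs in any other pair.)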

  for row_num in work_dict.keys():  # Second, remove unique rows and columns

    row_dict = work_dict[row_num]  # Get corresponding record dictionary

    if (len(row_dict) == 1):  # Only one record pair for this record

      col_num, weight = row_dict.items()[0]  # Get the only element in row

      if (row_num_dict[row_num] == 1) and (col_num_dict[col_num] == 1):

        #################### START TEST CODE ##################################

        if (DO_TESTS == True):
          if (process_type == 'deduplication'):
            if (row_num in used_rec_nums):
              print 'warning:Record number %i already used for deduplication' \
                    % (row_num)
            if (col_num in used_rec_nums):
              print 'warning:Record number %i already used for deduplication' \
                    % (col_num)
          else:
            if (row_num in used_rec_nums_a):
              print 'warning:Record number A %i already used for linkage' % \
                    (row_num)
            if (col_num in used_rec_nums_b):
              print 'warning:Record number B %i already used for linkage' % \
                    (col_num)

        #################### END TEST CODE ##################################

        lap_results[(row_num,col_num)] = True  # Insert into final results
        del work_dict[row_num]  # And delete the record in the results

        if (process_type == 'deduplication'):
          used_rec_nums[row_num] = True
          used_rec_nums[col_num] = True
        else:
          used_rec_nums_a[row_num] = True
          used_rec_nums_b[col_num] = True

  print '1:    Found and extracted %i unique record ' % (len(lap_results)) + \
        'pairs in results dictionary'

  for rec_pair in lap_results:
    print '3:      %s' % (str(rec_pair))
  print '3:'

  lap_pair_extract_time = time.time() - lap_start_time

  #################### START PARALLEL TEST CODE ###########################

  if (SAVE_PARALLEL_TEST_FILES == True):
    tmp_list = lap_results.items()
    tmp_list.sort()
    f = open('one2one-unique-dedup-'+str(parallel.rank())+'-'+ \
             str(parallel.size()),'w')
    for c in tmp_list:
      f.write(str(c)+os.linesep)
    f.close()

    tmp_list = work_dict.keys()
    tmp_list.sort()
    f = open('work-dict-'+str(parallel.rank())+'-'+str(parallel.size()),'w')
    for c in tmp_list:
      cc = work_dict[c].items()
      cc.sort()
      f.write(str(c)+':: '+str(cc)+os.linesep)
    f.close()

  #################### END PARALLEL TEST CODE #############################

  #################### START TEST CODE ########################################
  # Test if a record only appears once in the lap results dictionary
  #
  if (DO_TESTS == True):
    if (process_type == 'deduplication'):
      test_dict = {}
      for (rec_a, rec_b) in lap_results:
        if (test_dict.has_key(rec_a)):
          print 'warning:Record %s is already in the test dictionary' % \
                (str(rec_a))
        else:
          test_dict[rec_a] = 1
        if (test_dict.has_key(rec_b)):
          print 'warning:Record %s is already in the test dictionary' % \
                (str(rec_b))
        else:
          test_dict[rec_b] = 1

    else:  # Linkage process
      test_dict_a = {}
      test_dict_b = {}
      for (rec_a, rec_b) in lap_results:
        if (test_dict_a.has_key(rec_a)):
          print 'warning:Record %s is already in test dictionary A' % \
                (str(rec_a))
        else:
          test_dict_a[rec_a] = 1
        if (test_dict_b.has_key(rec_b)):
          print 'warning:Record %s is already in test dictionary B' % \
                (str(rec_b))
        else:
          test_dict_b[rec_b] = 1

  #################### END TEST CODE ##########################################

  if (len(work_dict) == 0):  # All record pairs are processed - - - - - - - - -
    return lap_results

  print '1:    Remaining number of records in working dictionary: %i' % \
        (len(work_dict)) + ' (down from: %i)' % (results_len)

  # Step 3: Find connected sub-sets in the results dictionary - - - - - - - - -
  #         (using breadth-first search)
  #
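  # (Added note: rows that share a column belong to the same sub-set. For
  #  example, work_dict = {0: {7: w}, 1: {7: w}, 2: {9: w}} yields the
  #  sub-sets [0, 1] (connected through column 7) and [2], which can then
  #  be solved independently.)
  #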
  visited =  {}  # Dictionary which will contain all so far visited rows
  sub_sets = {}  # Dictionary which will contain the sub-sets extracted 

  print '1:    Find connected sub-graphs in results dictionary'

  lap_subset_start_time = time.time()

  max_sub_set_length = -1
  num_visited = 0  # Number of rows visited so far
  row_num_done = 0
  work_dict_len = len(work_dict)

  work_dict_rows = work_dict.keys()
  work_dict_rows.sort()

  # Create a column oriented work dictionary  - - - - - - - - - - - - - - - - -
  #
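  # (Added note: this inverts work_dict so that all rows sharing a column
  #  can be looked up directly, e.g. work_dict = {0: {7: w}, 1: {7: w}}
  #  gives col_work_dict = {7: {0: True, 1: True}}.)
  #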
  col_work_dict = {}

  for row_num in work_dict_rows:  # Loop over all rows
    row_dict = work_dict[row_num]

    for col_num in row_dict:
      col_dict = col_work_dict.get(col_num,{})
      col_dict[row_num] = True  # Only position is needed, but not the weight
      col_work_dict[col_num] = col_dict

  for row_num in work_dict_rows:  # Loop over all rows

    if (not visited.has_key(row_num)):  # This row has not been visited yet

      visited[row_num] = row_num  # Mark visited as 'seeding' row
      num_visited += 1
      print '2:      Create sub-set with seeding record %i' % (row_num)

      process_queue = [row_num]  # Start a new queue of rows to process
      row_sub_set = {row_num:1}  # Row numbers connected to this row

      while (process_queue != []): # Process rows until all connected rows done
        print '3:        Process queue: %s' % (str(process_queue))

        next_row = process_queue.pop(0)  # Get and remove first row to process
        row_col_numbers = work_dict[next_row].keys()  # Get columns in this row

        # For deduplication, also insert the row number into the list of
        # column numbers
        #
        if (process_type == 'deduplication'):
          row_col_numbers.append(next_row)

        print '3:          Row %i with column numbers: %s' % \
              (next_row, str(row_col_numbers))

        # Get the row numbers from all column numbers
        #
        for col_num in row_col_numbers:

          # Get list of all row numbers in this column
          #
          row_num_dict = col_work_dict.get(col_num, {})
          row_num_list = row_num_dict.keys()

          if (process_type == 'deduplication') and (col_num in work_dict) and \
            (col_num not in row_num_list):
            row_num_list.append(col_num)

          print '3:          Column: %i with row numbers: %s' % \
                (col_num, str(row_num_list))

          for row_num2 in row_num_list:
            row_sub_set[row_num2] = 1
            if (not visited.has_key(row_num2)):  # Check if it's a new row
              process_queue.append(row_num2)
              print '3:          Appended row number %i to process queue' % \
                    (row_num2)

              visited[row_num2] = row_num  # Mark row as visited by seeding row
              num_visited += 1
              print '3:          Row %i connected to row %i' % \
                    (row_num2, row_num)

      sub_sets[row_num] = row_sub_set.keys()  # Only store keys

      if (len(row_sub_set) > max_sub_set_length):
        max_sub_set_length = len(row_sub_set)

      print '3:        Sub-set contains records: %s' % \
            (str(row_sub_set.keys()))

    row_num_done += 1

    # Now determine timing and print progress report (every 10%)  - - - - - - -
    # (only if more than 100 records in the work dictionary)
    #
    if (work_dict_len >= 100) and (row_num_done % int(work_dict_len/10) == 0):
      used_time =     time.time() - lap_subset_start_time
      perc_done =     100.0 * row_num_done / work_dict_len
#      todo_time =     (work_dict_len - row_num_done) * \  #################
#                      (used_time / row_num_done)
      todo_time =     (work_dict_len - num_visited) * \
                      (used_time / row_num_done)

      used_time_string =       output.time_string(used_time)
      todo_time_string =       output.time_string(todo_time)

      print '1:      Processed %.1f%% of records in %s (%i/%i records ' % \
            (perc_done, used_time_string, num_visited, work_dict_len) + \
            'visited)'
      print '1:        Estimated %s until finished' % (todo_time_string)

  del col_work_dict  # Delete the column oriented work dictionary

  num_sub_sets = len(sub_sets)  # Get the total number of sub-sets

  lap_subset_total_time = time.time() - lap_subset_start_time
  lap_subset_total_time_string = output.time_string(lap_subset_total_time)

  print '1:    Extracted %i sub-sets in %s' % \
        (num_sub_sets, lap_subset_total_time_string)
  print '1:      Longest sub-set contains %i rows' % (max_sub_set_length)

  #################### START TEST CODE ########################################
  # Test if all the sub-sets are mutually exclusive, and if the seed rows are
  # in the sub-set record lists
  #
  if (DO_TESTS == True):
    for seed_row in sub_sets:
      row_list = sub_sets[seed_row]
      if (seed_row not in row_list):
        print 'warning:Seed row %s not in sub-set row list: %s' % \
              (str(seed_row), str(row_list))
      for rec_num in row_list:
        for seed_row2 in sub_sets:
          row_list2 = sub_sets[seed_row2]
          if (seed_row != seed_row2):  # Don't test itself
            if (rec_num in row_list2):
              print 'warning:Record %s in more than one sub-set: %s, %s' % \
                    (str(rec_num), str(row_list), str(row_list2))

  #################### END TEST CODE ##########################################

  #################### START PARALLEL TEST CODE ###########################

  if (SAVE_PARALLEL_TEST_FILES == True):
    tmp_list = sub_sets.keys()
    tmp_list.sort()
    f = open('sub-sets-'+str(parallel.rank())+'-'+ \
             str(parallel.size()),'w')
    for s in tmp_list:
      tmp_sub_set = sub_sets[s]
      tmp_sub_set.sort()

      f.write(str(s)+'::'+str(tmp_sub_set)+os.linesep)
    f.close()

  #################### END PARALLEL TEST CODE #############################

  # Now loop over all sub-sets  - - - - - - - - - - - - - - - - - - - - - - - -
  # (pre-process them first before giving them to the actual linear assignment
  # method)
  #
  lap_lap_start_time = time.time()
  lap_comm_time = 0.0

  sub_set_cnt = 0  # A round robin counter, used for parallelism

  sub_set_rows = sub_sets.keys()
  sub_set_rows.sort()  # Needed to make the same on all processes

  for seed_row in sub_set_rows:

    # Distribute sub-sets equally to all processors
    #
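    # (Added note: distribution is round robin, e.g. with three processes,
    #  process 0 handles sub-sets 0, 3, 6, ..., process 1 handles 1, 4,
    #  7, ..., and so on; process 0 collects and post-processes all results.)
    #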
    if ((sub_set_cnt % parallel.size()) == parallel.rank()):

      row_list = sub_sets[seed_row]
      row_list.sort()

      print '1:'
      print '1:    Sub-set %i of %i with seed row %i contains %i rows' % \
            (sub_set_cnt, num_sub_sets, seed_row, len(row_list))
      print '3:      Sub-set rows:  %s' % (str(row_list))

      if (len(row_list) == 1):  # Special case: One row only  - - - - - - - - -
        max_weight = -99999.9
        max_col = -1
        row_dict = work_dict[row_list[0]]  # Get the dictionary for this row

        # Find element with largest weight
        #
        for col_num in row_dict:
          weight = row_dict[col_num]
          if (weight > max_weight):
            max_weight = weight
            max_col = col_num

        # Assignment dictionary is of form col_num:row_num
        #
        tmp_assign_dict = {max_col:row_list[0]}  # Make record pair dictionary

        print '2:      Special case sub-set with one row only, ' + \
              'assignment pair: (%i,%i)' % (row_list[0], max_col)

      else:  # General case with more than one row  - - - - - - - - - - - - - -

        # Get minimal and maximal weights, and lists with row and column
        # numbers
        #
        min_weight =  999999.9
        max_weight = -999999.9
        col_numbers = {}
        row_col_numbers = {}

        for row_num in row_list:  # Loop over rows in this sub-set
          row_dict = work_dict[row_num]  # Get the dictionary for this row
          row_col_numbers[row_num] = 1

          for col_num in row_dict:
            weight = row_dict[col_num]
            col_numbers[col_num] = 1
            row_col_numbers[col_num] = 1

            if (weight < min_weight):
              min_weight = weight
            if (weight > max_weight):
              max_weight = weight

        print '3:      Minimal and maximal weight: %.3f / %.3f' % \
              (min_weight, max_weight)

        row_numbers = work_dict.keys()
        col_numbers = col_numbers.keys()
        row_numbers.sort()
        col_numbers.sort()
        num_rows = len(row_numbers)
        num_cols = len(col_numbers)

        row_col_numbers = row_col_numbers.keys()
        row_col_numbers.sort()

        #print '1:      Row numbers:    %s' % (str(row_numbers))
        #print '1:      Column numbers: %s' % (str(col_numbers))
        #print '1:      Row/column numbers: %s' % (str(row_col_numbers))
        #print '1:      Number of unique weights: %i' % (len(weight_dict))

        # Deal with the special case that there is only one column number - - -
        #
        if (num_cols == 1):
          max_weight = -99999.9
          max_row = -1

          col_num = col_numbers[0]  # Get the column number

          # Find element with largest weight
          #
          for row_num in row_list:  # Loop over rows

            # Get only weight in row
            #
            row_weight = work_dict[row_num].values()[0]

            if (row_weight > max_weight):
              max_weight = row_weight
              max_row = row_num

          # Assignment dictionary is of form col_num:row_num
          #
          tmp_assign_dict = {col_num:max_row}  # Make record pair dictionary

          print '2:      Special case sub-set with one column only, ' + \
                'assignment pair: (%i,%i)' % (max_row, col_num)

        else:  # General case with more than one row and column - - - - - - - -

          # Construct the cost dictionary - - - - - - - - - - - - - - - - - - -
          #
          cost_dict = {}
          dim = len(row_col_numbers)  # Final dimension of the LAP

          min_cost = -max_weight * (dim + 1)  # Based on the original weights
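
          # (Added note: the (dim + 1) factor is the standard scaling from
          #  Bertsekas' auction algorithm: with costs multiplied by (n + 1),
          #  a final integer epsilon of 1 corresponds to less than 1/n on
          #  the original costs, which guarantees optimality for integer
          #  weights; see the references in auction() above.)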

          for row_num in row_list:  # Loop over rows

            row_dict = work_dict[row_num]

            # Get the column numbers in this row
            #
            col_list = row_dict.keys()
            col_list.sort()

            row_cost_dict = cost_dict.get(row_num, {})

            for col_num in col_list:
              weight = row_dict[col_num]

              cost = weight * (dim + 1)  # Use original weights
              row_cost_dict[col_num] = cost  # And store into row dictionary

              # Insert symmetric element as well (if not on diagonal)
              #
              if (row_num != col_num):
                row_cost_dict2 = cost_dict.get(col_num,{})

                if (row_num not in row_cost_dict2):  # Only insert if not there

                  if (process_type == 'deduplication'):
                    row_cost_dict2[row_num] = cost  # Insert symmetric cost
                  else:  # Linkage process
                    row_cost_dict2[row_num] = min_cost  # Insert minimal cost

                  # And insert diagonal element if there is none
                  #
                  if (not row_cost_dict2.has_key(col_num)):
                    row_cost_dict2[col_num] = min_cost

                  cost_dict[col_num] = row_cost_dict2

            # Make sure there is a diagonal element (for feasibility)
            #
            if (not row_cost_dict.has_key(row_num)):
              row_cost_dict[row_num] = min_cost
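
            # (Added note: the diagonal element row_num:row_num with minimal
            #  cost keeps the symmetric problem square and feasible; it acts
            #  as a 'no match' fallback that the auction algorithm will only
            #  use when no better column is available.)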

            # If a row has more than MAX_ROW_ELEMENTS elements, only keep the
            # largest ones (following an idea by William Winkler)
            #
            if (len(row_cost_dict) > MAX_ROW_ELEMENTS):
              row_col_numbers = row_cost_dict.keys()
              row_weights =     row_cost_dict.values()
              row_elem_list = map(None, row_weights, row_col_numbers)
              row_elem_list.sort()
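
              # (Added note: in Python 2, map(None, a, b) is equivalent to
              #  zip(a, b); sorting the (weight, column) tuples puts the
              #  largest weights at the end, so the slice below keeps the
              #  MAX_ROW_ELEMENTS heaviest elements.)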

              diag_weight = row_cost_dict[row_num]  # Keep diagonal element
              row_cost_dict = {row_num:diag_weight}

              ##print '   '
              ##print '****** row_elem_list: %s' % (str(row_elem_list))
              ##print '   '

              for (weight, col_num) in row_elem_list[-MAX_ROW_ELEMENTS:]:
                row_cost_dict[col_num] = weight

            # Insert row into cost dictionary
            #
            cost_dict[row_num] = row_cost_dict

          # Get the final row and column numbers  - - - - - - - - - - - - - - -
          #
          row_numbers = cost_dict.keys()
          col_numbers = {}
          for row_dict in cost_dict.values():
            col_numbers.update(row_dict)

          col_numbers = col_numbers.keys()
          row_numbers.sort()
          col_numbers.sort()

          # Check if number of rows and columns are equal - - - - - - - - - - -
          #
          if (len(row_numbers) != len(col_numbers)):
            print 'error:Different number of rows (%i) and columns (%i)' \
                  % (len(row_numbers), len(col_numbers))
            raise Exception

          print '1:      Cost dictionary with %i rows/columns given to ' % \
                (len(row_numbers)) + 'assignment method %s:' % (lap_method)
          print '2:        Row numbers:    %s' % (str(row_numbers))
          print '2:        Column numbers: %s' % (str(col_numbers))
          print '2:        Minimal weight: %3f' % (min_weight)
          print '2:        Maximal weight: %3f' % (max_weight)
          print '3:        Cost dictionary: %s' % (str(cost_dict))
          print '3:        Process type:    %s' % (process_type)

          #################### START PARALLEL TEST CODE #######################

          if (SAVE_PARALLEL_TEST_FILES == True):

            tmp_list = cost_dict.keys()
            tmp_list.sort()
            tmp_str = str(sub_set_cnt)+':: '+ str(min_weight) + ' / ' + \
                      str(max_weight) + ', ' + str(row_numbers) + ', ' + \
                      process_type + '::'
            for k in tmp_list:
              tmp_list2 = cost_dict[k].items()
              tmp_list2.sort()
              tmp_str = tmp_str + ' ' + str(tmp_list2) + ' / '

            f = open('lap-calling-'+str(parallel.rank())+'-'+ \
                str(parallel.size()),'a')
            f.write(tmp_str+os.linesep)
            f.close()

          #################### END PARALLEL TEST CODE #########################

          # Call the lap method which returns an assignment dictionary  - - - -
          #
          if (lap_method == 'auction'):
            tmp_assign_dict = auction(cost_dict, min_weight, max_weight, \
                                  row_numbers, col_numbers)
          else:
            print 'error:LAP method %s not implemented' % (lap_method)
            raise Exception

      # If run in parallel, send temporary assignment dictionary to process 0
      #
      if (parallel.rank() > 0):
        tmp_time = time.time()
        parallel.send(tmp_assign_dict, 0)
        lap_comm_time += (time.time() - tmp_time)
        print '1:      Sent assignment dictionary with %i entries to process' \
              % (len(tmp_assign_dict)) + ' 0'

    # Only process 0 inserts temporary assignment dictionary into results - - -
    #
    if (parallel.rank() == 0):

      # Receive assignment dictionary from other process if necessary
      #
      p = (sub_set_cnt % parallel.size())  # Process number to receive from

      if (p != 0):
        tmp_time = time.time()
        tmp_assign_dict = parallel.receive(p)
        lap_comm_time += (time.time() - tmp_time)
        print '1:    Received subset %i of %i assignment dictionary with ' % \
              (sub_set_cnt, num_sub_sets) + '%i entries from process %i' % \
              (len(tmp_assign_dict), p)

      # Post-process the assignment dictionary  - - - - - - - - - - - - - - - -
      #
      assign_pairs = {}

      for rec_num_b in tmp_assign_dict:
        rec_num_a = tmp_assign_dict[rec_num_b]

        # Now check if this record pair is in the original results dictionary
        #
        if (rec_num_a in results_dict):
          row_dict = results_dict[rec_num_a]
          if (rec_num_b in row_dict):
            weight = row_dict[rec_num_b]

            # Insert into dictionary of potential record pairs
            #
            assign_pairs[(rec_num_a, rec_num_b)] = weight

      #################### START PARALLEL TEST CODE ###########################

      if (SAVE_PARALLEL_TEST_FILES == True):

        tmp_list = tmp_assign_dict.items()
        tmp_list.sort()
        tmp_list2 = assign_pairs.items()
        tmp_list2.sort()
        tmp_list3 = sub_sets[seed_row]
        tmp_list3.sort()

        f = open('assignments-'+str(parallel.rank())+'-'+ \
             str(parallel.size()),'a')
        f.write(str(sub_set_cnt)+', '+str(seed_row)+os.linesep)
        f.write(str(tmp_list)+os.linesep)
        f.write(str(tmp_list2)+os.linesep)
        f.write(str(tmp_list3)+os.linesep)
        f.write(os.linesep)
        f.close()

      #################### END PARALLEL TEST CODE #############################

      # Sort the assigned pairs according to their weight
      #
      assign_weights = assign_pairs.values()  # Get the weights in a list
      assign_rec_pairs = assign_pairs.keys()  # And the record pairs

      assign_pair_list = map(None, assign_weights, assign_rec_pairs)
      assign_pair_list.sort()
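
      # (Added note: after sorting, assign_pair_list.pop() in the loop below
      #  returns the pair with the largest weight first, so assignments are
      #  accepted greedily in order of decreasing weight.)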

      num_assigned_pairs = 0  # Number of assigned pairs for this sub-set
      dedup_check_rec_nums = {}  # Already assigned record numbers

      while (assign_pair_list != []):  # Now check all record pairs
        check_pair = assign_pair_list.pop()  # Get largest weight record pair

        weight = check_pair[0]
        rec_num_a = check_pair[1][0]
        rec_num_b = check_pair[1][1]
        rec_pair = (rec_num_a, rec_num_b)

        # Now check if a record pair has already been used in an assignment
        # and for a deduplication process also check if any of the two
        # records has been used in an assignment
        #
        if ((process_type == 'linkage') and \
            (rec_num_a not in used_rec_nums_a) and \
            (rec_num_b not in used_rec_nums_b)) or \
           ((process_type == 'deduplication') and \
            (rec_num_a not in used_rec_nums) and \
            (rec_num_b not in used_rec_nums)):

          # For deduplication insert record numbers into used record numbers
          #
          if (process_type == 'deduplication'):
            used_rec_nums[rec_num_a] = True
            used_rec_nums[rec_num_b] = True
          else:
            used_rec_nums_a[rec_num_a] = True
            used_rec_nums_b[rec_num_b] = True

          if (rec_pair not in lap_results):
            lap_results[rec_pair] = True
            num_assigned_pairs += 1
          else:
            print 'warning:Record pair (%i,%i) already in LAP results' \
                  % (rec_num_a, rec_num_b)

      print '2:      Inserted %i (out of %i) record pairs into LAP ' % \
            (num_assigned_pairs, len(tmp_assign_dict)) + 'results'

    sub_set_cnt += 1

    # Report progress every 10% (only if more than 100 sub-sets)  - - - - - - -
    #
    if (num_sub_sets >= 100) and (sub_set_cnt % int(num_sub_sets / 10) == 0):
        used_time =    time.time() - lap_lap_start_time
        perc_done =    100.0 * sub_set_cnt / num_sub_sets
        sub_set_time = used_time / sub_set_cnt
        todo_time =    (num_sub_sets - sub_set_cnt) * sub_set_time

        used_time_string =    output.time_string(used_time)
        todo_time_string =    output.time_string(todo_time)
        sub_set_time_string = output.time_string(sub_set_time)

        print '1:      Processed %.1f%% (%i/%i) of sub-sets in %s' % \
                (perc_done, sub_set_cnt, num_sub_sets, used_time_string) + \
                ' (%s per sub-set)' % (sub_set_time_string)
        print '1:        Estimated %s until finished' % (todo_time_string)

  print '1:  Total number of assignments: %i' % (len(lap_results))
  print '1:    Number of rows in original results dictionary: %i' % \
        (len(results_dict))

  #################### START TEST CODE ########################################
  # Test if a record only appears once in the lap results dictionary
  #
  if (DO_TESTS == True) and (parallel.rank() == 0):
    if (process_type == 'deduplication'):
      test_dict = {}
      for (rec_a, rec_b) in lap_results:
        if (test_dict.has_key(rec_a)):
          print 'warning:Record %i is already in the test dictionary' % \
                (rec_a)+' rec_pair: (%i,%i)' % (rec_a, rec_b)
        else:
          test_dict[rec_a] = True
        if (test_dict.has_key(rec_b)):
          print 'warning:Record %i is already in the test dictionary' % \
                (rec_b)+' rec_pair: (%i,%i)' % (rec_a, rec_b)
        else:
          test_dict[rec_b] = True

    else:  # Linkage process
      test_dict_a = {}
      test_dict_b = {}
      for (rec_a, rec_b) in lap_results:
        if (test_dict_a.has_key(rec_a)):
          print 'warning:Record %s is already in test dictionary A' % \
                (str(rec_a))
        else:
          test_dict_a[rec_a] = 1
        if (test_dict_b.has_key(rec_b)):
          print 'warning:Record %s is already in test dictionary B' % \
                (str(rec_b))
        else:
          test_dict_b[rec_b] = 1

  #################### END TEST CODE ##########################################

  lap_stop_time = time.time()
  lap_lap_time = lap_stop_time - lap_lap_start_time
  lap_total_time = lap_stop_time - lap_start_time

  lap_pair_extract_time_string = output.time_string(lap_pair_extract_time)
  lap_subset_total_time_string = output.time_string(lap_subset_total_time)
  lap_lap_time_string =          output.time_string(lap_lap_time)
  if (parallel.size() > 1):
    lap_comm_time_string =         output.time_string(lap_comm_time)
  lap_total_time_string =        output.time_string(lap_total_time)

  print '1:'
  print '1:  Finished linear record pair assignment procedure'
  print '1:    Time for extracting unique record pairs: %s' % \
        (lap_pair_extract_time_string)
  print '1:    Time for creating record sub-sets:       %s' % \
        (lap_subset_total_time_string)
  print '1:    Time for linear assignment algorithm:    %s' % \
        (lap_lap_time_string)
  if (parallel.size() > 1):
    print '1:    Time for communication:                  %s' % \
          (lap_comm_time_string)
  print '1:    Total time for linear assignment:        %s' % \
        (lap_total_time_string)
  print '1:'

  return lap_results
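
A minimal usage sketch for do_lap (not part of the febrl sources; it assumes
the lap.py module above, together with its parallel and output helpers, is
importable, and that it runs on a single process):

# Hypothetical driver, in the same Python 2 style as the listing above.
#
# Classifier results for a linkage of two small data sets: row record
# numbers map to {column record number: matching weight}.
results_dict = {0: {7: 21.5, 8: 3.2},
                1: {8: 18.9},
                2: {9: 30.0}}

# One-to-one assignment with the auction method; pairs with a weight
# below 5.0 are filtered out first, which removes (0, 8).
matches = do_lap('auction', results_dict, 'linkage', 5.0)

# All three remaining pairs are unique after filtering, so the expected
# result is: {(0, 7): True, (1, 8): True, (2, 9): True}
print matches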