Ejemplo n.º 1
0
    def genericClassification(classification_function='exact',
                              sim_vec_dict=sim_vec_dict,
                              sim_threshold=threshold,
                              min_sim_threshold=minthresh,
                              weight_vec=weightvec,
                              true_match_set=true_match_set):
        """Run the selected classifier on ``sim_vec_dict`` and time the call.

        The default values are read from names in the enclosing scope
        (``sim_vec_dict``, ``threshold``, ``minthresh``, ``weightvec``,
        ``true_match_set``) once, when this ``def`` statement executes.

        Args:
            classification_function: Selects the ``classification`` routine:
                'exact'     -> exactClassify(sim_vec_dict)
                'simthresh' -> thresholdClassify(sim_vec_dict, sim_threshold)
                'minsim'    -> minThresholdClassify(sim_vec_dict,
                                                    min_sim_threshold)
                'weightsim' -> weightedSimilarityClassify(sim_vec_dict,
                                                          weight_vec,
                                                          sim_threshold)
                'dt'        -> supervisedMLClassify(sim_vec_dict,
                                                    true_match_set)
            sim_vec_dict: Passed through to every classifier.
            sim_threshold: Used by 'simthresh' and 'weightsim'.
            min_sim_threshold: Used by 'minsim'.
            weight_vec: Used by 'weightsim'.
            true_match_set: Used by 'dt'.

        Returns:
            A ``(class_match_set, class_nonmatch_set, class_time)`` tuple,
            where ``class_time`` is the elapsed wall-clock time in seconds
            as measured with ``time.time()``.

        Raises:
            ValueError: If ``classification_function`` is not one of the
                names listed above.
        """
        start_time = time.time()

        if classification_function == 'exact':
            # Exact matching based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.exactClassify(sim_vec_dict)

        elif classification_function == 'simthresh':
            # Similarity threshold based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.thresholdClassify(sim_vec_dict, sim_threshold)

        elif classification_function == 'minsim':
            # Minimum similarity threshold based classification
            class_match_set1, class_nonmatch_set1 = \
                classification.minThresholdClassify(sim_vec_dict,
                                                    min_sim_threshold)

        elif classification_function == 'weightsim':
            # Weighted similarity threshold based classification
            #
            # Example weight vectors noted by the original author:
            #   uniform:  [1.0] * len(approx_comp_funct_list)
            #   lower weights for middle name and state:
            #             [2.0, 1.0, 2.0, 2.0, 2.0, 1.0]
            class_match_set1, class_nonmatch_set1 = \
                classification.weightedSimilarityClassify(sim_vec_dict,
                                                          weight_vec,
                                                          sim_threshold)

        elif classification_function == 'dt':
            # A supervised decision tree classifier
            class_match_set1, class_nonmatch_set1 = \
                classification.supervisedMLClassify(sim_vec_dict,
                                                    true_match_set)

        else:
            # Without this branch an unknown name fell through every test and
            # the return statement failed with UnboundLocalError.
            raise ValueError("unknown classification_function: %r"
                             % (classification_function,))

        class_time = time.time() - start_time

        return class_match_set1, class_nonmatch_set1, class_time
Ejemplo n.º 2
0
def classify(sim_vec_dict, classification_mode, threshold,
             approx_comp_funct_list, weights):
    """Classify ``sim_vec_dict`` with the chosen mode and report the time.

    Args:
        sim_vec_dict: Passed through to the selected ``classification``
            routine.
        classification_mode: One of
            'exact'      -> classification.exactClassify
            'similarity' -> classification.thresholdClassify
            'min_sim'    -> classification.minThresholdClassify
            'weighted'   -> classification.weightedSimilarityClassify
            'tree'       -> classification.supervisedMLClassify
        threshold: Threshold handed to the 'similarity', 'min_sim' and
            'weighted' classifiers; ignored by the other modes.
        approx_comp_funct_list: Not used by this function; kept so existing
            callers do not have to change.
        weights: Weight vector handed to the 'weighted' classifier.

    Returns:
        A ``(class_match_set, class_nonmatch_set, classification_time)``
        tuple; the time is elapsed wall-clock seconds from ``time.time()``.

    Raises:
        ValueError: If ``classification_mode`` is not one of the modes above.
    """
    start_time = time.time()

    if classification_mode == 'exact':
        # Exact matching based classification
        class_match_set, class_nonmatch_set = classification.exactClassify(
            sim_vec_dict)

    elif classification_mode == 'similarity':
        # Similarity threshold based classification
        class_match_set, class_nonmatch_set = classification.thresholdClassify(
            sim_vec_dict, threshold)

    elif classification_mode == 'min_sim':
        # Minimum similarity threshold based classification
        class_match_set, class_nonmatch_set = \
            classification.minThresholdClassify(sim_vec_dict, threshold)

    elif classification_mode == 'weighted':
        # Weighted similarity classification
        class_match_set, class_nonmatch_set = \
            classification.weightedSimilarityClassify(
                sim_vec_dict, weights, threshold)

    elif classification_mode == 'tree':
        # A supervised decision tree classifier
        # NOTE(review): true_match_set is not a parameter here; it is looked
        # up in module scope at call time -- confirm it is defined there.
        class_match_set, class_nonmatch_set = \
            classification.supervisedMLClassify(sim_vec_dict, true_match_set)

    else:
        # Previously this branch only printed a message, and the return
        # statement below then failed with UnboundLocalError.
        raise ValueError("classification error: unknown classification_mode %r"
                         % (classification_mode,))

    classification_time = time.time() - start_time
    print("classification time:", classification_time)

    # Return matching and non matching sets and time taken
    return class_match_set, class_nonmatch_set, classification_time
Ejemplo n.º 3
0
# Run the block comparison and record how long it takes.
start_time = time.time()
sim_vec_dict = comparison.compareBlocks(
    blockA_dict, blockB_dict, recA_dict, recB_dict, approx_comp_funct_list
)
comparison_time = time.time() - start_time

# -----------------------------------------------------------------------------
# Step 4: Classify the candidate pairs
#
# The timer is restarted here so the classification is measured separately
# from the comparison above.

start_time = time.time()

# Active classifier: exact matching based classification.
class_match_set, class_nonmatch_set = (
    classification.exactClassify(sim_vec_dict)
)

# *********** In lab 5, explore different similarity threshold values *********
#
# The alternatives below assign the same two names, so uncommenting one of
# them overwrites the exact-match result computed above.
#
# Alternative: similarity threshold based classification
#
# sim_threshold = 0.5
# class_match_set, class_nonmatch_set = \
#             classification.thresholdClassify(sim_vec_dict, sim_threshold)
#
# Alternative: minimum similarity threshold based classification
#
# min_sim_threshold = 0.5
# class_match_set, class_nonmatch_set = \
#             classification.minThresholdClassify(sim_vec_dict,
#                                                 min_sim_threshold)
# Run the block comparison.
# NOTE(review): start_time is not assigned in this fragment before the call,
# so comparison_time below is measured from whatever start_time already
# holds -- confirm the timer is started just before this point.
sim_vec_dict = comparison.compareBlocks(
    blockA_dict,
    blockB_dict,
    recA_dict,
    recB_dict,
    approx_comp_funct_list,
)

comparison_time = time.time() - start_time

# -----------------------------------------------------------------------------
# Step 4: Classify the candidate pairs

start_time = time.time()  # restart the timer for the classification step

# Exact matching based classification (the variant currently enabled).
class_match_set, class_nonmatch_set = classification.exactClassify(
    sim_vec_dict
)

# *********** In lab 5, explore different similarity threshold values *********
#
# Each commented-out variant below rebinds class_match_set and
# class_nonmatch_set when enabled.
#
# Similarity threshold based classification:
#
# sim_threshold = 0.5
# class_match_set, class_nonmatch_set = \
#             classification.thresholdClassify(sim_vec_dict, sim_threshold)
#
# Minimum similarity threshold based classification:
#
# min_sim_threshold = 0.5
# class_match_set, class_nonmatch_set = \
#             classification.minThresholdClassify(sim_vec_dict,
#                                                 min_sim_threshold)