def test_edit_distance_join_invalid_tokenizer(self):
     edit_distance_join(self.A,
                        self.B,
                        'A.id',
                        'B.id',
                        'A.attr',
                        'B.attr',
                        self.threshold,
                        tokenizer=[])
 def test_edit_distance_join_invalid_threshold_below(self):
     """Exercise edit_distance_join with a negative (out-of-range) threshold."""
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        -0.1)
 def test_edit_distance_join_numeric_r_join_attr(self):
     """Exercise edit_distance_join with a numeric right-side join attribute."""
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.int_attr',
                        self.threshold)
 def test_edit_distance_join_invalid_rtable(self):
     """Exercise edit_distance_join with a plain list as the right table
     instead of a dataframe."""
     edit_distance_join(self.A, [],
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold)
 def test_edit_distance_join_invalid_r_join_attr(self):
     """Exercise edit_distance_join with a nonexistent right join attribute."""
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.invalid_attr',
                        self.threshold)
 def test_edit_distance_join_invalid_r_join_attr(self):
     """Exercise edit_distance_join with a nonexistent right join attribute.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, self.B, 'A.id', 'B.id', 'A.attr',
                        'B.invalid_attr', self.threshold)
def test_valid_join(scenario,
                    tok,
                    threshold,
                    comp_op=DEFAULT_COMP_OP,
                    args=(),
                    convert_to_str=False):
    """Verify edit_distance_join output against a brute-force computation.

    Loads the two tables named by *scenario*, computes the expected set of
    matching key pairs by scoring the full cartesian product, then runs
    edit_distance_join and checks both the output schema and the returned
    pairs.

    Args:
        scenario: pair of (csv_path, key_attr, join_attr) triples for the
            left and right input tables.
        tok: tokenizer; used both by the join and the q-gram overlap filter.
        threshold: edit-distance threshold passed to the join.
        comp_op: comparison operator string (a key of COMP_OP_MAP).
        args: optional positional args forwarded to edit_distance_join, in
            order: allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
            r_out_prefix, out_sim_score.
        convert_to_str: if True, coerce the join attributes to string first.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__), ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__), rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if the allow_missing flag (args[0]) is set, every pair with a missing
    # join value on either side must appear in the output.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr])
                        or pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join(
                        (str(l_row[l_key_attr]), str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    # BUGFIX: DataFrame.drop no longer accepts the axis as a positional
    # argument (removed in pandas 2.0); use the columns keyword instead.
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')
    # BUGFIX: the original discarded the result of these two drops (drop is
    # not in-place by default), leaving the temporary key column in place;
    # drop in place as intended.
    ltable_not_missing.drop(columns='tmp_join_key', inplace=True)
    rtable_not_missing.drop(columns='tmp_join_key', inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_str = str(row[l_join_attr])
        r_str = str(row[r_join_attr])

        # empty strings are never produced by the join; skip them before
        # paying for tokenization.
        if len(l_str) == 0 or len(r_str) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(tok.tokenize(l_str), tok.tokenize(r_str)) > 0:
                expected_pairs.add(','.join(
                    (str(row[l_key_attr]), str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # use join function to obtain actual output pairs.
    actual_candset = edit_distance_join(ltable,
                                        rtable,
                                        l_key_attr,
                                        r_key_attr,
                                        l_join_attr,
                                        r_join_attr,
                                        threshold,
                                        comp_op,
                                        *args,
                                        tokenizer=tok)

    # the join must not permanently change the tokenizer's return-set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = ['_id']
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1 and args[1]:
        l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
        for attr in l_out_attrs:
            expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2 and args[2]:
        r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
        for attr in r_out_attrs:
            expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (the score column is emitted by
    # default when the flag is absent).
    if len(args) <= 5 or args[5]:
        expected_output_attrs.append('_sim_score')

    # verify whether the output table has the necessary attributes.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)

    actual_pairs = set()
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    # verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
 def time_edit_distance_qg2_3(self):
     """Timing benchmark: run edit_distance_join with threshold 3 on the
     configured tables (time_ prefix — presumably an asv benchmark hook)."""
     edit_distance_join(self.ltable, self.rtable, self.l_id_attr,
                        self.r_id_attr, self.l_join_attr, self.r_join_attr,
                        3)
def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP,
                    args=(), convert_to_str=False, data_limit=100000,
                    temp_dir=os.getcwd(),
                    output_file_path=default_output_file_path):
    """Verify disk_edit_distance_join against a brute-force computation.

    Runs the disk-backed join, re-runs the in-memory edit_distance_join on
    the same inputs, and checks that the CSV written to *output_file_path*
    has the expected schema and exactly the expected key pairs, and that
    both join variants agree.

    Args:
        scenario: pair of (csv_path, key_attr, join_attr) triples for the
            left and right input tables.
        tok: tokenizer; used both by the joins and the q-gram overlap filter.
        threshold: edit-distance threshold passed to the joins.
        comp_op: comparison operator string (a key of COMP_OP_MAP).
        args: optional positional args forwarded to the join functions, in
            order: allow_missing, l_out_attrs, r_out_attrs, l_out_prefix,
            r_out_prefix, out_sim_score.
        convert_to_str: if True, coerce the join attributes to string first.
        data_limit: in-memory tuple limit forwarded to the disk join.
        temp_dir: scratch directory for the disk join.
            NOTE(review): os.getcwd() is evaluated once at definition time,
            not per call — confirm this is intended.
        output_file_path: where the disk join writes its result.
    """
    (ltable_path, l_key_attr, l_join_attr) = scenario[0]
    (rtable_path, r_key_attr, r_join_attr) = scenario[1]

    # load input tables for the tests.
    ltable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      ltable_path))
    rtable = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                      rtable_path))

    if convert_to_str:
        dataframe_column_to_str(ltable, l_join_attr, inplace=True)
        dataframe_column_to_str(rtable, r_join_attr, inplace=True)

    missing_pairs = set()
    # if the allow_missing flag (args[0]) is set, every pair with a missing
    # join value on either side must appear in the output.
    if len(args) > 0 and args[0]:
        for l_idx, l_row in ltable.iterrows():
            for r_idx, r_row in rtable.iterrows():
                if (pd.isnull(l_row[l_join_attr]) or
                        pd.isnull(r_row[r_join_attr])):
                    missing_pairs.add(','.join((str(l_row[l_key_attr]),
                                                str(r_row[r_key_attr]))))

    # remove rows with missing value in join attribute and create new
    # dataframes consisting of rows with non-missing values.
    ltable_not_missing = ltable[pd.notnull(ltable[l_join_attr])].copy()
    rtable_not_missing = rtable[pd.notnull(rtable[r_join_attr])].copy()

    # generate cartesian product to be used as candset
    ltable_not_missing['tmp_join_key'] = 1
    rtable_not_missing['tmp_join_key'] = 1
    # BUGFIX: DataFrame.drop no longer accepts the axis as a positional
    # argument (removed in pandas 2.0); use the columns keyword instead.
    cartprod = pd.merge(
        ltable_not_missing[[l_key_attr, l_join_attr, 'tmp_join_key']],
        rtable_not_missing[[r_key_attr, r_join_attr, 'tmp_join_key']],
        on='tmp_join_key').drop(columns='tmp_join_key')
    # BUGFIX: the original discarded the result of these two drops (drop is
    # not in-place by default), leaving the temporary key column in place;
    # drop in place as intended.
    ltable_not_missing.drop(columns='tmp_join_key', inplace=True)
    rtable_not_missing.drop(columns='tmp_join_key', inplace=True)

    sim_measure_type = 'EDIT_DISTANCE'
    sim_func = get_sim_function(sim_measure_type)

    # apply sim function to the entire cartesian product to obtain
    # the expected set of pairs satisfying the threshold.
    cartprod['sim_score'] = cartprod.apply(
        lambda row: sim_func(str(row[l_join_attr]), str(row[r_join_attr])),
        axis=1)

    comp_fn = COMP_OP_MAP[comp_op]

    expected_pairs = set()
    overlap = get_sim_function('OVERLAP')
    for idx, row in cartprod.iterrows():
        l_str = str(row[l_join_attr])
        r_str = str(row[r_join_attr])

        # empty strings are never produced by the join; skip them before
        # paying for tokenization.
        if len(l_str) == 0 or len(r_str) == 0:
            continue

        # current edit distance join is approximate. It cannot find matching
        # strings which don't have any common q-grams. Hence, remove pairs
        # that don't have any common q-grams from expected pairs.
        if comp_fn(float(row['sim_score']), threshold):
            if overlap(tok.tokenize(l_str), tok.tokenize(r_str)) > 0:
                expected_pairs.add(','.join((str(row[l_key_attr]),
                                             str(row[r_key_attr]))))

    expected_pairs = expected_pairs.union(missing_pairs)

    orig_return_set_flag = tok.get_return_set()

    # Remove any previously existing output file so that the existence
    # check below actually validates this run.
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    # Use the disk join to process the input data. It returns a boolean.
    # NOTE(review): is_success is never asserted — consider checking it.
    is_success = disk_edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, data_limit,
                                         comp_op, *args,
                                         tokenizer=tok, temp_dir=temp_dir,
                                         output_file_path=output_file_path)
    # Use the in-memory edit distance join to get a dataframe to compare.
    no_disk_candset = edit_distance_join(ltable, rtable,
                                         l_key_attr, r_key_attr,
                                         l_join_attr, r_join_attr,
                                         threshold, comp_op,
                                         *args, tokenizer=tok)
    # Delete the _id column to make the schema consistent for comparison
    # (the disk version's expected schema below carries no _id column).
    if '_id' in no_disk_candset:
        del no_disk_candset['_id']

    # the joins must not permanently change the tokenizer's return-set flag.
    assert_equal(tok.get_return_set(), orig_return_set_flag)

    expected_output_attrs = []
    l_out_prefix = DEFAULT_L_OUT_PREFIX
    r_out_prefix = DEFAULT_R_OUT_PREFIX

    # Check for l_out_prefix in args.
    if len(args) > 3:
        l_out_prefix = args[3]
    expected_output_attrs.append(l_out_prefix + l_key_attr)

    # Check for r_out_prefix in args.
    if len(args) > 4:
        r_out_prefix = args[4]
    expected_output_attrs.append(r_out_prefix + r_key_attr)

    # Check for l_out_attrs in args.
    if len(args) > 1 and args[1]:
        l_out_attrs = remove_redundant_attrs(args[1], l_key_attr)
        for attr in l_out_attrs:
            expected_output_attrs.append(l_out_prefix + attr)

    # Check for r_out_attrs in args.
    if len(args) > 2 and args[2]:
        r_out_attrs = remove_redundant_attrs(args[2], r_key_attr)
        for attr in r_out_attrs:
            expected_output_attrs.append(r_out_prefix + attr)

    # Check for out_sim_score in args (the score column is emitted by
    # default when the flag is absent).
    if len(args) <= 5 or args[5]:
        expected_output_attrs.append('_sim_score')

    # Verify whether the output file was created.
    assert_equal(True, os.path.exists(output_file_path))

    # load the disk join's output for verification.
    actual_candset = pd.read_csv(output_file_path)

    # Comparing column header values.
    assert_list_equal(list(actual_candset.columns.values),
                      expected_output_attrs)
    assert_list_equal(list(no_disk_candset.columns.values),
                      list(actual_candset.columns.values))

    actual_pairs = set()
    no_disk_pairs = set()

    # Creating sets for comparing the data tuples.
    for idx, row in actual_candset.iterrows():
        actual_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                   str(row[r_out_prefix + r_key_attr]))))

    for idx, row in no_disk_candset.iterrows():
        no_disk_pairs.add(','.join((str(row[l_out_prefix + l_key_attr]),
                                    str(row[r_out_prefix + r_key_attr]))))

    # Verify whether the actual pairs and the expected pairs match.
    assert_equal(len(expected_pairs), len(actual_pairs))
    assert_equal(len(expected_pairs), len(no_disk_pairs))
    common_pairs = actual_pairs.intersection(expected_pairs)
    common_pairs_no_disk = no_disk_pairs.intersection(expected_pairs)
    assert_equal(len(common_pairs), len(expected_pairs))
    assert_equal(len(common_pairs_no_disk), len(expected_pairs))
 def test_edit_distance_join_invalid_comp_op_ge(self):
     """Exercise edit_distance_join with '>=' as the comparison operator."""
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold, '>=')
 def test_edit_distance_join_invalid_r_out_attr(self):
     """Exercise edit_distance_join requesting a nonexistent right-side
     output attribute."""
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold, self.comp_op,
                        False, ['A.attr'], ['B.invalid_attr'])
 def test_edit_distance_join_invalid_threshold_below(self):
     """Exercise edit_distance_join with a negative (out-of-range) threshold.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        -0.1)
 def test_edit_distance_join_invalid_tokenizer(self):
     """Exercise edit_distance_join with a plain list passed as the tokenizer.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold, tokenizer=[])
 def test_edit_distance_join_numeric_r_join_attr(self):
     """Exercise edit_distance_join with a numeric right-side join attribute.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, self.B, 'A.id', 'B.id', 'A.attr',
                        'B.int_attr', self.threshold)
 def test_edit_distance_join_invalid_comp_op_ge(self):
     """Exercise edit_distance_join with '>=' as the comparison operator.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold, '>=')
# Esempio n. 16
# 0
# NOTE(review): the two lines above are stray residue from the example
# aggregation this file was scraped from; converted to comments so they no
# longer break parsing.
 def time_edit_distance_qg2_3(self):
     """Timing benchmark: run edit_distance_join with threshold 3.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.ltable, self.rtable,
                        self.l_id_attr, self.r_id_attr,
                        self.l_join_attr, self.r_join_attr,
                        3)
 def test_edit_distance_join_invalid_r_out_attr(self):
     """Exercise edit_distance_join requesting a nonexistent right-side
     output attribute.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, self.B,
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold, self.comp_op,
                        False, ['A.attr'], ['B.invalid_attr'])
 def test_edit_distance_join_invalid_rtable(self):
     """Exercise edit_distance_join with a plain list as the right table.

     NOTE(review): duplicates an earlier method of the same name in this
     file; in a class body the later definition shadows the earlier one.
     """
     edit_distance_join(self.A, [],
                        'A.id', 'B.id',
                        'A.attr', 'B.attr',
                        self.threshold)