def test_parse_feat_str_parse_valid_2(self):
     feature_string = "jaccard(qgm_3(ltuple['zipcode']), qgm_3(ltuple['zipcode']))"
     p_dict = _parse_feat_str(feature_string, get_tokenizers_for_matching(),
                              get_sim_funs_for_matching())
     self.assertEqual(p_dict['left_attr_tokenizer'], 'qgm_3')
     self.assertEqual(p_dict['right_attr_tokenizer'], 'qgm_3')
     self.assertEqual(p_dict['simfunction'], 'jaccard')
 def test_parse_feat_str_parse_exp(self):
     feature_string = "jaccard~(qgm_3(ltuple[['zipcode']), qgm_3(rtuple['zipcode']))"
     p_dict = _parse_feat_str(feature_string, get_tokenizers_for_matching(),
                              get_sim_funs_for_matching())
     for k, v in six.iteritems(p_dict):
         if k != 'is_auto_generated':
             self.assertEqual(v, 'PARSE_EXP')
    def test_add_feature_invalid_df_columns(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')

        feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                                get_sim_funs_for_matching())
        # Passing an empty DataFrame as the feature table should be rejected
        # because it lacks the required feature-table columns.
        with self.assertRaises(AssertionError):
            add_feature(pd.DataFrame(), 'test', f_dict)
 def test_parse_feat_str_parse_valid_1(self):
     feature_string = "jaccard(qgm_3(ltuple['zipcode']), qgm_3(rtuple['zipcode']))"
     p_dict = _parse_feat_str(feature_string, get_tokenizers_for_matching(), get_sim_funs_for_matching())
     self.assertEqual(p_dict['left_attr_tokenizer'], 'qgm_3')
     self.assertEqual(p_dict['right_attr_tokenizer'], 'qgm_3')
     self.assertEqual(p_dict['simfunction'], 'jaccard')
     self.assertEqual(p_dict['left_attribute'], 'zipcode')
     self.assertEqual(p_dict['right_attribute'], 'zipcode')
 def test_add_feature_name_already_present(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = create_feature_table()
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     # Re-adding a feature under a name that is already present should fail.
     with self.assertRaises(AssertionError):
         add_feature(feature_table, 'test', f_dict)
 def test_get_features_invalid_ltable_rtable_switch(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     l_attr_types = au.get_attr_types(A)
     r_attr_types = au.get_attr_types(B)
     # The attribute correspondences are deliberately computed with the
     # tables switched, which get_features should reject.
     attr_corres = au.get_attr_corres(B, A)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     with self.assertRaises(AssertionError):
         afg.get_features(A, B, l_attr_types, r_attr_types, attr_corres,
                          tok, sim)
 def test_add_features_valid_1(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = get_features_for_matching(A, B, validate_inferred_attr_types=False)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(), get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1+1, len2)
     # use .iloc instead of the removed pandas .ix indexer
     self.assertEqual(
         feature_table['function'].iloc[-1](A.iloc[1], B.iloc[2]), 1.0)
 def test_add_features_valid_1(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     feature_table = get_features_for_matching(A, B)
     len1 = len(feature_table)
     feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
     f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(),
                             get_sim_funs_for_matching())
     add_feature(feature_table, 'test', f_dict)
     len2 = len(feature_table)
     self.assertEqual(len1 + 1, len2)
     # use .iloc instead of the removed pandas .ix indexer
     self.assertEqual(
         feature_table['function'].iloc[-1](A.iloc[1], B.iloc[2]), 1.0)
 def test_get_features_valid(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b, key='ID')
     l_attr_types = au.get_attr_types(A)
     r_attr_types = au.get_attr_types(B)
     attr_corres = au.get_attr_corres(A, B)
     tok = get_tokenizers_for_matching()
     sim = get_sim_funs_for_matching()
     feat_table = afg.get_features(A, B, l_attr_types, r_attr_types, attr_corres, tok, sim)
     self.assertEqual(isinstance(feat_table, pd.DataFrame), True)
     functions = feat_table['function']
     for f in functions:
         x = f(A.iloc[1], B.iloc[2])
         self.assertEqual(x >= 0, True)
    def test_add_feature_invalid_df_columns(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b, key='ID')

        feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
        f_dict = get_feature_fn(feature_string, get_tokenizers_for_matching(), get_sim_funs_for_matching())

        with self.assertRaises(AssertionError) as ctx:
            add_feature(pd.DataFrame(), 'test', f_dict)

        actual = str(ctx.exception)
        print(actual)
        expected = 'Feature table does not have all required columns\n ' \
                   'The following columns are missing: feature_name, left_attribute, right_attribute, ' \
                   'left_attr_tokenizer,' \
                   ' right_attr_tokenizer, simfunction, function, function_source, is_auto_generated'
        self.assertEqual(actual, expected)
 def test_get_tokenizers_for_matching_invalid(self):
     with self.assertRaises(AssertionError):
         tok.get_tokenizers_for_matching(None, None)
 def test_get_tokenizers_for_matching(self):
     x = tok.get_tokenizers_for_matching()
     self.assertEqual(isinstance(x, dict), True)
     input = 'data science'
     for name, value in six.iteritems(x):
         self.assertEqual(isinstance(value(input), list), True)
def get_features_for_matching(ltable, rtable):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.


        Further, this function also sets the following global variables:
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1 and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _match_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function and (2) is_auto_generated. The attribute function points
        to the actual Python function that implements the feature.
        Specifically, this function takes in two tuples (one from each input
        table) and returns a numeric value. The attribute is_auto_generated
        contains either True or False; the flag is True only if the feature
        was automatically generated by py_entitymatching. This is important
        because the flag is used to make some assumptions about the semantics
        of the similarity function used, and that information is then used
        for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`
     :meth:`py_entitymatching.get_tokenizers_for_matching`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A is not of type pandas DataFrame')
        raise AssertionError('Input table A is not of type pandas DataFrame')

    # # We expect the rtable to be of type pandas DataFrame
    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B is not of type pandas DataFrame')
        raise AssertionError('Input table B is not of type pandas DataFrame')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres, tok_funcs,
                                 sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table
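
A minimal usage sketch for the version of get_features_for_matching defined above, assuming it is called through py_entitymatching's public API as in the docstring example further below; the CSV paths and the 'ID' key column are illustrative, not part of the function's contract.

import py_entitymatching as em

# Load the two input tables; read_csv_metadata records 'ID' as each table's key.
A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')

# Auto-generate the feature table for matching A and B.
match_f = em.get_features_for_matching(A, B)

# Each row describes one feature; the 'function' column holds a callable that
# takes one tuple from each table and returns a numeric value.
print(match_f[['feature_name', 'simfunction', 'is_auto_generated']].head())
score = match_f['function'].iloc[0](A.iloc[0], B.iloc[0])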
def get_features_for_matching(ltable, rtable, validate_inferred_attr_types=True):
    """
    This function automatically generates features that can be used for
    matching purposes.

    Args:
        ltable,rtable (DataFrame): The pandas DataFrames for which the
            features are to be generated.
        validate_inferred_attr_types (boolean): A flag to indicate whether to 
            show the user the inferred attribute types and the features
            chosen for those types.

    Returns:
        A pandas DataFrame containing automatically generated features.

        Specifically, the DataFrame contains the following attributes:
        'feature_name', 'left_attribute', 'right_attribute',
        'left_attr_tokenizer', 'right_attr_tokenizer', 'simfunction',
        'function', 'function_source', and 'is_auto_generated'.


        Further, this function also sets the following global variables:
        _match_t, _match_s, _atypes1, _atypes2, and _match_c.

        The variable _match_t contains the tokenizers used and _match_s
        contains the similarity functions used for creating features.

        The variables _atypes1 and _atypes2 contain the attribute types for
        ltable and rtable respectively. The variable _match_c contains the
        attribute correspondences between the two input tables.

    Raises:
        AssertionError: If `ltable` is not of type pandas
            DataFrame.
        AssertionError: If `rtable` is not of type pandas
            DataFrame.
        AssertionError: If `validate_inferred_attr_types` is not of type
            boolean.

    Examples:
        >>> import py_entitymatching as em
        >>> A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
        >>> B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')
        >>> match_f = em.get_features_for_matching(A, B)

    Note:
        In the output DataFrame, two attributes demand some explanation:
        (1) function and (2) is_auto_generated. The attribute function points
        to the actual Python function that implements the feature.
        Specifically, this function takes in two tuples (one from each input
        table) and returns a numeric value. The attribute is_auto_generated
        contains either True or False; the flag is True only if the feature
        was automatically generated by py_entitymatching. This is important
        because the flag is used to make some assumptions about the semantics
        of the similarity function used, and that information is then used
        for scaling purposes.

    See Also:
     :meth:`py_entitymatching.get_attr_corres`, :meth:`py_entitymatching.get_attr_types`,
     :meth:`py_entitymatching.get_sim_funs_for_matching`
     :meth:`py_entitymatching.get_tokenizers_for_matching`

    """
    # Validate input parameters
    # # We expect the ltable to be of type pandas DataFrame
    validate_object_type(ltable, pd.DataFrame, 'Input table A')

    # # We expect the rtable to be of type pandas DataFrame
    validate_object_type(rtable, pd.DataFrame, 'Input table B')

    # # We expect the validate_inferred_attr_types to be of type boolean
    validate_object_type(validate_inferred_attr_types, bool, 'Validate inferred attribute type')

    # Get similarity functions for generating the features for matching
    sim_funcs = sim.get_sim_funs_for_matching()
    # Get tokenizer functions for generating the features for matching
    tok_funcs = tok.get_tokenizers_for_matching()

    # Get the attribute types of the input tables
    attr_types_ltable = au.get_attr_types(ltable)
    attr_types_rtable = au.get_attr_types(rtable)

    # Get the attribute correspondence between the input tables
    attr_corres = au.get_attr_corres(ltable, rtable)

    # Show the user inferred attribute types and features and request
    # user permission to proceed
    if validate_inferred_attr_types:
        # if the user does not want to proceed, then exit the function
        if validate_attr_types(attr_types_ltable, attr_types_rtable, attr_corres) is None:
            return

    # Get the features
    feature_table = get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

    # Export important variables to global name space
    em._match_t = tok_funcs
    em._match_s = sim_funcs
    em._atypes1 = attr_types_ltable
    em._atypes2 = attr_types_rtable
    em._match_c = attr_corres

    # Finally return the feature table
    return feature_table
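
To close the loop with the tests above, a hedged end-to-end sketch that combines this version of get_features_for_matching with get_feature_fn and add_feature; the 'zipcode' attribute, the feature name, and the CSV paths are assumptions carried over from those tests and the docstring example, not requirements of the API.

import py_entitymatching as em

A = em.read_csv_metadata('path_to_csv_dir/table_A.csv', key='ID')
B = em.read_csv_metadata('path_to_csv_dir/table_B.csv', key='ID')

# Skip the interactive confirmation of the inferred attribute types.
feature_table = em.get_features_for_matching(A, B,
                                             validate_inferred_attr_types=False)

# Compile a hand-written feature and append it to the auto-generated ones.
feature_string = "exact_match(ltuple['zipcode'], rtuple['zipcode'])"
f_dict = em.get_feature_fn(feature_string,
                           em.get_tokenizers_for_matching(),
                           em.get_sim_funs_for_matching())
em.add_feature(feature_table, 'zipcode_exact_match', f_dict)

# The added feature behaves like the auto-generated ones: it takes one tuple
# from each table and returns a numeric value (1.0 when the zipcodes match).
value = feature_table['function'].iloc[-1](A.iloc[1], B.iloc[2])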