def init_properties(data_frame):
    """
    Initializes the catalog entry for a pandas DataFrame.

    The work is delegated to the catalog object's `init_properties`, which
    is expected to create an entry for the DataFrame with empty properties.

    Args:
        data_frame (DataFrame): DataFrame for which the properties must be
            initialized.

    Returns:
        The value relayed from the catalog object's `init_properties`
        (typically True if the initialization was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
    """
    # Reject anything that is not a pandas DataFrame before touching the
    # catalog.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas DataFrame'
        logger.error(error_message)
        raise AssertionError(error_message)

    # Hand over to the catalog singleton and relay whatever it reports.
    return Catalog.Instance().init_properties(data_frame)
def show_properties_for_id(object_id):
    """
    Prints the properties stored in the catalog for an object id.

    The object id is typically obtained by calling id(<object>), where the
    object could be a DataFrame.

    Args:
        object_id (int): The Python identifier of an object (typically a
            pandas DataFrame).
    """
    properties = Catalog.Instance().get_all_properties_for_id(object_id)

    # Header line: the id that was looked up.
    print('id: ' + str(object_id))

    # String values are printed verbatim; for any other object only its
    # Python id is shown.
    for name, value in six.iteritems(properties):
        if isinstance(value, six.string_types):
            line = name + ": " + value
        else:
            line = name + "(obj.id): " + str(id(value))
        print(line)
def is_dfinfo_present(data_frame):
    """
    Tells whether the catalog holds information about a DataFrame.

    Args:
        data_frame (DataFrame): The DataFrame whose presence in the catalog
            should be checked.

    Returns:
        The value relayed from the catalog object's
        `is_df_info_present_in_catalog` (True if the DataFrame is present
        in the catalog, else False).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
    """
    # Only pandas DataFrames are tracked by the catalog.
    if not isinstance(data_frame, pd.DataFrame):
        error_message = 'Input object is not of type pandas data frame'
        logger.error(error_message)
        raise AssertionError(error_message)

    # Delegate the lookup to the catalog singleton.
    return Catalog.Instance().is_df_info_present_in_catalog(data_frame)
def get_property(data_frame, property_name):
    """
    Retrieves the value of a named property of a pandas DataFrame from the
    catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property should
            be retrieved.
        property_name (string): The name of the property that should be
            retrieved.

    Returns:
        The value relayed from the catalog object's `get_property`
        (typically a string or a pandas DataFrame depending on the
        property name).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` information is not present in the
            catalog.
        KeyError: If requested property for the `data_frame` is not
            present in the catalog.
    """
    # Type checks on the inputs.
    if not isinstance(data_frame, pd.DataFrame):
        type_error = 'Input object is not of type pandas DataFrame'
        logger.error(type_error)
        raise AssertionError(type_error)

    if not isinstance(property_name, six.string_types):
        name_error = 'Property name is not of type string'
        logger.error(name_error)
        raise AssertionError(name_error)

    # The catalog is used both to validate presence and to fetch the value.
    current_catalog = Catalog.Instance()

    # The DataFrame itself must be known to the catalog.
    if not current_catalog.is_df_info_present_in_catalog(data_frame):
        missing_df = 'DataFrame information is not present in the catalog'
        logger.error(missing_df)
        raise KeyError(missing_df)

    # The requested property must exist for that DataFrame.
    if not current_catalog.is_property_present_for_df(data_frame,
                                                      property_name):
        missing_property = (
            'Requested metadata ( %s ) for the given DataFrame is not '
            'present in the catalog' % property_name)
        logger.error(missing_property)
        raise KeyError(missing_property)

    return current_catalog.get_property(data_frame, property_name)
def del_catalog():
    """
    Deletes the catalog for the current session.

    Returns:
        The value relayed from the catalog object's `del_catalog`
        (typically True if the deletion was successful).
    """
    # Delegate to the catalog singleton and pass its result through.
    return Catalog.Instance().del_catalog()
def del_property(data_frame, property_name):
    """
    Removes a single property of a pandas DataFrame from the catalog.

    Args:
        data_frame (DataFrame): The input DataFrame for which a property
            must be deleted from the catalog.
        property_name (string): The name of the property that should be
            deleted.

    Returns:
        The value relayed from the catalog object's `del_property`
        (typically True if the deletion was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` information is not present in the
            catalog.
        KeyError: If requested property for the DataFrame is not present
            in the catalog.
    """
    # Type checks on the inputs.
    if not isinstance(data_frame, pd.DataFrame):
        type_error = 'Input object is not of type pandas DataFrame'
        logger.error(type_error)
        raise AssertionError(type_error)

    if not isinstance(property_name, six.string_types):
        name_error = 'Property name is not of type string'
        logger.error(name_error)
        raise AssertionError(name_error)

    current_catalog = Catalog.Instance()

    # Nothing can be deleted for a DataFrame the catalog does not know.
    if not current_catalog.is_df_info_present_in_catalog(data_frame):
        missing_df = 'DataFrame information is not present in the catalog'
        logger.error(missing_df)
        raise KeyError(missing_df)

    # The property to delete must exist for that DataFrame.
    if not current_catalog.is_property_present_for_df(data_frame,
                                                      property_name):
        missing_property = (
            'Requested metadata ( %s ) for the given DataFrame is '
            'not present in the catalog' % property_name)
        logger.error(missing_property)
        raise KeyError(missing_property)

    # Perform the deletion and relay the catalog's result.
    return current_catalog.del_property(data_frame, property_name)
def get_catalog_len():
    """
    Reports the length (i.e. the number of entries) of the catalog.

    Returns:
        The value relayed from the catalog object's `get_catalog_len`
        (the number of entries in the catalog as an integer).
    """
    # Delegate to the catalog singleton and pass its result through.
    return Catalog.Instance().get_catalog_len()
def set_property(data_frame, property_name, property_value):
    """
    Stores a named property value for a pandas DataFrame in the catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property must
            be set.
        property_name (string): The name of the property to be set.
        property_value (object): The value of the property to be set. This
            is typically a string (such as key) or pandas DataFrame (such
            as ltable, rtable).

    Returns:
        The value relayed from the catalog object's `set_property`
        (typically True if the update was successful).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.

    Note:
        If the input DataFrame is not present in the catalog, this function
        first initializes an entry for it and then sets the given property.
    """
    # Type checks on the inputs.
    if not isinstance(data_frame, pd.DataFrame):
        type_error = 'Input object is not of type pandas data frame'
        logger.error(type_error)
        raise AssertionError(type_error)

    if not isinstance(property_name, six.string_types):
        name_error = 'Property name is not of type string'
        logger.error(name_error)
        raise AssertionError(name_error)

    current_catalog = Catalog.Instance()

    # Lazily create the catalog entry for DataFrames seen for the first
    # time.
    already_registered = current_catalog.is_df_info_present_in_catalog(
        data_frame)
    if not already_registered:
        current_catalog.init_properties(data_frame)

    return current_catalog.set_property(data_frame, property_name,
                                        property_value)
def is_catalog_empty():
    """
    Tells whether the catalog is empty.

    Returns:
        The value relayed from the catalog object's `is_catalog_empty`
        (True if the catalog is empty, else False).
    """
    # Delegate to the catalog singleton and pass its result through.
    return Catalog.Instance().is_catalog_empty()
def get_catalog():
    """
    Returns the catalog information for the current session.

    Returns:
        The value relayed from the catalog object's `get_catalog`: a Python
        dictionary whose keys are the Python identifiers of DataFrames
        (obtained by id(DataFrame object)) and whose values are their
        properties.
    """
    # Delegate to the catalog singleton and pass its result through.
    return Catalog.Instance().get_catalog()
def is_property_present_for_df(data_frame, property_name):
    """
    Checks if the given property is present for the given DataFrame in the
    catalog.

    Args:
        data_frame (DataFrame): The DataFrame for which the property must
            be checked for.
        property_name (string): The name of the property that should be
            checked for its presence for the DataFrame, in the catalog.

    Returns:
        The value relayed from the catalog object's
        `is_property_present_for_df` (True if the property is present for
        the given DataFrame).

    Raises:
        AssertionError: If `data_frame` is not of type pandas DataFrame.
        AssertionError: If `property_name` is not of type string.
        KeyError: If `data_frame` is not present in the catalog.
    """
    # Input validations
    # # We expect the input object to be of type pandas DataFrame.
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # # The input property name should be of type string
    if not isinstance(property_name, six.string_types):
        logger.error('The property name is not of type string.')
        raise AssertionError('The property name is not of type string.')

    # Get the catalog instance
    catalog = Catalog.Instance()

    # Check if the given DataFrame information is present in the catalog.
    # If not, raise an error. A plain truthiness test is used (rather than
    # an identity comparison with False) so that any falsy answer from the
    # catalog is treated as "not present", matching the other functions in
    # this module.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        logger.error('DataFrame information is not present in the catalog')
        raise KeyError('DataFrame information is not present in the catalog')

    # Call the underlying catalog object's function to check if the
    # property is present for the given DataFrame. Relay the return value
    # from that function.
    return catalog.is_property_present_for_df(data_frame, property_name)
def del_all_properties(data_frame):
    """
    Removes every property of a DataFrame from the catalog.

    Args:
        data_frame (DataFrame): Input DataFrame for which all the
            properties must be deleted from the catalog.

    Returns:
        The value relayed from the catalog object's `del_all_properties`
        (True if the deletion from the catalog was successful).

    Raises:
        AssertionError: If the `data_frame` is not of type pandas
            DataFrame.
        KeyError: If the DataFrame information is not present in the
            catalog.

    Note:
        This is not the same as init_properties. Here the DataFrame's entry
        is removed from the catalog, whereas init_properties adds the
        DataFrame (if it is not present in the catalog) and initializes its
        properties to an empty object (specifically, an empty Python
        dictionary).
    """
    # Only pandas DataFrames are tracked by the catalog.
    if not isinstance(data_frame, pd.DataFrame):
        type_error = 'Input object is not of type pandas data frame'
        logger.error(type_error)
        raise AssertionError(type_error)

    current_catalog = Catalog.Instance()

    # Refuse to delete properties of a DataFrame the catalog does not know.
    if not current_catalog.is_df_info_present_in_catalog(data_frame):
        missing_df = 'DataFrame information is not present in the catalog'
        logger.error(missing_df)
        raise KeyError(missing_df)

    return current_catalog.del_all_properties(data_frame)
def get_all_properties(data_frame):
    """
    Gets all the properties for a pandas DataFrame object from the catalog.

    Args:
        data_frame (DataFrame): DataFrame for which the properties must be
            retrieved.

    Returns:
        The value relayed from the catalog object's `get_all_properties`
        (a dictionary containing properties for the input pandas
        DataFrame).

    Raises:
        AssertionError: If the input object is not of type pandas
            DataFrame.
        KeyError: If the information about DataFrame is not present in the
            catalog.
    """
    # Validate input parameters
    # # The input object is expected to be of type DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        type_error = 'Input object is not of type pandas DataFrame'
        logger.error(type_error)
        raise AssertionError(type_error)

    # Get the catalog instance
    catalog = Catalog.Instance()

    # Check if the DataFrame information is present in the catalog. If not
    # raise an error.
    if not catalog.is_df_info_present_in_catalog(data_frame):
        missing_df = 'DataFrame information is not present in the catalog'
        logger.error(missing_df)
        raise KeyError(missing_df)

    # Retrieve the properties for the DataFrame from the catalog and return
    # it back to the user.
    return catalog.get_all_properties(data_frame)
from py_entitymatching.catalog.catalog import Catalog __version__ = '0.1.0' _catalog = Catalog.Instance() # downsampling related methods from py_entitymatching.sampler.down_sample import down_sample # # io related methods # from py_entitymatching.io.parsers import read_csv_metadata, to_csv_metadata from py_entitymatching.io.pickles import load_object, load_table, save_object, save_table # # import catalog related methods from py_entitymatching.catalog.catalog_manager import get_property, get_all_properties, \ set_property, del_property, del_all_properties, init_properties, copy_properties from py_entitymatching.catalog.catalog_manager import get_catalog, del_catalog, \ get_catalog_len, show_properties, show_properties_for_id from py_entitymatching.catalog.catalog_manager import is_property_present_for_df, \ is_dfinfo_present, is_catalog_empty from py_entitymatching.catalog.catalog_manager import get_key, set_key, set_fk_ltable,\ set_fk_rtable, get_ltable, get_rtable, validate_and_set_fk_ltable, \ validate_and_set_fk_rtable, set_ltable, set_rtable, get_fk_rtable, \ get_fk_ltable # # # blockers from py_entitymatching.blocker.attr_equiv_blocker import AttrEquivalenceBlocker
def copy_properties(source_data_frame, target_data_frame, replace=True):
    """
    Copies properties from a source DataFrame to target DataFrame in the
    catalog.

    Args:
        source_data_frame (DataFrame): The DataFrame from which the
            properties are to be copied, in the catalog.
        target_data_frame (DataFrame): The DataFrame to which the
            properties are to be copied, in the catalog.
        replace (boolean): A flag to indicate whether the source
            DataFrame's properties can replace the target DataFrame's
            properties in the catalog. The default value for the flag is
            True. Specifically, if the target DataFrame's information is
            already present in the catalog then the replace flag is
            checked. If the flag is True, the existing properties are
            reset and then set from the source DataFrame's properties. If
            the flag is False, the existing properties are left unmodified.

    Returns:
        The value relayed from `set_properties`: True if the copying was
        successful, False if the target DataFrame is already present in the
        catalog and `replace` is False.

    Raises:
        AssertionError: If `source_data_frame` is not of type pandas
            DataFrame.
        AssertionError: If `target_data_frame` is not of type pandas
            DataFrame.
        KeyError: If source DataFrame is not present in the catalog.
    """
    # Validate input parameters
    # # The source_data_frame is expected to be of type pandas DataFrame
    if not isinstance(source_data_frame, pd.DataFrame):
        logger.error('Input object (source_data_frame) is not of type pandas '
                     'DataFrame')
        raise AssertionError(
            'Input object (source_data_frame) is not of type pandas DataFrame')

    # # The target_data_frame is expected to be of type pandas DataFrame
    if not isinstance(target_data_frame, pd.DataFrame):
        logger.error('Input object (target_data_frame) is not of type pandas '
                     'DataFrame')
        raise AssertionError('Input object (target_data_frame) is not of '
                             'type pandas DataFrame')

    # Get the catalog instance
    catalog = Catalog.Instance()

    # Check if the source DataFrame information is present in the catalog.
    # If not raise an error. A plain truthiness test is used (rather than
    # an identity comparison with False) so that any falsy answer from the
    # catalog is treated as "not present", matching the other functions in
    # this module.
    if not catalog.is_df_info_present_in_catalog(source_data_frame):
        logger.error(
            'DataFrame information (source_data_frame) is not present in the '
            'catalog')
        raise KeyError(
            'DataFrame information (source_data_frame) is not present in the '
            'catalog')

    # Get all properties for the source DataFrame
    metadata = catalog.get_all_properties(source_data_frame)

    # Set the properties to the target DataFrame. Specifically, call the
    # set properties function and relay its return value.
    # Note: There is a redundancy in validating the input parameters. This
    # might have a slight performance impact, but we don't expect that this
    # function gets called so often.
    # set_properties also initializes the target DataFrame in the catalog.
    return set_properties(target_data_frame, metadata, replace)
def set_properties(data_frame, properties, replace=True):
    """
    Stores a whole dictionary of properties for a DataFrame in the catalog.

    Args:
        data_frame (DataFrame): DataFrame for which the properties must be
            set.
        properties (dict): A Python dictionary with keys as property names
            and values as Python objects (typically strings or DataFrames).
        replace (Optional[bool]): Flag to indicate whether the input
            properties can replace the properties in the catalog. The
            default value for the flag is True. If the DataFrame is already
            present in the catalog and the flag is False, a warning is
            logged and the existing properties are left unmodified.
            Otherwise the DataFrame's catalog entry is (re)initialized and
            filled with the given properties.

    Returns:
        True if the properties were set for the given DataFrame, False if
        the DataFrame was already present in the catalog and `replace` was
        False.

    Raises:
        AssertionError: If the input data_frame object is not of type
            pandas DataFrame.
        AssertionError: If the input properties object is not of type
            Python dictionary.
    """
    # Type checks on the inputs.
    if not isinstance(data_frame, pd.DataFrame):
        type_error = 'Input object is not of type pandas DataFrame'
        logger.error(type_error)
        raise AssertionError(type_error)

    if not isinstance(properties, dict):
        dict_error = 'The properties should be of type Python dictionary'
        logger.error(dict_error)
        raise AssertionError(dict_error)

    current_catalog = Catalog.Instance()

    # An existing entry may only be overwritten when the caller allows it.
    if current_catalog.is_df_info_present_in_catalog(
            data_frame) and not replace:
        logger.warning(
            'Properties already exists for df ( %s ). Not replacing it'
            % str(id(data_frame)))
        return False

    # Whether the entry is new or is being replaced, start from a freshly
    # initialized entry.
    current_catalog.init_properties(data_frame)

    # Copy the given properties over one by one.
    # Note: the correctness of the input properties is not checked here
    # (i.e. we do not check if a property 'key' is indeed a key).
    for name, value in six.iteritems(properties):
        current_catalog.set_property(data_frame, name, value)

    return True