Code Example #1
 def test_validate_and_set_fk_rtable_err_case_2(self):
     C = pd.read_csv(path_c)
     p = os.sep.join([catalog_datasets_path, 'A_inv_fk.csv'])
     A = pd.read_csv(p)
     status = cm.validate_and_set_fk_rtable(C, 'ltable_ID', A, 'ID')
     self.assertEqual(status, False)
     self.assertEqual(cm.is_dfinfo_present(C), False)
Code Example #2
 def test_copy_properties_valid_1(self):
     A = read_csv_metadata(path_a)
     A1 = pd.read_csv(path_a)
     cm.copy_properties(A, A1)
     self.assertEqual(cm.is_dfinfo_present(A1), True)
     p = cm.get_all_properties(A)
     p1 = cm.get_all_properties(A1)
     self.assertEqual(p, p1)
     self.assertEqual(cm.get_key(A1), cm.get_key(A))
Code Example #3
    def test_copy_properties_valid_2(self):
        A = read_csv_metadata(path_a)
        B = read_csv_metadata(path_b)
        C = read_csv_metadata(path_c, ltable=A, rtable=B)

        C1 = pd.read_csv(path_c)
        cm.copy_properties(C, C1)
        self.assertEqual(cm.is_dfinfo_present(C1), True)
        p = cm.get_all_properties(C)
        p1 = cm.get_all_properties(C1)
        self.assertEqual(p, p1)
        self.assertEqual(cm.get_key(C1), cm.get_key(C))
        self.assertEqual(cm.get_ltable(C1).equals(A), True)
        self.assertEqual(cm.get_rtable(C1).equals(B), True)
        self.assertEqual(cm.get_fk_ltable(C1), cm.get_fk_ltable(C))
        self.assertEqual(cm.get_fk_rtable(C1), cm.get_fk_rtable(C))
Code Example #4
File: parsers.py Project: paulgc/magellan
def _write_metadata(data_frame, file_path):
    """
    Write metadata contents to disk.
    """
    # Initialize a metadata dictionary to store the metadata.
    metadata_dict = collections.OrderedDict()

    # Get all the properties for the input data frame
    if cm.is_dfinfo_present(data_frame):
        properties = cm.get_all_properties(data_frame)
    else:
        # If the data_frame is not in the catalog, then return immediately.
        return False

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            # If the property value is not of type string, then just write it
            #  as 'POINTER'. This will be useful while writing the candidate
            # sets to disk. The candidate set will have properties such as
            # ltable and rtable which are DataFrames. We do not have a simple
            # way to write them to disk and link them back the candidate set
            # while reading back from disk. So to get around this problem we
            # will use 'POINTER' as the special value to indicate objects
            # other than strings.
            if not isinstance(property_value, six.string_types):
                metadata_dict[property_name] = 'POINTER'
            else:
                metadata_dict[property_name] = property_value

        # Write the properties to a file on disk, one property per line,
        # using the syntax:
        # #property_name=property_value
        with open(file_path, 'w') as file_handler:
            for property_name, property_value in six.iteritems(metadata_dict):
                file_handler.write('#%s=%s\n' %
                                   (property_name, property_value))

    return True
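
For context: read_csv_metadata (Code Example #11 below) consumes files in this '#name=value' format through a companion reader, _get_metadata_from_file, which is not shown on this page. The following is a minimal sketch of what such a parser could look like; the helper name _read_metadata_file is hypothetical, not the library's API.

import collections

def _read_metadata_file(file_path):
    # Hypothetical sketch, not the library's implementation: parse lines of
    # the form '#property_name=property_value' back into an ordered dict.
    metadata_dict = collections.OrderedDict()
    with open(file_path, 'r') as file_handler:
        for line in file_handler:
            line = line.strip()
            # Skip lines that do not follow the '#name=value' syntax.
            if not line.startswith('#') or '=' not in line:
                continue
            property_name, property_value = line[1:].split('=', 1)
            metadata_dict[property_name] = property_value
    return metadata_dict

Note that values written out as 'POINTER' (e.g. ltable and rtable of a candidate set) come back as the literal string 'POINTER'; re-linking them to actual DataFrames is left to the caller, as the comments in _write_metadata explain.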
Code Example #5
 def test_is_dfinfo_present_invalid(self):
     cm.is_dfinfo_present(None)
Code Example #6
 def test_valid_path_wi_invalidmetadata_replace_key(self):
     cm.del_catalog()
     p = os.sep.join([io_datasets_path, 'A_key_zipcode.csv'])
     IM = read_csv_metadata(p, key='ID')
     self.assertEqual(cm.is_dfinfo_present(IM), True)
     self.assertEqual(cm.has_property(IM, 'key'), True)
Code Example #7
 def test_valid_path_wo_metadata(self):
     cm.del_catalog()
     B = read_csv_metadata(path_b)
     pd_B = pd.read_csv(path_b)
     self.assertEqual(B.equals(pd_B), True)
     self.assertEqual(cm.is_dfinfo_present(B), True)
Code Example #8
 def test_del_all_properties_valid_1(self):
     A = read_csv_metadata(path_a)
     cm.del_all_properties(A)
     self.assertEqual(cm.is_dfinfo_present(A), False)
Code Example #9
 def test_valid_path_wi_metadata_unknownprop(self):
     cm.del_catalog()
     p = os.sep.join([io_datasets_path, 'InvalidMetadata1.csv'])
     IM = read_csv_metadata(p)
     self.assertEqual(cm.is_dfinfo_present(IM), True)
     self.assertEqual(cm.get_property(IM, 'key1'), 'ID')
Code Example #10
 def test_validpath_metadata_set_to_none_1(self):
     cm.del_catalog()
     del_files_in_dir(sndbx_path)
     A = read_csv_metadata(path_a, key=None)
     self.assertEqual(cm.is_dfinfo_present(A), True)
     cm.get_key(A)
Code Example #11
File: parsers.py Project: paulgc/magellan
def read_csv_metadata(file_path, **kwargs):
    """
    Read CSV (comma-separated) file into DataFrame, and update the
    catalog with the metadata read from the same file name with an extension
    specified by the user (with the default value set to '.metadata') or the
    metadata given as key-value arguments.

    Reads the CSV file from the given file path into a pandas DataFrame,
    using pandas' 'read_csv' method. Further, it looks for a file with the
    same file name but with a specific extension. This extension can be
    given by the user, with the default value being '.metadata'. If the
    metadata file is present, the function will read it and update the
    catalog. If the metadata file is not present, the function will issue a
    warning to that effect and simply read the CSV file into a pandas
    DataFrame.

    The metadata can also be given as parameters to the function (see the
    description of arguments for more details). If given, the function will
    update the catalog with that information. Further, the metadata given
    in the function call takes precedence over the metadata given in the
    file.


    Args:
        file_path (string): CSV file path.

        kwargs (dict): A python dictionary containing key-value arguments. There
            are a few key-value pairs that are specific to read_csv_metadata and
            all the other key-value pairs are passed to pandas read_csv method.
            The keys that are specific to read_csv_metadata are: (1)
            metadata_extn, (2) key, (3) fk_ltable, (4) fk_rtable, (5) ltable,
            and (6) rtable. Here the metadata_extn is the expected metadata
            extension (with the default value set to '.metadata'), and all
            the others are metadata related to the DataFrame read from the
            CSV file.

    Returns:
        A pandas DataFrame read from the given CSV file.

    Raises:
        AssertionError: If the input file path is not of type string.
        AssertionError: If a file does not exist in the given file path.

    """
    # Validate the input parameters.

    # # File path is expected to be of type string.
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # Check if the given path is valid.
    if not os.path.exists(file_path):
        logger.error('File does not exist at path %s', file_path)
        raise AssertionError('File does not exist at path %s' % file_path)

    # Check if the user has specified the metadata file's extension.
    extension = kwargs.pop('metadata_extn', None)

    # If the extension is not specified, then set the extension to '.metadata'.
    if extension is None:
        extension = '.metadata'

    # Format the extension to include a '.' in front if the user has not
    # given one.
    if not extension.startswith('.'):
        extension = '.' + extension

    # If the file is present, then update metadata from file.
    if _is_metadata_file_present(file_path, extension=extension):
        file_name, _ = os.path.splitext(file_path)
        file_name = ''.join([file_name, extension])
        metadata, _ = _get_metadata_from_file(file_name)

    # Else issue a warning that the metadata file is not present
    else:
        logger.warning('Metadata file is not present in the given path; '
                       'proceeding to read the csv file.')
        metadata = {}

    # Update the metadata with the key-value pairs given in the command. The
    # function _update_metadata_for_read_cmd takes care of updating the
    # metadata with only the key-value pairs specific to read_csv_metadata
    # method
    metadata, kwargs = _update_metadata_for_read_cmd(metadata, **kwargs)

    # Validate the metadata.
    _check_metadata_for_read_cmd(metadata)

    # Read the csv file using pandas read_csv method.
    data_frame = pd.read_csv(file_path, **kwargs)

    # Get the value for 'key' property and update the catalog.
    key = metadata.pop('key', None)
    if key is not None:
        cm.set_key(data_frame, key)

    # Update the catalog with other properties.
    for property_name, property_value in six.iteritems(metadata):
        cm.set_property(data_frame, property_name, property_value)
    if not cm.is_dfinfo_present(data_frame):
        cm.init_properties(data_frame)

    # Return the DataFrame
    return data_frame
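
Putting the docstring together with the tests above, a typical call sequence looks like the following sketch. The file paths and the 'rtable_ID' column name are placeholders, and read_csv_metadata and the catalog manager cm are assumed to be imported as in the tests.

# Placeholder paths; metadata passed as kwargs overrides any '.metadata' file.
A = read_csv_metadata('tableA.csv', key='ID')
B = read_csv_metadata('tableB.csv', key='ID')
C = read_csv_metadata('tableC.csv', ltable=A, rtable=B,
                      fk_ltable='ltable_ID', fk_rtable='rtable_ID')
assert cm.is_dfinfo_present(C)
assert cm.get_key(A) == 'ID'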
Code Example #12
 def test_init_properties_valid(self):
     # cm.del_catalog()
     A = pd.read_csv(path_a)
     cm.init_properties(A)
     self.assertEqual(cm.is_dfinfo_present(A), True)
Code Example #13
 def test_del_all_properties_valid_2(self):
     A = read_csv_metadata(path_a)
     B = read_csv_metadata(path_b)
     C = read_csv_metadata(path_c, ltable=A, rtable=B)
     cm.del_all_properties(C)
     self.assertEqual(cm.is_dfinfo_present(C), False)
Code Example #14
 def test_is_dfinfo_present_valid_1(self):
     A = read_csv_metadata(path_a)
     status = cm.is_dfinfo_present(A)
     self.assertEqual(status, True)
Code Example #15
 def test_is_dfinfo_present_valid_2(self):
     A = pd.read_csv(path_a)
     status = cm.is_dfinfo_present(A)
     self.assertEqual(status, False)
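
Taken together, the last two tests suggest the usual guard pattern before touching catalog metadata on a plain DataFrame. This is a sketch, assuming the same imports as the tests above; it mirrors the tail of read_csv_metadata in Code Example #11.

df = pd.read_csv(path_a)
if not cm.is_dfinfo_present(df):
    # Register the DataFrame in the catalog before setting any properties.
    cm.init_properties(df)
cm.set_property(df, 'key', 'ID')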
Code Example #16
def save_table(data_frame, file_path, metadata_ext='.pklmetadata'):
    """
    Save the DataFrame to disk along with the metadata.

    This function saves the DataFrame to disk along with the metadata from
    the catalog. Specifically, this function saves the DataFrame in the given
    file_path, and saves the metadata in the same directory (as the
    file_path) but with a different extension. This extension can be given
    by the user; if not, a default extension of '.pklmetadata' is used.

    Args:
        data_frame (DataFrame): DataFrame that should be saved
        file_path (string): File path where the DataFrame must be stored
        metadata_ext (string): Metadata extension that should be used while
            storing the metadata information. The default value is
            '.pklmetadata'.

    Returns:
        A boolean value of True is returned if the DataFrame is successfully
        saved.

    See Also:
        save_object, to_csv_metadata.

    Notes:
        This function is a bit different from to_csv_metadata, where the
        DataFrame is stored in a CSV file format that can be viewed with a
        text editor. save_table instead stores the DataFrame in a special
        binary (pickled) format, which cannot be viewed with a text editor.
        The reason we have save_table is that, for larger DataFrames, it is
        more efficient to pickle the DataFrame to disk than to write it in
        CSV format.
    # Validate the input parameters

    # # data_frame is expected to be of type pandas DataFrame
    if not isinstance(data_frame, pd.DataFrame):
        logger.error('Input object is not of type pandas DataFrame')
        raise AssertionError('Input object is not of type pandas DataFrame')

    # # file_path is expected to be of type string
    if not isinstance(file_path, six.string_types):
        logger.error('Input file path is not of type string')
        raise AssertionError('Input file path is not of type string')

    # # metadata_ext is expected to be of type string
    if not isinstance(metadata_ext, six.string_types):
        logger.error('Input metadata ext is not of type string')
        raise AssertionError('Input metadata ext is not of type string')

    # Get the file_name (without extension) and the extension from the given
    # file path. For example, if the file_path was /Users/foo/file.csv, then
    # the file_name will be /Users/foo/file and the extension will be '.csv'
    file_name, _ = os.path.splitext(file_path)

    # The metadata file name is the same file name but with the extension
    # given by the user
    metadata_filename = file_name + metadata_ext

    # Check if the file exists in the file_path and whether we have
    # sufficient access privileges to write in that path
    can_write, file_exists = ps._check_file_path(file_path)

    if can_write:
        # If the file already exists, issue a warning; the file will be
        # overwritten.
        if file_exists:
            logger.warning('File already exists at %s; Overwriting it',
                           file_path)
        # Open the file_path in binary mode, as we are writing in binary
        # format.
        with open(file_path, 'wb') as file_handler:
            cloudpickle.dump(data_frame, file_handler)
    else:
        # We cannot write the file in the given path. Raise an error.
        logger.error('Cannot write in the file path %s; Exiting', file_path)
        raise AssertionError('Cannot write in the file path %s' % file_path)

    # Once we are done writing the DataFrame, write the metadata.

    # Initialize a metadata dictionary to hold the metadata of DataFrame from
    #  the catalog
    metadata_dict = collections.OrderedDict()

    # get all the properties for the input data frame
    # # Check if the DataFrame information is present in the catalog
    properties = {}
    if cm.is_dfinfo_present(data_frame):
        properties = cm.get_all_properties(data_frame)

    # If the properties are present in the catalog, then write properties to
    # disk
    if len(properties) > 0:
        for property_name, property_value in six.iteritems(properties):
            if isinstance(property_value, six.string_types):
                metadata_dict[property_name] = property_value

    # try to save metadata
    can_write, file_exists = ps._check_file_path(metadata_filename)
    if can_write:
        # If the file already exists, issue a warning; the file will be
        # overwritten.
        if file_exists:
            logger.warning(
                'Metadata file already exists at %s. Overwriting it',
                metadata_filename)
        # Write the metadata contents.
        with open(metadata_filename, 'wb') as file_handler:
            cloudpickle.dump(metadata_dict, file_handler)
    else:
        logger.warning(
            'Cannot write metadata at the file path %s. Skipping the '
            'metadata file', metadata_filename)

    return True
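
This page does not show save_table's counterpart for reading the pickled table back. Below is a hand-rolled sketch; the name _load_table_sketch is hypothetical, cm is assumed to be imported as above, and it relies on cloudpickle output being readable with the standard pickle module.

import os
import pickle

def _load_table_sketch(file_path, metadata_ext='.pklmetadata'):
    # Hypothetical sketch, not the library's implementation.
    with open(file_path, 'rb') as file_handler:
        data_frame = pickle.load(file_handler)
    # Look for the metadata pickle written by save_table next to the table.
    metadata_filename = os.path.splitext(file_path)[0] + metadata_ext
    if os.path.exists(metadata_filename):
        with open(metadata_filename, 'rb') as file_handler:
            metadata_dict = pickle.load(file_handler)
        # Restore the string-valued properties into the catalog.
        for property_name, property_value in metadata_dict.items():
            cm.set_property(data_frame, property_name, property_value)
    return data_frame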