Example #1
def _update_metadata(function, metadata, output_path=None, **kwargs):
    """Apply ``function`` to the metadata, validate it and optionally save it."""
    metadata_dict = _load_metadata_dict(metadata)
    function(metadata_dict, **kwargs)

    validate(metadata_dict)

    if not isinstance(metadata, MetaData):
        metadata = MetaData()
        metadata.data = metadata_dict

    if output_path is not None:
        metadata.to_json(output_path)

    return metadata
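The helper above is the common core: it loads the metadata from whatever form it was given, applies a mutator, validates the result and optionally writes it back out. Below is a minimal sketch of how a public updater such as update_metadata_primary_keys (exercised in Example #4) might wrap it; the _set_primary_keys helper and the shape of the primary_keys mapping are assumptions, not taken from the project.

def _set_primary_keys(metadata_dict, primary_keys):
    # Hypothetical mutator: assign each table's primary key in place,
    # assuming primary_keys maps table ids to field names.
    for table in metadata_dict['tables']:
        key = primary_keys.get(table['id'])
        if key is not None:
            table['primary_key'] = key


def update_metadata_primary_keys(metadata, primary_keys, output_path=None):
    # Delegate loading, validation and optional persistence to _update_metadata.
    return _update_metadata(_set_primary_keys, metadata,
                            output_path=output_path, primary_keys=primary_keys)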
Example #2
    def __init__(self, path_to_dataset):
        """Read the dataset from disk.

        Args:
            path_to_dataset (str): The path to the dataset.
        """
        self.path_to_dataset = os.path.expanduser(path_to_dataset)

        path_to_metadata = os.path.join(self.path_to_dataset, "metadata.json")
        self.metadata = MetaData.from_json(path_to_metadata)

        self.tables = {}
        for table_name in self.metadata.get_table_names():
            path_to_csv = os.path.join(self.path_to_dataset, "%s.csv" % table_name)

            table_metadata = self.metadata.get_table(table_name)
            dtypes = _read_csv_dtypes(table_metadata)
            self.tables[table_name] = pd.read_csv(path_to_csv, dtype=dtypes)
            _parse_dtypes(self.tables[table_name], table_metadata)

            logger.info("Loaded table %s (%s rows, %s cols)" % (
                table_name,
                len(self.tables[table_name]),
                len(self.tables[table_name].columns)
            ))
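_read_csv_dtypes and _parse_dtypes are not shown in this listing. A hedged sketch of what they plausibly do, assuming the field keys "name" and "data_type" from Example #6: the first forces string-like columns to be read as str so pandas does not guess their types, the second converts datetime columns once the table is loaded.

def _read_csv_dtypes(table_metadata):
    # Assumption: keep id and text fields as plain strings when reading the CSV.
    dtypes = {}
    for field in table_metadata['fields']:
        if field['data_type'] in ('id', 'text'):
            dtypes[field['name']] = str
    return dtypes


def _parse_dtypes(table, table_metadata):
    # Assumption: datetime fields are parsed in place after loading.
    for field in table_metadata['fields']:
        if field['data_type'] == 'datetime':
            table[field['name']] = pd.to_datetime(table[field['name']])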
Example #3
def test_foreign_keys_output_path(metad, foreign_keys, tmpdir):
    output_path = os.path.join(str(tmpdir), 'updated.json')
    metadata.update_metadata_foreign_keys(metad,
                                          foreign_keys,
                                          output_path=output_path)

    updated = MetaData.from_json(output_path)
    assert updated.data['foreign_keys'] == foreign_keys
Example #4
def test_primary_output_path(metad, primary_keys, tmpdir):
    output_path = os.path.join(str(tmpdir), 'metadata.json')
    metadata.update_metadata_primary_keys(metad,
                                          primary_keys,
                                          output_path=output_path)

    updated = MetaData.from_json(output_path)
    assert updated.data['tables'][0]['primary_key'] == 'a_field'
    assert updated.data['tables'][1]['primary_key'] == 'another_field'
Example #5
def test_column_map_output_path(metad, column_map, constraints, tmpdir):
    output_path = os.path.join(str(tmpdir), 'metadata.json')
    metadata.update_metadata_column_map(metad,
                                        column_map,
                                        target_table='1234',
                                        target_field='a_field',
                                        output_path=output_path)

    updated = MetaData.from_json(output_path)
    assert updated.data['constraints'] == constraints
Example #6
 def test_creation(self):
     metadata = MetaData()
     metadata.add_table({
         "id": "users",
         "name": "users",
         "fields": [
             {"name": "user_id", "data_type": "id"},
             {"name": "name", "data_type": "text"},
             {"name": "gender", "data_type": "categorical"},
             {"name": "age", "data_type": "numerical"},
             {"name": "birthday", "data_type": "datetime"},
             {"name": "picture", "data_type": "other"},
         ],
     })
     assert len(metadata.get_table_names()) == 1
     assert "users" in metadata.get_table_names()
Example #7
def metad():
    metadata = MetaData()
    metadata.data = {
        'tables': [
            {
                'id': '1234',
                'name': 'a_table',
                'fields': [
                    {'name': 'a_field', 'type': 'number', 'subtype': 'float'},
                    {'name': 'some_field', 'type': 'number', 'subtype': 'float'},
                ],
            },
            {
                'id': '4567',
                'name': 'another_table',
                'fields': [
                    {'name': 'another_field', 'type': 'number', 'subtype': 'float'},
                    {'name': 'some_other_field', 'type': 'number', 'subtype': 'float'},
                ],
            },
        ],
    }
    return metadata
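Because Examples #3, #4 and #5 receive this object through a metad argument, the function presumably lives in a conftest module and is registered as a pytest fixture. A sketch of that registration; the decorator is an assumption, not shown in this listing.

import pytest

@pytest.fixture
def metad():
    # Body identical to the metad() factory shown above.
    ...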
Example #8
def _load_metadata_dict(metadata):
    """Return the given metadata as a dict."""
    if isinstance(metadata, MetaData):
        return metadata.data

    if isinstance(metadata, str):
        metadata = MetaData.from_json(metadata)
        metadata.validate()
        return metadata.data

    if isinstance(metadata, dict):
        validate(metadata)
        return metadata

    raise TypeError('Metadata can only be MetaData, dict or str')
Example #9
def load_dataset(dataset_path):
    """Load a dataset as a MetaData and a dict of tables.

    Args:
        dataset_path (str):
            Path to the root of the dataset.

    Returns:
        tuple:
            Tuple containing:
                * A ``MetaData`` instance.
                * A ``dict`` containing the tables loaded as ``pandas.DataFrames``.
    """
    metadata = MetaData.from_json(os.path.join(dataset_path, 'metadata.json'))
    tables = {}
    for table_name in metadata.get_table_names():
        tables[table_name] = pd.read_csv(
            os.path.join(dataset_path, table_name + '.csv'))

    return metadata, tables
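A quick usage sketch; 'path/to/dataset' is a placeholder for a directory that contains metadata.json plus one CSV per table.

metadata, tables = load_dataset('path/to/dataset')
for table_name, frame in tables.items():
    print(table_name, frame.shape)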
Example #10
def validate(metadata):
    """Validate a metadata dict by loading it into a MetaData instance."""
    metad = MetaData()
    metad.data = metadata
    metad.validate()
Example #11
 def test_validation(self, path_to_example):
     metadata = MetaData.from_json(path_to_example)
     metadata.validate()
     assert str(metadata)
Example #12
def validate(args):
    for path_to_json in args["<jsons>"]:
        print("Validating %s..." % path_to_json)
        MetaData.from_json(path_to_json).validate()
Example #13
def describe(args):
    metadata = MetaData.from_json(args["<json>"])
    print(str(metadata))
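Both commands read docopt-style keys (<jsons>, <json>) from args. A hedged sketch of how the dispatch could be wired with docopt; the usage string and command names are assumptions, not taken from the project.

"""Usage:
    metadata validate <jsons>...
    metadata describe <json>
"""
from docopt import docopt


def main():
    args = docopt(__doc__)
    if args['validate']:
        validate(args)
    elif args['describe']:
        describe(args)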
Example #14
 def _load_metadata(self):
     cursor = self.db.cursor(pymysql.cursors.DictCursor)
     self.metadata = MetaData()
     self.metadata.set_tables(self._tables(cursor))
     self.metadata.set_foreign_keys(self._foreign_keys(cursor))
     cursor.close()
Example #15
class MySQLConnector(BaseConnector):
    """Import data from MySQL instances.

    The MySQLConnector allows you to import data from a MySQL instance
    by translating the data types and exporting the primary / foreign key
    constraints from the schema table.
    """
    def __init__(self, host, port, user, password, database):
        """Create a new MySQLConnector.

        Args:
            host (str): The host name / ip address of the MySQL instance.
            port (int): The port number (default: 3306).
            user (str): The username for accessing the instance.
            password (str): The password for accessing the instance.
            database (str): The name of the database to export.
        """
        super().__init__()
        self.database = database
        self.db = pymysql.connect(host=host,
                                  port=port,
                                  user=user,
                                  passwd=password,
                                  db=database,
                                  charset='utf8')
        self._load_metadata()

    def __del__(self):
        self.db.close()

    def _load_metadata(self):
        cursor = self.db.cursor(pymysql.cursors.DictCursor)
        self.metadata = MetaData()
        self.metadata.set_tables(self._tables(cursor))
        self.metadata.set_foreign_keys(self._foreign_keys(cursor))
        cursor.close()

    def export_metadata(self, path_to_json):
        self.metadata.to_json(path_to_json)

    def export_tables(self, path_to_output):
        for table in self.metadata.data["tables"]:
            cursor = self.db.cursor(pymysql.cursors.Cursor)
            cursor.execute("select * from `" + table["name"] + "`;")
            column_names = [column[0] for column in cursor.description]
            rows = [list(row) for row in cursor.fetchall()]
            df = pd.DataFrame(rows, columns=column_names)
            df.to_csv(os.path.join(path_to_output, "%s.csv" % table["name"]),
                      index=False)
            cursor.close()

    def _tables(self, cursor):
        tables = []
        for table_name in self._table_names(cursor):
            table_metadata = self._table_metadata(cursor, table_name)
            table_metadata["id"] = table_name  # table names are unique
            tables.append(table_metadata)
        return tables

    def _foreign_keys(self, cursor):
        # Select Foreign Key Constraints
        cursor.execute("""SELECT
            CONSTRAINT_NAME,
            TABLE_NAME,
            COLUMN_NAME,
            REFERENCED_TABLE_NAME,
            REFERENCED_COLUMN_NAME
        FROM
            INFORMATION_SCHEMA.KEY_COLUMN_USAGE
        WHERE
            REFERENCED_TABLE_SCHEMA = '%s'""" % self.database)
        foreign_keys = cursor.fetchall()

        # Aggregate Composite Keys
        foreign_key_constraints = defaultdict(list)
        for row in foreign_keys:
            foreign_key_constraints[row["CONSTRAINT_NAME"]].append(row)

        for constraint_name, constraint_rows in list(
                foreign_key_constraints.items()):
            constraint = {
                "table": None,
                "field": [],
                "ref_table": None,
                "ref_field": []
            }
            for row in constraint_rows:
                constraint["table"] = row["TABLE_NAME"]
                constraint["field"].append(row["COLUMN_NAME"])
                constraint["ref_table"] = row["REFERENCED_TABLE_NAME"]
                constraint["ref_field"].append(row["REFERENCED_COLUMN_NAME"])

            # Simplify Non-Composite Keys
            if len(constraint["field"]) == 1:
                constraint["field"] = constraint["field"][0]
                constraint["ref_field"] = constraint["ref_field"][0]
            foreign_key_constraints[constraint_name] = constraint

        return list(foreign_key_constraints.values())

    def _table_names(self, cursor):
        cursor.execute("show tables;")
        table_names = []
        for row in cursor.fetchall():
            table_names.append(list(row.values())[0])
        return table_names

    def _table_metadata(self, cursor, table_name):
        cursor.execute("""
            SELECT *
            FROM INFORMATION_SCHEMA.COLUMNS
            WHERE
                table_name = '%s' AND
                table_schema = '%s'
            ORDER BY ORDINAL_POSITION""" % (table_name, self.database))
        fields = []
        primary_key = []
        for row in cursor.fetchall():
            data_type = self._get_dtype(row["DATA_TYPE"]).copy()
            if row["COLUMN_KEY"] == "PRI":
                primary_key.append(row["COLUMN_NAME"])
            data_type["name"] = row["COLUMN_NAME"]
            fields.append(data_type)
        # Collapse single-column primary keys to a plain string.
        if len(primary_key) == 1:
            primary_key = primary_key[0]

        table = {"name": table_name, "fields": fields}
        if primary_key:
            table["primary_key"] = primary_key

        return table

    def _get_dtype(self, mysql_dtype):
        if mysql_dtype in set(["int", "tinyint", "smallint"]):
            return {
                'data_type': 'numerical',
                'data_subtype': 'integer',
            }
        if mysql_dtype in set(["bit", "binary"]):
            return {
                'data_type': 'categorical',
                'data_subtype': 'boolean',
            }
        if mysql_dtype in set([
                "text", "tinytext", "longtext", "mediumtext", "varchar",
                "char", "enum"
        ]):
            return {
                'data_type': 'categorical',
            }
        if mysql_dtype in set(
            ["datetime", "year", "timestamp", "time", "date"]):
            return {
                'data_type': 'datetime',
            }
        if mysql_dtype in set(
            ["float", "double", "bigint", "mediumint", "decimal"]):
            return {
                'data_type': 'numerical',
                'data_subtype': 'float',
            }
        return {'data_type': 'other'}
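A usage sketch for the connector; host, credentials and database name are placeholders.

connector = MySQLConnector(host='localhost', port=3306,
                           user='root', password='secret',
                           database='employees')
connector.export_metadata('metadata.json')   # write the extracted schema
connector.export_tables('.')                 # dump every table as <name>.csv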
Example #16
 def __init__(self):
     self.metadata = MetaData()