Example #1
0
def from_manifest(manifest: IngestionManifest) -> Tuple[List[str], List[str]]:
    """Build Kusto control commands from an ingestion manifest.

    Returns a pair ``(create_table_commands, create_mapping_commands)``.
    Both are accumulated in dicts keyed by table / mapping name so repeated
    targets are de-duplicated while insertion order is preserved.
    """
    create_table_cmds = {}
    create_mapping_cmds = {}

    for operation in manifest.operations:
        database = manifest.database_dict[operation.database]
        # Accumulates the union of all mapped columns for this operation.
        target_schema = Table(operation.target, [])

        for src in operation.sources:
            column_maps = manifest.mappings_dict[src.mapping]
            ingestion_map = KustoBackend.get_ingestion_mapping(
                src.data_format, column_maps)

            create_mapping_cmds[src.mapping] = KustoBackend.get_create_ingestion_command(
                table=operation.target,
                mapping_name=src.mapping,
                column_mappings=ingestion_map)

            target_schema.extend_columns(
                [Column.copy(cm.target) for cm in column_maps.columns])

        # Only emit a create-table command when the target table is absent.
        if operation.target not in database.tables_dict:
            create_table_cmds[operation.target] = KustoBackend.get_create_table_command(
                target_schema)

    return list(create_table_cmds.values()), list(create_mapping_cmds.values())
def table_from_folder(path,
                      name=None,
                      conflict_mode=DataConflictMode.Safe,
                      top=200) -> Table:
    """Infer a Table schema from the files in a folder (non-recursive).

    :param path: folder to scan.
    :param name: name for the resulting table.
    :param conflict_mode: in Safe mode every file's inferred schema must
        agree with the accumulated one (``assert_eq`` raises otherwise).
    :param top: maximum rows to sample from each file.
    """
    # NOTE(review): the accumulated table never receives columns from the
    # per-file tables here — confirm Table.assert_eq is meant to merge them.
    inferred_table = Table(name, columns=[])
    for file in os.listdir(path):
        # Bug fixes: os.listdir yields bare filenames, so join with the
        # folder; and `top` must be passed by keyword — positionally it
        # would land in table_from_file's `name` parameter.
        current_table = table_from_file(os.path.join(path, file), top=top)
        if conflict_mode == DataConflictMode.Safe:
            inferred_table.assert_eq(current_table)

    return inferred_table
    def generate_mapping(cls, table: Table, source: DataEntity) -> IngestionMapping:
        """Build an IngestionMapping from a source entity onto a target table.

        Columns are matched by name when any source column declares one,
        otherwise positionally (index based).
        """
        fmt = source.files[0].data_format
        name = source.name + "_from_" + fmt

        # Fall back to positional matching when no source column is named.
        index_based = not any((c for c in source.columns if c.name is not None))

        if index_based and len(source.columns) != len(table.columns):
            logger.warning(
                f"Mapping for '{source.name}' used index based mapping, and column count doesn't match target table '{table.name}'. "
                f"{len(source.columns)} != {len(table.columns)}"
            )

        column_mappings = []
        for index, source_col in enumerate(source.columns):
            if not index_based:
                expected_target_col_name = Table.valid_column_name(source_col.name)
                if expected_target_col_name not in table.columns_lookup:
                    raise RuntimeError(
                        f"Target table '{table.name}' is missing a column {expected_target_col_name} ({source_col.name}) from source {source.name}. Failed index {index}"
                    )
                dest_col = table.columns_lookup[expected_target_col_name]
            else:
                # TODO: should probably notify if types mismatch (might mean mis-configured)
                if index >= len(table.columns):
                    raise RuntimeError(f"Target table '{table.name}' has fewer columns than source {source.name}. Failed index {index}")
                dest_col = table.columns[index]
            column_mappings.append(ColumnMapping(source_col, dest_col))

        return IngestionMapping(name, column_mappings)
Example #4
0
    def describe_database(self, database_name: str, **kwargs) -> Database:
        """Fetch the schema (tables and columns) of *database_name* from Kusto.

        :param database_name: database whose schema is listed.
        :raises DatabaseDoesNotExist: when the engine reports an
            EntityNotFoundException for the database.
        """
        tables = []
        client = self.client_provider.get_engine_client()
        try:
            # NOTE(review): the query runs in the fixed "NetDefault" database
            # context while the statement itself targets `database_name` —
            # confirm this cross-database execution is intentional.
            tables_result = client.execute(
                "NetDefault",
                LIST_COLUMNS_BY_TABLE.format(
                    database_name=database_name)).primary_results[0]

            for t in tables_result:
                columns = []
                for index, col in enumerate(t["Columns"]):
                    # Split only on the first ':' so a type token that
                    # happens to contain ':' cannot break the unpacking.
                    name, dotnet_type = col.split(":", 1)
                    columns.append(
                        Column(name,
                               index=index,
                               data_type=dotnet_to_kusto_type(dotnet_type)))

                tables.append(Table(t["TableName"], columns))
        except KustoServiceError as e:
            resp = e.http_response.json()
            if "error" in resp and resp["error"][
                    "@type"] == "Kusto.Data.Exceptions.EntityNotFoundException":
                raise DatabaseDoesNotExist(database_name)
            else:
                # Bare `raise` re-raises the active exception and keeps the
                # original traceback intact.
                raise

        return Database(database_name, tables)
Example #5
0
    def from_entities(cls, entities: List[DataEntity]) -> IngestionManifest:
        """Create a manifest template from raw data entities.

        A "{database_name}" placeholder is used for the database so the
        caller can substitute the real target later.
        """
        placeholder_db = Database("{database_name}", [])
        mappings: Dict[str, IngestionMapping] = {}
        operations = []

        for entity in entities:
            target_table = Table.from_entity(entity)
            mapping = IngestionMapping.generate_mapping(target_table, entity)
            mappings[mapping.name] = mapping

            # Every file of the entity shares the single generated mapping;
            # the data format is encoded as the mapping name's last suffix.
            ingestion_sources = [
                IngestionSource(files=[f.path for f in entity.files],
                                mapping=mapping.name,
                                data_format=mapping.name.split("_")[-1])
            ]

            operations.append(
                IngestionOp("{database_name}", ingestion_sources,
                            target_table.name))

        return IngestionManifest([placeholder_db], list(mappings.values()),
                                 operations)
def table_from_blob(blob_uri, name=None, **kwargs) -> Table:
    """Infer a Table schema from a CSV blob in Azure Blob Storage.

    Downloads at most ~1MB of the blob and infers column types from the
    sampled rows.

    :param blob_uri: full https URI of the blob.
    :param name: optional table name; defaults to the blob path.
    :param kwargs: ``includes_headers`` (bool) is forwarded to the inferrer.
    """
    url = urlparse(blob_uri)
    minimum_file_size = 1024 * 1024  # sample at most ~1MB of the blob
    account, service_type, _, _, _ = url.hostname.split(".")
    container, blob_path = url.path.strip("/").split("/", 1)
    blob_path_without_extension, extension = os.path.splitext(blob_path)
    # NOTE(review): `blobname` is computed but never used — the fallback
    # table name below uses the full blob path; possibly `name or blobname`
    # was intended. Confirm before changing.
    blobname = os.path.basename(blob_path_without_extension)
    blob_service = BlockBlobService(account)

    output_stream = io.BytesIO()
    blob = blob_service.get_blob_to_stream(container,
                                           blob_path,
                                           output_stream,
                                           start_range=0,
                                           end_range=minimum_file_size)
    current_file_size = output_stream.tell()
    buffered_stream = io.StringIO()

    output_stream.seek(0)

    partial_data = current_file_size > minimum_file_size
    if partial_data:
        # Only partial data: copy row by row, decoding to text, before
        # handing the buffer to the CSV parser.
        while output_stream.tell() < current_file_size:
            buffered_stream.write(output_stream.readline().decode("utf-8"))
        buffered_stream.seek(0)
    else:
        # Bug fix: previously the raw BytesIO was passed straight through
        # here, while the partial branch produced a decoded StringIO —
        # decode in both cases so the parser always receives text.
        buffered_stream = io.StringIO(output_stream.read().decode("utf-8"))

    columns = infer.columns_from_csv_stream(buffered_stream,
                                            includes_headers=kwargs.get(
                                                "includes_headers", False))

    return Table(name or blob_path, columns)
def table_from_file(filepath, name=None, top=200, **kwargs) -> Table:
    path = Path(filepath)
    if not path.is_file():
        raise ValueError("Given path is not a valid file.")

    df = DataFile.from_file(filepath, **kwargs, limit=top)

    return Table(name or path.stem, df.columns)
Example #8
0
def table_from_create_command(sql_statement: sqlparse.sql.Statement) -> Table:
    """Build a Table from a parsed SQL ``CREATE TABLE`` statement.

    Only each column's name and type are used; trailing modifiers
    (NOT NULL, DEFAULT ...) are ignored.
    """
    # The second sublist of the statement is the parenthesized column list.
    columns_part = list(sql_statement.get_sublists())[1].normalized
    columns = []
    # Strip the surrounding parentheses, then split the definitions.
    # NOTE(review): a plain split(",") breaks on types such as decimal(10,2).
    for index, col in enumerate(columns_part[1:-1].split(",")):
        # Bug fix: split() instead of split(" ") tolerates runs of
        # whitespace/newlines between tokens (and makes strip() redundant).
        col_name, col_type, *col_modifiers = col.split()
        kusto_type = sql_type_to_kusto_type(col_type)
        columns.append(Column(index=index, name=col_name,
                              data_type=kusto_type))

    return Table(sql_statement.get_name(), columns)
Example #9
0
    def from_entities_and_database(
        cls,
        entities: List[DataEntity],
        target_database: Database,
        conflict_mode: SchemaConflictMode = SchemaConflictMode.Append
    ) -> IngestionManifest:
        """Create an ingestion manifest for *entities* against an existing database.

        In Safe mode a source table missing from the target database raises
        SchemaConflictError; otherwise the missing table will be created
        from the source's inferred schema.

        :raises SchemaConflictError: in Safe mode when a source table is
            absent from ``target_database``.
        """
        operations = []
        mappings: Dict[str, IngestionMapping] = {}

        for entity in entities:
            sources: Dict[str, List[str]] = defaultdict(list)
            source_table = Table.from_entity(entity)

            if entity.name not in target_database.tables_dict:
                if conflict_mode == SchemaConflictMode.Safe:
                    # Bug fix: message previously read "no in target".
                    raise SchemaConflictError(
                        f"SAFE MODE: Table '{source_table.name}' appears in source but not in target database '{target_database.name}'"
                    )

                logger.info(
                    f"Source has a table '{entity.name}' which target database '{target_database.name}' is missing. will create it."
                )

                target_table = source_table
            else:
                target_table = target_database.tables_dict[entity.name]
            # TODO: currently assuming all files under entity are of the same format and schema
            #  this assumption can change, for example if each file contains only a part of the schema,
            #  but in such a case, it will require advanced logic to match map all those partitions into a single entity
            mapping = IngestionMapping.generate_mapping(target_table, entity)
            mappings[mapping.name] = mapping

            sources[mapping.name] = [f.path for f in entity.files]

            ingestion_sources = [
                IngestionSource(files=s_files,
                                mapping=s_mapping,
                                data_format=s_mapping.split("_")[-1])
                for s_mapping, s_files in sources.items()
            ]

            operations.append(
                IngestionOp(target_database.name, ingestion_sources,
                            target_table.name))

        return IngestionManifest([target_database], list(mappings.values()),
                                 operations)