import io
import logging
import os
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple
from urllib.parse import urlparse

import sqlparse
from azure.kusto.data.exceptions import KustoServiceError
from azure.storage.blob import BlockBlobService  # azure-storage-blob <= 2.x API

# Project-level names used below (Table, Column, Database, IngestionManifest, IngestionMapping,
# IngestionSource, IngestionOp, ColumnMapping, DataEntity, DataFile, DataConflictMode,
# SchemaConflictMode, SchemaConflictError, DatabaseDoesNotExist, KustoBackend, infer,
# dotnet_to_kusto_type, sql_type_to_kusto_type, LIST_COLUMNS_BY_TABLE) are assumed to be
# importable from the surrounding package.

logger = logging.getLogger(__name__)


def from_manifest(manifest: IngestionManifest) -> Tuple[List[str], List[str]]:
    table_commands = {}
    mapping_commands = {}
    for op in manifest.operations:
        db = manifest.database_dict[op.database]
        expected_table = Table(op.target, [])
        for source in op.sources:
            mapping = manifest.mappings_dict[source.mapping]
            ingestion_mapping = KustoBackend.get_ingestion_mapping(source.data_format, mapping)
            mapping_commands[source.mapping] = KustoBackend.get_create_ingestion_command(
                table=op.target, mapping_name=source.mapping, column_mappings=ingestion_mapping
            )
            expected_table.extend_columns([Column.copy(col_map.target) for col_map in mapping.columns])
        # only emit a create-table command when the manifest's database doesn't already have the table
        if op.target not in db.tables_dict:
            table_commands[op.target] = KustoBackend.get_create_table_command(expected_table)
    return list(table_commands.values()), list(mapping_commands.values())
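
# A minimal usage sketch (not part of the original module): `manifest` is assumed to be an
# already-built IngestionManifest. Tables are listed before mappings, since an ingestion
# mapping can only be created once its target table exists.
def _example_generate_commands(manifest: IngestionManifest) -> List[str]:
    table_cmds, mapping_cmds = from_manifest(manifest)
    return table_cmds + mapping_cmds
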
def table_from_folder(path, name=None, conflict_mode=DataConflictMode.Safe, top=200) -> Table:
    inferred_table = Table(name, columns=[])
    for file in os.listdir(path):
        # os.listdir yields bare file names, so join them back onto the folder path;
        # `top` must be passed by keyword, otherwise it would land in the `name` parameter
        current_table = table_from_file(os.path.join(path, file), top=top)
        if not inferred_table.columns:
            # adopt the first file's schema as the baseline
            inferred_table.extend_columns(current_table.columns)
        elif conflict_mode == DataConflictMode.Safe:
            # in safe mode, every file in the folder must agree with the baseline schema
            inferred_table.assert_eq(current_table)
    return inferred_table
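
# Usage sketch ("data/exports" is a hypothetical path): infer one schema for a folder of
# homogeneous csv files, failing loudly if any file disagrees.
def _example_infer_folder_schema() -> Table:
    return table_from_folder("data/exports", name="exports", conflict_mode=DataConflictMode.Safe)
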
def generate_mapping(cls, table: Table, source: DataEntity) -> IngestionMapping:
    data_format = source.files[0].data_format
    name = source.name + "_from_" + data_format
    column_mappings = []
    # fall back to index-based mapping when no source column carries a name (e.g. headerless csv)
    index_based = all(c.name is None for c in source.columns)
    if index_based and len(source.columns) != len(table.columns):
        logger.warning(
            f"Mapping for '{source.name}' used index based mapping, and column count doesn't match target table '{table.name}'. "
            f"{len(source.columns)} != {len(table.columns)}"
        )
    for index, source_col in enumerate(source.columns):
        if index_based:
            # TODO: should probably notify if types mismatch (might mean mis-configured)
            if index >= len(table.columns):
                raise RuntimeError(
                    f"Target table '{table.name}' has fewer columns than source '{source.name}'. Failed index {index}"
                )
            target_col = table.columns[index]
        else:
            expected_target_col_name = Table.valid_column_name(source_col.name)
            if expected_target_col_name not in table.columns_lookup:
                raise RuntimeError(
                    f"Target table '{table.name}' is missing a column {expected_target_col_name} ({source_col.name}) "
                    f"from source '{source.name}'. Failed index {index}"
                )
            target_col = table.columns_lookup[expected_target_col_name]
        column_mappings.append(ColumnMapping(source_col, target_col))
    return IngestionMapping(name, column_mappings)
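
# Sketch of the naming convention (arguments are assumed to be pre-built objects): the
# generated name is "<source>_from_<format>", which from_entities later splits on "_" to
# recover the data format.
def _example_mapping_name(table: Table, entity: DataEntity) -> str:
    mapping = IngestionMapping.generate_mapping(table, entity)
    return mapping.name  # e.g. "users_from_csv" for a csv-backed entity named "users"
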
def describe_database(self, database_name: str, **kwargs) -> Database:
    tables = []
    client = self.client_provider.get_engine_client()
    try:
        # run the schema query against the requested database (the previous hardcoded
        # "NetDefault" target looked like a leftover from a specific test cluster)
        tables_result = client.execute(
            database_name, LIST_COLUMNS_BY_TABLE.format(database_name=database_name)
        ).primary_results[0]
        for t in tables_result:
            columns = []
            for index, col in enumerate(t["Columns"]):
                # each column is serialized as "<name>:<dotnet type>"
                name, dotnet_type = col.split(":")
                columns.append(Column(name, index=index, data_type=dotnet_to_kusto_type(dotnet_type)))
            tables.append(Table(t["TableName"], columns))
    except KustoServiceError as e:
        resp = e.http_response.json()
        if "error" in resp and resp["error"]["@type"] == "Kusto.Data.Exceptions.EntityNotFoundException":
            raise DatabaseDoesNotExist(database_name)
        raise
    return Database(database_name, tables)
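
# Usage sketch: `backend` stands for whichever object exposes describe_database (its class is
# not shown in this file). Returns None instead of raising when the database is absent.
def _example_describe(backend, database_name: str):
    try:
        return backend.describe_database(database_name)
    except DatabaseDoesNotExist:
        return None
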
def from_entities(cls, entities: List[DataEntity]) -> IngestionManifest:
    # "{database_name}" is a literal placeholder; the real database is substituted when the
    # manifest is applied against a concrete cluster
    blank_db = Database("{database_name}", [])
    operations = []
    mappings: Dict[str, IngestionMapping] = {}
    for entity in entities:
        sources: Dict[str, List[str]] = defaultdict(list)
        target_table = Table.from_entity(entity)
        mapping = IngestionMapping.generate_mapping(target_table, entity)
        mappings[mapping.name] = mapping
        sources[mapping.name] = [f.path for f in entity.files]
        ingestion_sources = [
            # the mapping name ends with the data format ("<source>_from_<format>"),
            # so the format can be recovered with a split
            IngestionSource(files=s_files, mapping=s_mapping, data_format=s_mapping.split("_")[-1])
            for s_mapping, s_files in sources.items()
        ]
        operations.append(IngestionOp("{database_name}", ingestion_sources, target_table.name))
    return IngestionManifest([blank_db], list(mappings.values()), operations)
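
# Sketch (assumes from_entities is exposed as a classmethod on IngestionManifest, as its
# `cls` parameter suggests): each entity yields exactly one operation, since all of an
# entity's files are grouped under a single mapping.
def _example_manifest_from_entities(entities: List[DataEntity]) -> IngestionManifest:
    manifest = IngestionManifest.from_entities(entities)
    assert len(manifest.operations) == len(entities)
    return manifest
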
def table_from_blob(blob_uri, name=None, **kwargs) -> Table:
    url = urlparse(blob_uri)
    minimum_rows = 200  # reserved for row-based sampling; currently unused
    minimum_file_size = 1024 * 1024  # sample at most ~1MB of the blob for inference
    account, service_type, _, _, _ = url.hostname.split(".")
    container, blob_path = url.path.strip("/").split("/", 1)
    blob_path_without_extension, extension = os.path.splitext(blob_path)
    blobname = os.path.basename(blob_path_without_extension)
    blob_service = BlockBlobService(account)
    output_stream = io.BytesIO()
    blob_service.get_blob_to_stream(container, blob_path, output_stream, start_range=0, end_range=minimum_file_size)
    current_file_size = output_stream.tell()
    buffered_stream = io.StringIO()
    output_stream.seek(0)
    partial_data = current_file_size > minimum_file_size
    if partial_data:
        # since we only have partial data, read row by row before trying to parse as csv,
        # and drop the trailing line in case the download cut it mid-row
        while output_stream.tell() < current_file_size:
            line = output_stream.readline().decode("utf-8")
            if output_stream.tell() >= current_file_size and not line.endswith("\n"):
                break
            buffered_stream.write(line)
        buffered_stream.seek(0)
    else:
        # the whole blob fit in the sample; decode it so the csv reader always gets text
        buffered_stream = io.StringIO(output_stream.read().decode("utf-8"))
    columns = infer.columns_from_csv_stream(buffered_stream, includes_headers=kwargs.get("includes_headers", False))
    # use the bare blob name (no folders, no extension) as the default table name
    return Table(name or blobname, columns)
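
# Usage sketch with a hypothetical URI: only the first ~1MB is downloaded, so inference also
# works for very large blobs. BlockBlobService is built without credentials above, so the
# container must allow anonymous reads.
def _example_infer_blob_schema() -> Table:
    return table_from_blob(
        "https://myaccount.blob.core.windows.net/mycontainer/exports/users.csv",
        includes_headers=True,
    )
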
def table_from_file(filepath, name=None, top=200, **kwargs) -> Table:
    path = Path(filepath)
    if not path.is_file():
        raise ValueError("Given path is not a valid file.")
    df = DataFile.from_file(filepath, **kwargs, limit=top)
    return Table(name or path.stem, df.columns)
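
# Usage sketch ("data/users.csv" is a hypothetical path): only the first `top` rows are
# sampled, and the file stem ("users") becomes the table name when none is given.
def _example_infer_file_schema() -> Table:
    return table_from_file("data/users.csv", top=100)
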
def table_from_create_command(sql_statement: sqlparse.sql.Statement) -> Table:
    # the second sublist of a CREATE TABLE statement holds the parenthesized column definitions
    columns_part = list(sql_statement.get_sublists())[1].normalized
    columns = []
    # strip the surrounding parentheses, then parse "<name> <type> [modifiers...]" per column
    for index, col in enumerate(columns_part[1:-1].split(",")):
        col_name, col_type, *col_modifiers = col.strip().split(" ")
        kusto_type = sql_type_to_kusto_type(col_type)
        columns.append(Column(index=index, name=col_name, data_type=kusto_type))
    return Table(sql_statement.get_name(), columns)
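
# Sketch of parsing a CREATE TABLE statement (assumes sql_type_to_kusto_type understands
# these type names). Parenthesized types like varchar(200) would confuse the simple split
# above, so the example keeps to bare type names.
def _example_table_from_sql() -> Table:
    statement = sqlparse.parse("CREATE TABLE users (id INT, name VARCHAR, joined DATETIME)")[0]
    return table_from_create_command(statement)
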
def from_entities_and_database(
    cls,
    entities: List[DataEntity],
    target_database: Database,
    conflict_mode: SchemaConflictMode = SchemaConflictMode.Append,
) -> IngestionManifest:
    operations = []
    mappings: Dict[str, IngestionMapping] = {}
    for entity in entities:
        sources: Dict[str, List[str]] = defaultdict(list)
        source_table = Table.from_entity(entity)
        if entity.name not in target_database.tables_dict:
            if conflict_mode == SchemaConflictMode.Safe:
                raise SchemaConflictError(
                    f"SAFE MODE: Table '{source_table.name}' appears in source but not in target database '{target_database.name}'"
                )
            logger.info(
                f"Source has a table '{entity.name}' which target database '{target_database.name}' is missing. Will create it."
            )
            target_table = source_table
        else:
            target_table = target_database.tables_dict[entity.name]
        # TODO: currently assuming all files under an entity share the same format and schema.
        # This assumption can break, for example if each file contains only a part of the schema,
        # but such a case would require advanced logic to map all those partitions into a single entity.
        mapping = IngestionMapping.generate_mapping(target_table, entity)
        mappings[mapping.name] = mapping
        sources[mapping.name] = [f.path for f in entity.files]
        ingestion_sources = [
            IngestionSource(files=s_files, mapping=s_mapping, data_format=s_mapping.split("_")[-1])
            for s_mapping, s_files in sources.items()
        ]
        operations.append(IngestionOp(target_database.name, ingestion_sources, target_table.name))
    return IngestionManifest([target_database], list(mappings.values()), operations)
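
# Sketch (assumes from_entities_and_database is a classmethod on IngestionManifest, as its
# `cls` parameter suggests): Safe mode refuses to create missing tables, so a
# SchemaConflictError here signals that source and target schemas have drifted.
def _example_safe_manifest(entities: List[DataEntity], db: Database) -> IngestionManifest:
    return IngestionManifest.from_entities_and_database(entities, db, SchemaConflictMode.Safe)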