def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,
                  blob_account, file_size, tc, vm_uuid, deploy_uuid,
                  config_uuid):
    """Queue one JSON blob for ingestion into Azure Data Explorer and log it.

    A fresh UUID is generated as the ingestion source id, the blob is handed
    to ``KustoIngestClient.ingest_from_blob`` with an inline JSON column
    mapping, and the operation is recorded through ``save_COSMOS_log`` and
    the telemetry client.

    :param filepath: blob path inside the container (also used in log text)
    :param telemetry_block_blob_service: not used by this function
    :param container_name: container holding the blob
    :param blob_account: storage account name used to build the blob URL
    :param file_size: size passed to ``BlobDescriptor`` and to telemetry
    :param tc: telemetry client; must expose ``context.properties``,
        ``track_event``, ``track_trace`` and ``flush``
    :param vm_uuid: forwarded to ``save_COSMOS_log``
    :param deploy_uuid: forwarded to ``save_COSMOS_log``
    :param config_uuid: forwarded to ``save_COSMOS_log``
    :return: None
    """
    ingest_source_id = str(uuid.uuid4())

    connection = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    connection.authority_id = APP_AAD_TENANT_ID
    ingest_client = KustoIngestClient(connection)

    # Three extracted id columns plus the whole document as a dynamic column.
    column_mappings = [
        JsonColumnMapping(column, json_path, kusto_type)
        for column, json_path, kusto_type in (
            ("vm_uuid", "$.vm_uuid", "string"),
            ("deploy_uuid", "$.deployment_description[0].deploy_uuid", "string"),
            ("config_uuid", "$.vm_configuration[0].config_uuid", "string"),
            ("rawdata", "$", "dynamic"),
        )
    ]
    ingestion_properties = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.JSON,
        ingestionMapping=column_mappings,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        flushImmediately=IS_FLUSH_IMMEDIATELY,
    )
    print("Database {} Tabele {}".format(DATABASE, DESTINATION_TABLE))

    # CLEAN_FILE_TOKEN is appended verbatim after the file path.
    blob_url = "".join((
        "https://", blob_account, ".blob.core.windows.net/",
        container_name, "/", filepath, CLEAN_FILE_TOKEN,
    ))
    print(blob_url, ' ', str(file_size), ingest_source_id)

    # The descriptor carries the caller-supplied size and our source id.
    descriptor = BlobDescriptor(blob_url, file_size, ingest_source_id)
    ingest_client.ingest_from_blob(
        descriptor, ingestion_properties=ingestion_properties)

    tc.context.properties["ingest_source_id"] = ingest_source_id

    # Fixed placeholder statistics; this variant does not inspect the data.
    min_datatime = 0
    max_datatime = 0
    total_records = 1
    doc_id = save_COSMOS_log(
        vm_uuid, deploy_uuid, config_uuid, filepath, min_datatime,
        max_datatime, total_records, ingest_source_id, blob_account,
        container_name, tc)

    event_properties = {
        'FILE_PATH': filepath,
        'DOC_ID': doc_id,
        "SOURCE_ID": ingest_source_id,
    }
    event_measurements = {
        'TOTOAL_RECORDS': total_records,
        'FILE_SIZE': file_size,
        'MIN_DATETIME': min_datatime,
        'MAX_DATETIME': max_datatime,
    }
    tc.track_event(
        APP_INSIGHT_INGEST_EVENT_NAME, event_properties, event_measurements)

    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
def test_blob_info_csv_mapping(self):
    """Serialize a blob-info that carries an inline CSV column mapping and verify the JSON."""
    policy = ValidationPolicy(
        ValidationOptions.ValidateCsvInputConstantColumns,
        ValidationImplications.BestEffort,
    )
    csv_column = ColumnMapping("ColumnName", "cslDataType", ordinal=1)

    # Every optional ingestion property is populated so the serializer
    # has to emit all of them.
    ingestion_props = IngestionProperties(
        database="database",
        table="table",
        dataFormat=DataFormat.CSV,
        ingestionMapping=[csv_column],
        additionalTags=["tag"],
        ingestIfNotExists=["ingestIfNotExistTags"],
        ingestByTags=["ingestByTags"],
        dropByTags=["dropByTags"],
        flushImmediately=True,
        reportLevel=ReportLevel.DoNotReport,
        reportMethod=ReportMethod.Queue,
        validationPolicy=policy,
    )

    descriptor = BlobDescriptor("somepath", 10)
    info = _IngestionBlobInfo(
        descriptor, ingestion_props, auth_context="authorizationContextText")

    serialized = info.to_json()
    self._verify_ingestion_blob_info_result(serialized)
def ingest_to_ADX(filepath, filesize):
    """Queue a CSV blob for ingestion into Azure Data Explorer.

    The blob URL is assembled from the module-level storage account,
    container and token constants plus ``filepath``; ingestion uses the
    pre-created mapping named by ``DESTINATION_TABLE_COLUMN_MAPPING`` and
    passes ``ignoreFirstRecord='true'`` as an additional property.

    :param filepath: path of the blob inside ``SOURCE_CSV_CONTAINER``
    :param filesize: size handed to ``BlobDescriptor`` for the blob
    :return: None
    """
    # Only the ingestion endpoint is needed here; the former engine
    # connection string was built but never used, so it has been dropped.
    kcsb_ingest = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    kcsb_ingest.authority_id = AAD_TENANT_ID
    ingestion_client = KustoIngestClient(kcsb_ingest)

    ingestion_properties = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={'ignoreFirstRecord': 'true'},
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )

    # SOURCE_CSV_BLOB_TOKEN is appended verbatim after the file path.
    blob_path = (
        "https://" + SOURCE_CSV_BLOB_ACCOUNT + ".blob.core.windows.net/"
        + SOURCE_CSV_CONTAINER + "/" + filepath + SOURCE_CSV_BLOB_TOKEN
    )

    # The descriptor is given the caller-supplied size, not a fixed value.
    blob_descriptor = BlobDescriptor(blob_path, filesize)
    ingestion_client.ingest_from_blob(
        blob_descriptor, ingestion_properties=ingestion_properties)

    print('Done queuing up ingestion with Azure Data Explorer ' + filepath)
def test_blob_ingestion(self, mock_uuid, mock_put_message_in_queue, mock_aad):
    """Ingest from a blob URL through the managed streaming client and expect a queued result."""
    # Management endpoint calls are answered by the queued-request callback.
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=queued_request_callback,
        content_type="application/json",
    )

    client = ManagedStreamingIngestClient.from_dm_kcsb(
        "https://ingest-somecluster.kusto.windows.net")
    props = IngestionProperties(database="database", table="table")

    # Blob URL with a query string attached (note the trailing space in the literal).
    blob_path = (
        "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?sp=rl&st=2020-05-20T13"
        "%3A38%3A37Z&se=2020-05-21T13%3A38%3A37Z&sv=2019-10-10&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx "
    )
    descriptor = BlobDescriptor(blob_path, 1)

    result = client.ingest_from_blob(descriptor, ingestion_properties=props)

    assert result.status == IngestionStatus.QUEUED
    assert_queued_upload(
        mock_put_message_in_queue,
        mock_upload_blob_from_stream=None,
        expected_url="https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?",
    )
def test_blob_info_json_mapping(self):
    """Serialize a blob-info that carries an inline JSON column mapping and verify the JSON."""
    policy = ValidationPolicy(
        ValidationOptions.ValidateCsvInputConstantColumns,
        ValidationImplications.BestEffort,
    )
    json_column = JsonColumnMapping("ColumnName", "jsonpath", "datatype")

    # Every optional ingestion property is populated so the serializer
    # has to emit all of them.
    ingestion_props = IngestionProperties(
        database="database",
        table="table",
        dataFormat=DataFormat.json,
        mapping=[json_column],
        additionalTags=["tag"],
        ingestIfNotExists=["ingestIfNotExistTags"],
        ingestByTags=["ingestByTags"],
        dropByTags=["dropByTags"],
        flushImmediately=True,
        reportLevel=ReportLevel.DoNotReport,
        reportMethod=ReportMethod.QueueAndTable,
        validationPolicy=policy,
    )

    descriptor = BlobDescriptor("somepath", 10)
    info = _IngestionBlobInfo(
        descriptor,
        ingestion_props,
        deleteSourcesOnSuccess=True,
        authContext="authorizationContextText",
    )

    serialized = info.to_json()
    self._verify_ingestion_blob_info_result(serialized)
def ingest_from_blob(cls, ingest_client: QueuedIngestClient, database_name: str, table_name: str, blob_url: str, data_format: DataFormat, mapping_name: str = None) -> None:
    """
    Submit a single blob to the given client for ingestion.

    :param ingest_client: Client the blob is submitted to
    :param database_name: Target database
    :param table_name: Target table
    :param blob_url: URL of the blob to ingest
    :param data_format: Format of the data in the blob
    :param mapping_name: Name of the ingestion mapping to apply, if any
    """
    props = cls.create_ingestion_properties(
        database_name, table_name, data_format, mapping_name)

    # Tip 2 from the original sample: set an explicit source id (and log it)
    # to correlate this operation between the application and Kusto.
    source_id = str(uuid.uuid4())

    # Tip 1 from the original sample: size=0 is only a default. Passing the
    # uncompressed data size gives better batching; otherwise the service
    # determines the size itself with an extra s2s call, which may be
    # inaccurate for compressed files.
    descriptor = BlobDescriptor(blob_url, size=0, source_id=source_id)

    ingest_client.ingest_from_blob(descriptor, ingestion_properties=props)
def test_blob_info_json_mapping(self):
    """Serialize a blob-info that carries a JSON column mapping and verify the JSON."""
    policy = ValidationPolicy(
        ValidationOptions.ValidateCsvInputConstantColumns,
        ValidationImplications.BestEffort,
    )
    json_column = ColumnMapping("ColumnName", "datatype", path="jsonpath")

    # Every optional ingestion property is populated so the serializer
    # has to emit all of them.
    ingestion_props = IngestionProperties(
        database="database",
        table="table",
        data_format=DataFormat.JSON,
        column_mappings=[json_column],
        additional_tags=["tag"],
        ingest_if_not_exists=["ingestIfNotExistTags"],
        ingest_by_tags=["ingestByTags"],
        drop_by_tags=["dropByTags"],
        flush_immediately=True,
        report_level=ReportLevel.DoNotReport,
        report_method=ReportMethod.Queue,
        validation_policy=policy,
    )

    descriptor = BlobDescriptor("somepath", 10)
    info = IngestionBlobInfo(
        descriptor, ingestion_props, auth_context="authorizationContextText")

    serialized = info.to_json()
    self._verify_ingestion_blob_info_result(serialized)
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,
                  blob_account, tc):
    """Queue one CSV blob for ingestion into Azure Data Explorer and log it.

    The ids, file size and record statistics come from
    ``get_uuids_from_csv``; the three ids are combined into a drop-by tag.
    After the blob is handed to ``KustoIngestClient.ingest_from_blob`` the
    operation is recorded through ``save_COSMOS_log`` and the telemetry
    client.

    :param filepath: blob path inside the container (also used in log text)
    :param telemetry_block_blob_service: blob service passed on to
        ``get_uuids_from_csv``
    :param container_name: container passed to ``get_uuids_from_csv`` and
        ``save_COSMOS_log``
    :param blob_account: storage account name passed to ``save_COSMOS_log``
    :param tc: telemetry client; must expose ``context.properties``,
        ``track_event``, ``track_trace`` and ``flush``
    :return: None
    """
    ingest_source_id = str(uuid.uuid4())

    connection = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    connection.authority_id = APP_AAD_TENANT_ID

    (vm_uuid, config_uuid, deploy_uuid, file_size,
     min_datatime, max_datatime, total_records) = get_uuids_from_csv(
        telemetry_block_blob_service, container_name, filepath)

    # One tag per (vm, config, deployment) combination.
    drop_by_tag = '_'.join((vm_uuid, config_uuid, deploy_uuid))

    ingest_client = KustoIngestClient(connection)
    ingestion_properties = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={
            'ignoreFirstRecord': 'true',
            'reportMethod': 'QueueAndTable',
        },
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=[drop_by_tag],
        flushImmediately=IS_FLUSH_IMMEDIATELY,
    )

    # NOTE(review): the URL is built from the SOURCE_OSMETRICS_* constants,
    # not from the blob_account / container_name arguments - confirm that
    # this is intended.
    blob_url = "".join((
        "https://", SOURCE_OSMETRICS_BLOB_ACCOUNT, ".blob.core.windows.net/",
        SOURCE_OSMETRICS_CONTAINER, "/", filepath, SOURCE_OSMETRICS_FILE_TOKEN,
    ))

    # The descriptor carries the size reported by get_uuids_from_csv.
    descriptor = BlobDescriptor(blob_url, file_size, ingest_source_id)
    ingest_client.ingest_from_blob(
        descriptor, ingestion_properties=ingestion_properties)

    tc.context.properties["ingest_source_id"] = ingest_source_id

    doc_id = save_COSMOS_log(
        vm_uuid, deploy_uuid, config_uuid, filepath, min_datatime,
        max_datatime, total_records, ingest_source_id, blob_account,
        container_name, tc)

    event_properties = {
        'FILE_PATH': filepath,
        'DOC_ID': doc_id,
        "SOURCE_ID": ingest_source_id,
    }
    event_measurements = {
        'TOTOAL_RECORDS': total_records,
        'FILE_SIZE': file_size,
        'MIN_DATETIME': min_datatime,
        'MAX_DATETIME': max_datatime,
    }
    tc.track_event(
        APP_INSIGHT_INGEST_EVENT_NAME, event_properties, event_measurements)

    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
def ingestBlob(client,db,blob,properties):
    """Submit one blob, described by a dict, to ``client`` for ingestion.

    ``blob`` must provide the keys ``table``, ``format``,
    ``ingestionMapping``, ``path``, ``size`` and ``name``. Any exception
    raised by the ingest call is logged as an error and not re-raised.

    :param client: object exposing ``ingest_from_blob``
    :param db: target database name
    :param blob: dict describing the blob (see keys above)
    :param properties: passed through as ``additionalProperties``
    :return: None
    """
    ingestion_props = IngestionProperties(
        database=db,
        table=blob['table'],
        dataFormat=DataFormat(blob['format']),
        mappingReference=blob['ingestionMapping'],
        additionalProperties=properties,
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )
    descriptor = BlobDescriptor(blob['path'], blob['size'])

    try:
        client.ingest_from_blob(descriptor, ingestion_properties=ingestion_props)
        success_text = "Blob %s ingested succesfully." % blob['name']
        logging.info(success_text)
    except Exception as err:
        # Best effort: report the failure and let the caller carry on.
        failure_text = "Error ingesting blob %s: %s" % (blob['name'], err)
        logging.error(failure_text)
def test_uuid_blob_descriptor(self):
    """Check how BlobDescriptor fills in, accepts and rejects source ids."""
    blob_path = "dummy"

    # No source id given: one must be generated and parse as a UUID.
    generated = BlobDescriptor(blob_path)
    assert generated.source_id
    assert generated.source_id != TestDescriptors.TEST_UUID
    assert uuid.UUID(str(generated.source_id), version=4)

    # The string form and the UUID form must both compare equal to TEST_UUID.
    for supplied in (TestDescriptors.TEST_UUID_STR, TestDescriptors.TEST_UUID):
        explicit = BlobDescriptor(blob_path, source_id=supplied)
        assert explicit.source_id == TestDescriptors.TEST_UUID

    # An invalid id is rejected with ValueError.
    with pytest.raises(ValueError):
        BlobDescriptor(blob_path, source_id=TestDescriptors.INVALID_UUID)
def ingest(file, size):
    """Submit one blob to the module-level ``client`` for ingestion into GitHub.GithubEvent.

    The blob URL itself is used as the ingest-by / ingest-if-not-exists tag,
    and characters 57..66 of the URL are used as the drop-by tag.

    :param file: blob URL
    :param size: size passed to ``BlobDescriptor``
    :return: None
    """
    # NOTE(review): the [57:67] slice depends on a fixed URL layout;
    # presumably it picks out a date portion - confirm against the caller.
    drop_tag = file[57:67]

    ingestion_props = IngestionProperties(
        database="GitHub",
        table="GithubEvent",
        dataFormat=DataFormat.json,
        mapping=mapping,
        ingestIfNotExists=[file],
        ingestByTags=[file],
        dropByTags=[drop_tag],
    )
    descriptor = BlobDescriptor(file, size)

    client.ingest_from_blob(descriptor, ingestion_props)
    print("ingested {}".format(file))
def ingest_to_adx(file_path, file_size, target_database, target_table,
                  msg_time, modification_time):
    """
    Queue the given blob for ingestion into ADX.

    Builds the blob URL by appending SOURCE_TELEMETRY_FILE_TOKEN to
    ``file_path`` (inserting a '?' when the token does not already start
    with one), prepares the ingestion properties and submits the blob through
    the module-level KUSTO_INGESTION_CLIENT.

    :param file_path: The full path of blob file
    :param file_size: The full size of blob file
    :param target_database: The target database
    :param target_table: The target table
    :param msg_time: The msg_time from eventgrid; must support ``strftime``
    :param modification_time: The modification time of the file; must
        support ``strftime``
    :return: The generated ingestion source id (a UUID4 string)
    """
    logging.info(f'{LOG_MESSAGE_HEADER} start to ingest to adx')
    ingest_source_id = str(uuid.uuid4())

    # The token is appended as the URL query string.
    if SOURCE_TELEMETRY_FILE_TOKEN.startswith('?'):
        blob_path = file_path + SOURCE_TELEMETRY_FILE_TOKEN
    else:
        blob_path = file_path + '?' + SOURCE_TELEMETRY_FILE_TOKEN

    # NOTE(review): blob_path includes SOURCE_TELEMETRY_FILE_TOKEN; if that
    # is a SAS token it ends up in the logs below - confirm this is acceptable.
    logging.info(f"{LOG_MESSAGE_HEADER} blob_path:{blob_path}, ingest_source_id:{ingest_source_id}")
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which embedded the source indentation in the message.
    logging.info(
        '%s FILEURL : %s, INGESTION URL: %s, Database: %s, '
        'Table: %s, FILESIZE: %s, msg_time: %s, modification_time: %s',
        LOG_MESSAGE_HEADER, blob_path, INGESTION_SERVER_URI,
        target_database, target_table, file_size, msg_time, modification_time)

    # NOTE(review): the keyword names below mix camelCase and snake_case;
    # confirm them against the installed azure-kusto-ingest version.
    ingestion_properties = IngestionProperties(
        database=target_database,
        table=target_table,
        dataFormat=DataFormat.JSON,
        ingestion_mapping_reference=INGESTION_MAPPING,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={
            'reportMethod': 'QueueAndTable',
            "creationTime": msg_time.strftime("%Y-%m-%d %H:%M"),
            "modificationTime": modification_time.strftime("%Y-%m-%d %H:%M"),
        },
        flushImmediately=IS_FLUSH_IMMEDIATELY)

    # The descriptor carries the caller-supplied size and our source id.
    blob_descriptor = BlobDescriptor(blob_path, file_size, ingest_source_id)

    logging.info(f"{LOG_MESSAGE_HEADER} start to ingest to queue")
    start_time = time.time()
    KUSTO_INGESTION_CLIENT.ingest_from_blob(
        blob_descriptor, ingestion_properties=ingestion_properties)
    logging.info(f"{LOG_MESSAGE_HEADER} ingest process time {time.time()-start_time}")

    return ingest_source_id
def test_blob_json_mapping_reference(self):
    """Serialize a blob-info that names a JSON mapping by reference and verify the JSON."""
    policy = ValidationPolicy(
        ValidationOptions.ValidateCsvInputConstantColumns,
        ValidationImplications.BestEffort,
    )

    # The mapping is given by name only; no inline column list is supplied.
    ingestion_props = IngestionProperties(
        database="database",
        table="table",
        dataFormat=DataFormat.JSON,
        mappingReference="jsonMappingReference",
        additionalTags=["tag"],
        ingestIfNotExists=["ingestIfNotExistTags"],
        ingestByTags=["ingestByTags"],
        dropByTags=["dropByTags"],
        flushImmediately=True,
        reportLevel=ReportLevel.DoNotReport,
        reportMethod=ReportMethod.Queue,
        validationPolicy=policy,
    )

    descriptor = BlobDescriptor("somepath", 10)
    info = _IngestionBlobInfo(
        descriptor, ingestion_props, auth_context="authorizationContextText")

    serialized = info.to_json()
    self._verify_ingestion_blob_info_result(serialized)
create_mapping_command) ingestion_client = KustoIngestClient(kcsb_ingest) # All ingestion properties: https://docs.microsoft.com/en-us/azure/kusto/management/data-ingestion/#ingestion-properties ingestion_props = IngestionProperties( reportLevel=reportLevel, database=kusto_database, table=destination_table, dataFormat=DataFormat.csv, mappingReference=column_mapping_name, additionalProperties={'ignoreFirstRecord': 'true'}) blobProps = BlockBlobService.get_blob_properties(blob_service, container, file_name).properties file_size = blobProps.content_length blob_descriptor = BlobDescriptor( blob_path, file_size) # Raw size of the data in bytes ingestion_client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props) print( """Queued blob '{FILE_NAME}' ({FILE_SIZE} bytes) for ingestion into ADX table '{DESTINATION_TABLE}'""" .format(FILE_NAME=file_name, FILE_SIZE=file_size, DESTINATION_TABLE=destination_table)) # query = """{} | count""".format(destination_table) # response = kusto_client.execute_query(kusto_database, query) # count_query_df = dataframe_from_result_table(response.primary_results[0])
# in case status update for success are also required # report_level=ReportLevel.FailuresAndSuccesses, # in case a mapping is required # ingestion_mapping_reference="{json_mapping_that_already_exists_on_table}" # ingestion_mapping_type=IngestionMappingType.JSON ) # ingest from file file_descriptor = FileDescriptor( "{filename}.csv", 3333) # 3333 is the raw size of the data in bytes. client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props) client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props) # ingest from blob blob_descriptor = BlobDescriptor( "https://{path_to_blob}.csv.gz?sp=rl&st=2020-05-20T13:38:37Z&se=2020-05-21T13:38:37Z&sv=2019-10-10&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", 10, ) # 10 is the raw size of the data in bytes. client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props) # ingest from dataframe import pandas fields = ["id", "name", "value"] rows = [[1, "abc", 15.3], [2, "cde", 99.9]] df = pandas.DataFrame(data=rows, columns=fields) client.ingest_from_dataframe(df, ingestion_properties=ingestion_props) # ingest a whole folder. import os
dataFormat=DataFormat.CSV, # in case status update for success are also required # reportLevel=ReportLevel.FailuresAndSuccesses, # in case a mapping is required # ingestionMappingReference="{json_mapping_that_already_exists_on_table}" # ingestionMappingType=IngestionMappingType.Json ) # ingest from file file_descriptor = FileDescriptor("{filename}.csv", 3333) # 3333 is the raw size of the data in bytes. client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props) client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props) # ingest from blob blob_descriptor = BlobDescriptor("https://{path_to_blob}.csv.gz?sas", 10) # 10 is the raw size of the data in bytes. client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props) # ingest from dataframe import pandas fields = ["id", "name", "value"] rows = [[1, "abc", 15.3], [2, "cde", 99.9]] df = pandas.DataFrame(data=rows, columns=fields) client.ingest_from_dataframe(df, ingestion_properties=ingestion_props) # ingest a whole folder. import os
# Sample: ingest local files and blobs with KustoIngestClient.

# Target database/table and source format shared by every call below.
INGESTION_PROPERTIES = IngestionProperties(
    database="database name",
    table="table name",
    dataFormat=DataFormat.csv,
)

# Client built directly from the ingestion endpoint URL ...
INGEST_CLIENT = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net"
)

# ... or from a connection string with AAD application-key credentials.
# This second assignment replaces the client created above.
KCSB = KustoConnectionStringBuilder.with_aad_application_key_authentication(
    "https://ingest-<clustername>.kusto.windows.net",
    "aad app id",
    "secret",
)
INGEST_CLIENT = KustoIngestClient(KCSB)

# Files can be passed as descriptors (path plus raw data size in bytes,
# 3333 here) ...
FILE_DESCRIPTOR = FileDescriptor("E:\\filePath.csv", 3333)
INGEST_CLIENT.ingest_from_multiple_files(
    [FILE_DESCRIPTOR],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES,
)

# ... or as plain path strings.
INGEST_CLIENT.ingest_from_multiple_files(
    ["E:\\filePath.csv"],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES,
)

# Blobs are passed as descriptors: URL plus raw data size in bytes (10 here).
BLOB_DESCRIPTOR = BlobDescriptor("https://path-to-blob.csv.gz?sas", 10)
INGEST_CLIENT.ingest_from_multiple_blobs(
    [BLOB_DESCRIPTOR],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES,
)
# Sample: ingest local files and blobs with KustoIngestClient.

# Target database/table and source format shared by every call below.
ingestion_properties = IngestionProperties(
    database="database name",
    table="table name",
    dataFormat=DataFormat.csv,
)

# Client built from the ingestion endpoint URL alone ...
ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net"
)

# ... or with AAD application credentials passed as keyword arguments.
# This second assignment replaces the client created above.
ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net",
    client_id="aad app id",
    client_secret="secret",
)

# Files can be passed as descriptors (path plus raw data size in bytes,
# 3333 here) ...
file_descriptor = FileDescriptor("E:\\filePath.csv", 3333)
ingest_client.ingest_from_multiple_files(
    [file_descriptor],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties,
)

# ... or as plain path strings.
ingest_client.ingest_from_multiple_files(
    ["E:\\filePath.csv"],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties,
)

# Blobs are passed as descriptors: URL plus raw data size in bytes (10 here).
blob_descriptor = BlobDescriptor("https://path-to-blob.csv.gz?sas", 10)
ingest_client.ingest_from_multiple_blobs(
    [blob_descriptor],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties,
)