def ingest_to_ADX(filepath, filesize):
    """Queue one CSV blob for ingestion into Azure Data Explorer.

    Args:
        filepath: Path of the blob inside ``SOURCE_CSV_CONTAINER``.
        filesize: Raw size of the blob data in bytes, forwarded to
            ``BlobDescriptor``.
    """
    kcsb_ingest = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    kcsb_ingest.authority_id = AAD_TENANT_ID
    ingestion_client = KustoIngestClient(kcsb_ingest)

    ingestion_properties = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        # Ask the service to skip the first record (presumably a header row).
        additionalProperties={'ignoreFirstRecord': 'true'},
        reportLevel=ReportLevel.FailuresAndSuccesses)

    # The access token is appended verbatim to the blob URL.
    blob_path = ("https://" + SOURCE_CSV_BLOB_ACCOUNT +
                 ".blob.core.windows.net/" + SOURCE_CSV_CONTAINER + "/" +
                 filepath + SOURCE_CSV_BLOB_TOKEN)
    blob_descriptor = BlobDescriptor(blob_path, filesize)

    ingestion_client.ingest_from_blob(
        blob_descriptor, ingestion_properties=ingestion_properties)
    print('Done queuing up ingestion with Azure Data Explorer ' + filepath)
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name, blob_account, file_size, tc, vm_uuid, deploy_uuid, config_uuid):
    """Queue a JSON blob for ingestion and record the request.

    A fresh source id is generated for the blob, the ingestion is queued,
    a log document is saved through ``save_COSMOS_log`` and an event plus a
    trace are sent through the telemetry client ``tc``.

    ``telemetry_block_blob_service`` is accepted but not used here.
    """
    source_id = str(uuid.uuid4())

    connection = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI)
    connection.authority_id = APP_AAD_TENANT_ID
    client = KustoIngestClient(connection)

    # The ids are lifted into their own columns; the whole document is kept
    # in "rawdata".
    column_mappings = [
        JsonColumnMapping("vm_uuid", "$.vm_uuid", "string"),
        JsonColumnMapping("deploy_uuid", "$.deployment_description[0].deploy_uuid", "string"),
        JsonColumnMapping("config_uuid", "$.vm_configuration[0].config_uuid", "string"),
        JsonColumnMapping("rawdata", "$", "dynamic"),
    ]
    properties = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.JSON,
        ingestionMapping=column_mappings,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        flushImmediately=IS_FLUSH_IMMEDIATELY,
    )
    print("Database {} Tabele {}".format(DATABASE, DESTINATION_TABLE))

    blob_url = "".join((
        "https://", blob_account, ".blob.core.windows.net/",
        container_name, "/", filepath, CLEAN_FILE_TOKEN,
    ))
    print(blob_url, ' ', str(file_size), source_id)
    client.ingest_from_blob(
        BlobDescriptor(blob_url, file_size, source_id),
        ingestion_properties=properties,
    )

    tc.context.properties["ingest_source_id"] = source_id

    # Fixed placeholder values are logged for the time range and row count.
    earliest = 0
    latest = 0
    record_count = 1
    doc_id = save_COSMOS_log(
        vm_uuid, deploy_uuid, config_uuid, filepath, earliest, latest,
        record_count, source_id, blob_account, container_name, tc)

    event_properties = {'FILE_PATH': filepath, 'DOC_ID': doc_id, "SOURCE_ID": source_id}
    event_measurements = {
        'TOTOAL_RECORDS': record_count,
        'FILE_SIZE': file_size,
        'MIN_DATETIME': earliest,
        'MAX_DATETIME': latest,
    }
    tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME, event_properties, event_measurements)

    message = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, source_id)
    print(message)
    tc.track_trace(message)
    tc.flush()
def test_simple_ingest_from_dataframe(self, mock_pid, mock_time, mock_uuid, mock_put_message_in_queue, mock_create_blob_from_path):
    """Ingest a small DataFrame and check the queued message and uploaded blob."""
    ingest_url = "https://ingest-somecluster.kusto.windows.net"
    responses.add_callback(
        responses.POST,
        ingest_url + "/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient(ingest_url)
    props = IngestionProperties(database="database", table="table", dataFormat=DataFormat.csv)

    from pandas import DataFrame

    frame = DataFrame(
        data=[[1, "abc", 15.3], [2, "cde", 99.9]],
        columns=["id", "name", "value"],
    )
    ingest_client.ingest_from_dataframe(frame, ingestion_properties=props)

    # Exactly one message must have been queued.
    assert mock_put_message_in_queue.call_count == 1
    queue_call = mock_put_message_in_queue.call_args_list[0]
    queue_kwargs = queue_call[1]
    assert queue_kwargs["queue_name"] == "readyforaggregation-secured"

    # The message content is base64-encoded JSON.
    raw_content = queue_kwargs["content"].encode("utf-8")
    message = json.loads(base64.b64decode(raw_content).decode("utf-8"))

    blob_name = "database__table__1111-111111-111111-1111__df_100_64.csv.gz"
    assert (
        message["BlobPath"]
        == "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__df_100_64.csv.gz?sas"
    )
    assert message["DatabaseName"] == "database"
    assert message["IgnoreSizeLimit"] == False
    assert message["AdditionalProperties"]["format"] == "csv"
    assert message["FlushImmediately"] == False
    assert message["TableName"] == "table"
    assert message["RawDataSize"] > 0
    assert message["RetainBlobOnSuccess"] == True

    # The DataFrame is written to a temp file that is then uploaded as a blob.
    upload_call = mock_create_blob_from_path.call_args_list[0]
    upload_kwargs = upload_call[1]
    import tempfile

    expected_local_path = os.path.join(tempfile.gettempdir(), "df_100_64.csv.gz")
    assert upload_kwargs["container_name"] == "tempstorage"
    assert upload_kwargs["file_path"] == expected_local_path
    assert (upload_kwargs["blob_name"] == blob_name)
def test_sanity_ingest_from_file(self, mock_uuid, mock_put_message_in_queue, mock_create_blob_from_stream, mock_aad):
    """Ingest the sample CSV file and check the queued message and uploaded stream."""
    ingest_url = "https://ingest-somecluster.kusto.windows.net"
    responses.add_callback(
        responses.POST,
        ingest_url + "/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient(ingest_url)
    props = IngestionProperties(database="database", table="table", dataFormat=DataFormat.CSV)

    # Let the test run from any directory inside the repository: only the
    # path components not already present in the cwd are appended.
    cwd = os.getcwd()
    dataset_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    remaining_parts = [part for part in dataset_parts if part not in cwd]
    dataset_path = os.path.join(cwd, *remaining_parts)

    ingest_client.ingest_from_file(dataset_path, ingestion_properties=props)

    # Exactly one message must have been queued.
    assert mock_put_message_in_queue.call_count == 1
    queue_call = mock_put_message_in_queue.call_args_list[0]
    queue_kwargs = queue_call[1]
    assert queue_kwargs["queue_name"] == "readyforaggregation-secured"

    # The message content is base64-encoded JSON.
    raw_content = queue_kwargs["content"].encode("utf-8")
    message = json.loads(base64.b64decode(raw_content).decode("utf-8"))

    blob_name = "database__table__1111-111111-111111-1111__dataset.csv.gz"
    expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?sas"
    assert message["BlobPath"] == expected_url
    assert message["DatabaseName"] == "database"
    assert message["IgnoreSizeLimit"] == False
    assert message["AdditionalProperties"]["format"] == "csv"
    assert message["FlushImmediately"] == False
    assert message["TableName"] == "table"
    assert message["RawDataSize"] > 0
    assert message["RetainBlobOnSuccess"] == True

    # The file content is uploaded from an in-memory stream.
    upload_call = mock_create_blob_from_stream.call_args_list[0]
    upload_kwargs = upload_call[1]
    assert upload_kwargs["container_name"] == "tempstorage"
    assert type(upload_kwargs["stream"]) == io.BytesIO
    assert upload_kwargs["blob_name"] == blob_name
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name, blob_account, tc):
    """Queue a CSV blob for ingestion and record the request.

    The ids, file size, time range and record count come from
    ``get_uuids_from_csv``. After queuing, a log document is saved through
    ``save_COSMOS_log`` and an event plus a trace are sent through the
    telemetry client ``tc``.
    """
    source_id = str(uuid.uuid4())

    connection = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    connection.authority_id = APP_AAD_TENANT_ID

    (vm_uuid, config_uuid, deploy_uuid, file_size,
     min_datatime, max_datatime, total_records) = get_uuids_from_csv(
        telemetry_block_blob_service, container_name, filepath)

    # One drop-by tag per vm/config/deployment combination.
    drop_tag = '_'.join((vm_uuid, config_uuid, deploy_uuid))

    client = KustoIngestClient(connection)
    properties = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={
            'ignoreFirstRecord': 'true',
            'reportMethod': 'QueueAndTable',
        },
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=[drop_tag],
        flushImmediately=IS_FLUSH_IMMEDIATELY,
    )

    # NOTE(review): the URL is built from the SOURCE_OSMETRICS_* constants,
    # not from the blob_account / container_name arguments - confirm that
    # this is intended.
    blob_url = "".join((
        "https://", SOURCE_OSMETRICS_BLOB_ACCOUNT, ".blob.core.windows.net/",
        SOURCE_OSMETRICS_CONTAINER, "/", filepath, SOURCE_OSMETRICS_FILE_TOKEN,
    ))
    client.ingest_from_blob(
        BlobDescriptor(blob_url, file_size, source_id),
        ingestion_properties=properties,
    )

    tc.context.properties["ingest_source_id"] = source_id
    doc_id = save_COSMOS_log(
        vm_uuid, deploy_uuid, config_uuid, filepath, min_datatime,
        max_datatime, total_records, source_id, blob_account,
        container_name, tc)

    event_properties = {
        'FILE_PATH': filepath,
        'DOC_ID': doc_id,
        "SOURCE_ID": source_id,
    }
    event_measurements = {
        'TOTOAL_RECORDS': total_records,
        'FILE_SIZE': file_size,
        'MIN_DATETIME': min_datatime,
        'MAX_DATETIME': max_datatime,
    }
    tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME, event_properties, event_measurements)

    message = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, source_id)
    print(message)
    tc.track_trace(message)
    tc.flush()
def __init__(self):
    """Create the ingest client, the CSV ingestion properties and the buffer file name."""
    ppe_client = KustoIngestClient(
        credentials.kusto_ppe_ingest_connection,
        client_id=credentials.kusto_application_id,
        client_secret=credentials.kusto_application_key,
    )
    self.ingest_client = ppe_client

    alert_event_properties = IngestionProperties(
        database="BingAdsUCM",
        table="PerfIcMAlertEvent",
        dataFormat=DataFormat.csv,
    )
    self.properties = alert_event_properties

    self.log_buffer_file = "kusto_log_buffer.csv"
def Ingest(Tag):
    """Recreate the RepoContributors table and queue a local file for ingestion.

    The table is dropped if it exists, created again together with its CSV
    mapping, and the merged log file is then queued for ingestion, tagged
    with ``Tag`` as both ingest-by and drop-by tag.

    Args:
        Tag: Tag value applied to the ingested extents.

    Returns:
        Always ``1``.
    """
    # Settings
    ingest_cluster = "https://ingest-cgadataout.kusto.windows.net"
    kusto_cluster = "https://cgadataout.kusto.windows.net"
    database = "DevRelWorkArea"
    destination_table = "RepoContributors"
    mapping_name = "RepoContributors_CSV_Mapping"

    # Create table
    kusto_client = KustoClient(
        KustoConnectionStringBuilder.with_aad_device_authentication(kusto_cluster))
    kusto_client.execute_mgmt(database, ".drop table RepoContributors ifexists")
    kusto_client.execute_mgmt(
        database,
        ".create table RepoContributors (Article: string, Contributors: int64, Data: string)")
    print("RepoContributors table is created")

    # Create mapping
    create_mapping_command = """.create table RepoContributors ingestion csv mapping 'RepoContributors_CSV_Mapping' '[{"Name": "Article","datatype": "string","Ordinal": 0},{"Name": "Contributors","datatype": "int64","Ordinal": 1},{"Name": "Data","datatype": "string","Ordinal": 2}]'"""
    kusto_client.execute_mgmt(database, create_mapping_command)
    print("mapping is created")

    # Ingest. The authentication method is taken from the chosen
    # KustoConnectionStringBuilder.
    ingestion_props = IngestionProperties(
        database=database,
        table=destination_table,
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference=mapping_name,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'})
    ingest_client = KustoIngestClient(
        KustoConnectionStringBuilder.with_aad_device_authentication(ingest_cluster))

    # 3333 is passed as the raw size of the data in bytes.
    # NOTE(review): this is a fixed number, not the measured size of the
    # file - confirm that it is acceptable.
    file_descriptor = FileDescriptor(
        r"D:\test\Results\log_data_merge\merge_microsoftdocs_sql-docs-pr.txt",
        3333)
    ingest_client.ingest_from_file(file_descriptor,
                                   ingestion_properties=ingestion_props)
    return 1
def test_sanity_ingest_from_file(self, mock_uuid, mock_put_message_in_queue, mock_upload_blob_from_stream, mock_aad):
    """Ingest the sample CSV file and check the queued message and uploaded data."""
    ingest_url = "https://ingest-somecluster.kusto.windows.net"
    responses.add_callback(
        responses.POST,
        ingest_url + "/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient(ingest_url)
    props = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    # Let the test run from any directory inside the repository: only the
    # path components not already present in the cwd are appended.
    cwd = os.getcwd()
    dataset_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    remaining_parts = [part for part in dataset_parts if part not in cwd]
    dataset_path = os.path.join(cwd, *remaining_parts)

    ingest_client.ingest_from_file(dataset_path, ingestion_properties=props)

    # Exactly one message must have been queued.
    assert mock_put_message_in_queue.call_count == 1
    queue_call = mock_put_message_in_queue.call_args_list[0]
    message = json.loads(queue_call[1]["content"])

    # Only the URL prefix is compared: the query string can change order,
    # so the test just checks that something follows the "?".
    url_prefix = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?"
    blob_path = message["BlobPath"]
    assert blob_path.startswith(url_prefix) is True
    assert len(blob_path) > len(url_prefix)

    assert message["DatabaseName"] == "database"
    assert message["IgnoreSizeLimit"] is False
    assert message["AdditionalProperties"]["format"] == "csv"
    assert message["FlushImmediately"] is False
    assert message["TableName"] == "table"
    assert message["RawDataSize"] > 0
    assert message["RetainBlobOnSuccess"] is True

    upload_call = mock_upload_blob_from_stream.call_args_list[0]
    upload_kwargs = upload_call[1]
    assert type(upload_kwargs["data"]) == io.BytesIO
def test_simple_ingest_from_dataframe(self, mock_pid, mock_time, mock_uuid, mock_put_message_in_queue, mock_upload_blob_from_stream):
    """Ingest a small DataFrame and check the queued message and uploaded data."""
    ingest_url = "https://ingest-somecluster.kusto.windows.net"
    responses.add_callback(
        responses.POST,
        ingest_url + "/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient(ingest_url)
    props = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    from pandas import DataFrame

    df = DataFrame(
        data=[[1, "abc", 15.3], [2, "cde", 99.9]],
        columns=["id", "name", "value"],
    )
    ingest_client.ingest_from_dataframe(df, ingestion_properties=props)

    # Exactly one message must have been queued.
    assert mock_put_message_in_queue.call_count == 1
    queue_call = mock_put_message_in_queue.call_args_list[0]
    message = json.loads(queue_call[1]["content"])

    # The expected blob name embeds id(df). Only the URL prefix is compared:
    # the query string can change order, so the test just checks that
    # something follows the "?".
    url_prefix = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__df_{0}_100_64.csv.gz?".format(
        id(df))
    blob_path = message["BlobPath"]
    assert blob_path.startswith(url_prefix) is True
    assert len(blob_path) > len(url_prefix)

    assert message["DatabaseName"] == "database"
    assert message["IgnoreSizeLimit"] is False
    assert message["AdditionalProperties"]["format"] == "csv"
    assert message["FlushImmediately"] is False
    assert message["TableName"] == "table"
    assert message["RawDataSize"] > 0
    assert message["RetainBlobOnSuccess"] is True

    upload_call = mock_upload_blob_from_stream.call_args_list[0]
    upload_kwargs = upload_call[1]
    assert type(upload_kwargs["data"]) == io.BufferedReader
def authenticate_to_kusto_ingress(cluster):
    """Return a KustoIngestClient for ``cluster`` using AAD application-key auth.

    The credentials come from the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET`` and ``AUTHORITY_ID``.
    """
    # The client takes its authentication method from the connection string
    # builder it is given.
    connection = KustoConnectionStringBuilder.with_aad_application_key_authentication(
        cluster,
        CLIENT_ID,
        CLIENT_SECRET,
        AUTHORITY_ID,
    )
    return KustoIngestClient(connection)
def getKustoClient(kcsb):
    """Create a KustoIngestClient, returning ``None`` if construction fails.

    Args:
        kcsb: Connection string (builder) passed to ``KustoIngestClient``.

    Returns:
        The new client, or ``None`` when creating it raised. The failure is
        logged at error level and deliberately not re-raised, so callers
        must check the result for ``None``.
    """
    try:
        return KustoIngestClient(kcsb)
    except Exception as e:
        # Best effort by design: report the problem and let the caller decide.
        logging.error("Could not initialize Kusto Client:%s", e)
        return None
def test_pop_unbalanced_queues(self):
    """Pop failure messages when only one of the two failure queues has any."""
    client = KustoIngestClient("some-cluster")

    def failed_messages(queue_name, messages_per_page=1):
        # Only queues with "1" in their name have messages.
        if "1" not in queue_name:
            return []
        return [mock_message(success=False) for _ in range(messages_per_page)]

    fake_receive = fake_receive_factory(failed_messages)

    patch_success_qs = mock.patch.object(
        client._resource_manager, "get_successful_ingestions_queues")
    patch_failed_qs = mock.patch.object(
        client._resource_manager, "get_failed_ingestions_queues")
    patch_receive = mock.patch.object(
        QueueClient, "receive_messages", autospec=True, side_effect=fake_receive)
    patch_delete = mock.patch.object(QueueClient, "delete_message", return_value=None)

    with patch_success_qs, patch_failed_qs as mocked_get_failed_qs, \
            patch_receive as q_receive_mock, patch_delete:
        failed_queue_1 = _ResourceUri(
            "mocked_storage_account_f1", OBJECT_TYPE, "queue",
            "mocked_qf_1_name", ENDPOINT_SUFFIX)
        failed_queue_2 = _ResourceUri(
            "mocked_storage_account_f2", OBJECT_TYPE, "queue",
            "mocked_qf_2_name", ENDPOINT_SUFFIX)
        mocked_get_failed_qs.return_value = [failed_queue_1, failed_queue_2]

        status_queues = KustoIngestStatusQueues(client)
        popped = status_queues.failure.pop(6)

        assert len(popped) == 6
        for message in popped:
            assert isinstance(message, FailureMessage)

        assert q_receive_mock.call_count == 3

        # Sum the requested page sizes per queue over all receive calls.
        requested = {}
        for receive_call in q_receive_mock.call_args_list:
            queue_name = receive_call[0][0].queue_name
            page_size = receive_call[1]["messages_per_page"]
            requested[queue_name] = requested.get(queue_name, 0) + page_size

        assert requested[failed_queue_2.object_name] + requested[
            failed_queue_1.object_name] == (4 + 4 + 6)
def get_kusto_client() -> KustoIngestClient:
    """Build an ingest client from the ``KUSTO_*`` environment variables.

    Reads ``KUSTO_CLUSTER``, ``KUSTO_USERNAME``, ``KUSTO_PASSWORD`` and
    ``KUSTO_TENANT_ID``; a missing variable raises ``KeyError``.
    """
    env = os.environ
    ingest_url = "".join(("https://ingest-", env['KUSTO_CLUSTER'], ".kusto.windows.net"))
    user = env['KUSTO_USERNAME']
    secret = env['KUSTO_PASSWORD']
    tenant = env['KUSTO_TENANT_ID']

    connection = KustoConnectionStringBuilder.with_aad_user_password_authentication(
        ingest_url, user, secret, tenant)
    return KustoIngestClient(connection)
def test_sanity_ingest(self, mock_post, mock_aad, mock_block_blob, mock_queue):
    """Test simple ingest"""
    client = KustoIngestClient("https://ingest-somecluster.kusto.windows.net")
    props = IngestionProperties(
        database="database",
        table="table",
        dataFormat=DataFormat.csv,
    )

    dataset_path = os.path.join(
        os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv")

    client.ingest_from_multiple_files(
        [dataset_path],
        delete_sources_on_success=False,
        ingestion_properties=props,
    )
def initialize_kusto_client():
    """Create the module-level ``KUSTO_INGESTION_CLIENT`` unless it is already set."""
    global KUSTO_INGESTION_CLIENT

    # An existing (truthy) client is reused as-is.
    if KUSTO_INGESTION_CLIENT:
        logging.info(f"{LOG_MESSAGE_HEADER} KUSTO_INGESTION_CLIENT exist")
        return

    connection = KustoConnectionStringBuilder.with_aad_application_key_authentication(
        INGESTION_SERVER_URI,
        APP_CLIENT_ID,
        APP_CLIENT_SECRETS,
        APP_AAD_TENANT_ID,
    )
    KUSTO_INGESTION_CLIENT = KustoIngestClient(connection)
    logging.info(f"{LOG_MESSAGE_HEADER} Build KUSTO_INGESTION_CLIENT")
def __init__(self, db_name: str):
    """Initialize a Kusto report DB connector.

    A primary ingestion client is always created; a backup client is
    created only when the ``*_BACKUP`` environment variables are all set.

    Args:
        db_name: The Kusto database to connect to.

    Raises:
        RuntimeError: If any primary credential is missing or empty.
    """
    self.db_name = db_name

    cluster = os.getenv("TEST_REPORT_INGEST_KUSTO_CLUSTER")
    tenant = os.getenv("TEST_REPORT_AAD_TENANT_ID")
    client_id = os.getenv("TEST_REPORT_AAD_CLIENT_ID")
    client_key = os.getenv("TEST_REPORT_AAD_CLIENT_KEY")

    if not all((cluster, tenant, client_id, client_key)):
        raise RuntimeError("Could not load Kusto Credentials from environment")

    primary = KustoConnectionStringBuilder.with_aad_application_key_authentication(
        cluster, client_id, client_key, tenant)
    self._ingestion_client = KustoIngestClient(primary)

    # Kusto performance depends on the cluster's workload, so an optional
    # backup cluster can be configured to improve the availability of the
    # test result data service.
    backup_cluster = os.getenv("TEST_REPORT_INGEST_KUSTO_CLUSTER_BACKUP")
    backup_tenant = os.getenv("TEST_REPORT_AAD_TENANT_ID_BACKUP")
    backup_client_id = os.getenv("TEST_REPORT_AAD_CLIENT_ID_BACKUP")
    backup_client_key = os.getenv("TEST_REPORT_AAD_CLIENT_KEY_BACKUP")

    if all((backup_cluster, backup_tenant, backup_client_id, backup_client_key)):
        backup = KustoConnectionStringBuilder.with_aad_application_key_authentication(
            backup_cluster, backup_client_id, backup_client_key, backup_tenant)
        self._ingestion_client_backup = KustoIngestClient(backup)
    else:
        print("Could not load backup Kusto Credentials from environment")
        self._ingestion_client_backup = None
def test_ingest_from_file_wrong_endpoint(self):
    """Ingesting through an engine endpoint must raise KustoInvalidEndpointError."""
    engine_url = "https://somecluster.kusto.windows.net"
    responses.add_callback(
        responses.POST,
        engine_url + "/v1/rest/mgmt",
        callback=request_error_callback,
        content_type="application/json",
    )

    client = KustoIngestClient(engine_url)
    props = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    # Let the test run from any directory inside the repository: only the
    # path components not already present in the cwd are appended.
    cwd = os.getcwd()
    dataset_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    remaining_parts = [part for part in dataset_parts if part not in cwd]
    dataset_path = os.path.join(cwd, *remaining_parts)

    with self.assertRaises(KustoInvalidEndpointError) as ex:
        client.ingest_from_file(dataset_path, ingestion_properties=props)

    expected_message = "You are using 'DataManagement' client type, but the provided endpoint is of ServiceType 'Engine'. Initialize the client with the appropriate endpoint URI: 'https://ingest-somecluster.kusto.windows.net'"
    self.assertEqual(
        ex.exception.args[0],
        expected_message,
        "Expected exception was not raised",
    )
def setup_class(cls):
    """Prepare the end-to-end test environment.

    Reads the connection settings from the environment, creates the query,
    queued-ingest and streaming-ingest clients, resolves the input file
    paths, and creates a uniquely named test table with its JSON mapping.

    Raises:
        unittest.SkipTest: If any required environment setting is missing.
    """
    cls.engine_cs = os.environ.get("ENGINE_CONNECTION_STRING")
    # The DM connection string can be composed from the engine one. Only
    # derive it when the engine string exists, so that an unconfigured
    # environment is skipped below instead of failing on None.replace.
    cls.dm_cs = os.environ.get("DM_CONNECTION_STRING") or (
        cls.engine_cs.replace("//", "//ingest-") if cls.engine_cs else None)
    cls.app_id = os.environ.get("APP_ID")
    cls.app_key = os.environ.get("APP_KEY")
    cls.auth_id = os.environ.get("AUTH_ID")
    cls.test_db = os.environ.get("TEST_DATABASE")

    if not all([
            cls.engine_cs, cls.dm_cs, cls.app_id, cls.app_key, cls.auth_id,
            cls.test_db
    ]):
        raise unittest.SkipTest("E2E environment is missing")

    # Init clients. The table name embeds the Python version, the current
    # time and a random number.
    python_version = "_".join([str(v) for v in sys.version_info[:3]])
    cls.test_table = "python_test_{0}_{1}_{2}".format(
        python_version, str(int(time.time())), random.randint(1, 100000))
    cls.client = KustoClient(cls.engine_kcsb_from_env())
    cls.ingest_client = KustoIngestClient(cls.dm_kcsb_from_env())
    cls.streaming_ingest_client = KustoStreamingIngestClient(
        cls.engine_kcsb_from_env())

    cls.input_folder_path = cls.get_file_path()
    cls.csv_file_path = os.path.join(cls.input_folder_path, "dataset.csv")
    cls.tsv_file_path = os.path.join(cls.input_folder_path, "dataset.tsv")
    cls.zipped_csv_file_path = os.path.join(cls.input_folder_path,
                                            "dataset.csv.gz")
    cls.json_file_path = os.path.join(cls.input_folder_path, "dataset.json")
    cls.zipped_json_file_path = os.path.join(cls.input_folder_path,
                                             "dataset.jsonz.gz")

    cls.current_count = 0

    cls.client.execute(
        cls.test_db,
        ".create table {0} (rownumber: int, rowguid: string, xdouble: real, xfloat: real, xbool: bool, xint16: int, xint32: int, xint64: long, xuint8: long, xuint16: long, xuint32: long, xuint64: long, xdate: datetime, xsmalltext: string, xtext: string, xnumberAsText: string, xtime: timespan, xtextWithNulls: string, xdynamicWithNulls: dynamic)"
        .format(cls.test_table),
    )
    cls.client.execute(
        cls.test_db,
        ".create table {0} ingestion json mapping 'JsonMapping' {1}".format(
            cls.test_table, cls.test_table_json_mapping_reference()))
def test_isempty(self):
    """``is_empty`` is False for the success queue and True for the failure queue."""
    client = KustoIngestClient("some-cluster")

    def peeked_messages(queue_name, num_messages=1):
        # Only the success queue ("qs" in its name) has messages to peek.
        if "qs" not in queue_name:
            return []
        return [mock_message(success=True) for _ in range(num_messages)]

    fake_peek = fake_peek_factory(peeked_messages)

    patch_success_qs = mock.patch.object(
        client._resource_manager, "get_successful_ingestions_queues")
    patch_failed_qs = mock.patch.object(
        client._resource_manager, "get_failed_ingestions_queues")
    patch_peek = mock.patch.object(
        QueueClient, "peek_messages", autospec=True, side_effect=fake_peek)

    with patch_success_qs as mocked_get_success_qs, \
            patch_failed_qs as mocked_get_failed_qs, \
            patch_peek as q_mock:
        failed_queue = _ResourceUri(
            "mocked_storage_account1", OBJECT_TYPE, "queue",
            "mocked_qf_name", ENDPOINT_SUFFIX)
        success_queue = _ResourceUri(
            "mocked_storage_account2", OBJECT_TYPE, "queue",
            "mocked_qs_name", ENDPOINT_SUFFIX)
        mocked_get_success_qs.return_value = [success_queue]
        mocked_get_failed_qs.return_value = [failed_queue]

        status_queues = KustoIngestStatusQueues(client)

        assert status_queues.success.is_empty() is False
        assert status_queues.failure.is_empty() is True

        # One peek per queue, each asking for two messages.
        assert q_mock.call_count == 2
        first_peek, second_peek = q_mock.call_args_list[0], q_mock.call_args_list[1]
        assert first_peek[1]["max_messages"] == 2
        assert second_peek[1]["max_messages"] == 2
def __init__(self, db_name: str):
    """Initialize a Kusto report DB connector.

    Args:
        db_name: The Kusto database to connect to.

    Raises:
        RuntimeError: If any required credential is missing or empty in the
            environment.
    """
    self.db_name = db_name

    cluster = os.getenv("TEST_REPORT_INGEST_KUSTO_CLUSTER")
    tenant = os.getenv("TEST_REPORT_AAD_TENANT_ID")
    client_id = os.getenv("TEST_REPORT_AAD_CLIENT_ID")
    client_key = os.getenv("TEST_REPORT_AAD_CLIENT_KEY")

    if not all((cluster, tenant, client_id, client_key)):
        raise RuntimeError(
            "Could not load Kusto Credentials from environment")

    connection = KustoConnectionStringBuilder.with_aad_application_key_authentication(
        cluster, client_id, client_key, tenant)
    self._ingestion_client = KustoIngestClient(connection)
def test_isempty(self, mocked_q_peek_messages):
    """``is_empty`` is False for the success queue and True for the failure queue."""
    client = KustoIngestClient("some-cluster")

    patch_success_qs = mock.patch.object(
        client._resource_manager, "get_successful_ingestions_queues")
    patch_failed_qs = mock.patch.object(
        client._resource_manager, "get_failed_ingestions_queues")

    with patch_success_qs as mocked_get_success_qs, \
            patch_failed_qs as mocked_get_failed_qs:
        failed_queue = _ResourceUri(
            "mocked_storage_account1", "queue", "mocked_qf_name", "mocked_sas")
        success_queue = _ResourceUri(
            "mocked_storage_account2", "queue", "mocked_qs_name", "mocked_sas")
        mocked_get_success_qs.return_value = [success_queue]
        mocked_get_failed_qs.return_value = [failed_queue]

        def peeked_messages(queue_name, num_messages=1):
            # The failure queue is empty; any other queue returns as many
            # messages as were asked for.
            if queue_name == failed_queue.object_name:
                return []
            return [QueueMessage() for _ in range(num_messages)]

        mocked_q_peek_messages.side_effect = peeked_messages

        status_queues = KustoIngestStatusQueues(client)

        assert status_queues.success.is_empty() == False
        assert status_queues.failure.is_empty() == True

        # One peek per queue, success queue first, two messages each.
        assert mocked_q_peek_messages.call_count == 2
        first_peek = mocked_q_peek_messages.call_args_list[0]
        second_peek = mocked_q_peek_messages.call_args_list[1]
        assert first_peek[0][0] == success_queue.object_name
        assert first_peek[1]["num_messages"] == 2
        assert second_peek[0][0] == failed_queue.object_name
        assert second_peek[1]["num_messages"] == 2
kcsb = KustoConnectionStringBuilder.with_aad_application_certificate_authentication( cluster, client_id, PEM, thumbprint, authority_id ) # In case you want to authenticate with AAD username and password username = "******" password = "******" kcsb = KustoConnectionStringBuilder.with_aad_user_password_authentication(cluster, username, password, authority_id) # In case you want to authenticate with AAD device code. # Please note that if you choose this option, you'll need to autenticate for every new instance that is initialized. # It is highly recommended to create one instance and use it for all of your queries. kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(cluster) # The authentication method will be taken from the chosen KustoConnectionStringBuilder. client = KustoIngestClient(kcsb) # there are more options for authenticating - see azure-kusto-data samples ################################################################## ## INGESTION ## ################################################################## # there are a lot of useful properties, make sure to go over docs and check them out ingestion_props = IngestionProperties( database="{database_name}", table="{table_name}", dataFormat=DataFormat.CSV, # in case status update for success are also required # reportLevel=ReportLevel.FailuresAndSuccesses, # in case a mapping is required
JsonColumnMapping(columnName="xtextWithNulls", jsonPath="$.xtextWithNulls", cslDataType="string")) mappings.append( JsonColumnMapping(columnName="xdynamicWithNulls", jsonPath="$.xdynamicWithNulls", cslDataType="dynamic")) return mappings engine_kcsb = KustoConnectionStringBuilder.with_aad_device_authentication( "https://toshetah.kusto.windows.net") dm_kcsb = KustoConnectionStringBuilder.with_aad_device_authentication( "https://ingest-toshetah.kusto.windows.net") client = KustoClient(engine_kcsb) ingest_client = KustoIngestClient(dm_kcsb) ingest_status_q = KustoIngestStatusQueues(ingest_client) client.execute("PythonTest", ".drop table Deft ifexists") @pytest.mark.run(order=1) def test_csv_ingest_non_existing_table(): csv_ingest_props = IngestionProperties( "PythonTest", "Deft", dataFormat=DataFormat.csv, mapping=Helpers.create_deft_table_csv_mappings(), reportLevel=ReportLevel.FailuresAndSuccesses, ) csv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv")
COLUMN_MAPPING_NAME=column_mapping_name, COLUMN_MAPPING=json.dumps(obj.get('COLUMN_MAPPING'))) # Drop table if exists, then create. kusto_client.execute_mgmt(kusto_database, drop_table_if_exists_command) kusto_client.execute_mgmt(kusto_database, create_table_command) #NOTE: this may be backwards... try: response = kusto_client.execute_mgmt(kusto_database, check_for_mapping_command) except: # Because check_for_mapping_command should throw error if mapping already exists response = kusto_client.execute_mgmt(kusto_database, create_mapping_command) ingestion_client = KustoIngestClient(kcsb_ingest) # All ingestion properties: https://docs.microsoft.com/en-us/azure/kusto/management/data-ingestion/#ingestion-properties ingestion_props = IngestionProperties( reportLevel=reportLevel, database=kusto_database, table=destination_table, dataFormat=DataFormat.csv, mappingReference=column_mapping_name, additionalProperties={'ignoreFirstRecord': 'true'}) blobProps = BlockBlobService.get_blob_properties(blob_service, container, file_name).properties file_size = blobProps.content_length blob_descriptor = BlobDescriptor( blob_path, file_size) # Raw size of the data in bytes
from branch.path_app_branch import *


def authenticate_kusto(kusto_cluster):
    """Return ``(KustoClient, connection string builder)`` for ``kusto_cluster``.

    Uses AAD device authentication with a fixed tenant id as authority.
    """
    tenant_id = '72f988bf-86f1-41af-91ab-2d7cd011db47'
    KCSB = KustoConnectionStringBuilder.with_aad_device_authentication(
        kusto_cluster)
    KCSB.authority_id = tenant_id
    return KustoClient(KCSB), KCSB


# Query Kusto
cga_cluster = 'https://cgadataout.kusto.windows.net'
ingest_cluster = "https://ingest-cgadataout.kusto.windows.net"
# Element [0] of the helper's result is the query client.
cga_client = authenticate_kusto(cga_cluster)[0]
# For the ingest endpoint only the connection string builder (element [1])
# is used; the KustoClient the helper also builds is discarded.
ingest_client = KustoIngestClient(authenticate_kusto(ingest_cluster)[1])
ls = [cga_client, ingest_client]


def Ingest(Tag):
    """Build CSV ingestion properties for RepoContributors, tagged with ``Tag``."""
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        # The same tag is used as ingest-by and drop-by tag.
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="RepoContributors_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'})
    # Change the local path here to load the data source
FileDescriptor, BlobDescriptor, DataFormat, ReportLevel, ) # there are a lot of useful properties, make sure to go over docs and check them out ingestion_props = IngestionProperties( database="{database_name}", table="{table_name}", dataFormat=DataFormat.csv, # incase status update for success are also required # reportLevel=ReportLevel.FailuresAndSuccesses, ) client = KustoIngestClient( KustoConnectionStringBuilder.with_aad_device_authentication( "https://ingest-{cluster_name}.kusto.windows.net")) # there are more options for authenticating - see azure-kusto-data samples ################################################################## ## INGESTION ## ################################################################## # ingest from file file_descriptor = FileDescriptor( "{filename}.csv", 3333) # 3333 is the raw size of the data in bytes. client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props) client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props) # ingest from blob
def update_ADX_ingest_status(tc):
    """Drain the ADX ingestion status queues and record every reported result.

    Connects to the ingestion endpoint (``DATA_INGESTION_URI``) with AAD
    device authentication and polls the success and failure status queues,
    taking up to 15 messages from each per pass.

    * For each success message: emits a trace and an event on ``tc``, calls
      ``update_COSMOS_status`` with ``SUCCESS_STATUS``, moves the source blob
      under ``PROCESSED_TELEMETRY_FOLDER`` in the same container, then emits
      a second trace/event for the completed move.
    * For each failure message: emits a trace and an event on ``tc`` and
      calls ``update_COSMOS_status`` with ``FAILURE_STATUS``.

    While both queues are empty the function sleeps with a doubling backoff
    (1s, 2s, 4s) and returns once the backoff reaches 8 seconds.

    Success messages only appear when the ingestion was queued with
    ``reportLevel=ReportLevel.FailuresAndSuccesses``.

    Args:
        tc: Telemetry client; must provide ``track_trace``, ``track_event``
            and ``flush``.
    """
    kcsb_ingest = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    kcsb_ingest.authority_id = APP_AAD_TENANT_ID
    ingestion_client = KustoIngestClient(kcsb_ingest)
    qs = KustoIngestStatusQueues(ingestion_client)

    # Short tag (tail of a fresh UUID) that correlates the log lines of one run.
    run_id = str(uuid.uuid4())[31:].upper()
    max_backoff = 8
    backoff = 1
    total_queue_success_messages = 0
    # Built on first use and then reused, instead of once per success message.
    telemetry_block_blob_service = None

    while True:
        if qs.success.is_empty() and qs.failure.is_empty():
            time.sleep(backoff)
            # First idle pass after real work: report how much was processed.
            if backoff == 1 and total_queue_success_messages != 0:
                print(
                    "{} RUN_ID:{} Processed {} message in this batch ".format(
                        LOG_MESSAGE_HEADER, run_id,
                        total_queue_success_messages))
            backoff = min(backoff * 2, max_backoff)
            if backoff < max_backoff:
                continue
            # Queues stayed empty for the whole backoff ladder: stop polling.
            break

        backoff = 1
        # Normalise to lists so a None result cannot break len() or the loops.
        success_messages = qs.success.pop(15) or []
        failure_messages = qs.failure.pop(15) or []
        total_success = len(success_messages)
        total_failure = len(failure_messages)

        if total_success > 0:
            tc.track_trace("{} Get {} success ingest messages ".format(
                LOG_MESSAGE_HEADER, str(total_success)))
        if total_failure > 0:
            tc.track_trace("{} Get {} failure ingest messages ".format(
                LOG_MESSAGE_HEADER, str(total_failure)))
        tc.flush()
        total_queue_success_messages += total_success

        for count_success, smsg in enumerate(success_messages, start=1):
            file_path = get_file_path(smsg.IngestionSourcePath)
            container_name = get_container_name(smsg.IngestionSourcePath)
            vm_uuid = get_vm_uuid_from_filename(file_path)

            log_msg = (
                "{} SUCCESS TO INGEST TO ADX <{}> -[{}/{}/{}] , Time: {}, "
                "vm_uuid: {}, source_id:{}, file path: {}"
            ).format(LOG_MESSAGE_HEADER, run_id, str(count_success),
                     str(total_success), str(total_queue_success_messages),
                     smsg.SucceededOn, vm_uuid, smsg.IngestionSourceId,
                     file_path)
            tc.track_trace(log_msg)
            tc.track_event(
                APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME, {
                    'MESSAGE': 'SUCCESS TO Ingest ADX',
                    'file_path': file_path,
                    'source_id': smsg.IngestionSourceId
                }, {})
            tc.flush()

            update_COSMOS_status(COSMOS_CLIENT, file_path, smsg.SucceededOn,
                                 SUCCESS_STATUS, str(smsg), vm_uuid,
                                 smsg.IngestionSourceId, container_name, tc,
                                 count_success, run_id)

            if telemetry_block_blob_service is None:
                telemetry_block_blob_service = BlockBlobService(
                    account_name=SOURCE_TELEMETRY_BLOB_ACCOUNT,
                    account_key=SOURCE_TELEMETRY_FILE_BLOB_KEY)

            # Join folder and file path with exactly one separator between.
            if PROCESSED_TELEMETRY_FOLDER.endswith('/'):
                target_file_path = PROCESSED_TELEMETRY_FOLDER + file_path
            else:
                target_file_path = PROCESSED_TELEMETRY_FOLDER + '/' + file_path

            move_processed_file(telemetry_block_blob_service, container_name,
                                file_path, container_name, target_file_path,
                                tc)
            tc.track_trace(
                ('{} DONE ADX INGESTION PROCESS <{}> -[{}/{}/{}], '
                 'File Moved to processed folder {} , vm_uuid: {}, '
                 'file path: {}').format(
                     LOG_MESSAGE_HEADER, run_id, str(count_success),
                     str(total_success), str(total_queue_success_messages),
                     target_file_path, vm_uuid, file_path))
            tc.track_event(
                APP_INSIGHT_INGEST_SUCCESS_EVENT_NAME, {
                    'MESSAGE': 'DONE ADX INGESTION PROCESS',
                    'moved_file_path': target_file_path,
                    'source_file_path': file_path
                }, {})
            tc.flush()

            print("{} IngestionSourcePath: {}".format(
                LOG_MESSAGE_HEADER, smsg.IngestionSourcePath))
            print(smsg.SucceededOn)

        for count_failure, fmsg in enumerate(failure_messages, start=1):
            container_name = get_container_name(fmsg.IngestionSourcePath)
            file_path = get_file_path(fmsg.IngestionSourcePath)
            vm_uuid = get_vm_uuid_from_filename(file_path)

            log_msg = (
                "{} FAILED TO INGEST TO ADX <{}> -[{}/{}] , Time: {}, "
                "vm_uuid: {}, source_id:{}, container:{}, file path: {}, "
                "message: {}"
            ).format(LOG_MESSAGE_HEADER, run_id, str(count_failure),
                     str(total_failure), fmsg.FailedOn, vm_uuid,
                     fmsg.IngestionSourceId, container_name, file_path,
                     str(fmsg))
            tc.track_trace(log_msg)
            tc.track_event(
                APP_INSIGHT_INGEST_FAILURE_EVENT_NAME, {
                    'MESSAGE': 'FAILED TO Ingest ADX',
                    'file_path': file_path,
                    'source_id': fmsg.IngestionSourceId
                }, {})
            tc.flush()

            update_COSMOS_status(COSMOS_CLIENT, file_path, fmsg.FailedOn,
                                 FAILURE_STATUS, str(fmsg), vm_uuid,
                                 fmsg.IngestionSourceId, container_name, tc,
                                 count_failure, run_id)
def main():
    """Queue the deployment payload from the action inputs for ADX ingestion.

    Reads the ``INPUT_*`` environment variables, writes a JSON document with
    the current timestamp and ``INPUT_DATA`` to ``sample.json`` under
    ``GITHUB_WORKSPACE``, and queues that file for ingestion into the given
    database/table using AAD application-key authentication. The temporary
    file is removed afterwards, whether or not queuing succeeded.

    Raises:
        KeyError: If a required environment variable is not set.
        Exception: Errors from file I/O or the Kusto client propagate
            unchanged, keeping their original type and traceback.
    """
    # Kusto cluster inputs
    data = os.environ["INPUT_DATA"]
    tenant_id = os.environ["INPUT_TENANTID"]
    database_name = os.environ["INPUT_DATABASE"]
    cluster_name = os.environ["INPUT_CLUSTERNAME"]
    region = os.environ["INPUT_CLUSTERREGION"]
    client_id = os.environ["INPUT_CLIENTID"]
    client_secret = os.environ["INPUT_CLIENTSECRET"]
    destination_table = os.environ["INPUT_TABLE"]
    mapping = os.environ['INPUT_MAPPING']

    print(data)

    file_path = os.path.join(os.environ["GITHUB_WORKSPACE"], "sample.json")
    deployment_data = {
        "Timestamp": str(datetime.now()),
        "DeploymentDetails": data,
    }

    try:
        # file creation
        with open(file_path, "w") as target_file:
            json.dump(deployment_data, target_file)

        # cluster client connection and auth
        cluster_ingest_uri = "{0}ingest-{1}.{2}.{3}".format(
            "https://", cluster_name, region, "kusto.windows.net:443/")
        kcsb_ingest = (
            KustoConnectionStringBuilder
            .with_aad_application_key_authentication(
                cluster_ingest_uri, client_id, client_secret, tenant_id))
        print(mapping)

        # Cluster ingestion parameters
        ingestion_client = KustoIngestClient(kcsb_ingest)
        ingestion_properties = IngestionProperties(
            database=database_name,
            table=destination_table,
            dataFormat=DataFormat.JSON,
            ingestion_mapping_reference=mapping,
            report_level=ReportLevel.FailuresAndSuccesses)
        # Report the real on-disk size rather than a fixed guess.
        file_descriptor = FileDescriptor(file_path,
                                         os.path.getsize(file_path))

        print('Payload to dump')
        # Every value is a plain string, so dumping the in-memory dict prints
        # exactly what was written to the file.
        print(json.dumps(deployment_data, indent=2, sort_keys=True))

        ingestion_client.ingest_from_file(
            file_descriptor, ingestion_properties=ingestion_properties)
        print('Queued up ingestion with Azure Data Explorer')
        # The call above only queues the ingestion. Its outcome could be
        # polled with KustoIngestStatusQueues(ingestion_client) if needed.
    finally:
        # Remove the temporary file even when queuing failed.
        if os.path.exists(file_path):
            os.remove(file_path)
JsonColumnMapping(columnName="xtime", jsonPath="$.xtime", cslDataType="timespan")) mappings.append( JsonColumnMapping(columnName="xtextWithNulls", jsonPath="$.xtextWithNulls", cslDataType="string")) mappings.append( JsonColumnMapping(columnName="xdynamicWithNulls", jsonPath="$.xdynamicWithNulls", cslDataType="dynamic")) return mappings client = KustoClient("https://toshetah.kusto.windows.net") ingest_client = KustoIngestClient("https://ingest-toshetah.kusto.windows.net") ingest_status_q = KustoIngestStatusQueues(ingest_client) client.execute("PythonTest", ".drop table Deft ifexists") @pytest.mark.run(order=1) def test_csv_ingest_non_existing_table(): csv_ingest_props = IngestionProperties( "PythonTest", "Deft", dataFormat=DataFormat.csv, mapping=Helpers.create_deft_table_csv_mappings(), reportLevel=ReportLevel.FailuresAndSuccesses, ) csv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv")
IngestionProperties, FileDescriptor, BlobDescriptor, DataFormat, ReportLevel, ) # there are a lot of useful properties, make sure to go over docs and check them out ingestion_props = IngestionProperties( database="{database_name}", table="{table_name}", dataFormat=DataFormat.csv, # incase status update for success are also required # reportLevel=ReportLevel.FailuresAndSuccesses, ) client = KustoIngestClient("https://ingest-{cluster_name}.kusto.windows.net") # there are more options for authenticating - see azure-kusto-data samples ################################################################## ## INGESTION ## ################################################################## # ingest from file file_descriptor = FileDescriptor("{filename}.csv", 3333) # 3333 is the raw size of the data in bytes. client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props) client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props) # ingest from blob