def test_sanity_ingest_from_file(self, mock_uuid, mock_put_message_in_queue, mock_create_blob_from_stream, mock_aad):
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient("https://ingest-somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", dataFormat=DataFormat.CSV)

    # ensure the test works regardless of the directory it is executed from
    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)

    file_path = os.path.join(current_dir, *missing_path_parts)

    ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)

    # mock_put_message_in_queue
    assert mock_put_message_in_queue.call_count == 1

    put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[0][1]

    assert put_message_in_queue_mock_kwargs["queue_name"] == "readyforaggregation-secured"
    queued_message = base64.b64decode(put_message_in_queue_mock_kwargs["content"].encode("utf-8")).decode("utf-8")
    queued_message_json = json.loads(queued_message)
    expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?sas"

    # mock_create_blob_from_stream
    assert queued_message_json["BlobPath"] == expected_url
    assert queued_message_json["DatabaseName"] == "database"
    assert queued_message_json["IgnoreSizeLimit"] is False
    assert queued_message_json["AdditionalProperties"]["format"] == "csv"
    assert queued_message_json["FlushImmediately"] is False
    assert queued_message_json["TableName"] == "table"
    assert queued_message_json["RawDataSize"] > 0
    assert queued_message_json["RetainBlobOnSuccess"] is True

    create_blob_from_stream_mock_kwargs = mock_create_blob_from_stream.call_args_list[0][1]
    assert create_blob_from_stream_mock_kwargs["container_name"] == "tempstorage"
    assert isinstance(create_blob_from_stream_mock_kwargs["stream"], io.BytesIO)
    assert create_blob_from_stream_mock_kwargs["blob_name"] == "database__table__1111-111111-111111-1111__dataset.csv.gz"
def Ingest(Tag):
    # settings
    AUTHORITY_ID = "6babcaad-604b-40ac-a9d7-9fd97c0b779f"
    INGESTCLUSTER = "https://ingest-cgadataout.kusto.windows.net"
    KUSTOCLUSTER = "https://cgadataout.kusto.windows.net"
    DATABASE = "DevRelWorkArea"

    # create the destination table
    KCSB_DATA = KustoConnectionStringBuilder.with_aad_device_authentication(KUSTOCLUSTER)
    DESTINATION_TABLE = "RepoContributors"
    DESTINATION_TABLE_COLUMN_MAPPING = "RepoContributors_CSV_Mapping"

    KUSTO_CLIENT = KustoClient(KCSB_DATA)
    DROP_TABLE_IF_EXIST = ".drop table RepoContributors ifexists"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, DROP_TABLE_IF_EXIST)

    CREATE_TABLE_COMMAND = ".create table RepoContributors (Article: string, Contributors: int64, Data: string)"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_TABLE_COMMAND)
    print("RepoContributors table is created")

    # create the CSV mapping
    CREATE_MAPPING_COMMAND = """.create table RepoContributors ingestion csv mapping 'RepoContributors_CSV_Mapping' '[{"Name": "Article","datatype": "string","Ordinal": 0},{"Name": "Contributors","datatype": "int64","Ordinal": 1},{"Name": "Data","datatype": "string","Ordinal": 2}]'"""
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_MAPPING_COMMAND)
    print("mapping is created")

    # ingest
    # the authentication method is taken from the chosen KustoConnectionStringBuilder
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={"ignoreFirstRecord": "true"},
    )

    kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(INGESTCLUSTER)
    client = KustoIngestClient(kcsb)

    # ingest from file
    file_descriptor = FileDescriptor(r"D:\test\Results\log_data_merge\merge_microsoftdocs_sql-docs-pr.txt", 3333)  # 3333 is the raw size of the data in bytes
    client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)

    # if status updates are required, the status queues can be polled; see the sketch below
    return 1
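# A minimal sketch of the status polling hinted at above, assuming the client was
# created with reportLevel=ReportLevel.FailuresAndSuccesses (as in Ingest). The
# helper name, backoff values, and pop count are illustrative assumptions; the
# KustoIngestStatusQueues calls mirror the SDK's status-queue API.
import pprint
import time

from azure.kusto.ingest import KustoIngestStatusQueues


def wait_for_ingest_result(client, max_backoff=5):
    """Poll the ingestion status queues until a success or failure message arrives."""
    qs = KustoIngestStatusQueues(client)
    backoff = 1
    while qs.success.is_empty() and qs.failure.is_empty():
        print("No new messages. backing off for {} seconds".format(backoff))
        time.sleep(backoff)
        backoff = min(backoff * 2, max_backoff)  # exponential backoff, capped

    # drain up to 10 messages from each queue and print them
    pprint.pprint("SUCCESS : {}".format(qs.success.pop(10)))
    pprint.pprint("FAILURE : {}".format(qs.failure.pop(10)))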
def test_sanity_ingest_from_file(self, mock_uuid, mock_put_message_in_queue, mock_upload_blob_from_stream, mock_aad):
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient("https://ingest-somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    # ensure the test works regardless of the directory it is executed from
    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)

    file_path = os.path.join(current_dir, *missing_path_parts)

    ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)

    # mock_put_message_in_queue
    assert mock_put_message_in_queue.call_count == 1

    put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[0][1]
    queued_message_json = json.loads(put_message_in_queue_mock_kwargs["content"])

    expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?"

    # mock_upload_blob_from_stream
    # the query string is not checked because its parameters can change order; just verify it is present
    assert queued_message_json["BlobPath"].startswith(expected_url)
    assert len(queued_message_json["BlobPath"]) > len(expected_url)
    assert queued_message_json["DatabaseName"] == "database"
    assert queued_message_json["IgnoreSizeLimit"] is False
    assert queued_message_json["AdditionalProperties"]["format"] == "csv"
    assert queued_message_json["FlushImmediately"] is False
    assert queued_message_json["TableName"] == "table"
    assert queued_message_json["RawDataSize"] > 0
    assert queued_message_json["RetainBlobOnSuccess"] is True

    upload_blob_kwargs = mock_upload_blob_from_stream.call_args_list[0][1]
    assert isinstance(upload_blob_kwargs["data"], io.BytesIO)
def test_ingest_from_file_wrong_endpoint(self):
    responses.add_callback(
        responses.POST,
        "https://somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_error_callback,
        content_type="application/json",
    )

    ingest_client = KustoIngestClient("https://somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)

    file_path = os.path.join(current_dir, *missing_path_parts)

    with self.assertRaises(KustoInvalidEndpointError) as ex:
        ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)
    self.assertEqual(
        ex.exception.args[0],
        "You are using 'DataManagement' client type, but the provided endpoint is of ServiceType 'Engine'. "
        "Initialize the client with the appropriate endpoint URI: 'https://ingest-somecluster.kusto.windows.net'",
        "Expected exception was not raised",
    )
# there are a lot of useful properties; make sure to go over the docs and check them out
ingestion_props = IngestionProperties(
    database="{database_name}",
    table="{table_name}",
    dataFormat=DataFormat.CSV,
    # in case status updates for success are also required:
    # reportLevel=ReportLevel.FailuresAndSuccesses,
    # in case a mapping is required:
    # ingestionMappingReference="{json_mapping_that_already_exists_on_table}",
    # ingestionMappingType=IngestionMappingType.Json,
)

# ingest from file
file_descriptor = FileDescriptor("{filename}.csv", 3333)  # 3333 is the raw size of the data in bytes
client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)
client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props)

# ingest from blob
blob_descriptor = BlobDescriptor("https://{path_to_blob}.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes
client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props)

# ingest from dataframe
import pandas

fields = ["id", "name", "value"]
rows = [[1, "abc", 15.3], [2, "cde", 99.9]]
df = pandas.DataFrame(data=rows, columns=fields)
client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)
# Imports assumed for this script; exact module paths vary across azure-kusto SDK versions.
import json
import os
from datetime import datetime

from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.ingest import DataFormat, FileDescriptor, IngestionProperties, KustoIngestClient, ReportLevel


def main():
    # Kusto cluster inputs
    data = os.environ["INPUT_DATA"]
    tenantId = os.environ["INPUT_TENANTID"]
    databaseName = os.environ["INPUT_DATABASE"]
    clusterName = os.environ["INPUT_CLUSTERNAME"]
    region = os.environ["INPUT_CLUSTERREGION"]
    clientId = os.environ["INPUT_CLIENTID"]
    clientSecret = os.environ["INPUT_CLIENTSECRET"]
    destinationTable = os.environ["INPUT_TABLE"]
    mapping = os.environ["INPUT_MAPPING"]

    try:
        print(data)

        # file creation
        fileName = "sample.json"
        filePath = os.path.join(os.environ["GITHUB_WORKSPACE"], fileName)
        deploymentData = {}
        deploymentData["Timestamp"] = str(datetime.now())
        deploymentData["DeploymentDetails"] = data
        with open(filePath, "w") as targetFile:
            json.dump(deploymentData, targetFile)

        # cluster client connection and auth
        httpsPrefix = "https://"
        suffixKustoUri = "kusto.windows.net:443/"
        clusterIngestUri = "{0}ingest-{1}.{2}.{3}".format(httpsPrefix, clusterName, region, suffixKustoUri)
        kcsb_ingest = KustoConnectionStringBuilder.with_aad_application_key_authentication(
            clusterIngestUri, clientId, clientSecret, tenantId
        )
        print(mapping)

        # cluster ingestion parameters
        ingestionClient = KustoIngestClient(kcsb_ingest)
        ingestionProperties = IngestionProperties(
            database=databaseName,
            table=destinationTable,
            dataFormat=DataFormat.JSON,
            ingestion_mapping_reference=mapping,
            report_level=ReportLevel.FailuresAndSuccesses,
        )
        fileDescriptor = FileDescriptor(filePath, 1000)

        print("Payload to dump")
        with open(filePath, "r") as targetFile:
            parsed = json.load(targetFile)
            print(json.dumps(parsed, indent=2, sort_keys=True))

        ingestionClient.ingest_from_file(fileDescriptor, ingestion_properties=ingestionProperties)
        print("Queued up ingestion with Azure Data Explorer")

        # remove the temporary file
        os.remove(filePath)

        """
        # Repeated pinging to wait for a success/failure message
        qs = KustoIngestStatusQueues(ingestionClient)

        # interval to ping
        MAX_BACKOFF = 5
        backoff = 1
        while True:
            if qs.success.is_empty() and qs.failure.is_empty():
                time.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
                print("No new messages. backing off for {} seconds".format(backoff))
                continue

            backoff = 1

            success_messages = qs.success.pop(10)
            failure_messages = qs.failure.pop(10)

            pprint.pprint("SUCCESS : {}".format(success_messages))
            pprint.pprint("FAILURE : {}".format(failure_messages))
            break
        """
    except Exception as e:
        raise Exception(e)
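# Assumed entry point: the excerpt does not show how main() is invoked, but as a
# standalone GitHub Action script it would typically run via the standard guard.
if __name__ == "__main__":
    main()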