def test_streaming_ingest_from_io_streams():
    ingestion_properties = IngestionProperties(database=db_name, table=table_name, dataFormat=DataFormat.CSV)

    byte_sequence = b'0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null'
    bytes_stream = io.BytesIO(byte_sequence)
    ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)

    str_sequence = '0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null'
    str_stream = io.StringIO(str_sequence)
    ingest_client.ingest_from_stream(str_stream, ingestion_properties=ingestion_properties)

    byte_sequence = b'{"rownumber": 0, "rowguid": "00000000-0000-0000-0001-020304050607", "xdouble": 0.0, "xfloat": 0.0, "xbool": 0, "xint16": 0, "xint32": 0, "xint64": 0, "xunit8": 0, "xuint16": 0, "xunit32": 0, "xunit64": 0, "xdate": "2014-01-01T01:01:01Z", "xsmalltext": "Zero", "xtext": "Zero", "xnumberAsText": "0", "xtime": "00:00:00", "xtextWithNulls": null, "xdynamicWithNulls": ""}'
    bytes_stream = io.BytesIO(byte_sequence)
    ingestion_properties.format = DataFormat.JSON
    ingestion_properties.ingestion_mapping_reference = "JsonMapping"
    ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)

    str_sequence = u'{"rownumber": 0, "rowguid": "00000000-0000-0000-0001-020304050607", "xdouble": 0.0, "xfloat": 0.0, "xbool": 0, "xint16": 0, "xint32": 0, "xint64": 0, "xunit8": 0, "xuint16": 0, "xunit32": 0, "xunit64": 0, "xdate": "2014-01-01T01:01:01Z", "xsmalltext": "Zero", "xtext": "Zero", "xnumberAsText": "0", "xtime": "00:00:00", "xtextWithNulls": null, "xdynamicWithNulls": ""}'
    str_stream = io.StringIO(str_sequence)
    ingest_client.ingest_from_stream(str_stream, ingestion_properties=ingestion_properties)

    byte_sequence = (
        b'0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null' * 600000
    )
    bytes_stream = io.BytesIO(byte_sequence)
    try:
        ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)
    except KustoStreamMaxSizeExceededError:
        pass
def ingest_from_source(self, source: IngestionSource, mapping: IngestionMapping, target_database: str, target_table: str, **kwargs):
    files = source.files
    ingest_client = self.client_provider.get_ingest_client()

    # TODO: should maybe persist ingestion mappings
    ingestion_props = IngestionProperties(
        target_database,
        target_table,
        dataFormat=DataFormat(source.data_format),
        mapping=self.get_ingestion_mapping(source.data_format, mapping),
        reportLevel=ReportLevel.FailuresOnly,
        reportMethod=ReportMethod.Queue,
        flushImmediately=True,
    )

    if "batch_id" in kwargs and not kwargs.get("no_wait", False):
        # this helps with monitoring
        ingestion_props.ingest_by_tags = [kwargs["batch_id"]]

    for file_path in files:
        if kwargs.get("direct", True):
            # TODO: allow for direct ingestion (this is currently only relevant to files already in storage)
            # client.execute(f'.ingest into table {operation.target} ({}) with ({mapping_ref_key}="{mapping_name}")')
            pass
        else:
            logger.info(f'Queueing "{file_path}" to ingest into "{ingestion_props.table}"')
            ingest_client.ingest_from_file(str(file_path), ingestion_props)
def test_streaming_ingest_from_stream(self):
    responses.add_callback(
        responses.POST,
        "https://somecluster.kusto.windows.net/v1/rest/ingest/database/table",
        callback=request_callback,
    )
    ingest_client = KustoStreamingIngestClient("https://somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", dataFormat=DataFormat.csv)

    byte_sequence = b"56,56,56"
    bytes_stream = io.BytesIO(byte_sequence)
    ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)

    str_sequence = u"57,57,57"
    str_stream = io.StringIO(str_sequence)
    ingest_client.ingest_from_stream(str_stream, ingestion_properties=ingestion_properties)

    byte_sequence = b'{"Name":"Ben","Age":"56","Weight":"75"}'
    bytes_stream = io.BytesIO(byte_sequence)
    ingestion_properties.format = DataFormat.json
    try:
        ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)
    except KustoMissingMappingReferenceError:
        pass

    ingestion_properties.mapping_reference = "JsonMapping"
    ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)

    str_sequence = u'{"Name":"Ben","Age":"56","Weight":"75"}'
    str_stream = io.StringIO(str_sequence)
    ingest_client.ingest_from_stream(str_stream, ingestion_properties=ingestion_properties)

    byte_sequence = b"56,56,56" * 600000
    bytes_stream = io.BytesIO(byte_sequence)
    try:
        ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)
    except KustoStreamMaxSizeExceededError:
        pass
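# `request_callback` is referenced by the mock setups in this section but is not
# defined here. A minimal hedged sketch follows: the `responses` library expects
# a callback that receives the intercepted request and returns a
# (status, headers, body) tuple. The body below is a placeholder; the real test
# fixture presumably returns a recorded Kusto REST response.
import json


def request_callback(request):
    # Accept any intercepted POST and reply with an empty Kusto-style v1 frame.
    response_body = {"Tables": [{"TableName": "Table_0", "Columns": [], "Rows": []}]}
    return 200, {"Content-Type": "application/json"}, json.dumps(response_body)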
def test_blob_ingestion(self, mock_uuid, mock_put_message_in_queue, mock_aad):
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=queued_request_callback,
        content_type="application/json",
    )
    ingest_client = ManagedStreamingIngestClient.from_dm_kcsb("https://ingest-somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table")
    blob_path = (
        "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?sp=rl&st=2020-05-20T13"
        "%3A38%3A37Z&se=2020-05-21T13%3A38%3A37Z&sv=2019-10-10&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )

    result = ingest_client.ingest_from_blob(BlobDescriptor(blob_path, 1), ingestion_properties=ingestion_properties)
    assert result.status == IngestionStatus.QUEUED

    assert_queued_upload(
        mock_put_message_in_queue,
        mock_upload_blob_from_stream=None,
        expected_url="https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?",
    )
def test_simple_ingest_from_dataframe(self, mock_pid, mock_time, mock_uuid, mock_put_message_in_queue, mock_create_blob_from_path):
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )
    ingest_client = KustoIngestClient("https://ingest-somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", dataFormat=DataFormat.csv)

    from pandas import DataFrame

    fields = ["id", "name", "value"]
    rows = [[1, "abc", 15.3], [2, "cde", 99.9]]
    df = DataFrame(data=rows, columns=fields)

    ingest_client.ingest_from_dataframe(df, ingestion_properties=ingestion_properties)

    # mock_put_message_in_queue
    assert mock_put_message_in_queue.call_count == 1

    put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[0][1]
    assert put_message_in_queue_mock_kwargs["queue_name"] == "readyforaggregation-secured"

    queued_message = base64.b64decode(put_message_in_queue_mock_kwargs["content"].encode("utf-8")).decode("utf-8")
    queued_message_json = json.loads(queued_message)

    # mock_create_blob_from_stream
    assert (
        queued_message_json["BlobPath"]
        == "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__df_100_64.csv.gz?sas"
    )
    assert queued_message_json["DatabaseName"] == "database"
    assert queued_message_json["IgnoreSizeLimit"] == False
    assert queued_message_json["AdditionalProperties"]["format"] == "csv"
    assert queued_message_json["FlushImmediately"] == False
    assert queued_message_json["TableName"] == "table"
    assert queued_message_json["RawDataSize"] > 0
    assert queued_message_json["RetainBlobOnSuccess"] == True

    create_blob_from_path_mock_kwargs = mock_create_blob_from_path.call_args_list[0][1]

    import tempfile

    assert create_blob_from_path_mock_kwargs["container_name"] == "tempstorage"
    assert create_blob_from_path_mock_kwargs["file_path"] == os.path.join(tempfile.gettempdir(), "df_100_64.csv.gz")
    assert create_blob_from_path_mock_kwargs["blob_name"] == "database__table__1111-111111-111111-1111__df_100_64.csv.gz"
def test_simple_ingest_from_dataframe(self, mock_pid, mock_time, mock_uuid, mock_put_message_in_queue, mock_upload_blob_from_stream, ingest_client_class):
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )
    ingest_client = ingest_client_class("https://ingest-somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    from pandas import DataFrame

    fields = ["id", "name", "value"]
    rows = [[1, "abc", 15.3], [2, "cde", 99.9]]
    df = DataFrame(data=rows, columns=fields)

    result = ingest_client.ingest_from_dataframe(df, ingestion_properties=ingestion_properties)
    assert result.status == IngestionStatus.QUEUED

    expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__df_{0}_100_11111111-1111-1111-1111-111111111111.csv.gz?".format(
        id(df)
    )
    assert_queued_upload(mock_put_message_in_queue, mock_upload_blob_from_stream, expected_url)
def test_ingest_from_file_wrong_endpoint(self, ingest_client_class):
    responses.add_callback(
        responses.POST,
        "https://somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_error_callback,
        content_type="application/json",
    )
    ingest_client = ingest_client_class("https://somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)
    file_path = os.path.join(current_dir, *missing_path_parts)

    with pytest.raises(KustoInvalidEndpointError) as ex:
        ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)

    assert (
        ex.value.args[0]
        == "You are using 'DataManagement' client type, but the provided endpoint is of ServiceType 'Engine'. Initialize the "
        "client with the appropriate endpoint URI: 'https://ingest-somecluster.kusto.windows.net'"
    ), "Expected exception was not raised"
def test_tsv_ingestion_csv_mapping():
    tsv_ingestion_props = IngestionProperties(
        "PythonTest",
        "Deft",
        dataFormat=DataFormat.tsv,
        mapping=Helpers.create_deft_table_csv_mappings(),
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )
    tsv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.tsv")

    ingest_client.ingest_from_file(tsv_file_path, tsv_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 1 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()
        assert success_message[0].Table == "Deft"
        assert success_message[0].Database == "PythonTest"

        successes += 1

    assert successes == 1
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)

    response = client.execute("PythonTest", "Deft | count")
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 38, "Deft | count = " + text_type(row["Count"])
def write_to_db(coverage_data, args):
    # connect to database
    cluster = "https://ingest-onnxruntimedashboarddb.southcentralus.kusto.windows.net"
    kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(cluster)
    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    client = QueuedIngestClient(kcsb)
    fields = [
        "UploadTime",
        "CommitId",
        "Coverage",
        "LinesCovered",
        "TotalLines",
        "OS",
        "Arch",
        "BuildConfig",
        "ReportURL",
        "Branch",
    ]
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    rows = [
        [
            now_str,
            args.commit_hash,
            coverage_data["coverage"],
            coverage_data["lines_covered"],
            coverage_data["lines_valid"],
            args.os.lower(),
            args.arch.lower(),
            args.build_config.lower(),
            args.report_url.lower(),
            args.branch.lower(),
        ]
    ]
    ingestion_props = IngestionProperties(
        database="powerbi",
        table="test_coverage",
        data_format=DataFormat.CSV,
        report_level=ReportLevel.FailuresAndSuccesses,
    )
    df = pandas.DataFrame(data=rows, columns=fields)
    client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)
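# A hedged usage sketch for write_to_db above: the coverage_data keys and the
# attributes read from `args` are exactly those the function body uses; every
# value below is an illustrative placeholder, and the call assumes `az login`
# has already been run so the az-cli authentication can succeed.
from types import SimpleNamespace

coverage_data = {"coverage": 0.85, "lines_covered": 8500, "lines_valid": 10000}
args = SimpleNamespace(
    commit_hash="0123abcd",
    os="Linux",
    arch="x64",
    build_config="Release",
    report_url="https://example.invalid/coverage-report",
    branch="main",
)
write_to_db(coverage_data, args)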
def test_blob_info_json_mapping(self):
    """Tests serialization of json ingestion blob info."""
    validation_policy = ValidationPolicy(ValidationOptions.ValidateCsvInputConstantColumns, ValidationImplications.BestEffort)
    properties = IngestionProperties(
        database="database",
        table="table",
        data_format=DataFormat.JSON,
        column_mappings=[ColumnMapping("ColumnName", "datatype", path="jsonpath")],
        additional_tags=["tag"],
        ingest_if_not_exists=["ingestIfNotExistTags"],
        ingest_by_tags=["ingestByTags"],
        drop_by_tags=["dropByTags"],
        flush_immediately=True,
        report_level=ReportLevel.DoNotReport,
        report_method=ReportMethod.Queue,
        validation_policy=validation_policy,
    )
    blob = BlobDescriptor("somepath", 10)
    blob_info = IngestionBlobInfo(blob, properties, auth_context="authorizationContextText")
    self._verify_ingestion_blob_info_result(blob_info.to_json())
def test_sanity_ingest_from_file(self, mock_uuid, mock_put_message_in_queue, mock_create_blob_from_stream, mock_aad):
    responses.add_callback(
        responses.POST,
        "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
        callback=request_callback,
        content_type="application/json",
    )
    ingest_client = KustoIngestClient("https://ingest-somecluster.kusto.windows.net")
    ingestion_properties = IngestionProperties(database="database", table="table", dataFormat=DataFormat.CSV)

    # ensure test can work when executed from within directories
    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)
    file_path = os.path.join(current_dir, *missing_path_parts)

    ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)

    # mock_put_message_in_queue
    assert mock_put_message_in_queue.call_count == 1

    put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[0][1]
    assert put_message_in_queue_mock_kwargs["queue_name"] == "readyforaggregation-secured"

    queued_message = base64.b64decode(put_message_in_queue_mock_kwargs["content"].encode("utf-8")).decode("utf-8")
    queued_message_json = json.loads(queued_message)
    expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?sas"

    # mock_create_blob_from_stream
    assert queued_message_json["BlobPath"] == expected_url
    assert queued_message_json["DatabaseName"] == "database"
    assert queued_message_json["IgnoreSizeLimit"] == False
    assert queued_message_json["AdditionalProperties"]["format"] == "csv"
    assert queued_message_json["FlushImmediately"] == False
    assert queued_message_json["TableName"] == "table"
    assert queued_message_json["RawDataSize"] > 0
    assert queued_message_json["RetainBlobOnSuccess"] == True

    create_blob_from_stream_mock_kwargs = mock_create_blob_from_stream.call_args_list[0][1]
    assert create_blob_from_stream_mock_kwargs["container_name"] == "tempstorage"
    assert type(create_blob_from_stream_mock_kwargs["stream"]) == io.BytesIO
    assert create_blob_from_stream_mock_kwargs["blob_name"] == "database__table__1111-111111-111111-1111__dataset.csv.gz"
def ingest_to_ADX(filepath, filesize):
    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = AAD_TENANT_ID

    KCSB_ENGINE = KustoConnectionStringBuilder.with_aad_device_authentication(URI)
    KCSB_ENGINE.authority_id = AAD_TENANT_ID

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={"ignoreFirstRecord": "true"},
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )

    BLOB_PATH = "https://" + SOURCE_CSV_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_CSV_CONTAINER + "/" + filepath + SOURCE_CSV_BLOB_TOKEN
    BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, filesize)  # filesize is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)

    print("Done queuing up ingestion with Azure Data Explorer " + filepath)
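# ingest_to_ADX above depends on module-level configuration that is not shown
# in this section. A hedged sketch of the constants it assumes follows; every
# value is a placeholder, not the original deployment's settings.
DATA_INGESTION_URI = "https://ingest-somecluster.kusto.windows.net"
URI = "https://somecluster.kusto.windows.net"
AAD_TENANT_ID = "00000000-0000-0000-0000-000000000000"
DATABASE = "database"
DESTINATION_TABLE = "events"
DESTINATION_TABLE_COLUMN_MAPPING = "events_csv_mapping"
SOURCE_CSV_BLOB_ACCOUNT = "storageaccount"
SOURCE_CSV_CONTAINER = "container"
SOURCE_CSV_BLOB_TOKEN = "?<sas-token>"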
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validationOptions=ValidationOptions.ValidateCsvInputConstantColumns,
        validationImplications=ValidationImplications.Fail,
    )
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.JSON,
        ingestionMapping=Helpers.create_test_table_json_mappings(),
        additionalTags=["a", "b"],
        ingestIfNotExists=["aaaa", "bbbb"],
        ingestByTags=["ingestByTag"],
        dropByTags=["drop", "drop-by"],
        flushImmediately=False,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )
    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_success_mesagges_count(2)
    assert_row_count(4)
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validation_options=ValidationOptions.ValidateCsvInputConstantColumns,
        validation_implications=ValidationImplications.Fail,
    )
    json_ingestion_props = IngestionProperties(
        test_db,
        test_table,
        data_format=DataFormat.JSON,
        ingestion_mapping=TestData.test_table_json_mappings(),
        additional_tags=["a", "b"],
        ingest_if_not_exists=["aaaa", "bbbb"],
        ingest_by_tags=["ingestByTag"],
        drop_by_tags=["drop", "drop-by"],
        flush_immediately=False,
        report_level=ReportLevel.FailuresAndSuccesses,
        report_method=ReportMethod.Queue,
        validation_policy=validation_policy,
    )
    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_rows_added(4)
def test_streaming_ingest_from_dataframe():
    from pandas import DataFrame

    fields = [
        "rownumber",
        "rowguid",
        "xdouble",
        "xfloat",
        "xbool",
        "xint16",
        "xint32",
        "xint64",
        "xunit8",
        "xuint16",
        "xunit32",
        "xunit64",
        "xdate",
        "xsmalltext",
        "xtext",
        "xnumberAsText",
        "xtime",
        "xtextWithNulls",
        "xdynamicWithNulls",
    ]
    rows = [[0, "00000000-0000-0000-0001-020304050607", 0.0, 0.0, 0, 0, 0, 0, 0, 0, 0, 0, "2014-01-01T01:01:01Z", "Zero", "Zero", "0", "00:00:00", None, ""]]
    df = DataFrame(data=rows, columns=fields)
    ingestion_properties = IngestionProperties(database=db_name, table=table_name, dataFormat=DataFormat.CSV)
    ingest_client.ingest_from_dataframe(df, ingestion_properties)
def test_blob_info_json_mapping(self):
    """Tests serialization of json ingestion blob info."""
    validation_policy = ValidationPolicy(ValidationOptions.ValidateCsvInputConstantColumns, ValidationImplications.BestEffort)
    properties = IngestionProperties(
        database="database",
        table="table",
        dataFormat=DataFormat.json,
        mapping=[JsonColumnMapping("ColumnName", "jsonpath", "datatype")],
        additionalTags=["tag"],
        ingestIfNotExists=["ingestIfNotExistTags"],
        ingestByTags=["ingestByTags"],
        dropByTags=["dropByTags"],
        flushImmediately=True,
        reportLevel=ReportLevel.DoNotReport,
        reportMethod=ReportMethod.QueueAndTable,
        validationPolicy=validation_policy,
    )
    blob = BlobDescriptor("somepath", 10)
    blob_info = _IngestionBlobInfo(blob, properties, deleteSourcesOnSuccess=True, authContext="authorizationContextText")
    self._verify_ingestion_blob_info_result(blob_info.to_json())
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name, blob_account, file_size, tc, vm_uuid, deploy_uuid, config_uuid):
    ingest_source_id = str(uuid.uuid4())
    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = APP_AAD_TENANT_ID

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    ing_map = [
        JsonColumnMapping("vm_uuid", "$.vm_uuid", "string"),
        JsonColumnMapping("deploy_uuid", "$.deployment_description[0].deploy_uuid", "string"),
        JsonColumnMapping("config_uuid", "$.vm_configuration[0].config_uuid", "string"),
        JsonColumnMapping("rawdata", "$", "dynamic"),
    ]
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.JSON,
        ingestionMapping=ing_map,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        flushImmediately=IS_FLUSH_IMMEDIATELY,
    )
    print("Database {} Table {}".format(DATABASE, DESTINATION_TABLE))

    BLOB_PATH = "https://" + blob_account + ".blob.core.windows.net/" + container_name + "/" + filepath + CLEAN_FILE_TOKEN
    print(BLOB_PATH, " ", str(file_size), ingest_source_id)
    BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, file_size, ingest_source_id)  # file_size is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
    tc.context.properties["ingest_source_id"] = ingest_source_id

    min_datatime = 0
    max_datatime = 0
    total_records = 1

    doc_id = save_COSMOS_log(
        vm_uuid, deploy_uuid, config_uuid, filepath, min_datatime, max_datatime, total_records, ingest_source_id, blob_account, container_name, tc
    )
    tc.track_event(
        APP_INSIGHT_INGEST_EVENT_NAME,
        {"FILE_PATH": filepath, "DOC_ID": doc_id, "SOURCE_ID": ingest_source_id},
        {"TOTOAL_RECORDS": total_records, "FILE_SIZE": file_size, "MIN_DATETIME": min_datatime, "MAX_DATETIME": max_datatime},
    )
    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
def write_to_db(binary_size_data, args):
    # connect to database
    cluster = "https://ingest-onnxruntimedashboarddb.southcentralus.kusto.windows.net"
    kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(cluster)
    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    client = QueuedIngestClient(kcsb)
    fields = ["build_time", "build_id", "build_project", "commit_id", "os", "arch", "build_config", "size", "Branch"]
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    branch_name = os.environ.get("BUILD_SOURCEBRANCHNAME", "main")
    rows = []
    for row in binary_size_data:
        rows.append(
            [
                now_str,
                args.build_id,
                args.build_project,
                args.commit_hash,
                row["os"],
                row["arch"],
                row["build_config"],
                row["size"],
                branch_name.lower(),
            ]
        )
    ingestion_props = IngestionProperties(
        database="powerbi",
        table="binary_size",
        data_format=DataFormat.CSV,
        report_level=ReportLevel.FailuresAndSuccesses,
    )
    df = pandas.DataFrame(data=rows, columns=fields)
    client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)
def test_blob_info_csv_exceptions(self):
    """Tests invalid ingestion properties."""
    with self.assertRaises(KustoDuplicateMappingError):
        IngestionProperties(database="database", table="table", mapping="mapping", mappingReference="mappingReference")
def test_json_ingest_existing_table():
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.json,
        mapping=Helpers.create_deft_table_json_mappings(),
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )

    for f in [json_file_path, zipped_json_file_path]:
        ingest_client.ingest_from_file(f, json_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()
        assert success_message[0].Database == db_name
        assert success_message[0].Table == table_name

        successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)

    response = client.execute(db_name, "{} | count".format(table_name))
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 24, "{0} | count = {1}".format(table_name, text_type(row["Count"]))
def test_blob_info_csv_mapping(self):
    """Tests serialization of csv ingestion blob info."""
    validation_policy = ValidationPolicy(ValidationOptions.ValidateCsvInputConstantColumns, ValidationImplications.BestEffort)
    column_mapping = ColumnMapping("ColumnName", "cslDataType", ordinal=1)
    properties = IngestionProperties(
        database="database",
        table="table",
        dataFormat=DataFormat.CSV,
        ingestionMapping=[column_mapping],
        additionalTags=["tag"],
        ingestIfNotExists=["ingestIfNotExistTags"],
        ingestByTags=["ingestByTags"],
        dropByTags=["dropByTags"],
        flushImmediately=True,
        reportLevel=ReportLevel.DoNotReport,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )
    blob = BlobDescriptor("somepath", 10)
    blob_info = _IngestionBlobInfo(blob, properties, auth_context="authorizationContextText")
    self._verify_ingestion_blob_info_result(blob_info.to_json())
def test_csv_ingest_non_existing_table():
    csv_ingest_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.CSV,
        mapping=Helpers.create_deft_table_csv_mappings(),
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )
    csv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv")
    zipped_csv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv.gz")

    for f in [csv_file_path, zipped_csv_file_path]:
        ingest_client.ingest_from_file(f, csv_ingest_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()
        assert success_message[0].Database == db_name
        assert success_message[0].Table == table_name

        successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)

    response = client.execute(db_name, "{} | count".format(table_name))
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 20, "{0} | count = {1}".format(table_name, text_type(row["Count"]))
def write_table(ingest_client, table, table_name, upload_time, identifier):
    """Uploads the provided table to the database.

    This function also appends the upload time and unique run identifier to the table.

    :param ingest_client: An instance of QueuedIngestClient used to initiate data ingestion.
    :param table: The Pandas table to ingest.
    :param table_name: The name of the table in the database.
    :param upload_time: A datetime object denoting the data's upload time.
    :param identifier: An identifier that associates the uploaded data with an ORT commit/date/branch.
    """
    if table.empty:
        return

    # Add upload time and identifier columns to data table.
    table = table.assign(UploadTime=str(upload_time))
    table = table.assign(Identifier=identifier)

    ingestion_props = IngestionProperties(
        database=DATABASE_NAME,
        table=table_name,
        data_format=DataFormat.CSV,
        report_level=ReportLevel.FailuresAndSuccesses,
    )

    # append rows
    ingest_client.ingest_from_dataframe(table, ingestion_properties=ingestion_props)
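# A hedged usage sketch for write_table above: the client construction mirrors
# the other snippets in this section, DATABASE_NAME is assumed to be defined at
# module level, and the table name, data, and identifier are placeholders.
import datetime

import pandas
from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.ingest import QueuedIngestClient

kcsb = KustoConnectionStringBuilder.with_az_cli_authentication("https://ingest-somecluster.kusto.windows.net")
ingest_client = QueuedIngestClient(kcsb)
metrics = pandas.DataFrame({"Metric": ["latency_ms"], "Value": [12.3]})
write_table(ingest_client, metrics, "perf_metrics", datetime.datetime.now(), "commit|date|branch")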
def test_json_ingestion_ingest_by_tag():
    json_ingestion_props = IngestionProperties(
        "PythonTest",
        "Deft",
        dataFormat=DataFormat.json,
        mapping=Helpers.create_deft_table_json_mappings(),
        ingestIfNotExists=["ingestByTag"],
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=["drop", "drop-by"],
    )
    for f in [json_file_path, zipped_json_file_path]:
        ingest_client.ingest_from_file(f, json_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()
        assert success_message[0].Database == "PythonTest"
        assert success_message[0].Table == "Deft"

        successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)

    response = client.execute("PythonTest", "Deft | count")
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 28, "Deft | count = " + text_type(row["Count"])
def test_streaming_ingest_from_json_file():
    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.json"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)
    file_path = os.path.join(current_dir, *missing_path_parts)

    ingestion_properties = IngestionProperties(database=db_name, table=table_name, dataFormat=DataFormat.json, mappingReference="JsonMapping")
    ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)

    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.jsonz.gz"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)
    file_path = os.path.join(current_dir, *missing_path_parts)

    ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)
def test_streaming_ingest_from_json_no_mapping():
    ingestion_properties = IngestionProperties(database=db_name, table=table_name, dataFormat=DataFormat.json)

    try:
        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.json"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)
        file_path = os.path.join(current_dir, *missing_path_parts)
        ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)
    except KustoMissingMappingReferenceError:
        pass

    try:
        byte_sequence = b'{"rownumber": 0, "rowguid": "00000000-0000-0000-0001-020304050607", "xdouble": 0.0, "xfloat": 0.0, "xbool": 0, "xint16": 0, "xint32": 0, "xint64": 0, "xunit8": 0, "xuint16": 0, "xunit32": 0, "xunit64": 0, "xdate": "2014-01-01T01:01:01Z", "xsmalltext": "Zero", "xtext": "Zero", "xnumberAsText": "0", "xtime": "00:00:00", "xtextWithNulls": null, "xdynamicWithNulls": ""}'
        bytes_stream = io.BytesIO(byte_sequence)
        ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)
    except KustoMissingMappingReferenceError:
        pass
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name, blob_account, tc):
    ingest_source_id = str(uuid.uuid4())
    #file_size=BlockBlobService.get_blob_properties(telemetry_block_blob_service,container_name,filepath).properties.content_length
    #print (filepath+" File Size "+str(file_size))

    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = APP_AAD_TENANT_ID

    vm_uuid, config_uuid, deploy_uuid, file_size, min_datatime, max_datatime, total_records = get_uuids_from_csv(
        telemetry_block_blob_service, container_name, filepath
    )
    dropByTag = vm_uuid + "_" + config_uuid + "_" + deploy_uuid

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={"ignoreFirstRecord": "true", "reportMethod": "QueueAndTable"},
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=[dropByTag],
        flushImmediately=IS_FLUSH_IMMEDIATELY,
    )

    BLOB_PATH = "https://" + SOURCE_OSMETRICS_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_OSMETRICS_CONTAINER + "/" + filepath + SOURCE_OSMETRICS_FILE_TOKEN
    #print (BLOB_PATH,' ',str(file_size))
    BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, file_size, ingest_source_id)  # file_size is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
    tc.context.properties["ingest_source_id"] = str(ingest_source_id)

    doc_id = save_COSMOS_log(
        vm_uuid, deploy_uuid, config_uuid, filepath, min_datatime, max_datatime, total_records, ingest_source_id, blob_account, container_name, tc
    )
    tc.track_event(
        APP_INSIGHT_INGEST_EVENT_NAME,
        {"FILE_PATH": filepath, "DOC_ID": doc_id, "SOURCE_ID": ingest_source_id},
        {"TOTOAL_RECORDS": total_records, "FILE_SIZE": file_size, "MIN_DATETIME": min_datatime, "MAX_DATETIME": max_datatime},
    )
    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
def ingestBlob(client, db, blob, properties):
    INGESTION_PROPERTIES = IngestionProperties(
        database=db,
        table=blob["table"],
        dataFormat=DataFormat(blob["format"]),
        mappingReference=blob["ingestionMapping"],
        additionalProperties=properties,
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )
    BLOB_DESCRIPTOR = BlobDescriptor(blob["path"], blob["size"])
    try:
        client.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
        logging.info("Blob %s ingested successfully." % blob["name"])
    except Exception as e:
        logging.error("Error ingesting blob %s: %s" % (blob["name"], e))
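# A hedged usage sketch for ingestBlob above: the dict keys (name, table,
# format, ingestionMapping, path, size) are exactly the ones the function
# reads; all values, including the SAS token on the path, are placeholders.
blob = {
    "name": "dataset.csv",
    "table": "events",
    "format": "csv",
    "ingestionMapping": "events_csv_mapping",
    "path": "https://storageaccount.blob.core.windows.net/container/dataset.csv?<sas-token>",
    "size": 1024,
}
ingestBlob(client, "database", blob, {"ignoreFirstRecord": "true"})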
def test_with_constant_value():
    IngestionProperties(
        database="database",
        table="table",
        column_mappings=[ColumnMapping("test", "int", const_value="1")],
        data_format=DataFormat.PARQUET,
        ingestion_mapping_kind=IngestionMappingKind.PARQUET,
    )
def test_duplicate_reference_and_column_mappings_raises():
    """Tests invalid ingestion properties."""
    with pytest.raises(KustoDuplicateMappingError):
        IngestionProperties(
            database="database",
            table="table",
            column_mappings=[ColumnMapping("test", "int")],
            ingestion_mapping_reference="ingestionMappingReference",
        )