Example #1
def test_streaming_ingest_from_io_streams():
    ingestion_properties = IngestionProperties(database=db_name, table=table_name, dataFormat=DataFormat.CSV)
    byte_sequence = b'0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null'
    bytes_stream = io.BytesIO(byte_sequence)
    ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)

    str_sequence = '0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null'
    str_stream = io.StringIO(str_sequence)
    ingest_client.ingest_from_stream(str_stream, ingestion_properties=ingestion_properties)

    byte_sequence = b'{"rownumber": 0, "rowguid": "00000000-0000-0000-0001-020304050607", "xdouble": 0.0, "xfloat": 0.0, "xbool": 0, "xint16": 0, "xint32": 0, "xint64": 0, "xunit8": 0, "xuint16": 0, "xunit32": 0, "xunit64": 0, "xdate": "2014-01-01T01:01:01Z", "xsmalltext": "Zero", "xtext": "Zero", "xnumberAsText": "0", "xtime": "00:00:00", "xtextWithNulls": null, "xdynamicWithNulls": ""}'
    bytes_stream = io.BytesIO(byte_sequence)
    ingestion_properties.format = DataFormat.JSON

    ingestion_properties.ingestion_mapping_reference = "JsonMapping"
    ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)

    str_sequence = u'{"rownumber": 0, "rowguid": "00000000-0000-0000-0001-020304050607", "xdouble": 0.0, "xfloat": 0.0, "xbool": 0, "xint16": 0, "xint32": 0, "xint64": 0, "xunit8": 0, "xuint16": 0, "xunit32": 0, "xunit64": 0, "xdate": "2014-01-01T01:01:01Z", "xsmalltext": "Zero", "xtext": "Zero", "xnumberAsText": "0", "xtime": "00:00:00", "xtextWithNulls": null, "xdynamicWithNulls": ""}'
    str_stream = io.StringIO(str_sequence)
    ingest_client.ingest_from_stream(str_stream, ingestion_properties=ingestion_properties)

    byte_sequence = (
        b'0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null'
        * 600000
    )
    bytes_stream = io.BytesIO(byte_sequence)

    try:
        ingest_client.ingest_from_stream(bytes_stream, ingestion_properties=ingestion_properties)
    except KustoStreamMaxSizeExceededError:
        # Streams over the client's streaming-ingest size limit are expected to be rejected.
        pass
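Note: this example relies on module-level `db_name`, `table_name`, and `ingest_client` objects defined elsewhere in the test suite. A minimal sketch of that setup, assuming AAD device authentication and placeholder cluster/database/table names (import paths vary across SDK versions):

from azure.kusto.data import KustoConnectionStringBuilder
from azure.kusto.ingest import KustoStreamingIngestClient

# Hypothetical values; substitute your own cluster, database, and table.
kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(
    "https://somecluster.kusto.windows.net")
db_name = "PythonTest"
table_name = "Deft"
ingest_client = KustoStreamingIngestClient(kcsb)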
Example #2
    def ingest_from_source(self, source: IngestionSource,
                           mapping: IngestionMapping, target_database: str,
                           target_table: str, **kwargs):
        files = source.files
        ingest_client = self.client_provider.get_ingest_client()

        # TODO: should maybe persist ingestion mappings
        ingestion_props = IngestionProperties(
            target_database,
            target_table,
            dataFormat=DataFormat(source.data_format),
            mapping=self.get_ingestion_mapping(source.data_format, mapping),
            reportLevel=ReportLevel.FailuresOnly,
            reportMethod=ReportMethod.Queue,
            flushImmediately=True,
        )

        if "batch_id" in kwargs and not kwargs.get("no_wait", False):
            # this helps with monitoring
            ingestion_props.ingest_by_tags = [kwargs["batch_id"]]
        for file_path in files:
            if kwargs.get("direct", True):
                # TODO: allow for direct ingestion (this is currently only relevant to files already in storage)
                # client.execute(f'.ingest into table {operation.target} ({}) with ({mapping_ref_key}="{mapping_name}")')
                pass
            else:
                logger.info(
                    f'Queueing "{file_path}" to ingest into "{ingestion_props.table}"'
                )
                ingest_client.ingest_from_file(str(file_path), ingestion_props)
Example #3
    def test_streaming_ingest_from_stream(self):
        responses.add_callback(
            responses.POST,
            "https://somecluster.kusto.windows.net/v1/rest/ingest/database/table",
            callback=request_callback,
        )

        ingest_client = KustoStreamingIngestClient(
            "https://somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   dataFormat=DataFormat.csv)

        byte_sequence = b"56,56,56"
        bytes_stream = io.BytesIO(byte_sequence)
        ingest_client.ingest_from_stream(
            bytes_stream, ingestion_properties=ingestion_properties)

        str_sequence = u"57,57,57"
        str_stream = io.StringIO(str_sequence)
        ingest_client.ingest_from_stream(
            str_stream, ingestion_properties=ingestion_properties)

        byte_sequence = b'{"Name":"Ben","Age":"56","Weight":"75"}'
        bytes_stream = io.BytesIO(byte_sequence)
        ingestion_properties.format = DataFormat.json
        try:
            ingest_client.ingest_from_stream(
                bytes_stream, ingestion_properties=ingestion_properties)
        except KustoMissingMappingReferenceError:
            pass

        ingestion_properties.mapping_reference = "JsonMapping"
        ingest_client.ingest_from_stream(
            bytes_stream, ingestion_properties=ingestion_properties)

        str_sequence = u'{"Name":"Ben","Age":"56","Weight":"75"}'
        str_stream = io.StringIO(str_sequence)
        ingest_client.ingest_from_stream(
            str_stream, ingestion_properties=ingestion_properties)

        byte_sequence = b"56,56,56" * 600000
        bytes_stream = io.BytesIO(byte_sequence)

        try:
            ingest_client.ingest_from_stream(
                bytes_stream, ingestion_properties=ingestion_properties)
        except KustoStreamMaxSizeExceededError:
            pass
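These unit tests stub the cluster endpoints with the `responses` library; `request_callback` is a helper defined elsewhere in the test module. A minimal sketch, assuming the test only needs a well-formed JSON body back (the real helper also validates the request):

import json

def request_callback(request):
    # The responses library expects a (status, headers, body) tuple.
    body = {"Tables": [{"TableName": "Table_0", "Columns": [], "Rows": []}]}
    return 200, {"Content-Type": "application/json"}, json.dumps(body)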
Example #4
    def test_blob_ingestion(self, mock_uuid, mock_put_message_in_queue,
                            mock_aad):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=queued_request_callback,
            content_type="application/json")

        ingest_client = ManagedStreamingIngestClient.from_dm_kcsb(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table")

        blob_path = (
            "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?sp=rl&st=2020-05-20T13"
            "%3A38%3A37Z&se=2020-05-21T13%3A38%3A37Z&sv=2019-10-10&sr=c&sig=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
        )
        result = ingest_client.ingest_from_blob(
            BlobDescriptor(blob_path, 1),
            ingestion_properties=ingestion_properties)

        assert result.status == IngestionStatus.QUEUED

        assert_queued_upload(
            mock_put_message_in_queue,
            mock_upload_blob_from_stream=None,
            expected_url=
            "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__tmpbvk40leg?",
        )
Example #5
    def test_simple_ingest_from_dataframe(self, mock_pid, mock_time, mock_uuid,
                                          mock_put_message_in_queue,
                                          mock_create_blob_from_path):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=request_callback,
            content_type="application/json",
        )

        ingest_client = KustoIngestClient(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   dataFormat=DataFormat.csv)

        from pandas import DataFrame

        fields = ["id", "name", "value"]
        rows = [[1, "abc", 15.3], [2, "cde", 99.9]]
        df = DataFrame(data=rows, columns=fields)

        ingest_client.ingest_from_dataframe(
            df, ingestion_properties=ingestion_properties)

        # mock_put_message_in_queue
        assert mock_put_message_in_queue.call_count == 1

        put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[
            0][1]

        assert put_message_in_queue_mock_kwargs[
            "queue_name"] == "readyforaggregation-secured"
        queued_message = base64.b64decode(
            put_message_in_queue_mock_kwargs["content"].encode(
                "utf-8")).decode("utf-8")
        queued_message_json = json.loads(queued_message)
        # mock_create_blob_from_stream
        assert (
            queued_message_json["BlobPath"] ==
            "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__df_100_64.csv.gz?sas"
        )
        assert queued_message_json["DatabaseName"] == "database"
        assert queued_message_json["IgnoreSizeLimit"] == False
        assert queued_message_json["AdditionalProperties"]["format"] == "csv"
        assert queued_message_json["FlushImmediately"] == False
        assert queued_message_json["TableName"] == "table"
        assert queued_message_json["RawDataSize"] > 0
        assert queued_message_json["RetainBlobOnSuccess"] == True

        create_blob_from_path_mock_kwargs = mock_create_blob_from_path.call_args_list[
            0][1]
        import tempfile

        assert create_blob_from_path_mock_kwargs[
            "container_name"] == "tempstorage"
        assert create_blob_from_path_mock_kwargs["file_path"] == os.path.join(
            tempfile.gettempdir(), "df_100_64.csv.gz")
        assert (create_blob_from_path_mock_kwargs["blob_name"] ==
                "database__table__1111-111111-111111-1111__df_100_64.csv.gz")
Example #6
    def test_simple_ingest_from_dataframe(self, mock_pid, mock_time, mock_uuid,
                                          mock_put_message_in_queue,
                                          mock_upload_blob_from_stream,
                                          ingest_client_class):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=request_callback,
            content_type="application/json")

        ingest_client = ingest_client_class(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   data_format=DataFormat.CSV)

        from pandas import DataFrame

        fields = ["id", "name", "value"]
        rows = [[1, "abc", 15.3], [2, "cde", 99.9]]
        df = DataFrame(data=rows, columns=fields)

        result = ingest_client.ingest_from_dataframe(
            df, ingestion_properties=ingestion_properties)
        assert result.status == IngestionStatus.QUEUED

        expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__11111111-1111-1111-1111-111111111111__df_{0}_100_11111111-1111-1111-1111-111111111111.csv.gz?".format(
            id(df))

        assert_queued_upload(mock_put_message_in_queue,
                             mock_upload_blob_from_stream, expected_url)
Example #7
    def test_ingest_from_file_wrong_endpoint(self, ingest_client_class):
        responses.add_callback(
            responses.POST,
            "https://somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=request_error_callback,
            content_type="application/json")

        ingest_client = ingest_client_class(
            "https://somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   data_format=DataFormat.CSV)

        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)

        file_path = os.path.join(current_dir, *missing_path_parts)

        with pytest.raises(KustoInvalidEndpointError) as ex:
            ingest_client.ingest_from_file(
                file_path, ingestion_properties=ingestion_properties)

        assert (
            ex.value.args[0] ==
            "You are using 'DataManagement' client type, but the provided endpoint is of ServiceType 'Engine'. Initialize the "
            "client with the appropriate endpoint URI: 'https://ingest-somecluster.kusto.windows.net'"
        ), "Unexpected exception message"
Example #8
def test_tsv_ingestion_csv_mapping():
    tsv_ingestion_props = IngestionProperties(
        "PythonTest",
        "Deft",
        dataFormat=DataFormat.tsv,
        mapping=Helpers.create_deft_table_csv_mappings(),
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )
    tsv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests",
                                 "input", "dataset.tsv")

    ingest_client.ingest_from_file(tsv_file_path, tsv_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 1 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()

        assert success_message[0].Table == "Deft"
        assert success_message[0].Database == "PythonTest"

        successes += 1

    assert successes == 1
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)
    response = client.execute("PythonTest", "Deft | count")
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 38, "Deft | count = " + text_type(row["Count"])
Example #9
def write_to_db(coverage_data, args):
    # connect to database
    cluster = "https://ingest-onnxruntimedashboarddb.southcentralus.kusto.windows.net"
    kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(cluster)
    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    client = QueuedIngestClient(kcsb)
    fields = [
        "UploadTime", "CommitId", "Coverage", "LinesCovered", "TotalLines",
        "OS", "Arch", "BuildConfig", "ReportURL", "Branch"
    ]
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    rows = [[
        now_str, args.commit_hash, coverage_data['coverage'],
        coverage_data['lines_covered'], coverage_data['lines_valid'],
        args.os.lower(),
        args.arch.lower(),
        args.build_config.lower(),
        args.report_url.lower(),
        args.branch.lower()
    ]]
    ingestion_props = IngestionProperties(
        database="powerbi",
        table="test_coverage",
        data_format=DataFormat.CSV,
        report_level=ReportLevel.FailuresAndSuccesses)
    df = pandas.DataFrame(data=rows, columns=fields)
    client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)
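`write_to_db` reads its metadata from an argparse-style `args` object; a hypothetical invocation, assuming a `coverage_data` dict with the keys used above, might look like:

from types import SimpleNamespace

coverage_data = {"coverage": 0.85, "lines_covered": 1700, "lines_valid": 2000}
args = SimpleNamespace(
    commit_hash="abc123",
    os="Linux",
    arch="x64",
    build_config="Release",
    report_url="https://example.com/report",
    branch="main",
)
write_to_db(coverage_data, args)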
Example #10
    def test_blob_info_json_mapping(self):
        """Tests serialization of json ingestion blob info."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        properties = IngestionProperties(
            database="database",
            table="table",
            data_format=DataFormat.JSON,
            column_mappings=[
                ColumnMapping("ColumnName", "datatype", path="jsonpath")
            ],
            additional_tags=["tag"],
            ingest_if_not_exists=["ingestIfNotExistTags"],
            ingest_by_tags=["ingestByTags"],
            drop_by_tags=["dropByTags"],
            flush_immediately=True,
            report_level=ReportLevel.DoNotReport,
            report_method=ReportMethod.Queue,
            validation_policy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = IngestionBlobInfo(blob,
                                      properties,
                                      auth_context="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #11
    def test_sanity_ingest_from_file(self, mock_uuid,
                                     mock_put_message_in_queue,
                                     mock_create_blob_from_stream, mock_aad):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=request_callback,
            content_type="application/json")

        ingest_client = KustoIngestClient(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   dataFormat=DataFormat.CSV)

        # ensure test can work when executed from within directories
        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)

        file_path = os.path.join(current_dir, *missing_path_parts)

        ingest_client.ingest_from_file(
            file_path, ingestion_properties=ingestion_properties)

        # mock_put_message_in_queue
        assert mock_put_message_in_queue.call_count == 1

        put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[
            0][1]

        assert put_message_in_queue_mock_kwargs[
            "queue_name"] == "readyforaggregation-secured"
        queued_message = base64.b64decode(
            put_message_in_queue_mock_kwargs["content"].encode(
                "utf-8")).decode("utf-8")
        queued_message_json = json.loads(queued_message)
        expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/" "database__table__1111-111111-111111-1111__dataset.csv.gz?sas"
        # mock_create_blob_from_stream
        assert queued_message_json["BlobPath"] == expected_url
        assert queued_message_json["DatabaseName"] == "database"
        assert queued_message_json["IgnoreSizeLimit"] == False
        assert queued_message_json["AdditionalProperties"]["format"] == "csv"
        assert queued_message_json["FlushImmediately"] == False
        assert queued_message_json["TableName"] == "table"
        assert queued_message_json["RawDataSize"] > 0
        assert queued_message_json["RetainBlobOnSuccess"] == True

        create_blob_from_stream_mock_kwargs = mock_create_blob_from_stream.call_args_list[
            0][1]

        assert create_blob_from_stream_mock_kwargs[
            "container_name"] == "tempstorage"
        assert type(
            create_blob_from_stream_mock_kwargs["stream"]) == io.BytesIO
        assert create_blob_from_stream_mock_kwargs[
            "blob_name"] == "database__table__1111-111111-111111-1111__dataset.csv.gz"
Example #12
def ingest_to_ADX(filepath, filesize):
    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = AAD_TENANT_ID

    KCSB_ENGINE = KustoConnectionStringBuilder.with_aad_device_authentication(
        URI)
    KCSB_ENGINE.authority_id = AAD_TENANT_ID

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={'ignoreFirstRecord': 'true'},
        reportLevel=ReportLevel.FailuresAndSuccesses)
    BLOB_PATH = "https://" + SOURCE_CSV_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_CSV_CONTAINER + "/" + filepath + SOURCE_CSV_BLOB_TOKEN

    BLOB_DESCRIPTOR = BlobDescriptor(
        BLOB_PATH, filesize)  # filesize is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(
        BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)

    print('Done queuing up ingestion with Azure Data Explorer ' + filepath)
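This function depends on several module-level constants; a hypothetical configuration block (all values are placeholders, including the SAS token) could look like:

DATA_INGESTION_URI = "https://ingest-somecluster.kusto.windows.net"
URI = "https://somecluster.kusto.windows.net"
AAD_TENANT_ID = "11111111-1111-1111-1111-111111111111"
DATABASE = "database"
DESTINATION_TABLE = "table"
DESTINATION_TABLE_COLUMN_MAPPING = "CsvMapping"
SOURCE_CSV_BLOB_ACCOUNT = "storageaccount"
SOURCE_CSV_CONTAINER = "container"
SOURCE_CSV_BLOB_TOKEN = "?<sas-token>"  # SAS query string appended to the blob path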
Example #13
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validationOptions=ValidationOptions.ValidateCsvInputConstantColumns,
        validationImplications=ValidationImplications.Fail)
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.JSON,
        ingestionMapping=Helpers.create_test_table_json_mappings(),
        additionalTags=["a", "b"],
        ingestIfNotExists=["aaaa", "bbbb"],
        ingestByTags=["ingestByTag"],
        dropByTags=["drop", "drop-by"],
        flushImmediately=False,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_success_messages_count(2)
    assert_row_count(4)
Example #14
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validation_options=ValidationOptions.ValidateCsvInputConstantColumns,
        validation_implications=ValidationImplications.Fail)
    json_ingestion_props = IngestionProperties(
        test_db,
        test_table,
        data_format=DataFormat.JSON,
        ingestion_mapping=TestData.test_table_json_mappings(),
        additional_tags=["a", "b"],
        ingest_if_not_exists=["aaaa", "bbbb"],
        ingest_by_tags=["ingestByTag"],
        drop_by_tags=["drop", "drop-by"],
        flush_immediately=False,
        report_level=ReportLevel.FailuresAndSuccesses,
        report_method=ReportMethod.Queue,
        validation_policy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_rows_added(4)
Example #15
def test_streaming_ingest_from_dataframe():
    from pandas import DataFrame

    fields = [
        "rownumber",
        "rowguid",
        "xdouble",
        "xfloat",
        "xbool",
        "xint16",
        "xint32",
        "xint64",
        "xunit8",
        "xuint16",
        "xunit32",
        "xunit64",
        "xdate",
        "xsmalltext",
        "xtext",
        "xnumberAsText",
        "xtime",
        "xtextWithNulls",
        "xdynamicWithNulls",
    ]
    rows = [[
        0, "00000000-0000-0000-0001-020304050607", 0.0, 0.0, 0, 0, 0, 0, 0, 0,
        0, 0, "2014-01-01T01:01:01Z", "Zero", "Zero", "0", "00:00:00", None, ""
    ]]
    df = DataFrame(data=rows, columns=fields)
    ingestion_properties = IngestionProperties(database=db_name,
                                               table=table_name,
                                               dataFormat=DataFormat.CSV)
    ingest_client.ingest_from_dataframe(df, ingestion_properties)
Example #16
    def test_blob_info_json_mapping(self):
        """Tests serialization of json ingestion blob info."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        properties = IngestionProperties(
            database="database",
            table="table",
            dataFormat=DataFormat.json,
            mapping=[JsonColumnMapping("ColumnName", "jsonpath", "datatype")],
            additionalTags=["tag"],
            ingestIfNotExists=["ingestIfNotExistTags"],
            ingestByTags=["ingestByTags"],
            dropByTags=["dropByTags"],
            flushImmediately=True,
            reportLevel=ReportLevel.DoNotReport,
            reportMethod=ReportMethod.QueueAndTable,
            validationPolicy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = _IngestionBlobInfo(blob,
                                       properties,
                                       deleteSourcesOnSuccess=True,
                                       authContext="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #17
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,
                  blob_account, file_size, tc, vm_uuid, deploy_uuid, config_uuid):
    ingest_source_id = str(uuid.uuid4())
    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = APP_AAD_TENANT_ID
    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    ing_map = [JsonColumnMapping("vm_uuid", "$.vm_uuid", "string"),
               JsonColumnMapping("deploy_uuid", "$.deployment_description[0].deploy_uuid", "string"),
               JsonColumnMapping("config_uuid", "$.vm_configuration[0].config_uuid", "string"),
               JsonColumnMapping("rawdata", "$", "dynamic")]

    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.JSON,
        ingestionMapping=ing_map,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        flushImmediately=IS_FLUSH_IMMEDIATELY)

    print("Database {} Table {}".format(DATABASE, DESTINATION_TABLE))

    BLOB_PATH = "https://" + blob_account + ".blob.core.windows.net/" + container_name + "/" + filepath + CLEAN_FILE_TOKEN

    print(BLOB_PATH, ' ', str(file_size), ingest_source_id)
    BLOB_DESCRIPTOR = BlobDescriptor(BLOB_PATH, file_size, ingest_source_id)  # file_size is the raw size of the data in bytes
    INGESTION_CLIENT.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
    tc.context.properties["ingest_source_id"] = ingest_source_id

    min_datatime = 0
    max_datatime = 0
    total_records = 1

    doc_id = save_COSMOS_log(vm_uuid, deploy_uuid, config_uuid, filepath,
                             min_datatime, max_datatime, total_records,
                             ingest_source_id, blob_account, container_name, tc)

    tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME,
                   {'FILE_PATH': filepath, 'DOC_ID': doc_id, "SOURCE_ID": ingest_source_id},
                   {'TOTAL_RECORDS': total_records, 'FILE_SIZE': file_size,
                    'MIN_DATETIME': min_datatime, 'MAX_DATETIME': max_datatime})
    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
Example #18
def write_to_db(binary_size_data, args):
    # connect to database
    cluster = "https://ingest-onnxruntimedashboarddb.southcentralus.kusto.windows.net"
    kcsb = KustoConnectionStringBuilder.with_az_cli_authentication(cluster)
    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    client = QueuedIngestClient(kcsb)
    fields = ["build_time", "build_id", "build_project", "commit_id", "os", "arch", "build_config", "size", "Branch"]
    now_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    branch_name = os.environ.get("BUILD_SOURCEBRANCHNAME", "main")
    rows = []
    for row in binary_size_data:
        rows.append(
            [
                now_str,
                args.build_id,
                args.build_project,
                args.commit_hash,
                row["os"],
                row["arch"],
                row["build_config"],
                row["size"],
                branch_name.lower(),
            ]
        )
    ingestion_props = IngestionProperties(
        database="powerbi",
        table="binary_size",
        data_format=DataFormat.CSV,
        report_level=ReportLevel.FailuresAndSuccesses,
    )
    df = pandas.DataFrame(data=rows, columns=fields)
    client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)
Example #19
    def test_blob_info_csv_exceptions(self):
        """Tests invalid ingestion properties."""
        with self.assertRaises(KustoDuplicateMappingError):
            IngestionProperties(database="database",
                                table="table",
                                mapping="mapping",
                                mappingReference="mappingReference")
Example #20
def test_json_ingest_existing_table():
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.json,
        mapping=Helpers.create_deft_table_json_mappings(),
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )

    for f in [json_file_path, zipped_json_file_path]:
        ingest_client.ingest_from_file(f, json_ingestion_props)

    successes = 0
    timeout = 60

    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()

        assert success_message[0].Database == db_name
        assert success_message[0].Table == table_name

        successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)
    response = client.execute(db_name, "{} | count".format(table_name))
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 24, "{0} | count = {1}".format(
            table_name, text_type(row["Count"]))
Example #21
    def test_blob_info_csv_mapping(self):
        """Tests serialization of csv ingestion blob info."""
        validation_policy = ValidationPolicy(
            ValidationOptions.ValidateCsvInputConstantColumns,
            ValidationImplications.BestEffort)
        columnMapping = ColumnMapping("ColumnName", "cslDataType", ordinal=1)

        properties = IngestionProperties(
            database="database",
            table="table",
            dataFormat=DataFormat.CSV,
            ingestionMapping=[columnMapping],
            additionalTags=["tag"],
            ingestIfNotExists=["ingestIfNotExistTags"],
            ingestByTags=["ingestByTags"],
            dropByTags=["dropByTags"],
            flushImmediately=True,
            reportLevel=ReportLevel.DoNotReport,
            reportMethod=ReportMethod.Queue,
            validationPolicy=validation_policy,
        )
        blob = BlobDescriptor("somepath", 10)
        blob_info = _IngestionBlobInfo(blob,
                                       properties,
                                       auth_context="authorizationContextText")
        self._verify_ingestion_blob_info_result(blob_info.to_json())
Example #22
def test_csv_ingest_non_existing_table():
    csv_ingest_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.CSV,
        mapping=Helpers.create_deft_table_csv_mappings(),
        reportLevel=ReportLevel.FailuresAndSuccesses,
    )
    csv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv")
    zipped_csv_file_path = os.path.join(os.getcwd(), "azure-kusto-ingest", "tests", "input", "dataset.csv.gz")

    for f in [csv_file_path, zipped_csv_file_path]:
        ingest_client.ingest_from_file(f, csv_ingest_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()

        assert success_message[0].Database == db_name
        assert success_message[0].Table == table_name

        successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)
    response = client.execute(db_name, "{} | count".format(table_name))
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 20, "{0} | count = {1}".format(table_name, text_type(row["Count"]))
Example #23
def write_table(ingest_client, table, table_name, upload_time, identifier):
    """
    Uploads the provided table to the database. This function also appends the upload time and unique run identifier
    to the table.

    :param ingest_client: An instance of QueuedIngestClient used to initiate data ingestion.
    :param table: The pandas DataFrame to ingest.
    :param table_name: The name of the table in the database.
    :param upload_time: A datetime object denoting the data's upload time.
    :param identifier: An identifier that associates the uploaded data with an ORT commit/date/branch.
    """

    if table.empty:
        return

    # Add upload time and identifier columns to data table.
    table = table.assign(UploadTime=str(upload_time))
    table = table.assign(Identifier=identifier)
    ingestion_props = IngestionProperties(
        database=DATABASE_NAME,
        table=table_name,
        data_format=DataFormat.CSV,
        report_level=ReportLevel.FailuresAndSuccesses,
    )
    # append rows
    ingest_client.ingest_from_dataframe(table,
                                        ingestion_properties=ingestion_props)
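A hypothetical call, assuming `DATABASE_NAME` and a `QueuedIngestClient` named `ingest_client` are already configured and a table named "ort_metrics" exists:

import datetime
import pandas

table = pandas.DataFrame({"metric": ["latency_ms"], "value": [1.23]})
write_table(ingest_client, table, "ort_metrics", datetime.datetime.now(), "commit_abc123")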
Example #24
def test_json_ingestion_ingest_by_tag():
    json_ingestion_props = IngestionProperties(
        "PythonTest",
        "Deft",
        dataFormat=DataFormat.json,
        mapping=Helpers.create_deft_table_json_mappings(),
        ingestIfNotExists=["ingestByTag"],
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=["drop", "drop-by"],
    )
    for f in [json_file_path, zipped_json_file_path]:
        ingest_client.ingest_from_file(f, json_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()

        assert success_message[0].Database == "PythonTest"
        assert success_message[0].Table == "Deft"

        successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)
    response = client.execute("PythonTest", "Deft | count")
    for row in response.primary_results[0]:
        assert int(
            row["Count"]) == 28, "Deft | count = " + text_type(row["Count"])
Example #25
def test_streaming_ingest_from_json_file():
    current_dir = os.getcwd()
    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.json"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)

    file_path = os.path.join(current_dir, *missing_path_parts)
    ingestion_properties = IngestionProperties(database=db_name,
                                               table=table_name,
                                               dataFormat=DataFormat.json,
                                               mappingReference="JsonMapping")
    ingest_client.ingest_from_file(file_path,
                                   ingestion_properties=ingestion_properties)

    path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.jsonz.gz"]
    missing_path_parts = []
    for path_part in path_parts:
        if path_part not in current_dir:
            missing_path_parts.append(path_part)

    file_path = os.path.join(current_dir, *missing_path_parts)

    ingest_client.ingest_from_file(file_path,
                                   ingestion_properties=ingestion_properties)
Example #26
def test_streaming_ingest_from_json_no_mapping():
    ingestion_properties = IngestionProperties(database=db_name,
                                               table=table_name,
                                               dataFormat=DataFormat.json)
    try:
        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.json"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)

        file_path = os.path.join(current_dir, *missing_path_parts)
        ingest_client.ingest_from_file(
            file_path, ingestion_properties=ingestion_properties)
    except KustoMissingMappingReferenceError:
        pass

    try:
        byte_sequence = b'{"rownumber": 0, "rowguid": "00000000-0000-0000-0001-020304050607", "xdouble": 0.0, "xfloat": 0.0, "xbool": 0, "xint16": 0, "xint32": 0, "xint64": 0, "xunit8": 0, "xuint16": 0, "xunit32": 0, "xunit64": 0, "xdate": "2014-01-01T01:01:01Z", "xsmalltext": "Zero", "xtext": "Zero", "xnumberAsText": "0", "xtime": "00:00:00", "xtextWithNulls": null, "xdynamicWithNulls": ""}'
        bytes_stream = io.BytesIO(byte_sequence)
        ingest_client.ingest_from_stream(
            bytes_stream, ingestion_properties=ingestion_properties)
    except KustoMissingMappingReferenceError:
        pass
Example #27
def ingest_to_ADX(filepath, telemetry_block_blob_service, container_name,
                  blob_account, tc):
    ingest_source_id = str(uuid.uuid4())
    #file_size=BlockBlobService.get_blob_properties(telemetry_block_blob_service,container_name,filepath).properties.content_length
    #print (filepath+" File Size "+str(file_size))

    KCSB_INGEST = KustoConnectionStringBuilder.with_aad_device_authentication(
        DATA_INGESTION_URI)
    KCSB_INGEST.authority_id = APP_AAD_TENANT_ID

    vm_uuid, config_uuid, deploy_uuid, file_size, min_datatime, max_datatime, total_records = get_uuids_from_csv(
        telemetry_block_blob_service, container_name, filepath)
    dropByTag = vm_uuid + '_' + config_uuid + '_' + deploy_uuid

    INGESTION_CLIENT = KustoIngestClient(KCSB_INGEST)
    INGESTION_PROPERTIES = IngestionProperties(
        database=DATABASE,
        table=DESTINATION_TABLE,
        dataFormat=DataFormat.CSV,
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        additionalProperties={
            'ignoreFirstRecord': 'true',
            'reportMethod': 'QueueAndTable'
        },
        reportLevel=ReportLevel.FailuresAndSuccesses,
        dropByTags=[dropByTag],
        flushImmediately=IS_FLUSH_IMMEDIATELY)

    BLOB_PATH = "https://" + SOURCE_OSMETRICS_BLOB_ACCOUNT + ".blob.core.windows.net/" + SOURCE_OSMETRICS_CONTAINER + "/" + filepath + SOURCE_OSMETRICS_FILE_TOKEN
    #print (BLOB_PATH,' ',str(file_size))
    BLOB_DESCRIPTOR = BlobDescriptor(
        BLOB_PATH, file_size,
        ingest_source_id)  # file_size is the raw size of the data in bytes

    INGESTION_CLIENT.ingest_from_blob(
        BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)

    tc.context.properties["ingest_source_id"] = str(ingest_source_id)

    doc_id = save_COSMOS_log(vm_uuid, deploy_uuid, config_uuid, filepath,
                             min_datatime, max_datatime, total_records,
                             ingest_source_id, blob_account, container_name,
                             tc)

    tc.track_event(APP_INSIGHT_INGEST_EVENT_NAME, {
        'FILE_PATH': filepath,
        'DOC_ID': doc_id,
        "SOURCE_ID": ingest_source_id
    }, {
        'TOTAL_RECORDS': total_records,
        'FILE_SIZE': file_size,
        'MIN_DATETIME': min_datatime,
        'MAX_DATETIME': max_datatime
    })
    log_msg = "{} Done queuing up ingestion with Azure Data Explorer {}, Ingest SourceID {}".format(
        LOG_MESSAGE_HEADER, filepath, ingest_source_id)
    print(log_msg)
    tc.track_trace(log_msg)
    tc.flush()
Example #28
def ingestBlob(client, db, blob, properties):
    INGESTION_PROPERTIES = IngestionProperties(
        database=db,
        table=blob['table'],
        dataFormat=DataFormat(blob['format']),
        mappingReference=blob['ingestionMapping'],
        additionalProperties=properties,
        reportLevel=ReportLevel.FailuresAndSuccesses)
    BLOB_DESCRIPTOR = BlobDescriptor(blob['path'], blob['size'])
    try:
        client.ingest_from_blob(BLOB_DESCRIPTOR, ingestion_properties=INGESTION_PROPERTIES)
        logging.info("Blob %s ingested successfully." % blob['name'])
    except Exception as e:
        logging.error("Error ingesting blob %s: %s" % (blob['name'], e))
Example #29
def test_with_constant_value():
    IngestionProperties(
        database="database",
        table="table",
        column_mappings=[ColumnMapping("test", "int", const_value="1")],
        data_format=DataFormat.PARQUET,
        ingestion_mapping_kind=IngestionMappingKind.PARQUET,
    )
Example #30
def test_duplicate_reference_and_column_mappings_raises():
    """Tests invalid ingestion properties."""
    with pytest.raises(KustoDuplicateMappingError):
        IngestionProperties(
            database="database",
            table="table",
            column_mappings=[ColumnMapping("test", "int")],
            ingestion_mapping_reference="ingestionMappingReference")