    def test_sanity_ingest_from_file(self, mock_uuid,
                                     mock_put_message_in_queue,
                                     mock_create_blob_from_stream, mock_aad):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=request_callback,
            content_type="application/json")

        ingest_client = KustoIngestClient(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   dataFormat=DataFormat.CSV)

        # ensure the test works regardless of the directory it is executed from
        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)

        file_path = os.path.join(current_dir, *missing_path_parts)

        ingest_client.ingest_from_file(
            file_path, ingestion_properties=ingestion_properties)

        # mock_put_message_in_queue
        assert mock_put_message_in_queue.call_count == 1

        put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[0][1]

        assert put_message_in_queue_mock_kwargs["queue_name"] == "readyforaggregation-secured"
        queued_message = base64.b64decode(
            put_message_in_queue_mock_kwargs["content"].encode("utf-8")
        ).decode("utf-8")
        queued_message_json = json.loads(queued_message)
        expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?sas"
        # mock_create_blob_from_stream
        assert queued_message_json["BlobPath"] == expected_url
        assert queued_message_json["DatabaseName"] == "database"
        assert queued_message_json["IgnoreSizeLimit"] == False
        assert queued_message_json["AdditionalProperties"]["format"] == "csv"
        assert queued_message_json["FlushImmediately"] == False
        assert queued_message_json["TableName"] == "table"
        assert queued_message_json["RawDataSize"] > 0
        assert queued_message_json["RetainBlobOnSuccess"] == True

        create_blob_from_stream_mock_kwargs = mock_create_blob_from_stream.call_args_list[0][1]

        assert create_blob_from_stream_mock_kwargs["container_name"] == "tempstorage"
        assert isinstance(create_blob_from_stream_mock_kwargs["stream"], io.BytesIO)
        assert create_blob_from_stream_mock_kwargs["blob_name"] == "database__table__1111-111111-111111-1111__dataset.csv.gz"
Example #2
# imports assumed for this snippet (pre-2.0 azure-kusto SDK, matching the
# camelCase IngestionProperties keywords used below)
from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder
from azure.kusto.ingest import (DataFormat, FileDescriptor, IngestionProperties,
                                KustoIngestClient, ReportLevel)


def Ingest(Tag):
    # setting
    AUTHORITY_ID = "6babcaad-604b-40ac-a9d7-9fd97c0b779f"
    INGESTCLUSTER = "https://ingest-cgadataout.kusto.windows.net"
    KUSTOCLUSTER = "https://cgadataout.kusto.windows.net"
    DATABASE = "DevRelWorkArea"

    # Create table
    KCSB_DATA = KustoConnectionStringBuilder.with_aad_device_authentication(
        KUSTOCLUSTER)
    DESTINATION_TABLE = "RepoContributors"
    DESTINATION_TABLE_COLUMN_MAPPING = "RepoContributors_CSV_Mapping"

    KUSTO_CLIENT = KustoClient(KCSB_DATA)
    DROP_TABLE_IF_EXIST = ".drop table RepoContributors ifexists"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, DROP_TABLE_IF_EXIST)

    CREATE_TABLE_COMMAND = ".create table RepoContributors (Article: string, Contributors: int64, Data: string)"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_TABLE_COMMAND)

    print("RepoContributors table is created")

    # Create mapping

    CREATE_MAPPING_COMMAND = """.create table RepoContributors ingestion csv mapping 'RepoContributors_CSV_Mapping' '[{"Name": "Article","datatype": "string","Ordinal": 0},{"Name": "Contributors","datatype": "int64","Ordinal": 1},{"Name": "Data","datatype": "string","Ordinal": 2}]'"""
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_MAPPING_COMMAND)

    print("mapping is created")

    # Ingest

    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'})
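    # ingestByTags/dropByTags stamp the resulting extents: "ingest-by:" tags
    # support deduplication (together with ingestIfNotExists), while "drop-by:"
    # tags let the matching extents be dropped in bulk later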

    kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(
        INGESTCLUSTER)
    client = KustoIngestClient(kcsb)

    # ingest from file
    file_descriptor = FileDescriptor(
        r"D:\test\Results\log_data_merge\merge_microsoftdocs_sql-docs-pr.txt",
        3333)  # 3333 is the raw size of the data in bytes.
    client.ingest_from_file(file_descriptor,
                            ingestion_properties=ingestion_props)
    # if status updates are required, something like this can be done
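    # a hedged sketch, mirroring the commented-out polling loop in the last
    # example below; requires `import time` and
    # `from azure.kusto.ingest import KustoIngestStatusQueues`
    # qs = KustoIngestStatusQueues(client)
    # while qs.success.is_empty() and qs.failure.is_empty():
    #     time.sleep(1)
    # print("SUCCESS:", qs.success.pop(10), "FAILURE:", qs.failure.pop(10))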

    return 1
Example #3
    def test_sanity_ingest_from_file(self, mock_uuid,
                                     mock_put_message_in_queue,
                                     mock_upload_blob_from_stream, mock_aad):
        responses.add_callback(
            responses.POST,
            "https://ingest-somecluster.kusto.windows.net/v1/rest/mgmt",
            callback=request_callback,
            content_type="application/json")

        ingest_client = KustoIngestClient(
            "https://ingest-somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database",
                                                   table="table",
                                                   data_format=DataFormat.CSV)

        # ensure the test works regardless of the directory it is executed from
        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)

        file_path = os.path.join(current_dir, *missing_path_parts)

        ingest_client.ingest_from_file(
            file_path, ingestion_properties=ingestion_properties)

        # mock_put_message_in_queue
        assert mock_put_message_in_queue.call_count == 1

        put_message_in_queue_mock_kwargs = mock_put_message_in_queue.call_args_list[0][1]

        queued_message_json = json.loads(put_message_in_queue_mock_kwargs["content"])
        expected_url = "https://storageaccount.blob.core.windows.net/tempstorage/database__table__1111-111111-111111-1111__dataset.csv.gz?"
        # mock_upload_blob_from_stream
        # not checking the query string because it can change order, just checking it's there
        assert queued_message_json["BlobPath"].startswith(expected_url) is True
        assert len(queued_message_json["BlobPath"]) > len(expected_url)
        assert queued_message_json["DatabaseName"] == "database"
        assert queued_message_json["IgnoreSizeLimit"] is False
        assert queued_message_json["AdditionalProperties"]["format"] == "csv"
        assert queued_message_json["FlushImmediately"] is False
        assert queued_message_json["TableName"] == "table"
        assert queued_message_json["RawDataSize"] > 0
        assert queued_message_json["RetainBlobOnSuccess"] is True

        upload_blob_kwargs = mock_upload_blob_from_stream.call_args_list[0][1]

        assert isinstance(upload_blob_kwargs["data"], io.BytesIO)
Example #4
    def test_ingest_from_file_wrong_endpoint(self):
        responses.add_callback(
            responses.POST, "https://somecluster.kusto.windows.net/v1/rest/mgmt", callback=request_error_callback, content_type="application/json"
        )

        ingest_client = KustoIngestClient("https://somecluster.kusto.windows.net")
        ingestion_properties = IngestionProperties(database="database", table="table", data_format=DataFormat.CSV)

        current_dir = os.getcwd()
        path_parts = ["azure-kusto-ingest", "tests", "input", "dataset.csv"]
        missing_path_parts = []
        for path_part in path_parts:
            if path_part not in current_dir:
                missing_path_parts.append(path_part)

        file_path = os.path.join(current_dir, *missing_path_parts)

        with self.assertRaises(KustoInvalidEndpointError) as ex:
            ingest_client.ingest_from_file(file_path, ingestion_properties=ingestion_properties)
        self.assertEqual(
            ex.exception.args[0],
            "You are using 'DataManagement' client type, but the provided endpoint is of ServiceType 'Engine'. Initialize the client with the appropriate endpoint URI: 'https://ingest-somecluster.kusto.windows.net'",
            "Expected exception was not raised",
        )
Example #5
# there are a lot of useful properties, make sure to go over docs and check them out
ingestion_props = IngestionProperties(
    database="{database_name}",
    table="{table_name}",
    dataFormat=DataFormat.CSV,
    # in case status updates for successes are also required
    # reportLevel=ReportLevel.FailuresAndSuccesses,
    # in case a mapping is required
    # ingestionMappingReference="{json_mapping_that_already_exists_on_table}"
    # ingestionMappingType=IngestionMappingType.Json
)

# ingest from file
file_descriptor = FileDescriptor("{filename}.csv", 3333)  # 3333 is the raw size of the data in bytes.
client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)
client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props)


# ingest from blob
blob_descriptor = BlobDescriptor("https://{path_to_blob}.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes.
client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props)

# ingest from dataframe
import pandas

fields = ["id", "name", "value"]
rows = [[1, "abc", 15.3], [2, "cde", 99.9]]

df = pandas.DataFrame(data=rows, columns=fields)
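
# a minimal completion sketch: the snippet stops after building the dataframe;
# ingest_from_dataframe queues it the same way as a file (assuming the same
# `client` and `ingestion_props` from above)
client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)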
Example #7
# imports assumed for this snippet (azure-kusto SDK >= 2.x, matching the
# snake_case IngestionProperties keywords used below)
import json
import os
from datetime import datetime

from azure.kusto.data import DataFormat, KustoConnectionStringBuilder
from azure.kusto.ingest import (FileDescriptor, IngestionProperties,
                                KustoIngestClient, ReportLevel)


def main():
    # Kusto cluster inputs
    data = os.environ["INPUT_DATA"]
    tenantId = os.environ["INPUT_TENANTID"]
    databaseName = os.environ["INPUT_DATABASE"]
    clusterName = os.environ["INPUT_CLUSTERNAME"]
    region = os.environ["INPUT_CLUSTERREGION"]
    clientId = os.environ["INPUT_CLIENTID"]
    clientSecret = os.environ["INPUT_CLIENTSECRET"]
    destinationTable = os.environ["INPUT_TABLE"]
    mapping = os.environ["INPUT_MAPPING"]

    try:
        print(data)
        # file creation 

        fileName = "sample.json"
        filePath = os.path.join(os.environ["GITHUB_WORKSPACE"], fileName)

        deploymentData = {}
        deploymentData["Timestamp"] = str(datetime.now())
        deploymentData["DeploymentDetails"] = data

        with open(filePath, "w") as targetFile:
            json.dump(deploymentData, targetFile)

        # cluster client connection and auth

        httpsPrefix = "https://"
        suffixKustoUri = "kusto.windows.net:443/"
        clusterIngestUri = "{0}ingest-{1}.{2}.{3}".format(httpsPrefix, clusterName, region, suffixKustoUri)
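        # the data-management ("ingest-") endpoint is distinct from the engine
        # endpoint; queued ingestion must target the ingest- URI (compare the
        # wrong-endpoint test earlier in this listing)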

        kcsb_ingest = KustoConnectionStringBuilder.with_aad_application_key_authentication(
                       clusterIngestUri, clientId, clientSecret, tenantId)

        print(mapping)

        # Cluster ingestion parameters
        ingestionClient = KustoIngestClient(kcsb_ingest)
        ingestionProperties = IngestionProperties(
            database=databaseName,
            table=destinationTable,
            data_format=DataFormat.JSON,
            ingestion_mapping_reference=mapping,
            report_level=ReportLevel.FailuresAndSuccesses)
        fileDescriptor = FileDescriptor(filePath, 1000)

        print('Payload to dump')
        with open(filePath, "r") as targetFile:
            parsed = json.load(targetFile)
            print(json.dumps(parsed, indent=2, sort_keys=True))

        ingestionClient.ingest_from_file(fileDescriptor, ingestion_properties=ingestionProperties)

        print('Queued up ingestion with Azure Data Explorer')

        # Remove the temporary file
        os.remove(filePath)
        """
        # Repeated pinging to wait for success/failure message
        qs = KustoIngestStatusQueues(ingestionClient)

        # Interval to ping
        MAX_BACKOFF = 5
        backoff = 1
        while True:
            if qs.success.is_empty() and qs.failure.is_empty():
                time.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
                print("No new messages. backing off for {} seconds".format(backoff))
                continue

            backoff = 1

            success_messages = qs.success.pop(10)
            failure_messages = qs.failure.pop(10)

            pprint.pprint("SUCCESS : {}".format(success_messages))
            pprint.pprint("FAILURE : {}".format(failure_messages))
            break
        """
    except Exception:
        # re-raise with the original traceback rather than wrapping the error
        raise
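
# a hypothetical entry point so the script can also be run directly; the
# original action snippet only defines main()
if __name__ == "__main__":
    main()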