Example #1
    def test_unzipped_file_dont_compress(self):
        """Tests FileDescriptor with size and unzipped file."""
        filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
        descriptor = FileDescriptor(filePath, self.mock_size)
        with descriptor.open(False) as stream:
            assert descriptor.size == self.mock_size
            assert descriptor.stream_name.endswith(".csv")
            if sys.version_info[0] >= 3:
                assert stream.readable()
            assert stream.tell() == 0

        assert stream.closed is True
Example #2
    def test_unzipped_file_with_size(self):
        """Tests FileDescriptor with size and unzipped file."""
        filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
        descriptor = FileDescriptor(filePath, 10)
        self.assertGreater(descriptor.size, 10)
        self.assertTrue(descriptor.stream_name.endswith(".csv.gz"))
        if sys.version_info[0] >= 3:
            self.assertTrue(descriptor.zipped_stream.readable())
        self.assertEqual(descriptor.zipped_stream.tell(), 0)
        self.assertEqual(descriptor.zipped_stream.closed, False)
        descriptor.delete_files()
        self.assertEqual(descriptor.zipped_stream.closed, True)
Example #3
    def test_unzipped_file_without_size(self):
        """Tests FileDescriptor without size and unzipped file."""
        filePath = path.join(path.dirname(path.abspath(__file__)), "input",
                             "dataset.csv")
        descriptor = FileDescriptor(filePath, 0)
        with descriptor.open(True) as stream:
            assert descriptor.size == self.uncompressed_size
            assert descriptor.stream_name.endswith(".csv.gz")
            if sys.version_info[0] >= 3:
                assert stream.readable()
            assert stream.tell() == 0

        assert stream.closed is True
Example #4
    def test_zip_file_without_size(self):
        """Tests FileDescriptor without size and zipped file."""
        filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv.zip")
        descriptor = FileDescriptor(filePath, 0)
        with descriptor.open(False) as stream:
            # the zip archive contains 2 copies of the source file
            assert descriptor.size == self.uncompressed_size * 2
            assert descriptor.stream_name.endswith(".csv.zip")
            if sys.version_info[0] >= 3:
                assert stream.readable()
            assert stream.tell() == 0

        assert stream.closed is True
Example #5
    def test_unzipped_file_without_size(self):
        """Tests FileDescriptor without size and unzipped file."""
        filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
        descriptor = FileDescriptor(filePath, 0)
        with descriptor.open(True) as stream:

            # TODO: since we don't know whether the file comes from a CRLF or an LF checkout, allow both sizes;
            #   a more robust approach, sketched after this example, would be to open the file and check
            assert descriptor.size in (self.uncompressed_size, self.uncompressed_size_2)
            assert descriptor.stream_name.endswith(".csv.gz")
            if sys.version_info[0] >= 3:
                assert stream.readable()
            assert stream.tell() == 0

        assert stream.closed is True
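
The TODO above suggests deriving the expected size from the file itself rather than allowing two hardcoded candidates. A minimal sketch of that idea (on_disk_size is an illustrative helper name, not part of the test suite):

import os

def on_disk_size(file_path):
    # os.stat reports the stored byte count, so a CRLF checkout and an
    # LF checkout each yield their own correct expectation.
    return os.stat(file_path).st_size

The assertion then becomes assert descriptor.size == on_disk_size(filePath), independent of the checkout's line-ending convention.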
Example #6
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validation_options=ValidationOptions.ValidateCsvInputConstantColumns,
        validation_implications=ValidationImplications.Fail)
    json_ingestion_props = IngestionProperties(
        test_db,
        test_table,
        data_format=DataFormat.JSON,
        ingestion_mapping=TestData.test_table_json_mappings(),
        additional_tags=["a", "b"],
        ingest_if_not_exists=["aaaa", "bbbb"],
        ingest_by_tags=["ingestByTag"],
        drop_by_tags=["drop", "drop-by"],
        flush_immediately=False,
        report_level=ReportLevel.FailuresAndSuccesses,
        report_method=ReportMethod.Queue,
        validation_policy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_rows_added(4)
Example #7
        def ingest_from_file(cls,
                             ingest_client: BaseIngestClient,
                             database_name: str,
                             table_name: str,
                             file_path: str,
                             data_format: DataFormat,
                             mapping_name: str = None) -> None:
            """
            Ingest Data from a given file path.
            :param ingest_client: Client to ingest data
            :param database_name: DB name
            :param table_name: Table name
            :param file_path: File path
            :param data_format: Given data format
            :param mapping_name: Desired mapping name
            """
            ingestion_properties = cls.create_ingestion_properties(
                database_name, table_name, data_format, mapping_name)

            # Tip 1: For optimal ingestion batching and performance, specify the uncompressed data size in the file descriptor instead of the default of 0 used below.
            # Otherwise the service has to determine the file size itself, which requires an additional service-to-service call and may be inaccurate for compressed files.
            # Tip 2: To correlate ingestion operations between your applications and Kusto, set the source ID and log it somewhere.
            file_descriptor = FileDescriptor(file_path,
                                             size=0,
                                             source_id=uuid.uuid4())
            ingest_client.ingest_from_file(
                file_descriptor, ingestion_properties=ingestion_properties)
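
Following Tip 1 above, a caller that knows its input is uncompressed can supply the raw size cheaply rather than passing 0. A minimal sketch under that assumption (describe_uncompressed is an illustrative helper, not part of the SDK):

import os
import uuid

from azure.kusto.ingest import FileDescriptor

def describe_uncompressed(file_path):
    # For a plain (uncompressed) file, the on-disk byte count is the raw
    # data size, so the service can batch optimally without probing the file.
    raw_size = os.path.getsize(file_path)
    # Tip 2: a caller-chosen source ID keeps the operation traceable in logs.
    return FileDescriptor(file_path, size=raw_size, source_id=uuid.uuid4())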
Example #8
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validationOptions=ValidationOptions.ValidateCsvInputConstantColumns,
        validationImplications=ValidationImplications.Fail)
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.JSON,
        ingestionMapping=Helpers.create_test_table_json_mappings(),
        additionalTags=["a", "b"],
        ingestIfNotExists=["aaaa", "bbbb"],
        ingestByTags=["ingestByTag"],
        dropByTags=["drop", "drop-by"],
        flushImmediately=False,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_success_mesagges_count(2)
    assert_row_count(4)
Example #9
    def test_uuid_file_descriptor(self):
        dummy_file = "dummy"

        descriptor = FileDescriptor(dummy_file)
        assert descriptor.source_id
        assert descriptor.source_id != TestDescriptors.TEST_UUID
        assert uuid.UUID(str(descriptor.source_id), version=4)

        descriptor = FileDescriptor(dummy_file,
                                    source_id=TestDescriptors.TEST_UUID_STR)
        assert descriptor.source_id == TestDescriptors.TEST_UUID

        descriptor = FileDescriptor(dummy_file,
                                    source_id=TestDescriptors.TEST_UUID)
        assert descriptor.source_id == TestDescriptors.TEST_UUID

        with pytest.raises(ValueError):
            FileDescriptor(dummy_file, source_id=TestDescriptors.INVALID_UUID)
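
As Example #11 below does, the source_id exercised here can later be matched against the IngestionSourceId field of status-queue messages. A minimal sketch of keeping that correlation handle (names are illustrative):

import uuid
from azure.kusto.ingest import FileDescriptor

fd = FileDescriptor("dataset.csv", 0, uuid.uuid4())
# Keep the ID so a status message can be traced back to this file later.
expected_source_id = str(fd.source_id)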
Example #10
def Ingest(Tag):
    # setting
    AUTHORITY_ID = "6babcaad-604b-40ac-a9d7-9fd97c0b779f"
    INGESTCLUSTER = "https://ingest-cgadataout.kusto.windows.net"
    KUSTOCLUSTER = "https://cgadataout.kusto.windows.net"
    DATABASE = "DevRelWorkArea"

    # Create table
    KCSB_DATA = KustoConnectionStringBuilder.with_aad_device_authentication(
        KUSTOCLUSTER)
    DESTINATION_TABLE = "RepoContributors"
    DESTINATION_TABLE_COLUMN_MAPPING = "RepoContributors_CSV_Mapping"

    KUSTO_CLIENT = KustoClient(KCSB_DATA)
    DROP_TABLE_IF_EXIST = ".drop table RepoContributors ifexists"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, DROP_TABLE_IF_EXIST)

    CREATE_TABLE_COMMAND = ".create table RepoContributors (Article: string, Contributors: int64, Data: string)"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_TABLE_COMMAND)

    print("RepoContributors table is created")

    # Create mapping

    CREATE_MAPPING_COMMAND = """.create table RepoContributors ingestion csv mapping 'RepoContributors_CSV_Mapping' '[{"Name": "Article","datatype": "string","Ordinal": 0},{"Name": "Contributors","datatype": "int64","Ordinal": 1},{"Name": "Data","datatype": "string","Ordinal": 2}]'"""
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_MAPPING_COMMAND)

    print("mapping is created")

    # Ingest

    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'})

    kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(
        INGESTCLUSTER)
    client = KustoIngestClient(kcsb)

    # ingest from file
    file_descriptor = FileDescriptor(
        r"D:\test\Results\log_data_merge\merge_microsoftdocs_sql-docs-pr.txt",
        3333)  # 3333 is the raw size of the data in bytes.
    client.ingest_from_file(file_descriptor,
                            ingestion_properties=ingestion_props)
    # if status updates are required, the status queues can be polled as sketched below

    return 1
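
The trailing comment above leaves the status-update pattern open; the commented-out block in Example #16 shows the full polling loop. A compact sketch of the same idea, assuming reportLevel=ReportLevel.FailuresAndSuccesses is set as above (the helper name is illustrative):

import time
from azure.kusto.ingest.status import KustoIngestStatusQueues

def wait_for_ingest_status(ingest_client, attempts=30, interval=2):
    # Poll the success/failure queues the service fills after ingestion;
    # pop() returns a list of status messages, or [] when the queue is empty.
    qs = KustoIngestStatusQueues(ingest_client)
    for _ in range(attempts):
        if not qs.success.is_empty():
            return qs.success.pop()
        if not qs.failure.is_empty():
            return qs.failure.pop()
        time.sleep(interval)
    return None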
Example #11
def test_ingest_complicated_props():
    # Test ingest with complicated ingestion properties
    validation_policy = ValidationPolicy(
        validationOptions=ValidationOptions.ValidateCsvInputConstantColumns,
        validationImplications=ValidationImplications.Fail,
    )
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.json,
        mapping=Helpers.create_deft_table_json_mappings(),
        additionalTags=["a", "b"],
        ingestIfNotExists=["aaaa", "bbbb"],
        ingestByTags=["ingestByTag"],
        dropByTags=["drop", "drop-by"],
        flushImmediately=False,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]
    source_ids = ["{}".format(fd.source_id) for fd in fds]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()
        if success_message[0].IngestionSourceId in source_ids:
            assert success_message[0].Database == db_name
            assert success_message[0].Table == table_name

            successes += 1

    assert successes == 2
    # TODO: status queues only mark ingestion was successful, but takes time for data to become available
    time.sleep(20)
    response = client.execute(db_name, "{} | count".format(table_name))
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 28, "{0} | count = {1}".format(
            table_name, text_type(row["Count"]))
Example #12
def Ingest(Tag):
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="BranchCommits",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="BranchCommits_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'}
    )

    file_descriptor = FileDescriptor(path_file_3, 3333)  # 3333 is the raw size of the data in bytes.
    ls[1].ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)

    return 1
Example #13
def Ingest(Tag):
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="RepoContributors_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'})

    # Modify the local path here to load the data source
    file_descriptor = FileDescriptor(
        r"D:\test\Results\log_data_merge\{}.csv".format(Tag),
        3333)  # 3333 is the raw size of the data in bytes.
    ls[1].ingest_from_file(file_descriptor,
                           ingestion_properties=ingestion_props)

    return 1
Example #14
from azure.kusto.data.request import KustoConnectionStringBuilder
from azure.kusto.ingest import (
    KustoIngestClient,
    IngestionProperties,
    FileDescriptor,
    BlobDescriptor,
    DataFormat,
)

INGESTION_PROPERTIES = IngestionProperties(database="database name",
                                           table="table name",
                                           dataFormat=DataFormat.csv)

INGEST_CLIENT = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net")

KCSB = KustoConnectionStringBuilder.with_aad_application_key_authentication(
    "https://ingest-<clustername>.kusto.windows.net", "aad app id", "secret")
INGEST_CLIENT = KustoIngestClient(KCSB)

FILE_DESCRIPTOR = FileDescriptor(
    "E:\\filePath.csv", 3333)  # 3333 is the raw size of the data in bytes.
INGEST_CLIENT.ingest_from_multiple_files(
    [FILE_DESCRIPTOR],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES)

INGEST_CLIENT.ingest_from_multiple_files(
    ["E:\\filePath.csv"],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES)

BLOB_DESCRIPTOR = BlobDescriptor(
    "https://path-to-blob.csv.gz?sas",
    10)  # 10 is the raw size of the data in bytes.
INGEST_CLIENT.ingest_from_multiple_blobs(
    [BLOB_DESCRIPTOR],
    delete_sources_on_success=True,
    ingestion_properties=INGESTION_PROPERTIES)
Example #15
##################################################################

# there are a lot of useful properties, make sure to go over docs and check them out
ingestion_props = IngestionProperties(
    database="{database_name}",
    table="{table_name}",
    dataFormat=DataFormat.CSV,
    # in case status update for success are also required
    # reportLevel=ReportLevel.FailuresAndSuccesses,
    # in case a mapping is required
    # ingestionMappingReference="{json_mapping_that_already_exists_on_table}"
    # ingestionMappingType=IngestionMappingType.Json
)

# ingest from file
file_descriptor = FileDescriptor("{filename}.csv", 3333)  # 3333 is the raw size of the data in bytes.
client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)
client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props)


# ingest from blob
blob_descriptor = BlobDescriptor("https://{path_to_blob}.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes.
client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props)

# ingest from dataframe
import pandas

fields = ["id", "name", "value"]
rows = [[1, "abc", 15.3], [2, "cde", 99.9]]

df = pandas.DataFrame(data=rows, columns=fields)
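
The dataframe example is cut off before the ingestion call itself; assuming the same client and properties as above, the frame would typically be submitted with ingest_from_dataframe:

client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)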
Example #16
def main():
    
    # Kusto cluster inputs
    data = os.environ["INPUT_DATA"]
    tenantId = os.environ["INPUT_TENANTID"]
    databaseName = os.environ["INPUT_DATABASE"]
    clusterName = os.environ["INPUT_CLUSTERNAME"]
    region = os.environ["INPUT_CLUSTERREGION"]
    clientId = os.environ["INPUT_CLIENTID"]
    clientSecret = os.environ["INPUT_CLIENTSECRET"]
    destinationTable = os.environ["INPUT_TABLE"]
    mapping = os.environ['INPUT_MAPPING']

    try:
        print(data)
        # file creation 

        fileName = "sample.json"
        filePath = os.path.join(os.environ["GITHUB_WORKSPACE"], fileName)

        deploymentData = {}
        deploymentData["Timestamp"] = str(datetime.now())
        deploymentData["DeploymentDetails"] = data

        with open(filePath, "w") as targetFile:
            json.dump(deploymentData, targetFile)

        # cluster client connection and auth

        httpsPrefix = "https://"
        suffixKustoUri = "kusto.windows.net:443/"
        clusterIngestUri = "{0}ingest-{1}.{2}.{3}".format(httpsPrefix, clusterName, region, suffixKustoUri)

        kcsb_ingest = KustoConnectionStringBuilder.with_aad_application_key_authentication(
                       clusterIngestUri, clientId, clientSecret, tenantId)

        print(mapping)

        # Cluster ingestion parameters
        ingestionClient = KustoIngestClient(kcsb_ingest)
        ingestionProperties = IngestionProperties(
            database=databaseName,
            table=destinationTable,
            data_format=DataFormat.JSON,
            ingestion_mapping_reference=mapping,
            report_level=ReportLevel.FailuresAndSuccesses)
        fileDescriptor = FileDescriptor(filePath, 1000)

        print('Payload to dump')
        with open(filePath, "r") as targetFile:
            parsed = json.load(targetFile)
            print(json.dumps(parsed, indent=2, sort_keys=True))

        ingestionClient.ingest_from_file(fileDescriptor, ingestion_properties=ingestionProperties)

        print('Queued up ingestion with Azure Data Explorer')

        # Remove the temporary file
        os.remove(filePath)
        """
        # Repeated pinging to wait for success/failure message
        qs = KustoIngestStatusQueues(ingestionClient)

        # Interval to ping
        MAX_BACKOFF = 5
        backoff = 1
        while True:
            if qs.success.is_empty() and qs.failure.is_empty():
                time.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
                print("No new messages. backing off for {} seconds".format(backoff))
                continue

            backoff = 1

            success_messages = qs.success.pop(10)
            failure_messages = qs.failure.pop(10)

            pprint.pprint("SUCCESS : {}".format(success_messages))
            pprint.pprint("FAILURE : {}".format(failure_messages))
            break
        """
    except Exception:
        # re-raise without wrapping so the original traceback is preserved
        raise
Example #17
from azure.kusto.ingest import (
    KustoIngestClient,
    IngestionProperties,
    FileDescriptor,
    BlobDescriptor,
    DataFormat,
)

ingestion_properties = IngestionProperties(database="database name",
                                           table="table name",
                                           dataFormat=DataFormat.csv)

ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net")
ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net",
    client_id="aad app id",
    client_secret="secret")

file_descriptor = FileDescriptor(
    "E:\\filePath.csv", 3333)  # 3333 is the raw size of the data in bytes.
ingest_client.ingest_from_multiple_files(
    [file_descriptor],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties)

ingest_client.ingest_from_multiple_files(
    ["E:\\filePath.csv"],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties)

blob_descriptor = BlobDescriptor(
    "https://path-to-blob.csv.gz?sas",
    10)  # 10 is the raw size of the data in bytes.
ingest_client.ingest_from_multiple_blobs(
    [blob_descriptor],
    delete_sources_on_success=True,
    ingestion_properties=ingestion_properties)