def test_unzipped_file_dont_compress(self):
    """Tests FileDescriptor with size and an unzipped file that is not compressed."""
    filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
    descriptor = FileDescriptor(filePath, self.mock_size)
    with descriptor.open(False) as stream:
        assert descriptor.size == self.mock_size
        assert descriptor.stream_name.endswith(".csv")
        if sys.version_info[0] >= 3:
            assert stream.readable()
        assert stream.tell() == 0
    assert stream.closed is True
def test_unzipped_file_with_size(self):
    """Tests FileDescriptor with size and unzipped file."""
    filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
    descriptor = FileDescriptor(filePath, 10)
    self.assertGreater(descriptor.size, 10)
    self.assertTrue(descriptor.stream_name.endswith(".csv.gz"))
    if sys.version_info[0] >= 3:
        self.assertTrue(descriptor.zipped_stream.readable())
    self.assertEqual(descriptor.zipped_stream.tell(), 0)
    self.assertEqual(descriptor.zipped_stream.closed, False)
    descriptor.delete_files()
    self.assertEqual(descriptor.zipped_stream.closed, True)
def test_unzipped_file_without_size(self):
    """Tests FileDescriptor without size and unzipped file."""
    filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
    descriptor = FileDescriptor(filePath, 0)
    with descriptor.open(True) as stream:
        assert descriptor.size == self.uncompressed_size
        assert descriptor.stream_name.endswith(".csv.gz")
        if sys.version_info[0] >= 3:
            assert stream.readable()
        assert stream.tell() == 0
    assert stream.closed is True
def test_zip_file_without_size(self):
    """Tests FileDescriptor without size and zipped file."""
    filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv.zip")
    descriptor = FileDescriptor(filePath, 0)
    with descriptor.open(False) as stream:
        # the zip archive contains 2 copies of the source file
        assert descriptor.size == self.uncompressed_size * 2
        assert descriptor.stream_name.endswith(".csv.zip")
        if sys.version_info[0] >= 3:
            assert stream.readable()
        assert stream.tell() == 0
    assert stream.closed is True
def test_unzipped_file_without_size(self):
    """Tests FileDescriptor without size and unzipped file."""
    filePath = path.join(path.dirname(path.abspath(__file__)), "input", "dataset.csv")
    descriptor = FileDescriptor(filePath, 0)
    with descriptor.open(True) as stream:
        # TODO: since we don't know if the file is opened on a CRLF system or an LF system, allow both sizes;
        #  a more robust approach would be to open the file and check
        assert descriptor.size in (self.uncompressed_size, self.uncompressed_size_2)
        assert descriptor.stream_name.endswith(".csv.gz")
        if sys.version_info[0] >= 3:
            assert stream.readable()
        assert stream.tell() == 0
    assert stream.closed is True
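# A minimal sketch of the more robust check mentioned in the TODO above: instead of hardcoding both
# CRLF and LF byte counts, derive the expected size from the file on disk. This assumes descriptor.size
# reports the uncompressed byte count of the source file; the helper name below is hypothetical and
# not part of the SDK.
import os

def expected_uncompressed_size(file_path):
    # the on-disk size already reflects whichever line endings the file was checked out with
    return os.path.getsize(file_path)

# usage inside the test body (sketch):
#     assert descriptor.size == expected_uncompressed_size(filePath)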
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validation_options=ValidationOptions.ValidateCsvInputConstantColumns,
        validation_implications=ValidationImplications.Fail,
    )
    json_ingestion_props = IngestionProperties(
        test_db,
        test_table,
        data_format=DataFormat.JSON,
        ingestion_mapping=TestData.test_table_json_mappings(),
        additional_tags=["a", "b"],
        ingest_if_not_exists=["aaaa", "bbbb"],
        ingest_by_tags=["ingestByTag"],
        drop_by_tags=["drop", "drop-by"],
        flush_immediately=False,
        report_level=ReportLevel.FailuresAndSuccesses,
        report_method=ReportMethod.Queue,
        validation_policy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_rows_added(4)
def ingest_from_file(cls, ingest_client: BaseIngestClient, database_name: str, table_name: str, file_path: str, data_format: DataFormat, mapping_name: str = None) -> None:
    """
    Ingest data from a given file path.
    :param ingest_client: Client with which to ingest the data
    :param database_name: DB name
    :param table_name: Table name
    :param file_path: File path
    :param data_format: Given data format
    :param mapping_name: Desired mapping name
    """
    ingestion_properties = cls.create_ingestion_properties(database_name, table_name, data_format, mapping_name)

    # Tip 1: For optimal ingestion batching and performance, specify the uncompressed data size in the file descriptor
    #  instead of the default below of 0. Otherwise, the service will estimate the file size, which requires an extra
    #  s2s call and may not be accurate for compressed files.
    # Tip 2: To correlate between ingestion operations in your applications and Kusto, set the source ID and log it somewhere.
    file_descriptor = FileDescriptor(file_path, size=0, source_id=uuid.uuid4())
    ingest_client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_properties)
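# A hedged usage sketch of the two tips above: pass the known uncompressed size instead of 0 and keep the
# source ID around for correlation. `ingest_client` and `ingestion_properties` stand in for an existing
# client and properties object, and the file path is a placeholder, not something from this repo.
import os
import uuid

file_path = "data/sample.csv"  # hypothetical path to an uncompressed CSV
source_id = uuid.uuid4()
file_descriptor = FileDescriptor(file_path, size=os.path.getsize(file_path), source_id=source_id)
print("Ingesting {} with source id {}".format(file_path, source_id))  # log the source ID somewhere durable
ingest_client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_properties)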
def test_ingest_complicated_props():
    validation_policy = ValidationPolicy(
        validationOptions=ValidationOptions.ValidateCsvInputConstantColumns,
        validationImplications=ValidationImplications.Fail,
    )
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.JSON,
        ingestionMapping=Helpers.create_test_table_json_mappings(),
        additionalTags=["a", "b"],
        ingestIfNotExists=["aaaa", "bbbb"],
        ingestByTags=["ingestByTag"],
        dropByTags=["drop", "drop-by"],
        flushImmediately=False,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    assert_success_mesagges_count(2)
    assert_row_count(4)
def test_uuid_file_descriptor(self):
    dummy_file = "dummy"

    descriptor = FileDescriptor(dummy_file)
    assert descriptor.source_id
    assert descriptor.source_id != TestDescriptors.TEST_UUID
    assert uuid.UUID(str(descriptor.source_id), version=4)

    descriptor = FileDescriptor(dummy_file, source_id=TestDescriptors.TEST_UUID_STR)
    assert descriptor.source_id == TestDescriptors.TEST_UUID

    descriptor = FileDescriptor(dummy_file, source_id=TestDescriptors.TEST_UUID)
    assert descriptor.source_id == TestDescriptors.TEST_UUID

    with pytest.raises(ValueError):
        FileDescriptor(dummy_file, source_id=TestDescriptors.INVALID_UUID)
def Ingest(Tag):
    # settings
    AUTHORITY_ID = "6babcaad-604b-40ac-a9d7-9fd97c0b779f"
    INGESTCLUSTER = "https://ingest-cgadataout.kusto.windows.net"
    KUSTOCLUSTER = "https://cgadataout.kusto.windows.net"
    DATABASE = "DevRelWorkArea"

    # Create table
    KCSB_DATA = KustoConnectionStringBuilder.with_aad_device_authentication(KUSTOCLUSTER)
    DESTINATION_TABLE = "RepoContributors"
    DESTINATION_TABLE_COLUMN_MAPPING = "RepoContributors_CSV_Mapping"

    KUSTO_CLIENT = KustoClient(KCSB_DATA)
    DROP_TABLE_IF_EXIST = ".drop table RepoContributors ifexists"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, DROP_TABLE_IF_EXIST)
    CREATE_TABLE_COMMAND = ".create table RepoContributors (Article: string, Contributors: int64, Data: string)"
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_TABLE_COMMAND)
    print("RepoContributors table is created")

    # Create mapping
    CREATE_MAPPING_COMMAND = """.create table RepoContributors ingestion csv mapping 'RepoContributors_CSV_Mapping' '[{"Name": "Article","datatype": "string","Ordinal": 0},{"Name": "Contributors","datatype": "int64","Ordinal": 1},{"Name": "Data","datatype": "string","Ordinal": 2}]'"""
    RESPONSE = KUSTO_CLIENT.execute_mgmt(DATABASE, CREATE_MAPPING_COMMAND)
    print("mapping is created")

    # Ingest
    # The authentication method will be taken from the chosen KustoConnectionStringBuilder.
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference=DESTINATION_TABLE_COLUMN_MAPPING,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'},
    )
    kcsb = KustoConnectionStringBuilder.with_aad_device_authentication(INGESTCLUSTER)
    client = KustoIngestClient(kcsb)

    # ingest from file
    file_descriptor = FileDescriptor(
        r"D:\test\Results\log_data_merge\merge_microsoftdocs_sql-docs-pr.txt", 3333)  # 3333 is the raw size of the data in bytes.
    client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)

    # if status updates are required, something like this can be done
    return 1
def test_ingest_complicated_props():
    # Test ingest with complicated ingestion properties
    validation_policy = ValidationPolicy(
        validationOptions=ValidationOptions.ValidateCsvInputConstantColumns,
        validationImplications=ValidationImplications.Fail,
    )
    json_ingestion_props = IngestionProperties(
        db_name,
        table_name,
        dataFormat=DataFormat.json,
        mapping=Helpers.create_deft_table_json_mappings(),
        additionalTags=["a", "b"],
        ingestIfNotExists=["aaaa", "bbbb"],
        ingestByTags=["ingestByTag"],
        dropByTags=["drop", "drop-by"],
        flushImmediately=False,
        reportLevel=ReportLevel.FailuresAndSuccesses,
        reportMethod=ReportMethod.Queue,
        validationPolicy=validation_policy,
    )

    file_paths = [json_file_path, zipped_json_file_path]
    fds = [FileDescriptor(fp, 0, uuid.uuid4()) for fp in file_paths]
    source_ids = ["{}".format(fd.source_id) for fd in fds]

    for fd in fds:
        ingest_client.ingest_from_file(fd, json_ingestion_props)

    successes = 0
    timeout = 60
    while successes != 2 and timeout > 0:
        while ingest_status_q.success.is_empty() and timeout > 0:
            time.sleep(1)
            timeout -= 1

        success_message = ingest_status_q.success.pop()
        if success_message[0].IngestionSourceId in source_ids:
            assert success_message[0].Database == db_name
            assert success_message[0].Table == table_name
            successes += 1

    assert successes == 2

    # TODO: the status queues only mark that ingestion was successful, but it takes time for the data to become available
    time.sleep(20)
    response = client.execute(db_name, "{} | count".format(table_name))
    for row in response.primary_results[0]:
        assert int(row["Count"]) == 28, "{0} | count = {1}".format(table_name, text_type(row["Count"]))
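# A sketch of replacing the fixed 20-second sleep above with a bounded poll, since the TODO notes that a
# success message only means ingestion was accepted and the data may take a while to become queryable.
# `client`, `db_name`, and `table_name` are the names used in the test above; the helper itself is hypothetical.
import time

def wait_for_row_count(client, db_name, table_name, expected, timeout_seconds=60, interval=2):
    count = -1
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        response = client.execute(db_name, "{} | count".format(table_name))
        for row in response.primary_results[0]:
            count = int(row["Count"])
        if count >= expected:
            break
        time.sleep(interval)  # data may lag behind the success message; back off and retry
    return count

# usage (sketch): assert wait_for_row_count(client, db_name, table_name, 28) == 28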
def Ingest(Tag):
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="BranchCommits",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="BranchCommits_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'},
    )

    file_descriptor = FileDescriptor(path_file_3, 3333)  # 3333 is the raw size of the data in bytes.
    ls[1].ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)

    return 1
def Ingest(Tag):
    ingestion_props = IngestionProperties(
        database="DevRelWorkArea",
        table="RepoContributors",
        dataFormat=DataFormat.CSV,
        ingestByTags=[Tag],
        dropByTags=[Tag],
        mappingReference="RepoContributors_CSV_Mapping",
        reportLevel=ReportLevel.FailuresAndSuccesses,
        additionalProperties={'ignoreFirstRecord': 'true'},
    )

    # Change this local path to load the data source
    file_descriptor = FileDescriptor(
        r"D:\test\Results\log_data_merge\{}.csv".format(Tag), 3333)  # 3333 is the raw size of the data in bytes.
    ls[1].ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)

    return 1
    BlobDescriptor,
    DataFormat,
)

INGESTION_PROPERTIES = IngestionProperties(database="database name", table="table name", dataFormat=DataFormat.csv)

INGEST_CLIENT = KustoIngestClient("https://ingest-<clustername>.kusto.windows.net")

KCSB = KustoConnectionStringBuilder.with_aad_application_key_authentication(
    "https://ingest-<clustername>.kusto.windows.net", "aad app id", "secret")
INGEST_CLIENT = KustoIngestClient(KCSB)

FILE_DESCRIPTOR = FileDescriptor("E:\\filePath.csv", 3333)  # 3333 is the raw size of the data in bytes.
INGEST_CLIENT.ingest_from_multiple_files(
    [FILE_DESCRIPTOR], delete_sources_on_success=True, ingestion_properties=INGESTION_PROPERTIES)
INGEST_CLIENT.ingest_from_multiple_files(
    ["E:\\filePath.csv"], delete_sources_on_success=True, ingestion_properties=INGESTION_PROPERTIES)

BLOB_DESCRIPTOR = BlobDescriptor("https://path-to-blob.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes.
INGEST_CLIENT.ingest_from_multiple_blobs(
    [BLOB_DESCRIPTOR],
##################################################################
# there are a lot of useful properties, make sure to go over the docs and check them out
ingestion_props = IngestionProperties(
    database="{database_name}",
    table="{table_name}",
    dataFormat=DataFormat.CSV,
    # in case status updates for success are also required
    # reportLevel=ReportLevel.FailuresAndSuccesses,
    # in case a mapping is required
    # ingestionMappingReference="{json_mapping_that_already_exists_on_table}"
    # ingestionMappingType=IngestionMappingType.Json
)

# ingest from file
file_descriptor = FileDescriptor("{filename}.csv", 3333)  # 3333 is the raw size of the data in bytes.
client.ingest_from_file(file_descriptor, ingestion_properties=ingestion_props)
client.ingest_from_file("{filename}.csv", ingestion_properties=ingestion_props)

# ingest from blob
blob_descriptor = BlobDescriptor("https://{path_to_blob}.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes.
client.ingest_from_blob(blob_descriptor, ingestion_properties=ingestion_props)

# ingest from dataframe
import pandas

fields = ["id", "name", "value"]
rows = [[1, "abc", 15.3], [2, "cde", 99.9]]
df = pandas.DataFrame(data=rows, columns=fields)
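# The dataframe example above stops after building df; a minimal sketch of the remaining step, assuming the
# same `client` and `ingestion_props` as above and that this SDK version exposes ingest_from_dataframe
# (present on azure-kusto-ingest's queued ingest client):
client.ingest_from_dataframe(df, ingestion_properties=ingestion_props)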
def main():
    # Kusto cluster inputs
    data = os.environ["INPUT_DATA"]
    tenantId = os.environ["INPUT_TENANTID"]
    databaseName = os.environ["INPUT_DATABASE"]
    clusterName = os.environ["INPUT_CLUSTERNAME"]
    region = os.environ["INPUT_CLUSTERREGION"]
    clientId = os.environ["INPUT_CLIENTID"]
    clientSecret = os.environ["INPUT_CLIENTSECRET"]
    destinationTable = os.environ["INPUT_TABLE"]
    mapping = os.environ['INPUT_MAPPING']

    try:
        print(data)

        # file creation
        fileName = "sample.json"
        filePath = os.path.join(os.environ["GITHUB_WORKSPACE"], fileName)
        deploymentData = {}
        deploymentData["Timestamp"] = str(datetime.now())
        deploymentData["DeploymentDetails"] = data
        with open(filePath, "w") as targetFile:
            json.dump(deploymentData, targetFile)

        # cluster client connection and auth
        httpsPrefix = "https://"
        suffixKustoUri = "kusto.windows.net:443/"
        clusterIngestUri = "{0}ingest-{1}.{2}.{3}".format(httpsPrefix, clusterName, region, suffixKustoUri)
        kcsb_ingest = KustoConnectionStringBuilder.with_aad_application_key_authentication(
            clusterIngestUri, clientId, clientSecret, tenantId)
        print(mapping)

        # Cluster ingestion parameters
        ingestionClient = KustoIngestClient(kcsb_ingest)
        ingestionProperties = IngestionProperties(
            database=databaseName,
            table=destinationTable,
            dataFormat=DataFormat.JSON,
            ingestion_mapping_reference=mapping,
            report_level=ReportLevel.FailuresAndSuccesses,
        )
        fileDescriptor = FileDescriptor(filePath, 1000)

        print('Payload to dump')
        with open(filePath, "r") as targetFile:
            parsed = json.load(targetFile)
            print(json.dumps(parsed, indent=2, sort_keys=True))

        ingestionClient.ingest_from_file(fileDescriptor, ingestion_properties=ingestionProperties)
        print('Queued up ingestion with Azure Data Explorer')

        # Remove the temporary file
        os.remove(filePath)

        """
        # Repeated pinging to wait for success/failure message
        qs = KustoIngestStatusQueues(ingestionClient)

        # Interval to ping
        MAX_BACKOFF = 5
        backoff = 1
        while True:
            if qs.success.is_empty() and qs.failure.is_empty():
                time.sleep(backoff)
                backoff = min(backoff * 2, MAX_BACKOFF)
                print("No new messages. backing off for {} seconds".format(backoff))
                continue

            backoff = 1

            success_messages = qs.success.pop(10)
            failure_messages = qs.failure.pop(10)

            pprint.pprint("SUCCESS : {}".format(success_messages))
            pprint.pprint("FAILURE : {}".format(failure_messages))
            break
        """
    except Exception as e:
        raise Exception(e)
    BlobDescriptor,
    DataFormat,
)

ingestion_properties = IngestionProperties(database="database name", table="table name", dataFormat=DataFormat.csv)

ingest_client = KustoIngestClient("https://ingest-<clustername>.kusto.windows.net")
ingest_client = KustoIngestClient(
    "https://ingest-<clustername>.kusto.windows.net", client_id="aad app id", client_secret="secret")

file_descriptor = FileDescriptor("E:\\filePath.csv", 3333)  # 3333 is the raw size of the data in bytes.
ingest_client.ingest_from_multiple_files(
    [file_descriptor], delete_sources_on_success=True, ingestion_properties=ingestion_properties)
ingest_client.ingest_from_multiple_files(
    ["E:\\filePath.csv"], delete_sources_on_success=True, ingestion_properties=ingestion_properties)

blob_descriptor = BlobDescriptor("https://path-to-blob.csv.gz?sas", 10)  # 10 is the raw size of the data in bytes.
ingest_client.ingest_from_multiple_blobs(
    [blob_descriptor],