def test_load_dataverse_dataset_file(dataset_client, dataset_name):
    """Check that a dataset document read by name loads as a pandas DataFrame.

    Args:
        dataset_client: Object exposing ``read(name, budget)``; presumably a
            pytest fixture defined elsewhere -- confirm.
        dataset_name: Name of the dataset to read; presumably also a fixture.
    """
    # None is passed as the second argument to read() for this test.
    dataset_document = dataset_client.read(dataset_name, None)
    df = load_dataset(dataset_document)
    # Use the public pd.DataFrame alias: it is the same class as
    # pandas.core.frame.DataFrame, but does not depend on ``pandas`` being
    # reachable as an attribute of itself or on an internal module path.
    assert isinstance(df, pd.DataFrame)
import json
import sys

import mlflow
import pandas as pd
from pandasql import sqldf

from burdock.client import get_dataset_client
from burdock.data.adapters import load_reader, load_metadata, load_dataset
from burdock.query.sql.private.query import PrivateQuery

# Script entry point: execute one query through PrivateQuery against a named
# dataset and log the tabular result to the active MLflow run as result.json.
#
# Command line: DATASET_NAME BUDGET QUERY
if __name__ == "__main__":
    # Fail with a readable usage line instead of a bare IndexError when
    # arguments are missing; extra arguments are still ignored.
    if len(sys.argv) < 4:
        sys.exit("usage: {} DATASET_NAME BUDGET QUERY".format(sys.argv[0]))

    dataset_name = sys.argv[1]
    budget = float(sys.argv[2])
    query = sys.argv[3]

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)

        # The loaded dataset is not used below, so it is no longer bound to a
        # name. The call is kept because its side effects are not visible
        # here. NOTE(review): drop it if load_reader makes it redundant.
        load_dataset(dataset_document)

        reader = load_reader(dataset_document)
        schema = load_metadata(dataset_document)
        private_reader = PrivateQuery(reader, schema, budget)
        rowset = private_reader.execute(query)

        # The first row of the rowset supplies the column names; the
        # remaining rows are the data.
        df = pd.DataFrame(rowset[1:], columns=rowset[0])

        with open("result.json", "w") as stream:
            json.dump(df.to_dict(), stream)
        # Log only after the file has been closed so the artifact is complete.
        mlflow.log_artifact("result.json")
import json
import sys

import mlflow

from statistic import Count
from burdock.client import get_dataset_client
from burdock.data.adapters import load_dataset

# Script entry point: release a Count statistic for one column of a named
# dataset and log it to the active MLflow run as result.json.
#
# Command line (every argument optional): [DATASET_NAME [COLUMN_NAME [BUDGET]]]
if __name__ == "__main__":
    # Each positional argument falls back to its own default. Every index is
    # guarded by the length that makes that index valid; guarding all three
    # with ``len(sys.argv) > 1`` raised IndexError when only one or two
    # arguments were supplied.
    dataset_name = sys.argv[1] if len(sys.argv) > 1 else "example"
    column_name = sys.argv[2] if len(sys.argv) > 2 else "a"
    budget = float(sys.argv[3]) if len(sys.argv) > 3 else 1

    with mlflow.start_run():
        dataset_document = get_dataset_client().read(dataset_name, budget)
        df = load_dataset(dataset_document)
        statistic = Count(column_name, budget).release(df)

        with open("result.json", "w") as stream:
            json.dump(statistic.as_dict(), stream)
        # Log only after the file has been closed so the artifact is complete.
        mlflow.log_artifact("result.json")