def test_dataset_to_json():
    """Every listed dataset should serialize to a valid JSON payload."""
    files = DatasetManager.list_datasets()
    for file in files:
        # renamed from `id` to avoid shadowing the builtin
        dataset_id = file['id']
        dataset = DatasetManager.get_dataset(dataset_id)
        assert dataset is not None
        payload = json.dumps(dataset.get_payload())
        # removed dead commented-out file-writing code (was a no-op
        # triple-quoted string literal in the middle of the test)
        assert is_json(payload)
def test_query2dataset():
    """Materializing a SQL query result as a new dataset must not raise.

    Creates a derived dataset from `iris`, then deletes it again.
    """
    query_args = {
        "source_dataset_id": "iris",
        "query_type": "sql",
        "query": "SELECT * FROM dataset LIMIT 20;",
        "dataset_id": "query2dataset",
        "dataset_name": "query2dataset",
        "dataset_description": "test query to dataset",
    }
    try:
        DatasetManager.query2dataset(**query_args)
        DatasetManager.delete_dataset(query_args["dataset_id"])
    except Exception:
        pytest.fail('query to dataset should not raise error')
def test_adddataset():
    """Adding a base64-encoded CSV dataset and deleting it should succeed."""
    encoded_data = base64.b64encode(b'A,B,C,D\n1,2,3,4')
    payload = {
        "id": "b64test",
        "name": "b64test",
        "payload": encoded_data,
        "description": "b64test dataset",
    }
    try:
        DatasetManager.add_dataset(payload)
        DatasetManager.delete_dataset(payload['id'])
    except Exception:
        # fixed typo in the failure message ("execption" -> "exception")
        pytest.fail('should not raise exception')
def predict(job_id, payload):
    """Run the trained model of a job against input data.

    Args:
        job_id: identifier used to look up the trained model via
            ``MLJob.get_model``.
        payload: dict with keys ``'data'`` and ``'input_type'``.
            ``'csv'``: data is base64-encoded CSV text.
            ``'dataset'``: data is a dataset id resolved via DatasetManager.

    Returns:
        For ``'csv'``: ``{'data': <base64-encoded CSV of predictions>}``.
        For ``'dataset'``: ``{'cols': ..., 'rows': ...}`` as produced by
        ``df_to_cols_rows``.

    Raises:
        RuntimeError: if ``input_type`` is not supported.
        Exception: any failure during lookup or prediction is logged
            with traceback and re-raised.
    """
    data = payload['data']
    input_type = payload['input_type']
    try:
        model = MLJob.get_model(job_id)
        if input_type == 'csv':
            csv_data = BytesIO(base64.b64decode(data))
            df = pd.read_csv(csv_data, sep=",")
            df_prediction = model.predict(df)
            output_data = df_prediction.to_csv(index=False)
            result = {}
            result['data'] = base64.b64encode(output_data.encode('utf-8'))
            return result
        elif input_type == 'dataset':
            dataset = DatasetManager.get_dataset(data)
            df = dataset.get_df()
            df_prediction = model.predict(df)
            # fix: build a fresh result dict instead of clobbering the
            # `payload` parameter (shadowing made the branch confusing
            # and made the logged `data` refer to a rebound name's source)
            result = {}
            result["cols"], result["rows"] = df_to_cols_rows(df_prediction)
            return result
        else:
            message = f'input type {input_type} is not supported for prediction'
            logger.error(message)
            raise RuntimeError(message)
    except Exception as e:
        logger.exception(
            f'failed to do prediction for data={data} id={job_id} error={e}'
        )
        # bare re-raise preserves the original traceback
        raise
def test_job_auto_regression():
    """Train an auto-regression job on the housing dataset and predict."""
    dataset_id = 'housing'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = [
        'crime_rate',
        'business_acres',
        'avg_rooms_per_dwelling',
        'distance_to_employment_center',
    ]
    targets = ['median_house_value']
    job_option = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 10,
    }

    job = AutoRegressionJob('testregression', dataset_id, features, targets,
                            job_option, None)
    job.train()
    predict_result = job.predict(df[features])
    predict_result[targets] = df[targets]
    assert job.get_status() == MLJobStatus.SUCCESS
    job.clean()
def test_sqlquery():
    """A LIMIT 10 SQL query against iris should yield exactly 10 rows."""
    iris = DatasetManager.get_dataset('iris')
    result = iris.query('SELECT * FROM dataset LIMIT 10;', 'sql')
    assert result is not None
    _, rows = df_to_cols_rows(result)
    assert rows is not None
    assert len(rows) == 10
def __init__(self, name, dataset):
    """Create a job bound to a dataset.

    Args:
        name: human-readable job name.
        dataset: dataset id, resolved through DatasetManager.
    """
    self.id = str(uuid.uuid4())  # unique job identifier
    self.name = name
    self.dataset_id = dataset
    self.dataset = DatasetManager.get_dataset(dataset)
    # NOTE(review): if get_dataset returns None this raises AttributeError
    # rather than a clear error — confirm whether that is intended
    self.df = self.dataset.get_df()
    self.job_dir = os.path.join(MLJob.base_dir, self.id)  # per-job working dir
    self.metadata = {}
    self._init()  # subclass/shared setup hook
def test_job_time_serials():
    """Train a time-series forecast job on air_passengers and predict."""
    dataset_id = 'air_passengers'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = ['Date']
    targets = ['Number']
    job = TimeSerialsForecastsJob('testtimeserials', dataset_id, features,
                                  targets, {})
    job.train()
    if hasattr(job, 'training_error'):
        print(f'training error was detected {job.training_error}')
    assert job.get_status() == MLJobStatus.SUCCESS

    predict_result = job.predict(df[features])
    assert predict_result is not None
    predict_result.to_csv('/tmp/tt.csv', encoding='utf-8')
def test_job_auto_multi_classification():
    """Train a multi-class classification job on iris and predict."""
    dataset_id = 'iris'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = ['sepal_length', 'sepal_width']
    targets = ['species']
    job_option = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 10,
    }

    job = AutoClassificationJob('testclassification', dataset_id, features,
                                targets, job_option, None)
    job.train()
    predict_result = job.predict(df[features])
    predict_result[targets] = df[targets]
    assert job.get_status() == MLJobStatus.SUCCESS
    job.clean()
def test_job_auto_classification():
    """Train a binary classification job on the churn dataset and predict."""
    dataset_id = 'churn'
    dataset = DatasetManager.get_dataset(dataset_id)
    assert dataset is not None
    df = dataset.get_df()
    assert df is not None

    features = ['Account Length', 'Area Code', 'Day Calls', 'State']
    targets = ['Churn?']
    job_option = {
        'time_left_for_this_task': 30,
        'per_run_time_limit': 10,
    }

    job = AutoClassificationJob('testclassification', dataset_id, features,
                                targets, job_option, None)
    job.train()
    predict_result = job.predict(df[features])
    predict_result[targets] = df[targets]
    assert job.get_status() == MLJobStatus.SUCCESS
    job.clean()
def test_query():
    """A pandas-style filter query on iris should return a result."""
    result = DatasetManager.get_dataset('iris').query(
        'sepal_length > sepal_width')
    assert result is not None
def test_data_list_count():
    """All five bundled sample datasets should be listed.

    Renamed from ``test_data_list``: a later function in this file used
    the same name and shadowed this one, so pytest never collected or
    ran this assertion.
    """
    files = DatasetManager.list_datasets()
    assert len(files) == 5
def test_data_list():
    """Listing datasets should succeed and return a value.

    Previously this duplicated the name of an earlier test (shadowing it)
    and asserted nothing; it now at least checks the call yields a result.
    """
    files = DatasetManager.list_datasets()
    assert files is not None