Example #1
    def post(self, request):

        scenario = create_test_scenario(request.data)
        inference_time = 0

        try:
            check_test_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Load pipeline model from pipeline file
        pipeline = load_model(scenario.pipeline.file)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        # Extract pipeline information from loaded model
        pipeline = MLManager.extract_pipeline_components(pipeline)
        if not pipeline:
            return Response(
                {'detail': 'Unable to extract components from the pipeline file'},
                status=status.HTTP_400_BAD_REQUEST)

        # Model params
        scenario.model = pipeline.get('model')
        scenario.transforms = json.dumps(pipeline.get('transforms', []))

        # Dataset
        if scenario.run_db:
            # Get features from table
            features = get_columns(scenario.db_url, scenario.table)

        else:
            data_extractor_start = datetime.now()
            # Get Dataset
            ds = get_dataset(scenario)
            features = ds.columns.to_list()

            data_extractor_end = datetime.now()
            data_extractor_time = (data_extractor_end -
                                   data_extractor_start).total_seconds()
            inference_time += data_extractor_time

        if scenario.labels_type == 'column' and scenario.labels in features:
            # Remove the label column so it is not used as a feature
            features.remove(scenario.labels)

        # ML Manager
        manager = MLManager()

        # Testing Phase
        query = None
        if scenario.run_db:

            inference_start = datetime.now()

            # Generate query using MLManager
            dbms = DBMSUtils.get_dbms_from_str_connection(scenario.db_url)
            queries, query = manager.generate_query(scenario.pipeline.file,
                                                    scenario.table, features,
                                                    dbms, scenario.optimizer)

            # Execute query
            y_pred = execute_multi_queries(scenario.db_url, queries)
            y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')

            inference_end = datetime.now()
            inference_time += (inference_end - inference_start).total_seconds()

        else:

            inference_start = datetime.now()

            # Execute predict using MLManager and ML Library
            y_pred = manager.predict(ds[features], scenario.pipeline.file)
            y_pred = pd.Series(y_pred, name='Label')

            inference_end = datetime.now()
            inference_time += (inference_end - inference_start).total_seconds()

        # Label
        labels = []
        # Compute evaluation
        if scenario.labels_type:
            if scenario.labels_type == 'file':
                # Get labels from file
                labels_document = get_document_object(scenario.labels)
                labels = get_dataframe(labels_document.file)

                if labels is None:
                    return Response(
                        {
                            'detail':
                            'The selected labels file {} isn\'t valid'.format(
                                labels_document.filename)
                        },
                        status=status.HTTP_400_BAD_REQUEST)

                # Get first column from file
                labels = labels.iloc[:, 0].to_list()

            elif scenario.labels_type == 'table':
                # Get labels from table
                labels = get_table(scenario.db_url, scenario.labels)
                if labels is None:
                    return Response(
                        {
                            'detail':
                            'The selected table {} isn\'t valid for labels'.
                            format(scenario.labels)
                        },
                        status=status.HTTP_400_BAD_REQUEST)

                # Get first column from table
                labels = labels.iloc[:, 0].to_list()

            elif scenario.labels_type == 'column' and not scenario.run_db:
                # Get labels from column
                labels = ds[scenario.labels].to_list()

            elif scenario.labels_type == 'column' and scenario.run_db:
                # Get labels from table
                labels = get_column(scenario.db_url, scenario.table,
                                    scenario.labels)
            else:
                return Response({'detail': 'Invalid labels type selected'},
                                status=status.HTTP_400_BAD_REQUEST)

        # Compute evaluation
        res_evaluation = None
        if labels and scenario.metric:
            res_evaluation = manager.evaluate(scenario.metric, labels, y_pred)

        # Create predictions file
        test_result_name = "test_{}_{}.csv".format(scenario.model,
                                                   datetime.now())
        test_result_name = test_result_name.replace(' ', '_')
        test_result_name = test_result_name.replace(':', '_')
        y_pred.to_csv(test_result_name, index=False, header=True)

        # Save predictions in Document model
        with open(test_result_name, 'rb') as f:
            document = Document(file=File(f), filename=test_result_name)
            document.save()

        # Remove temporary predictions file
        os.remove(test_result_name)

        # Save Scenario model
        scenario.save()

        # Save ResultScenario
        result_scenario = ResultScenario()
        result_scenario.scenario = scenario
        result_scenario.execution_time = inference_time
        # Stored as average seconds per prediction, not predictions per second
        result_scenario.throughput = result_scenario.execution_time / len(
            y_pred)
        result_scenario.score = res_evaluation
        result_scenario.file_result = document.filename
        result_scenario.query = query
        result_scenario.save()

        return Response(
            {
                'detail': 'Successfully predicted result',
                'filename': test_result_name,
                'scenario_id': scenario.id
            },
            status=status.HTTP_201_CREATED)
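
Both branches of the testing phase above time their work with paired datetime.now() calls and accumulate the difference into inference_time. A minimal sketch of the same pattern as a reusable context manager (the elapsed_seconds name is hypothetical, not part of the original code):

from contextlib import contextmanager
from datetime import datetime

@contextmanager
def elapsed_seconds(acc):
    # Accumulate wall-clock seconds into a one-element list, mirroring
    # the paired datetime.now() calls used in the view above.
    start = datetime.now()
    try:
        yield
    finally:
        acc[0] += (datetime.now() - start).total_seconds()

# Hypothetical usage: wrap any timed phase of the view.
timings = [0.0]
with elapsed_seconds(timings):
    sum(range(1000))  # stand-in for manager.predict(...)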
Example #2
    def post(self, request):
        # Params
        scenario = create_simulation_scenario(request.data)

        try:
            check_simulation_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Get Dataset
        t_load_start = datetime.now()
        ds = get_table(scenario['db_url'], scenario['table'])
        features = ds.columns.to_list()

        if scenario['labels_type'] == 'column' and scenario['labels'] in features:
            # Remove the label column so it is not used as a feature
            features.remove(scenario['labels'])

        t_load_end = datetime.now()

        # ML Manager
        manager = MLManager()

        t_ml = []
        t_db = []

        # Testing Phase
        for i in range(scenario['batch_number']):
            ds_batch = get_batch(ds, i, scenario['batch_size'])
            if ds_batch.empty:
                break

            # Execute predict on the current batch using MLManager and the ML library
            t_start = datetime.now()
            _ = manager.predict(ds_batch[features], scenario['pipeline'].file)
            t_end = datetime.now()

            t_ml.append(t_end - t_start)

            # Create Batch for DBMS
            ds_batch.to_sql('batch',
                            con=scenario["db_url"],
                            if_exists="replace",
                            index=False)

            # Generate query using MLManager
            dbms = DBMSUtils.get_dbms_from_str_connection(scenario['db_url'])
            queries, query = manager.generate_query(scenario['pipeline'].file,
                                                    'batch', features, dbms,
                                                    scenario['optimizer'])

            # Execute query
            t_start = datetime.now()
            _ = execute_multi_queries(scenario["db_url"], queries)
            t_end = datetime.now()

            t_db.append(t_end - t_start)

        # Finish Simulation
        return Response(
            {
                'detail': 'Successfully predicted result',
                'ml_results': {
                    # Report seconds so the response stays JSON-serializable
                    'execution_time':
                    np.mean([t.total_seconds() for t in t_ml]) +
                    (t_load_end - t_load_start).total_seconds()
                },
                'dbms_results': {
                    'execution_time':
                    np.mean([t.total_seconds() for t in t_db])
                },
            },
            status=status.HTTP_200_OK)
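
Example #2 depends on a get_batch helper that is not shown here. A plausible minimal implementation, assuming it slices the DataFrame by batch index (a sketch, not the project's actual code):

import pandas as pd

def get_batch(ds: pd.DataFrame, batch_index: int, batch_size: int) -> pd.DataFrame:
    # Return the batch_index-th slice of batch_size rows; an empty
    # frame past the end terminates the caller's loop.
    start = batch_index * batch_size
    return ds.iloc[start:start + batch_size]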
def main(data_conf, pipeline_conf, str_db_conn, task, optimizer, debug=False):
    data_conf['str_db_conn'] = str_db_conn
    data_conf = check_data_config(data_conf)
    train = data_conf['train']
    y_train = data_conf['y_train']
    test = data_conf['test']
    y_test = data_conf['y_test']
    test_table_name = data_conf['test_table_name']
    features = list(data_conf['train'].columns)
    conn = data_conf['db_conn']

    tasks = ['regression', 'binary_classification', 'multi_classification']
    if task not in tasks:
        raise ValueError(f"Wrong task {task}. Available tasks: {tasks}")

    if task == 'regression':
        eval_fn = evaluate_regression_results
    elif task == 'binary_classification':
        eval_fn = evaluate_binary_classification_results
    else:
        eval_fn = evaluate_multi_classification_results

    check_pipeline_config(pipeline_conf, features)
    model_name = pipeline_conf['model']['name']

    mlmanager = MLManager()
    pipeline = create_pipeline(pipeline_conf)

    # fit
    print("\nStarting training...")
    pipeline.fit(train, y_train)
    _check_fitted_pipeline(pipeline, model_name, train)

    print("Training completed.\n")

    fitted_model = pipeline.steps[1][1]

    # ML predict
    print("\nStarting the ML inference...")
    ml_preds = pipeline.predict(test)
    ml_preds = pd.Series(ml_preds)
    print(ml_preds[:10])
    eval_fn(model_name, y_test, ml_preds)
    print("ML inference completed.\n")

    # SQL conversion
    print("\nStarting the SQL conversion...")
    pipeline = extract_pipeline(pipeline)
    dbms = DBMSUtils.get_dbms_from_str_connection(data_conf['str_db_conn'])
    queries, all_query = create_query(pipeline, mlmanager, features,
                                      test_table_name, optimizer, dbms, debug)
    print("SQL Conversion completed.\n")

    # SQL predict
    print("\nStarting the SQL inference...")
    for q in queries[:-1]:
        try:
            for qq in q.split(';'):
                conn.execute(qq)
        except Exception as e:
            # Log the failure instead of silently discarding it
            logging.warning(e)

    try:
        sql_preds = pd.read_sql(queries[-1], conn)
    except Exception as e:
        logging.error(e.args[0])
        return

    sql_preds = pd.Series(sql_preds.iloc[:, 0])
    print(sql_preds[:10])
    null_val = False
    if sql_preds.isnull().sum() == 0:
        eval_fn(f"{model_name} SQL", y_test, sql_preds)
    else:
        null_val = True
    print("SQL inference completed.\n")

    # Null value test
    if null_val:
        print("\nNull value test")
        null_val_cnt = 0
        for sample_id in sql_preds[sql_preds.isnull()].index:
            print(sample_id)
            for (attr, val) in zip(test.columns,
                                   test.iloc[sample_id, :].values):
                print("\t", attr, '=', val)
            null_val_cnt += 1
        print(f"Found {null_val_cnt} null predictions.")

    # Accuracy test
    print("\nAccuracy test")
    equals = False
    for prec in range(10, 0, -1):
        ml_preds = ml_preds.map(lambda x: round(x, prec))
        sql_preds = sql_preds.map(lambda x: round(x, prec))
        if ml_preds.equals(sql_preds):
            print(
                f"The prediction scores are equal with {prec} decimal precision."
            )
            print(":)")
            equals = True
            break
    if not equals:
        print("The prediction scores are not equal.")
        print(":(\n")

        ne_preds = 0
        for i in range(len(ml_preds)):
            ml_pred = ml_preds.iloc[i]
            sql_pred = sql_preds.iloc[i]

            if ml_pred != sql_pred:
                if debug:
                    print(i, ml_pred, sql_pred)
                    for (attr, val) in zip(test.columns,
                                           test.iloc[i, :].values):
                        print("\t", attr, '=', val)
                ne_preds += 1
        print(f"Found {ne_preds} incorrect predictions.")