Example #1
0
def test_binary_expression_valid_success():
    """Binary expressions built with empty params translate to column calls.

    The same JSON tree is checked twice: first as ``column1 * column2``, then
    after being edited in place into ``100 / column2``.
    """
    left_node = {"type": "Identifier", "name": "column1"}
    right_node = {"type": "Identifier", "name": "column2"}
    json_code = {
        "type": "BinaryExpression",
        "operator": "*",
        "left": left_node,
        "right": right_node,
    }
    params = {}

    generated = Expression(json_code, params).parsed_expression
    expected = "functions.col('column1') * functions.col('column2')"
    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details

    # Turn the tree into a division whose left operand is the literal 100.
    # The node is mutated in place, so its old "name" key is still present.
    json_code['operator'] = '/'
    left_node['type'] = 'Literal'
    left_node['value'] = 100
    left_node['raw'] = '100'

    generated = Expression(json_code, params).parsed_expression
    expected = "100 / functions.col('column2')"
    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details
Example #2
0
def test_clean_missing_minimal_params_type_value_success():
    """VALUE cleaning mode must emit a ``fillna`` with the configured value.

    The generated code is verified for two replacement values: 200 and 1200.
    """
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
        CleanMissingOperation.VALUE_PARAMETER: 200,
        CleanMissingOperation.CLEANING_MODE_PARAM: 'VALUE'
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}

    def check_generated_code(expected_code):
        # Build the operation from the current params and compare the ASTs.
        operation = CleanMissingOperation(params,
                                          named_inputs=n_in,
                                          named_outputs=n_out)
        code = operation.generate_code()
        result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
        assert result, msg + format_code_comparison(code, expected_code)

    expected_for_200 = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        output_1 = input_1
        for col in ['name']:
            ratio = input_1[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                output_1[col].fillna(value=200, inplace=True)
    """)
    check_generated_code(expected_for_200)

    # Same scenario again with a different numeric replacement value.
    params[CleanMissingOperation.VALUE_PARAMETER] = 1200
    check_generated_code(expected_for_200.replace('200', '1200'))
Example #3
0
def test_binary_expression_with_params_success():
    """Binary expressions with an ``input`` param index into that data frame.

    With ``{'input': 'df00'}`` the identifiers are expected as ``df00[...]``
    lookups; the tree is then edited in place into ``100 / column2``.
    """
    left_node = {"type": "Identifier", "name": "column1"}
    right_node = {"type": "Identifier", "name": "column2"}
    json_code = {
        "type": "BinaryExpression",
        "operator": "*",
        "left": left_node,
        "right": right_node,
    }
    params = {'input': 'df00'}

    generated = Expression(json_code, params).parsed_expression
    expected = "df00['column1'] * df00['column2']"
    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details

    # Turn the tree into a division whose left operand is the literal 100.
    json_code['operator'] = '/'
    left_node['type'] = 'Literal'
    left_node['value'] = 100
    left_node['raw'] = '100'

    generated = Expression(json_code, params).parsed_expression
    expected = "100 / df00['column2']"
    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details
Example #4
0
def test_onehot_encoder_operation_success():
    """One-hot encoding of ``col_1`` into alias ``result`` generates the
    expected ``OneHotEncoder`` snippet.
    """
    params = {
        OneHotEncoderOperation.ALIAS_PARAM: 'result',
        OneHotEncoderOperation.ATTRIBUTE_PARAM: ['col_1']
    }

    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = OneHotEncoderOperation(params,
                                      named_inputs=n_in,
                                      named_outputs=n_out)

    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        X_train = input_1['col_1'].values.tolist()
        output_1['result'] = enc.fit_transform(X_train).toarray().tolist()
        """)

    # Compare syntax trees so formatting differences do not matter.
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
Example #5
0
def test_join_remove_right_with_case_columns_success():
    """Join with right keys dropped and ``MATCH_CASE_PARAM`` enabled.

    The generated code must match the expected snippet that lower-cases the
    key columns, merges both frames and drops the helper/right columns.
    """
    join_params = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id2', 'cod2'],
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: False,
        JoinOperation.MATCH_CASE_PARAM: True,
        'aliases': '_left,_right'
    }
    operation = JoinOperation(
        join_params,
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        cols_to_remove = [c+'_right' for c in df2.columns if c in df1.columns]

        data1_tmp = df1[['id', 'cod']].applymap(lambda col: str(col).lower())
        data1_tmp.columns = [c+"_lower" for c in data1_tmp.columns]
        data1_tmp = pd.concat([df1, data1_tmp], axis=1, sort=False)

        data2_tmp = df2[['id2', 'cod2']].applymap(lambda col: str(col).lower())
        data2_tmp.columns = [c+"_lower" for c in data2_tmp.columns]
        data2_tmp = pd.concat([df2, data2_tmp], axis=1, sort=False)

        out = pd.merge(data1_tmp, data2_tmp, left_on=col1, right_on=col2,
            copy=False, suffixes=['_left', '_right'], how='inner')
        out.drop(col1+col2, axis=1, inplace=True)

        out.drop(cols_to_remove, axis=1, inplace=True)""")

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #6
0
def test_intersection_minimal_params_success():
    """Intersection with no params generates the merge-with-indicator code.

    The expected snippet is rendered from the input/output names used to
    build the operation, plus the error message for mismatched column counts.
    """
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}
    operation = IntersectionOperation({},
                                      named_inputs=inputs,
                                      named_outputs=outputs)

    generated = operation.generate_code()

    error_message = ('For intersection operation, both input data '
                     'sources must have the same number of attributes '
                     'and types.')
    expected = dedent("""
        if len({in1}.columns) != len({in2}.columns):
            raise ValueError('{error}')
        {in1} = {in1}.dropna(axis=0, how='any')
        {in2} = {in2}.dropna(axis=0, how='any')
        keys = {in1}.columns.tolist()
        {in1} = pd.merge({in1}, {in2}, how='left', on=keys, 
        indicator=True, copy=False)
        {out} = {in1}.loc[{in1}['_merge'] == 'both', keys]
        """.format(out=outputs['output data'],
                   in1=inputs['input data 1'],
                   in2=inputs['input data 2'],
                   error=error_message))

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
def test_gbt_classifier_with_params_success():
    """GBT classifier with explicit parameters generates the expected
    ``GradientBoostingClassifier`` constructor call.
    """
    operation = GBTClassifierOperation(
        {
            GBTClassifierOperation.N_ESTIMATORS_PARAM: 11,
            GBTClassifierOperation.MIN_LEAF_PARAM: 10,
            GBTClassifierOperation.MIN_SPLIT_PARAM: 12,
            GBTClassifierOperation.LEARNING_RATE_PARAM: 1.1,
            GBTClassifierOperation.MAX_DEPTH_PARAM: 13,
            GBTClassifierOperation.SEED_PARAM: 9,
            GBTClassifierOperation.LOSS_PARAM:
                GBTClassifierOperation.LOSS_PARAM_EXP,
        },
        named_inputs={},
        named_outputs={'algorithm': 'classifier_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        classifier_1 = GradientBoostingClassifier(loss='exponencial',
          learning_rate=1.1, n_estimators=11,
          min_samples_split=12, max_depth=13,
          min_samples_leaf=10, random_state=9)""")

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
def test_svm_operation_params_success():
    """SVM classifier with explicit parameters generates the expected ``SVC``
    constructor call.

    The tolerance is supplied as -0.1 while the expected code carries
    ``tol=0.1``.
    """
    operation = SvmClassifierOperation(
        {
            SvmClassifierOperation.PENALTY_PARAM: 10.0,
            SvmClassifierOperation.KERNEL_PARAM:
                SvmClassifierOperation.KERNEL_PARAM_POLY,
            SvmClassifierOperation.DEGREE_PARAM: 2,
            SvmClassifierOperation.TOLERANCE_PARAM: -0.1,
            SvmClassifierOperation.MAX_ITER_PARAM: 13,
            SvmClassifierOperation.SEED_PARAM: 12,
        },
        named_inputs={},
        named_outputs={'algorithm': 'classifier_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        classifier_1 = SVC(tol=0.1, C=10.0, max_iter=13, 
                           degree=2, kernel='poly', random_state=12)
    """)

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #9
0
def test_dbscan_clustering_success():
    """DBSCAN clustering on feature ``f`` generates the expected
    ``DBSCAN(...).fit_predict`` snippet writing into column ``alias``.
    """
    params = {
        DBSCANClusteringOperation.FEATURES_PARAM: ['f'],
        DBSCANClusteringOperation.ALIAS_PARAM: 'alias',
        DBSCANClusteringOperation.EPS_PARAM: 0.15,
        DBSCANClusteringOperation.MIN_SAMPLES_PARAM: 20,

    }

    named_inputs = {'input data': 'df1'}
    named_outputs = {'output data': 'df2'}

    instance = DBSCANClusteringOperation(params, named_inputs=named_inputs,
                                         named_outputs=named_outputs)

    code = instance.generate_code()

    # The template has no placeholders, so it is used as a plain literal
    # (the former argument-less ``.format()`` call was a no-op).
    expected_code = dedent("""
        df2 = df1.copy()
         
        X = df2['f'].values.tolist()
        clt = DBSCAN(eps=0.15, min_samples=20)
        df2['alias'] = clt.fit_predict(X)
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
Example #10
0
def test_regressor_operation_with_model_success():
    """Regression model with only the ``model`` output connected generates
    the expected fit/predict snippet.
    """
    operation = RegressionModelOperation(
        {
            RegressionModelOperation.FEATURES_PARAM: 'f',
            RegressionModelOperation.LABEL_PARAM: 'l',
        },
        named_inputs={'algorithm': 'regressor',
                      'train input data': 'train_data'},
        named_outputs={'model': 'model_data'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        algorithm = regressor
        out_task_1 = train_data.copy()
        X_train = train_data['f'].values.tolist()
        if 'IsotonicRegression' in str(algorithm):
            X_train = np.ravel(X_train)
        y = train_data['l'].values.tolist()
        model_data = algorithm.fit(X_train, y)
        out_task_1['prediction'] = algorithm.predict(X_train).tolist()
    """)

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #11
0
def test_randomforestregressor_with_params_success():
    """Random forest regressor with explicit parameters generates the
    expected ``RandomForestRegressor`` constructor call.
    """
    operation = RandomForestRegressorOperation(
        {
            RandomForestRegressorOperation.MAX_FEATURES_PARAM: 'sqrt',
            RandomForestRegressorOperation.MAX_DEPTH_PARAM: 10,
            RandomForestRegressorOperation.MIN_LEAF_PARAM: 3,
            RandomForestRegressorOperation.MIN_SPLIT_PARAM: 4,
            RandomForestRegressorOperation.N_ESTIMATORS_PARAM: 9,
            RandomForestRegressorOperation.SEED_PARAM: -9,
        },
        named_inputs={},
        named_outputs={'algorithm': 'regressor_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        regressor_1 = RandomForestRegressor(n_estimators=9,
            max_features='sqrt',
            max_depth=10,
            min_samples_split=4,
            min_samples_leaf=3,
            random_state=-9)""")

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #12
0
def test_feature_assembler_operation_success():
    """Feature assembler over ``['col']`` generates code that stores the
    selected columns as lists under ``FeatureField``.
    """
    attributes = ['col']
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}

    operation = FeatureAssemblerOperation(
        {FeatureAssemblerOperation.ATTRIBUTES_PARAM: attributes},
        named_inputs=inputs,
        named_outputs=outputs,
    )

    generated = operation.generate_code()

    # Render the expected snippet from the same names given to the operation.
    expected = dedent("""
        cols = {cols}
        {output} = {input}
        {output}['FeatureField'] = {input}[cols].values.tolist()
        """.format(cols=attributes,
                   output=outputs['output data'],
                   input=inputs['input data']))

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #13
0
def test_quantile_discretizer_minimum_operation_success():
    """Quantile discretizer given only the attribute list.

    The expected code uses ``n_quantiles=1000``, a uniform distribution, no
    seed, and the ``col_norm`` output column.
    """
    operation = QuantileDiscretizerOperation(
        {QuantileDiscretizerOperation.ATTRIBUTE_PARAM: ['col']},
        named_inputs={'input data': 'input_1'},
        named_outputs={'output data': 'output_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import QuantileTransformer
        qt = QuantileTransformer(n_quantiles=1000,
            output_distribution='uniform', random_state=None)
        X_train = input_1['col'].values.tolist()
        output_1['col_norm'] = qt.fit_transform(X_train).toarray().tolist()
        """)

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #14
0
def test_quantile_discretizer_operation_success():
    """Quantile discretizer with alias, distribution, seed and quantile count
    set generates the expected ``QuantileTransformer`` snippet.
    """
    params = {
        QuantileDiscretizerOperation.ALIAS_PARAM: 'result',
        QuantileDiscretizerOperation.ATTRIBUTE_PARAM: ['col_1'],
        QuantileDiscretizerOperation.DISTRIBUITION_PARAM:
            QuantileDiscretizerOperation.DISTRIBUITION_PARAM_NORMAL,
        QuantileDiscretizerOperation.SEED_PARAM: 19,
        QuantileDiscretizerOperation.N_QUANTILES_PARAM: 500
    }

    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = QuantileDiscretizerOperation(params,
                                            named_inputs=n_in,
                                            named_outputs=n_out)

    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import QuantileTransformer
        qt = QuantileTransformer(n_quantiles=500,
            output_distribution='normal', random_state=19)
        X_train = input_1['col_1'].values.tolist()
        output_1['result'] = qt.fit_transform(X_train).toarray().tolist()
        """)

    # Compare syntax trees so formatting differences do not matter.
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
Example #15
0
def test_pca_operation_success():
    """PCA with 3 components over ``col`` generates the expected ``PCA``
    snippet writing into column ``feature``.
    """
    params = {
        PCAOperation.ATTRIBUTE_PARAM: ['col'],
        PCAOperation.ALIAS_PARAM: 'feature',
        PCAOperation.N_COMPONENTS: 3
    }

    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = PCAOperation(params, named_inputs=n_in, named_outputs=n_out)

    code = instance.generate_code()

    expected_code = dedent("""
        output_1 = input_1
        from sklearn.decomposition import PCA
        pca = PCA(n_components=3)
        X_train = input_1['col'].values.tolist()
        output_1['feature'] = pca.fit_transform(X_train).tolist()
        """)

    # Compare syntax trees so formatting differences do not matter.
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
def test_perceptron_with_params_success():
    """Perceptron classifier with explicit parameters generates the expected
    ``Perceptron`` constructor call.
    """
    operation = PerceptronClassifierOperation(
        {
            PerceptronClassifierOperation.SHUFFLE_PARAM: True,
            PerceptronClassifierOperation.PENALTY_PARAM:
                PerceptronClassifierOperation.PENALTY_PARAM_EN,
            PerceptronClassifierOperation.SEED_PARAM: 10,
            PerceptronClassifierOperation.ALPHA_PARAM: 0.11,
            PerceptronClassifierOperation.TOLERANCE_PARAM: 0.1,
            PerceptronClassifierOperation.MAX_ITER_PARAM: 100,
        },
        named_inputs={},
        named_outputs={'algorithm': 'classifier_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        classifier_1 = Perceptron(tol=0.1, alpha=0.11, max_iter=100,
        shuffle=True, random_state=10, penalty='elasticnet')""")

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
def test_random_forest_operation_params_success():
    """Random forest classifier with explicit parameters generates the
    expected ``RandomForestClassifier`` constructor call.
    """
    operation = RandomForestClassifierOperation(
        {
            RandomForestClassifierOperation.SEED_PARAM: 10,
            RandomForestClassifierOperation.MAX_DEPTH_PARAM: 11,
            RandomForestClassifierOperation.MIN_SPLIT_PARAM: 12,
            RandomForestClassifierOperation.MIN_LEAF_PARAM: 13,
            RandomForestClassifierOperation.N_ESTIMATORS_PARAM: 15,
        },
        named_inputs={},
        named_outputs={'algorithm': 'classifier_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        classifier_1 = RandomForestClassifier(n_estimators=15, 
         max_depth=11,  min_samples_split=12, 
         min_samples_leaf=13, random_state=10)
    """)

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #18
0
def test_kmeans_clustering_operation_random_type_kmeans_success():
    """K-Means (``TYPE_PARAM_KMEANS``) with random init generates the
    expected ``KMeans`` constructor call.

    The expected snippet is rendered from the same parameter values that are
    passed to the operation.
    """
    op_cls = KMeansClusteringOperation
    params = {
        op_cls.N_CLUSTERS_PARAM: 10,
        op_cls.MAX_ITER_PARAM: 20,
        op_cls.TYPE_PARAM: op_cls.TYPE_PARAM_KMEANS,
        op_cls.INIT_PARAM: op_cls.INIT_PARAM_RANDOM,
        op_cls.TOLERANCE_PARAM: 0.001,
        op_cls.SEED_PARAM: 15
    }
    outputs = {'algorithm': 'clustering_algo_1'}

    operation = KMeansClusteringOperation(params, named_inputs={},
                                          named_outputs=outputs)
    generated = operation.generate_code()

    expected = dedent("""
        {output} = KMeans(n_clusters={k}, init='{init}',
                max_iter={max_iter}, tol={tol}, random_state={seed})
        """.format(output=outputs['algorithm'],
                   k=params[op_cls.N_CLUSTERS_PARAM],
                   init=params[op_cls.INIT_PARAM],
                   max_iter=params[op_cls.MAX_ITER_PARAM],
                   seed=params[op_cls.SEED_PARAM],
                   tol=params[op_cls.TOLERANCE_PARAM]))

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
def test_classification_model_operation_success():
    """Classification model with features ``f`` and label ``label``
    generates the expected fit snippet for the ``model`` output.
    """
    params = {
        ClassificationModelOperation.FEATURES_ATTRIBUTE_PARAM: ['f'],
        ClassificationModelOperation.LABEL_ATTRIBUTE_PARAM: ['label']
    }
    named_inputs = {'algorithm': 'algo', 'train input data': 'df_2'}
    named_outputs = {'model': 'output_2'}

    instance = ClassificationModelOperation(params,
                                            named_inputs=named_inputs,
                                            named_outputs=named_outputs)

    code = instance.generate_code()

    # The template has no placeholders, so it is used as a plain literal
    # (the former argument-less ``.format()`` call was a no-op).
    expected_code = dedent("""
        X = df_2['f'].values.tolist()
        y = df_2['label'].values.tolist()
        output_2 = algo.fit(X, y)

        task_1 = None
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
Example #20
0
def test_agglomerative_clustering_success():
    """Agglomerative clustering with alias, affinity and linkage set
    generates the expected ``AgglomerativeClustering`` snippet.
    """
    params = {
        AgglomerativeClusteringOperation.ALIAS_PARAM: 'ALIAS',
        AgglomerativeClusteringOperation.FEATURES_PARAM: ['f'],
        AgglomerativeClusteringOperation.AFFINITY_PARAM:
            AgglomerativeClusteringOperation.AFFINITY_PARAM_COS,
        # NOTE(review): the linkage parameter is fed an AFFINITY_* constant,
        # which is why the expected code has linkage='l2' -- confirm that
        # this is intentional.
        AgglomerativeClusteringOperation.LINKAGE_PARAM:
            AgglomerativeClusteringOperation.AFFINITY_PARAM_L2
    }

    named_inputs = {'input data': 'df1'}
    named_outputs = {'output data': 'df2'}

    instance = AgglomerativeClusteringOperation(params,
                                                named_inputs=named_inputs,
                                                named_outputs=named_outputs)

    code = instance.generate_code()

    # The template has no placeholders, so it is used as a plain literal
    # (the former argument-less ``.format()`` call was a no-op).
    expected_code = dedent("""
        df2 = df1.copy()

        X = df2['f'].values.tolist()
        clt = AgglomerativeClustering(n_clusters=2, 
            linkage='l2', affinity='cosine')
        df2['ALIAS'] = clt.fit_predict(X)
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
Example #21
0
def test_get_windows_function_success():
    """A ``window(created_at, 10, 'end')`` call expression is translated to
    the expected ``functions.window`` expression.

    NOTE(review): the third argument is 'end' while the expected expression
    selects ``.start`` -- confirm this is the intended translation.
    """
    call_arguments = [
        {"type": "Identifier", "name": "created_at"},
        {"type": "Literal", "value": 10, "raw": "10"},
        {"type": "Literal", "value": "end", "raw": "'end'"},
    ]
    json_code = {
        "type": "CallExpression",
        "arguments": call_arguments,
        "callee": {"type": "Identifier", "name": "window"},
    }
    params = {}
    expr = Expression(json_code, params)

    expected_code = ("functions.window("
                     "functions.col('created_at'),"
                     "str('10 seconds')).start.cast('timestamp')")

    generated_tree = ast.parse(expr.parsed_expression)
    expected_tree = ast.parse(expected_code)
    matches, details = compare_ast(generated_tree, expected_tree)
    assert matches, details + format_code_comparison(expr.parsed_expression,
                                                     expected_code)
Example #22
0
def test_lda_clustering_success():
    """LDA clustering with explicit parameters generates the expected
    ``LatentDirichletAllocation`` constructor call.

    A seed of 11 is supplied, but the expected code carries no
    ``random_state`` argument.
    """
    params = {
        LdaClusteringOperation.N_COMPONENTES_PARAM: 10,
        LdaClusteringOperation.ALPHA_PARAM: 0.5,
        LdaClusteringOperation.SEED_PARAM: 11,
        LdaClusteringOperation.MAX_ITER_PARAM: 100,
        LdaClusteringOperation.ETA_PARAM: 0.5,
        LdaClusteringOperation.LEARNING_METHOD_PARAM:
            LdaClusteringOperation.LEARNING_METHOD_ON,

    }

    named_outputs = {'algorithm': 'clustering_algo_1'}

    instance = LdaClusteringOperation(params, named_inputs={},
                                      named_outputs=named_outputs)

    code = instance.generate_code()

    # The template has no placeholders, so it is used as a plain literal
    # (the former argument-less ``.format()`` call was a no-op).
    expected_code = dedent("""
        clustering_algo_1 = LatentDirichletAllocation(n_components=10, 
         doc_topic_prior=0.5, topic_word_prior=0.5, 
         learning_method='online', max_iter=100)
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
Example #23
0
def test_clustering_with_model_operation_success():
    """Clustering model with both ``output data`` and ``model`` outputs
    generates the expected fit/predict snippet.
    """
    params = {

        ClusteringModelOperation.FEATURES_PARAM: ['f'],

    }
    named_inputs = {'algorithm': 'algo',
                    'train input data': 'df_2'}
    named_outputs = {'output data': 'output_1',
                     'model': 'output_2'}

    instance = ClusteringModelOperation(params, named_inputs=named_inputs,
                                        named_outputs=named_outputs)

    code = instance.generate_code()

    # The template has no placeholders, so it is used as a plain literal
    # (the former argument-less ``.format()`` call was a no-op).
    expected_code = dedent("""
        X = df_2['f'].values.tolist()
        output_2 = algo.fit(X)

        y = algo.predict(X)
        output_1 = df_2
        output_1['prediction'] = y
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
Example #24
0
def test_agglomerative_clustering_minimum_success():
    """Agglomerative clustering given only the feature list.

    The expected code uses 2 clusters, ward linkage, euclidean affinity and
    the ``cluster`` output column.
    """
    operation = AgglomerativeClusteringOperation(
        {AgglomerativeClusteringOperation.FEATURES_PARAM: ['f']},
        named_inputs={'input data': 'df1'},
        named_outputs={'output data': 'df2'},
    )

    generated = operation.generate_code()

    expected = dedent("""
        df2 = df1.copy()

        X = df2['f'].values.tolist()
        clt = AgglomerativeClustering(n_clusters=2,
             linkage='ward', affinity='euclidean')
        df2['cluster'] = clt.fit_predict(X)
        """)

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #25
0
def test_sql_two_inputs_params_success():
    """SQL execution over two inputs with a list of result column names
    generates the expected ``sqldf`` call followed by the rename logic.
    """
    operation = ExecuteSQLOperation(
        {
            ExecuteSQLOperation.QUERY_PARAM: "select * where df2.id = 1;",
            ExecuteSQLOperation.NAMES_PARAM: "col1, col2, col3",
        },
        named_inputs={'input data 1': 'input_1', 'input data 2': 'input_2'},
        named_outputs={'output data': 'output_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
    
        query = 'select * where df2.id = 1;'
        output_1 = sqldf(query, {'ds1': input_1, 'ds2': input_2})
        names = ['col1', 'col2', 'col3']

        if names is not None and len(names) > 0:
            old_names = output_1.columns
            if len(old_names) != len(names):
                raise ValueError('Invalid names. Number of attributes '
                                 'in result differs from names informed.')
            rename = dict(zip(old_names, names))
            output_1.rename(columns=rename, inplace=True)
        """)

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
def test_logisticregression_with_params_success():
    """Logistic regression with explicit parameters generates the expected
    ``LogisticRegression`` constructor call.
    """
    operation = LogisticRegressionOperation(
        {
            LogisticRegressionOperation.TOLERANCE_PARAM: 0.1,
            LogisticRegressionOperation.MAX_ITER_PARAM: 10,
            LogisticRegressionOperation.SEED_PARAM: 2,
            LogisticRegressionOperation.REGULARIZATION_PARAM: 1.1,
            LogisticRegressionOperation.SOLVER_PARAM:
                LogisticRegressionOperation.SOLVER_PARAM_NEWTON,
        },
        named_inputs={},
        named_outputs={'algorithm': 'classifier_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        classifier_1 = LogisticRegression(tol=0.1, C=1.1, max_iter=10, 
        solver='newton-cg', random_state=2)""")

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #27
0
def test_join_left_join_keep_columns_minimal_params_success():
    """Left join that keeps the right keys generates a single ``pd.merge``.

    The expected snippet is rendered from the input/output names and the
    join type passed to the operation.
    """
    join_params = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id', 'cod'],
        JoinOperation.JOIN_TYPE_PARAM: 'left',
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: True,
        'aliases': '_left,_right'
    }
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}
    operation = JoinOperation(join_params,
                              named_inputs=inputs,
                              named_outputs=outputs)

    generated = operation.generate_code()
    expected = dedent("""
        {out} = pd.merge({in0}, {in1}, how='{type}', 
            suffixes=['_left', '_right'], left_on=['id', 'cod'],
            right_on=['id', 'cod'])
        """.format(out=outputs['output data'],
                   in0=inputs['input data 1'],
                   in1=inputs['input data 2'],
                   type=join_params[JoinOperation.JOIN_TYPE_PARAM]))

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
def test_mlp_classifier_with_params_success():
    """MLP classifier with explicit parameters generates the expected
    ``MLPClassifier`` constructor call.
    """
    operation = MLPClassifierOperation(
        {
            MLPClassifierOperation.HIDDEN_LAYER_SIZES_PARAM: '(100,10,9)',
            MLPClassifierOperation.ACTIVATION_PARAM:
                MLPClassifierOperation.ACTIVATION_PARAM_LOG,
            MLPClassifierOperation.SEED_PARAM: 9,
            MLPClassifierOperation.SOLVER_PARAM:
                MLPClassifierOperation.SOLVER_PARAM_LBFGS,
            MLPClassifierOperation.MAX_ITER_PARAM: 1000,
            MLPClassifierOperation.ALPHA_PARAM: 0.01,
            MLPClassifierOperation.TOLERANCE_PARAM: 0.1,
        },
        named_inputs={},
        named_outputs={'algorithm': 'classifier_1'},
    )

    generated = operation.generate_code()
    expected = dedent("""
        classifier_1 = MLPClassifier(hidden_layer_sizes=(100,10,9),
        activation='logistic', solver='lbfgs', alpha=0.01,
        max_iter=1000, random_state=9, tol=0.1)""")

    matches, details = compare_ast(ast.parse(generated), ast.parse(expected))
    assert matches, details + format_code_comparison(generated, expected)
Example #29
0
def test_replace_value_minimal_params_success():
    """Replacing -10 with 10 in ``col1`` and ``col2`` generates the expected
    per-column ``replace`` loop.
    """
    params = {"attributes": ["col1", "col2"], "replacement": 10, "value": -10}
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = ReplaceValuesOperation(params,
                                      named_inputs=n_in,
                                      named_outputs=n_out)

    code = instance.generate_code()
    # Only ``{replaces}`` is a placeholder; the data frame names are written
    # literally in the template, so no other format arguments are passed.
    expected_code = dedent("""
        output_1 = input_1
        replacement = {replaces}
        for col in replacement:
            list_replaces = replacement[col]
            output_1[col] = output_1[col].replace(list_replaces[0],
            list_replaces[1])
        """.format(replaces={
                       "col2": [[-10], [10]],
                       "col1": [[-10], [10]]
                   }))

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
Example #30
0
def test_standardscaler_minimum_operation_success():
    """Standard scaler given only the attribute list generates the expected
    ``StandardScaler`` snippet writing into ``col_norm``.
    """
    params = {
        StandardScalerOperation.ATTRIBUTE_PARAM: ['col'],
    }

    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = StandardScalerOperation(params,
                                       named_inputs=n_in,
                                       named_outputs=n_out)

    code = instance.generate_code()

    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        X_train = input_1['col'].values.tolist()
        output_1['col_norm'] = scaler.fit_transform(X_train).tolist()
        """)

    # Compare syntax trees so formatting differences do not matter.
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)