コード例 #1
0
def test_readshapefile_minimal_params_success():
    """With only the required parameters, ReadShapefileOperation must emit
    code that reads points from the given .shp/.dbf pair with lat/long
    enabled and no extra attributes."""
    params = {
        ReadShapefileOperation.POLYGON_ATTR_PARAM: 'points',
        ReadShapefileOperation.SHAPEFILE_PARAM: 'shapefile.shp',
    }
    n_out = {'geo data': 'out'}

    instance = ReadShapefileOperation(parameters=params,
                                      named_inputs={},
                                      named_outputs=n_out)

    code = instance.generate_code()
    # Use the operation's own constant instead of the hard-coded 'polygon'
    # key, so the test cannot break with a KeyError if the constant's
    # value ever changes.
    expected_code = dedent("""
        polygon = '{polygon}'
        lat_long = True
        attributes = []
        {out} = ReadShapefile(polygon, lat_long, 
        attributes, 'shapefile.shp', 'shapefile.dbf')
    """.format(polygon=params[ReadShapefileOperation.POLYGON_ATTR_PARAM],
               out=n_out['geo data']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #2
0
def test_decision_tree_classifier_with_params_success():
    """Every supplied hyper-parameter must be forwarded to the generated
    DecisionTreeClassifier constructor call.

    Renamed from the copy-pasted ``test_gbt_regressor_with_params_success``:
    that name misdescribed the operation under test and collided with the
    real GBT regressor test, so pytest would collect only one of them.
    """
    params = {
        DecisionTreeClassifierOperation.SEED_PARAM: 14,
        DecisionTreeClassifierOperation.MIN_LEAF_PARAM: 4,
        DecisionTreeClassifierOperation.MIN_SPLIT_PARAM: 5,
        DecisionTreeClassifierOperation.MAX_DEPTH_PARAM: 11,
        DecisionTreeClassifierOperation.MIN_WEIGHT_PARAM: 0.1
    }
    n_out = {'algorithm': 'classifier_1'}

    instance_lr = DecisionTreeClassifierOperation(params,
                                                  named_inputs={},
                                                  named_outputs=n_out)

    code = instance_lr.generate_code()
    expected_code = dedent("""
        classifier_1 = DecisionTreeClassifier(max_depth=11, 
        min_samples_split=5, min_samples_leaf=4, 
        min_weight_fraction_leaf=0.1, random_state=14)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #3
0
def test_clean_missing_without_missing_rating_params_success():
    """When only the attribute list is supplied, the generated code drops
    rows that contain nulls in those attributes (Spark ``na.drop``)."""
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
    }

    operation = CleanMissingOperation(params,
                                      named_inputs=n_in,
                                      named_outputs=n_out)
    generated = operation.generate_code()

    # Template for the snippet the operation is expected to produce.
    template = """
    attributes_{input_1} = ['{attribute}']
    if len(attributes_input_1) > 0:
        {output_1} = {input_1}.na.drop(how='any', subset=attributes_{input_1})
    else:
        {output_1} = {input_1}
    """
    expected = dedent(template.format(input_1=n_in['input data'],
                                      attribute=params['attributes'][0],
                                      output_1=n_out['output result']))

    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #4
0
def test_intersection_minimal_params_success():
    """Intersection with no parameters emits a column-count guard followed
    by a ``DataFrame.intersect`` call."""
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    operation = IntersectionOperation({},
                                      named_inputs=n_in,
                                      named_outputs=n_out)

    generated = operation.generate_code()

    # Error message embedded in the generated guard clause.
    error_message = ('For intersection operation, both input data '
                     'sources must have the same number of attributes '
                     'and types.')
    expected = dedent("""
        if len(df1.columns) != len(df2.columns):
            raise ValueError('{error}')
        {out} = {in1}.intersect({in2})
        """.format(out=n_out['output data'],
                   in1=n_in['input data 1'],
                   in2=n_in['input data 2'],
                   error=error_message))

    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #5
0
def test_filter_minimum_params_success():
    """A single filter specification becomes one ``DataFrame.filter`` call
    on the corresponding column."""
    condition = {
        'attribute': 'code',
        'f': '>',
        'value': '201'
    }
    params = {
        FilterOperation.FILTER_PARAM: [condition],
        'config': {}
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    operation = FilterOperation(params, named_inputs=n_in,
                                named_outputs=n_out)

    generated = operation.generate_code()
    template = ("{out} = {in1}.filter("
                "functions.col('{attribute}') {f} '{value}')")
    expected = template.format(out=n_out['output data'],
                               in1=n_in['input data'],
                               **condition)
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #6
0
def test_stdbscan_minimal_params_success():
    """ST-DBSCAN with only the column parameters must use the documented
    default thresholds (spatial=500.0, temporal=60, min_neighbors=15)."""
    params = {
        STDBSCANOperation.DATETIME_PARAM: ['date'],
        STDBSCANOperation.LON_PARAM: ['lon'],
        STDBSCANOperation.LAT_PARAM: ['lat'],
    }
    n_in = {'input data': 'df1'}
    n_out = {'output data': 'out'}

    instance = STDBSCANOperation(parameters=params,
                                 named_inputs=n_in,
                                 named_outputs=n_out)

    code = instance.generate_code()
    # The template has no placeholders; the previous no-op ``.format()``
    # call was removed.
    expected_code = dedent("""
    out = st_dbscan(df1, 'lat', 'lon', 
            'date', 'cluster', spatial_threshold=500.0, 
             temporal_threshold=60, min_neighbors=15)
    """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #7
0
def test_join_inner_join_minimal_with_remove_right_columns_success():
    """Default (inner) join must drop right-hand duplicates of columns that
    also exist on the left side after the ``pd.merge``."""
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    join_params = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id', 'cod'],
        'aliases': '_left,_right'
    }
    operation = JoinOperation(join_params, named_inputs=n_in,
                              named_outputs=n_out)

    expected = dedent("""
        cols_to_remove = [c+'_right' for c in df2.columns if c in df1.columns]

        out = pd.merge(df1, df2, how='inner', suffixes=['_left', '_right'],
                left_on=['id', 'cod'], right_on=['id', 'cod'])

        out.drop(cols_to_remove, axis=1, inplace=True)""")

    generated = operation.generate_code()
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #8
0
def test_transformation_math_expression_success():
    """A binary-expression tree (``a * 100``) must become a row-wise lambda
    that is applied to produce the aliased column."""
    target_alias = 'result_2'
    expressions = [{
        'tree': {
            "type": "BinaryExpression",
            "operator": "*",
            "left": {
                "type": "Identifier",
                "name": "a"
            },
            "right": {
                "type": "Literal",
                "value": 100,
                "raw": "100"
            }
        },
        'alias': target_alias,
        'expression': "lower(a)"
    }]

    n_in = {'input data': 'df1'}
    n_out = {'output data': 'out'}
    operation = TransformationOperation(
        {TransformationOperation.EXPRESSION_PARAM: expressions},
        named_inputs=n_in,
        named_outputs=n_out)
    generated = operation.generate_code()

    expected = dedent("""
        {out} = {in1}.copy()
        
        functions = [['result_2', lambda row: row['a'] * 100],]
        for col, function in functions:
            {out}[col] = {out}.apply(function, axis=1)
        """.format(out=n_out['output data'], in1=n_in['input data']))

    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #9
0
def test_gbt_regressor_with_params_success():
    """Every supplied hyper-parameter must appear as the matching keyword
    argument of the generated GradientBoostingRegressor call."""
    n_out = {'algorithm': 'regressor_1'}
    hyper_params = {
        GradientBoostingRegressorOperation.N_ESTIMATORS_PARAM: 11,
        GradientBoostingRegressorOperation.MIN_SPLIT_PARAM: 12,
        GradientBoostingRegressorOperation.SEED_PARAM: 13,
        GradientBoostingRegressorOperation.MAX_DEPTH_PARAM: 14,
        GradientBoostingRegressorOperation.LEARNING_RATE_PARAM: 0.155,
        GradientBoostingRegressorOperation.MIN_LEAF_PARAM: 16
    }

    operation = GradientBoostingRegressorOperation(hyper_params,
                                                   named_inputs={},
                                                   named_outputs=n_out)

    generated = operation.generate_code()
    expected = dedent("""
        regressor_1 = GradientBoostingRegressor(learning_rate=0.155,
          n_estimators=11, max_depth=14, min_samples_split=12, 
          min_samples_leaf=16, random_state=13)""")
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #10
0
def test_linearegression_with_params_success():
    """Linear regression parameters must map onto the generated sklearn
    ElasticNet constructor call."""
    n_out = {'algorithm': 'regressor_1'}
    hyper_params = {
        LinearRegressionOperation.NORMALIZE_PARAM: False,
        LinearRegressionOperation.ALPHA_PARAM: 0.5,
        LinearRegressionOperation.ELASTIC_NET_PARAM: 0.55,
        LinearRegressionOperation.TOLERANCE_PARAM: 0.1,
        LinearRegressionOperation.MAX_ITER_PARAM: 10,
        LinearRegressionOperation.SEED_PARAM: 2
    }

    operation = LinearRegressionOperation(hyper_params,
                                          named_inputs={},
                                          named_outputs=n_out)

    generated = operation.generate_code()
    expected = dedent("""
        regressor_1 = ElasticNet(alpha=0.5, l1_ratio=0.55, tol=0.1,
                                 max_iter=10, random_state=2,
                                 normalize=False)""")
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #11
0
def test_join_left_join_keep_columns_minimal_params_success():
    """Left join keeping the right-side keys: both inputs are prefixed with
    their aliases and joined on a two-column equality condition.

    The ``{{}}{{}}`` in the template keeps literal ``{}{}`` placeholders for
    the *inner* ``str.format`` call inside the generated snippet.
    """
    params = {
        JoinOperation.LEFT_ATTRIBUTES_PARAM: ['id', 'cod'],
        JoinOperation.RIGHT_ATTRIBUTES_PARAM: ['id', 'cod'],
        JoinOperation.JOIN_TYPE_PARAM: 'left',
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: True,
        JoinOperation.ALIASES_PARAM: 'left_, right_  '
    }
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    instance = JoinOperation(params, named_inputs=n_in, named_outputs=n_out)

    code = instance.generate_code()
    # The unused ``type=...`` format argument was removed: the template has
    # no ``{type}`` placeholder, so it was dead code.
    expected_code = dedent("""
        def _rename_attributes(df, prefix):
            result = df
            for col in df.columns:
                result = result.withColumnRenamed(col, '{{}}{{}}'.format(
                    prefix, col))
            return result

        in0_renamed = _rename_attributes({in0}, '{a0}')
        in1_renamed = _rename_attributes({in1}, '{a1}')

        condition = [in0_renamed['{a0}id'] == in1_renamed['{a1}id'],
            in0_renamed['{a0}cod'] == in1_renamed['{a1}cod']]
        {out} = in0_renamed.join(in1_renamed, on=condition, how='left')
        """.format(
        out=n_out['output data'],
        in0=n_in['input data 1'],
        a0='left_',
        a1='right_',
        in1=n_in['input data 2'],
    ))

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #12
0
def test_aggregation_rows_minimal_params_success():
    """Grouping by one attribute with a single AVG function must generate a
    groupBy/agg pipeline, with an (inactive) pivot branch."""
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    agg_params = {
        AggregationOperation.FUNCTION_PARAM: [{
            'attribute': 'income',
            'f': 'AVG',
            'alias': 'avg_income'
        }],
        AggregationOperation.ATTRIBUTES_PARAM: ['country']
    }

    operation = AggregationOperation(agg_params,
                                     named_inputs=n_in,
                                     named_outputs=n_out)
    generated = operation.generate_code()

    expected = dedent("""
         pivot_values = None
         pivot_attr = ''
         if pivot_attr:
              {out} = {in0}.groupBy(
                 functions.col('{agg}')).pivot(
                     pivot_attr, pivot_values).agg(
                         functions.avg('income').alias('avg_income'))
         else:
              {out} = {in0}.groupBy(
                 functions.col('{agg}')).agg(
                     functions.avg('income').alias('avg_income'))

        """.format(out=n_out['output data'],
                   in0=n_in['input data'],
                   agg='country'))

    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #13
0
def test_tokenizer_operation_type_simple_success():
    """The 'simple' tokenizer type must generate RegexTokenizer stages wired
    through a Pipeline, one per (column, alias) pair."""
    params = {
        TokenizerOperation.TYPE_PARAM: 'simple',
        TokenizerOperation.ATTRIBUTES_PARAM: ['col'],
        TokenizerOperation.ALIAS_PARAM: 'c'
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = TokenizerOperation(params,
                                  named_inputs=n_in,
                                  named_outputs=n_out)

    code = instance.generate_code()

    # Raw string literal: ``\s`` inside a non-raw string is an invalid
    # escape sequence (deprecated since Python 3.6); the raw prefix keeps
    # the same bytes without the warning.
    expected_code = dedent(r"""
            col_alias = {3}
            pattern_exp = r'\s+'
            min_token_length = 3
            tokenizers = [RegexTokenizer(inputCol=col, outputCol=alias,
                    pattern=pattern_exp, minTokenLength=min_token_length)
                    for col, alias in col_alias]

            # Use Pipeline to process all attributes once
            pipeline = Pipeline(stages=tokenizers)

            {2} = pipeline.fit({1}).transform({1})
        """.format(
        params[TokenizerOperation.ATTRIBUTES_PARAM], n_in['input data'],
        n_out['output data'],
        json.dumps(
            list(
                zip(params[TokenizerOperation.ATTRIBUTES_PARAM],
                    params[TokenizerOperation.ALIAS_PARAM])))))

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
コード例 #14
0
def test_clean_missing_default_ratio_params_success():
    """With only attributes given, the generated (pandas-style) code uses
    the default missing-ratio bounds 0.0/1.0 when dropping rows.

    Renamed from ``test_clean_missing_without_missing_rating_params_success``
    to avoid colliding with the identically named test earlier in this
    module — with duplicate names pytest only collects the later definition.
    """
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    instance = CleanMissingOperation(params,
                                     named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        {output_1} = {input_1}
        for col in {attribute}:
            ratio = {input_1}[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                {output_1}.dropna(subset=col, axis='index', inplace=True)
    """.format(input_1=n_in['input data'],
               attribute=params['attributes'],
               output_1=n_out['output result']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #15
0
def test_gaussian_mixture_clustering_success():
    """GaussianMixture parameters must map onto n_components, max_iter and
    tol in the generated constructor call."""
    params = {
        GaussianMixtureClusteringOperation.MAX_ITER_PARAM: 15,
        GaussianMixtureClusteringOperation.TOLERANCE_PARAM: 0.11,
        GaussianMixtureClusteringOperation.N_CLUSTERS_PARAM: 11,

    }

    named_outputs = {'algorithm': 'clustering_algo_1'}

    instance = GaussianMixtureClusteringOperation(params, named_inputs={},
                                                  named_outputs=named_outputs)

    code = instance.generate_code()

    # The template has no placeholders; the previous no-op ``.format()``
    # call was removed.
    expected_code = dedent("""
        clustering_algo_1 = GaussianMixture(n_components=11, 
            max_iter=15, tol=0.11)
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    assert result, msg + format_code_comparison(code, expected_code)
コード例 #16
0
def test_dbscan_clustering_minimum_success():
    """DBSCAN with only a feature column must use sklearn's default
    eps/min_samples in the generated fit_predict snippet."""
    n_in = {'input data': 'df1'}
    n_out = {'output data': 'df2'}
    dbscan_params = {
        DBSCANClusteringOperation.FEATURES_PARAM: ['f'],
    }

    operation = DBSCANClusteringOperation(dbscan_params, named_inputs=n_in,
                                          named_outputs=n_out)

    expected = dedent("""
        df2 = df1.copy()
         
        X = df2['f'].values.tolist()
        clt = DBSCAN(eps=0.5, min_samples=5)
        df2['cluster'] = clt.fit_predict(X)
        """)

    generated = operation.generate_code()
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #17
0
def test_clean_missing_minimal_params_success():
    """Explicit min/max missing-ratio bounds must appear verbatim in the
    generated row-dropping loop."""
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    clean_params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['col1', 'col2'],
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
    }
    operation = CleanMissingOperation(clean_params,
                                      named_inputs=n_in,
                                      named_outputs=n_out)
    generated = operation.generate_code()
    expected = dedent("""
     min_missing_ratio = 0.0
     max_missing_ratio = 1.0
     output_1 = input_1
     for col in ['col1', 'col2']:
        ratio = input_1[col].isnull().sum()
        if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
            output_1.dropna(subset=col, axis='index', inplace=True)
    """)
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #18
0
def test_onehot_encoder_minimum_operation_success():
    """One-hot encoding a single attribute must generate an in-place
    sklearn OneHotEncoder fit_transform over that column."""
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    encoder_params = {
        OneHotEncoderOperation.ATTRIBUTE_PARAM: ['col'],
    }

    operation = OneHotEncoderOperation(encoder_params,
                                       named_inputs=n_in,
                                       named_outputs=n_out)

    expected = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        X_train = input_1['col'].values.tolist()
        output_1['col_norm'] = enc.fit_transform(X_train).toarray().tolist()
        """)

    generated = operation.generate_code()
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #19
0
def test_filter_multiple_conditions_success():
    """Two filter specifications must be combined into a single
    ``DataFrame.query`` with an ``and`` between the conditions.

    Renamed from the duplicated ``test_filter_minimum_params_success``:
    with two same-named functions in one module, pytest only collects the
    later definition, silently skipping the other.
    """
    params = {
        FilterOperation.FILTER_PARAM: [{
            'attribute': 'code',
            'f': '>',
            'value': '201'
        }, {
            'attribute': 'code2',
            'f': '<',
            'value': '200'
        }]
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = FilterOperation(params, named_inputs=n_in, named_outputs=n_out)

    code = instance.generate_code()
    expected_code = dedent("""
    output_1 = input_1
    output_1 = output_1.query('(code > 201) and (code2 < 200)')
    """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #20
0
def test_join_case_insensitive_success():
    """When MATCH_CASE is set, the join condition must wrap both sides of
    every key comparison in ``functions.lower``."""
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    join_params = {
        JoinOperation.LEFT_ATTRIBUTES_PARAM: ['id', 'cod'],
        JoinOperation.RIGHT_ATTRIBUTES_PARAM: ['id2', 'cod2'],
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: 'True',
        JoinOperation.ALIASES_PARAM: 'left_, right_  ',
        JoinOperation.MATCH_CASE_PARAM: 'True',
    }
    operation = JoinOperation(join_params, named_inputs=n_in,
                              named_outputs=n_out)

    generated = operation.generate_code()
    # ``{{}}{{}}`` keeps literal ``{}{}`` for the inner str.format call in
    # the generated snippet.
    expected = dedent("""
        def _rename_attributes(df, prefix):
            result = df
            for col in df.columns:
                result = result.withColumnRenamed(col, '{{}}{{}}'.format(
                    prefix, col))
            return result
        in0_renamed = _rename_attributes({in0}, '{a0}')
        in1_renamed = _rename_attributes({in1}, '{a1}')

        condition = [functions.lower(in0_renamed['{a0}id'])
            == functions.lower(in1_renamed['{a1}id2']),
            functions.lower(in0_renamed['{a0}cod'])
            == functions.lower(in1_renamed['{a1}cod2'])]
        {out} = in0_renamed.join(in1_renamed, on=condition, how='inner')
        """.format(out=n_out['output data'],
                   in0=n_in['input data 1'],
                   in1=n_in['input data 2'],
                   a0='left_',
                   a1='right_'))

    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #21
0
File: test_text_operations.py  Project: yuanbw/juicer
def test_word_to_vector_tfidf_operation_success():
    """TF-IDF word-to-vector must generate a TfidfVectorizer configured
    with the given vocabulary size and minimum document frequency."""
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    w2v_params = {
        WordToVectorOperation.TYPE_PARAM: WordToVectorOperation.TYPE_TFIDF,
        WordToVectorOperation.ATTRIBUTES_PARAM: ['col_1'],
        WordToVectorOperation.ALIAS_PARAM: 'col_2',
        WordToVectorOperation.VOCAB_SIZE_PARAM: 200,
        WordToVectorOperation.MINIMUM_DF_PARAM: 5,
    }

    operation = WordToVectorOperation(w2v_params,
                                      named_inputs=n_in,
                                      named_outputs=n_out)

    expected = dedent("""
         output_1 = input_1.copy()
         
         def do_nothing(tokens):
             return tokens
         
         corpus = output_1['col_1'].values.tolist()
         vector_model_1 = TfidfVectorizer(tokenizer=do_nothing,
                          preprocessor=None, lowercase=False, 
                          min_df=5, max_features=200)
         
         vector_model_1.fit(corpus)
         output_1['col_2'] = vector_model_1.transform(corpus).toarray().tolist()
         vocab_task_1 = vector_model_1.get_feature_names()
        """)

    generated = operation.generate_code()
    ok, message = compare_ast(ast.parse(generated), ast.parse(expected))
    assert ok, message + format_code_comparison(generated, expected)
コード例 #22
0
File: test_vis_operation.py  Project: yuanbw/juicer
def atest_area_chart_success(time_series_data):
    """Area-chart visualization: the generated code must build an
    AreaChartModel and register it via ``caipirinha_service``.

    NOTE(review): the ``atest_`` prefix keeps pytest from collecting this
    function — presumably disabled on purpose; confirm before renaming.
    The double braces (``{{ }}``) in the template escape literal braces for
    the outer ``.format`` call; ``**params`` assumes the ``Visu.*_PARAM``
    constants equal the placeholder names (``title`` etc.) — TODO confirm.
    """
    params = {
        Visu.TITLE_PARAM: 'Simple title 1',
        Visu.COLUMN_NAMES_PARAM: ['name, age, gender'],
        Visu.ORIENTATION_PARAM: 'landscape',
        Visu.ID_ATTR_PARAM: ['id'],
        Visu.VALUE_ATTR_PARAM: ['age'],
        'task': {
            'id': uuid.uuid4(),
        },
        'operation_id': 1,
        'operation_slug': 'area-chart',
        'user': {},
        'workflow_id': 17,
        'job_id': 100,
    }
    n_in = {'input data': 'input'}
    n_out = {}
    chart = AreaChartOperation(params, n_in, n_out)
    # Patch the service-config lookup so no external service is contacted.
    with mock.patch('juicer.spark.vis_operation.get_caipirinha_config',
                    get_mocked_caipirinha_config):
        code = chart.generate_code()

    expected_code = dedent("""
        from juicer.spark.vis_operation import AreaChartModel
        from juicer.util.dataframe_util import SimpleJsonEncoder as enc
        from juicer.service import caipirinha_service
        params = '{{}}'
        vis_task_1 = AreaChartModel(
            input, '{task_id}', '{operation_id}',
            '{operation_slug}', '{title}',
            {column_names},
            'landscape', {id_attribute}, {value_attribute},
            params=json.loads(params))
        config = {{
            'juicer': {{
                'services': {{
                    'limonero': {{
                        'url': 'http://limonero:3321',
                        'auth_token': 'token'
                    }},
                    'caipirinha': {{
                        'url': 'http://caipirinha:3324',
                        'auth_token': 'token',
                        'storage_id': 1
                    }},
                }}
            }}
        }}
        visualization = {{
           'job_id': '{job_id}',
           'task_id': vis_task_1.task_id,
           'title': vis_task_1.title ,
           'type': {{
               'id': vis_task_1.type_id,
               'name': vis_task_1.type_name
           }},
           'model': vis_task_1,
           'data': json.dumps(vis_task_1.get_data(), cls=enc, ignore_nan=True)
        }}
        caipirinha_service.new_visualization(
           config, {{}}, {workflow_id}, {job_id},
           '{task_id}',
           visualization, emit_event)""").format(task_id=params['task']['id'],
                                                 **params)
    # Sanity check: the expected snippet itself must parse before comparing.
    ast.parse(expected_code)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #23
0
def test_pythoncode_minimum_params_success():
    """ExecutePythonOperation must wrap the user's snippet in the
    RestrictedPython sandbox boilerplate (safe builtins, guarded
    getattr/getitem, print collector) and wire inputs/outputs through
    the ``ctx`` dict.

    NOTE(review): the expected snippet uses Python 2 syntax
    (``exec compiled_code in ctx``); ``ast.parse`` rejects it on
    Python 3, so this test targets a Python 2 runtime — TODO confirm.
    """
    params = {
        ExecutePythonOperation.PYTHON_CODE_PARAM:
        "df1['col3'] =  df1['col1'] + df1['col2']",
        'task': {
            'id': 1
        }
    }
    n_in = {'input data 1': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = ExecutePythonOperation(params,
                                      named_inputs=n_in,
                                      named_outputs=n_out)

    code = instance.generate_code()
    # The template carries heavy trailing whitespace; it is insignificant
    # to ast.parse and kept as produced by the generator.
    expected_code = dedent("""
    import json                                                                            
    from RestrictedPython.Guards import safe_builtins                                      
    from RestrictedPython.RCompile import compile_restricted                               
    from RestrictedPython.PrintCollector import PrintCollector                             
                                                                                           
    results = [r[1].result() for r in task_futures.items() if r[1].done()]                 
    results = dict([(r['task_name'], r) for r in results])                                 
    # Input data                                                                           
    in1 = input_1                                                                          
    in2 = None                                                                             
                                                                                           
    # Output data, initialized as None                                                     
    out1 = None                                                                            
    out2 = None                                                                            
                                                                                           
    # Variables and language supported                                                     
    ctx = {                                                                                
        'wf_results': results,                                                             
        'in1': in1,                                                                        
        'in2': in2,                                                                        
        'out1': out1,                                                                      
        'out2': out2,                                                                      
                                                                                           
        # Restrictions in Python language                                                  
         '_write_': lambda v: v,                                                           
        '_getattr_': getattr,                                                              
        '_getitem_': lambda ob, index: ob[index],                                          
        '_getiter_': lambda it: it,                                                        
        '_print_': PrintCollector,                                                         
        'json': json,                                                                      
    }                                                                                      
    user_code = "df1['col3'] =  df1['col1'] + df1['col2']"   
                                                                                           
    ctx['__builtins__'] = safe_builtins                                                    
                                                                                           
    compiled_code = compile_restricted(user_code,                                          
    str('python_execute_1'), str('exec'))                                                  
    try:                                                                                   
        exec compiled_code in ctx                                                          
                                                                                           
        # Retrieve values changed in the context                                           
        out1 = ctx['out1']                                                                 
        out2 = ctx['out2']                                                                 
                                                                                           
        if '_print' in ctx:                                                                
            emit_event(name='update task',                                                 
                message=ctx['_print'](),                                                   
                status='RUNNING',                                                          
                identifier='1')                                                            
    except NameError as ne:                                                                
        raise ValueError(_('Invalid name: {}. '                                            
            'Many Python commands are not available in Lemonade').format(ne))              
    except ImportError as ie:                                                              
        raise ValueError(_('Command import is not supported'))                             
                                                                                           
    out_1_1 = out1                                                                         
    out_2_1 = out2                         
    """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #24
0
def test_geo_within_success():
    """GeoWithin with all parameters set: the generated Spark code must match
    the expected pyqtree/matplotlib point-in-polygon join, compared by AST
    (whitespace-insensitive) via compare_ast().
    """
    # Operation parameters; the column names here are interpolated into the
    # expected code template below.
    params = {
        GeoWithin.POLYGON_POINTS_COLUMN_PARAM: ['polygon'],
        GeoWithin.POLYGON_ATTRIBUTES_COLUMN_PARAM: ['attribute'],
        GeoWithin.POLYGON_ALIAS_COLUMN_PARAM: 'alias',
        GeoWithin.TARGET_LAT_COLUMN_PARAM: 'latitude',
        GeoWithin.TARGET_LON_COLUMN_PARAM: 'longitude'
    }
    # Named ports: two inputs (data to classify + geometry source), one output.
    n_out = {'output data': 'output_1'}
    n_in = {'input data': 'input_1', 'geo data': 'geo_data'}
    instance = GeoWithin(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    # NOTE(review): the expected code passes functions.col('l') for BOTH the
    # lat and lng arguments of udf_get_first_polygon, instead of the
    # 'latitude'/'longitude' params above -- presumably this mirrors the
    # current generate_code() output; confirm against the GeoWithin
    # implementation before "fixing" either side.
    expected_code = dedent("""
        from matplotlib.path import Path
        import pyqtree
        attributes_to_add = {attributes}

        schema = [s.name for s in {geo}.schema]
        shp_object = {geo}.select(attributes_to_add +
                ['{points}']).collect()
        bcast_shapefile = spark_session.sparkContext.broadcast(shp_object)

        f_min = functions.udf(
               lambda v, index: min([item[index] for item in v]),
                   types.DoubleType())
        f_max = functions.udf(
            lambda v, index: max([item[index] for item in v]),
                types.DoubleType())

        boundaries = {geo}.select(
            (f_min('{points}', functions.lit(1))).alias('x_min'),
            (f_min('{points}', functions.lit(0))).alias('y_min'),
            (f_max('{points}', functions.lit(1))).alias('x_max'),
            (f_max('{points}', functions.lit(0))).alias('y_max'),
        ).collect()

        global_min_x = float('+inf')
        global_min_y = float('+inf')
        global_max_x = float('-inf')
        global_max_y = float('-inf')

        to_update = []
        for inx, row in enumerate(boundaries):
            x_min = row['x_min']
            y_min = row['y_min']
            x_max = row['x_max']
            y_max = row['y_max']
            to_update.append({{
                'item': inx,
                'bbox': [x_min, y_min, x_max, y_max]
            }})
            global_min_x = min(global_min_x, x_min)
            global_min_y = min(global_min_y, y_min)
            global_max_x = max(global_max_x, x_max)
            global_max_y = max(global_max_y, y_max)

        sp_index = pyqtree.Index(
                bbox=[global_min_x, global_min_y, global_max_x, global_max_y])

        for item in to_update:
            sp_index.insert(**item)

        broad_casted_sp_index = spark_session.sparkContext.broadcast(
            sp_index)

        def get_first_polygon(lat, lng):
            x = float(lat)
            y = float(lng)
            bcast_index = broad_casted_sp_index.value
            matches = bcast_index.intersect([x, y, x, y])
            for shp_inx in matches:
                row = bcast_shapefile.value[shp_inx]
                p_polygon = Path(row['{points}'])
                # Here it uses longitude, latitude
                if p_polygon.contains_point([y, x]):
                    return [c for c in row]
            return [None] * len(bcast_shapefile.value[0])

        udf_get_first_polygon = functions.udf(
            get_first_polygon, types.ArrayType(types.StringType()))
        within = input_1.withColumn(
            "tmp_polygon_data", udf_get_first_polygon(functions.col('l'),
                                                    functions.col('l')))
        aliases = {aliases}
        {output} = within.select(within.columns +
            [within.tmp_polygon_data[i].alias(aliases.pop())
                for i, col in enumerate(schema)
                if col in attributes_to_add])
        {output} = {output}.drop('tmp_polygon_data')

           """.format(
        # 'alias' is split on ',' -> a one-element list, serialized via JSON
        # so it is a valid Python literal inside the generated code.
        aliases=json.dumps(
            params[GeoWithin.POLYGON_ALIAS_COLUMN_PARAM].split(',')),
        output=n_out['output data'], geo=n_in['geo data'],
        points=params[GeoWithin.POLYGON_POINTS_COLUMN_PARAM][0],
        attributes=params[
            GeoWithin.POLYGON_ATTRIBUTES_COLUMN_PARAM]))

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))

    # On mismatch, show the structural diff message plus a side-by-side
    # rendering of both code snippets.
    assert result, msg + format_code_comparison(code, expected_code)
コード例 #25
0
def test_transformation_minumum_params_success():
    # NOTE(review): "minumum" is a typo for "minimum" in the test name; kept
    # as-is so the pytest test id does not change.
    """TransformationOperation with three expressions (binary arithmetic, a
    function call, and a string split): the generated pandas code must match
    the expected per-row apply() loop, compared by AST."""

    # Each entry mimics the JS-style parse tree the Lemonade UI produces for
    # an expression: alias (new column name), raw expression text, and tree.
    params = {
        "expression": [{
            # col1 + 2 * 9 -> plain binary arithmetic on an existing column
            "alias": "new_col1",
            "expression": "col1+2*9",
            "tree": {
                "operator": "+",
                "right": {
                    "operator": "*",
                    "right": {
                        "raw": "9",
                        "type": "Literal",
                        "value": 9
                    },
                    "type": "BinaryExpression",
                    "left": {
                        "raw": "2",
                        "type": "Literal",
                        "value": 2
                    }
                },
                "type": "BinaryExpression",
                "left": {
                    "type": "Identifier",
                    "name": "col1"
                }
            },
            "error": 'null'
        }, {
            # len(col2, 3) -> CallExpression; translated verbatim into a
            # len(...) call in the expected code below
            "alias": "new_col2",
            "expression": "len(col2, 3)",
            "tree": {
                "type":
                "CallExpression",
                "callee": {
                    "type": "Identifier",
                    "name": "len"
                },
                "arguments": [{
                    "type": "Identifier",
                    "name": "col2"
                }, {
                    "raw": "3",
                    "type": "Literal",
                    "value": 3
                }]
            },
            "error": 'null'
        }, {
            # split(col3, ',') -> expected to become a str.split method call
            "alias": "new_col3",
            "expression": "split(col3, ',')",
            "tree": {
                "type":
                "CallExpression",
                "callee": {
                    "type": "Identifier",
                    "name": "split"
                },
                "arguments": [{
                    "type": "Identifier",
                    "name": "col3"
                }, {
                    "raw": "','",
                    "type": "Literal",
                    "value": ","
                }]
            },
            "error": 'null'
        }]
    }
    n_in = {'input data': 'df1'}
    n_out = {'output data': 'out'}
    instance = TransformationOperation(params,
                                       named_inputs=n_in,
                                       named_outputs=n_out)
    code = instance.generate_code()

    # Expected output: copy the input frame, then apply each (alias, lambda)
    # pair row-wise with DataFrame.apply(axis=1).
    expected_code = dedent("""
        {out} = {in1}.copy()
        
        functions = [['new_col1', lambda row: row['col1'] + 2 * 9],
                     ['new_col2', lambda row: len(row['col2'], 3)],
                     ['new_col3', lambda row: row['col3'].split(',')],]
        for col, function in functions:
            {out}[col] = {out}.apply(function, axis=1)
        """.format(
        out=n_out['output data'],
        in1=n_in['input data'],
    ))

    # Compare generated vs. expected code structurally (AST), ignoring
    # formatting differences; show a side-by-side diff on failure.
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)