def test_readshapefile_minimal_params_success():
    params = {
        ReadShapefileOperation.POLYGON_ATTR_PARAM: 'points',
        ReadShapefileOperation.SHAPEFILE_PARAM: 'shapefile.shp',
    }
    n_out = {'geo data': 'out'}
    instance = ReadShapefileOperation(parameters=params, named_inputs={},
                                      named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        polygon = '{polygon}'
        lat_long = True
        attributes = []
        {out} = ReadShapefile(polygon, lat_long, attributes,
                              'shapefile.shp', 'shapefile.dbf')
        """.format(polygon=params['polygon'], out=n_out['geo data']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


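# The tests in this module all share one pattern: parse both the generated
# and the expected code into ASTs and compare them structurally, so that
# whitespace and formatting differences never fail a test. The real
# `compare_ast` / `format_code_comparison` helpers come from the project's
# test utilities; the versions below are only a minimal sketch of what they
# plausibly look like (names suffixed with `_sketch` to make clear they are
# illustrative assumptions, not the project's implementation).
import ast
import itertools


def compare_ast_sketch(node1, node2):
    """Structural AST equality; returns (ok, message)."""
    if type(node1) is not type(node2):
        return False, 'Different nodes: {} != {}'.format(
            type(node1).__name__, type(node2).__name__)
    if isinstance(node1, ast.AST):
        # iter_fields yields only semantic fields, not position attributes
        # such as lineno/col_offset, which is why formatting is ignored.
        for field, value in ast.iter_fields(node1):
            ok, msg = compare_ast_sketch(value, getattr(node2, field, None))
            if not ok:
                return ok, msg
        return True, ''
    if isinstance(node1, list):
        if len(node1) != len(node2):
            return False, 'Different number of child nodes'
        for left, right in zip(node1, node2):
            ok, msg = compare_ast_sketch(left, right)
            if not ok:
                return ok, msg
        return True, ''
    if node1 != node2:
        return False, 'Different values: {!r} != {!r}'.format(node1, node2)
    return True, ''


def format_code_comparison_sketch(code, expected_code):
    """Two-column dump of generated vs. expected code for assert messages."""
    pairs = itertools.zip_longest(code.splitlines(),
                                  expected_code.splitlines(), fillvalue='')
    return '\n' + '\n'.join('{:<50} | {}'.format(generated, expected)
                            for generated, expected in pairs)

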
def test_decision_tree_classifier_with_params_success():
    params = {
        DecisionTreeClassifierOperation.SEED_PARAM: 14,
        DecisionTreeClassifierOperation.MIN_LEAF_PARAM: 4,
        DecisionTreeClassifierOperation.MIN_SPLIT_PARAM: 5,
        DecisionTreeClassifierOperation.MAX_DEPTH_PARAM: 11,
        DecisionTreeClassifierOperation.MIN_WEIGHT_PARAM: 0.1
    }
    n_out = {'algorithm': 'classifier_1'}
    instance = DecisionTreeClassifierOperation(params, named_inputs={},
                                               named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        classifier_1 = DecisionTreeClassifier(max_depth=11,
                                              min_samples_split=5,
                                              min_samples_leaf=4,
                                              min_weight_fraction_leaf=0.1,
                                              random_state=14)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_clean_missing_without_missing_rating_params_success():
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    instance = CleanMissingOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        attributes_{input_1} = ['{attribute}']
        if len(attributes_{input_1}) > 0:
            {output_1} = {input_1}.na.drop(
                how='any', subset=attributes_{input_1})
        else:
            {output_1} = {input_1}
        """.format(input_1=n_in['input data'],
                   attribute=params['attributes'][0],
                   output_1=n_out['output result']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_intersection_minimal_params_success():
    params = {}
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    instance = IntersectionOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        if len(df1.columns) != len(df2.columns):
            raise ValueError('{error}')
        {out} = {in1}.intersect({in2})
        """.format(out=n_out['output data'],
                   in1=n_in['input data 1'],
                   in2=n_in['input data 2'],
                   error=('For intersection operation, both input data '
                          'sources must have the same number of attributes '
                          'and types.')))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_filter_minimum_params_success():
    params = {
        FilterOperation.FILTER_PARAM: [{
            'attribute': 'code',
            'f': '>',
            'value': '201'
        }],
        'config': {}
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = FilterOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = ("{out} = {in1}.filter("
                     "functions.col('{attribute}') {f} '{value}')").format(
        out=n_out['output data'], in1=n_in['input data'],
        **params[FilterOperation.FILTER_PARAM][0])
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_stdbscan_minimal_params_success():
    params = {
        STDBSCANOperation.DATETIME_PARAM: ['date'],
        STDBSCANOperation.LON_PARAM: ['lon'],
        STDBSCANOperation.LAT_PARAM: ['lat'],
    }
    n_in = {'input data': 'df1'}
    n_out = {'output data': 'out'}
    instance = STDBSCANOperation(parameters=params, named_inputs=n_in,
                                 named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        out = st_dbscan(df1, 'lat', 'lon', 'date', 'cluster',
                        spatial_threshold=500.0, temporal_threshold=60,
                        min_neighbors=15)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_join_inner_join_minimal_with_remove_right_columns_success():
    params = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id', 'cod'],
        'aliases': '_left,_right'
    }
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    instance = JoinOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        cols_to_remove = [c + '_right'
                          for c in df2.columns if c in df1.columns]
        out = pd.merge(df1, df2, how='inner',
                       suffixes=['_left', '_right'],
                       left_on=['id', 'cod'], right_on=['id', 'cod'])
        out.drop(cols_to_remove, axis=1, inplace=True)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_transformation_math_expression_success():
    alias = 'result_2'
    expr = [{
        'tree': {
            "type": "BinaryExpression",
            "operator": "*",
            "left": {
                "type": "Identifier",
                "name": "a"
            },
            "right": {
                "type": "Literal",
                "value": 100,
                "raw": "100"
            }
        },
        'alias': alias,
        'expression': "lower(a)"
    }]
    params = {
        TransformationOperation.EXPRESSION_PARAM: expr,
    }
    n_in = {'input data': 'df1'}
    n_out = {'output data': 'out'}
    instance = TransformationOperation(params, named_inputs=n_in,
                                       named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        {out} = {in1}.copy()
        functions = [['result_2', lambda row: row['a'] * 100], ]
        for col, function in functions:
            {out}[col] = {out}.apply(function, axis=1)
        """.format(out=n_out['output data'], in1=n_in['input data']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_gbt_regressor_with_params_success():
    params = {
        GradientBoostingRegressorOperation.N_ESTIMATORS_PARAM: 11,
        GradientBoostingRegressorOperation.MIN_SPLIT_PARAM: 12,
        GradientBoostingRegressorOperation.SEED_PARAM: 13,
        GradientBoostingRegressorOperation.MAX_DEPTH_PARAM: 14,
        GradientBoostingRegressorOperation.LEARNING_RATE_PARAM: 0.155,
        GradientBoostingRegressorOperation.MIN_LEAF_PARAM: 16
    }
    n_out = {'algorithm': 'regressor_1'}
    instance = GradientBoostingRegressorOperation(params, named_inputs={},
                                                  named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        regressor_1 = GradientBoostingRegressor(learning_rate=0.155,
                                                n_estimators=11,
                                                max_depth=14,
                                                min_samples_split=12,
                                                min_samples_leaf=16,
                                                random_state=13)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_linear_regression_with_params_success():
    params = {
        LinearRegressionOperation.NORMALIZE_PARAM: False,
        LinearRegressionOperation.ALPHA_PARAM: 0.5,
        LinearRegressionOperation.ELASTIC_NET_PARAM: 0.55,
        LinearRegressionOperation.TOLERANCE_PARAM: 0.1,
        LinearRegressionOperation.MAX_ITER_PARAM: 10,
        LinearRegressionOperation.SEED_PARAM: 2
    }
    n_out = {'algorithm': 'regressor_1'}
    instance_lr = LinearRegressionOperation(params, named_inputs={},
                                            named_outputs=n_out)
    code = instance_lr.generate_code()
    expected_code = dedent("""
        regressor_1 = ElasticNet(alpha=0.5, l1_ratio=0.55, tol=0.1,
                                 max_iter=10, random_state=2,
                                 normalize=False)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_join_left_join_keep_columns_minimal_params_success():
    params = {
        JoinOperation.LEFT_ATTRIBUTES_PARAM: ['id', 'cod'],
        JoinOperation.RIGHT_ATTRIBUTES_PARAM: ['id', 'cod'],
        JoinOperation.JOIN_TYPE_PARAM: 'left',
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: True,
        JoinOperation.ALIASES_PARAM: 'left_, right_ '
    }
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    instance = JoinOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        def _rename_attributes(df, prefix):
            result = df
            for col in df.columns:
                result = result.withColumnRenamed(col, '{{}}{{}}'.format(
                    prefix, col))
            return result
        in0_renamed = _rename_attributes({in0}, '{a0}')
        in1_renamed = _rename_attributes({in1}, '{a1}')
        condition = [in0_renamed['{a0}id'] == in1_renamed['{a1}id'],
                     in0_renamed['{a0}cod'] == in1_renamed['{a1}cod']]
        {out} = in0_renamed.join(in1_renamed, on=condition, how='{type}')
        """.format(
            out=n_out['output data'], in0=n_in['input data 1'],
            a0='left_', a1='right_', in1=n_in['input data 2'],
            type=params[JoinOperation.JOIN_TYPE_PARAM],
        ))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


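# A note on the doubled braces in the template above: the expected-code
# string is itself passed through str.format, so braces that must survive
# into the generated Spark code are escaped as '{{' and '}}'. A standalone
# illustration (plain Python, not project code):
template = "result = result.withColumnRenamed(col, '{{}}{{}}'.format(prefix, col))"
assert template.format() == \
    "result = result.withColumnRenamed(col, '{}{}'.format(prefix, col))"

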
def test_aggregation_rows_minimal_params_success():
    params = {
        AggregationOperation.FUNCTION_PARAM: [{
            'attribute': 'income',
            'f': 'AVG',
            'alias': 'avg_income'
        }],
        AggregationOperation.ATTRIBUTES_PARAM: ['country']
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = AggregationOperation(params, named_inputs=n_in,
                                    named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        pivot_values = None
        pivot_attr = ''
        if pivot_attr:
            {out} = {in0}.groupBy(
                functions.col('{agg}')).pivot(
                    pivot_attr, pivot_values).agg(
                        functions.avg('income').alias('avg_income'))
        else:
            {out} = {in0}.groupBy(
                functions.col('{agg}')).agg(
                    functions.avg('income').alias('avg_income'))
        """.format(out=n_out['output data'], in0=n_in['input data'],
                   agg='country'))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_tokenizer_operation_type_simple_success():
    params = {
        TokenizerOperation.TYPE_PARAM: 'simple',
        TokenizerOperation.ATTRIBUTES_PARAM: ['col'],
        TokenizerOperation.ALIAS_PARAM: 'c'
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = TokenizerOperation(params, named_inputs=n_in,
                                  named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent(r"""
        col_alias = {3}
        pattern_exp = r'\s+'
        min_token_length = 3
        tokenizers = [RegexTokenizer(inputCol=col, outputCol=alias,
                                     pattern=pattern_exp,
                                     minTokenLength=min_token_length)
                      for col, alias in col_alias]
        # Use Pipeline to process all attributes once
        pipeline = Pipeline(stages=tokenizers)
        {2} = pipeline.fit({1}).transform({1})
        """.format(
            params[TokenizerOperation.ATTRIBUTES_PARAM],
            n_in['input data'], n_out['output data'],
            json.dumps(list(zip(params[TokenizerOperation.ATTRIBUTES_PARAM],
                                params[TokenizerOperation.ALIAS_PARAM])))))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_clean_missing_without_missing_rating_params_success():
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    instance = CleanMissingOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        {output_1} = {input_1}
        for col in {attribute}:
            ratio = {input_1}[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                {output_1}.dropna(subset=col, axis='index', inplace=True)
        """.format(input_1=n_in['input data'],
                   attribute=params['attributes'],
                   output_1=n_out['output result']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_gaussian_mixture_clustering_success():
    params = {
        GaussianMixtureClusteringOperation.MAX_ITER_PARAM: 15,
        GaussianMixtureClusteringOperation.TOLERANCE_PARAM: 0.11,
        GaussianMixtureClusteringOperation.N_CLUSTERS_PARAM: 11,
    }
    named_outputs = {'algorithm': 'clustering_algo_1'}
    instance = GaussianMixtureClusteringOperation(params, named_inputs={},
                                                  named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        clustering_algo_1 = GaussianMixture(n_components=11, max_iter=15,
                                            tol=0.11)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_dbscan_clustering_minimum_success():
    params = {
        DBSCANClusteringOperation.FEATURES_PARAM: ['f'],
    }
    named_inputs = {'input data': 'df1'}
    named_outputs = {'output data': 'df2'}
    instance = DBSCANClusteringOperation(params, named_inputs=named_inputs,
                                         named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        df2 = df1.copy()
        X = df2['f'].values.tolist()
        clt = DBSCAN(eps=0.5, min_samples=5)
        df2['cluster'] = clt.fit_predict(X)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_clean_missing_minimal_params_success():
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['col1', 'col2'],
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}
    instance = CleanMissingOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        output_1 = input_1
        for col in ['col1', 'col2']:
            ratio = input_1[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                output_1.dropna(subset=col, axis='index', inplace=True)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_onehot_encoder_minimum_operation_success():
    params = {
        OneHotEncoderOperation.ATTRIBUTE_PARAM: ['col'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = OneHotEncoderOperation(params, named_inputs=n_in,
                                      named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        X_train = input_1['col'].values.tolist()
        output_1['col_norm'] = enc.fit_transform(X_train).toarray().tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_filter_minimum_params_success():
    params = {
        FilterOperation.FILTER_PARAM: [{
            'attribute': 'code',
            'f': '>',
            'value': '201'
        }, {
            'attribute': 'code2',
            'f': '<',
            'value': '200'
        }]
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = FilterOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        output_1 = output_1.query('(code > 201) and (code2 < 200)')
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_join_case_insensitive_success():
    params = {
        JoinOperation.LEFT_ATTRIBUTES_PARAM: ['id', 'cod'],
        JoinOperation.RIGHT_ATTRIBUTES_PARAM: ['id2', 'cod2'],
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: 'True',
        JoinOperation.ALIASES_PARAM: 'left_, right_ ',
        JoinOperation.MATCH_CASE_PARAM: 'True',
    }
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}
    instance = JoinOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        def _rename_attributes(df, prefix):
            result = df
            for col in df.columns:
                result = result.withColumnRenamed(col, '{{}}{{}}'.format(
                    prefix, col))
            return result
        in0_renamed = _rename_attributes({in0}, '{a0}')
        in1_renamed = _rename_attributes({in1}, '{a1}')
        condition = [functions.lower(in0_renamed['{a0}id']) ==
                     functions.lower(in1_renamed['{a1}id2']),
                     functions.lower(in0_renamed['{a0}cod']) ==
                     functions.lower(in1_renamed['{a1}cod2'])]
        {out} = in0_renamed.join(in1_renamed, on=condition, how='inner')
        """.format(out=n_out['output data'], in0=n_in['input data 1'],
                   in1=n_in['input data 2'], a0='left_', a1='right_'))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_word_to_vector_tfidf_operation_success():
    params = {
        WordToVectorOperation.TYPE_PARAM: WordToVectorOperation.TYPE_TFIDF,
        WordToVectorOperation.ATTRIBUTES_PARAM: ['col_1'],
        WordToVectorOperation.ALIAS_PARAM: 'col_2',
        WordToVectorOperation.VOCAB_SIZE_PARAM: 200,
        WordToVectorOperation.MINIMUM_DF_PARAM: 5,
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = WordToVectorOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1.copy()

        def do_nothing(tokens):
            return tokens

        corpus = output_1['col_1'].values.tolist()
        vector_model_1 = TfidfVectorizer(tokenizer=do_nothing,
                                         preprocessor=None,
                                         lowercase=False, min_df=5,
                                         max_features=200)
        vector_model_1.fit(corpus)
        output_1['col_2'] = vector_model_1.transform(corpus).toarray().tolist()
        vocab_task_1 = vector_model_1.get_feature_names()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


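# Note: scikit-learn deprecated get_feature_names() in 1.0 and removed it in
# 1.2 in favor of get_feature_names_out(), so the expected code above pins
# this test to older scikit-learn releases.

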
def atest_area_chart_success(time_series_data):
    params = {
        Visu.TITLE_PARAM: 'Simple title 1',
        Visu.COLUMN_NAMES_PARAM: ['name, age, gender'],
        Visu.ORIENTATION_PARAM: 'landscape',
        Visu.ID_ATTR_PARAM: ['id'],
        Visu.VALUE_ATTR_PARAM: ['age'],
        'task': {
            'id': uuid.uuid4(),
        },
        'operation_id': 1,
        'operation_slug': 'area-chart',
        'user': {},
        'workflow_id': 17,
        'job_id': 100,
    }
    n_in = {'input data': 'input'}
    n_out = {}
    chart = AreaChartOperation(params, n_in, n_out)
    with mock.patch('juicer.spark.vis_operation.get_caipirinha_config',
                    get_mocked_caipirinha_config):
        code = chart.generate_code()
    expected_code = dedent("""
        from juicer.spark.vis_operation import AreaChartModel
        from juicer.util.dataframe_util import SimpleJsonEncoder as enc
        from juicer.service import caipirinha_service
        params = '{{}}'
        vis_task_1 = AreaChartModel(
            input, '{task_id}', '{operation_id}',
            '{operation_slug}', '{title}',
            {column_names}, 'landscape',
            {id_attribute}, {value_attribute},
            params=json.loads(params))
        config = {{
            'juicer': {{
                'services': {{
                    'limonero': {{
                        'url': 'http://limonero:3321',
                        'auth_token': 'token'
                    }},
                    'caipirinha': {{
                        'url': 'http://caipirinha:3324',
                        'auth_token': 'token',
                        'storage_id': 1
                    }},
                }}
            }}
        }}
        visualization = {{
            'job_id': '{job_id}',
            'task_id': vis_task_1.task_id,
            'title': vis_task_1.title,
            'type': {{
                'id': vis_task_1.type_id,
                'name': vis_task_1.type_name
            }},
            'model': vis_task_1,
            'data': json.dumps(vis_task_1.get_data(), cls=enc,
                               ignore_nan=True)
        }}
        caipirinha_service.new_visualization(
            config, {{}},
            {workflow_id}, {job_id}, '{task_id}',
            visualization, emit_event)""").format(
        task_id=params['task']['id'], **params)
    ast.parse(expected_code)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_pythoncode_minimum_params_success():
    params = {
        ExecutePythonOperation.PYTHON_CODE_PARAM:
            "df1['col3'] = df1['col1'] + df1['col2']",
        'task': {'id': 1}
    }
    n_in = {'input data 1': 'input_1'}
    n_out = {'output data': 'output_1'}
    instance = ExecutePythonOperation(params, named_inputs=n_in,
                                      named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        import json
        from RestrictedPython.Guards import safe_builtins
        from RestrictedPython.RCompile import compile_restricted
        from RestrictedPython.PrintCollector import PrintCollector

        results = [r[1].result() for r in task_futures.items() if r[1].done()]
        results = dict([(r['task_name'], r) for r in results])

        # Input data
        in1 = input_1
        in2 = None

        # Output data, initialized as None
        out1 = None
        out2 = None

        # Variables and language supported
        ctx = {
            'wf_results': results,
            'in1': in1,
            'in2': in2,
            'out1': out1,
            'out2': out2,

            # Restrictions in Python language
            '_write_': lambda v: v,
            '_getattr_': getattr,
            '_getitem_': lambda ob, index: ob[index],
            '_getiter_': lambda it: it,
            '_print_': PrintCollector,
            'json': json,
        }
        user_code = "df1['col3'] = df1['col1'] + df1['col2']"

        ctx['__builtins__'] = safe_builtins

        compiled_code = compile_restricted(user_code,
                                           str('python_execute_1'),
                                           str('exec'))
        try:
            exec compiled_code in ctx

            # Retrieve values changed in the context
            out1 = ctx['out1']
            out2 = ctx['out2']
            if '_print' in ctx:
                emit_event(name='update task',
                           message=ctx['_print'](),
                           status='RUNNING',
                           identifier='1')
        except NameError as ne:
            raise ValueError(_('Invalid name: {}. '
                               'Many Python commands are not available in '
                               'Lemonade').format(ne))
        except ImportError as ie:
            raise ValueError(_('Command import is not supported'))

        out_1_1 = out1
        out_2_1 = out2
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


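# The expected code above relies on the Python 2 statement form
# `exec compiled_code in ctx`; ast.parse accepts that syntax only on a
# Python 2 interpreter, so this test is Python 2 only as written. For
# reference, the equivalent sandboxed execution in the Python 3 function
# form looks like this (illustration only, not what the operation emits):
py3_ctx = {'out1': None}
exec(compile("out1 = 1 + 1", '<sandbox>', 'exec'), py3_ctx)
assert py3_ctx['out1'] == 2

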
def test_geo_within_success():
    params = {
        GeoWithin.POLYGON_POINTS_COLUMN_PARAM: ['polygon'],
        GeoWithin.POLYGON_ATTRIBUTES_COLUMN_PARAM: ['attribute'],
        GeoWithin.POLYGON_ALIAS_COLUMN_PARAM: 'alias',
        GeoWithin.TARGET_LAT_COLUMN_PARAM: 'latitude',
        GeoWithin.TARGET_LON_COLUMN_PARAM: 'longitude'
    }
    n_out = {'output data': 'output_1'}
    n_in = {'input data': 'input_1', 'geo data': 'geo_data'}
    instance = GeoWithin(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        from matplotlib.path import Path
        import pyqtree
        attributes_to_add = {attributes}
        schema = [s.name for s in {geo}.schema]
        shp_object = {geo}.select(attributes_to_add +
                                  ['{points}']).collect()
        bcast_shapefile = spark_session.sparkContext.broadcast(shp_object)

        f_min = functions.udf(
            lambda v, index: min([item[index] for item in v]),
            types.DoubleType())
        f_max = functions.udf(
            lambda v, index: max([item[index] for item in v]),
            types.DoubleType())
        boundaries = {geo}.select(
            (f_min('{points}', functions.lit(1))).alias('x_min'),
            (f_min('{points}', functions.lit(0))).alias('y_min'),
            (f_max('{points}', functions.lit(1))).alias('x_max'),
            (f_max('{points}', functions.lit(0))).alias('y_max'),
        ).collect()

        global_min_x = float('+inf')
        global_min_y = float('+inf')
        global_max_x = float('-inf')
        global_max_y = float('-inf')

        to_update = []
        for inx, row in enumerate(boundaries):
            x_min = row['x_min']
            y_min = row['y_min']
            x_max = row['x_max']
            y_max = row['y_max']
            to_update.append({{
                'item': inx,
                'bbox': [x_min, y_min, x_max, y_max]
            }})
            global_min_x = min(global_min_x, x_min)
            global_min_y = min(global_min_y, y_min)
            global_max_x = max(global_max_x, x_max)
            global_max_y = max(global_max_y, y_max)

        sp_index = pyqtree.Index(
            bbox=[global_min_x, global_min_y, global_max_x, global_max_y])
        for item in to_update:
            sp_index.insert(**item)

        broad_casted_sp_index = spark_session.sparkContext.broadcast(
            sp_index)

        def get_first_polygon(lat, lng):
            x = float(lat)
            y = float(lng)
            bcast_index = broad_casted_sp_index.value
            matches = bcast_index.intersect([x, y, x, y])
            for shp_inx in matches:
                row = bcast_shapefile.value[shp_inx]
                p_polygon = Path(row['{points}'])
                # Here it uses longitude, latitude
                if p_polygon.contains_point([y, x]):
                    return [c for c in row]
            return [None] * len(bcast_shapefile.value[0])

        udf_get_first_polygon = functions.udf(
            get_first_polygon, types.ArrayType(types.StringType()))
        within = input_1.withColumn(
            "tmp_polygon_data",
            udf_get_first_polygon(functions.col('l'), functions.col('l')))
        aliases = {aliases}
        {output} = within.select(
            within.columns +
            [within.tmp_polygon_data[i].alias(aliases.pop())
             for i, col in enumerate(schema) if col in attributes_to_add])
        {output} = {output}.drop('tmp_polygon_data')
        """.format(
            aliases=json.dumps(
                params[GeoWithin.POLYGON_ALIAS_COLUMN_PARAM].split(',')),
            output=n_out['output data'],
            geo=n_in['geo data'],
            points=params[GeoWithin.POLYGON_POINTS_COLUMN_PARAM][0],
            attributes=params[GeoWithin.POLYGON_ATTRIBUTES_COLUMN_PARAM]))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)


def test_transformation_minimum_params_success():
    params = {
        "expression": [{
            "alias": "new_col1",
            "expression": "col1+2*9",
            "tree": {
                "operator": "+",
                "right": {
                    "operator": "*",
                    "right": {"raw": "9", "type": "Literal", "value": 9},
                    "type": "BinaryExpression",
                    "left": {"raw": "2", "type": "Literal", "value": 2}
                },
                "type": "BinaryExpression",
                "left": {"type": "Identifier", "name": "col1"}
            },
            "error": 'null'
        }, {
            "alias": "new_col2",
            "expression": "len(col2, 3)",
            "tree": {
                "type": "CallExpression",
                "callee": {"type": "Identifier", "name": "len"},
                "arguments": [
                    {"type": "Identifier", "name": "col2"},
                    {"raw": "3", "type": "Literal", "value": 3}
                ]
            },
            "error": 'null'
        }, {
            "alias": "new_col3",
            "expression": "split(col3, ',')",
            "tree": {
                "type": "CallExpression",
                "callee": {"type": "Identifier", "name": "split"},
                "arguments": [
                    {"type": "Identifier", "name": "col3"},
                    {"raw": "','", "type": "Literal", "value": ","}
                ]
            },
            "error": 'null'
        }]
    }
    n_in = {'input data': 'df1'}
    n_out = {'output data': 'out'}
    instance = TransformationOperation(params, named_inputs=n_in,
                                       named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        {out} = {in1}.copy()
        functions = [['new_col1', lambda row: row['col1'] + 2 * 9],
                     ['new_col2', lambda row: len(row['col2'], 3)],
                     ['new_col3', lambda row: row['col3'].split(',')], ]
        for col, function in functions:
            {out}[col] = {out}.apply(function, axis=1)
        """.format(out=n_out['output data'], in1=n_in['input data']))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)