def test_binary_expression_valid_success():
    """A BinaryExpression with no params maps identifiers to functions.col()."""
    json_code = {
        "type": "BinaryExpression",
        "operator": "*",
        "left": {"type": "Identifier", "name": "column1"},
        "right": {"type": "Identifier", "name": "column2"},
    }
    params = {}

    expr = Expression(json_code, params)
    result, msg = compare_ast(
        ast.parse(expr.parsed_expression),
        ast.parse("functions.col('column1') * functions.col('column2')"))
    assert result, msg

    # Reuse the same tree: literal numerator with a division operator.
    json_code['operator'] = '/'
    json_code['left']['type'] = 'Literal'
    json_code['left']['value'] = 100
    json_code['left']['raw'] = '100'

    expr = Expression(json_code, params)
    result, msg = compare_ast(
        ast.parse(expr.parsed_expression),
        ast.parse("100 / functions.col('column2')"))
    assert result, msg
def test_clean_missing_minimal_params_type_value_success():
    """CleanMissingOperation in VALUE mode fills NaNs with the given value."""
    params = {
        CleanMissingOperation.ATTRIBUTES_PARAM: ['name'],
        CleanMissingOperation.MIN_MISSING_RATIO_PARAM: 0.0,
        CleanMissingOperation.MAX_MISSING_RATIO_PARAM: 1.0,
        CleanMissingOperation.VALUE_PARAMETER: 200,
        CleanMissingOperation.CLEANING_MODE_PARAM: 'VALUE',
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output result': 'output_1'}

    instance = CleanMissingOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        min_missing_ratio = 0.0
        max_missing_ratio = 1.0
        output_1 = input_1
        for col in ['name']:
            ratio = input_1[col].isnull().sum()
            if ratio >= min_missing_ratio and ratio <= max_missing_ratio:
                output_1[col].fillna(value=200, inplace=True)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)

    # Test with value being number
    params[CleanMissingOperation.VALUE_PARAMETER] = 1200
    instance = CleanMissingOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = expected_code.replace('200', '1200')
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_binary_expression_with_params_success():
    """With an 'input' param, identifiers become dataframe subscripts."""
    json_code = {
        "type": "BinaryExpression",
        "operator": "*",
        "left": {"type": "Identifier", "name": "column1"},
        "right": {"type": "Identifier", "name": "column2"},
    }
    params = {'input': 'df00'}

    expr = Expression(json_code, params)
    result, msg = compare_ast(
        ast.parse(expr.parsed_expression),
        ast.parse("df00['column1'] * df00['column2']"))
    assert result, msg

    # Mutate the tree: a literal left operand with division.
    json_code['operator'] = '/'
    json_code['left']['type'] = 'Literal'
    json_code['left']['value'] = 100
    json_code['left']['raw'] = '100'

    expr = Expression(json_code, params)
    result, msg = compare_ast(
        ast.parse(expr.parsed_expression),
        ast.parse("100 / df00['column2']"))
    assert result, msg
def test_onehot_encoder_operation_success():
    """OneHotEncoderOperation generates sklearn OneHotEncoder fit/transform code.

    Fix: removed the unused locals ``in1`` and ``out`` — the expected code is a
    plain literal and never interpolates them.
    """
    params = {
        OneHotEncoderOperation.ALIAS_PARAM: 'result',
        OneHotEncoderOperation.ATTRIBUTE_PARAM: ['col_1'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = OneHotEncoderOperation(params, named_inputs=n_in,
                                      named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        X_train = input_1['col_1'].values.tolist()
        output_1['result'] = enc.fit_transform(X_train).toarray().tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_join_remove_right_with_case_columns_success():
    """Case-insensitive join that drops the right-side key columns."""
    params = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id2', 'cod2'],
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: False,
        JoinOperation.MATCH_CASE_PARAM: True,
        'aliases': '_left,_right',
    }
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}

    instance = JoinOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        cols_to_remove = [c+'_right' for c in df2.columns if c in df1.columns]
        data1_tmp = df1[['id', 'cod']].applymap(lambda col: str(col).lower())
        data1_tmp.columns = [c+"_lower" for c in data1_tmp.columns]
        data1_tmp = pd.concat([df1, data1_tmp], axis=1, sort=False)
        data2_tmp = df2[['id2', 'cod2']].applymap(lambda col: str(col).lower())
        data2_tmp.columns = [c+"_lower" for c in data2_tmp.columns]
        data2_tmp = pd.concat([df2, data2_tmp], axis=1, sort=False)
        out = pd.merge(data1_tmp, data2_tmp, left_on=col1, right_on=col2,
                       copy=False, suffixes=['_left', '_right'], how='inner')
        out.drop(col1+col2, axis=1, inplace=True)
        out.drop(cols_to_remove, axis=1, inplace=True)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_intersection_minimal_params_success():
    """IntersectionOperation generates a merge keeping only rows in both inputs."""
    params = {}
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}

    instance = IntersectionOperation(params, named_inputs=n_in,
                                     named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        if len({in1}.columns) != len({in2}.columns):
            raise ValueError('{error}')
        {in1} = {in1}.dropna(axis=0, how='any')
        {in2} = {in2}.dropna(axis=0, how='any')
        keys = {in1}.columns.tolist()
        {in1} = pd.merge({in1}, {in2}, how='left',
                         on=keys, indicator=True, copy=False)
        {out} = {in1}.loc[{in1}['_merge'] == 'both', keys]
        """.format(out=n_out['output data'],
                   in1=n_in['input data 1'],
                   in2=n_in['input data 2'],
                   error=('For intersection operation, both input data '
                          'sources must have the same number of attributes '
                          'and types.')))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_gbt_classifier_with_params_success():
    """All GBT classifier params are forwarded to GradientBoostingClassifier."""
    params = {
        GBTClassifierOperation.N_ESTIMATORS_PARAM: 11,
        GBTClassifierOperation.MIN_LEAF_PARAM: 10,
        GBTClassifierOperation.MIN_SPLIT_PARAM: 12,
        GBTClassifierOperation.LEARNING_RATE_PARAM: 1.1,
        GBTClassifierOperation.MAX_DEPTH_PARAM: 13,
        GBTClassifierOperation.SEED_PARAM: 9,
        GBTClassifierOperation.LOSS_PARAM: GBTClassifierOperation.LOSS_PARAM_EXP,
    }
    n_out = {'algorithm': 'classifier_1'}

    instance_lr = GBTClassifierOperation(params, named_inputs={},
                                         named_outputs=n_out)
    code = instance_lr.generate_code()
    expected_code = dedent("""
        classifier_1 = GradientBoostingClassifier(loss='exponencial',
        learning_rate=1.1, n_estimators=11, min_samples_split=12,
        max_depth=13, min_samples_leaf=10, random_state=9)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_svm_operation_params_success():
    """SVM params are forwarded to SVC (note: tolerance -0.1 is emitted as 0.1)."""
    params = {
        SvmClassifierOperation.PENALTY_PARAM: 10.0,
        SvmClassifierOperation.KERNEL_PARAM:
            SvmClassifierOperation.KERNEL_PARAM_POLY,
        SvmClassifierOperation.DEGREE_PARAM: 2,
        SvmClassifierOperation.TOLERANCE_PARAM: -0.1,
        SvmClassifierOperation.MAX_ITER_PARAM: 13,
        SvmClassifierOperation.SEED_PARAM: 12,
    }
    n_in = {}
    n_out = {'algorithm': 'classifier_1'}

    instance = SvmClassifierOperation(params, named_inputs=n_in,
                                      named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        classifier_1 = SVC(tol=0.1, C=10.0, max_iter=13,
                           degree=2, kernel='poly', random_state=12)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_dbscan_clustering_success():
    """DBSCANClusteringOperation generates DBSCAN fit_predict code.

    Fix: dropped the no-op ``.format()`` call on a placeholder-free template.
    """
    params = {
        DBSCANClusteringOperation.FEATURES_PARAM: ['f'],
        DBSCANClusteringOperation.ALIAS_PARAM: 'alias',
        DBSCANClusteringOperation.EPS_PARAM: 0.15,
        DBSCANClusteringOperation.MIN_SAMPLES_PARAM: 20,
    }
    named_inputs = {'input data': 'df1'}
    named_outputs = {'output data': 'df2'}

    instance = DBSCANClusteringOperation(params, named_inputs=named_inputs,
                                         named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        df2 = df1.copy()
        X = df2['f'].values.tolist()
        clt = DBSCAN(eps=0.15, min_samples=20)
        df2['alias'] = clt.fit_predict(X)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_regressor_operation_with_model_success():
    """RegressionModelOperation fits the algorithm and adds a prediction column."""
    params = {
        RegressionModelOperation.FEATURES_PARAM: 'f',
        RegressionModelOperation.LABEL_PARAM: 'l',
    }
    n_in = {'algorithm': 'regressor', 'train input data': 'train_data'}
    n_out = {'model': 'model_data'}

    instance = RegressionModelOperation(params, named_inputs=n_in,
                                        named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        algorithm = regressor
        out_task_1 = train_data.copy()
        X_train = train_data['f'].values.tolist()
        if 'IsotonicRegression' in str(algorithm):
            X_train = np.ravel(X_train)
        y = train_data['l'].values.tolist()
        model_data = algorithm.fit(X_train, y)
        out_task_1['prediction'] = algorithm.predict(X_train).tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_randomforestregressor_with_params_success():
    """RandomForestRegressorOperation forwards every hyper-parameter."""
    params = {
        RandomForestRegressorOperation.MAX_FEATURES_PARAM: 'sqrt',
        RandomForestRegressorOperation.MAX_DEPTH_PARAM: 10,
        RandomForestRegressorOperation.MIN_LEAF_PARAM: 3,
        RandomForestRegressorOperation.MIN_SPLIT_PARAM: 4,
        RandomForestRegressorOperation.N_ESTIMATORS_PARAM: 9,
        RandomForestRegressorOperation.SEED_PARAM: -9,
    }
    n_out = {'algorithm': 'regressor_1'}

    instance_lr = RandomForestRegressorOperation(params, named_inputs={},
                                                 named_outputs=n_out)
    code = instance_lr.generate_code()
    expected_code = dedent("""
        regressor_1 = RandomForestRegressor(n_estimators=9,
        max_features='sqrt', max_depth=10, min_samples_split=4,
        min_samples_leaf=3, random_state=-9)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_feature_assembler_operation_success():
    """FeatureAssemblerOperation packs attribute columns into 'FeatureField'."""
    params = {
        FeatureAssemblerOperation.ATTRIBUTES_PARAM: ['col'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}
    in1 = n_in['input data']
    out = n_out['output data']

    instance = FeatureAssemblerOperation(params, named_inputs=n_in,
                                         named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        cols = {cols}
        {output} = {input}
        {output}['FeatureField'] = {input}[cols].values.tolist()
        """.format(cols=params[FeatureAssemblerOperation.ATTRIBUTES_PARAM],
                   output=out, input=in1))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_quantile_discretizer_minimum_operation_success():
    """With only the attribute set, QuantileTransformer defaults are used."""
    params = {
        QuantileDiscretizerOperation.ATTRIBUTE_PARAM: ['col'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = QuantileDiscretizerOperation(params, named_inputs=n_in,
                                            named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import QuantileTransformer
        qt = QuantileTransformer(n_quantiles=1000,
            output_distribution='uniform', random_state=None)
        X_train = input_1['col'].values.tolist()
        output_1['col_norm'] = qt.fit_transform(X_train).toarray().tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_quantile_discretizer_operation_success():
    """All quantile-discretizer params are forwarded to QuantileTransformer.

    Fix: removed the unused locals ``in1`` and ``out`` — the expected code is a
    plain literal and never interpolates them.
    """
    params = {
        QuantileDiscretizerOperation.ALIAS_PARAM: 'result',
        QuantileDiscretizerOperation.ATTRIBUTE_PARAM: ['col_1'],
        QuantileDiscretizerOperation.DISTRIBUITION_PARAM:
            QuantileDiscretizerOperation.DISTRIBUITION_PARAM_NORMAL,
        QuantileDiscretizerOperation.SEED_PARAM: 19,
        QuantileDiscretizerOperation.N_QUANTILES_PARAM: 500,
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = QuantileDiscretizerOperation(params, named_inputs=n_in,
                                            named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import QuantileTransformer
        qt = QuantileTransformer(n_quantiles=500,
            output_distribution='normal', random_state=19)
        X_train = input_1['col_1'].values.tolist()
        output_1['result'] = qt.fit_transform(X_train).toarray().tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_pca_operation_success():
    """PCAOperation generates sklearn PCA fit_transform code.

    Fix: removed the unused locals ``in1`` and ``out`` — the expected code is a
    plain literal and never interpolates them.
    """
    params = {
        PCAOperation.ATTRIBUTE_PARAM: ['col'],
        PCAOperation.ALIAS_PARAM: 'feature',
        PCAOperation.N_COMPONENTS: 3,
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = PCAOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.decomposition import PCA
        pca = PCA(n_components=3)
        X_train = input_1['col'].values.tolist()
        output_1['feature'] = pca.fit_transform(X_train).tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_perceptron_with_params_success():
    """Perceptron hyper-parameters are all forwarded, including the penalty."""
    params = {
        PerceptronClassifierOperation.SHUFFLE_PARAM: True,
        PerceptronClassifierOperation.PENALTY_PARAM:
            PerceptronClassifierOperation.PENALTY_PARAM_EN,
        PerceptronClassifierOperation.SEED_PARAM: 10,
        PerceptronClassifierOperation.ALPHA_PARAM: 0.11,
        PerceptronClassifierOperation.TOLERANCE_PARAM: 0.1,
        PerceptronClassifierOperation.MAX_ITER_PARAM: 100,
    }
    n_out = {'algorithm': 'classifier_1'}

    instance_lr = PerceptronClassifierOperation(params, named_inputs={},
                                                named_outputs=n_out)
    code = instance_lr.generate_code()
    expected_code = dedent("""
        classifier_1 = Perceptron(tol=0.1, alpha=0.11, max_iter=100,
        shuffle=True, random_state=10, penalty='elasticnet')""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_random_forest_operation_params_success():
    """RandomForestClassifierOperation forwards every hyper-parameter."""
    params = {
        RandomForestClassifierOperation.SEED_PARAM: 10,
        RandomForestClassifierOperation.MAX_DEPTH_PARAM: 11,
        RandomForestClassifierOperation.MIN_SPLIT_PARAM: 12,
        RandomForestClassifierOperation.MIN_LEAF_PARAM: 13,
        RandomForestClassifierOperation.N_ESTIMATORS_PARAM: 15,
    }
    n_in = {}
    n_out = {'algorithm': 'classifier_1'}

    instance = RandomForestClassifierOperation(params, named_inputs=n_in,
                                               named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        classifier_1 = RandomForestClassifier(n_estimators=15,
        max_depth=11, min_samples_split=12, min_samples_leaf=13,
        random_state=10)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_kmeans_clustering_operation_random_type_kmeans_success():
    """K-Means with random init: every param appears in the generated call."""
    params = {
        KMeansClusteringOperation.N_CLUSTERS_PARAM: 10,
        KMeansClusteringOperation.MAX_ITER_PARAM: 20,
        KMeansClusteringOperation.TYPE_PARAM:
            KMeansClusteringOperation.TYPE_PARAM_KMEANS,
        KMeansClusteringOperation.INIT_PARAM:
            KMeansClusteringOperation.INIT_PARAM_RANDOM,
        KMeansClusteringOperation.TOLERANCE_PARAM: 0.001,
        KMeansClusteringOperation.SEED_PARAM: 15,
    }
    named_outputs = {'algorithm': 'clustering_algo_1'}

    instance = KMeansClusteringOperation(params, named_inputs={},
                                         named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        {output} = KMeans(n_clusters={k}, init='{init}',
            max_iter={max_iter}, tol={tol}, random_state={seed})
        """.format(output=named_outputs['algorithm'],
                   k=params[KMeansClusteringOperation.N_CLUSTERS_PARAM],
                   init=params[KMeansClusteringOperation.INIT_PARAM],
                   max_iter=params[KMeansClusteringOperation.MAX_ITER_PARAM],
                   seed=params[KMeansClusteringOperation.SEED_PARAM],
                   tol=params[KMeansClusteringOperation.TOLERANCE_PARAM]))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_classification_model_operation_success():
    """ClassificationModelOperation fits the algorithm on features and label.

    Fix: dropped the no-op ``.format()`` call on a placeholder-free template.
    """
    params = {
        ClassificationModelOperation.FEATURES_ATTRIBUTE_PARAM: ['f'],
        ClassificationModelOperation.LABEL_ATTRIBUTE_PARAM: ['label'],
    }
    named_inputs = {'algorithm': 'algo', 'train input data': 'df_2'}
    named_outputs = {'model': 'output_2'}

    instance = ClassificationModelOperation(params, named_inputs=named_inputs,
                                            named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        X = df_2['f'].values.tolist()
        y = df_2['label'].values.tolist()
        output_2 = algo.fit(X, y)
        task_1 = None
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_agglomerative_clustering_success():
    """AgglomerativeClusteringOperation forwards affinity/linkage and alias.

    Fix: dropped the no-op ``.format()`` call on a placeholder-free template.
    """
    params = {
        AgglomerativeClusteringOperation.ALIAS_PARAM: 'ALIAS',
        AgglomerativeClusteringOperation.FEATURES_PARAM: ['f'],
        AgglomerativeClusteringOperation.AFFINITY_PARAM:
            AgglomerativeClusteringOperation.AFFINITY_PARAM_COS,
        AgglomerativeClusteringOperation.LINKAGE_PARAM:
            AgglomerativeClusteringOperation.AFFINITY_PARAM_L2,
    }
    named_inputs = {'input data': 'df1'}
    named_outputs = {'output data': 'df2'}

    instance = AgglomerativeClusteringOperation(params,
                                                named_inputs=named_inputs,
                                                named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        df2 = df1.copy()
        X = df2['f'].values.tolist()
        clt = AgglomerativeClustering(n_clusters=2,
            linkage='l2', affinity='cosine')
        df2['ALIAS'] = clt.fit_predict(X)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_get_windows_function_success():
    """A window() CallExpression is translated to functions.window(...).start."""
    json_code = {
        "type": "CallExpression",
        "arguments": [
            {"type": "Identifier", "name": "created_at"},
            {"type": "Literal", "value": 10, "raw": "10"},
            {"type": "Literal", "value": "end", "raw": "'end'"},
        ],
        "callee": {"type": "Identifier", "name": "window"},
    }
    params = {}

    expr = Expression(json_code, params)
    expected_code = ("functions.window("
                     "functions.col('created_at'),"
                     "str('10 seconds')).start.cast('timestamp')")
    result, msg = compare_ast(ast.parse(expr.parsed_expression),
                              ast.parse(expected_code))
    assert result, msg + format_code_comparison(expr.parsed_expression,
                                                expected_code)
def test_lda_clustering_success():
    """LdaClusteringOperation maps params onto LatentDirichletAllocation.

    Fix: dropped the no-op ``.format()`` call on a placeholder-free template.
    """
    params = {
        LdaClusteringOperation.N_COMPONENTES_PARAM: 10,
        LdaClusteringOperation.ALPHA_PARAM: 0.5,
        LdaClusteringOperation.SEED_PARAM: 11,
        LdaClusteringOperation.MAX_ITER_PARAM: 100,
        LdaClusteringOperation.ETA_PARAM: 0.5,
        LdaClusteringOperation.LEARNING_METHOD_PARAM:
            LdaClusteringOperation.LEARNING_METHOD_ON,
    }
    named_outputs = {'algorithm': 'clustering_algo_1'}

    instance = LdaClusteringOperation(params, named_inputs={},
                                      named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        clustering_algo_1 = LatentDirichletAllocation(n_components=10,
        doc_topic_prior=0.5, topic_word_prior=0.5,
        learning_method='online', max_iter=100)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_clustering_with_model_operation_success():
    """ClusteringModelOperation fits, predicts and writes a prediction column.

    Fix: dropped the no-op ``.format()`` call on a placeholder-free template.
    """
    params = {
        ClusteringModelOperation.FEATURES_PARAM: ['f'],
    }
    named_inputs = {'algorithm': 'algo', 'train input data': 'df_2'}
    named_outputs = {'output data': 'output_1', 'model': 'output_2'}

    instance = ClusteringModelOperation(params, named_inputs=named_inputs,
                                        named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        X = df_2['f'].values.tolist()
        output_2 = algo.fit(X)
        y = algo.predict(X)
        output_1 = df_2
        output_1['prediction'] = y
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_agglomerative_clustering_minimum_success():
    """Minimal params fall back to ward/euclidean and the 'cluster' alias."""
    params = {
        AgglomerativeClusteringOperation.FEATURES_PARAM: ['f'],
    }
    named_inputs = {'input data': 'df1'}
    named_outputs = {'output data': 'df2'}

    instance = AgglomerativeClusteringOperation(params,
                                                named_inputs=named_inputs,
                                                named_outputs=named_outputs)
    code = instance.generate_code()
    expected_code = dedent("""
        df2 = df1.copy()
        X = df2['f'].values.tolist()
        clt = AgglomerativeClustering(n_clusters=2,
            linkage='ward', affinity='euclidean')
        df2['cluster'] = clt.fit_predict(X)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_sql_two_inputs_params_success():
    """ExecuteSQLOperation with two inputs builds sqldf code and renames columns."""
    params = {
        ExecuteSQLOperation.QUERY_PARAM: "select * where df2.id = 1;",
        ExecuteSQLOperation.NAMES_PARAM: "col1, col2, col3",
    }
    n_in = {'input data 1': 'input_1', 'input data 2': 'input_2'}
    n_out = {'output data': 'output_1'}

    instance = ExecuteSQLOperation(params, named_inputs=n_in,
                                   named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        query = 'select * where df2.id = 1;'
        output_1 = sqldf(query, {'ds1': input_1, 'ds2': input_2})
        names = ['col1', 'col2', 'col3']
        if names is not None and len(names) > 0:
            old_names = output_1.columns
            if len(old_names) != len(names):
                raise ValueError('Invalid names. Number of attributes '
                                 'in result differs from names informed.')
            rename = dict(zip(old_names, names))
            output_1.rename(columns=rename, inplace=True)
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_logisticregression_with_params_success():
    """LogisticRegressionOperation maps its params onto LogisticRegression."""
    params = {
        LogisticRegressionOperation.TOLERANCE_PARAM: 0.1,
        LogisticRegressionOperation.MAX_ITER_PARAM: 10,
        LogisticRegressionOperation.SEED_PARAM: 2,
        LogisticRegressionOperation.REGULARIZATION_PARAM: 1.1,
        LogisticRegressionOperation.SOLVER_PARAM:
            LogisticRegressionOperation.SOLVER_PARAM_NEWTON,
    }
    n_out = {'algorithm': 'classifier_1'}

    instance_lr = LogisticRegressionOperation(params, named_inputs={},
                                              named_outputs=n_out)
    code = instance_lr.generate_code()
    expected_code = dedent("""
        classifier_1 = LogisticRegression(tol=0.1, C=1.1, max_iter=10,
        solver='newton-cg', random_state=2)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_join_left_join_keep_columns_minimal_params_success():
    """Left join keeping right keys is a plain pd.merge with suffixes."""
    params = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id', 'cod'],
        JoinOperation.JOIN_TYPE_PARAM: 'left',
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: True,
        'aliases': '_left,_right',
    }
    n_in = {'input data 1': 'df1', 'input data 2': 'df2'}
    n_out = {'output data': 'out'}

    instance = JoinOperation(params, named_inputs=n_in, named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        {out} = pd.merge({in0}, {in1}, how='{type}',
            suffixes=['_left', '_right'],
            left_on=['id', 'cod'], right_on=['id', 'cod'])
        """.format(out=n_out['output data'],
                   in0=n_in['input data 1'],
                   in1=n_in['input data 2'],
                   type=params[JoinOperation.JOIN_TYPE_PARAM]))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_mlp_classifier_with_params_success():
    """MLPClassifierOperation forwards layers, activation, solver and limits."""
    params = {
        MLPClassifierOperation.HIDDEN_LAYER_SIZES_PARAM: '(100,10,9)',
        MLPClassifierOperation.ACTIVATION_PARAM:
            MLPClassifierOperation.ACTIVATION_PARAM_LOG,
        MLPClassifierOperation.SEED_PARAM: 9,
        MLPClassifierOperation.SOLVER_PARAM:
            MLPClassifierOperation.SOLVER_PARAM_LBFGS,
        MLPClassifierOperation.MAX_ITER_PARAM: 1000,
        MLPClassifierOperation.ALPHA_PARAM: 0.01,
        MLPClassifierOperation.TOLERANCE_PARAM: 0.1,
    }
    n_out = {'algorithm': 'classifier_1'}

    instance_lr = MLPClassifierOperation(params, named_inputs={},
                                         named_outputs=n_out)
    code = instance_lr.generate_code()
    expected_code = dedent("""
        classifier_1 = MLPClassifier(hidden_layer_sizes=(100,10,9),
        activation='logistic', solver='lbfgs', alpha=0.01,
        max_iter=1000, random_state=9, tol=0.1)""")
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_replace_value_minimal_params_success():
    """ReplaceValuesOperation replaces -10 with 10 in each listed column.

    Fix: removed the unused ``out`` and ``in1`` keyword arguments from
    ``str.format`` — the template only interpolates ``{replaces}``.
    """
    params = {"attributes": ["col1", "col2"],
              "replacement": 10,
              "value": -10}
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = ReplaceValuesOperation(params, named_inputs=n_in,
                                      named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        replacement = {replaces}
        for col in replacement:
            list_replaces = replacement[col]
            output_1[col] = output_1[col].replace(list_replaces[0],
                                                  list_replaces[1])
        """.format(replaces={"col2": [[-10], [10]],
                             "col1": [[-10], [10]]}))
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)
def test_standardscaler_minimum_operation_success():
    """StandardScalerOperation generates StandardScaler fit_transform code.

    Fix: removed the unused locals ``in1`` and ``out`` — the expected code is a
    plain literal and never interpolates them.
    """
    params = {
        StandardScalerOperation.ATTRIBUTE_PARAM: ['col'],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = StandardScalerOperation(params, named_inputs=n_in,
                                       named_outputs=n_out)
    code = instance.generate_code()
    expected_code = dedent("""
        output_1 = input_1
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        X_train = input_1['col'].values.tolist()
        output_1['col_norm'] = scaler.fit_transform(X_train).tolist()
        """)
    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    assert result, msg + format_code_comparison(code, expected_code)