def generate_code(args):
    """Generate code that calls a dataframe operation on the single incoming dataframe.

    The operation name is taken from ``node["ddfo_name"]`` and its arguments
    from ``node["parameters"]``.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe and no incoming model are accepted.
    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    # Resolve the variable name of the incoming dataframe.
    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    additional_local_code = []
    call_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    call_prefix = 'df_' + node["id"] + '=' + df_name + '.' + node["ddfo_name"] + '('
    gen_code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"], call_prefix, call_args)
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    """Generate code that streams the incoming dataframe to a Kafka topic.

    On success the written topic (name, host, port) is also appended to
    ``args["additional_info"]["written_topics"]``.

    Args:
        args: Dict providing ``node``, ``requireds_info``, ``edges`` and
            ``additional_info``.

    Returns:
        Tuple ``(code, shared_function_set, error)``. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    params = node["parameters"]
    query_name = "query_" + node["id"]

    # Each row is serialised to JSON; the configured column becomes the Kafka key.
    code.append(
        query_name + "=" + df_name + '.selectExpr("CAST('
        + params["unique_column_name"]["value"]
        + ' AS STRING) AS key", "to_json(struct(*)) AS value").writeStream.format("kafka").option("kafka.bootstrap.servers", ')
    code.append(
        CodeGenerationUtils.handle_primitive(
            params["host"]["value"] + ":" + params["port"]["value"]) + ")")
    code.append(".trigger(" + __generate_trigger_code(node) + ")")
    code.append(
        '.option("topic", '
        + CodeGenerationUtils.handle_primitive(params["topic"]["value"]) + ")")
    code.append(
        '.option("checkpointLocation", '
        + CodeGenerationUtils.handle_primitive(params["checkpointLocation"]["value"])
        + ").start()")
    code.extend([os.linesep, query_name, ".awaitTermination()", os.linesep])

    # Record the topic this node writes to.
    args["additional_info"]["written_topics"].append({
        "topic_name": params["topic"]["value"],
        "host": params["host"]["value"],
        "port": params["port"]["value"],
    })
    return code, shared_function_set, error
def generate_code(args):
    """Generate code that reads a Kafka topic as a streaming dataframe.

    The node must have no incoming dataframe or model, and must pass
    ``DataSourceValidityChecker``.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when either validity check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    params = node["parameters"]
    handler_args = {
        "node_id": node["id"],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    # Must be a valid schema at this point.
    schema_code = CodeGenerationUtils.handle_parameter(params["schema"], handler_args)

    bootstrap_servers = CodeGenerationUtils.handle_primitive(
        params["host"]["value"] + ":" + params["port"]["value"])
    topic = CodeGenerationUtils.handle_primitive(params["topic"]["value"])
    starting_offsets = CodeGenerationUtils.handle_primitive(
        params["startingOffsets"]["value"])

    gen_code = [
        "df_" + node["id"]
        + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ',
        bootstrap_servers + ")",
        '.option("subscribe", ' + topic + ")",
        '.option("startingOffsets", ' + starting_offsets + ")",
        '.load().select(from_json(col("value").cast("string"), ' + schema_code + ")",
        # The timestamp column is kept so it can serve as a key if the
        # stream is later written to a Kafka topic.
        '.alias("value"), "timestamp").select("value.*", "timestamp")',
        os.linesep,
    ]
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    """Generate code that applies a user-defined function to column tuples.

    The generated code registers the UDF, then loops over the input tuples
    and adds one output column per tuple with ``withColumn``.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []

    if error == ErrorTypes.NO_ERROR:
        incoming_df = extra["dfs"][0]
        # The "portion" key lives on the incoming edge entry itself;
        # extra["dfs"] is the list holding that entry.
        if "portion" in incoming_df:
            df_name = ("df_" + incoming_df["source_id"]
                       + "[" + str(incoming_df["portion"]) + "]")
        else:
            df_name = "df_" + incoming_df["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors,
        }
        updated_function_name = CodeGenerationUtils.handle_parameter(
            node["parameters"]["udf_function"], my_args)

        gen_code = []
        gen_code.extend([
            "udf_" + node["id"] + " = udf(" + updated_function_name + ", "
            + node["parameters"]["udf_return_type"]["value"] + "())",
            os.linesep
        ])
        gen_code.extend([
            "tuple_list = " + CodeGenerationUtils.handle_parameter(
                node["parameters"]["udf_input_tuples"], my_args),
            os.linesep
        ])
        gen_code.extend([
            "output_list = " + CodeGenerationUtils.handle_parameter(
                node["parameters"]["udf_outputs"], my_args),
            os.linesep
        ])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        gen_code.extend(["for index in range(len(tuple_list)):", os.linesep])
        gen_code.extend([
            "\tdf_" + node["id"] + " = df_" + node["id"]
            + ".withColumn(output_list[index], udf_" + node["id"]
            + "(*tuple_list[index]))",
            os.linesep, os.linesep
        ])
        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
def __arg_dict_to_string(args):
    """Render ``args`` as the source text of a dict literal.

    Values whose key is in ``__set_of_datetime_arguments`` are emitted as a
    ``datetime.strptime`` call using ``__datetime_format``; such values are
    assumed to be strings in that format. Other values go through
    ``CodeGenerationUtils.handle_primitive``.
    """
    entries = []
    for name in args:
        key_code = CodeGenerationUtils.handle_primitive(name)
        if name in __set_of_datetime_arguments:
            value_code = ('datetime.strptime("' + args[name] + '", "'
                          + __datetime_format + '")')
        else:
            value_code = CodeGenerationUtils.handle_primitive(args[name])
        entries.append(key_code + ": " + value_code)
    return "{" + ", ".join(entries) + "}"
def generate_code(args):
    """Generate code that runs a list of select expressions on the incoming dataframe.

    The generated code collects expressions by calling
    ``single_select_expr_generator`` once per configured expression, then
    passes them all to ``selectExpr``. ``SharedFunctionTypes.SELECT_EXPR_HELPERS``
    is added to the returned shared-function set.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    handler_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }

    shared_function_set.add(SharedFunctionTypes.SELECT_EXPR_HELPERS)
    expr_list_name = "expressions_" + node["id"]
    gen_code = [expr_list_name + "=[]", os.linesep]

    for expr in node["parameters"]["expressions"]["value"]:
        # Parameters are handled in this fixed order: inputs, outputs, operation.
        input_cols = CodeGenerationUtils.handle_parameter(
            expr["input_cols"], handler_args)
        output_cols = CodeGenerationUtils.handle_parameter(
            expr["output_cols"], handler_args)
        operation = CodeGenerationUtils.handle_parameter(
            expr["operation"], handler_args)
        gen_code.extend([
            expr_list_name + ".extend(",
            'single_select_expr_generator(' + input_cols + ', ' + output_cols
            + ', ' + operation + '))',
            os.linesep,
        ])

    gen_code.extend([
        "df_" + node["id"] + "=" + df_name + ".selectExpr(" + "*"
        + expr_list_name + ")",
        os.linesep,
    ])
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    """Generate code that builds, fits and applies a pipeline on the incoming dataframe.

    Three checks run in sequence: the incoming-edge check, the pipeline
    validity check, and stage generation. The first error stops generation.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when any step reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    final_code = []
    shared_function_set = set()
    additional_local_code = []

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    error, pipeline_order = PipelineValidityChecker.check_validity(
        node["nodes"], node["edges"])
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    stage_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    gen_code, error = __generate_stages(
        node["nodes"], pipeline_order, df_name, stage_args)
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    pipeline_name = 'pipeline_' + node["id"]
    model_name = 'model_' + node["id"]

    gen_code.append(os.linesep)
    gen_code.extend(
        __generate_code_for_pipeline_instantination(node, pipeline_order, stage_args))
    gen_code.extend(
        [model_name + "=" + pipeline_name + ".fit(" + df_name + ")", os.linesep])
    # Following might not be logical for pipelines with an estimator
    gen_code.extend([
        'df_' + node["id"] + "=" + model_name + '.transform(' + df_name + ')',
        os.linesep,
    ])

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
def __generate_trigger_code(node):
    """Return the keyword-argument text for a streaming ``.trigger(...)`` call.

    A trigger type of ``"once"`` yields ``once=True``. Any other type yields
    ``<trigger_type>=<value>``, where the value is the node's
    ``trigger_value`` rendered by ``CodeGenerationUtils.handle_primitive``.
    """
    params = node["parameters"]
    trigger_type = params["trigger_type"]["value"]
    if trigger_type == "once":
        return "once=True"
    rendered_value = CodeGenerationUtils.handle_primitive(
        params["trigger_value"]["value"])
    return trigger_type + "=" + rendered_value
def generate_code(args):
    """Generate code that disassembles a vector column of the incoming dataframe.

    The generated code calls the shared ``vector_disassembler`` helper, and
    ``SharedFunctionTypes.VECTOR_DISASSEMBLER`` is added to the returned
    shared-function set.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(code, shared_function_set, error)``. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    shared_function_set = set()
    code = []
    if error != ErrorTypes.NO_ERROR:
        return code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    shared_function_set.add(SharedFunctionTypes.VECTOR_DISASSEMBLER)
    vector_column = CodeGenerationUtils.handle_primitive(
        node["parameters"]["vector_column"]["value"])
    code = [
        "df_" + node["id"] + " = " + "vector_disassembler(" + df_name + ", "
        + vector_column + ")",
        os.linesep,
    ]
    return code, shared_function_set, error
def generate_code(args):
    """Generate code that builds and fits a cross-validator on the incoming dataframe.

    The generated code instantiates the estimator and evaluator, builds the
    parameter grid and the cross-validator, fits it, and transforms the same
    dataframe with the fitted model.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check or the cross-validation
        structure check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []

    if error == ErrorTypes.NO_ERROR:
        error, extra2 = CVValiditiyChecker.check_validity(
            node["nodes"], node["edges"])

    # Only generate code when the structure check also succeeded; otherwise
    # extra2 cannot be relied on to name the estimator and evaluator nodes.
    if error == ErrorTypes.NO_ERROR:
        if "portion" in extra["dfs"][0]:
            df_name = ("df_" + extra["dfs"][0]["source_id"]
                       + "[" + str(extra["dfs"][0]["portion"]) + "]")
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors,
        }

        estimator_id = extra2["estimator_node_id"]
        evaluator_id = extra2["evaluator_node_id"]

        gen_code = []
        gen_code.extend(__generate_code_for_estimator_instantination(
            node["nodes"][estimator_id], my_args))
        gen_code.extend(__generate_code_for_evaluator_instantination(
            node["nodes"][evaluator_id], my_args))
        gen_code.extend(__generate_code_for_param_grid(
            node, 'estimator_' + estimator_id, my_args))
        gen_code.extend(__generate_code_for_cv_instantination(
            node, estimator_id, evaluator_id))
        gen_code.extend([
            'model_' + node["id"] + "=" + 'cv_' + node["id"] + ".fit("
            + df_name + ")",
            os.linesep
        ])
        # Following might not be logical unless you aim to predict on
        # training data for some specific needs.
        gen_code.extend([
            'df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform('
            + df_name + ')',
            os.linesep
        ])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
def __generate_stage_template(node, non_indicator_params, args):
    """Return one line of code that appends a stage instance to ``stages_<id>``.

    Each multi-instance indicator parameter is bound to the ``i``-th element
    of its ``mmi_value_<param>_<id>`` list. The remaining parameters are
    rendered by ``CodeGenerationUtils.handle_arguments``.
    """
    # Only estimator and transformer families are handled; any other family
    # leaves the class name empty.
    class_name = ""
    if node["family"] == NodeFamilyTypes.Estimator.value:
        class_name = node["estimator_name"]
    elif node["family"] == NodeFamilyTypes.Transformer.value:
        class_name = node["transformer_name"]

    indicator_pieces = []
    for indicator in node["multi_instance_indicator"]:
        indicator_pieces.append(
            indicator + "=" + "mmi_value_" + indicator + "_" + node["id"] + "[i]")
        indicator_pieces.append(", ")

    remaining_pieces = CodeGenerationUtils.handle_arguments(
        non_indicator_params, args)
    if not remaining_pieces:
        # No further arguments follow, so drop the trailing separator.
        indicator_pieces.pop()

    pieces = ["stages_" + node["id"], ".append(", class_name + '(']
    pieces.extend(indicator_pieces)
    pieces.extend(remaining_pieces)
    pieces.extend(["))", os.linesep])
    return ''.join(pieces)
def __generate_code_for_pipeline_instantination(node, args):
    """Generate code that builds one stage per multi-instance value.

    Multi-instance indicator parameters are emitted as
    ``mmi_value_<param>_<id>`` lists, and a generated loop appends one stage
    per element. A ``Pipeline`` is instantiated from the stages unless
    ``args["in_pipeline"]`` is present and truthy.

    Returns:
        List of code fragments.
    """
    node_id = node["id"]
    code = []
    non_indicator_params = {}

    for name, param in node["parameters"].items():
        if name in node["multi_instance_indicator"]:
            code.extend([
                "mmi_value_" + name + "_" + node_id + " = "
                + CodeGenerationUtils.handle_parameter(param, args),
                os.linesep,
            ])
        else:
            non_indicator_params[name] = param

    stages_name = "stages_" + node_id
    code.extend([stages_name, " = ", "[]", os.linesep])
    # The generated loop runs over the length of the first indicator's list.
    code.extend([
        "for i in ",
        "range(len(mmi_value_" + node["multi_instance_indicator"][0] + "_" + node_id,
        ")):",
        os.linesep,
    ])
    code.extend([
        "\t",
        __generate_stage_template(node, non_indicator_params, args),
        os.linesep,
    ])

    if "in_pipeline" not in args or not args["in_pipeline"]:
        code.extend([
            'pipeline_' + node_id + "=Pipeline(stages=",
            stages_name + ")",
            os.linesep,
        ])
    return code
def generate_code(args):
    """Generate code that reads a file into a batch dataframe.

    With an appropriate schema the generated call is
    ``spark.read.<file_type>(...)``. Otherwise any ``schema`` parameter is
    removed from the node, ``inferSchema`` is enabled when the node allows
    it, and the generated call is ``spark.read.format(<file_type>).load(...)``.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when either validity check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    call_args = {
        "node_id": node["id"],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }

    if is_schema_appropriate:
        call_prefix = ("df_" + node["id"] + "=" + "spark.read."
                       + node["file_type"] + "(")
    else:
        # For safety, but consider it again
        node["parameters"].pop("schema", None)
        if node["can_infer_schema"]:
            node["parameters"]["inferSchema"] = {"value": True, "type": "boolean"}
        call_prefix = ("df_" + node["id"] + "=" + "spark.read.format("
                       + CodeGenerationUtils.handle_primitive(node["file_type"])
                       + ").load(")

    gen_code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"], call_prefix, call_args)
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
def __handle_expression_string_only(node, args):
    """Generate code that calls ``node["sql_name"]`` with one expression string.

    The output dataframe is first bound to the input dataframe, then
    reassigned to the result of the call.

    Returns:
        List of code fragments.
    """
    target_df = "df_" + args["node_id"]
    expression = CodeGenerationUtils.handle_primitive(
        node["parameters"]["expression"]["value"])
    return [
        target_df + " = " + args["input_dfs"][0],
        os.linesep,
        target_df + " = " + target_df + "." + node["sql_name"] + "("
        + expression + ")",
        os.linesep,
    ]
def __single_generation(node, df_name, args):
    """Generate code that instantiates one transformer and applies it to ``df_name``.

    Returns:
        List of code fragments: the instantiation, followed by the
        ``transform`` call assigned to ``df_<id>``.
    """
    transformer_var = 'transformer_' + node["id"]
    code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"],
        transformer_var + ' = ' + node["transformer_name"] + '(',
        args)
    transform_line = ('df_' + node["id"] + "=" + transformer_var
                      + '.transform(' + df_name + ')')
    code.extend([transform_line, os.linesep])
    return code
def __handle_in_op_out_trio(node, args):
    """Generate an ``agg`` call from a list of (input, operation, output) trios.

    Each trio becomes ``F.<operation>(<input>).alias(<output>)``.

    Returns:
        List of code fragments.
    """
    target_df = "df_" + args["node_id"]
    code = [
        target_df + " = " + args["input_dfs"][0],
        os.linesep,
        target_df + " = " + target_df + ".agg(",
    ]
    for trio in node["parameters"]["in_op_out_trio_list"]["value"]:
        input_column = CodeGenerationUtils.handle_primitive(
            trio["input_column"]["value"])
        output_column = CodeGenerationUtils.handle_primitive(
            trio["output_column"]["value"])
        code.append(
            "F." + trio["operation"]["value"] + "(" + input_column + ").alias("
            + output_column + ")")
        code.append(", ")

    # Assuming that there is at least 1 agg request; this drops the trailing
    # separator. Check this and produce an error before getting here...
    code.pop()
    code.extend([")", os.linesep])
    return code
def generate_code(args):
    """Generate code that propagates a flag column forward within a window.

    For each (input, output) column pair, the generated code copies the input
    into a ``temp`` column, repeatedly combines it with its lagged value over
    a partitioned and ordered window, and writes 1.0 or 0.0 to the output
    column depending on whether ``temp`` is positive.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []

    if error == ErrorTypes.NO_ERROR:
        if "portion" in extra["dfs"][0]:
            df_name = ("df_" + extra["dfs"][0]["source_id"]
                       + "[" + str(extra["dfs"][0]["portion"]) + "]")
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors,
        }

        input_cols = CodeGenerationUtils.handle_parameter(
            node["parameters"]["input_cols"], my_args)
        output_cols = CodeGenerationUtils.handle_parameter(
            node["parameters"]["output_cols"], my_args)
        window_size = node["parameters"]["window_size"]["value"]
        partitioning_column = node["parameters"]["partitioning_column"]["value"]
        ordering_column = node["parameters"]["ordering_column"]["value"]
        ordering_direction = node["parameters"]["ordering_direction"]["value"]

        gen_code = []
        # Each generated list must be bound to its own name: the generated
        # loop reads col(inC) and writes outC, in zip(input_cols, output_cols) order.
        gen_code.extend(["input_cols = " + input_cols, os.linesep])
        gen_code.extend(["output_cols = " + output_cols, os.linesep])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        gen_code.extend(
            ["for inC, outC in zip(input_cols, output_cols):", os.linesep])
        gen_code.extend([
            "\tdf_" + node["id"] + " = df_" + node["id"]
            + ".withColumn('temp', col(inC))",
            os.linesep
        ])
        gen_code.extend([
            "\twSpec = Window.partitionBy('" + partitioning_column
            + "').orderBy(col('" + ordering_column + "')."
            + ordering_direction + "())",
            os.linesep
        ])
        gen_code.extend(
            ["\tfor j in range(" + str(window_size) + "):", os.linesep])
        gen_code.extend(
            ["\t\tlag_values = lag('temp', default=0).over(wSpec)", os.linesep])
        gen_code.extend([
            "\t\tdf_" + node["id"] + " = df_" + node["id"]
            + ".withColumn('temp', F.when((col('temp')==1) | (lag_values==None) | (lag_values<1) | (lag_values>="
            + str(window_size + 1)
            + "), col('temp')).otherwise(lag_values+1))",
            os.linesep
        ])
        gen_code.extend([
            "\tdf_" + node["id"] + " = df_" + node["id"]
            + ".withColumn(outC, F.when(col('temp') > 0, 1.0).otherwise(0.0))",
            os.linesep
        ])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
def __single_generation(node, df_name, args, model_elements_save_paths):
    """Generate code that instantiates, fits and applies one estimator.

    When ``model_elements_save_paths`` is not ``None``, one parquet save
    statement is also generated per listed model element.

    Returns:
        List of code fragments.
    """
    estimator_var = 'estimator_' + node["id"]
    model_var = 'model_' + node["id"]

    code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"],
        estimator_var + ' = ' + node["estimator_name"] + '(',
        args)
    code.extend([
        model_var + "=" + estimator_var + ".fit(" + df_name + ")",
        os.linesep,
    ])
    code.extend([
        'df_' + node["id"] + "=" + model_var + '.transform(' + df_name + ')',
        os.linesep,
    ])

    if model_elements_save_paths is None:
        return code

    for element, save_path in model_elements_save_paths.items():
        code.extend([
            model_var + "." + element + ".write.format('parquet').save("
            + CodeGenerationUtils.handle_primitive(save_path["value"]) + ")",
            os.linesep,
        ])
    return code
def __handle_col_list_plus_kwargs(node, args):
    """Generate a ``node["sql_name"]`` call taking a column list plus keyword arguments.

    Keyword arguments are read from the optional ``kwargs`` parameter and
    appended after the input columns as ``, name=value``.

    Returns:
        List of code fragments.
    """
    params = node["parameters"]

    # If every kwarg is optional and the user provides none, an empty
    # dictionary is assumed here.
    keyword_pieces = []
    if "kwargs" in params:
        for name, param in params["kwargs"]["value"].items():
            keyword_pieces.extend([
                ", ", name, "=",
                CodeGenerationUtils.handle_parameter(param, args),
            ])

    target_df = "df_" + args["node_id"]
    code = [target_df + " = " + args["input_dfs"][0], os.linesep]
    code.append(
        target_df + " = " + target_df + "." + node["sql_name"] + "("
        + CodeGenerationUtils.handle_parameter(params["input_columns"], args))
    code.extend(keyword_pieces)
    code.extend([")", os.linesep])
    return code
def generate_code(args):
    """Generate code that writes the incoming dataframe to a file.

    When the node has a ``path`` parameter, that path is also appended to
    ``args["additional_info"]["written_tables"]``.

    Args:
        args: Dict providing ``node``, ``requireds_info``, ``edges`` and
            ``additional_info``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    call_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    call_prefix = (df_name + ".write.format("
                   + CodeGenerationUtils.handle_primitive(node["file_type"])
                   + ").save(")
    gen_code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"], call_prefix, call_args)
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)

    # Record the table this node writes to.
    if "path" in node["parameters"]:
        args["additional_info"]["written_tables"].append(
            {"table_path": node["parameters"]["path"]["value"]})
    return final_code, shared_function_set, error
def test_handle_parameter_for_primitive(
        self, create_parameter_for_primitive,
        create_accompanying_arg_for_parameter):
    """Check how ``handle_parameter`` renders a primitive value.

    Strings are expected wrapped in double quotes; any other value is
    expected as its ``str()`` form.
    """
    value = create_parameter_for_primitive["value"]
    if isinstance(value, str):
        expected_val = "\"" + value + "\""
    else:
        expected_val = str(value)

    actual_val = CodeGenerationUtils.handle_parameter(
        create_parameter_for_primitive, create_accompanying_arg_for_parameter)
    assert actual_val == expected_val
def generate_code(args):
    """Generate code that computes a correlation matrix and exposes it as a dataframe.

    The generated code calls ``Correlation.corr`` on the incoming dataframe,
    takes the result as an array, and turns its rows into ``df_<id>``.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    correlation_var = "correlation_" + node["id"]
    result_var = "result_array_" + node["id"]
    column = CodeGenerationUtils.handle_primitive(
        node["parameters"]["column"]["value"])
    method = CodeGenerationUtils.handle_primitive(
        node["parameters"]["method"]["value"])

    final_code = [
        correlation_var + " = " + "Correlation.corr(" + df_name + ", " + column
        + ", " + method + ")",
        os.linesep,
        result_var + " = ",
        correlation_var + ".head()[0].toArray()",
        os.linesep,
        # In the future, dynamically name columns according to an
        # appropriate convention...
        "df_" + node["id"] + " = sc.parallelize(" + result_var + ")",
        ".map(lambda x: [float(i) for i in x]).toDF()",
        os.linesep,
    ]
    return final_code, shared_function_set, error
def generate_code(args):
    """Generate code that reads files from a path as a streaming dataframe.

    The node must have no incoming dataframe or model, and must pass
    ``DataSourceValidityChecker``. The generated call is
    ``spark.readStream.schema(<schema>).<file_type>(<path>)``.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when either validity check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []

    if error == ErrorTypes.NO_ERROR:
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)

    if error == ErrorTypes.NO_ERROR:
        my_args = {
            "node_id": node["id"],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors,
        }
        # Must be a valid schema at this point. The schema is read from the
        # same "parameters" dict as the path below.
        param_string = CodeGenerationUtils.handle_parameter(
            node["parameters"]["schema"], my_args)

        gen_code = []
        gen_code.extend([
            "df_" + node["id"] + ' = spark.readStream.schema(' + param_string
            + ")." + node["file_type"] + "("
            + CodeGenerationUtils.handle_primitive(
                node["parameters"]["path"]["value"]) + ")",
            os.linesep
        ])
        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
def generate_code(args):
    """Generate code that streams the incoming dataframe to files.

    On success the output path is also appended to
    ``args["additional_info"]["written_tables"]``.

    Args:
        args: Dict providing ``node``, ``requireds_info``, ``edges`` and
            ``additional_info``.

    Returns:
        Tuple ``(code, shared_function_set, error)``. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    params = node["parameters"]
    query_name = "query_" + node["id"]

    code.append(
        query_name + "=" + df_name + ".writeStream.format("
        + CodeGenerationUtils.handle_primitive(node["file_type"]) + ")")
    code.append(".trigger(" + __generate_trigger_code(node) + ")")
    code.append(
        '.option("path", '
        + CodeGenerationUtils.handle_primitive(params["path"]["value"]) + ")")
    code.append(
        '.option("checkpointLocation", '
        + CodeGenerationUtils.handle_primitive(params["checkpointLocation"]["value"])
        + ").start()")
    code.extend([os.linesep, query_name, ".awaitTermination()", os.linesep])

    # Record the table this node writes to.
    args["additional_info"]["written_tables"].append(
        {"table_path": params["path"]["value"]})
    return code, shared_function_set, error
def __generate_code_for_param_grid(node, cur_estimator_name, args):
    """Generate the assignment of ``param_grid_<id>``.

    With an empty ``parameter_grid`` the grid is assigned ``None``. Otherwise
    a ``ParamGridBuilder()`` chain is generated with one ``addGrid`` call per
    grid parameter, ending in ``build()``.

    Returns:
        List of code fragments.
    """
    # In the future handle this in special requirement handler for parameters.
    # Fixed parameters are assumed to be given on the estimator itself.
    assignment = "param_grid_" + node["id"] + "="
    grid_params = node["parameters"]["parameter_grid"]

    if not grid_params:
        return [assignment, "None", os.linesep]

    code = [assignment, "ParamGridBuilder()"]
    for name, param in grid_params.items():
        code.append(
            ".addGrid(" + cur_estimator_name + "." + name + ", "
            + CodeGenerationUtils.handle_parameter(param, args) + ")")
    code.extend([".build()", os.linesep])
    return code
def generate_code(args):
    """Generate code that saves the single incoming model to a path.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(code, shared_function_set, error)``. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming model and no incoming dataframe are accepted.
    expected_inputs = {"df_count": {0}, "model_count": {1}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return code, shared_function_set, error

    model_var = "model_" + extra["models"][0]["source_id"]
    model_path = CodeGenerationUtils.handle_primitive(
        node["parameters"]["model_path"]["value"])
    code = [model_var + ".save(" + model_path + ")", os.linesep]
    return code, shared_function_set, error
def generate_code(args):
    """Generate code that instantiates, fits and applies an estimator node.

    The ``model_elements_save_paths`` parameter, when present, is removed
    from the node. It is passed only to single-instance generation.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    expected_inputs = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    source = extra["dfs"][0]
    df_name = "df_" + source["source_id"]
    if "portion" in source:
        df_name += "[" + str(source["portion"]) + "]"

    generation_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }

    # Take the save paths out of the parameters so they are not rendered as
    # estimator arguments.
    parameters = node["parameters"]
    model_elements_save_paths = None
    if "model_elements_save_paths" in parameters:
        model_elements_save_paths = parameters["model_elements_save_paths"]["value"]
        del parameters["model_elements_save_paths"]

    # The multi_instance_indicator decides between multi-instance and
    # usual (single) generation.
    if MultiInstanceHandlerUtils.should_generate_multiple_instances(node):
        gen_code = MultiInstanceHandlerUtils.multi_instance_generation(
            node, df_name, generation_args)
    else:
        gen_code = __single_generation(
            node, df_name, generation_args, model_elements_save_paths)

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    """Generate code that joins the two incoming dataframes on a column.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(code, shared_function_set, error)``. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly two incoming dataframes and no incoming model are accepted.
    expected_inputs = {"df_count": {2}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, expected_inputs)

    code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return code, shared_function_set, error

    df_names = __get_dfs_to_join(extra)
    join_column = CodeGenerationUtils.handle_primitive(
        node["parameters"]["join_column"]["value"])
    join_line = ("df_" + node["id"] + "=" + df_names[0] + ".join("
                 + df_names[1] + ", " + join_column + ")")
    code.extend([join_line, os.linesep])
    return code, shared_function_set, error
def test_handle_parameter_for_primitive_array(
        self, create_parameter_for_primitive_array,
        create_accompanying_arg_for_parameter):
    """Check how ``handle_parameter`` renders an array of primitives.

    The expected text is a bracketed, comma-separated list in which strings
    are wrapped in double quotes and other values use their ``str()`` form.
    """
    primitive_array = create_parameter_for_primitive_array["value"]
    rendered = [
        "\"" + elem + "\"" if isinstance(elem, str) else str(elem)
        for elem in primitive_array
    ]
    # Joining also gives "[]" for an empty array.
    expected_val = "[" + ", ".join(rendered) + "]"

    actual_val = CodeGenerationUtils.handle_parameter(
        create_parameter_for_primitive_array,
        create_accompanying_arg_for_parameter)
    # Compare whole strings: a character-wise zip stops at the shorter one
    # and would accept an empty or truncated result.
    assert actual_val == expected_val
def generate_code(args):
    """Generate code that computes rolling statistics over lagged windows.

    All settings are read from ``node["parameters"]["rolling_stats_info"]``.
    The generated code loops over the configured lags and, for each column
    group, adds one ``selectExpr`` column. Depending on ``between_operation``
    the expression uses one argument ('Identity') or combines two arguments.
    Each argument is either the raw column ('Identity' operation) or a
    windowed aggregate of it.

    Args:
        args: Dict providing ``node``, ``requireds_info`` and ``edges``.

    Returns:
        Tuple ``(final_code, shared_function_set, error)``. ``final_code`` is
        an empty list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    # Exactly one incoming dataframe and no incoming model are accepted.
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        # Resolve the variable name of the incoming dataframe.
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }
        gen_code = []
        # The output dataframe starts as the input and is extended column by column.
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        between_operation = node["parameters"]["rolling_stats_info"]["value"][
            "between_operation"]["value"]
        first_argument_input_cols = CodeGenerationUtils.handle_parameter(
            node["parameters"]["rolling_stats_info"]["value"]["first_argument"]
            ["value"]["input_cols"], my_args)
        first_argument_operation = node["parameters"]["rolling_stats_info"][
            "value"]["first_argument"]["value"]["operation"]["value"]
        gen_code.extend(
            ["first_cols = " + first_argument_input_cols, os.linesep])
        output_cols = CodeGenerationUtils.handle_parameter(
            node["parameters"]["rolling_stats_info"]["value"]["output_cols"],
            my_args)
        gen_code.extend(["output_cols = " + output_cols, os.linesep])
        partitioning_column = node["parameters"]["rolling_stats_info"][
            "value"]["partitioning_column"]["value"]
        ordering_column = node["parameters"]["rolling_stats_info"]["value"][
            "ordering_column"]["value"]
        ordering_direction = node["parameters"]["rolling_stats_info"]["value"][
            "ordering_direction"]["value"]
        lags = node["parameters"]["rolling_stats_info"]["value"]["lags"]
        lags_str = CodeGenerationUtils.handle_parameter(lags, my_args)
        # SQL window clause. The embedded '+ str(lag) +' closes and reopens
        # the single-quoted string in the GENERATED code, so the frame size is
        # taken from the generated loop variable `lag` at run time.
        window_str = "over (partition by " + partitioning_column + " order by " + ordering_column + " " + ordering_direction + " rows " + "'+ str(lag) +'" + " preceding) "
        # An earlier variant (kept for reference) used a fixed window_size and
        # "rows unbounded preceding" when window_size == -1:
        # if window_size == -1:
        #     window_str = "over (partition by " + partition_column + " order by " + ordering_column + " " + ordering_direction + " rows unbounded preceding) "
        # else:
        #     window_str = "over (partition by " + partition_column + " order by " + ordering_column + " " + ordering_direction + " rows " + str(window_size) + " preceding) "
        # NOTE(review): in every select_str below the alias is the literal
        # text 'out_col' followed by the lag; the generated loop variable
        # out_col is not used inside the expression. Confirm this is intended.
        if between_operation != 'Identity':
            # Two-argument case: first <between_operation> second.
            second_argument_input_cols = CodeGenerationUtils.handle_parameter(
                node["parameters"]["rolling_stats_info"]["value"]
                ["second_argument"]["value"]["input_cols"], my_args)
            second_argument_operation = node["parameters"][
                "rolling_stats_info"]["value"]["second_argument"]["value"][
                    "operation"]["value"]
            gen_code.extend(
                ["second_cols = " + second_argument_input_cols, os.linesep])
            loop_str = "for col_1,col_2,out_col in zip(first_cols, second_cols, output_cols):"
            if first_argument_operation == 'Identity':
                if second_argument_operation == 'Identity':
                    # raw first column, raw second column
                    select_str = "df_" + node["id"] + " = df_" + node[
                        "id"] + ".selectExpr('*', col_1 + ' " + between_operation + " '+ col_2 + ' as out_col' + str(lag))"
                else:
                    # raw first column, windowed aggregate of second column
                    select_str = "df_" + node["id"] + " = df_" + node[
                        "id"] + ".selectExpr('*', col_1 + ' " + between_operation + " ' + '" + second_argument_operation + "(' + col_2 + ') " + window_str + "as out_col' + str(lag))"
            else:
                if second_argument_operation == 'Identity':
                    # windowed aggregate of first column, raw second column
                    select_str = "df_" + node["id"] + " = df_" + node[
                        "id"] + ".selectExpr('*', '" + first_argument_operation + "(' + col_1 + ') " + window_str + between_operation + " ' + col_2 + ' as out_col' + str(lag))"
                else:
                    # windowed aggregates of both columns
                    select_str = "df_" + node["id"] + " = df_" + node[
                        "id"] + ".selectExpr('*', '" + first_argument_operation + "(' + col_1 + ') " + window_str + between_operation + " " + second_argument_operation + "(' + col_2 + ') " + window_str + "as out_col' + str(lag))"
        else:
            # Single-argument case: only the first argument is used.
            loop_str = "for col_1,out_col in zip(first_cols, output_cols):"
            if first_argument_operation == 'Identity':
                select_str = "df_" + node["id"] + " = df_" + node[
                    "id"] + ".selectExpr('*', col_1 + ' as out_col' + str(lag))"
            else:
                select_str = "df_" + node["id"] + " = df_" + node[
                    "id"] + ".selectExpr('*', '" + first_argument_operation + "(' + col_1 + ') " + window_str + "as out_col' + str(lag))"
        # Generated structure: outer loop over lags, inner loop over column
        # groups, one selectExpr per iteration.
        gen_code.extend(["lags = " + lags_str, os.linesep])
        gen_code.extend(["for lag in lags:", os.linesep])
        gen_code.extend(["\t", loop_str, os.linesep])
        gen_code.extend(["\t\t" + select_str, os.linesep])
        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)
    return final_code, shared_function_set, error