def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            # Must be a valid schema at this point.
            code.append("schema_" + node["id"] + "=")
            code.extend([CodeGenerationUtils.arrange_schema(node["parameter"]["schema"]), os.linesep])
            code.extend(["df_" + node["id"] + ' = spark.readStream.schema(schema_' + node["id"] + ")." +
                         node["file_type"] + "(" +
                         CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) + ")",
                         os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code,
                   "errors": errors}
        gen_code = CodeGenerationUtils.handle_instantination_or_call(
            node["parameters"],
            'df_' + node["id"] + '=' + df_name + '.' + node["ddfo_name"] + '(',
            my_args)
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"
        code = ['estimator_' + node["id"] + ' = ' + node["estimator_name"] + '(']
        for param in node["parameters"]:
            code.extend([param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param]), ", "])
        if (len(node["parameters"]) > 0):
            code.pop()  # drop the trailing ", "
        code.extend([")", os.linesep])
        # df_name already carries the "df_" prefix; do not prepend it again.
        code.extend(['model_' + node["id"] + "=" + 'estimator_' + node["id"] + ".fit(" + df_name + ")", os.linesep])
        code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' + df_name + ')', os.linesep])
    return code, error
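# Illustrative sketch of the snippet this generator emits (hypothetical inputs:
# node id "2", estimator_name "LogisticRegression", parameters {"maxIter": 10},
# upstream dataframe "df_1"; assumes arrange_parameter_value renders literals):
#
#   estimator_2 = LogisticRegression(maxIter=10)
#   model_2=estimator_2.fit(df_1)
#   df_2=model_2.transform(df_1)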
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"
        code.append(df_name + '.selectExpr("CAST(' + node["parameters"]["unique_column_name"] +
                    ' AS STRING) AS key", "to_json(struct(*)) AS value")'
                    '.write.format("kafka").option("kafka.bootstrap.servers", ')
        code.append(CodeGenerationUtils.arrange_parameter_value(
            node["parameters"]["host"] + ":" + node["parameters"]["port"]) + ")")
        code.extend(['.option("topic", ' +
                     CodeGenerationUtils.arrange_parameter_value(node["parameters"]["topic"]) + ").save()",
                     os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {"node_id": node["id"], "shared_function_set": shared_function_set,
                       "additional_local_code": additional_local_code, "errors": errors}
            # Must be a valid schema at this point.
            additional_code, param_string = CodeGenerationUtils.handle_parameter(node["parameter"]["schema"], my_args)
            gen_code = []
            gen_code.extend(additional_code)
            gen_code.append("df_" + node["id"] + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ')
            gen_code.append(CodeGenerationUtils.handle_primitive(
                node["parameters"]["host"]["value"] + ":" + node["parameters"]["port"]["value"]) + ")")
            gen_code.append('.option("subscribe", ' +
                            CodeGenerationUtils.handle_primitive(node["parameters"]["topic"]["value"]) + ")")
            gen_code.append('.load().select(from_json(col("value").cast("string"), ' + param_string + ")")
            # For streams, keep the timestamp around so it can serve as a key when writing back to a Kafka topic.
            gen_code.extend(['.alias("value"), "timestamp").select("value.*", "timestamp")', os.linesep])
            final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
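# Hypothetical example of the emitted stream reader (assumed inputs: node id "4",
# host "localhost", port "9092", topic "events", and a schema rendered as
# "schema_4" by handle_parameter; handle_primitive is assumed to quote strings).
# The generator emits this as a single line; it is wrapped here for readability:
#
#   df_4 = spark.readStream.format("kafka")
#       .option("kafka.bootstrap.servers", "localhost:9092")
#       .option("subscribe", "events")
#       .load().select(from_json(col("value").cast("string"), schema_4)
#       .alias("value"), "timestamp").select("value.*", "timestamp")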
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    shared_function_set = set()
    errors = []
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        shared_function_set.add(SharedFunctionTypes.VECTOR_DISASSEMBLER)
        code = ["df_" + node["id"] + " = " + "vector_disassembler(" + df_name + ", " +
                CodeGenerationUtils.handle_primitive(node["parameters"]["vector_column"]["value"]) + ")",
                os.linesep]
    return code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {1}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        model_id = extra["models"][0]["source_id"]
        code = ["df_" + node["id"] + "=" + "model_" + model_id + ".transform(" + df_name + ")", os.linesep]
    return code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code, "errors": errors}
        # Depending on the column that multi_instance_indicator points to, decide whether
        # to apply multi-instance generation or the usual single-instance generation.
        if (MultiInstanceHandlerUtils.should_generate_multiple_instances(node)):
            gen_code = MultiInstanceHandlerUtils.multi_instance_generation(node, df_name, my_args)
        else:
            gen_code = __single_generation(node, df_name, my_args)
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            # Copy the keys so we can remove the ones handled explicitly below.
            remaining_params = list(node["parameters"].keys())
            remaining_params.remove("file_path")
            if (is_schema_appropriate):
                code.append("schema_" + node["id"] + "=")
                code.extend([CodeGenerationUtils.arrange_schema(node["parameter"]["schema"]), os.linesep])
                code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" +
                            CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) +
                            ", " + "schema=" + "schema_" + node["id"])
                remaining_params.remove("schema")
            else:
                if (node["can_infer_schema"]):
                    code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" +
                                CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) +
                                ", " + "inferSchema=" + "True")
                else:
                    code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" +
                                CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]))
            for param in remaining_params:
                code.extend([", " + param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param])])
            code.extend([")", os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"
        code.extend(["df_" + node["id"] + "=" + df_name + ".randomSplit(" +
                     CodeGenerationUtils.arrange_parameter_value(node["parameters"]["weights"]) + ", " +
                     CodeGenerationUtils.arrange_parameter_value(node["parameters"]["seed"]) + ")",
                     os.linesep])
    return code, error
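# Emitted snippet sketch (hypothetical inputs: node id "3", upstream "df_1",
# weights [0.8, 0.2], seed 42; assumes arrange_parameter_value renders literals):
#
#   df_3=df_1.randomSplit([0.8, 0.2], 42)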
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        # Check the first incoming df (not the list itself) for a portion, as in the sibling generators.
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code, "errors": errors}
        updated_function_name = CodeGenerationUtils.handle_parameter(node["parameters"]["udf_function"], my_args)
        gen_code = []
        gen_code.extend(["udf_" + node["id"] + " = udf(" + updated_function_name + ", " +
                         node["parameters"]["udf_return_type"]["value"] + "())", os.linesep])
        gen_code.extend(["tuple_list = " + CodeGenerationUtils.handle_parameter(node["parameters"]["udf_input_tuples"], my_args), os.linesep])
        gen_code.extend(["output_list = " + CodeGenerationUtils.handle_parameter(node["parameters"]["udf_outputs"], my_args), os.linesep])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        gen_code.extend(["for index in range(len(tuple_list)):", os.linesep])
        gen_code.extend(["\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn(output_list[index], udf_" + node["id"] + "(*tuple_list[index]))", os.linesep, os.linesep])
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
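# Sketch of the emitted UDF-application loop (hypothetical inputs: node id "5",
# upstream "df_1", a function rendered as "my_func", return type "StringType",
# one input tuple and one output column):
#
#   udf_5 = udf(my_func, StringType())
#   tuple_list = [("colA", "colB")]
#   output_list = ["out1"]
#   df_5=df_1
#   for index in range(len(tuple_list)):
#       df_5 = df_5.withColumn(output_list[index], udf_5(*tuple_list[index]))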
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, extra2 = CVValiditiyChecker.check_validity(node["nodes"], node["edges"])
        if (error == ErrorTypes.NO_ERROR):
            if ("portion" in extra["dfs"][0]):
                df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
            else:
                df_name = "df_" + extra["dfs"][0]["source_id"]
            my_args = {"node_id": node["id"], "input_dfs": [df_name],
                       "shared_function_set": shared_function_set,
                       "additional_local_code": additional_local_code, "errors": errors}
            gen_code = []
            gen_code.extend(__generate_code_for_estimator_instantination(node["nodes"][extra2["estimator_node_id"]], my_args))
            gen_code.extend(__generate_code_for_evaluator_instantination(node["nodes"][extra2["evaluator_node_id"]], my_args))
            gen_code.extend(__generate_code_for_param_grid(node, 'estimator_' + extra2["estimator_node_id"], my_args))
            gen_code.extend(__generate_code_for_cv_instantination(node, extra2["estimator_node_id"], extra2["evaluator_node_id"]))
            gen_code.extend(['model_' + node["id"] + "=" + 'cv_' + node["id"] + ".fit(" + df_name + ")", os.linesep])
            # The following might not be logical unless you aim to predict on the training data for some specific need.
            gen_code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' + df_name + ')', os.linesep])
            final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    if (error == ErrorTypes.NO_ERROR):
        error, pipeline_order = PipelineValidityChecker.check_validity(node["nodes"], node["edges"])
        if (error == ErrorTypes.NO_ERROR):
            if ("portion" in extra["dfs"][0]):
                df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
            else:
                df_name = "df_" + extra["dfs"][0]["source_id"]
            my_args = {"node_id": node["id"], "input_dfs": [df_name],
                       "shared_function_set": shared_function_set,
                       "additional_local_code": additional_local_code, "errors": errors}
            gen_code, error = __generate_stages(node["nodes"], pipeline_order, df_name, my_args)
            if (error == ErrorTypes.NO_ERROR):
                gen_code.append(os.linesep)
                gen_code.extend(__generate_code_for_pipeline_instantination(node, pipeline_order, my_args))
                gen_code.extend(['model_' + node["id"] + "=" + 'pipeline_' + node["id"] + ".fit(" + df_name + ")", os.linesep])
                # The following might not be logical for pipelines with an estimator.
                gen_code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' + df_name + ')', os.linesep])
                final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        code = ["model_" + node["id"] + "=" + node["parameters"]["model_type"]["value"] + ".load(" +
                CodeGenerationUtils.handle_primitive(node["parameters"]["model_path"]["value"]) + ")",
                os.linesep]
    return code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code, "errors": errors}
        gen_code = []
        shared_function_set.add(SharedFunctionTypes.SELECT_EXPR_HELPERS)
        gen_code.append("df_" + node["id"] + "=" + df_name + ".selectExpr(")
        for expr in node["parameters"]["expressions"]["value"]:
            gen_code.extend(['single_select_expr_generator(' +
                             CodeGenerationUtils.handle_parameter(expr["input_cols"], my_args) + ', ' +
                             CodeGenerationUtils.handle_parameter(expr["output_cols"], my_args) + ', ' +
                             CodeGenerationUtils.handle_parameter(expr["operation"], my_args) + ')', ', '])
        gen_code.pop()  # drop the trailing ", "; assumes at least one expression
        gen_code.extend([")", os.linesep])
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        # Assign the streaming query to query_<id> so the awaitTermination() emitted below has a target.
        code.append("query_" + node["id"] + "=" + df_name + '.selectExpr("CAST(' +
                    node["parameters"]["unique_column_name"]["value"] +
                    ' AS STRING) AS key", "to_json(struct(*)) AS value")'
                    '.writeStream.format("kafka").option("kafka.bootstrap.servers", ')
        code.append(CodeGenerationUtils.handle_primitive(
            node["parameters"]["host"]["value"] + ":" + node["parameters"]["port"]["value"]) + ")")
        code.append(".trigger(" + __generate_trigger_code(node) + ")")
        code.append('.option("topic", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["topic"]["value"]) + ")")
        code.append('.option("checkpointLocation", ' +
                    CodeGenerationUtils.handle_primitive(node["parameters"]["checkpointLocation"]["value"]) + ").start()")
        code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep])
        args["additional_info"]["written_topics"].append({
            "topic_name": node["parameters"]["topic"]["value"],
            "host": node["parameters"]["host"]["value"],
            "port": node["parameters"]["port"]["value"]})
    return code, shared_function_set, error
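# Sketch of the emitted Kafka sink (hypothetical inputs: node id "6", upstream
# "df_1", unique_column_name "id", host:port "localhost:9092", topic "events",
# checkpoint "/tmp/cp"; the trigger clause depends on __generate_trigger_code).
# Wrapped here for readability; the generator emits it as a single line:
#
#   query_6=df_1.selectExpr("CAST(id AS STRING) AS key",
#       "to_json(struct(*)) AS value").writeStream.format("kafka")
#       .option("kafka.bootstrap.servers", "localhost:9092")
#       .trigger(...).option("topic", "events")
#       .option("checkpointLocation", "/tmp/cp").start()
#   query_6.awaitTermination()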
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {"node_id": node["id"], "shared_function_set": shared_function_set,
                       "additional_local_code": additional_local_code, "errors": errors}
            if (is_schema_appropriate):
                gen_code = CodeGenerationUtils.handle_instantination_or_call(
                    node["parameters"],
                    "df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(",
                    my_args)
            else:
                # For safety; revisit this decision later.
                if ("schema" in node["parameters"]):
                    del node["parameters"]["schema"]
                if (node["can_infer_schema"]):
                    node["parameters"]["inferSchema"] = {"value": True, "type": "boolean"}
                gen_code = CodeGenerationUtils.handle_instantination_or_call(
                    node["parameters"],
                    "df_" + node["id"] + "=" + "spark.read.format(" +
                    CodeGenerationUtils.handle_primitive(node["file_type"]) + ").load(",
                    my_args)
            final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {2}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        df_names = __get_dfs_to_join(extra)
        code.extend(["df_" + node["id"] + "=" + df_names[0] + ".join(" + df_names[1] + ", " +
                     CodeGenerationUtils.arrange_parameter_value(node["parameters"]["join_column"]) + ")",
                     os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, extra2 = CVValiditiyChecker.check_validity(node["nodes"], node["edges"])
        if (error == ErrorTypes.NO_ERROR):
            if (bool(extra["dfs"])):
                df_name = "df_" + extra["dfs"][0]
            else:
                df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"
            code.extend(__generate_code_for_estimator_instantination(node["nodes"][extra2["estimator_node_id"]]))
            code.extend(__generate_code_for_evaluator_instantination(node["nodes"][extra2["evaluator_node_id"]]))
            code.extend(__generate_code_for_param_grid(node, 'estimator_' + extra2["estimator_node_id"]))
            code.extend(__generate_code_for_cv_instantination(node, extra2["estimator_node_id"], extra2["evaluator_node_id"]))
            code.extend(['model_' + node["id"] + "=" + 'cv_' + node["id"] + ".fit(" + df_name + ")", os.linesep])
            # The following might not be logical unless you aim to predict on the training data for some specific need.
            code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' + df_name + ')', os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {1}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        model_id = extra["models"][0]
        code = ["model_" + model_id + ".save(" +
                CodeGenerationUtils.arrange_parameter_value(node["parameters"]["model_path"]) + ")",
                os.linesep]
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code, "errors": errors}
        input_cols = CodeGenerationUtils.handle_parameter(node["parameters"]["input_cols"], my_args)
        output_cols = CodeGenerationUtils.handle_parameter(node["parameters"]["output_cols"], my_args)
        window_size = node["parameters"]["window_size"]["value"]
        partitioning_column = node["parameters"]["partitioning_column"]["value"]
        ordering_column = node["parameters"]["ordering_column"]["value"]
        ordering_direction = node["parameters"]["ordering_direction"]["value"]
        gen_code = []
        # Emit each rendered list under its matching name (they were previously swapped).
        gen_code.extend(["input_cols = " + input_cols, os.linesep])
        gen_code.extend(["output_cols = " + output_cols, os.linesep])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        gen_code.extend(["for inC, outC in zip(input_cols, output_cols):", os.linesep])
        gen_code.extend(["\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn('temp', col(inC))", os.linesep])
        gen_code.extend(["\twSpec = Window.partitionBy('" + partitioning_column + "').orderBy(col('" + ordering_column + "')." + ordering_direction + "())", os.linesep])
        gen_code.extend(["\tfor j in range(" + str(window_size) + "):", os.linesep])
        gen_code.extend(["\t\tlag_values = lag('temp', default=0).over(wSpec)", os.linesep])
        gen_code.extend(["\t\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn('temp', F.when((col('temp')==1) | (lag_values==None) | (lag_values<1) | (lag_values>=" + str(window_size + 1) + "), col('temp')).otherwise(lag_values+1))", os.linesep])
        gen_code.extend(["\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn(outC, F.when(col('temp') > 0, 1.0).otherwise(0.0))", os.linesep])
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
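# Sketch of the emitted windowed-lag loop (hypothetical inputs: node id "7",
# upstream "df_1", window_size 3, partitioning_column "part", ordering_column
# "ts", ordering_direction "desc"; inner F.when(...) update elided):
#
#   input_cols = ["a"]
#   output_cols = ["b"]
#   df_7=df_1
#   for inC, outC in zip(input_cols, output_cols):
#       df_7 = df_7.withColumn('temp', col(inC))
#       wSpec = Window.partitionBy('part').orderBy(col('ts').desc())
#       for j in range(3):
#           lag_values = lag('temp', default=0).over(wSpec)
#           ...  # F.when(...) update of 'temp', bounded by window_size + 1
#       df_7 = df_7.withColumn(outC, F.when(col('temp') > 0, 1.0).otherwise(0.0))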
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            # Must be a valid schema at this point.
            code.append("schema_" + node["id"] + "=")
            code.extend([CodeGenerationUtils.arrange_schema(node["parameter"]["schema"]), os.linesep])
            code.append("df_" + node["id"] + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ')
            code.append(CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["host"] + ":" + node["parameters"]["port"]) + ")")
            code.append('.option("subscribe", ' +
                        CodeGenerationUtils.arrange_parameter_value(node["parameters"]["topic"]) + ")")
            code.append('.load().select(from_json(col("value").cast("string"), schema_' + node["id"] + ")")
            # For streams, keep the timestamp around so it can serve as a key when writing back to a Kafka topic.
            code.extend(['.alias("value"), "timestamp").select("value.*", "timestamp")', os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"
        code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format(" +
                    CodeGenerationUtils.arrange_parameter_value(node["file_type"]) + ")")
        code.append(".trigger(" + __generate_trigger_code(node) + ")")
        code.append('.option("path", ' + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) + ")")
        code.append('.option("checkpointLocation", ' +
                    CodeGenerationUtils.arrange_parameter_value(node["parameters"]["checkpoint_path"]) + ").start()")
        code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {"node_id": node["id"], "shared_function_set": shared_function_set,
                       "additional_local_code": additional_local_code, "errors": errors}
            # Must be a valid schema at this point.
            additional_code, param_string = CodeGenerationUtils.handle_parameter(node["parameter"]["schema"], my_args)
            gen_code = []
            gen_code.extend(additional_code)
            gen_code.extend(["df_" + node["id"] + ' = spark.readStream.schema(' + param_string + ")." +
                             node["file_type"] + "(" +
                             CodeGenerationUtils.handle_primitive(node["parameters"]["path"]["value"]) + ")",
                             os.linesep])
            final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code, "errors": errors}
        gen_code = CodeGenerationUtils.handle_instantination_or_call(
            node["parameters"],
            df_name + ".write.format(" + CodeGenerationUtils.handle_primitive(node["file_type"]) + ").save(",
            my_args)
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
        args["additional_info"]["written_tables"].append({"table_path": node["parameters"]["path"]["value"]})
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        final_code = ["correlation_" + node["id"] + " = " + "Correlation.corr(" + df_name + ", " +
                      CodeGenerationUtils.handle_primitive(node["parameters"]["column"]["value"]) + ", " +
                      CodeGenerationUtils.handle_primitive(node["parameters"]["method"]["value"]) + ")",
                      os.linesep]
        final_code.extend(["result_array_" + node["id"] + " = ",
                           "correlation_" + node["id"] + ".head()[0].toArray()", os.linesep])
        # In the future, name the columns dynamically according to an appropriate convention.
        final_code.extend(["df_" + node["id"] + " = sc.parallelize(" + "result_array_" + node["id"] + ")",
                           ".map(lambda x: [float(i) for i in x]).toDF()", os.linesep])
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        # For now, directly return the test result (as a dataframe) whose single row includes:
        #   pValues: DenseVector
        #   degreesOfFreedom: list
        #   statistics: DenseVector
        final_code = ["df_" + node["id"] + " = " + node["parameters"]["test_type"]["value"] + ".test(" +
                      df_name + ", " +
                      CodeGenerationUtils.handle_primitive(node["parameters"]["features_column"]["value"]) + ", " +
                      CodeGenerationUtils.handle_primitive(node["parameters"]["label_column"]["value"]) + ")",
                      os.linesep]
    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format(" +
                    CodeGenerationUtils.handle_primitive(node["file_type"]) + ")")
        code.append(".trigger(" + __generate_trigger_code(node) + ")")
        code.append('.option("path", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["path"]["value"]) + ")")
        code.append('.option("checkpointLocation", ' +
                    CodeGenerationUtils.handle_primitive(node["parameters"]["checkpointLocation"]["value"]) + ").start()")
        code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep])
        args["additional_info"]["written_tables"].append({"table_path": node["parameters"]["path"]["value"]})
    return code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    code = []
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    if (error == ErrorTypes.NO_ERROR):
        error, pipeline_order = PipelineValidityChecker.check_validity(node["nodes"], node["edges"])
        if (error == ErrorTypes.NO_ERROR):
            code.extend(__generate_stages(node["nodes"], pipeline_order))
            code.append(os.linesep)
            code.extend(__generate_code_for_pipeline_instantination(node, pipeline_order))
            if (bool(extra["dfs"])):
                df_name = "df_" + extra["dfs"][0]
            else:
                df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"
            code.extend(['model_' + node["id"] + "=" + 'pipeline_' + node["id"] + ".fit(" + df_name + ")", os.linesep])
            # The following might not be logical for pipelines with an estimator.
            code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' + df_name + ')', os.linesep])
    return code, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]
        my_args = {"node_id": node["id"], "input_dfs": [df_name],
                   "shared_function_set": shared_function_set,
                   "additional_local_code": additional_local_code, "errors": errors}
        df_name = "df_" + my_args["node_id"]
        gen_code = [df_name + " = " + my_args["input_dfs"][0], os.linesep]
        input_columns = ["["]
        conditions = ["["]
        values = ["["]
        otherwises = ["["]
        output_columns = ["["]
        for exp in node["parameters"]["expressions"]["value"]:
            input_columns.extend([CodeGenerationUtils.handle_parameter(exp["input_columns"], my_args), ", "])
            conditions.extend([CodeGenerationUtils.handle_parameter(exp["condition"], my_args), ", "])
            values.extend([CodeGenerationUtils.handle_parameter(exp["value"], my_args), ", "])
            otherwises.extend([CodeGenerationUtils.handle_parameter(exp["otherwise"], my_args), ", "])
            output_columns.extend([CodeGenerationUtils.handle_parameter(exp["output_columns"], my_args), ", "])
        # Drop the trailing ", "; this assumes expressions contains at least one element.
        input_columns.pop()
        conditions.pop()
        values.pop()
        otherwises.pop()
        output_columns.pop()
        input_columns.extend(["]"])
        conditions.extend(["]"])
        values.extend(["]"])
        otherwises.extend(["]"])
        output_columns.extend(["]"])
        gen_code.extend(["input_columns = " + ''.join(input_columns), os.linesep])
        gen_code.extend(["conditions = " + ''.join(conditions), os.linesep])
        gen_code.extend(["values = " + ''.join(values), os.linesep])
        gen_code.extend(["otherwises = " + ''.join(otherwises), os.linesep])
        gen_code.extend(["output_columns = " + ''.join(output_columns), os.linesep])
        gen_code.extend(["for in_cols, cond, val, otw, out_cols in zip(input_columns, conditions, values, otherwises, output_columns):", os.linesep])
        gen_code.extend(["\tfor in_col, out_col in zip(in_cols, out_cols):", os.linesep])
        gen_code.extend(["\t\tcur_cond = eval(cond.replace('$','" + df_name + "[\"'+in_col+'\"]'" + "))", os.linesep])
        gen_code.extend(["\t\t" + df_name + " = " + df_name + ".withColumn(out_col, F.when(cur_cond, val).otherwise(otw))", os.linesep])
        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)
    return final_code, shared_function_set, error
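# Sketch of the emitted conditional-expression loop (hypothetical inputs:
# node id "8", upstream "df_1", one expression flagging rows where column
# "age" exceeds 30; handle_parameter is assumed to render the shown literals):
#
#   df_8 = df_1
#   input_columns = [["age"]]
#   conditions = ["$ > 30"]
#   values = [1]
#   otherwises = [0]
#   output_columns = [["age_flag"]]
#   for in_cols, cond, val, otw, out_cols in zip(input_columns, conditions, values, otherwises, output_columns):
#       for in_col, out_col in zip(in_cols, out_cols):
#           cur_cond = eval(cond.replace('$','df_8["'+in_col+'"]'))
#           df_8 = df_8.withColumn(out_col, F.when(cur_cond, val).otherwise(otw))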