def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist={"df_count": {0}, "model_count": {0}} error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist) final_code=[] shared_function_set = set() additional_local_code=[] errors=[] if(error == ErrorTypes.NO_ERROR): error, is_schema_appropriate=DataSourceValidityChecker.check_validity(node) if(error == ErrorTypes.NO_ERROR): my_args = {"node_id": node["id"], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors} # Must be a valid schema at this point. additional_code, param_string = CodeGenerationUtils.handle_parameter(node["parameter"]["schema"], my_args) gen_code=[] gen_code.extend(additional_code) gen_code.append("df_" + node["id"] + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ') gen_code.append(CodeGenerationUtils.handle_primitive(node["parameters"]["host"]["value"] + ":" + node["parameters"]["port"]["value"]) + ")") gen_code.append('.option("subscribe", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["topic"]["value"] + ")")) gen_code.append('.load().select(from_json(col("value").cast("string"), '+ param_string +")") # For streams, we will use timestamp as a key while writing to kafka topic in case. gen_code.extend(['.alias("value"), "timestamp").select("value.*", "timestamp")', os.linesep]) final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code) return final_code, shared_function_set, error
def __create_bash_operator(task_id, args):
    op_task_id = args["app_id"] + "_" + task_id
    return [
        'Task_' + op_task_id + ' = BashOperator(task_id=' + CodeGenerationUtils.handle_primitive(op_task_id) +
        ", bash_command='" + args["bash_command"] + ' ' + CodeGenerationUtils.handle_primitive(op_task_id) + " ', dag=dag)"
    ]
def __create_spark_operator(task_id, args):
    op_task_id = args["app_id"] + "_" + task_id
    op_name = 'Task_' + op_task_id
    operator_args_str = str(args["spark_operator_conf"])
    script_path = os.path.join(args["code_base_path"], op_task_id + '.py')
    return [
        "operator_args = " + operator_args_str, os.linesep,
        op_name + ' = SparkSubmitOperator(task_id=' + CodeGenerationUtils.handle_primitive(op_task_id) +
        ', application=' + CodeGenerationUtils.handle_primitive(script_path) + ', dag=dag, **operator_args)'
    ]
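# Illustrative only (hypothetical values): with task_id "t1", app_id "app",
# code_base_path "/dags/scripts" and a spark_operator_conf dict, __create_spark_operator
# emits generated Airflow task definitions roughly of the form
#   operator_args = {'conn_id': 'spark_default'}
#   Task_app_t1 = SparkSubmitOperator(task_id='app_t1', application='/dags/scripts/app_t1.py', dag=dag, **operator_args)
# (assuming CodeGenerationUtils.handle_primitive renders plain strings as quoted literals).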
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {1}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] shared_function_set = set() if (error == ErrorTypes.NO_ERROR): if ("portion" in extra["dfs"][0]): df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str( extra["dfs"][0]["portion"]) + "]" else: df_name = "df_" + extra["dfs"][0]["source_id"] code.append( df_name + '.selectExpr("CAST(' + node["parameters"]["unique_column_name"]["value"] + ' AS STRING) AS key", "to_json(struct(*)) AS value").writeStream.format("kafka").option("kafka.bootstrap.servers", ' ) code.append( CodeGenerationUtils.handle_primitive( node["parameters"]["host"]["value"] + ":" + node["parameters"]["port"]["value"]) + ")") code.append(".trigger(" + __generate_trigger_code(node) + ")") code.append('.option("topic", ' + CodeGenerationUtils.handle_primitive( node["parameters"]["topic"]["value"]) + ")") code.append('.option("checkpointLocation", ' + CodeGenerationUtils.handle_primitive( node["parameters"]["checkpointLocation"]["value"]) + ").start()") code.extend([ os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep ]) args["additional_info"]["written_topics"].append({ "topic_name": node["parameters"]["topic"]["value"], "host": node["parameters"]["host"]["value"], "port": node["parameters"]["port"]["value"] }) return code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {1}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) shared_function_set = set() errors = [] code = [] if (error == ErrorTypes.NO_ERROR): if ("portion" in extra["dfs"][0]): df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str( extra["dfs"][0]["portion"]) + "]" else: df_name = "df_" + extra["dfs"][0]["source_id"] shared_function_set.add(SharedFunctionTypes.VECTOR_DISASSEMBLER) code = [ "df_" + node["id"] + " = " + "vector_disassembler(" + df_name + ", " + CodeGenerationUtils.handle_primitive( node["parameters"]["vector_column"]["value"]) + ")", os.linesep ] return code, shared_function_set, error
def __generate_trigger_code(node):
    trigger_type = node["parameters"]["trigger_type"]["value"]
    if (trigger_type == "once"):
        return "once=True"
    else:
        return trigger_type + "=" + CodeGenerationUtils.handle_primitive(node["parameters"]["trigger_value"]["value"])
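# Illustrative only: for a node whose trigger_type is "processingTime" and trigger_value is
# "5 seconds", __generate_trigger_code returns roughly processingTime='5 seconds', which the
# callers above splice into ".trigger(...)"; for trigger_type "once" it returns once=True
# (assuming CodeGenerationUtils.handle_primitive quotes plain strings).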
def __handle_expression_string_only(node, args):
    code = ["df_" + args["node_id"] + " = " + args["input_dfs"][0], os.linesep]
    code.extend(["df_" + args["node_id"] + " = " + "df_" + args["node_id"] + "." + node["sql_name"] + "(" +
                 CodeGenerationUtils.handle_primitive(node["parameters"]["expression"]["value"]) + ")", os.linesep])
    return code
def __arg_dict_to_string(args):
    # Assumes that every datetime-typed argument arrives as a string matching the pre-defined datetime format.
    code = ["{"]
    for arg in args:
        if (arg in __set_of_datetime_arguments):
            code.extend([CodeGenerationUtils.handle_primitive(arg), ": ",
                         'datetime.strptime("' + args[arg] + '", "' + __datetime_format + '")', ", "])
        else:
            code.extend([CodeGenerationUtils.handle_primitive(arg), ": ",
                         CodeGenerationUtils.handle_primitive(args[arg]), ", "])
    if (len(args) > 0):
        # Drop the trailing ", " separator.
        code.pop()
    code.append("}")
    return ''.join(code)
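# Illustrative only (hypothetical values): with __set_of_datetime_arguments containing
# "start_date" and __datetime_format set to e.g. "%Y-%m-%d %H:%M:%S", a call such as
#   __arg_dict_to_string({"start_date": "2021-01-01 00:00:00", "retries": 1})
# returns roughly
#   {'start_date': datetime.strptime("2021-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"), 'retries': 1}
# (assuming CodeGenerationUtils.handle_primitive renders keys and plain values as literals).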
def __handle_in_op_out_trio(node, args):
    in_op_out_trio_list = node["parameters"]["in_op_out_trio_list"]["value"]
    code = ["df_" + args["node_id"] + " = " + args["input_dfs"][0], os.linesep]
    code.extend(["df_" + args["node_id"] + " = " + "df_" + args["node_id"] + ".agg("])
    for elem in in_op_out_trio_list:
        code.extend(["F." + elem["operation"]["value"] + "(" +
                     CodeGenerationUtils.handle_primitive(elem["input_column"]["value"]) + ").alias(" +
                     CodeGenerationUtils.handle_primitive(elem["output_column"]["value"]) + ")", ", "])
    # Assumes at least one aggregation request; this should be validated and reported as an error before reaching this point...
    code.pop()
    code.extend([")", os.linesep])
    return code
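# Illustrative only (hypothetical trio list): two entries such as (price, sum, total) and
# (price, avg, mean) for node id "4" with input df_3 produce generated code roughly like
#   df_4 = df_3
#   df_4 = df_4.agg(F.sum('price').alias('total'), F.avg('price').alias('mean'))
# (assuming CodeGenerationUtils.handle_primitive quotes the column names).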
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {1}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) final_code = [] shared_function_set = set() if (error == ErrorTypes.NO_ERROR): if ("portion" in extra["dfs"][0]): df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str( extra["dfs"][0]["portion"]) + "]" else: df_name = "df_" + extra["dfs"][0]["source_id"] if (error == ErrorTypes.NO_ERROR): final_code = [ "correlation_" + node["id"] + " = " + "Correlation.corr(" + df_name + ", " + CodeGenerationUtils.handle_primitive( node["parameters"]["column"]["value"]) + ", " + CodeGenerationUtils.handle_primitive( node["parameters"]["method"]["value"]) + ")", os.linesep ] final_code.extend([ "result_array_" + node["id"] + " = ", "correlation_" + node["id"] + ".head()[0].toArray()", os.linesep ]) # In the future, dynamically name columns according to an appropriate convention... final_code.extend([ "df_" + node["id"] + " = sc.parallelize(" + "result_array_" + node["id"] + ")", ".map(lambda x: [float(i) for i in x]).toDF()", os.linesep ]) return final_code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist={"df_count": {1}, "model_count": {0}} error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist) code=[] shared_function_set = set() if(error == ErrorTypes.NO_ERROR): if ("portion" in extra["dfs"][0]): df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]" else: df_name = "df_" + extra["dfs"][0]["source_id"] code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format(" + CodeGenerationUtils.handle_primitive(node["file_type"]) + ")") code.append(".trigger("+ __generate_trigger_code(node) +")") code.append('.option("path", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["path"]["value"]) + ")") code.append('.option("checkpointLocation", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["checkpointLocation"]["value"]) + ").start()") code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep]) args["additional_info"]["written_tables"].append({"table_path": node["parameters"]["path"]["value"]}) return code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {0}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) final_code = [] shared_function_set = set() additional_local_code = [] errors = [] if (error == ErrorTypes.NO_ERROR): error, is_schema_appropriate = DataSourceValidityChecker.check_validity( node) if (error == ErrorTypes.NO_ERROR): my_args = { "node_id": node["id"], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors } if (is_schema_appropriate): gen_code = CodeGenerationUtils.handle_instantination_or_call( node["parameters"], "df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(", my_args) else: # For safety, but consider it again if ("schema" in node["parameters"]): del node["parameters"]["schema"] if (node["can_infer_schema"]): node["parameters"]["inferSchema"] = { "value": True, "type": "boolean" } gen_code = CodeGenerationUtils.handle_instantination_or_call( node["parameters"], "df_" + node["id"] + "=" + "spark.read.format(" + CodeGenerationUtils.handle_primitive(node["file_type"]) + ").load(", my_args) final_code = CodeGenerationUtils.merge_with_additional_code( gen_code, additional_local_code) return final_code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {2}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] shared_function_set = set() if (error == ErrorTypes.NO_ERROR): df_names = __get_dfs_to_join(extra) code.extend([ "df_" + node["id"] + "=" + df_names[0] + ".join(" + df_names[1] + ", " + CodeGenerationUtils.handle_primitive( node["parameters"]["join_column"]["value"]) + ")", os.linesep ]) return code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {0}, "model_count": {1}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] shared_function_set = set() if (error == ErrorTypes.NO_ERROR): model_id = extra["models"][0]["source_id"] code = [ "model_" + model_id + ".save(" + CodeGenerationUtils.handle_primitive( node["parameters"]["model_path"]["value"]) + ")", os.linesep ] return code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {0}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) final_code = [] shared_function_set = set() additional_local_code = [] errors = [] if (error == ErrorTypes.NO_ERROR): error, is_schema_appropriate = DataSourceValidityChecker.check_validity( node) if (error == ErrorTypes.NO_ERROR): my_args = { "node_id": node["id"], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors } # Must be a valid schema at this point. additional_code, param_string = CodeGenerationUtils.handle_parameter( node["parameter"]["schema"], my_args) gen_code = [] gen_code.extend(additional_code) gen_code.extend([ "df_" + node["id"] + ' = spark.readStream.schema(' + param_string + ")." + node["file_type"] + "(" + CodeGenerationUtils.handle_primitive( node["parameters"]["path"]["value"]) + ")", os.linesep ]) final_code = CodeGenerationUtils.merge_with_additional_code( gen_code, additional_local_code) return final_code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {1}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) final_code = [] shared_function_set = set() additional_local_code = [] errors = [] if (error == ErrorTypes.NO_ERROR): if ("portion" in extra["dfs"][0]): df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str( extra["dfs"][0]["portion"]) + "]" else: df_name = "df_" + extra["dfs"][0]["source_id"] my_args = { "node_id": node["id"], "input_dfs": [df_name], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors } gen_code = CodeGenerationUtils.handle_instantination_or_call( node["parameters"], df_name + ".write.format(" + CodeGenerationUtils.handle_primitive(node["file_type"]) + ").save(", my_args) final_code = CodeGenerationUtils.merge_with_additional_code( gen_code, additional_local_code) args["additional_info"]["written_tables"].append( {"table_path": node["parameters"]["path"]["value"]}) return final_code, shared_function_set, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist={"df_count": {1}, "model_count": {0}} error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist) final_code=[] shared_function_set = set() if(error == ErrorTypes.NO_ERROR): if ("portion" in extra["dfs"][0]): df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]" else: df_name = "df_" + extra["dfs"][0]["source_id"] if(error==ErrorTypes.NO_ERROR): # For now, directly returns the correlation result (as a dataframe) which includes (in a row~): # pValues: DenseVector # degreesOfFreedom: list # statistics: DenseVector final_code=["df_"+ node["id"] + " = " + node["parameters"]["test_type"]["value"] + ".test(" + df_name + ", " + CodeGenerationUtils.handle_primitive(node["parameters"]["features_column"]["value"]) + ", " + CodeGenerationUtils.handle_primitive(node["parameters"]["label_column"]["value"]) + ")", os.linesep] return final_code, shared_function_set, error