def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {1}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] if (error == ErrorTypes.NO_ERROR): if (bool(extra["dfs"])): df_name = "df_" + extra["dfs"][0] else: df_name = "df_" + extra["portions"][0][0] + "[" + str( extra["portions"][0][1]) + "]" code.extend([ "df_" + node["id"] + "=" + df_name + ".randomSplit(" + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["weights"]) + ", " + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["seed"]) + ")", os.linesep ]) return code, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {0}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] if (error == ErrorTypes.NO_ERROR): error, is_schema_appropriate = DataSourceValidityChecker.check_validity( node) if (error == ErrorTypes.NO_ERROR): # Must be a valid schema at this point. code.append("schema_" + node["id"] + "=") code.extend([ CodeGenerationUtils.arrange_schema( node["parameter"]["schema"]), os.linesep ]) code.extend([ "df_" + node["id"] + ' = spark.readStream.schema(schema_' + node["id"] + ")." + node["file_type"] + "(" + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["file_path"]) + ")", os.linesep ]) return code, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {1}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] if (error == ErrorTypes.NO_ERROR): if (bool(extra["dfs"])): df_name = "df_" + extra["dfs"][0] else: df_name = "df_" + extra["portions"][0][0] + "[" + str( extra["portions"][0][1]) + "]" code.append( df_name + '.selectExpr("CAST(' + node["parameters"]["unique_column_name"] + ' AS STRING) AS key", "to_json(struct(*)) AS value").write.format("kafka").option("kafka.bootstrap.servers", ' ) code.append( CodeGenerationUtils.arrange_parameter_value( node["parameters"]["host"] + ":" + node["parameters"]["port"]) + ")") code.extend([ '.option("topic", ' + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["topic"]) + ").save()", os.linesep ]) return code, error
def generate_code(args): node=args["node"] requireds_info=args["requireds_info"] edges=args["edges"] checklist={"df_count": {0}, "model_count": {0}} error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist) code=[] if(error == ErrorTypes.NO_ERROR): error, is_schema_appropriate=DataSourceValidityChecker.check_validity(node) if(error == ErrorTypes.NO_ERROR): remaining_params = node["parameters"].keys() remaining_params.remove("file_path") if(is_schema_appropriate): code.append("schema_"+node["id"]+"=") code.extend([CodeGenerationUtils.arrange_schema(node["parameter"]["schema"]), os.linesep]) code.append("df_" + node["id"] + "=" + "spark.read."+ node["file_type"] +"(path=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"] + ", " + "schema="+ "schema_"+node["id"])) remaining_params.remove("schema") else: if(node["can_infer_schema"]): code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) +", " +"inferSchema="+"True") else: code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"])) for param in remaining_params: code.extend([", " + param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param])]) code.extend([")", os.linesep]) return code, error
def __arg_dict_to_string(args):
    # Datetime-typed arguments are assumed to be strings matching the pre-defined
    # __datetime_format; they are re-parsed with datetime.strptime in the generated code.
    code = ["{"]
    for arg in args:
        if arg in __set_of_datetime_arguments:
            code.extend([CodeGenerationUtils.arrange_parameter_value(arg), ": ",
                         'datetime.strptime("' + args[arg] + '", "' + __datetime_format + '")', ","])
        else:
            code.extend([CodeGenerationUtils.arrange_parameter_value(arg), ": ",
                         CodeGenerationUtils.arrange_parameter_value(args[arg]), ","])
    if len(args) > 0:
        code.pop()  # drop the trailing comma
    code.append("}")
    return ''.join(code)
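
# Hypothetical illustration (argument names, values and __datetime_format are
# invented): for args = {"start_date": "2020-01-01", "count": 3} with "start_date"
# in __set_of_datetime_arguments, the returned string reads roughly:
#   {'start_date': datetime.strptime("2020-01-01", "%Y-%m-%d"),'count': 3}
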
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist={"df_count": {1}, "model_count": {0}} error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist) code=[] if(error == ErrorTypes.NO_ERROR): if(bool(extra["dfs"])): df_name="df_"+extra["dfs"][0] else: df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]" code = ['estimator_' + node["id"] + ' = ' + node["estimator_name"] + '('] for param in node["parameters"]: code.extend([param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param]), ", "]) if (len(node["parameters"]) > 0): code.pop() code.extend([")", os.linesep]) code.extend(['model_' + node["id"] + "=" + 'estimator_' + node["id"] + ".fit(df_" + df_name + ")", os.linesep]) code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(df_' + df_name + ')', os.linesep]) return code, error
def __generate_code_for_evaluator_instantination(node):
    code = ['evaluator_' + node["id"] + ' = ' + node["evaluator_name"] + '(']
    for param in node["parameters"]:
        code.extend([param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param]), ", "])
    if len(node["parameters"]) > 0:
        code.pop()
    code.extend([")", os.linesep])
    return code
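
# Hypothetical illustration (id, evaluator name and parameter invented): the
# emitted line reads roughly:
#   evaluator_5 = BinaryClassificationEvaluator(metricName='areaUnderROC')
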
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {0}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] if (error == ErrorTypes.NO_ERROR): error, is_schema_appropriate = DataSourceValidityChecker.check_validity( node) if (error == ErrorTypes.NO_ERROR): # Must be a valid schema at this point. code.append("schema_" + node["id"] + "=") code.extend([ CodeGenerationUtils.arrange_schema( node["parameter"]["schema"]), os.linesep ]) code.append( "df_" + node["id"] + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ' ) code.append( CodeGenerationUtils.arrange_parameter_value( node["parameters"]["host"] + ":" + node["parameters"]["port"]) + ")") code.append('.option("subscribe", ' + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["topic"] + ")")) code.append( '.load().select(from_json(col("value").cast("string"), schema_' + node["id"] + ")") # For streams, we will use timestamp as a key while writing to kafka topic in case. code.extend([ '.alias("value"), "timestamp").select("value.*", "timestamp")', os.linesep ]) return code, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist={"df_count": {1}, "model_count": {0}} error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist) code=[] if(error == ErrorTypes.NO_ERROR): if (bool(extra["dfs"])): df_name = "df_" + extra["dfs"][0] else: df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]" code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format("+CodeGenerationUtils.arrange_parameter_value(node["file_type"])+")") code.append(".trigger("+ __generate_trigger_code(node) +")") code.append('.option("path", '+ CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) + ")") code.append('.option("checkpointLocation", ' + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["checkpoint_path"]) + ").start()") code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep]) return code, error
def __generate_code_for_param_grid(node, cur_estimator_name):
    # Fixed (non-searched) parameters are assumed to be given on the estimator
    # itself. Maybe reconsider this part.
    code = ["param_grid_" + node["id"] + "=None", os.linesep]
    grid_params = node["parameters"]["parameter_grid"]
    if bool(grid_params):
        code = ["param_grid_" + node["id"] + "=ParamGridBuilder()"]
        for param in grid_params:
            code.extend([".addGrid(" + cur_estimator_name + "." + param + ", " +
                         CodeGenerationUtils.arrange_parameter_value(grid_params[param]) + ")"])
        code.extend([".build()", os.linesep])
    return code
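
# Hypothetical illustration (id, estimator name and grid values invented): the
# emitted line reads roughly:
#   param_grid_8=ParamGridBuilder().addGrid(estimator_8.regParam, [0.1, 0.01]).build()
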
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {2}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] if (error == ErrorTypes.NO_ERROR): df_names = __get_dfs_to_join(extra) code.extend([ "df_" + node["id"] + "=" + df_names[0] + ".join(" + df_names[1] + ", " + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["join_column"]) + ")", os.linesep ]) return code, error
def generate_code(args): node = args["node"] requireds_info = args["requireds_info"] edges = args["edges"] checklist = {"df_count": {0}, "model_count": {0}} error, extra = IncomingEdgeValidityChecker.check_validity( node["id"], requireds_info, edges, checklist) code = [] if (error == ErrorTypes.NO_ERROR): code = [ "model_" + node["id"] + "=" + node["parameters"]["model_type"] + ".load(" + CodeGenerationUtils.arrange_parameter_value( node["parameters"]["model_path"]) + ")", os.linesep ] return code, error