Esempio n. 1
0
def generate_code(args):
    """Generate code that calls ``node["ddfo_name"]`` on the input dataframe.

    Args:
        args: Dict providing at least ``"node"``, ``"requireds_info"`` and
            ``"edges"``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    shared_function_set = set()
    additional_local_code = []

    # The node must be fed by exactly one dataframe and by no model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        # A "portion" entry is rendered as an index into the source variable.
        df_name += "[" + str(incoming["portion"]) + "]"

    call_context = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    call_prefix = ('df_' + node["id"] + '=' + df_name + '.' +
                   node["ddfo_name"] + '(')
    gen_code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"], call_prefix, call_context)

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
Esempio n. 2
0
def generate_code(args):
    """Generate code that streams the input dataframe into a Kafka topic.

    The emitted query casts the configured unique column to a string key,
    serialises the whole row as JSON into ``value`` and starts a
    ``writeStream`` query, followed by ``awaitTermination()``.

    Side effect: on success the topic/host/port triple is appended to
    ``args["additional_info"]["written_topics"]``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when the incoming-edge check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    params = node["parameters"]
    query_name = "query_" + node["id"]

    # Each list element is one fragment of the chained writeStream expression.
    code = [
        query_name + "=" + df_name + '.selectExpr("CAST(' +
        params["unique_column_name"]["value"] +
        ' AS STRING) AS key", "to_json(struct(*)) AS value")' +
        '.writeStream.format("kafka").option("kafka.bootstrap.servers", ',
        CodeGenerationUtils.handle_primitive(
            params["host"]["value"] + ":" + params["port"]["value"]) + ")",
        ".trigger(" + __generate_trigger_code(node) + ")",
        '.option("topic", ' +
        CodeGenerationUtils.handle_primitive(params["topic"]["value"]) + ")",
        '.option("checkpointLocation", ' +
        CodeGenerationUtils.handle_primitive(
            params["checkpointLocation"]["value"]) + ").start()",
        os.linesep,
        query_name,
        ".awaitTermination()",
        os.linesep,
    ]

    # Record where this node writes so the caller can track produced topics.
    args["additional_info"]["written_topics"].append({
        "topic_name": params["topic"]["value"],
        "host": params["host"]["value"],
        "port": params["port"]["value"],
    })

    return code, shared_function_set, error
Esempio n. 3
0
def generate_code(args):
    """Generate code that reads a Kafka topic as a streaming dataframe.

    The emitted expression subscribes to the configured topic, parses the
    message value as JSON with the node's schema and keeps the Kafka
    ``timestamp`` column next to the parsed fields.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when either validity check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    shared_function_set = set()
    additional_local_code = []

    # A source node accepts neither incoming dataframes nor models.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {0}, "model_count": {0}})
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
        node)
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    handler_args = {
        "node_id": node["id"],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    params = node["parameters"]

    # Must be a valid schema at this point.
    schema_code = CodeGenerationUtils.handle_parameter(
        params["schema"], handler_args)

    gen_code = [
        "df_" + node["id"] +
        ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ',
        CodeGenerationUtils.handle_primitive(
            params["host"]["value"] + ":" + params["port"]["value"]) + ")",
        '.option("subscribe", ' +
        CodeGenerationUtils.handle_primitive(params["topic"]["value"]) + ")",
        '.option("startingOffsets", ' +
        CodeGenerationUtils.handle_primitive(
            params["startingOffsets"]["value"]) + ")",
        '.load().select(from_json(col("value").cast("string"), ' +
        schema_code + ")",
        # For streams, the timestamp is kept so it can serve as a key when
        # the data is later written back to a Kafka topic.
        '.alias("value"), "timestamp").select("value.*", "timestamp")',
        os.linesep,
    ]

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
Esempio n. 4
0
def generate_code(args):
    """Generate code that applies a UDF to tuples of input columns.

    The emitted code registers ``udf_<id>`` from the node's ``udf_function``
    and ``udf_return_type`` parameters, then loops over ``tuple_list`` and
    ``output_list`` adding one column per tuple with ``withColumn``.

    Args:
        args: Dict providing at least ``"node"``, ``"requireds_info"`` and
            ``"edges"``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if error == ErrorTypes.NO_ERROR:
        incoming_df = extra["dfs"][0]
        # "portion" is a key of the dataframe entry itself. Testing it against
        # the enclosing list (as was done before) never matches, which dropped
        # the portion index from the generated dataframe name.
        if "portion" in incoming_df:
            df_name = "df_" + incoming_df["source_id"] + "[" + str(
                incoming_df["portion"]) + "]"
        else:
            df_name = "df_" + incoming_df["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }

        updated_function_name = CodeGenerationUtils.handle_parameter(
            node["parameters"]["udf_function"], my_args)
        gen_code = []
        # udf_<id> = udf(<function>, <ReturnType>())
        gen_code.extend([
            "udf_" + node["id"] + " = udf(" + updated_function_name + ", " +
            node["parameters"]["udf_return_type"]["value"] + "())", os.linesep
        ])

        gen_code.extend([
            "tuple_list = " + CodeGenerationUtils.handle_parameter(
                node["parameters"]["udf_input_tuples"], my_args), os.linesep
        ])
        gen_code.extend([
            "output_list = " + CodeGenerationUtils.handle_parameter(
                node["parameters"]["udf_outputs"], my_args), os.linesep
        ])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        # One withColumn call per (input tuple, output column) pair.
        gen_code.extend(["for index in range(len(tuple_list)):", os.linesep])
        gen_code.extend([
            "\tdf_" + node["id"] + " = df_" + node["id"] +
            ".withColumn(output_list[index], udf_" + node["id"] +
            "(*tuple_list[index]))", os.linesep, os.linesep
        ])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
Esempio n. 5
0
def __arg_dict_to_string(args):
    """Render ``args`` as the source text of a dict literal.

    Values whose key is listed in ``__set_of_datetime_arguments`` are emitted
    as a ``datetime.strptime(...)`` call using ``__datetime_format``; such
    values are assumed to be strings that already match that format. All
    other keys and values go through ``CodeGenerationUtils.handle_primitive``.
    """
    entries = []
    for name in args:
        rendered_key = CodeGenerationUtils.handle_primitive(name)
        if name in __set_of_datetime_arguments:
            rendered_value = ('datetime.strptime("' + args[name] + '", "' +
                              __datetime_format + '")')
        else:
            rendered_value = CodeGenerationUtils.handle_primitive(args[name])
        entries.append(rendered_key + ": " + rendered_value)

    # Joining takes care of the separators, so an empty dict yields "{}".
    return "{" + ", ".join(entries) + "}"
Esempio n. 6
0
def generate_code(args):
    """Generate code that builds a ``selectExpr`` call from the node's expressions.

    For every entry of ``node["parameters"]["expressions"]["value"]`` the
    emitted code extends ``expressions_<id>`` with the result of
    ``single_select_expr_generator(...)`` and finally applies all collected
    expressions to the input dataframe.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when the incoming-edge check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    shared_function_set = set()
    additional_local_code = []

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    handler_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }

    # The emitted code relies on the shared select-expression helpers.
    shared_function_set.add(SharedFunctionTypes.SELECT_EXPR_HELPERS)

    expr_list_name = "expressions_" + node["id"]
    gen_code = [expr_list_name + "=[]", os.linesep]
    for expr in node["parameters"]["expressions"]["value"]:
        inputs = CodeGenerationUtils.handle_parameter(
            expr["input_cols"], handler_args)
        outputs = CodeGenerationUtils.handle_parameter(
            expr["output_cols"], handler_args)
        operation = CodeGenerationUtils.handle_parameter(
            expr["operation"], handler_args)
        gen_code.append(expr_list_name + ".extend(")
        gen_code.append('single_select_expr_generator(' + inputs + ', ' +
                        outputs + ', ' + operation + '))')
        gen_code.append(os.linesep)

    gen_code.append("df_" + node["id"] + "=" + df_name + ".selectExpr(" +
                    "*expressions_" + node["id"] + ")")
    gen_code.append(os.linesep)

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
Esempio n. 7
0
def generate_code(args):
    """Generate code that builds, fits and applies a pipeline node.

    After validating the incoming edges and the inner pipeline graph, the
    stage code and the pipeline instantiation are emitted, followed by a
    ``fit`` on the input dataframe and a ``transform`` of the same dataframe.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty as
        soon as any validation or stage-generation step reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    shared_function_set = set()
    additional_local_code = []

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    error, pipeline_order = PipelineValidityChecker.check_validity(
        node["nodes"], node["edges"])
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    my_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    gen_code, error = __generate_stages(
        node["nodes"], pipeline_order, df_name, my_args)
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    model_var = 'model_' + node["id"]
    gen_code.append(os.linesep)
    gen_code.extend(__generate_code_for_pipeline_instantination(
        node, pipeline_order, my_args))
    gen_code.extend([
        model_var + "=" + 'pipeline_' + node["id"] + ".fit(" + df_name + ")",
        os.linesep,
    ])
    # Following might not be logical for pipelines with an estimator
    gen_code.extend([
        'df_' + node["id"] + "=" + model_var + '.transform(' + df_name + ')',
        os.linesep,
    ])

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
Esempio n. 8
0
def __generate_trigger_code(node):
    """Return the keyword-argument text for a ``.trigger(...)`` call.

    A ``trigger_type`` of ``"once"`` yields ``once=True``; any other type is
    used as the keyword name with the rendered ``trigger_value`` as its value.
    """
    params = node["parameters"]
    kind = params["trigger_type"]["value"]
    if kind == "once":
        return "once=True"

    rendered_value = CodeGenerationUtils.handle_primitive(
        params["trigger_value"]["value"])
    return kind + "=" + rendered_value
Esempio n. 9
0
def generate_code(args):
    """Generate code that calls ``vector_disassembler`` on the input dataframe.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when the incoming-edge check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    # The emitted call needs the shared vector_disassembler helper.
    shared_function_set.add(SharedFunctionTypes.VECTOR_DISASSEMBLER)

    vector_column = CodeGenerationUtils.handle_primitive(
        node["parameters"]["vector_column"]["value"])
    statement = ("df_" + node["id"] + " = " + "vector_disassembler(" +
                 df_name + ", " + vector_column + ")")
    return [statement, os.linesep], shared_function_set, error
Esempio n. 10
0
def generate_code(args):
    """Generate code that builds, fits and applies a cross-validation node.

    The emitted code instantiates the estimator and evaluator found by
    ``CVValiditiyChecker``, builds the parameter grid and the cross-validator,
    fits it on the input dataframe and transforms that same dataframe.

    Args:
        args: Dict providing at least ``"node"``, ``"requireds_info"`` and
            ``"edges"``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple. ``code`` is an empty
        list when either the incoming-edge check or the cross-validation
        structure check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    error, extra2 = CVValiditiyChecker.check_validity(
        node["nodes"], node["edges"])
    # Stop here on an invalid inner graph: the estimator/evaluator ids in
    # extra2 cannot be trusted in that case, and generating code anyway would
    # either crash or return code together with an error.
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    incoming_df = extra["dfs"][0]
    if "portion" in incoming_df:
        df_name = "df_" + incoming_df["source_id"] + "[" + str(
            incoming_df["portion"]) + "]"
    else:
        df_name = "df_" + incoming_df["source_id"]

    my_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": errors
    }
    estimator_id = extra2["estimator_node_id"]
    evaluator_id = extra2["evaluator_node_id"]

    gen_code = []
    gen_code.extend(__generate_code_for_estimator_instantination(
        node["nodes"][estimator_id], my_args))
    gen_code.extend(__generate_code_for_evaluator_instantination(
        node["nodes"][evaluator_id], my_args))
    gen_code.extend(__generate_code_for_param_grid(
        node, 'estimator_' + estimator_id, my_args))
    gen_code.extend(__generate_code_for_cv_instantination(
        node, estimator_id, evaluator_id))

    gen_code.extend([
        'model_' + node["id"] + "=" + 'cv_' + node["id"] + ".fit(" +
        df_name + ")", os.linesep
    ])
    # Following might not be logical unless you aim to predict on training
    # data for some specific needs.
    gen_code.extend([
        'df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' +
        df_name + ')', os.linesep
    ])

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)

    return final_code, shared_function_set, error
Esempio n. 11
0
def __generate_stage_template(node, non_indicator_params, args):
    """Return one ``stages_<id>.append(<Class>(...))`` line as a string.

    Each name in ``node["multi_instance_indicator"]`` becomes a keyword
    argument reading ``mmi_value_<name>_<id>[i]``; the remaining arguments
    come from ``CodeGenerationUtils.handle_arguments``.
    """
    # Do not allow other families than estimator and transformer.
    # Handle the "else" case later...
    if node["family"] == NodeFamilyTypes.Estimator.value:
        class_name = node["estimator_name"]
    elif node["family"] == NodeFamilyTypes.Transformer.value:
        class_name = node["transformer_name"]
    else:
        class_name = ""

    mmi_part = []
    for indicator in node["multi_instance_indicator"]:
        mmi_part.append(indicator + "=" + "mmi_value_" + indicator + "_" +
                        node["id"] + "[i]")
        mmi_part.append(", ")

    arg_part = CodeGenerationUtils.handle_arguments(non_indicator_params, args)
    if not arg_part:
        # Nothing follows the indicator arguments: drop the trailing comma.
        mmi_part.pop()

    return ("stages_" + node["id"] + ".append(" + class_name + '(' +
            "".join(mmi_part) + "".join(arg_part) + "))" + os.linesep)
Esempio n. 12
0
def __generate_code_for_pipeline_instantination(node, args):
    """Generate code that fills ``stages_<id>`` with one stage per indicator value.

    Parameters named in ``node["multi_instance_indicator"]`` are emitted as
    ``mmi_value_<param>_<id>`` assignments; all other parameters are passed
    on to the stage template. A ``Pipeline`` is instantiated from the stages
    unless ``args["in_pipeline"]`` is present and truthy.
    """
    node_id = node["id"]
    indicators = node["multi_instance_indicator"]
    stages_var = "stages_" + node_id

    code = []
    non_indicator_params = {}
    for name, spec in node["parameters"].items():
        if name in indicators:
            code.append("mmi_value_" + name + "_" + node_id + " = " +
                        CodeGenerationUtils.handle_parameter(spec, args))
            code.append(os.linesep)
        else:
            non_indicator_params[name] = spec

    code += [stages_var, " = ", "[]", os.linesep]
    # The loop length is taken from the first indicator's value list.
    code += [
        "for i in ",
        "range(len(mmi_value_" + indicators[0] + "_" + node_id,
        ")):",
        os.linesep,
    ]
    code += [
        "\t",
        __generate_stage_template(node, non_indicator_params, args),
        os.linesep,
    ]

    if "in_pipeline" not in args or not args["in_pipeline"]:
        code += [
            'pipeline_' + node_id + "=Pipeline(stages=",
            stages_var + ")",
            os.linesep,
        ]

    return code
Esempio n. 13
0
def generate_code(args):
    """Generate code that reads a batch dataframe with ``spark.read``.

    With an appropriate schema the call is ``spark.read.<file_type>(...)``.
    Otherwise any ``schema`` parameter is removed from the node, schema
    inference is switched on when the node allows it, and the call becomes
    ``spark.read.format(<file_type>).load(...)``.

    Note: the fallback branch mutates ``node["parameters"]`` in place.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when either validity check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    shared_function_set = set()
    additional_local_code = []

    # A source node accepts neither incoming dataframes nor models.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {0}, "model_count": {0}})
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
        node)
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    my_args = {
        "node_id": node["id"],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    parameters = node["parameters"]
    target = "df_" + node["id"] + "="

    if is_schema_appropriate:
        call_prefix = target + "spark.read." + node["file_type"] + "("
    else:
        # For safety, but consider it again
        parameters.pop("schema", None)
        if node["can_infer_schema"]:
            parameters["inferSchema"] = {"value": True, "type": "boolean"}
        call_prefix = (target + "spark.read.format(" +
                       CodeGenerationUtils.handle_primitive(node["file_type"]) +
                       ").load(")

    gen_code = CodeGenerationUtils.handle_instantination_or_call(
        parameters, call_prefix, my_args)
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
Esempio n. 14
0
File: SQL.py Progetto: sooxwq/arakat
def __handle_expression_string_only(node, args):
    """Generate a ``df.<sql_name>(<expression>)`` call on the first input dataframe.

    The result variable is first bound to the input dataframe and then
    reassigned to the outcome of the call.
    """
    target = "df_" + args["node_id"]
    expression = CodeGenerationUtils.handle_primitive(
        node["parameters"]["expression"]["value"])
    return [
        target + " = " + args["input_dfs"][0],
        os.linesep,
        target + " = " + target + "." + node["sql_name"] + "(" + expression +
        ")",
        os.linesep,
    ]
Esempio n. 15
0
def __single_generation(node, df_name, args):
    """Generate code that instantiates a transformer and applies it to ``df_name``.

    The instantiation code comes from
    ``CodeGenerationUtils.handle_instantination_or_call``; the ``transform``
    statement is appended to that same list, which is then returned.
    """
    transformer_var = 'transformer_' + node["id"]
    code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"],
        transformer_var + ' = ' + node["transformer_name"] + '(',
        args)
    code.append('df_' + node["id"] + "=" + transformer_var + '.transform(' +
                df_name + ')')
    code.append(os.linesep)
    return code
Esempio n. 16
0
File: SQL.py Progetto: sooxwq/arakat
def __handle_in_op_out_trio(node, args):
    """Generate a ``df.agg(...)`` call from (input, operation, output) trios.

    Every trio becomes ``F.<operation>(<input_column>).alias(<output_column>)``
    inside a single ``agg`` call on the first input dataframe.
    """
    trios = node["parameters"]["in_op_out_trio_list"]["value"]
    target = "df_" + args["node_id"]

    code = [
        target + " = " + args["input_dfs"][0],
        os.linesep,
        target + " = " + target + ".agg(",
    ]
    for trio in trios:
        source_col = CodeGenerationUtils.handle_primitive(
            trio["input_column"]["value"])
        alias = CodeGenerationUtils.handle_primitive(
            trio["output_column"]["value"])
        code.append("F." + trio["operation"]["value"] + "(" + source_col +
                    ").alias(" + alias + ")")
        code.append(", ")

    # Drops the trailing separator. Assumes there is at least one aggregation
    # request; that should be checked, and an error produced, before this
    # function is reached.
    code.pop()
    code.append(")")
    code.append(os.linesep)
    return code
Esempio n. 17
0
def generate_code(args):
    """Generate code that expands flag columns over a lag window.

    For each (input column, output column) pair the emitted code copies the
    input column into a ``temp`` column, repeatedly combines it with its
    lagged value over a window partitioned by ``partitioning_column`` and
    ordered by ``ordering_column``, and finally writes 1.0/0.0 into the
    output column depending on whether ``temp`` is positive.

    Args:
        args: Dict providing at least ``"node"``, ``"requireds_info"`` and
            ``"edges"``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple. ``code`` is an empty
        list when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if error == ErrorTypes.NO_ERROR:
        incoming_df = extra["dfs"][0]
        if "portion" in incoming_df:
            df_name = "df_" + incoming_df["source_id"] + "[" + str(
                incoming_df["portion"]) + "]"
        else:
            df_name = "df_" + incoming_df["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }

        input_cols = CodeGenerationUtils.handle_parameter(
            node["parameters"]["input_cols"], my_args)
        output_cols = CodeGenerationUtils.handle_parameter(
            node["parameters"]["output_cols"], my_args)

        window_size = node["parameters"]["window_size"]["value"]
        partitioning_column = node["parameters"]["partitioning_column"]["value"]
        ordering_column = node["parameters"]["ordering_column"]["value"]
        ordering_direction = node["parameters"]["ordering_direction"]["value"]

        df_var = "df_" + node["id"]
        gen_code = []
        # Each generated variable is bound to its own parameter. These two
        # assignments used to be swapped, which made the generated loop read
        # from the output columns and write to the input columns.
        gen_code.extend(["input_cols = " + input_cols, os.linesep])
        gen_code.extend(["output_cols = " + output_cols, os.linesep])
        gen_code.extend([df_var + "=" + df_name, os.linesep])
        gen_code.extend(
            ["for inC, outC in zip(input_cols, output_cols):", os.linesep])
        gen_code.extend([
            "\t" + df_var + " = " + df_var + ".withColumn('temp', col(inC))",
            os.linesep
        ])
        gen_code.extend([
            "\twSpec = Window.partitionBy('" + partitioning_column +
            "').orderBy(col('" + ordering_column + "')." +
            ordering_direction + "())", os.linesep
        ])
        gen_code.extend(
            ["\tfor j in range(" + str(window_size) + "):", os.linesep])
        gen_code.extend([
            "\t\tlag_values = lag('temp', default=0).over(wSpec)", os.linesep
        ])
        # NOTE(review): the generated "lag_values==None" comparison is kept
        # as-is; confirm whether an isNull() check was intended.
        gen_code.extend([
            "\t\t" + df_var + " = " + df_var +
            ".withColumn('temp', F.when((col('temp')==1) | (lag_values==None) | (lag_values<1) | (lag_values>="
            + str(window_size + 1) +
            "), col('temp')).otherwise(lag_values+1))", os.linesep
        ])
        gen_code.extend([
            "\t" + df_var + " = " + df_var +
            ".withColumn(outC, F.when(col('temp') > 0, 1.0).otherwise(0.0))",
            os.linesep
        ])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
Esempio n. 18
0
def __single_generation(node, df_name, args, model_elements_save_paths):
    """Generate code that instantiates an estimator, fits it and applies the model.

    When ``model_elements_save_paths`` is not ``None``, one extra statement
    per entry writes ``model_<id>.<element>`` as parquet to the entry's path.
    """
    estimator_var = 'estimator_' + node["id"]
    model_var = 'model_' + node["id"]

    code = CodeGenerationUtils.handle_instantination_or_call(
        node["parameters"],
        estimator_var + ' = ' + node["estimator_name"] + '(', args)
    code += [
        model_var + "=" + estimator_var + ".fit(" + df_name + ")",
        os.linesep,
        'df_' + node["id"] + "=" + model_var + '.transform(' + df_name + ')',
        os.linesep,
    ]

    if model_elements_save_paths is None:
        return code

    for element, target in model_elements_save_paths.items():
        save_path = CodeGenerationUtils.handle_primitive(target["value"])
        code.append(model_var + "." + element +
                    ".write.format('parquet').save(" + save_path + ")")
        code.append(os.linesep)
    return code
Esempio n. 19
0
File: SQL.py Progetto: sooxwq/arakat
def __handle_col_list_plus_kwargs(node, args):
    """Generate a ``df.<sql_name>(<input_columns>, key=value, ...)`` call.

    Keyword arguments come from the optional ``kwargs`` parameter of the node.
    """
    kwarg_fragments = []
    if "kwargs" in node["parameters"]:
        # If every kwarg is optional and the user supplied none of them, an
        # empty dictionary is expected here.
        for key, spec in node["parameters"]["kwargs"]["value"].items():
            kwarg_fragments += [
                ", ", key, "=",
                CodeGenerationUtils.handle_parameter(spec, args)
            ]

    target = "df_" + args["node_id"]
    columns = CodeGenerationUtils.handle_parameter(
        node["parameters"]["input_columns"], args)

    code = [
        target + " = " + args["input_dfs"][0],
        os.linesep,
        target + " = " + target + "." + node["sql_name"] + "(" + columns,
    ]
    code += kwarg_fragments
    code += [")", os.linesep]
    return code
Esempio n. 20
0
def generate_code(args):
    """Generate code that writes the input dataframe with ``write.format(...).save``.

    Side effect: when the node has a ``path`` parameter, its value is
    appended to ``args["additional_info"]["written_tables"]``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when the incoming-edge check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    shared_function_set = set()
    additional_local_code = []

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    my_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }
    parameters = node["parameters"]

    gen_code = CodeGenerationUtils.handle_instantination_or_call(
        parameters,
        df_name + ".write.format(" +
        CodeGenerationUtils.handle_primitive(node["file_type"]) + ").save(",
        my_args)
    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)

    # Record the destination so the caller can track produced tables.
    if "path" in parameters:
        args["additional_info"]["written_tables"].append(
            {"table_path": parameters["path"]["value"]})

    return final_code, shared_function_set, error
Esempio n. 21
0
    def test_handle_parameter_for_primitive(
            self, create_parameter_for_primitive,
            create_accompanying_arg_for_parameter):
        """A primitive renders as ``str(value)``; strings are double-quoted."""
        raw_value = create_parameter_for_primitive["value"]
        if isinstance(raw_value, str):
            expected_val = "\"" + raw_value + "\""
        else:
            expected_val = str(raw_value)

        actual_val = CodeGenerationUtils.handle_parameter(
            create_parameter_for_primitive,
            create_accompanying_arg_for_parameter)
        assert actual_val == expected_val
Esempio n. 22
0
def generate_code(args):
    """Generate code that computes a correlation matrix as a dataframe.

    The emitted code calls ``Correlation.corr`` on the input dataframe with
    the configured column and method, extracts the resulting matrix as an
    array and turns its rows into a new dataframe.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when the incoming-edge check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    correlation_var = "correlation_" + node["id"]
    array_var = "result_array_" + node["id"]
    column = CodeGenerationUtils.handle_primitive(
        node["parameters"]["column"]["value"])
    method = CodeGenerationUtils.handle_primitive(
        node["parameters"]["method"]["value"])

    final_code = [
        correlation_var + " = " + "Correlation.corr(" + df_name + ", " +
        column + ", " + method + ")",
        os.linesep,
        array_var + " = ",
        correlation_var + ".head()[0].toArray()",
        os.linesep,
        # In the future, dynamically name columns according to an
        # appropriate convention...
        "df_" + node["id"] + " = sc.parallelize(" + array_var + ")",
        ".map(lambda x: [float(i) for i in x]).toDF()",
        os.linesep,
    ]
    return final_code, shared_function_set, error
Esempio n. 23
0
def generate_code(args):
    """Generate code that reads files as a streaming dataframe.

    The emitted statement is
    ``df_<id> = spark.readStream.schema(<schema>).<file_type>(<path>)``.

    Args:
        args: Dict providing at least ``"node"``, ``"requireds_info"`` and
            ``"edges"``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple. ``code`` is an empty
        list when either validity check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # A source node accepts neither incoming dataframes nor models.
    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if error == ErrorTypes.NO_ERROR:
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if error == ErrorTypes.NO_ERROR:
            my_args = {
                "node_id": node["id"],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            # Must be a valid schema at this point. The schema is read from
            # "parameters", the same key used for "path" below; the former
            # "parameter" spelling raised KeyError.
            param_string = CodeGenerationUtils.handle_parameter(
                node["parameters"]["schema"], my_args)
            gen_code = []

            gen_code.extend([
                "df_" + node["id"] + ' = spark.readStream.schema(' +
                param_string + ")." + node["file_type"] + "(" +
                CodeGenerationUtils.handle_primitive(
                    node["parameters"]["path"]["value"]) + ")", os.linesep
            ])

            final_code = CodeGenerationUtils.merge_with_additional_code(
                gen_code, additional_local_code)

    return final_code, shared_function_set, error
Esempio n. 24
0
def generate_code(args):
    """Generate code that streams the input dataframe to files.

    The emitted query uses ``writeStream.format(<file_type>)`` with the
    configured trigger, path and checkpoint location, starts the query and
    then waits on ``awaitTermination()``.

    Side effect: on success the output path is appended to
    ``args["additional_info"]["written_tables"]``.

    Returns:
        A ``(code, shared_function_set, error)`` tuple; ``code`` is empty
        when the incoming-edge check fails.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    # Exactly one incoming dataframe, no incoming model.
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges,
        {"df_count": {1}, "model_count": {0}})
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    params = node["parameters"]
    query_name = "query_" + node["id"]

    # Each list element is one fragment of the chained writeStream expression.
    code = [
        query_name + "=" + df_name + ".writeStream.format(" +
        CodeGenerationUtils.handle_primitive(node["file_type"]) + ")",
        ".trigger(" + __generate_trigger_code(node) + ")",
        '.option("path", ' +
        CodeGenerationUtils.handle_primitive(params["path"]["value"]) + ")",
        '.option("checkpointLocation", ' +
        CodeGenerationUtils.handle_primitive(
            params["checkpointLocation"]["value"]) + ").start()",
        os.linesep,
        query_name,
        ".awaitTermination()",
        os.linesep,
    ]

    # Record the destination so the caller can track produced tables.
    args["additional_info"]["written_tables"].append(
        {"table_path": params["path"]["value"]})

    return code, shared_function_set, error
Esempio n. 25
0
def __generate_code_for_param_grid(node, cur_estimator_name, args):
    """Build the code tokens that assign ``param_grid_<node id>``.

    With an empty ``parameter_grid`` the variable is assigned ``None``.
    Otherwise it is assigned a ``ParamGridBuilder()`` chain with one
    ``.addGrid(<estimator>.<param>, <values>)`` call per grid entry, closed
    by ``.build()``. Either way the token list ends with a line separator.
    """
    # In the future handle this in special requirement handler for parameters
    assignment = "param_grid_" + node["id"] + "="
    # Assuming that fix parameters are given in the estimator itself.
    # Maybe reconsider this part.
    grid_params = node["parameters"]["parameter_grid"]
    if not grid_params:
        return [assignment, "None", os.linesep]

    code = [assignment, "ParamGridBuilder()"]
    for param_name in grid_params:
        rendered_values = CodeGenerationUtils.handle_parameter(
            grid_params[param_name], args)
        code.append(".addGrid(" + cur_estimator_name + "." + param_name
                    + ", " + rendered_values + ")")
    code.append(".build()")
    code.append(os.linesep)
    return code
Esempio n. 26
0
def generate_code(args):
    """Emit code that saves the incoming model to a path.

    The node must have exactly one incoming model and no incoming dataframe.
    On success the code is ``model_<source id>.save(<model_path>)`` followed
    by a line separator.

    Returns a ``(code, shared_function_set, error)`` tuple; ``code`` is empty
    when the incoming-edge check reports an error.
    """
    node = args["node"]
    requirements = args["requireds_info"]
    graph_edges = args["edges"]

    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requirements, graph_edges,
        {"df_count": {0}, "model_count": {1}})
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return [], shared_function_set, error

    model_name = "model_" + extra["models"][0]["source_id"]
    save_path = CodeGenerationUtils.handle_primitive(
        node["parameters"]["model_path"]["value"])
    code = [model_name + ".save(" + save_path + ")", os.linesep]
    return code, shared_function_set, error
Esempio n. 27
0
def generate_code(args):
    """Emit code for a node via single- or multi-instance generation.

    The node must have exactly one incoming dataframe and no incoming model.
    If the node carries a ``model_elements_save_paths`` parameter, its value
    is extracted and the parameter is removed from ``node["parameters"]``
    (the node dict is modified in place) before generation is dispatched.

    Returns a ``(final_code, shared_function_set, error)`` tuple;
    ``final_code`` is empty when the incoming-edge check reports an error.
    """
    node = args["node"]
    requirements = args["requireds_info"]
    graph_edges = args["edges"]

    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requirements, graph_edges,
        {"df_count": {1}, "model_count": {0}})
    final_code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    # The source dataframe may be one portion of a multi-output node.
    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    additional_local_code = []
    my_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": [],
    }

    # Pull the save paths out of the parameters before generation; only the
    # single-instance path receives them explicitly.
    parameters = node["parameters"]
    model_elements_save_paths = None
    if "model_elements_save_paths" in parameters:
        model_elements_save_paths = parameters[
            "model_elements_save_paths"]["value"]
        del parameters["model_elements_save_paths"]

    # The handler decides whether this node needs multi-instance generation
    # or the usual single-instance generation.
    if MultiInstanceHandlerUtils.should_generate_multiple_instances(node):
        gen_code = MultiInstanceHandlerUtils.multi_instance_generation(
            node, df_name, my_args)
    else:
        gen_code = __single_generation(node, df_name, my_args,
                                       model_elements_save_paths)

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)
    return final_code, shared_function_set, error
Esempio n. 28
0
def generate_code(args):
    """Emit code that joins the two incoming dataframes on a column.

    The node must have exactly two incoming dataframes and no incoming
    model. On success the code is
    ``df_<node id>=<first df>.join(<second df>, <join_column>)`` followed by
    a line separator.

    Returns a ``(code, shared_function_set, error)`` tuple; ``code`` is empty
    when the incoming-edge check reports an error.
    """
    node = args["node"]
    requirements = args["requireds_info"]
    graph_edges = args["edges"]

    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requirements, graph_edges,
        {"df_count": {2}, "model_count": {0}})
    code = []
    shared_function_set = set()
    if error != ErrorTypes.NO_ERROR:
        return code, shared_function_set, error

    df_names = __get_dfs_to_join(extra)
    join_statement = (
        "df_" + node["id"] + "=" + df_names[0] + ".join(" + df_names[1]
        + ", " + CodeGenerationUtils.handle_primitive(
            node["parameters"]["join_column"]["value"]) + ")")
    code.append(join_statement)
    code.append(os.linesep)
    return code, shared_function_set, error
Esempio n. 29
0
    def test_handle_parameter_for_primitive_array(
            self, create_parameter_for_primitive_array,
            create_accompanying_arg_for_parameter):
        """Check that a primitive-array parameter renders as a list literal.

        The expected text is built independently here: string elements are
        wrapped in double quotes, every other element goes through ``str``,
        and the elements are joined with ", " inside square brackets.

        Both arguments are supplied by the test framework (presumably pytest
        fixtures): the parameter dict whose ``"value"`` holds the array, and
        the accompanying args passed through to ``handle_parameter``.
        """
        primitive_array = create_parameter_for_primitive_array["value"]
        rendered_elems = [
            "\"" + elem + "\"" if isinstance(elem, str) else str(elem)
            for elem in primitive_array
        ]
        # join() also renders an empty array correctly as "[]".
        expected_val = "[" + ", ".join(rendered_elems) + "]"
        actual_val = CodeGenerationUtils.handle_parameter(
            create_parameter_for_primitive_array,
            create_accompanying_arg_for_parameter)

        # Compare the whole strings. An element-wise check over zip() stops
        # at the shorter operand, so an empty or truncated result would pass.
        # Assumes handle_parameter returns a str -- confirm if this fails.
        assert actual_val == expected_val
Esempio n. 30
0
def generate_code(args):
    """Emit code that adds rolling-statistics columns to a dataframe.

    The node must have exactly one incoming dataframe and no incoming model.
    All settings are read from
    ``node["parameters"]["rolling_stats_info"]["value"]``:

    * ``first_argument`` / ``second_argument``: each has ``input_cols`` and
      an ``operation``; the operation ``'Identity'`` uses the column as is,
      any other operation is applied over a window.
    * ``between_operation``: operator placed between the two arguments;
      ``'Identity'`` means only the first argument is used.
    * ``output_cols``, ``lags``, ``partitioning_column``,
      ``ordering_column``, ``ordering_direction``.

    The generated code has this shape (the inner loop also iterates
    ``second_cols`` when a between operation is set)::

        df_<id>=<input df>
        first_cols = ...
        output_cols = ...
        lags = ...
        for lag in lags:
            for col_1,out_col in zip(first_cols, output_cols):
                df_<id> = df_<id>.selectExpr('*', <expr> as <out_col><lag>)

    Returns a ``(final_code, shared_function_set, error)`` tuple;
    ``final_code`` is empty when the incoming-edge check reports an error.
    """
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if error != ErrorTypes.NO_ERROR:
        return final_code, shared_function_set, error

    # The source dataframe may be one portion of a multi-output node.
    incoming = extra["dfs"][0]
    df_name = "df_" + incoming["source_id"]
    if "portion" in incoming:
        df_name += "[" + str(incoming["portion"]) + "]"

    my_args = {
        "node_id": node["id"],
        "input_dfs": [df_name],
        "shared_function_set": shared_function_set,
        "additional_local_code": additional_local_code,
        "errors": errors
    }

    stats_info = node["parameters"]["rolling_stats_info"]["value"]
    out_df = "df_" + node["id"]
    gen_code = [out_df + "=" + df_name, os.linesep]

    between_operation = stats_info["between_operation"]["value"]

    first_argument = stats_info["first_argument"]["value"]
    first_argument_input_cols = CodeGenerationUtils.handle_parameter(
        first_argument["input_cols"], my_args)
    first_argument_operation = first_argument["operation"]["value"]
    gen_code.extend(
        ["first_cols = " + first_argument_input_cols, os.linesep])

    output_cols = CodeGenerationUtils.handle_parameter(
        stats_info["output_cols"], my_args)
    gen_code.extend(["output_cols = " + output_cols, os.linesep])

    partitioning_column = stats_info["partitioning_column"]["value"]
    ordering_column = stats_info["ordering_column"]["value"]
    ordering_direction = stats_info["ordering_direction"]["value"]

    lags_str = CodeGenerationUtils.handle_parameter(
        stats_info["lags"], my_args)

    # Window clause embedded inside a single-quoted string of the generated
    # code; the "'+ str(lag) +'" part breaks out of that string so the
    # generated loop variable ``lag`` sets the frame size at run time.
    window_str = ("over (partition by " + partitioning_column +
                  " order by " + ordering_column + " " + ordering_direction +
                  " rows '+ str(lag) +' preceding) ")

    # Common start and end of every generated selectExpr statement.
    select_head = out_df + " = " + out_df + ".selectExpr('*', "
    # The alias uses the generated loop variable ``out_col``. A literal
    # "out_col" alias would give every input column the same output name
    # for a given lag and ignore the configured output columns.
    alias_tail = "as ' + out_col + str(lag))"

    if between_operation != 'Identity':
        second_argument = stats_info["second_argument"]["value"]
        second_argument_input_cols = CodeGenerationUtils.handle_parameter(
            second_argument["input_cols"], my_args)
        second_argument_operation = second_argument["operation"]["value"]
        gen_code.extend(
            ["second_cols = " + second_argument_input_cols, os.linesep])

        loop_str = "for col_1,col_2,out_col in zip(first_cols, second_cols, output_cols):"
        if first_argument_operation == 'Identity':
            if second_argument_operation == 'Identity':
                select_str = (select_head + "col_1 + ' " +
                              between_operation + " '+ col_2 + ' " +
                              alias_tail)
            else:
                select_str = (select_head + "col_1 + ' " +
                              between_operation + " ' + '" +
                              second_argument_operation + "(' + col_2 + ') " +
                              window_str + alias_tail)
        else:
            if second_argument_operation == 'Identity':
                select_str = (select_head + "'" + first_argument_operation +
                              "(' + col_1 + ') " + window_str +
                              between_operation + " ' + col_2 + ' " +
                              alias_tail)
            else:
                select_str = (select_head + "'" + first_argument_operation +
                              "(' + col_1 + ') " + window_str +
                              between_operation + " " +
                              second_argument_operation + "(' + col_2 + ') " +
                              window_str + alias_tail)
    else:
        loop_str = "for col_1,out_col in zip(first_cols, output_cols):"
        if first_argument_operation == 'Identity':
            select_str = select_head + "col_1 + ' " + alias_tail
        else:
            select_str = (select_head + "'" + first_argument_operation +
                          "(' + col_1 + ') " + window_str + alias_tail)

    gen_code.extend(["lags = " + lags_str, os.linesep])
    gen_code.extend(["for lag in lags:", os.linesep])

    # Tabs indent the generated nested loops.
    gen_code.extend(["\t", loop_str, os.linesep])
    gen_code.extend(["\t\t" + select_str, os.linesep])

    final_code = CodeGenerationUtils.merge_with_additional_code(
        gen_code, additional_local_code)

    return final_code, shared_function_set, error