def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {"node_id": node["id"], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors}
            # Must be a valid schema at this point.
            additional_code, param_string = CodeGenerationUtils.handle_parameter(node["parameters"]["schema"], my_args)
            gen_code = []
            gen_code.extend(additional_code)

            gen_code.append("df_" + node["id"] + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ')
            gen_code.append(CodeGenerationUtils.handle_primitive(node["parameters"]["host"]["value"] + ":" + node["parameters"]["port"]["value"]) + ")")
            gen_code.append('.option("subscribe", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["topic"]["value"] + ")"))
            gen_code.append('.load().select(from_json(col("value").cast("string"), '+ param_string +")")
            # For streams, keep the timestamp so it can be used as a key when writing back to a Kafka topic if needed.
            gen_code.extend(['.alias("value"), "timestamp").select("value.*", "timestamp")', os.linesep])

            final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)

    return final_code, shared_function_set, error
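Joined in order, these fragments form one long readStream line. A minimal sketch of what the emitted snippet might look like for a hypothetical node with id "1", host "localhost", port "9092" and topic "events", assuming CodeGenerationUtils.handle_primitive returns a quoted string literal, that the generated script's preamble provides a SparkSession named spark plus from_json and col from pyspark.sql.functions, and where schema_1 stands in for the schema expression returned as param_string:

# Illustrative emitted code (the generator writes it as a single line):
df_1 = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "events").load().select(from_json(col("value").cast("string"), schema_1).alias("value"), "timestamp").select("value.*", "timestamp")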
def __create_bash_operator(task_id, args):
    op_task_id = args["app_id"] + "_" + task_id
    return [
        'Task_' + op_task_id + ' = BashOperator(task_id=' +
        CodeGenerationUtils.handle_primitive(op_task_id) + ", bash_command='" +
        args["bash_command"] + ' ' +
        CodeGenerationUtils.handle_primitive(op_task_id) + " ', dag=dag)"
    ]
def __create_spark_operator(task_id, args):
    op_task_id = args["app_id"] + "_" + task_id
    op_name = 'Task_' + op_task_id
    operator_args_str = str(args["spark_operator_conf"])
    script_path = os.path.join(args["code_base_path"], op_task_id + '.py')
    return [
        "operator_args = " + operator_args_str, os.linesep,
        op_name + ' = SparkSubmitOperator(task_id=' +
        CodeGenerationUtils.handle_primitive(op_task_id) + ', application=' +
        CodeGenerationUtils.handle_primitive(script_path) +
        ', dag=dag, **operator_args)'
    ]
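Assuming handle_primitive returns a quoted string literal, these two helpers emit plain Airflow operator definitions. A hedged sketch for a hypothetical app_id "myapp", task_id "t1", code_base_path "/dags/scripts" and an illustrative spark_operator_conf (all values made up):

# Illustrative emitted lines (operator_args content depends on args["spark_operator_conf"]):
operator_args = {'conn_id': 'spark_default'}
Task_myapp_t1 = SparkSubmitOperator(task_id="myapp_t1", application="/dags/scripts/myapp_t1.py", dag=dag, **operator_args)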
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        # Assign the streaming query so the awaitTermination() fragment below can reference it.
        code.append(
            "query_" + node["id"] + " = " + df_name + '.selectExpr("CAST(' +
            node["parameters"]["unique_column_name"]["value"] +
            ' AS STRING) AS key", "to_json(struct(*)) AS value").writeStream.format("kafka").option("kafka.bootstrap.servers", '
        )
        code.append(
            CodeGenerationUtils.handle_primitive(
                node["parameters"]["host"]["value"] + ":" +
                node["parameters"]["port"]["value"]) + ")")
        code.append(".trigger(" + __generate_trigger_code(node) + ")")
        code.append('.option("topic", ' + CodeGenerationUtils.handle_primitive(
            node["parameters"]["topic"]["value"]) + ")")
        code.append('.option("checkpointLocation", ' +
                    CodeGenerationUtils.handle_primitive(
                        node["parameters"]["checkpointLocation"]["value"]) +
                    ").start()")
        code.extend([
            os.linesep, "query_" + node["id"], ".awaitTermination()",
            os.linesep
        ])

        args["additional_info"]["written_topics"].append({
            "topic_name":
            node["parameters"]["topic"]["value"],
            "host":
            node["parameters"]["host"]["value"],
            "port":
            node["parameters"]["port"]["value"]
        })

    return code, shared_function_set, error
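A hedged sketch of the emitted snippet for a hypothetical node with id "2" fed by df_1, unique_column_name "event_id", host "localhost", port "9092", topic "out", checkpointLocation "/tmp/chk" and a once trigger (handle_primitive assumed to return a quoted string literal; the .trigger(...) call is spliced in between the two .option(...) calls, which the DataStreamWriter builder allows):

# Illustrative emitted code (written as a single line followed by the awaitTermination call):
query_2 = df_1.selectExpr("CAST(event_id AS STRING) AS key", "to_json(struct(*)) AS value").writeStream.format("kafka").option("kafka.bootstrap.servers", "localhost:9092").trigger(once=True).option("topic", "out").option("checkpointLocation", "/tmp/chk").start()
query_2.awaitTermination()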
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    shared_function_set = set()
    errors = []
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        shared_function_set.add(SharedFunctionTypes.VECTOR_DISASSEMBLER)
        code = [
            "df_" + node["id"] + " = " + "vector_disassembler(" + df_name +
            ", " + CodeGenerationUtils.handle_primitive(
                node["parameters"]["vector_column"]["value"]) + ")", os.linesep
        ]

    return code, shared_function_set, error
def __generate_trigger_code(node):
    trigger_type = node["parameters"]["trigger_type"]["value"]
    if (trigger_type == "once"):
        return "once=True"
    else:
        return trigger_type + "=" + CodeGenerationUtils.handle_primitive(
            node["parameters"]["trigger_value"]["value"])
def __handle_expression_string_only(node, args):
    code = ["df_" + args["node_id"] + " = " + args["input_dfs"][0], os.linesep]
    code.extend([
        "df_" + args["node_id"] + " = " + "df_" + args["node_id"] + "." +
        node["sql_name"] + "(" + CodeGenerationUtils.handle_primitive(
            node["parameters"]["expression"]["value"]) + ")", os.linesep
    ])
    return code
def __arg_dict_to_string(args):
    # Assumes each datetime argument is a string matching the pre-defined datetime format.
    code = ["{"]
    for arg in args:
        if (arg in __set_of_datetime_arguments):
            code.extend([
                CodeGenerationUtils.handle_primitive(arg), ": ",
                'datetime.strptime("' + args[arg] + '", "' +
                __datetime_format + '")', ", "
            ])
        else:
            code.extend([
                CodeGenerationUtils.handle_primitive(arg), ": ",
                CodeGenerationUtils.handle_primitive(args[arg]), ", "
            ])
    if (len(args) > 0):
        code.pop()
    code.append("}")
    return ''.join(code)
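A hedged illustration of the resulting dict literal, assuming __set_of_datetime_arguments contains "start_date", __datetime_format is "%d/%m/%Y" and handle_primitive double-quotes strings (all values made up):

# __arg_dict_to_string({"start_date": "01/01/2021", "owner": "airflow"}) would yield:
# {"start_date": datetime.strptime("01/01/2021", "%d/%m/%Y"), "owner": "airflow"}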
def __handle_in_op_out_trio(node, args):
    in_op_out_trio_list = node["parameters"]["in_op_out_trio_list"]["value"]

    code = ["df_" + args["node_id"] + " = " + args["input_dfs"][0], os.linesep]
    code.extend(
        ["df_" + args["node_id"] + " = " + "df_" + args["node_id"] + " .agg("])
    for elem in in_op_out_trio_list:
        code.extend([
            "F." + elem["operation"]["value"] + "(" +
            CodeGenerationUtils.handle_primitive(elem["input_column"]["value"])
            + ").alias(" + CodeGenerationUtils.handle_primitive(
                elem["output_column"]["value"]) + ")", ", "
        ])

    # Assumes at least one aggregation trio is present; this should be validated and reported as an error before reaching this point.
    code.pop()
    code.extend([")", os.linesep])

    return code
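For a hypothetical node id "3" with input df_2 and a single trio whose input_column, operation and output_column values are "price", "avg" and "avg_price" (handle_primitive assumed to double-quote strings), the emitted lines would read roughly:

# Illustrative emitted code:
df_3 = df_2
df_3 = df_3.agg(F.avg("price").alias("avg_price"))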
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        final_code = [
            "correlation_" + node["id"] + " = " + "Correlation.corr(" +
            df_name + ", " + CodeGenerationUtils.handle_primitive(
                node["parameters"]["column"]["value"]) + ", " +
            CodeGenerationUtils.handle_primitive(
                node["parameters"]["method"]["value"]) + ")", os.linesep
        ]
        final_code.extend([
            "result_array_" + node["id"] + " = ",
            "correlation_" + node["id"] + ".head()[0].toArray()",
            os.linesep
        ])
        # In the future, dynamically name columns according to an appropriate convention...
        final_code.extend([
            "df_" + node["id"] + " = sc.parallelize(" + "result_array_" +
            node["id"] + ")",
            ".map(lambda x: [float(i) for i in x]).toDF()", os.linesep
        ])

    return final_code, shared_function_set, error
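A hedged sketch of the three emitted statements for a hypothetical node id "4" fed by df_3, with column "features" and method "pearson" (Correlation is expected to come from pyspark.ml.stat and sc to be the SparkContext in the generated script's preamble; handle_primitive assumed to double-quote strings):

# Illustrative emitted code:
correlation_4 = Correlation.corr(df_3, "features", "pearson")
result_array_4 = correlation_4.head()[0].toArray()
df_4 = sc.parallelize(result_array_4).map(lambda x: [float(i) for i in x]).toDF()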
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format(" + CodeGenerationUtils.handle_primitive(node["file_type"]) + ")")
        code.append(".trigger("+ __generate_trigger_code(node) +")")
        code.append('.option("path", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["path"]["value"]) + ")")
        code.append('.option("checkpointLocation", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["checkpointLocation"]["value"]) + ").start()")
        code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep])

        args["additional_info"]["written_tables"].append({"table_path": node["parameters"]["path"]["value"]})

    return code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {
                "node_id": node["id"],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            if (is_schema_appropriate):
                gen_code = CodeGenerationUtils.handle_instantination_or_call(
                    node["parameters"], "df_" + node["id"] + "=" +
                    "spark.read." + node["file_type"] + "(", my_args)
            else:
                # For safety, drop any existing schema entry; reconsider this behavior later.
                if ("schema" in node["parameters"]):
                    del node["parameters"]["schema"]

                if (node["can_infer_schema"]):
                    node["parameters"]["inferSchema"] = {
                        "value": True,
                        "type": "boolean"
                    }

                gen_code = CodeGenerationUtils.handle_instantination_or_call(
                    node["parameters"],
                    "df_" + node["id"] + "=" + "spark.read.format(" +
                    CodeGenerationUtils.handle_primitive(node["file_type"]) +
                    ").load(", my_args)

            final_code = CodeGenerationUtils.merge_with_additional_code(
                gen_code, additional_local_code)

    return final_code, shared_function_set, error
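Depending on the schema check, this generator emits either a typed reader call or a generic format().load() call. The exact keyword rendering is delegated to handle_instantination_or_call, so the sketches below are assumptions for a hypothetical node id "5" with file_type "csv" and a path parameter of "/data/input.csv":

# Illustrative emitted code, schema considered appropriate (schema_5 is hypothetical):
#   df_5=spark.read.csv(path="/data/input.csv", schema=schema_5)
# Illustrative emitted code, fallback branch with schema inference:
#   df_5=spark.read.format("csv").load(path="/data/input.csv", inferSchema=True)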
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {2}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        df_names = __get_dfs_to_join(extra)
        code.extend([
            "df_" + node["id"] + "=" + df_names[0] + ".join(" + df_names[1] +
            ", " + CodeGenerationUtils.handle_primitive(
                node["parameters"]["join_column"]["value"]) + ")", os.linesep
        ])

    return code, shared_function_set, error
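For a hypothetical node id "6" joining df_1 and df_2 on join_column "id" (handle_primitive assumed to double-quote strings), the emitted line is simply:

# Illustrative emitted code:
df_6=df_1.join(df_2, "id")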
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {1}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):

        model_id = extra["models"][0]["source_id"]

        code = [
            "model_" + model_id + ".save(" +
            CodeGenerationUtils.handle_primitive(
                node["parameters"]["model_path"]["value"]) + ")", os.linesep
        ]

    return code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {
                "node_id": node["id"],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            # Must be a valid schema at this point.
            additional_code, param_string = CodeGenerationUtils.handle_parameter(
                node["parameter"]["schema"], my_args)
            gen_code = []
            gen_code.extend(additional_code)

            gen_code.extend([
                "df_" + node["id"] + ' = spark.readStream.schema(' +
                param_string + ")." + node["file_type"] + "(" +
                CodeGenerationUtils.handle_primitive(
                    node["parameters"]["path"]["value"]) + ")", os.linesep
            ])

            final_code = CodeGenerationUtils.merge_with_additional_code(
                gen_code, additional_local_code)

    return final_code, shared_function_set, error
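A hedged sketch of the emitted line for a hypothetical node id "7" with file_type "csv" and path "/data/stream", where schema_7 stands in for the schema expression returned as param_string (handle_primitive assumed to double-quote strings):

# Illustrative emitted code:
df_7 = spark.readStream.schema(schema_7).csv("/data/stream")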
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }

        gen_code = CodeGenerationUtils.handle_instantination_or_call(
            node["parameters"], df_name + ".write.format(" +
            CodeGenerationUtils.handle_primitive(node["file_type"]) +
            ").save(", my_args)

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

        args["additional_info"]["written_tables"].append(
            {"table_path": node["parameters"]["path"]["value"]})

    return final_code, shared_function_set, error
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        # For now, directly return the test result (as a dataframe) whose single row includes:
        # pValues: DenseVector
        # degreesOfFreedom: list
        # statistics: DenseVector
        final_code = [
            "df_" + node["id"] + " = " + node["parameters"]["test_type"]["value"] +
            ".test(" + df_name + ", " +
            CodeGenerationUtils.handle_primitive(node["parameters"]["features_column"]["value"]) + ", " +
            CodeGenerationUtils.handle_primitive(node["parameters"]["label_column"]["value"]) + ")",
            os.linesep
        ]

    return final_code, shared_function_set, error
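For a hypothetical node id "8" fed by df_7, with test_type "ChiSquareTest", features_column "features" and label_column "label" (ChiSquareTest is expected to come from pyspark.ml.stat in the generated script's preamble; handle_primitive assumed to double-quote strings), the emitted line would be roughly:

# Illustrative emitted code:
df_8 = ChiSquareTest.test(df_7, "features", "label")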