Example #1
def run(**kwargs):
    # Entry point: dispatch to a local debug run or a remote Livy run depending
    # on the SPARK_WORKER_DEBUG environment variable (defaults to "1" = local).
    session = BaseService.initialize_maintenance_objects()
    config = DatabaseConfig.current_config(session)
    spark_worker_debug = os.environ.get("SPARK_WORKER_DEBUG", "1")
    if spark_worker_debug == "1":
        LivyRunner.local_run(**kwargs)
    else:
        LivyRunner.online_run(**kwargs)
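A minimal invocation sketch for this dispatcher; the keyword arguments are assumptions borrowed from the LivyRunner.local_run/online_run signatures in Examples #5 and #6, and the task name, script path, and args are placeholders.

# Hypothetical call; SPARK_WORKER_DEBUG=1 (the default) routes it through local_run.
os.environ["SPARK_WORKER_DEBUG"] = "1"
run(name="retailer_daily_import",
    file="process/spark_task/daily_import.py",
    args=["--org-code", "acme"])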
Example #2
def load_streaming_context(ssc, name, topics):
    # type: (StreamingContext, str, List[str]) -> DStream
    # Build a direct Kafka stream using the bootstrap servers stored in the
    # database-backed configuration.
    maintenance_session = BaseService.initialize_maintenance_objects()
    database_config = DatabaseConfig.current_config(maintenance_session)
    bootstrap_servers = database_config["kafka"]["bootstrap_servers"]
    return KafkaUtils.createDirectStream(
        ssc=ssc,
        kafkaParams={"bootstrap.servers": bootstrap_servers},
        topics=topics)
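A hedged sketch of driver code that could consume this helper, assuming the spark-streaming-kafka-0-8 integration that provides KafkaUtils.createDirectStream; the app name, topic, and batch interval are placeholders.

# Hypothetical driver; names and intervals are illustrative only.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="retailer_stream")
ssc = StreamingContext(sc, 10)  # 10-second micro-batches
stream = load_streaming_context(ssc, "retailer_stream", ["retailer_events"])
stream.pprint()                 # inspect incoming Kafka records
ssc.start()
ssc.awaitTermination()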
Example #3
# coding=utf-8

import os
import sys
import re

sys.path.append(os.path.dirname(__file__))

from airflow import DAG
from process.airflow_task.dag_creator import create_retailer_dag
from process.model.retailer_config import RetailerConfig, DatabaseConfig
from process.service.base_service import BaseService

maintenance_session = BaseService.initialize_maintenance_objects()
retailer_configs = maintenance_session.query(RetailerConfig) \
    .filter(RetailerConfig.activated.is_(True)).all()

action = None
dag_name = None
detected_org_code = None
ss_re = re.compile(
    "^z_retailer_sync_(.*)_(ss|month|dr|stock|_spark_import|_data_deal)$")
skip_all_retailer_dags = False

if len(sys.argv) >= 3:
    action = sys.argv[1]
    dag_name = sys.argv[2]

    if action == "run" or action == "test":
        skip_all_retailer_dags = not dag_name.startswith("z_retailer_sync_")
        matched_org_codes = ss_re.findall(dag_name)
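For illustration, the ss_re pattern captures the org code and the sync type from a DAG id; the DAG name below is made up to match the expected z_retailer_sync_ prefix.

# Illustration only (hypothetical DAG id):
ss_re.findall("z_retailer_sync_acme_ss")
# -> [("acme", "ss")]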
Example #4
    def maintenance_session(self) -> Session:
        if self._maintenance_session is None:
            self._maintenance_session = BaseService.initialize_maintenance_objects()

        return self._maintenance_session
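A minimal sketch of the kind of class this lazily-initialized property presumably belongs to; the class name, the __init__ method, and the sqlalchemy Session import are assumptions.

# Hypothetical enclosing class; only the lazy-initialization pattern comes from
# the snippet above.
from sqlalchemy.orm import Session

class MaintenanceAware:
    def __init__(self):
        self._maintenance_session = None   # created on first access

    @property
    def maintenance_session(self) -> Session:
        if self._maintenance_session is None:
            self._maintenance_session = BaseService.initialize_maintenance_objects()
        return self._maintenance_session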
Example #5
    def online_run(name,
                   file,
                   args,
                   py_files=(),
                   skip_log=False,
                   spark_cpu_size=2,
                   record_id=False,
                   executor_memory="6g",
                   driver_memory="8g",
                   memory_overhead="2048"):
        # Submit the script as a Livy batch job and poll its state until it
        # stops reporting "running".
        session = BaseService.initialize_maintenance_objects()
        config = DatabaseConfig.current_config(session)
        env_prefix = config.get("prefix", None)
        livy_prefix = config["livy"]["address"]
        task_name = name_with_prefix(env_prefix, name)
        if "fast" in task_name and env_prefix is None:
            livy_prefix = "127.0.0.1:8998"

        # LivyRunner.stop_if_exists(task_name, livy_prefix, session)
        # target_file_name, target_zp_file_name = HdfsScriptUploader.upload(name, file, config)
        target_file_name = HdfsScriptUploader.upload(name, file, config)
        data = {
            "name": task_name,
            "file": target_file_name,
            # "args": [str(env_prefix), target_zp_file_name, json.dumps(config)] + args,
            "args": [str(env_prefix), json.dumps(config)] + args,
            "conf": {
                "spark.cores.max": str(spark_cpu_size),
                "spark.sql.warehouse.dir": "/user/hive/warehouse",
                "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version":
                "2",
                "spark.executor.memory": executor_memory,
                "spark.driver.memory": driver_memory,
                "spark.mesos.executor.memoryOverhead": memory_overhead
            }
        }

        r = requests.post("http://{0}/batches".format(livy_prefix), json=data)
        result = r.json()
        print(result)
        batch_id = result["id"]
        print("record_id: {0}".format(record_id))
        if record_id:
            LivyStreamingTask.record_livy_streaming_task(
                session, batch_id, task_name)

        state = "running"
        while state == "running":
            sleep_time = 1
            if skip_log is True:
                sleep_time = 30

            time.sleep(sleep_time)
            r = requests.get("http://{0}/batches/{1}/state".format(
                livy_prefix, batch_id))
            batch_result = r.json()
            print(batch_result)
            state = batch_result["state"]

        if skip_log is False:
            log_request = requests.get("http://{0}/batches/{1}/log".format(
                livy_prefix, batch_id),
                                       params={
                                           "from": 0,
                                           "size": 100000
                                       })
            logs = log_request.json()["log"]
            if logs is not None:
                for l in logs:
                    print("livy log: {0}".format(l))

        if state != "success":
            raise RuntimeError(
                "Livy job ran into state: {0}, and batch id is: {1}".format(
                    state, batch_id))
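A hedged example of calling this method directly; the task name, script path, and arguments are placeholders, and the resource settings simply override the defaults shown in the signature.

# Hypothetical submission; all names and paths are placeholders.
LivyRunner.online_run(
    name="retailer_daily_import",
    file="process/spark_task/daily_import.py",
    args=["--org-code", "acme"],
    skip_log=False,          # poll every second and dump the Livy log afterwards
    spark_cpu_size=4,
    executor_memory="8g")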
Example #6
    def local_run(name,
                  file,
                  args,
                  py_files=(),
                  skip_log=False,
                  spark_cpu_size=2,
                  record_id=False,
                  executor_memory="6g",
                  driver_memory="8g",
                  memory_overhead="2048"):
        # Debug path: execute the script inside a local SparkSession instead of
        # submitting it to Livy.
        session = BaseService.initialize_maintenance_objects()
        config = DatabaseConfig.current_config(session)
        env_prefix = config.get("prefix", None)
        task_name = name_with_prefix(env_prefix, name)
        hive_ware_path = name_with_prefix(env_prefix, "hive")

        spark = SparkSession \
            .builder \
            .master("local[8]") \
            .appName(task_name) \
            .config("spark.driver.memory", "4G") \
            .config("spark.executor.cores", 8) \
            .config("spark.executor.memory", "8G") \
            .config("spark.sql.warehouse.dir", "/user/{0}/warehouse".format(hive_ware_path)) \
            .config("spark.network.timeout", "360000") \
            .config("spark.executor.heartbeatInterval", "360000") \
            .config("spark.sql.pivotMaxValues", "200000") \
            .enableHiveSupport() \
            .getOrCreate()
        code_dir = os.path.dirname(__file__)
        globals()["spark"] = spark
        first_argv = sys.argv[0]
        all_argv = [first_argv,
                    str(env_prefix), code_dir,
                    json.dumps(config)] + args
        sys.argv.clear()
        for a in all_argv:
            sys.argv.append(a)
        print(file)

        root_path = os.path.dirname(__file__)
        os.makedirs(os.path.join(root_path, "local_run"), exist_ok=True)
        base_file_name = os.path.basename(file)
        only_name = base_file_name.replace(".py", "")
        script_prefix = "def {0}():\r\n".format(only_name)

        target_local_run_file = os.path.join(root_path, "local_run",
                                             base_file_name)
        with open(target_local_run_file, "w", encoding="utf8") as f:
            f.write(script_prefix)

            with open(file, "r", encoding="utf8") as code:
                for line in code:
                    if len(line.strip()) == 0:
                        # blank line: keep the wrapper's indentation only
                        f.write("   \r\n")
                    else:
                        f.write("   {0}".format(line))

        # Import the generated wrapper module and call the function named after
        # the original script.
        module = importlib.import_module("local_run.{0}".format(only_name))
        print(getattr(module, only_name))
        getattr(module, only_name)()
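For completeness, a hedged local invocation mirroring the debug branch in Example #1; the task name and script path are placeholders.

# Hypothetical debug run; SPARK_WORKER_DEBUG=1 in Example #1 ends up here.
LivyRunner.local_run(
    name="retailer_daily_import",
    file="process/spark_task/daily_import.py",
    args=["--org-code", "acme"])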