Example #1
def ipy_imports(launch_spark=False):
    from xutil.helpers import (log, get_exception_message, get_error_str,
                               get_profile)
    from xutil.database.base import get_conn
    from xutil.diskio import (write_csv, read_csv, read_file, write_file,
                              get_hdfs)
    from jmespath import search
    from pathlib import Path
    from collections import Counter, namedtuple, OrderedDict
    import time, datetime
    import argparse, sys, types

    if launch_spark:
        from xutil.database.spark import Spark
        parser = argparse.ArgumentParser(description='Spark IPython')
        parser.add_argument('--master',
                            help='Master string for Spark Instance')
        parser.add_argument('--profile',
                            help='Database profile name from PROFILE_YAML')
        args = parser.parse_args()
        dbs = get_profile(create_if_missing=True)['databases']
        if args.profile and args.profile in dbs and dbs[
                args.profile]['type'].lower() in ('hive', 'spark'):
            conn = get_conn(args.profile)
            globals()['sparko'] = conn.sparko
        elif args.profile:
            log(
                Exception('Profile {} not found or incompatible.'.format(
                    args.profile)))
            sys.exit(1)
        else:
            globals()['sparko'] = Spark(master=args.master)
        globals()['sc'] = sparko.sc
        globals()['spark'] = sparko.spark

    ldict = locals()
    for name in ldict:
        var = ldict[name]
        if callable(var) or isinstance(var, types.ModuleType):
            # is a function, class, or module
            globals()[name] = var
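The helper above promotes everything it imported into the global (interactive) namespace. A minimal usage sketch, assuming it is called from an IPython startup script; the path in the comment is the conventional IPython location, not one stated in the source:

# e.g. placed in ~/.ipython/profile_default/startup/00-xutil.py (illustrative path)
ipy_imports()                     # expose log, get_conn, write_csv, etc. as globals
# ipy_imports(launch_spark=True)  # additionally parse --master/--profile and expose sc/spark/sparko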
Example #2
def create_profile():
    "Create profile.yaml if it does not exists"
    from xutil.helpers import get_profile
    get_profile(create_if_missing=True)
    log('+YAML Profile located @ {}'.format(os.environ['PROFILE_YAML']))
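A brief usage sketch for the helper above, assuming get_profile honors a pre-set PROFILE_YAML environment variable; the path is illustrative, not taken from the source:

import os

os.environ.setdefault('PROFILE_YAML', os.path.expanduser('~/profile.yaml'))  # assumed location
create_profile()  # creates the YAML profile at PROFILE_YAML if it is missing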
Example #3
  def __init__(self,
               app_name=None,
               master=None,
               conf={},
               spark_home=None,
               restart=False,
               hive_enabled=False,
               config_name=socket.gethostname().lower(),
               prog_handler=None,
               log=log):

    # restart = True if version != self.version else restart
    conf = dict(conf)  # copy so the shared default dict is never mutated across instances
    if os.getenv('KLOG'): os.system('bash $KLOG')  # Kerberos login
    spark_home = self.set_sparkenv(spark_home)

    from pyspark import SparkContext, SQLContext, SparkConf
    from pyspark.sql import SparkSession
    active_sc = SparkContext._active_spark_context

    if active_sc:
      log("Active SC ->> " + active_sc.appName)
      sc = active_sc
      spark = SparkSession(sc)
    else:
      sc = None
      spark = None

    if sc and restart:
      log('~Stopping Spark Instance ({})'.format(sc.appName))
      try:
        ps_data = {p.pid: p for p in psutil.process_iter() if p.cmdline()}
        child_pid = ps_data[os.getpid()].children()[0].pid
        if not hive_enabled:
          os.system('kill -9 ' + str(child_pid))
          SparkContext._gateway = None
      except Exception:
        print(get_exception_message())

      sc.stop()
      sc = None

      # sc = sc.getOrCreate()

    profile = get_profile()
    if profile:
      conf_def = profile['spark-conf']
      if 'spark-conf-name' in profile:
        if config_name in profile['spark-conf-name']:
          # overwrite the default spark-conf
          for key in profile['spark-conf-name'][config_name]:
            conf_def[key] = profile['spark-conf-name'][config_name][key]
    else:
      conf_def = {
        "spark.master": "local[4]",
        "spark.driver.memory": "5g",
        "spark.driver.maxResultSize": "2g",
        "spark.driver.cores": "1",
        "spark.executor.instances": "4",
        "spark.executor.cores": "4",
        "spark.sql.broadcastTimeout": 900,
        # "spark.sql.tungsten.enabled": "true",
        "spark.io.compression.codec": "snappy",
        "spark.rdd.compress": "true",
        "spark.streaming.backpressure.enabled": "true",
        "spark.sql.parquet.compression.codec": "snappy",
      }

    # set extraClassPath
    conf_def["spark.driver.extraClassPath"] = self._get_jar_paths(profile)
    if 'SPARK_CLASSPATH' in os.environ and os.environ['SPARK_CLASSPATH']:
      conf_def["spark.driver.extraClassPath"] = conf_def["spark.driver.extraClassPath"] + ':' + os.environ['SPARK_CLASSPATH']
      del os.environ['SPARK_CLASSPATH']

    if master: conf['spark.master'] = master
    if hive_enabled: conf["spark.sql.catalogImplementation"] = "hive"

    for c in conf_def:
      conf.setdefault(c, conf_def[c])

    # Launch Spark Instance
    version = self.get_spark_version(spark_home)

    app_name = app_name if app_name else 'Spark_{}_{}_{}'.format(
      str(version).replace('.', ''), os.getenv('USER'), os.getpid())

    if not sc:
      log('Starting Spark Instance ({}) with version {} / {}'.format(
        app_name, version, conf['spark.master']))
      sc, spark, proc = self.init_spark(app_name, spark_home, hive_enabled, conf, restart, prog_handler)
      self.proc = proc

    self.hive_enabled = hive_enabled
    self.version = version
    self.sc = sc
    self.uiWebUrl = sc.uiWebUrl
    self.local_uiWebUrl = 'http://{}:{}'.format(socket.gethostname(), sc.uiWebUrl.split(':')[-1])
    self.spark = spark
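A hedged usage sketch for the constructor above; the app name, master and memory value are illustrative, not defaults from the source:

sparko = Spark(
    app_name='adhoc_analysis',           # hypothetical name
    master='local[2]',
    conf={'spark.driver.memory': '2g'},  # merged over the profile / built-in defaults
    hive_enabled=False,
)
sc = sparko.sc          # underlying SparkContext
spark = sparko.spark    # SparkSession
print(sparko.uiWebUrl)  # Spark UI URL recorded at the end of __init__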
Example #4
def get_conn(db,
             dbs=None,
             echo=True,
             reconnect=False,
             use_jdbc=False,
             conn_expire_min=10,
             spark_hive=False) -> DBConn:
    global conns

    dbs = dbs if dbs else get_databases()
    profile = get_profile()
    db_dict = struct(dbs[db])

    if db_dict.type.lower() == 'hive' and spark_hive:
        db_dict.type = 'spark'

    use_jdbc = bool(use_jdbc or ('use_jdbc' in db_dict and db_dict['use_jdbc']))

    if db in conns and not reconnect:
        if (now() -
                conns[db].last_connect).total_seconds() / 60 < conn_expire_min:
            return conns[db]

    if use_jdbc:
        log('*USING JDBC for ' + db)
        from .jdbc import JdbcConn
        conn = JdbcConn(db_dict, profile=profile)

    elif db_dict.type.lower() == 'oracle':
        from .oracle import OracleConn
        conn = OracleConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'spark':
        from .spark import SparkConn
        conn = SparkConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'hive':
        from .hive import HiveConn, Beeline
        if 'use_beeline' in db_dict and db_dict.use_beeline:
            conn = Beeline(db_dict, echo=echo)
        else:
            conn = HiveConn(db_dict, echo=echo)

    elif db_dict.type.lower() in ('postgresql', 'redshift'):
        from .postgresql import PostgreSQLConn
        conn = PostgreSQLConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'sqlserver':
        from .sqlserver import SQLServerConn
        conn = SQLServerConn(db_dict, echo=echo)

    elif db_dict.type.lower() == 'sqlite':
        from .sqlite import SQLiteConn
        conn = SQLiteConn(db_dict, echo=echo)
    else:
        raise Exception(f'Type {db_dict.type} not handled!')

    conns[db] = conn
    return conn
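A hedged usage sketch for get_conn; 'PG1' and 'HIVE1' are hypothetical keys that would need to exist under the profile's databases section:

conn = get_conn('PG1')                          # returns and caches the matching DBConn subclass
conn = get_conn('PG1', reconnect=True)          # bypass the cache and reconnect
hive_conn = get_conn('HIVE1', spark_hive=True)  # treat a 'hive' entry as 'spark' (SparkConn)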
Example #5
WORKER_PREFIX = os.getenv('DBNET_WORKER_PREFIX', default='dbnet')
WEBAPP_HOST = os.getenv('DBNET_WEBAPP_HOST', default='0.0.0.0')
WEBAPP_PORT = int(os.getenv('DBNET_WEBAPP_PORT', default=5566))
DBNET_FOLDER = os.getenv('DBNET_FOLDER', default=get_home_path() + '/dbnet')
MAX_WORKER_PER_DB = int(os.getenv('DBNET_MAX_WORKER_PER_DB', default=3))
DBNET_DB_URL = os.getenv('DBNET_DB_URL')

os.makedirs(DBNET_FOLDER, exist_ok=True)

hostname = socket.gethostname()
workers = OrderedDict()
db_workers_map = OrderedDict()
conf_queue = Queue()
exit_queue = Queue()
profile = get_profile(create_if_missing=True,
                      def_profl_path=f'{DBNET_FOLDER}/profile.yaml')
databases = get_databases(profile)
print(f'profile `{os.getenv("PROFILE_YAML")}` databases -> {list(databases)}')


def start_worker_webapp():
    """Starts the WebApp worker"""
    worker_name = '{}-webapp'.format(WORKER_PREFIX)

    worker = Worker(worker_name,
                    'web-app',
                    fn=webapp_worker.run,
                    log=log,
                    kill_if_running=True,
                    args=(WEBAPP_HOST, WEBAPP_PORT),
                    kwargs={'mon_worker': workers['mon']},
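The module above configures itself from environment variables at import time, so a launcher would export them before importing it. A minimal sketch with illustrative values (the variable names are the ones read above; the values are assumptions):

import os

os.environ['DBNET_WEBAPP_HOST'] = '127.0.0.1'  # default: 0.0.0.0
os.environ['DBNET_WEBAPP_PORT'] = '8080'       # default: 5566
os.environ['DBNET_MAX_WORKER_PER_DB'] = '2'    # default: 3
# DBNET_FOLDER defaults to <home>/dbnet; DBNET_DB_URL has no default.
# The worker module above would then be imported and started after these are set.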