Example no. 1
    def drop_table(self, table, log=log):
        """Drop `table`, tolerating the case where it does not exist.

        Args:
            table: name of the table to drop (interpolated into the
                'core.drop_table' SQL template).
            log: logger callable (defaults to module-level `log`).

        Raises:
            Exception: re-raised when the failure is anything other than
                "table does not exist".
        """
        try:
            sql = self._template('core.drop_table').format(table)
            self._do_execute(sql)
        except Exception:
            message = get_exception_message().lower()
            if self._template('error_filter.table_not_exist') in message:
                # A missing table is not an error for a drop operation.
                if self.echo:
                    log('Table "{}" already dropped.'.format(table))
            else:
                # Bare `raise` preserves the original traceback
                # (whereas `raise E` restarts it from here).
                raise
Example no. 2
    def start_sql(sql, id, limit, options, sid):
        """Execute `sql` for a worker request and send a result payload to the parent.

        NOTE(review): relies on many enclosing-scope names (worker_sql_cache,
        conn, data_dict, database, store, worker, worker_pid, CSV_FOLDER,
        SQL_FOLDER, WEBAPP_PORT, epoch, now, now_minus, now_str, write_csv,
        write_file, get_error_str, send_* email helpers) — presumably bound
        in the enclosing worker scope; confirm against the caller.

        Args:
            sql: SQL text to execute.
            id: request identifier (shadows builtin `id`); only used in the
                error payload — the success payload uses data_dict['id'].
            limit: default row limit; overridden by options['limit'] if set.
            options: request options dict; keys read here include 'limit',
                'meta', 'kwargs', 'special', 'csv', 'email_address', 'name'.
            sid: session id, echoed back in the payload.
        """
        rows = fields = []
        # Extract field names from a record object; supports objects exposing
        # `__fields__` as well as namedtuple-style `_fields`.
        get_fields = lambda r: r.__fields__ if hasattr(r, '__fields__'
                                                       ) else r._fields
        s_t = epoch()  # start timestamp (epoch seconds)
        cache_used = False
        # options['limit'] takes precedence over the `limit` argument.
        limit = int(options['limit']) if 'limit' in options else limit

        try:

            def exec_sql(sql, limit_def=5000):
                """Yield (fields, rows, cache_used) — from cache when valid,
                otherwise from a fresh query that is then cached.

                NOTE(review): this inner `cache_used` shadows the outer one;
                the outer value is only updated via the for-loop unpacking at
                the call site below.
                """
                log('\n------------SQL-START------------\n{}\n------------SQL-END------------ \n'
                    .format(sql),
                    color='blue')
                log('LIMIT: ' + str(limit), color='blue')
                cache_used = False
                if sql in worker_sql_cache:
                    # Iterate over a copy: the 10-minute branch below deletes
                    # the cache entry while looping.
                    for fields, rows in list(worker_sql_cache[sql]['results']):
                        # if limit above limit_def, then refresh
                        if limit > limit_def: break

                        # if limit is same and not a csv call, then refresh
                        if limit == worker_sql_cache[sql][
                                'limit'] and 'csv' not in options:
                            break

                        # if ran more than 10 minutes ago, then refresh
                        if now_minus(minutes=10
                                     ) > worker_sql_cache[sql]['timestamp']:
                            del worker_sql_cache[sql]
                            break

                        if len(fields) > 0:
                            cache_used = True  # must return data/fields
                            worker_sql_cache[sql]['limit'] = limit
                            log('+Cache Used')

                        yield fields, rows, cache_used

                if not cache_used:
                    # Cache miss (or invalidated): run the query and store
                    # the result for subsequent calls.
                    worker_sql_cache[sql] = dict(timestamp=now(),
                                                 results=[],
                                                 limit=limit)
                    # '%' is doubled so the driver does not treat it as a
                    # parameter placeholder.
                    rows = conn.query(
                        sql.replace('%', '%%'),
                        dtype='tuple',
                        limit=limit if limit > limit_def else limit_def)
                    fields = conn._fields
                    worker_sql_cache[sql]['results'].append((fields, rows))
                    yield fields, rows, cache_used

            if 'meta' in options:
                # get_schemas or
                # Metadata request: call the named conn method instead of
                # running SQL.
                meta_func = options['meta']
                rows = getattr(conn, meta_func)(**options['kwargs'])
                rows = [tuple(r) for r in rows]
                fields = conn._fields

            elif 'special' in options:
                # Reserved branch — intentionally a no-op here.
                pass

            else:
                for fields, rows, cache_used in exec_sql(sql):
                    fields, rows = fields, rows  # NOTE(review): no-op line, kept as-is
                    # Trim to the requested limit (cache may hold more rows).
                    rows = rows[:limit] if len(rows) > limit else rows

            if rows == None: rows = []  # NOTE(review): `is None` would be idiomatic

            if 'email_address' in options or 'csv' in options:
                # Write the full result set to a CSV, gzipping when > 20 MB,
                # and expose it via the web app URL.
                file_name = '{}-{}-{}.csv'.format(database, options['name'],
                                                  data_dict['id'])
                file_path = '{}/{}'.format(CSV_FOLDER, file_name)
                write_csv(file_path, fields, rows)
                if os.path.getsize(file_path) > 20 * (1024**2):
                    rc = os.system('gzip -f ' + file_path)
                    # Only switch to the .gz name if gzip succeeded.
                    file_name = file_name + '.gz' if rc == 0 else file_name
                    file_path = '{}/{}'.format(CSV_FOLDER, file_name)

                url = 'http://{base_url}:{port}/csv/{name}'.format(
                    base_url=socket.gethostname(),
                    port=WEBAPP_PORT,
                    name=file_name,
                )
                options['url'] = url

            if 'email_address' in options:
                subj = 'DbNet -- Result for Query {}'.format(data_dict['id'])
                body_text = 'URL: {url}\n\nROWS: {rows}\n\nSQL:\n{sql}'.format(
                    url=url, rows=len(rows), sql=sql)
                to_address = options['email_address']
                # Pick the mail transport from the SMTP_TEMPLATE env var.
                email_template = os.getenv("SMTP_TEMPLATE")
                if 'exchange_server' == email_template:
                    email_func = send_email_exchange
                elif 'outlook' == email_template:
                    email_func = send_from_outlook
                elif 'gmail' == email_template:
                    email_func = send_from_gmail
                else:
                    raise Exception('Email method not implemented!')

                email_func(to_address, subj, body_text)

                # The full result went out via email/CSV; cap the websocket
                # payload at 100 rows.
                if len(rows) > 100:
                    rows = rows[:100]

            e_t = epoch()
            secs = e_t - s_t

            # Add query
            # Persist query stats/history in the local store.
            store.sqlx('queries').add(
                task_id=data_dict['id'],
                database=database,
                sql_text=sql,
                exec_date=s_t,
                duration_sec=secs,
                row_count=len(rows),
                limit_val=limit,
                cached=cache_used,
                sql_md5=hashlib.md5(sql.encode('utf-8')).hexdigest(),
                last_updated=epoch(),
            )

            if sql.strip():
                # Archive the executed SQL with a completion header.
                sql_fpath = '{}/{}.{}.sql'.format(SQL_FOLDER, database,
                                                  data_dict['id'])
                sql_text = '-- Completed @ {} in {} seconds.\n\n{}'.format(
                    now_str(), secs, sql)
                write_file(sql_fpath, sql_text)

            # time.sleep(0.5)
            # Success payload sent back to the parent process.
            data = dict(
                id=data_dict['id'],
                payload_type='query-data',
                database=database,
                rows=rows,
                headers=fields,
                start_ts=s_t,
                end_ts=e_t,
                execute_time=round(secs, 2),
                completed=True,
                cache_used=cache_used,
                options=options,
                pid=worker_pid,
                orig_req=data_dict,
                sid=sid,
            )

        except Exception as E:
            secs = epoch() - s_t
            err_msg_long = get_exception_message()
            err_msg = get_error_str(E)

            worker.log(E)
            # Error payload: same shape as success, but completed=False and
            # an 'error' field instead of data.
            data = dict(id=id,
                        payload_type='query-data',
                        database=database,
                        rows=[],
                        headers=[],
                        execute_time=round(secs, 2),
                        completed=False,
                        error='ERROR:\n' + err_msg,
                        options=options,
                        pid=worker_pid,
                        orig_req=data_dict,
                        sid=sid)

        finally:
            # worker.pipe.send_to_parent(data)
            # Always report back to the parent, success or failure.
            worker.put_parent_q(data)
Example no. 3
    def execute_multi(self,
                      sql,
                      dtype='namedtuple',
                      limit=None,
                      echo=True,
                      query_name='Record',
                      log=log):
        """
    Execute multiple SQL statements separated by ';'. Returns a generator.
    Example:
      for fields, rows in conn.execute_multi(sql):
        print(fields)
        print(len(rows))

    Args:
      sql: one or more ';'-separated SQL statements.
           NOTE(review): the naive split breaks on semicolons inside
           string literals.
      dtype: record type passed through to `select`.
      limit: max rows per statement (None = no explicit limit).
      echo: when True, log a one-line action summary per statement.
      query_name: record name passed through to `select`.
      log: logger callable.

    Yields:
      (fields, rows) for each executed statement.
    """

        self.reconnect(min_tresh=10)

        # Raw cursor is only needed for `exec ...` procedure calls below.
        cursor = self.get_cursor()
        fields = None
        rows = []
        # Maps the statement's leading keyword to the echoed log message.
        message_mapping = {
            'drop ': 'Dropping {}.',
            'truncate ': 'Truncating {}.',
            'select ': 'Selecting {}.',
            'create ': 'Creating {}.',
            'insert ': 'Inserting {}.',
            'alter ': 'Altering {}.',
            'update ': 'Updating {}.',
            'delete ': 'Deleting {}.',
            'exec ': 'Calling Procedure {}.',
            'grant ': 'Granting {}.',
        }

        sqls = sql.split(';')

        for sql in sqls:
            if not sql.strip(): continue

            sql_ = sql.strip().lower()

            # Echo a short "<Action> <OBJECT>." line for recognized verbs.
            for word, message in message_mapping.items():
                if sql_.startswith(word):
                    if echo:
                        log(
                            message.format(' '.join(
                                sql_.splitlines()[0].split()[1:3]).upper()))
                    break

            # Call procedure with callproc
            if sql_.startswith('exec '):
                procedure = sql_[5:].split('(')[0]
                args = sql_[5:].split('(')[1][:-1].replace("'", '').split(',')
                args = [a.strip() for a in args]
                cursor.callproc(procedure, args)
                continue

            try:
                self._fields = []
                rows = self.select(sql,
                                   rec_name=query_name,
                                   dtype=dtype,
                                   limit=limit,
                                   echo=echo,
                                   log=log)
                fields = self._fields

                # A '-- pk_test:' marker on a CREATE TABLE ... AS statement
                # triggers a primary-key uniqueness check on the new table.
                if '-- pk_test:' in sql.lower() and sql_.startswith('create'):
                    sql_lines = sql_.splitlines()
                    regexp = r'create\s+table\s+(\S*)[\sa-zA-Z\d]+ as'
                    table = re.findall(regexp, sql_lines[0])[0]
                    line = [
                        l for l in sql_lines
                        if l.strip().lower().startswith('-- pk_test:')
                    ][0]
                    fields = line.split(':')[-1]
                    self.check_pk(table, fields)

            except Exception:
                message = get_exception_message().lower()

                # Dropping a non-existent table is only a warning.
                if sql_.startswith('drop ') and self.error_msg[
                        'table_not_exist'] in message:
                    log("WARNING: Table already dropped.")
                else:
                    # Bare `raise` keeps the original traceback intact.
                    raise

            if not fields: fields = []

            yield fields, rows
Example no. 4
  def __init__(self,
               app_name=None,
               master=None,
               conf=None,
               spark_home=None,
               restart=False,
               hive_enabled=False,
               config_name=socket.gethostname().lower(),
               prog_handler=None,
               log=log):
    """Initialize (or attach to) a Spark session.

    Args:
      app_name: Spark application name; auto-generated when None.
      master: Spark master URL; overrides conf['spark.master'] when set.
      conf: dict of Spark config overrides; wins over profile/defaults.
      spark_home: SPARK_HOME path; resolved via set_sparkenv when None.
      restart: force-stop any active SparkContext and start a new one.
      hive_enabled: enable the Hive catalog implementation.
      config_name: key into the profile's 'spark-conf-name' overrides
        (NOTE: default is evaluated once at import time).
      prog_handler: progress handler forwarded to init_spark.
      log: logger callable.
    """
    # Fix for mutable-default bug: the previous `conf={}` default was
    # mutated below and shared across all calls of this constructor.
    conf = {} if conf is None else conf

    # restart = True if version != self.version else restart
    if os.getenv('KLOG'): os.system('bash $KLOG')  # kerb login
    spark_home = self.set_sparkenv(spark_home)

    from pyspark import SparkContext, SQLContext, SparkConf
    from pyspark.sql import SparkSession
    active_sc = SparkContext._active_spark_context

    # Reuse an already-active SparkContext when one exists.
    if active_sc:
      log("Active SC ->> " + active_sc.appName)
      sc = active_sc
      spark = SparkSession(sc)
    else:
      sc = None
      spark = None

    if sc and restart:
      log('~Stopping Spark Instance ({})'.format(sc.appName))
      try:
        # Kill the JVM gateway child process so a fresh one can start.
        ps_data = {p.pid: p for p in psutil.process_iter() if p.cmdline()}
        child_pid = ps_data[os.getpid()].children()[0].pid
        if not hive_enabled:
          os.system('kill -9 ' + str(child_pid))
          SparkContext._gateway = None
      except Exception:
        # Best-effort cleanup: log and continue to sc.stop().
        print(get_exception_message())

      sc.stop()
      sc = None

      # sc = sc.getOrCreate()

    # Build the default config from the profile, falling back to a
    # hard-coded local configuration.
    profile = get_profile()
    if profile:
      conf_def = profile['spark-conf']
      if 'spark-conf-name' in profile:
        if config_name in profile['spark-conf-name']:
          # overwrite the default spark-conf
          for key in profile['spark-conf-name'][config_name]:
            conf_def[key] = profile['spark-conf-name'][config_name][key]
    else:
      conf_def = {
        "spark.master": "local[4]",
        "spark.driver.memory": "5g",
        "spark.driver.maxResultSize": "2g",
        "spark.driver.cores": "1",
        "spark.executor.instances": "4",
        "spark.executor.cores": "4",
        "spark.sql.broadcastTimeout": 900,
        # "spark.sql.tungsten.enabled": "true",
        "spark.io.compression.codec": "snappy",
        "spark.rdd.compress": "true",
        "spark.streaming.backpressure.enabled": "true",
        "spark.sql.parquet.compression.codec": "snappy",
      }

    # set extraClassPath
    conf_def["spark.driver.extraClassPath"] = self._get_jar_paths(profile)
    if 'SPARK_CLASSPATH' in os.environ and os.environ['SPARK_CLASSPATH']:
      conf_def["spark.driver.extraClassPath"] = conf_def["spark.driver.extraClassPath"] + ':' + os.environ['SPARK_CLASSPATH']
      # Spark warns/errors if SPARK_CLASSPATH is set alongside extraClassPath.
      del os.environ['SPARK_CLASSPATH']

    if master: conf['spark.master'] = master
    if hive_enabled: conf["spark.sql.catalogImplementation"] = "hive"

    # Caller-provided `conf` entries win over defaults.
    for c in conf_def:
      conf[c] = conf_def[c] if c not in conf else conf[c]

    # Launch Spark Instance
    version = self.get_spark_version(spark_home)

    app_name = app_name if app_name else 'Spark_{}_{}_{}'.format(
      str(version).replace('.', ''), os.getenv('USER'), os.getpid())

    if not sc:
      log('Starting Spark Instance ({}) with version {} / {}'.format(
        app_name, version, conf['spark.master']))
      sc, spark, proc = self.init_spark(app_name, spark_home, hive_enabled, conf, restart, prog_handler)
      self.proc = proc

    self.hive_enabled = hive_enabled
    self.version = version
    self.sc = sc
    self.uiWebUrl = sc.uiWebUrl
    self.local_uiWebUrl = 'http://{}:{}'.format(socket.gethostname(), sc.uiWebUrl.split(':')[-1])
    self.spark = spark
Example no. 5
    def execute(self,
                sql,
                dtype='tuple',
                limit=None,
                echo=True,
                query_name='Record',
                log=log):
        """Execute SQL, return last result.

        Args:
            sql: a single SQL statement (or `exec procedure(args)` call).
            dtype: record type passed through to `query`.
            limit: max rows to return (None = no explicit limit).
            echo: when True, log a one-line action summary.
            query_name: record name passed through to `query`.
            log: logger callable.

        Returns:
            (fields, rows) tuple for the executed statement.
        """
        self.reconnect(min_tresh=10)

        fields = None
        rows = []
        # Maps the statement's leading keyword to the echoed log message.
        message_mapping = {
            'drop ': 'Dropping {}.',
            'truncate ': 'Truncating {}.',
            'select ': 'Selecting {}.',
            'create ': 'Creating {}.',
            'insert ': 'Inserting {}.',
            'alter ': 'Altering {}.',
            'update ': 'Updating {}.',
            'delete ': 'Deleting {}.',
            'exec ': 'Calling Procedure {}.',
            'grant ': 'Granting {}.',
        }

        sql_ = sql.strip().lower()

        # Echo a short "<Action> <OBJECT>." line for recognized verbs.
        for word, message in message_mapping.items():
            if sql_.startswith(word):
                if echo:
                    log(
                        message.format(' '.join(
                            sql_.splitlines()[0].split()[1:3]).upper()))
                break

        # Call procedure with callproc
        if sql_.startswith('exec '):
            procedure = sql_[5:].split('(')[0]
            args = sql_[5:].split('(')[1][:-1].replace("'", '').split(',')
            args = [a.strip() for a in args]
            connection = self.engine.raw_connection()
            try:
                cursor = connection.cursor()
                cursor.callproc(procedure, args)
                self._fields = self._get_cursor_fields(
                    cursor_desc=cursor.description)
                # BUGFIX: previously returned the local `fields` (still None)
                # even though self._fields had just been populated.
                fields = self._fields
                rows = list(cursor.fetchall())
                cursor.close()
                connection.commit()
                return fields, rows
            finally:
                connection.close()

        try:
            self._fields = []
            rows = self.query(sql,
                              rec_name=query_name,
                              dtype=dtype,
                              limit=limit,
                              echo=echo,
                              log=log)
            fields = self._fields

            # A '-- pk_test:' marker on a CREATE TABLE ... AS statement
            # triggers a primary-key uniqueness check on the new table.
            if '-- pk_test:' in sql.lower() and sql_.startswith('create'):
                sql_lines = sql_.splitlines()
                regexp = r'create\s+table\s+(\S*)[\sa-zA-Z\d]+ as'
                table = re.findall(regexp, sql_lines[0])[0]
                line = [
                    l for l in sql_lines
                    if l.strip().lower().startswith('-- pk_test:')
                ][0]
                fields = line.split(':')[-1]
                self.check_pk(table, fields)

        except Exception:
            message = get_exception_message().lower()

            # Dropping a non-existent table is only a warning.
            if sql_.startswith(
                    'drop ') and self.error_msg['table_not_exist'] in message:
                log("WARNING: Table already dropped.")
            else:
                # Bare `raise` keeps the original traceback intact.
                raise

        if not fields: fields = []

        return fields, rows