Example #1
def test_join_match():
    t1 = 'bank.mint_categories'
    t2 = 'bank.mint_categories'
    t1_field = 'category, sub_category'
    t2_field = 'category, sub_category'
    conn = get_conn('PG_XENIAL')
    rows = conn.analyze_join_match(t1, t2, t1_field, t2_field, as_sql=False)
    print(rows)
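get_conn resolves the profile name ('PG_XENIAL' here) from the YAML file referenced by the PROFILE_YAML environment variable, as Example 2 below shows. A minimal hypothetical setup before running this test could look like the following (the path is a placeholder):

import os
from xutil.database.base import get_conn

os.environ['PROFILE_YAML'] = '/path/to/profile.yaml'  # placeholder path to the profiles file
conn = get_conn('PG_XENIAL')  # 'PG_XENIAL' must be defined in that YAML file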
Example #2
    @classmethod
    def setUpClass(cls):
        cls.client = docker.from_env()
        cls.container = cls.client.containers.run(**cls.cntnr_params)
        log('-Waiting for {} to start...'.format(cls.db_name))

        st = now()
        os.environ['PROFILE_YAML'] = get_dir_path(
            __file__) + '/../database/templates/profile.yaml'
        cls.conn = None

        # poll until the database accepts connections, or give up after cls.timeout seconds
        while True:
            time.sleep(1)
            try:
                cls.conn = get_conn(cls.db_name, echo=False)
                break
            except Exception as E:
                if tdelta_seconds(now(), st) > cls.timeout:
                    cls.container.kill()
                    raise E
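The fixture above leaves the database container running; a matching tearDownClass would normally stop it. A minimal sketch, assuming the same cls.container attribute and docker-py client shown above:

    @classmethod
    def tearDownClass(cls):
        # stop and remove the throwaway database container started in setUpClass
        cls.container.kill()
        cls.container.remove()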
Example #3
def ipy_imports(launch_spark=False):
    from xutil.helpers import (log, get_exception_message, get_error_str,
                               get_profile)
    from xutil.database.base import get_conn
    from xutil.diskio import (write_csv, read_csv, read_file, write_file,
                              get_hdfs)
    from jmespath import search
    from pathlib import Path
    from collections import Counter, namedtuple, OrderedDict
    import time, datetime

    if launch_spark:
        import argparse, sys  # needed below for CLI parsing and sys.exit
        from xutil.database.spark import Spark
        parser = argparse.ArgumentParser(description='Spark IPython')
        parser.add_argument('--master',
                            help='Master string for Spark Instance')
        parser.add_argument('--profile',
                            help='Database profile name from PROFILE_YAML')
        args = parser.parse_args()
        dbs = get_profile(create_if_missing=True)['databases']
        if args.profile and args.profile in dbs and dbs[
                args.profile]['type'].lower() in ('hive', 'spark'):
            conn = get_conn(args.profile)
            globals()['sparko'] = conn.sparko
        elif args.profile:
            log(
                Exception('Profile {} not found or incompatible.'.format(
                    args.profile)))
            sys.exit(1)
        else:
            globals()['sparko'] = Spark(master=args.master)
        globals()['sc'] = sparko.sc
        globals()['spark'] = sparko.spark

    ldict = locals()
    for name in ldict:
        var = ldict[name]
        if callable(var) or isinstance(var, __builtins__.__class__):
            # is a function or class or module
            globals()[name] = var
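Note that globals() inside ipy_imports refers to the module where the function is defined, so the helper is meant to be defined (or exec'd) in the namespace you want populated, such as an IPython startup script. A hypothetical call once the function is in scope:

ipy_imports(launch_spark=False)
conn = get_conn('PG_XENIAL')         # now available without an explicit import
rows = conn.select('select 1 as x')  # conn.select is used the same way in Example 5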
Example #4
  def jdbc_write(self,
                 df,
                 db_profile,
                 sql_table,
                 create_table=True,
                 truncate=False,
                 order_by=[],
                 grant_sql=None,
                 post_sql=None,
                 batch_size=20000,
                 partitions=7,
                 tgt_mode='append',
                 log=log,
                 **kwargs):
    "Write data to table"

    count_recs = get_kw('count_recs', False, kwargs)
    counter = df.count() if count_recs else get_kw('tot_cnt', -1, kwargs)


    if db_profile['type'] == 'oracle':
      # trim field names to 30 characters (Oracle identifier limit)
      for col in df.columns:
        df = df.withColumnRenamed(col, col[:30]) if len(col) > 30 else df
      for col in df.columns:
        df = df.withColumnRenamed(col, col.upper().replace(' ', '_'))

      # convert booleans to string
      df = self.convert_fields(df, 'boolean', 'string')

    s_t = datetime.datetime.now()
    conn = get_conn(db_profile['name'])

    if truncate and tgt_mode != 'overwrite':
      create_table = False
      conn.execute('TRUNCATE TABLE ' + sql_table)
      time.sleep(2)

    if tgt_mode == 'overwrite':
      field_types = self.df_to_field_types(df)
      conn.create_table(sql_table, field_types, drop=True)
      if grant_sql:
        conn.execute(grant_sql)
      time.sleep(2)
      tgt_mode = 'append'

    partitions = partitions if order_by else 1
    df.repartition(partitions).write \
      .format("jdbc") \
      .option("url", db_profile['jdbc_url']) \
      .option("dbtable", sql_table) \
      .option("user", db_profile['user']) \
      .option("password", db_profile['password']) \
      .option("batchsize", batch_size) \
      .save(mode=tgt_mode)

    # .option("createTableColumnTypes", create_types_str) \ # 2.2 compatible
    # .option("sessionInitStatement", sessionInitStatement) \ # 2.3 compatible
    # .option("customSchema", customSchema) \ # 2.3 compatible

    # if order_by:
    #   # using many partitions, the data is not ordered
    #   sql = '''
    #   CREATE TABLE {t2} AS SELECT * FROM {t1} ORDER BY {ord};
    #   DROP TABLE {t1} CASCADE CONSTRAINTS PURGE;
    #   ALTER TABLE {t2} RENAME TO {t1n};
    #   '''.format(
    #     t1=sql_table,
    #     # t1n=sql_table.split('.')[-1] if '.' in sql_table else sql_table,
    #     t1n=sql_table.split('.')[-1] if '.' in sql_table else sql_table,
    #     t2=sql_table + 'z',
    #     ord=', '.join(order_by),
    #   )
    #   conn.execute(sql, 'ORDERING', echo=False)

    if post_sql:
      conn.execute(post_sql, 'EXECUTING POST-SQL', echo=False)

    secs = (datetime.datetime.now() - s_t).total_seconds()
    mins = round(secs / 60, 1)
    rate = round(counter / secs, 1)

    log("Inserted {} records into table '{}' in {} mins [{} r/s].".format(
      counter, sql_table, mins, rate))
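jdbc_write reads the keys 'type', 'name', 'jdbc_url', 'user' and 'password' from db_profile. A hypothetical call illustrating the expected shape of that dict (all values are placeholders), using the sparko handle from Example 3:

db_profile = {
  'name': 'PG_XENIAL',                           # must be resolvable by get_conn
  'type': 'postgresql',
  'jdbc_url': 'jdbc:postgresql://host:5432/db',
  'user': 'user',
  'password': 'password',
}
sparko.jdbc_write(df, db_profile, 'public.target_table',
                  tgt_mode='overwrite', batch_size=20000, partitions=7)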
Example #5
  def jdbc_read(self, db_name, sql_table, partition_dict={}, fetchsize=10000, log=log):
    "Read data from table or sub-query SQL"
    db_profile = get_db_profile(db_name)
    null_sql_table = None
    tot_cnt = -1  # updated below once a record count is available

    if partition_dict:
      # This attempts to split the range evenly.
      # This is not optimal when there is no certainty that the range is
      # continuous, such as with an incremented BIGINT primary key.

      if not ('partitionColumn' in partition_dict and 'numPartitions' in partition_dict):
        raise("partition_dict must contain 'partitionColumn' and 'numPartitions'")

      # Automatically obtain lowerBound and upperBound values.
      if not ('lowerBound' in partition_dict and 'upperBound' in partition_dict):
        conn = get_conn(db_name, echo=False)
        sql = conn.template('routine.number_min_max').format(
          field=partition_dict['partitionColumn'],
          table=sql_table,
        )
        log('Getting partitionColumn stats for {}'.format(
          partition_dict['partitionColumn']))
        data = conn.select(sql, echo=False)

        if not len(data):
          raise ValueError('Table "{}" seems empty?'.format(sql_table))

        tot_cnt, field_cnt, min_val, max_val = data[0]
        null_cnt = tot_cnt - field_cnt

        # rows where the partition column is NULL would be missed by the range split
        if null_cnt > 0:
          log('null_cnt ({}) is > 0 for field "{}" in table "{}". Selecting a Null filtered partition!'.format(null_cnt, partition_dict['partitionColumn'], sql_table))

          null_sql_table = "(select * from {} where {} is null)".format(sql_table, partition_dict['partitionColumn'])

          df_null = self.spark.read \
            .format("jdbc") \
            .option("url", db_profile['jdbc_url']) \
            .option("dbtable", null_sql_table) \
            .option("user", db_profile['user']) \
            .option("password", db_profile['password']) \
            .option("fetchsize", fetchsize) \
            .load()

        partition_dict['lowerBound'] = min_val
        partition_dict['upperBound'] = max_val

      df = self.spark.read \
        .format("jdbc") \
        .option("url", db_profile['jdbc_url']) \
        .option("dbtable", sql_table) \
        .option("user", db_profile['user']) \
        .option("password", db_profile['password']) \
        .option("partitionColumn", partition_dict['partitionColumn']) \
        .option("lowerBound", partition_dict['lowerBound']) \
        .option("upperBound", partition_dict['upperBound']) \
        .option("numPartitions", partition_dict['numPartitions']) \
        .option("fetchsize", fetchsize) \
        .load()
    else:
      df = self.spark.read \
        .format("jdbc") \
        .option("url", db_profile['jdbc_url']) \
        .option("dbtable", sql_table) \
        .option("user", db_profile['user']) \
        .option("password", db_profile['password']) \
        .option("fetchsize", fetchsize) \
        .load()
      tot_cnt = df.count()

    if null_sql_table:
      df.registerTempTable("df")
      df_null.registerTempTable("df_null")
      df = self.spark.sql('select * from df union all select * from df_null')

    self._last_df_cnt = tot_cnt
    return df
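partition_dict needs at least 'partitionColumn' and 'numPartitions'; 'lowerBound' and 'upperBound' are computed from a min/max query when omitted. A hypothetical call (the column name is a placeholder), again using the sparko handle from Example 3:

df = sparko.jdbc_read(
  'PG_XENIAL',
  'bank.mint_categories',
  partition_dict={'partitionColumn': 'id', 'numPartitions': 4},
  fetchsize=10000,
)
print(df.count())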
Example #6
    def setUp(self):
        self.conn = get_conn(db_name, echo=False)
        if pre_sql:
            self.conn.execute(pre_sql)
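db_name and pre_sql are free variables in this fragment, so it only runs inside a scope that defines them. A hypothetical module-level setup that would make it work (values are placeholders):

import unittest
from xutil.database.base import get_conn

db_name = 'PG_XENIAL'                              # profile name, as in Example 1
pre_sql = 'TRUNCATE TABLE bank.mint_categories'    # optional setup SQL; may be None

class DatabaseTest(unittest.TestCase):
    def setUp(self):
        self.conn = get_conn(db_name, echo=False)
        if pre_sql:
            self.conn.execute(pre_sql)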