def does_period_have_data(inst, db, ssl, table, start_dt, stop_dt):
    """Report whether `table` has at least one row within a period.

    A ``LIMIT 1`` probe query is run through impala-shell (using envoy).
    If either bound is None no date filter is added, so the probe covers
    the whole table.

    Args:
        inst: value handed to impala-shell's ``-i`` option.
        db: value handed to impala-shell's ``-d`` option.
        ssl: passed to ``tools.format_ssl()`` to build the ssl option text.
        table: name of the table to probe.
        start_dt: ``datetime.datetime`` start of the period, or None.
        stop_dt: ``datetime.datetime`` end of the period, or None.

    Returns:
        True when the command output contains the 'found-data' marker,
        otherwise False.  When the command exits non-zero the command and
        its output are printed and ``tools.abort()`` is called.

    Raises:
        AssertionError: if a bound is neither None nor a datetime.
    """
    assert start_dt is None or isinstance(start_dt, datetime.datetime)
    assert stop_dt is None or isinstance(stop_dt, datetime.datetime)

    # The filter is only applied when both ends of the period are known.
    date_filter = ''
    if start_dt is not None and stop_dt is not None:
        date_filter = tools.get_ymd_filter(tools.dt_to_iso8601(start_dt),
                                           tools.dt_to_iso8601(stop_dt))
    ssl_flag = tools.format_ssl(ssl)

    query_template = """
        SELECT 'found-data'
        FROM {tab} c
        WHERE 1 = 1
        {filter}
        LIMIT 1
    """
    # Collapse all whitespace so the query fits on one shell line.
    query = ' '.join(query_template.format(tab=table, filter=date_filter).split())

    shell_cmd = """ impala-shell -i {inst} -d {db} --quiet -B {ssl} -q "{sql}" | columns | cut -f 1 """.format(
        inst=inst, db=db, ssl=ssl_flag, sql=query)

    result = envoy.run(shell_cmd)
    if result.status_code == 0:
        return 'found-data' in result.std_out.strip()

    # Failure: show what was run and what came back, then bail out.
    print(shell_cmd)
    print(result.std_err)
    print(result.std_out)
    tools.abort("Error: does_period_have_data() failed!")
def get_cmd(inst, db, child_table, child_col, parent_table, parent_col, start_ts, stop_ts, ssl):
    """Build the impala-shell command that counts child rows with no parent.

    The query left-outer-joins the child table to the parent table on the
    given columns and counts the rows for which no parent row matched.

    Args:
        inst: value handed to impala-shell's ``-i`` option.
        db: value handed to impala-shell's ``-d`` option.
        child_table: table whose rows are checked.
        child_col: join column in the child table.
        parent_table: table expected to hold the matching rows.
        parent_col: join column in the parent table.
        start_ts: start of the period passed to ``tools.get_ymd_filter()``,
            or None.
        stop_ts: end of the period passed to ``tools.get_ymd_filter()``,
            or None.
        ssl: passed to ``tools.format_ssl()`` to build the ssl option text.

    Returns:
        A ``(cmd, mode)`` tuple: the shell command string, and
        'incremental' when a non-empty date filter was applied, else 'full'.
    """
    # A date filter is only built when both ends of the period are given.
    have_period = start_ts is not None and stop_ts is not None
    ymd_filter = tools.get_ymd_filter(start_ts, stop_ts) if have_period else ''

    query_template = """ WITH t1 AS ( \
        SELECT c.{c_col} AS child_col, \
               p.{p_col} AS par_col \
        FROM {c_tab} c \
            LEFT OUTER JOIN {p_tab} p \
                ON c.{c_col} = p.{p_col} \
        WHERE p.{p_col} IS NULL \
        {filter} \
        ) \
        SELECT COALESCE(COUNT(*), 0) \
        FROM t1 \
        """
    query = query_template.format(c_col=child_col,
                                  p_col=parent_col,
                                  c_tab=child_table,
                                  p_tab=parent_table,
                                  filter=ymd_filter)
    # Collapse all whitespace so the query fits on one shell line.
    query = ' '.join(query.split())

    ssl_flag = tools.format_ssl(ssl)
    shell_cmd = """ impala-shell -i {inst} -d {db} --quiet -B {ssl} -q "{sql}" """.format(
        inst=inst, db=db, ssl=ssl_flag, sql=query)

    if ymd_filter:
        run_mode = 'incremental'
    else:
        run_mode = 'full'
    return shell_cmd, run_mode
def get_cmd(inst, db, table, cols, ssl):
    """Build the impala-shell command that counts duplicated column values.

    The query groups `table` by `cols` and counts the groups that occur
    more than once.

    Args:
        inst: value handed to impala-shell's ``-i`` option.
        db: value handed to impala-shell's ``-d`` option.
        table: table to inspect.
        cols: text inserted into both the SELECT list and the GROUP BY
            clause.
        ssl: passed to ``tools.format_ssl()`` to build the ssl option text.

    Returns:
        The shell command string.
    """
    query_template = """ WITH t1 AS ( \
        SELECT %s , \
               COUNT(*) AS dup_cnt \
        FROM %s \
        GROUP BY %s \
        HAVING COUNT(*) > 1 \
        ) \
        SELECT COUNT(*) \
        FROM t1 \
        WHERE dup_cnt > 1 \
        """
    query_args = (cols, table, cols)
    # Collapse all whitespace so the query fits on one shell line.
    query = ' '.join((query_template % query_args).split())

    ssl_flag = tools.format_ssl(ssl)
    cmd_template = """ impala-shell -i %s -d %s --quiet -B %s -q "%s" """
    return cmd_template % (inst, db, ssl_flag, query)
def get_first_dt_by_ymd(inst, db, table, ssl):
    """Return the earliest (year, month, day) in `table` as a datetime.

    The query finds the minimum year, then the minimum month within that
    year, then the minimum day within that year and month, and is run
    through impala-shell with comma-delimited output.

    Args:
        inst: value handed to impala-shell's ``-i`` option.
        db: value handed to impala-shell's ``-d`` option.
        table: table to inspect; the query reads its ``year``, ``month``
            and ``day`` columns.
        ssl: passed to ``tools.format_ssl()`` to build the ssl option text.

    Returns:
        A ``datetime.datetime`` at midnight of the first day, or None when
        the command fails, prints nothing, or reports NULL values (which is
        what the MIN() aggregates yield for an empty table).

    Raises:
        AssertionError: if the output does not have exactly three fields.
    """
    sql = """
        with year_tab AS (
            SELECT MIN(year) AS year
            FROM {tab}
        ),
        mon_tab AS (
            SELECT MIN(month) AS month
            FROM {tab} t
                INNER JOIN year_tab yt
                    ON t.year = yt.year
        ),
        day_tab AS (
            SELECT MIN(day) AS day
            FROM {tab} t
                INNER JOIN year_tab yt
                    ON t.year = yt.year
                INNER JOIN mon_tab mt
                    ON t.month = mt.month
        )
        SELECT year, month, day
        FROM year_tab
            CROSS JOIN mon_tab
            CROSS JOIN day_tab
    """.format(tab=table)
    ssl_option = tools.format_ssl(ssl)
    # Collapse all whitespace so the query fits on one shell line.
    sql = ' '.join(sql.split())
    cmd = """ impala-shell -i {inst} -d {db} --quiet --output_delimiter ',' -B {ssl} -q '{sql}' """\
          .format(inst=inst, db=db, sql=sql, ssl=ssl_option)

    try:
        raw = subprocess.check_output(cmd, shell=True)
    except subprocess.CalledProcessError:
        # FIXME: why would this happen?
        return None

    # check_output() returns bytes on Python 3; decode so the text handling
    # below works the same on both major versions.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')

    # strip() instead of chopping the last character: it removes the ending
    # newline without eating a digit when no newline is present.
    stdout = raw.strip()
    if not stdout:
        # no data found
        return None

    fields = [field.strip() for field in stdout.split(',')]
    assert len(fields) == 3, "Invalid fields: %s" % ','.join(fields)

    # The aggregates always produce one row; on an empty table every value
    # is NULL, which impala-shell prints literally.  Treat that as no data
    # rather than failing in int().
    if any(field.upper() == 'NULL' for field in fields):
        return None

    year, month, day = (int(field) for field in fields)
    return datetime.datetime(year, month, day)