def validate_eval(kv, p):
    """Fill a rule template with values and evaluate the resulting expression.

    Args:
        kv (dict): placeholder name as key, and the value substituted for it.
        p (str): ``str.format``-style template that becomes the expression.

    Returns:
        Whatever the evaluated expression yields (a bool for comparison
        expressions).

    Raises:
        DbProfilerException.ValidationError: when the template names a
            placeholder missing from ``kv``, or when the formatted text is
            not a syntactically valid expression.
    """
    try:
        s = p.format(**kv)
    except KeyError:
        detail = "`%s' %s" % (p, kv)
        raise DbProfilerException.ValidationError(
            _("Parameter error: ") + detail, rule=p, params=kv)

    # NOTE: the expression is evaluated with eval(), so both the template and
    # the substituted values must come from a trusted source.
    try:
        outcome = eval(s)
    except SyntaxError:
        detail = "`%s'" % s
        raise DbProfilerException.ValidationError(
            _("Syntax error: ") + detail, rule=p, params=kv)
    return outcome
def validate(self, stats):
    """Check one column-statistics rule against the statistics of a table.

    The rule is ``[column_name, expression]``.  The expression is a
    ``str.format`` template that may use the placeholders ``rows``,
    ``nulls``, ``min``, ``max`` and ``cardinality``.

    Args:
        stats (dict): table statistics holding ``row_count`` and a
            ``columns`` list of per-column dicts.

    Returns:
        False if the expression evaluates to exactly ``False``,
        otherwise True.

    Raises:
        DbProfilerException.ValidationError: when the column is not in
            ``stats``, the expression uses an unknown placeholder, or the
            formatted expression is not valid syntax.
    """
    # rule: [ column_name, expression ]
    assert len(self.rule) == 2
    assert 'columns' in stats

    # Pick the first column entry whose name matches the rule.
    for col in stats['columns']:
        if col['column_name'] == self.rule[0]:
            c = col
            break
    else:
        raise DbProfilerException.ValidationError(
            _("Column `%s' not found. Check your validation rule again.") %
            self.rule[0], self.rule)

    assert 'row_count' in stats
    assert all(name in c for name in ('nulls', 'min', 'max', 'cardinality'))

    kv = dict(rows=stats['row_count'],
              nulls=c['nulls'],
              min=c['min'],
              max=c['max'],
              cardinality=c['cardinality'])

    # statistics[0] counts evaluations, statistics[1] counts failures.
    self.statistics[0] += 1

    try:
        s = self.rule[1].format(**kv)
    except KeyError:
        self.statistics[1] += 1
        raise DbProfilerException.ValidationError(
            _("Parameter error: ") + "`%s'" % kv, self.rule)

    # NOTE: the rule expression is evaluated with eval(); rules must be
    # trusted input.
    try:
        passed = eval(s) is not False
    except SyntaxError:
        self.statistics[1] += 1
        raise DbProfilerException.ValidationError(
            _("Syntax error: ") + "`%s'" % s, self.rule)

    if not passed:
        self.statistics[1] += 1
        return False
    return True
def query_to_resultset(self, query, max_rows=10000):
    """Build a QueryResult object from the query

    Args:
        query (str): a query string to be executed.
        max_rows (int): max rows which can be kept in a QueryResult object.

    Returns:
        QueryResult: an object holding query, column names and result set.

    Raises:
        DbProfilerException.InternalError: when the query returns more
            than ``max_rows`` rows.
        DbProfilerException.DriverError: re-raised unchanged (e.g. from
            ``connect()``).
        DbProfilerException.QueryError: for any other failure while
            executing the query or reading its result.
    """
    assert query
    assert isinstance(query, unicode)
    log.trace('query_to_resultset: start query=%s' % query)

    res = QueryResult(query)
    try:
        if self.conn is None:
            self.connect()

        cur = self.conn.cursor()
        try:
            cur.execute(res.query)

            res.column_names = deepcopy(tuple(d[0] for d in cur.description))

            for i, r in enumerate(cur.fetchall()):
                # let's consider the memory size.
                # i is zero-based, so i == max_rows is already the
                # (max_rows + 1)-th row and exceeds the limit.
                if i >= max_rows:
                    raise DbProfilerException.InternalError(
                        u'Exceeded the record limit (%d) for QueryResult.' %
                        max_rows, query=query)
                # Decimal values are converted to float for the result set.
                res.resultset.append(
                    deepcopy([float(x) if isinstance(x, Decimal) else x
                              for x in r]))
        finally:
            # Release the cursor on the failure paths as well.
            cur.close()
    except DbProfilerException.InternalError:
        raise
    except DbProfilerException.DriverError:
        raise
    except Exception as e:
        # Driver errors usually carry (code, message) in args, but other
        # exceptions (TypeError, AttributeError, ...) do not; never let the
        # message extraction itself fail and hide the original error.
        args = getattr(e, 'args', ())
        detail = args[1] if len(args) > 1 else e
        if not hasattr(detail, 'split'):
            detail = u'%s' % (detail,)
        raise DbProfilerException.QueryError(
            "Could not execute a query: %s" % detail.split('\n')[0],
            query=query, source=e)
    finally:
        # Always end the transaction opened by the query.
        if self.conn:
            self.conn.rollback()
    log.trace('query_to_resultset: end')
    return res
def run_record_validation(self, schema_name, table_name,
                          validation_rules=None, fetch_size=500000):
    """Run the record validation rules against every row of a table.

    Args:
        schema_name (str): schema of the table to be validated.
        table_name (str): name of the table to be validated.
        validation_rules: rules handed to ``DbProfilerValidator``.
        fetch_size (int): passed through to ``_query_record_validation``.

    Returns:
        An empty dict when the validator has no record validators,
        otherwise the validator's ``get_validation_results()``.

    Raises:
        DbProfilerException.InternalError: when no column name is found
            for the table.
    """
    log.trace('run_record_validation: start. %s.%s' %
              (schema_name, table_name))

    validator = DbProfilerValidator.DbProfilerValidator(
        schema_name, table_name, validation_rules=validation_rules)
    if not validator.record_validators:
        log.info(_("Skipping record validation since no validation rule."))
        return {}

    column_names = self.get_column_names(schema_name, table_name)
    if not column_names:
        raise DbProfilerException.InternalError(
            'No column found on the table `%s\'.' % table_name)

    # Every column is selected by its double-quoted name.
    select_list = '","'.join(column_names)
    query = u'SELECT %s "%s" FROM "%s"."%s"' % (
        self.parallel_hint, select_list, schema_name, table_name)

    count, failed = self._query_record_validation(
        query, validator, fetch_size=fetch_size)

    log.trace("run_record_validation: end. "
              "row count %d invalid record %d" % (count, failed))
    return validator.get_validation_results()
def connect(self):
    """Open a connection using the connection string and credentials.

    NOTE(review): the original text of this method was partially destroyed
    by credential masking; the connection-string assembly and the
    ``driver.connect`` call below are reconstructed from the surviving
    fragments -- confirm against version control.

    Returns:
        True once ``self.conn`` holds the new connection.

    Raises:
        DbProfilerException.DriverError: when the driver fails to connect.
    """
    s = self.connstr + " user=" + self.dbuser + " password=" + self.dbpass
    try:
        self.conn = self.driver.connect(s)
    except Exception as e:
        # Only the first line of the driver's message is reported.
        raise DbProfilerException.DriverError(
            u"Could not connect to the server: %s" %
            e.args[0].split('\n')[0], source=e)
    return True
def __init__(self, connstr, dbuser, dbpass):
    """Store the connection settings and load the ``psycopg2`` module.

    Args:
        connstr (str): connection string kept as ``self.connstr``.
        dbuser (str): user name kept as ``self.dbuser``.
        dbpass (str): password kept as ``self.dbpass``.

    Raises:
        DbProfilerException.DriverError: when the driver module cannot be
            imported.
    """
    self.connstr, self.dbuser, self.dbpass = connstr, dbuser, dbpass

    name = "psycopg2"
    try:
        module = __import__(name, fromlist=[''])
    except Exception as e:
        message = u"Could not load the driver module: %s" % name
        raise DbProfilerException.DriverError(message, source=e)
    self.driver = module
def validate(self, column_names, record):
    """Validate one record

    The rule is ``[column name, format]``.  The format is a ``str.format``
    template whose placeholders are column names; each is replaced by the
    field value rendered as a Python literal, then the result is evaluated.

    Args:
        column_names (list): a list of column names associated with each
            field.
        record (list): a record, consisting of list of fields.

    Returns:
        False if the expression evaluates to exactly ``False``,
        otherwise True.

    Raises:
        DbProfilerException.ValidationError: when the format uses an
            unknown column name, or the formatted expression is not valid
            syntax.
    """
    # self.rule: [ column name, format ]
    assert len(self.rule) == 2
    assert len(column_names) == len(record)

    # statistics[0] counts evaluations, statistics[1] counts failures.
    self.statistics[0] += 1

    kv = {}
    for k, v in zip(column_names, record):
        if v is None:
            # Missing value: rendered as the literal None instead of
            # crashing in float().
            kv[k] = None
            continue
        try:
            float(v)
        except ValueError:
            # Non-numeric string: repr() yields a correctly escaped string
            # literal even when the value holds quotes or backslashes.
            kv[k] = repr(v)
        except TypeError:
            # Neither a number nor a string (e.g. a date): compare by its
            # text form.
            kv[k] = repr('%s' % (v,))
        else:
            # Numbers and numeric strings are inserted as they are.
            kv[k] = v

    try:
        s = self.rule[1].format(**kv)
    except KeyError:
        self.statistics[1] += 1
        raise DbProfilerException.ValidationError(
            _("Parameter error: ") + "`%s'" % kv, self.rule)

    # NOTE(security): the expression is evaluated with eval().  Field values
    # are rendered as literals above, but the rule itself must be trusted.
    try:
        if eval(s) is False:
            self.statistics[1] += 1
            return False
    except SyntaxError:
        self.statistics[1] += 1
        raise DbProfilerException.ValidationError(
            _("Syntax error: ") + "`%s'" % s, self.rule)
    return True
def connect(self):
    """Open a connection with the stored host, database and credentials.

    Returns:
        True once ``self.conn`` holds the new connection.

    Raises:
        DbProfilerException.DriverError: when the driver fails to connect.
    """
    try:
        self.conn = self.driver.connect(host=self.host, db=self.dbname,
                                        user=self.dbuser,
                                        passwd=self.dbpass)
    except Exception as e:
        # Driver errors are expected to carry (code, message) in args, but
        # other exceptions may have fewer args or a non-string message.
        # Fall back to the exception itself so the real failure is never
        # replaced by an IndexError/AttributeError raised here.
        args = getattr(e, 'args', ())
        detail = args[1] if len(args) > 1 else e
        if not hasattr(detail, 'split'):
            detail = u'%s' % (detail,)
        raise DbProfilerException.DriverError(
            u"Could not connect to the server: %s" %
            detail.split('\n')[0], source=e)
    return True
def connect(self):
    """Open a connection with the stored host, credentials and database.

    Returns:
        True once ``self.conn`` holds the new connection.

    Raises:
        DbProfilerException.DriverError: when the driver fails to connect.
    """
    try:
        self.conn = self.driver.connect(self.host, self.dbuser,
                                        self.dbpass, self.dbname)
    except Exception as e:
        # The expected error carries ((code, message),) in args.  Read it
        # through ``args`` (exceptions are not subscriptable on Python 3)
        # and fall back to the exception itself for any other shape, so the
        # real failure is not replaced by a TypeError/IndexError.
        args = getattr(e, 'args', ())
        if args and isinstance(args[0], (tuple, list)) and len(args[0]) > 1:
            detail = args[0][1]
        else:
            detail = e
        msg = to_unicode(detail).replace('\n', ' ')
        # Drop the trailing "DB-Lib ..." part of the message.
        msg = re.sub(r'DB-Lib.*', '', msg)
        raise DbProfilerException.DriverError(
            u"Could not connect to the server: %s" % msg, source=e)
    return True
def __init__(self, host, dbname, dbuser, dbpass):
    """Store the connection settings and load the ``pymssql`` module.

    Args:
        host (str): server host kept as ``self.host``.
        dbname (str): database name kept as ``self.dbname``.
        dbuser (str): user name kept as ``self.dbuser``.
        dbpass (str): password kept as ``self.dbpass``.

    Raises:
        DbProfilerException.DriverError: when the driver module cannot be
            imported.
    """
    self.host, self.dbname = host, dbname
    self.dbuser, self.dbpass = dbuser, dbpass

    name = "pymssql"
    try:
        module = __import__(name, fromlist=[''])
    except Exception as e:
        message = u"Could not load the driver module: %s" % name
        raise DbProfilerException.DriverError(message, source=e)
    self.driver = module
def disconnect(self):
    """Close the current connection, if there is one.

    Returns:
        False when no connection is open, True after the connection has
        been closed and ``self.conn`` reset to None.

    Raises:
        DbProfilerException.DriverError: when closing the connection fails
            (``self.conn`` is left untouched in that case).
    """
    connection = self.conn
    if connection is None:
        return False

    try:
        connection.close()
    except Exception as e:
        reason = to_unicode(e)
        raise DbProfilerException.DriverError(
            u"Could not disconnect from the server: %s" % reason, source=e)

    self.conn = None
    return True
def disconnect(self):
    """Close the current connection, if there is one.

    Returns:
        False when no connection is open, True after the connection has
        been closed and ``self.conn`` reset to None.

    Raises:
        DbProfilerException.DriverError: when closing the connection fails
            (``self.conn`` is left untouched in that case); only the first
            line of the underlying message is reported.
    """
    connection = self.conn
    if connection is None:
        return False

    try:
        connection.close()
    except Exception as e:
        first_line = unicode(e).split('\n')[0]
        raise DbProfilerException.DriverError(
            u"Could not disconnect from the server: %s" % first_line,
            source=e)

    self.conn = None
    return True
def validate(self, dbdriver):
    """Run the rule's query and evaluate the rule against its single row.

    Args:
        dbdriver: driver object whose ``q2rs`` method executes
            ``self.query`` and returns an object with ``column_names`` and
            ``resultset``.

    Returns:
        The result of ``validate_eval`` for the expression ``self.rule[2]``
        with the row's column names and values as parameters.

    Raises:
        DbProfilerException.DriverError: when ``dbdriver`` is None.
        DbProfilerException.ValidationError: when the query fails.
    """
    if dbdriver is None:
        raise DbProfilerException.DriverError(
            _("Database driver not found."))

    try:
        res = dbdriver.q2rs(self.query)
    except DbProfilerException.QueryError as e:
        raise DbProfilerException.ValidationError(
            _("SQL error: ") + "`%s'" % self.query, self.label, source=e)

    # The query is expected to produce exactly one row.
    assert res
    assert len(res.column_names) == len(res.resultset[0])
    assert len(res.resultset) == 1

    # Map each column name to its value in that row.
    kv = dict(zip(res.column_names, res.resultset[0]))
    return validate_eval(kv, self.rule[2])
def get_sample_rows(self, schema_name, table_name, rows_limit=10):
    """Fetch a limited number of rows from a table.

    Args:
        schema_name (str): schema of the table.
        table_name (str): name of the table.
        rows_limit (int): value placed in the query's LIMIT clause.

    Returns:
        Whatever ``_query_sample_rows`` returns for the generated query.

    Raises:
        DbProfilerException.InternalError: when no column name is found
            for the table.
    """
    columns = self.get_column_names(schema_name, table_name)
    if len(columns) == 0:
        qualified_name = (schema_name, table_name)
        raise DbProfilerException.InternalError(
            "Could not get column names of the table: %s.%s" %
            qualified_name)

    select_list = ','.join(columns)
    assert select_list

    template = u'SELECT {0} FROM {1}.{2} LIMIT {3}'
    return self._query_sample_rows(
        template.format(select_list, schema_name, table_name, rows_limit))
def __init__(self, host, port, dbname, dbuser, dbpass):
    """Store the connection settings and load the ``cx_Oracle`` module.

    Args:
        host (str): server host kept as ``self.host``.
        port: server port kept as ``self.port``.
        dbname (str): database name kept as ``self.dbname``.
        dbuser (str): user name kept as ``self.dbuser``.
        dbpass (str): password kept as ``self.dbpass``.

    Raises:
        DbProfilerException.DriverError: when the driver module cannot be
            imported.
    """
    self.host, self.port, self.dbname = host, port, dbname
    self.dbuser, self.dbpass = dbuser, dbpass

    name = "cx_Oracle"
    try:
        module = __import__(name, fromlist=[''])
    except Exception as e:
        raise DbProfilerException.DriverError(
            u"Could not load the driver module: %s" % name, source=e)
    self.driver = module
def connect(self):
    """Open a connection, addressing the server by host/port or by name.

    When both ``self.host`` and ``self.port`` are set, the DSN is built
    with the driver's ``makedsn``; otherwise ``self.dbname`` is used as the
    DSN directly.

    Returns:
        True once ``self.conn`` holds the new connection.

    Raises:
        DbProfilerException.DriverError: when building the DSN or
            connecting fails; only the first line of the underlying message
            is reported.
    """
    try:
        if self.host is None or self.port is None:
            # use tns name
            dsn_tns = self.dbname
        else:
            # use host name and port number
            dsn_tns = self.driver.makedsn(self.host, self.port,
                                          self.dbname)
        log.trace("dsn_tns: %s" % dsn_tns)
        self.conn = self.driver.connect(self.dbuser, self.dbpass, dsn_tns)
    except Exception as e:
        first_line = unicode(e).split('\n')[0]
        raise DbProfilerException.DriverError(
            u"Could not connect to the server: %s" % first_line, source=e)
    return True