def open(self):
        """Open a SQLite connection to the repository file.

        When a connection is already held this only logs a message and
        returns.

        Raises:
            DbProfilerException: when the file cannot be checked or opened.
            InternalError: when the repository file does not exist.
        """
        if self._conn:
            log.info(_("The repository file `%s' has already been opened.") %
                     self.filename)
            return

        # sqlite3.connect() creates a missing database file, so existence is
        # checked explicitly before connecting.
        try:
            exists = os.path.exists(self.filename)
        except Exception:
            raise DbProfilerException(
                _("Could not access to the repository file `%s'.") %
                self.filename)
        if not exists:
            raise InternalError(_("The repository file `%s' not found.") %
                                self.filename)

        try:
            self._conn = sqlite3.connect(self.filename)
        except Exception:
            raise DbProfilerException(
                _("Could not read the repository file `%s'.") % self.filename)

        assert self._conn
        log.info(_("The repository file `%s' has been opened.") %
                 self.filename)
Example #2
0
    def validate_sql(self, dbdriver):
        """Run every registered SQL validator through the database driver.

        A validator counts as failed when it raises ValidationError or when
        it returns False; each failure is recorded in the column counter.

        Args:
            dbdriver: driver object handed to each validator; must not be
                None.

        Returns:
            tuple: (number of validators run, number of validators failed).

        Raises:
            DriverError: when `dbdriver` is None.
        """
        if dbdriver is None:
            raise DriverError(u'Database driver not found.')

        total = 0
        failures = 0
        for label in self.sql_validators:
            validator = self.sql_validators[label]
            rule_text = '; '.join(validator.rule)
            log.info(_("Validating with SQL: %s") % rule_text)
            total += 1

            try:
                failed = validator.validate(dbdriver) is False
            except ValidationError as e:
                log.error(_("SQL validation error: %s") % rule_text,
                          detail=e.source.value if e.source else None)
                failed = True

            if failed:
                self._column_counter.incr(validator.rule[0], validator.label)
                failures += 1

        return (total, failures)
 def destroy(self):
     """Remove the repository file from disk if it is present.

     Returns:
         bool: True when no exception occurred (including when the file
         did not exist), False when checking or removing the file failed.
     """
     succeeded = True
     try:
         if os.path.exists(self.filename):
             os.unlink(self.filename)
     except Exception as e:
         log.error(_("Could not destroy the repository."),
                   detail=unicode(e))
         succeeded = False
     else:
         log.info(_("The repository has been destroyed."))
     return succeeded
Example #4
0
def export_file(filename, body):
    """Write `body` to `filename`, encoded as UTF-8.

    Args:
        filename(str): path of the file to create or overwrite.
        body(unicode): text to write.

    Returns:
        bool: True on success, False when an IOError occurred.
    """
    try:
        # The context manager closes the handle even when write() raises,
        # so a failed export does not leak an open file.
        with open(filename, "w") as f:
            f.write(body.encode('utf-8'))
        log.info(_("Generated %s.") % filename)
    except IOError as e:
        log.error(_("Could not generate %s: %s") % (filename, unicode(e)))
        return False
    return True
 def init(self):
     """Create the repository file unless one already exists.

     Returns:
         bool: True when the repository already exists or was created,
         False when any step raised an exception.
     """
     try:
         if os.path.exists(self.filename):
             log.info(_("The repository already exists."))
             created = False
         else:
             self.__init_sqlite3(self.filename)
             created = True
     except Exception as e:
         log.error(_("Could not create the repository."), detail=unicode(e))
         return False
     if created:
         log.info(_("The repository has been initialized."))
     return True
    def create_validation_rule(self, database_name, schema_name, table_name,
                               column_name, description, rule,
                               param='', param2=''):
        """
        Args:
            database_name(str):
            schema_name(str):
            table_name(str):
            column_name(str):
            description(str):
            rule(str):
            param(str):
            param2(str):

        Returns:
            integer when the rule successfully gets created. None when already
            exists.
        """

        r = self.get_validation_rules(database_name, schema_name, table_name,
                                      column_name, description, rule,
                                      param, param2)
        assert len(r) <= 1
        if r:
            log.warning((_("The same validation rule already exists: ") +
                         u"{0},{1},{2},{3},{4},{5},{6},{7}"
                         .format(database_name, schema_name, table_name,
                                 column_name, description, rule,
                                 param, param2)))
            return None

        query = u"""
INSERT INTO validation_rule (id,database_name,schema_name,table_name,
                             column_name,description,rule,param,param2)
  VALUES ((SELECT coalesce(max(id),0)+1 FROM validation_rule),
          '{0}','{1}','{2}','{3}','{4}','{5}','{6}','{7}');
""".format(database_name, schema_name, table_name, column_name, description,
           rule,
           '' if param is None else "%s" % param.replace("'", "''"),
           '' if param2 is None else "%s" % param2.replace("'", "''"))

        log.trace("create_validation_rule: %s" % query.replace('\n', ''))
        id = None
        try:
            cursor = self._conn.cursor()
            cursor.execute(query)
            cursor.execute("SELECT max(id) FROM validation_rule")
            id = cursor.fetchone()[0]
            self._conn.commit()
        except Exception as e:
            raise InternalError(_("Could not register validation rule: "),
                                query=query, source=e)
        return id
Example #7
0
    def connect(self):
        """Connect to the database through the driver if not yet connected.

        On a driver failure the error is logged and the process exits with
        status 1.

        Returns:
            bool: always True.
        """
        if self.dbconn is not None:
            return True

        log.info(_("Connecting the database."))
        try:
            self.dbdriver.connect()
        except DbProfilerException as e:
            log.error(_("Could not connect to the database."),
                      detail=e.source)
            log.error(_("Abort."))
            sys.exit(1)

        self.dbconn = self.dbdriver.conn
        log.info(_("Connected to the database."))
        return True
Example #8
0
    def add_rule(self, id_, database_name, schema_name, table_name,
                 column_name, description, rule, param, param2=None):
        """Register one validation rule if it targets this validator's table.

        Args:
            id_(int): rule identifier; also used as the rule label.
            database_name(str): not used by this method.
            schema_name(str): schema the rule applies to.
            table_name(str): table the rule applies to.
            column_name(str): column name(s) passed to the rule handler.
            description(str): stored in self.descriptions under the label.
            rule(str): one of 'regexp', 'eval', 'columnstat' or 'sql'.
            param(str): first rule parameter; must be truthy.
            param2(str): second rule parameter; must be truthy for 'sql'.

        Returns:
            bool: False when the rule belongs to another schema/table, True
            once the rule has been registered.

        Raises:
            InternalError: when `rule` is not a supported rule type.
        """
        assert isinstance(id_, int)

        same_table = (self.schema_name == schema_name and
                      self.table_name == table_name)
        if not same_table:
            return False

        label = id_
        log.debug("add_rule: label = %s" % label)

        assert param
        if rule in ('regexp', 'eval', 'columnstat'):
            # These handlers share one signature and are named after the
            # rule type, so the method is looked up by name.
            handler = getattr(self, 'add_rule_' + rule)
            handler(label, column_name, param)
        elif rule == 'sql':
            assert param2
            self.add_rule_sql(label, column_name, param, param2)
        else:
            raise InternalError(_("Unsupported validation rule: %s") % rule)

        self.descriptions[label] = description
        return True
    def get_table_list(self, database_name=None, schema_name=None,
                       table_name=None):
        table_list = []

        cond = []
        if database_name:
            cond.append("database_name = '%s'" % database_name)
        if schema_name:
            cond.append("schema_name = '%s'" % schema_name)
        if table_name:
            cond.append("table_name = '%s'" % table_name)
        where = "WHERE (%s)" % " AND ".join(cond) if cond else ''

        query = """
SELECT DISTINCT database_name, schema_name, table_name
  FROM repo
{0}
 ORDER BY database_name, schema_name, table_name
""".format(where)

        log.trace("get_table_list: query = %s" % query)

        try:
            cursor = self._conn.cursor()
            for r in cursor.execute(query):
                table_list.append([r[0], r[1], r[2]])
        except Exception as e:
            log.error(_("Could not get data."), detail=unicode(e))
            return None

        return table_list
    def get_datamap_items(self, database_name, schema_name, table_name,
                          column_name=None):
        """Get one or more datamap entries from the repository

        Args:
          database_name (str):
          schema_name_name (str):
          table_name (str):
          column_name (str):

        Returns:
          list: a list which consists of one or more datamap entries.
        """
        assert database_name and schema_name and table_name

        query = u"""
SELECT data FROM datamapping
WHERE database_name = '%s' AND schema_name = '%s' AND table_name = '%s'
""" % (database_name, schema_name, table_name)
        if column_name:
            query = query + u" AND column_name = '%s'" % column_name
        query = query + u"ORDER BY lineno"

        datamap = []
        try:
            cursor = self._conn.cursor()
            for r in cursor.execute(query):
                datamap.append(json.loads(r[0]))
        except Exception as e:
            raise InternalError(_("Could not get data."), query=query)

        return datamap
Example #11
0
def export_json(repo, tables=None, output_path='./json'):
    """Dump the given tables from the repository into EXPORT.JSON.

    On an IOError the error is logged and the process exits with status 1.

    Args:
        repo: repository object providing get_table(database, schema, table).
        tables(list): sequences whose first three items are the database,
            schema and table names. Defaults to no tables.
        output_path(str): directory that receives the EXPORT.JSON file.
    """
    tables = [] if tables is None else tables
    try:
        # NOTE(review): the file is opened in append mode, so exporting twice
        # into the same directory leaves two JSON documents in one file --
        # confirm whether that is intended.
        # The context manager closes the file even when fetching or
        # serializing a table raises.
        with open(output_path + "/EXPORT.JSON", "a") as f:
            json_data = [repo.get_table(tab[0], tab[1], tab[2])
                         for tab in tables]
            f.write(json.dumps(json_data, indent=2).encode('utf-8'))
        log.info(_("Generated JSON file."))
    except IOError:
        log.error(_("Could not generate JSON file."))
        sys.exit(1)
    def get_validation_rule(self, id):
        """Fetch a single validation rule by its identifier.

        Args:
            id(integer): identifier of the rule.

        Returns:
            tuple: (id,database_name,schema_name,table_name,column_name,
                    description,rule,param,param2) or None when no rule has
                    that identifier.

        Raises:
            InternalError: when the query fails.
        """
        columns = (u"id,database_name,schema_name,table_name,column_name,"
                   u"description,rule,param,param2")
        query = (u"SELECT %s FROM validation_rule WHERE id = %d" %
                 (columns, id))

        log.trace("get_validation_rule: %s" % query.replace('\n', ''))
        try:
            cursor = self._conn.cursor()
            cursor.execute(query)
            row = cursor.fetchone()
            return tuple(row) if row else None
        except Exception as e:
            raise InternalError(_("Could not get validation rule: "),
                                query=query, source=e)
Example #13
0
    def validate_table(self, table_data):
        """Run every column-statistics validator against `table_data`.

        A validator counts as failed when it raises ValidationError or when
        it returns False; each failure is recorded in the column counter.

        Args:
            table_data: table record handed to each validator.

        Returns:
            tuple: (number of validators run, number of validators failed).
        """
        total = 0
        failures = 0

        for label in self.statistics_validators:
            validator = self.statistics_validators[label]
            log.info(_("Validating column statistics: %s") %
                     '; '.join(validator.rule))
            total += 1

            try:
                passed = validator.validate(table_data) is not False
            except ValidationError as e:
                log.error(u'%s' % e.value)
                passed = False

            details = (validator.label, unicode(validator.rule),
                       validator.column_names)
            if passed:
                log.trace("VALIDATION OK: %s %s %s" % details)
                continue

            log.trace("VALIDATION FAILED: %s %s %s" % details)
            self._column_counter.incr(validator.rule[0], validator.label)
            failures += 1

        return (total, failures)
Example #14
0
def create_popover_content(term):
    """Build the HTML fragment for a glossary term's popover.

    Args:
        term(dict): glossary term. 'term' and 'description_short' are
            read directly; 'synonyms', 'related_terms' and
            'assigned_assets' are optional lists of strings.

    Returns:
        The short description, one labelled line per non-empty optional
        list, and a link to the term's anchor in glossary.html.
    """
    assert isinstance(term, dict)

    def joined(key):
        # Comma-separated items of an optional list; '' when absent/empty.
        return ', '.join(term.get(key, []))

    synonyms = joined('synonyms')
    if synonyms:
        heading = u'<br/>%s: ' % _("Synonym")
        synonyms = heading + synonyms

    related = joined('related_terms')
    if related:
        heading = u'<br/>%s: ' % _("Related Terms")
        related = heading + related

    assets = joined('assigned_assets')
    if assets:
        heading = u'<br/>%s: ' % _("Assigned Assets")
        assets = heading + assets

    template = (u"{1}<br/>{2}{3}{4}<div align=right><a href='glossary.html#{0}'"
                u" target='_glossary'>{5}</a></div>")
    return template.format(term['term'], term['description_short'],
                           synonyms, related, assets, _("Details..."))
    def append_table(self, tab):
        """
        Update a table record if the same record (with same timestamp)
        already exist.
        Otherwise, append the table record to the repository.

        Args:
            tab: a dictionary of table record.

        Returns:
            True on success, otherwise False.
        """
        assert (tab['database_name'] and tab['schema_name'] and
                tab['table_name'] and tab['timestamp'])

        query = None

        log.trace("append_table: start %s.%s.%s" %
                  (tab['database_name'], tab['schema_name'],
                   tab['table_name']))

        try:
            if self.has_table_record(tab):
                query = """
UPDATE repo
   SET data = '%s'
 WHERE database_name = '{database_name}'
   AND schema_name = '{schema_name}'
   AND table_name = '{table_name}'
   AND created_at = datetime('{timestamp}')
""".format(**tab) % DbProfilerFormatter.jsonize(tab).replace("'", "''")
            else:
                query = """
INSERT INTO repo VALUES ('{database_name}','{schema_name}','{table_name}',
                         datetime('{timestamp}'), '%s')
""".format(**tab) % DbProfilerFormatter.jsonize(tab).replace("'", "''")
                log.trace("append_table: INSERT")

            log.debug("append_table: query = %s" % query)

            assert self._conn
            cursor = self._conn.cursor()

            assert cursor
            cursor.execute(query)
            self._conn.commit()
        except Exception as e:
            raise InternalError(_("Could not register table data: "),
                                query=query, source=e)

        # Remove all tag id/label pairs to replace with new ones.
        tagid = "%s.%s.%s" % (tab['database_name'], tab['schema_name'],
                              tab['table_name'])
        self.delete_tag_id(tagid)
        if tab.get('tags'):
            for label in tab['tags']:
                self.put_tag(tagid, label)

        log.trace("append_table: end")
        return True
    def get_tags(self):
        """Get a list of tag names and number of tags associated with tables.

        Returns:
            list: a list of lists: [[tag,num of tables], ...]
        """
        log.trace("get_tags: start")

        query = """
SELECT tag_label,
       COUNT(*)
  FROM tags
 WHERE tag_label <> ''
 GROUP BY
       tag_label
 ORDER BY
       COUNT(*) DESC
"""

        tags = []
        try:
            cursor = self._conn.cursor()
            log.debug("get_tags: query = %s" % query)
            for r in cursor.execute(query):
                tags.append([r[0], r[1]])
        except Exception as e:
            log.trace("get_tags: " + unicode(e))
            raise InternalError(_("Could not get tag info: "),
                                query=query, source=e)

        log.trace("get_tags: end")
        return tags
    def get(self):
        """Load every record stored in the repo table.

        Returns:
            list: the JSON documents decoded from the fifth column of each
            row, or None when reading or decoding failed.
        """
        try:
            cursor = self._conn.cursor()
            records = [json.loads(unicode(row[4]))
                       for row in cursor.execute("SELECT * FROM repo")]

            log.info(_("Retrieved all data from the repository `%s'.") %
                     self.filename)
        except Exception as e:
            log.error(_("Could not retreive from the repository `%s'") %
                      self.filename, detail=unicode(e))
            return None
        return records
Example #18
0
    def run_postscan_validation(self, schema_name, table_name, tablemeta,
                                columnmeta, table_data, validation_rules):
        """Run column-statistics and SQL validation on a scanned table.

        Args:
            schema_name: not used; the schema is read from `table_data`.
            table_name: not used; the table is read from `table_data`.
            tablemeta: not used by this method.
            columnmeta: not used by this method.
            table_data(dict): table record; updated through the validator.
            validation_rules: rules given to the validator; when empty the
                record is returned untouched.

        Returns:
            The `table_data` object that was passed in.
        """
        if not validation_rules:
            return table_data

        validator = DbProfilerValidator.DbProfilerValidator(
            table_data['schema_name'], table_data['table_name'],
            self, validation_rules)

        log.info(_("Column statistics validation: start"))
        stats_run, stats_failed = validator.validate_table(table_data)
        log.info(_("Column statistics validation: end (%d)") % stats_run)

        log.info(_("SQL validation: start"))
        sql_run, sql_failed = validator.validate_sql(self.dbdriver)
        log.info(_("SQL validation: end (%d)") % sql_run)

        validator.update_table_data(table_data)
        return table_data
Example #19
0
    def _run_record_validation(self, schema_name, table_name, tablemeta,
                               columnmeta, validation_rules,
                               skip_record_validation):
        """Run record validation and attach the results to the column meta.

        Nothing is validated when `skip_record_validation` is set or when
        there are no validation rules.

        Args:
            schema_name: schema of the table to validate.
            table_name: name of the table to validate.
            tablemeta: object whose `column_names` lists the columns.
            columnmeta: mapping of column name to column metadata; the
                `validation` attribute of validated columns is set.
            validation_rules: rules passed to run_record_validation().
            skip_record_validation: truthy to skip the validation.
        """
        log.info(_("Record validation: start"))
        if skip_record_validation:
            log.info(_("Record validation: skipping"))
            return
        if not validation_rules:
            log.info(_("Record validation: no validation rule"))
            return

        results = self.run_record_validation(schema_name, table_name,
                                             validation_rules)
        assert isinstance(results, dict)

        for column in tablemeta.column_names:
            if not results or column not in results:
                continue
            columnmeta[column].validation = results[column]
        log.info(_("Record validation: end"))
 def get_tag_labels(self, tag_id):
     """Get the labels attached to a tag id.

     Args:
         tag_id(str): tag identifier to look up.

     Returns:
         list: the tag_label values stored for `tag_id`; empty when none.

     Raises:
         InternalError: when the query fails.
     """
     # Built before the try block so the error handler can always refer to
     # it; the value is bound as a parameter so quotes in an identifier are
     # matched literally.
     query = u"SELECT tag_label FROM tags WHERE tag_id = ?"
     try:
         cursor = self._conn.cursor()
         labels = [r[0] for r in cursor.execute(query, (tag_id,))]
     except Exception as e:
         raise InternalError(_("Could not get tag labels: "),
                             query=query, source=e)
     return labels
 def delete_textelement(self, id_):
     """Delete every text element stored under an identifier.

     Args:
         id_(str): identifier whose text elements are removed.

     Returns:
         bool: True when the statement was executed and committed.

     Raises:
         InternalError: when the deletion fails.
     """
     log.trace('delete_textelement: start')
     # Built before the try block so the error handler can always refer to
     # it; the identifier is bound as a parameter, not pasted into the SQL.
     query = u"DELETE FROM textelement WHERE id_= ?"
     try:
         cursor = self._conn.cursor()
         cursor.execute(query, (id_,))
         self._conn.commit()
     except Exception as e:
         raise InternalError(_("Could not delete text element: "),
                             query=query, source=e)
     log.trace('delete_textelement: end')
     return True
 def delete_tag_id(self, tag_id):
     """Delete every tag row stored under a tag id.

     Args:
         tag_id(str): tag identifier whose rows are removed.

     Returns:
         bool: True when the statement was executed and committed.

     Raises:
         InternalError: when the deletion fails.
     """
     log.trace('delete_tag_id: start %s' % tag_id)
     # Built before the try block so the error handler can always refer to
     # it; the identifier is bound as a parameter, not pasted into the SQL.
     query = u"DELETE FROM tags WHERE tag_id = ?"
     try:
         cursor = self._conn.cursor()
         cursor.execute(query, (tag_id,))
         self._conn.commit()
     except Exception as e:
         raise InternalError(_("Could not delete tag id: "),
                             query=query, source=e)
     log.trace('delete_tag_id: end')
     return True
 def put_textelement(self, id_, text):
     """Store a text element under an identifier.

     Args:
         id_(str): identifier the text belongs to.
         text(str): text to store; a falsy value is stored as ''.

     Returns:
         bool: True when the row was inserted and committed.

     Raises:
         InternalError: when the insertion fails.
     """
     log.trace('put_textelement: start')
     # Built before the try block so the error handler can always refer to
     # it. Both values are bound as parameters so free text containing
     # quotes is stored verbatim.
     query = u"INSERT INTO textelement VALUES (?, ?)"
     try:
         cursor = self._conn.cursor()
         cursor.execute(query, (id_, text if text else ''))
         self._conn.commit()
     except Exception as e:
         raise InternalError(_("Could not register text element: "),
                             query=query, source=e)
     log.trace('put_textelement: end')
     return True
 def get_textelements(self, id_):
     """Get the text elements stored under an identifier.

     Args:
         id_(str): identifier to look up.

     Returns:
         list: the text_ values stored for `id_`; empty when none.

     Raises:
         InternalError: when the query fails.
     """
     log.trace('get_textelements: start')
     # Built before the try block so the error handler can always refer to
     # it; the identifier is bound as a parameter, not pasted into the SQL.
     query = u"SELECT text_ FROM textelement WHERE id_= ?"
     try:
         cursor = self._conn.cursor()
         texts = [r[0] for r in cursor.execute(query, (id_,))]
     except Exception as e:
         raise InternalError(_("Could not get text element: "),
                             query=query, source=e)
     log.trace('get_textelements: end')
     return texts
    def set(self, data):
        """Replace the contents of the repo table with `data`.

        Args:
            data: iterable of table records; each one is passed to
                append_table() after the table has been emptied.

        Returns:
            bool: False when emptying the table failed, otherwise True.
        """
        try:
            cur = self._conn.cursor()
            cur.execute("DELETE FROM repo")
            self._conn.commit()
        except Exception as e:
            log.error(_("Could not initialize the repository."),
                      detail=unicode(e))
            return False
        else:
            for record in data:
                self.append_table(record)
            return True
Example #26
0
    def verify(self):
        """Check the validation results of every table in the repository.

        Returns:
            bool: True when at least one invalid result was found,
            otherwise False.
        """
        repo = DbProfilerRepository.DbProfilerRepository(self.repofile)
        repo.open()

        # The repository is closed even when reading or verifying a table
        # raises.
        try:
            log.info(_("Verifying the validation results."))

            # NOTE(review): get_table_list() appears able to return None on
            # failure; that case is not handled here -- confirm the intended
            # behaviour.
            table_list = repo.get_table_list()
            valid = 0
            invalid = 0
            for t in table_list:
                table = repo.get_table(t[0], t[1], t[2])
                v, i = verify_table(table)
                valid += v
                invalid += i

            if invalid == 0:
                log.info(
                    _("No invalid results: %d/%d") %
                    (invalid, valid + invalid))
            else:
                log.info(_("Invalid results: %d/%d") %
                         (invalid, valid + invalid))
        finally:
            repo.close()

        return invalid > 0
    def get_files(self, objtype, objid):
        """Look up the file names attached to an object.

        The names are stored as text elements under the key
        '<objtype>:<objid>'.

        Args:
            objtype(str): object type ['tag','schema','table']
            objid(str): object identifier

        Returns:
            list: a list of file names.

        Raises:
            InternalError: when `objtype` is not one of the allowed types.
        """
        allowed = ('tag', 'schema', 'table')
        if objtype in allowed:
            key = u'%s:%s' % (objtype, objid)
            return self.get_textelements(key)
        raise InternalError(_('invalid object type: %s') % objtype)
    def delete_files(self, objtype, objid):
        """Drop the file names attached to an object.

        The names are stored as text elements under the key
        '<objtype>:<objid>'.

        Args:
            objtype(str): object type ['tag','schema','table']
            objid(str): object identifier

        Returns:
            bool: True if succeeded.

        Raises:
            InternalError: when `objtype` is not one of the allowed types.
        """
        allowed = ('tag', 'schema', 'table')
        if objtype in allowed:
            key = u'%s:%s' % (objtype, objid)
            return self.delete_textelement(key)
        raise InternalError(_('invalid object type: %s') % objtype)
    def update_validation_rule(self, id, database_name, schema_name,
                               table_name, column_name, description,
                               rule, param=None, param2=None):
        """
        Args:
            id(integer):
            database_name(str):
            schema_name(str):
            table_name(str):
            column_name(str):
            description(str):
            rule(str):
            param(str):
            param2(str):

        Returns:
            True when the rule successfully gets updated, otherwise False.
        """

        query = u"""
UPDATE validation_rule
   SET database_name = '{0}',
       schema_name = '{1}',
       table_name = '{2}',
       column_name = '{3}',
       description = '{4}',
       rule = '{5}',
       param = '{6}',
       param2 = '{7}'
 WHERE id = {8}
""".format(database_name, schema_name, table_name, column_name, description,
           rule,
           '' if param is None else "%s" % param,
           '' if param2 is None else "%s" % param2, id)

        log.trace("update_validation_rule: %s" % query.replace('\n', ''))
        rowcount = 0
        try:
            cursor = self._conn.cursor()
            cursor.execute(query)
            rowcount = cursor.rowcount
            self._conn.commit()
        except Exception as e:
            raise InternalError(_("Could not update validation rule: "),
                                query=query, source=e)
        if rowcount == 0:
            return False
        return True
 def get_bg_terms_all(self):
     """List the latest terms of the business glossary.

     Returns:
         list: a list of terms in the business glossary, longest terms
         first and alphabetically within the same length.

     Raises:
         InternalError: when the query fails.
     """
     query = (u"SELECT term FROM business_glossary WHERE is_latest = 1 "
              u"ORDER BY length(term) desc,term")
     try:
         cur = self._conn.cursor()
         cur.execute(query)
         terms = [row[0] for row in cur.fetchall()]
     except Exception as e:
         raise InternalError(_("Could not get a list of business terms: "),
                             query=query, source=e)
     return terms