Example #1
    @staticmethod
    def initFromCatalog(dbURL, utility=False, useAllSegmentFileSpaces=False):
        """
        Factory method, initializes a HAWQArray from provided database URL

        Please note that -
        useAllSegmentFilespaces when set to true makes this method add *all* filespaces
        to the segments of hawqarray. If false, only returns Master/Standby all filespaces
        This is *hacky* and we know that it is not the right way to design methods/interfaces
        We are doing this so that we do not affect behavior of existing tools like upgrade, gprecoverseg etc
        """

        conn = dbconn.connect(dbURL, utility)

        # Get the version from the database:
        version_str = None
        for row in dbconn.execSQL(conn, "SELECT version()"):
            version_str = row[0]
        version = GpVersion(version_str)

        # Currently only HAWQ 2.x is supported
        hawq_major_version = version.getVersionRelease().split('.')[0]
        if hawq_major_version == '2':

            hawq_site = HawqXMLParser(GPHOME)
            master_data_directory = hawq_site.get_value_from_name(
                'hawq_master_directory')
            segment_data_directory = hawq_site.get_value_from_name(
                'hawq_segment_directory')

            config_rows = dbconn.execSQL(
                conn, '''
                               SELECT sc.registration_order,
                                      sc.role,
                                      sc.status,
                                      sc.hostname,
                                      sc.address,
                                      sc.port,
                                      CASE
                                          WHEN sc.registration_order <= 0 THEN '%s'
                                          ELSE '%s'
                                      END AS datadir
                               FROM pg_catalog.gp_segment_configuration sc
                               ORDER BY sc.registration_order;''' %
                (master_data_directory, segment_data_directory))

            # All filesystems are shared storage
            filesystemRows = dbconn.execSQL(
                conn, '''
                SELECT oid, fsysname, true AS fsysshared
                FROM pg_filesystem
                ORDER BY fsysname
            ''')

            filesystemArr = [
                HAWQFilesystemObj(fsysRow[0], fsysRow[1], fsysRow[2])
                for fsysRow in filesystemRows
            ]

            filespaceRows = dbconn.execSQL(
                conn, '''
                SELECT oid, fsname, fsfsys AS fsoid
                FROM pg_filespace
                WHERE oid != %d
                ORDER BY fsname;
            ''' % (SYSTEM_FILESPACE))

            filespaceArr = [
                HAWQFilespaceObj(
                    fsRow[0], fsRow[1],
                    HAWQFilesystemObj.getFilesystemObj(filesystemArr,
                                                       fsRow[2]))
                for fsRow in filespaceRows
            ]

        else:
            raise Exception("HAWQ version is invalid: %s" % version)

        hawqdbs = []
        print "### initFromCatalog ###"
        hdb = None
        for row in config_rows:

            print row

            # Extract fields from the row
            (registration_order, role, status, hostname, address, port,
             datadir) = row

            # In GPSQL, only the master maintains the filespace information.
            # if registration_order != MASTER_REGISTRATION_ORDER and \
            #    fsoid != SYSTEM_FILESPACE and \
            #    not useAllSegmentFileSpaces:
            #    print "### initFromCatalog ... continue ###"
            #    continue

            # The query returns all the filespaces for a segment on separate
            # rows.  If this row is the same dbid as the previous row simply
            # add this filespace to the existing list, otherwise create a
            # new segment.
            # if seg and seg.getSegmentRegistrationOrder() == registration_order:
            #     seg.addSegmentFilespace(fsoid, fslocation)
            # else:
            #     seg = HAWQDB(registration_order, role, status,
            #                 hostname, address, port, datadir)
            #    segments.append(seg)

            hdb = HAWQDB(registration_order, role, status, hostname, address,
                         port, datadir)
            print "### initFromCatalog ... hdb ###"
            print hdb
            hawqdbs.append(hdb)
            print "### initFromCatalog ... hawqdbs ###"
            print hawqdbs

        conn.close()

        # origSegments = [seg.copy() for seg in segments]

        array = HAWQArray(hawqdbs)
        array.version = version
        array.setFilespaces(filespaceArr)
        array.setFilesystem(filesystemArr)

        return array
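
A minimal usage sketch for the factory above (hypothetical: the host and port
are placeholders, and the `gppylib.db.dbconn` import path is an assumption
based on gppylib convention; `DbURL` is the helper for building the database
URL the method expects):

    from gppylib.db import dbconn  # assumed import path

    url = dbconn.DbURL(hostname='localhost', port=5432)  # placeholder values
    array = HAWQArray.initFromCatalog(url, utility=True)
    print(array.version)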
Example #2
    @staticmethod
    def initFromCatalog(dbURL, utility=False, useAllSegmentFileSpaces=False):
        """
        Factory method, initializes a HAWQArray from provided database URL

        Please note that -
        useAllSegmentFilespaces when set to true makes this method add *all* filespaces
        to the segments of hawqarray. If false, only returns Master/Standby all filespaces
        This is *hacky* and we know that it is not the right way to design methods/interfaces
        We are doing this so that we do not affect behavior of existing tools like upgrade, gprecoverseg etc
        """

        conn = dbconn.connect(dbURL, utility)

        # Get the version from the database:
        version_str = None
        for row in dbconn.execSQL(conn, "SELECT version()"):
            version_str = row[0]
        version = GpVersion(version_str)

        # Only for HAWQ 2.0. Note the trailing comma: it makes the right-hand
        # side a tuple; without it, "in" would perform a substring test
        # against a plain string.
        if version.getVersionRelease() in ("2.0",):

            hawq_site = HawqXMLParser(GPHOME)
            master_data_directory  = hawq_site.get_value_from_name('hawq_master_directory')
            segment_data_directory = hawq_site.get_value_from_name('hawq_segment_directory')

            config_rows = dbconn.execSQL(conn, '''
                               SELECT sc.registration_order,
                                      sc.role,
                                      sc.status,
                                      sc.hostname,
                                      sc.address,
                                      sc.port,
                                      CASE
                                          WHEN sc.registration_order <= 0 THEN '%s'
                                          ELSE '%s'
                                      END AS datadir
                               FROM pg_catalog.gp_segment_configuration sc
                               ORDER BY sc.registration_order;''' %
                               (master_data_directory, segment_data_directory))

            # All filesystems are shared storage
            filesystemRows = dbconn.execSQL(conn, '''
                SELECT oid, fsysname, true AS fsysshared
                FROM pg_filesystem
                ORDER BY fsysname
            ''')

            filesystemArr = [HAWQFilesystemObj(fsysRow[0], fsysRow[1], fsysRow[2]) for fsysRow in filesystemRows]

            filespaceRows = dbconn.execSQL(conn, '''
                SELECT oid, fsname, fsfsys AS fsoid
                FROM pg_filespace
                WHERE oid != %d
                ORDER BY fsname;
            ''' % (SYSTEM_FILESPACE))

            filespaceArr = [HAWQFilespaceObj(fsRow[0], fsRow[1], HAWQFilesystemObj.getFilesystemObj(filesystemArr, fsRow[2])) for fsRow in filespaceRows]

        else:
            raise Exception("HAWQ version is invalid: %s" % version)

        hawqdbs = []
        print "### initFromCatalog ###"
        hdb = None
        for row in config_rows:

            print row

            # Extract fields from the row
            (registration_order, role, status, hostname, 
             address, port, datadir) = row

            # In GPSQL, only the master maintains the filespace information.
            # if registration_order != MASTER_REGISTRATION_ORDER and \
            #    fsoid != SYSTEM_FILESPACE and \
            #    not useAllSegmentFileSpaces:
            #    print "### initFromCatalog ... continue ###"
            #    continue

            # The query returns all the filespaces for a segment on separate
            # rows.  If this row is the same dbid as the previous row simply
            # add this filespace to the existing list, otherwise create a
            # new segment.
            # if seg and seg.getSegmentRegistrationOrder() == registration_order:
            #     seg.addSegmentFilespace(fsoid, fslocation)
            # else:
            #     seg = HAWQDB(registration_order, role, status, 
            #                 hostname, address, port, datadir)
            #    segments.append(seg)

            hdb = HAWQDB(registration_order, role, status, 
                         hostname, address, port, datadir)
            print "### initFromCatalog ... hdb ###"
            print hdb
            hawqdbs.append(hdb)
            print "### initFromCatalog ... hawqdbs ###"
            print hawqdbs
        
        conn.close()
        
        # origSegments = [seg.copy() for seg in segments]
        
        array = HAWQArray(hawqdbs)
        array.version = version
        array.setFilespaces(filespaceArr)
        array.setFilesystem(filesystemArr)
        
        return array
Example #3
class GPCatalog():
    """
    Catalog is a container class that contains a dictionary of CatalogTable
    objects.

    It provides the CatalogTables with a context that they can use to
    refer to other CatalogTables (e.g. describe foreign keys) and it
    provides calling code with a simple wrapper for what a known catalog
    layout looks like.

    It supports multiple source versions of the database.  It issues a
    warning if there are catalog tables defined in the database that
    it is unaware of, usually indicating that it is operating against
    an unknown version.
    """

    # --------------------------------------------------------------------
    # Public API functions:
    #   - GPCatalog()            - Create a GPCatalog object
    #   - getCatalogTable()      - Returns a single CatalogTable
    #   - getCatalogTables()     - Returns a list of CatalogTable
    #   - getCatalogVersion()    - Returns a GpVersion
    # --------------------------------------------------------------------
    def getCatalogTable(self, tablename):
        """
        getCatalogTable(tablename) => Returns the specified CatalogTable
        
        Raises: CatalogException when the table does not exist
        """
        if tablename not in self._tables:
            raise GPCatalogException("No such catalog table: %s" %
                                     str(tablename))

        return self._tables[tablename]

    def getCatalogTables(self):
        """
        getCatalogTables() => Returns a list of CatalogTable
        """
        return list(self._tables.values())

    def getCatalogVersion(self):
        """
        getCatalogVersion() => Returns the GpVersion object
        """
        return self._version

    # --------------------------------------------------------------------
    # Private implementation functions:
    # --------------------------------------------------------------------
    def __init__(self, dbConnection):
        """
        Catalog() constructor

        1) Uses the supplied database connection to get a list of catalog tables
        2) Iterates through the list, building up CatalogTable objects
        3) Marks "coordinator only" tables manually
        4) Marks a couple of primary keys manually
        5) Marks foreign keys manually
        6) Marks known catalog differences manually
        7) Validates and returns the Catalog object
        """
        self._dbConnection = dbConnection
        self._tables = {}
        self._version = None
        self._tidycat = {}  # tidycat definitions from JSON file

        version_query = """
           SELECT version()
        """
        catalog_query = """
           SELECT oid, relname, relisshared FROM pg_class 
           WHERE relnamespace=11 and relkind = 'r' 
        """

        # Read the catalog version from the database
        try:
            curs = self._query(version_query)
        except Exception as e:
            raise GPCatalogException("Error reading database version: " +
                                     str(e))
        self._version = GpVersion(curs.getresult()[0][0])

        # Read the list of catalog tables from the database
        try:
            curs = self._query(catalog_query)
        except Exception as e:
            raise GPCatalogException("Error reading catalog: " + str(e))

        # Construct our internal representation of the catalog

        for [oid, relname, relisshared] in curs.getresult():
            self._tables[relname] = GPCatalogTable(self, relname)
            # Note: stupid API returns t/f for boolean value
            self._tables[relname]._setShared(relisshared == 't')
            self._tables[relname]._setOid(oid)

        # The tidycat.pl utility has been used to generate a JSON file
        # describing aspects of the catalog that we cannot currently
        # interrogate from the catalog itself.  This includes things
        # like which tables are coordinator only vs segment local and what
        # the foreign key relationships are.
        self._getJson()

        # Which tables are "coordinator only" is not derivable from the catalog
        # so we have to set this manually.
        self._markCoordinatorOnlyTables()

        # We derive primary keys for most of the catalogs based on their
        # unique indexes, but we have to manually set a few stragglers
        self._setPrimaryKeys()

        # Foreign key relationships of the catalog tables are not actually
        # defined in the catalog, so must be obtained from tidycat
        self._setForeignKeys()

        # Most catalog tables are now ready to go, but some columns cannot
        # be compared directly between segments; we need to indicate these
        # exceptions manually.
        self._setKnownDifferences()

        # Finally, validate that everything looks right; this will issue
        # warnings if there are any regular catalog tables that do not
        # have primary keys set.
        self._validate()

    def _query(self, qry):
        """
        Simple wrapper around querying the database connection
        """
        return self._dbConnection.query(qry)

    def _markCoordinatorOnlyTables(self):
        """
        We mark two types of catalog tables as "coordinator only"
          - True "coordinator only" tables
          - Tables we know to have different contents on coordinator/segment

        While the latter are not technically "coordinator only", they have
        the property that we cannot validate cross-segment consistency,
        which makes them equivalent for our current purposes.

        We may eventually want to move these tables into a different
        classification.
        """
        for name in COORDINATOR_ONLY_TABLES:
            if name in self._tables:
                self._tables[name]._setCoordinatorOnly()

        for name in SEGMENT_LOCAL_TABLES:
            if name in self._tables:
                self._tables[name]._setCoordinatorOnly()

    def _setPrimaryKeys(self):
        """
        Most of the catalog primary keys are set automatically in
        CatalogTable by looking at unique indexes over the catalogs.

        However, there are a couple of catalog tables without unique indexes
        on which we still want to perform cross-segment consistency checks;
        for these we have to set a primary key manually.
        """
        self._tables['gp_version_at_initdb']._setPrimaryKey(
            "schemaversion productversion")
        self._tables['pg_constraint']._setPrimaryKey(
            "conname connamespace conrelid contypid")
        self._tables['pg_depend']._setPrimaryKey(
            "classid objid objsubid refclassid refobjid refobjsubid deptype")

        if self._version >= "4.0":
            self._tables['pg_resqueuecapability']._setPrimaryKey(
                "resqueueid restypid")

    def _getJson(self):
        """
        Read the JSON file generated by tidycat, which contains, among other
        things, the primary key/foreign key relationships for the catalog
        tables.  Build the foreign keys for each table and validate them
        against the catalog.
        """
        indir = os.path.dirname(__file__)
        jname = str(self._version.getVersionRelease()) + ".json"
        try:
            # json doc in data subdirectory of pylib module
            infil = open(os.path.join(indir, "data", jname), "r")
            d = json.load(infil)
            # remove the tidycat comment
            if "__comment" in d:
                del d["__comment"]
            if "__info" in d:
                del d["__info"]
            infil.close()
            self._tidycat = d
        except Exception as e:
            # older versions of the product will not have tidycat defs --
            # we need to handle this case
            logger.warn("GPCatalogTable: " + str(e))

    def _setForeignKeys(self):
        """
        Set up the foreign key relationships among the catalogs.  We
        drive this from the tidycat-generated JSON file since this
        information is not derivable from the catalog itself.
        """
        try:
            for tname, tdef in self._tidycat.items():
                if "foreign_keys" not in tdef:
                    continue
                for fkdef in tdef["foreign_keys"]:
                    fk2 = GPCatalogTableForeignKey(tname, fkdef[0], fkdef[1],
                                                   fkdef[2])
                    self._tables[tname]._addForeignKey(fk2)
        except Exception as e:
            # older versions of the product will not have tidycat defs --
            # we need to handle this case
            logger.warn("GPCatalogTable: " + str(e))

    def _setKnownDifferences(self):
        """
        Some catalogs have columns that, for one reason or another, we
        need to mark as being different between the segments and the coordinator.
        
        These fall into two categories:
           - Bugs (marked with the appropriate jiras)
           - A small number of "special" columns
        """

        # -------------
        # Special cases
        # -------------

        # pg_class:
        #   - relfilenode should generally be consistent, but may not be (jira?)
        #   - relpages/reltuples/relfrozenxid/relminmxid are all vacuum/analyze related
        #   - relhasindex/relhaspkey are only cleared when vacuum completes
        #   - relowner has its own checks:
        #       => may want to separate out "owner" columns like acl and oid
        self._tables['pg_class']._setKnownDifferences(
            "relfilenode relpages reltuples relhasindex relhaspkey relowner relfrozenxid relminmxid relallvisible"
        )

        # pg_type: typowner has its own checks:
        #       => may want to separate out "owner" columns like acl and oid
        self._tables['pg_type']._setKnownDifferences("typowner")

        # pg_database: datfrozenxid and datminmxid are vacuum related
        self._tables['pg_database']._setKnownDifferences(
            "datfrozenxid datminmxid")

        # -------------
        # Issues still present in the product
        # -------------

        # MPP-11289 : inconsistent OIDS for table "default values"
        self._tables['pg_attrdef']._setKnownDifferences("oid")

        # MPP-11284 : inconsistent OIDS for constraints
        self._tables['pg_constraint']._setKnownDifferences("oid")

        # MPP-11282: Inconsistent oids for language callback functions
        # MPP-12015: Inconsistent oids for operator commutator/negator functions
        self._tables['pg_proc']._setKnownDifferences("oid prolang")

        # MPP-11282: pg_language oids and callback functions
        self._tables['pg_language']._setKnownDifferences(
            "oid lanplcallfoid lanvalidator")

        # MPP-12015: Inconsistent oids for operator commutator/negator functions
        # MPP-12015: Inconsistent oids for operator sort/cmp operators
        self._tables['pg_operator']._setKnownDifferences(
            "oid oprcom oprnegate oprlsortop oprrsortop oprltcmpop oprgtcmpop")

        self._tables['pg_aggregate']._setKnownDifferences("aggsortop")

        # MPP-11281 : Inconsistent oids for views
        self._tables['pg_rewrite']._setKnownDifferences("oid ev_action")

        # MPP-11285 : Inconsistent oids for triggers
        self._tables['pg_trigger']._setKnownDifferences("oid")

        # MPP-11575 : Inconsistent handling of indpred for partial indexes
        # The indcheckxmin column in pg_index (related to the HOT feature) is
        # calculated independently on the coordinator and each segment, based
        # on each node's transaction state; hence it can differ, so skip it
        # from the checks.
        self._tables['pg_index']._setKnownDifferences("indpred indcheckxmin")

    def _validate(self):
        """
        Check that all tables defined in the catalog have either been marked
        as "coordinator only" or have a primary key
        """
        for relname in sorted(self._tables):
            if self._tables[relname].isCoordinatorOnly():
                continue
            if self._tables[relname].getPrimaryKey() == []:
                logger.warn(
                    "GPCatalogTable: unable to derive primary key for %s" %
                    str(relname))
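
A minimal usage sketch for GPCatalog (assumptions: the supplied connection is
a classic PyGreSQL `pg` connection, inferred from the class calling `.query()`
on it and reading rows via `.getresult()`; the connection parameters below are
placeholders):

    import pg  # classic PyGreSQL interface (an assumption based on the API used)

    conn = pg.connect(dbname='postgres', host='localhost', port=5432)
    catalog = GPCatalog(conn)

    print(catalog.getCatalogVersion())        # GpVersion reported by the server
    for table in catalog.getCatalogTables():  # every known catalog table
        print(table)

    # Raises GPCatalogException if the table is not in the catalog
    pg_class = catalog.getCatalogTable('pg_class')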