Beispiel #1
0
    def getForeignKeys(self, columnseparator='|'):
        retval = []
        with pymysql.connect(host=self.db_host,
                             user=self.db_user,
                             passwd=self.db_password,
                             db=self.db_catalog) as cursor:
            cursor.execute(""" 
				SELECT
					TABLE_CATALOG, TABLE_SCHEMA, REFERENCED_TABLE_SCHEMA, TABLE_NAME, REFERENCED_TABLE_NAME, CONSTRAINT_NAME, GROUP_CONCAT(COLUMN_NAME SEPARATOR '{0}'), GROUP_CONCAT(REFERENCED_COLUMN_NAME SEPARATOR '{0}')
				FROM 
					INFORMATION_SCHEMA.KEY_COLUMN_USAGE
				WHERE 
					TABLE_SCHEMA = '{1}'
					AND
					REFERENCED_TABLE_NAME IS NOT NULL
					AND
					REFERENCED_TABLE_SCHEMA IS NOT NULL
					AND
					REFERENCED_COLUMN_NAME IS NOT NULL
				GROUP BY
					TABLE_NAME, TABLE_SCHEMA, REFERENCED_TABLE_NAME, REFERENCED_TABLE_SCHEMA
			""".format(columnseparator, self.db_catalog))
            retval = [
                ForeignKey(db_catalog=d[1],
                           pkdb_schema='',
                           fkdb_schema='',
                           pktablename=d[3],
                           fktablename=d[4],
                           keyname=d[5],
                           pk_columns=d[6].split(columnseparator),
                           fk_columns=d[7].split(columnseparator),
                           type='explicit') for d in cursor.fetchall()
            ]
        return retval
Beispiel #2
0
    def getForeignKeys(self, columnseparator='|'):
        retval = []
        with pymssql.connect(self.db_host, self.db_user, self.db_password,
                             self.db_catalog) as conn:
            with conn.cursor() as cursor:
                cursor.execute(""" 
					with cte1 as (
						SELECT  
							obj.name AS FK_NAME,
							pksch.name AS [pkschema],
							fksch.name AS [fkschema],
							tab1.name AS [table],
							col1.name AS [column],
							tab2.name AS [referenced_table],
							col2.name AS [referenced_column]
						FROM sys.foreign_key_columns fkc
						INNER JOIN sys.objects obj
							ON obj.object_id = fkc.constraint_object_id
						INNER JOIN sys.tables tab1
							ON tab1.object_id = fkc.parent_object_id
						INNER JOIN sys.tables tab2
							ON tab2.object_id = fkc.referenced_object_id
						INNER JOIN sys.schemas pksch
							ON tab1.schema_id = pksch.schema_id
						INNER JOIN sys.schemas fksch
							ON tab2.schema_id = fksch.schema_id
						INNER JOIN sys.columns col1
							ON col1.column_id = parent_column_id AND col1.object_id = tab1.object_id
						INNER JOIN sys.columns col2
							ON col2.column_id = referenced_column_id AND col2.object_id = tab2.object_id
					)

					SELECT
						G.pkschema,
						G.fkschema,
						G.[table] as pktable,
						G.referenced_table as fktable,
						G.FK_NAME,
						stuff(
						(
							select cast('{0}' as varchar(max)) + U.[column]
							from cte1 U
							WHERE U.FK_NAME = G.FK_NAME
							for xml path('')
						), 1, 1, '') AS pkcolumns,
						stuff(
						(
							select cast('{0}' as varchar(max)) + U.[referenced_column]
							from cte1 U
							WHERE U.FK_NAME = G.FK_NAME
							for xml path('')
						), 1, 1, '') AS fkcolumns
					FROM
						cte1 G
					GROUP BY
						G.FK_NAME, G.pkschema, G.fkschema, G.[table], G.referenced_table
				 """.format(columnseparator))
                retval = [
                    ForeignKey(db_catalog=self.db_catalog,
                               pkdb_schema=d[0],
                               fkdb_schema=d[1],
                               pktablename=d[2],
                               fktablename=d[3],
                               keyname=d[4],
                               pk_columns=d[5].split(columnseparator),
                               fk_columns=d[6].split(columnseparator),
                               type='explicit') for d in cursor.fetchall()
                ]
        return retval
    def discoverfks(self, theta):
        # phase 1
        fs = []
        fm = []
        # b will contain bottom-k sketches for each column, indexed on (<schemaname>, <tablename>, <columnname>)
        bksketches = {}
        quantiles = {}
        s = {}

        # calculate bottom-k sketch for all columns and store in dictionary <bksketches>
        for column in self.columns:
            bksketches[(column.db_schema, column.tablename,
                        column.columnname)] = self.bottomksketch(
                            self.getDataFn(column.db_schema, column.columnname,
                                           column.tablename))

        pkall = self.pksingle
        pkall.extend(self.pkmulti)
        for pk in pkall:  # foreach primary key (single and multi)
            pkcollst = pk.db_columns.split(self.colseparator)
            n = len(pkcollst)

            for keycolumn_name in pkcollst:  # foreach column in primary key
                for candidate in self.columns:  # foreach column as foreign key candidate
                    if self.inclusion(
                            bksketches[
                                (candidate.db_schema, candidate.tablename,
                                 candidate.columnname)], bksketches[
                                     (pk.db_schema, pk.tablename,
                                      keycolumn_name)]) >= theta and (
                                          candidate.tablename != pk.tablename):
                        if n == 1:  # in case we are dealing with a single column pk
                            fs.append(([candidate], pk))
                        if n > 1:  # in case we are dealing with a multi column pk
                            if (pk.db_columns, keycolumn_name) not in s:
                                s[(pk.db_columns, keycolumn_name)] = []
                            # dictionary s indexes on (<pk name>, <pk column>) where the pk name is generic (can be
                            # just concatenation of the columnnames), e.g.: ('id|name', 'id') and ('id|name', 'name')
                            # indicate the two entries in s for PK 'id|name'. For each entry we store a list of
                            # candidate columns found in other tables
                            s[(pk.db_columns,
                               keycolumn_name)].append(candidate)
            if n > 1:
                bksketches[(pk.db_schema, pk.tablename,
                            pk.db_columns)] = self.bottomksketch(
                                self.getDataFn(
                                    pk.db_schema,
                                    pk.db_columns.split(self.colseparator),
                                    pk.tablename))

            quantiles[(pk.db_schema, pk.tablename,
                       pk.db_columns)] = self.quantilehistogram(
                           self.getDataFn(
                               pk.db_schema,
                               pk.db_columns.split(self.colseparator),
                               pk.tablename))

        # phase 2

        # fks: dictionary that indexes on (<foreignkey table>, <primary key column>)
        # value of the dictionary are those candidate columns in <foreignkey table> for <primary key column>
        # TBD: remove the table loop
        fks = {}
        for kvp in s:
            spkcolname = kvp[1]
            for e in s[kvp]:
                key = (e.tablename, spkcolname)
                if key not in fks:
                    fks[key] = []
                fks[key].append(e)

        for pkm in self.pkmulti:
            pkcollst = pkm.db_columns.split(self.colseparator)
            print(pkm)
            # print()

            # for each table in the database, check if we have candidates in fks for this PK, if we do: get cartesian
            # product and store in the fm list
            for table in self.tables:
                tname = table.tablename
                L = []
                for pkcolumn in pkcollst:
                    key = (tname, pkcolumn)
                    if key not in fks:
                        continue
                    L.append(fks[key])
                if len(L) == len(pkcollst):
                    cart = self.cartesian(L)
                    for prod in cart:
                        fm.append((prod, pkm))

        for flst, pk in fm:
            pkcollst = pk.db_columns.split(self.colseparator)
            fcols = [c.columnname for c in flst]

            fschema = flst[0].db_schema  # TBD: ugly indices here
            ftable = flst[0].tablename  # TBD: and here

            fsample = self.bottomksketch(self.getDataFn(
                fschema, fcols, ftable))
            if self.inclusion(
                    fsample, bksketches[(pk.db_schema, pk.tablename,
                                         pk.db_columns)]) >= theta:
                quantiles[(pk.db_schema, pk.tablename,
                           pk.db_columns)] = self.quantilehistogram(
                               self.getDataFn(
                                   pk.db_schema,
                                   pk.db_columns.split(self.colseparator),
                                   pk.tablename))
                quantiles[(fschema, ftable,
                           "|".join(fcols))] = self.quantilehistogram(
                               self.getDataFn(fschema, fcols, ftable))
            else:
                fm.remove((flst, pk))

        for flst, pk in fs:
            # only index zero because every fs has only one candidate column...
            quantiles[(flst[0].db_schema, flst[0].tablename,
                       flst[0].columnname)] = self.quantilehistogram(
                           self.getDataFn(flst[0].db_schema,
                                          flst[0].columnname,
                                          flst[0].tablename))

        result = []
        fall = fs
        fall.extend(fm)

        for f, pk in fall:

            fcols = []
            for cdict in f:
                fcols.append(cdict.columnname)
            fschema = f[0].db_schema  # TBD: ugly indices here
            ftable = f[0].tablename  # TBD: and here

            if quantiles[(fschema, ftable,
                          "|".join(fcols))] is not None and quantiles[(
                              pk.db_schema, pk.tablename,
                              pk.db_columns)] is not None:  # empty columns....
                qfk = quantiles[(fschema, ftable, "|".join(fcols))]
                qpk = quantiles[(pk.db_schema, pk.tablename, pk.db_columns)]

                emdscore = 0
                try:
                    for i in range(len(qfk)):
                        fkhist = qfk[i][0]
                        pkhist = qpk[i][0]

                        fkbins = qfk[i][1]
                        pkbins = qpk[i][1]

                        emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1],
                                            pkbins[0:-1])
                    emdscore = emdscore / len(qfk[0])
                except:
                    emdscore = -1

                if math.isnan(emdscore):
                    emdscore = -1

                nfk = ForeignKey(db_catalog=pk.db_catalog,
                                 pkdb_schema=pk.db_schema,
                                 fkdb_schema=fschema,
                                 pktablename=pk.tablename,
                                 fktablename=ftable,
                                 fk_columns=fcols,
                                 keyname='implicit_fk',
                                 type='implicit')
                nfk.pk_columns = pk.db_columns
                nfk.score = emdscore

                result.append((nfk, emdscore))

        # print("## len(Q): " + str(len(q)))

        return sorted(result, key=lambda kvp: kvp[1], reverse=False)
Beispiel #4
0
 def single(self, fk, statistics=None):
     obj = ForeignKey()
     print(fk)
     obj.db_catalog = fk['db_catalog']
     obj.pkdb_schema = fk['srcschema']
     obj.fkdb_schema = fk['referred_schema']
     obj.pktablename = fk['srctable']
     obj.fktablename = fk['referred_table']
     obj.pk_columns = '|'.join(fk['constrained_columns'])
     obj.fk_columns = '|'.join(fk['referred_columns'])
     obj.keyname = fk['name']
     obj.type = 'EXPLICIT'
     obj.score = 1.0
     obj.comment = ''
     obj.tags = ''
     obj.date_added = datetime.datetime.now()
     return obj
Beispiel #5
0
	def discoverfks(self, theta):
		# phase 1
		fs = []
		fm = []
		# b will contain bottom-k sketches for each column, indexed on (<schemaname>, <tablename>, <columnname>)
		bksketches = {}
		quantiles = {}
		s = {}

		# calculate bottom-k sketch for all columns and store in dictionary <bksketches>
		for column in self.columns:
			bksketches[(column.db_schema, column.tablename, column.columnname)] = self.bottomksketch(self.getDataFn(column.db_schema, column.columnname, column.tablename))

		pkall = self.pksingle
		pkall.extend(self.pkmulti)
		for pk in pkall: # foreach primary key (single and multi)
			pkcollst = pk.db_columns.split(self.colseparator)
			n = len(pkcollst)

			for keycolumn_name in pkcollst: # foreach column in primary key
				for candidate in self.columns: # foreach column as foreign key candidate
					this = bksketches[(candidate.db_schema, candidate.tablename, candidate.columnname)]
					that = bksketches[(pk.db_schema, pk.tablename, keycolumn_name)]
					if self.inclusion(this, that) >= theta and (candidate.tablename != pk.tablename):
						if n == 1: # in case we are dealing with a single column pk
							fs.append(([candidate], pk))
						if n > 1: # in case we are dealing with a multi column pk
							if (pk.db_columns, keycolumn_name) not in s:
								s[(pk.db_columns, keycolumn_name)] = []
							# dictionary s indexes on (<pk name>, <pk column>) where the pk name is generic (can be
							# just concatenation of the columnnames), e.g.: ('id|name', 'id') and ('id|name', 'name')
							# indicate the two entries in s for PK 'id|name'. For each entry we store a list of
							# candidate columns found in other tables
							s[(pk.db_columns, keycolumn_name)].append(candidate)
			if n > 1:
				bksketches[(pk.db_schema, pk.tablename, pk.db_columns)] = self.bottomksketch(self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))

			quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))

		# phase 2

		# fks: dictionary that indexes on (<foreignkey table>, <primary key column>)
		# value of the dictionary are those candidate columns in <foreignkey table> for <primary key column>
		# TBD: remove the table loop
		fks = {}
		for kvp in s:
			spkcolname = kvp[1]
			for e in s[kvp]:
				key = (e.tablename, spkcolname)
				if key not in fks:
					fks[key] = []
				fks[key].append(e)

		for pkm in self.pkmulti:
			pkcollst = pkm.db_columns.split(self.colseparator)
			print(pkm)
			# print()

			# for each table in the database, check if we have candidates in fks for this PK, if we do: get cartesian
			# product and store in the fm list
			for table in self.tables:
				tname = table.tablename
				L = []
				for pkcolumn in pkcollst:
					key = (tname, pkcolumn)
					if key not in fks:
						continue
					L.append(fks[key])
				if len(L) == len(pkcollst):
					cart = self.cartesian(L)
					for prod in cart:
						fm.append((prod, pkm))

		for flst,pk in fm:
			pkcollst = pk.db_columns.split(self.colseparator)
			fcols = [ c.columnname for c in flst ]

			fschema = flst[0].db_schema # TBD: ugly indices here
			ftable = flst[0].tablename # TBD: and here

			fsample = self.bottomksketch(self.getDataFn(fschema, fcols, ftable))
			if self.inclusion(fsample, bksketches[(pk.db_schema, pk.tablename, pk.db_columns)]) >= theta:
				quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
				quantiles[(fschema, ftable, "|".join(fcols))] = self.quantilehistogram(self.getDataFn(fschema, fcols, ftable))
			else:
				fm.remove((flst,pk))

		for flst,pk in fs:
			# only index zero because every fs has only one candidate column...
			quantiles[(flst[0].db_schema, flst[0].tablename, flst[0].columnname)] = self.quantilehistogram(self.getDataFn(flst[0].db_schema, flst[0].columnname, flst[0].tablename))

		result = []
		fall = fs
		fall.extend(fm)

		for f,pk in fall:

			fcols = []
			for cdict in f:
				fcols.append(cdict.columnname)
			fschema = f[0].db_schema # TBD: ugly indices here
			ftable = f[0].tablename # TBD: and here

			if quantiles[(fschema, ftable, "|".join(fcols))] is not None and quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] is not None: # empty columns....
				qfk = quantiles[(fschema, ftable, "|".join(fcols))]
				qpk = quantiles[(pk.db_schema, pk.tablename, pk.db_columns)]

				emdscore = 0
				try:
					for i in range(len(qfk)):
						fkhist = qfk[i][0]
						pkhist = qpk[i][0]

						fkbins = qfk[i][1]
						pkbins = qpk[i][1]

						emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1], pkbins[0:-1])
					emdscore = emdscore/len(qfk[0])
				except:
					emdscore = -1

				if math.isnan(emdscore):
					emdscore = -1

				nfk = ForeignKey(db_catalog=pk.db_catalog, pkdb_schema=pk.db_schema, fkdb_schema=fschema, pktablename=pk.tablename, fktablename=ftable, fk_columns=fcols, keyname='implicit_fk', type='implicit')
				nfk.pk_columns=pk.db_columns
				nfk.score = emdscore

				result.append((nfk, emdscore))

		# print("## len(Q): " + str(len(q)))

		return sorted(result, key=lambda kvp: kvp[1], reverse=False)