Exemple #1
0
 def single(self, pk, statistics=None):
     """Build a PrimaryKey record from one declared primary-key description.

     Args:
         pk: Mapping that provides the keys 'db_catalog', 'schemaname',
             'tablename', 'constrained_columns' (an iterable of column
             name strings) and 'name'.
         statistics: Accepted for interface compatibility; not used here.

     Returns:
         A new PrimaryKey marked 'EXPLICIT' with score 1.0, empty comment
         and tags, and date_added set to the current local time.
     """
     record = PrimaryKey()

     # NOTE(review): the schema name is stored under the attribute 'schema';
     # confirm that this is the attribute name PrimaryKey actually expects.
     attributes = (
         ('db_catalog', pk['db_catalog']),
         ('schema', pk['schemaname']),
         ('tablename', pk['tablename']),
         # Multi-column keys are flattened into one '|'-separated string.
         ('db_columns', '|'.join(pk['constrained_columns'])),
         ('keyname', pk['name']),
         ('type', 'EXPLICIT'),
         ('score', 1.0),
         ('comment', ''),
         ('tags', ''),
         ('date_added', datetime.datetime.now()),
     )
     for attribute, value in attributes:
         setattr(record, attribute, value)

     return record
Exemple #2
0
    def doOneTable(self, table):
        """Detect implicit (undeclared) primary-key candidates for one table.

        A column combination of width 1, 2 or 3 is reported when no two rows
        share the same stringified values for it. Triples are only tried
        when at most 10 usable columns remain, to bound the search.

        Args:
            table: Object exposing ``columns`` (items with ``name`` and
                ``type``), ``name`` and an ``info`` mapping with the keys
                'db_catalog' and 'schemaname'.

        Returns:
            A list of PrimaryKey objects of type 'IMPLICIT', narrowest
            combinations first. The list is empty when the table has no
            rows, no usable columns, or when processing fails (the failure
            is printed, not raised).
        """
        excluded_fields = ['text', 'decimal', 'float', 'binary']
        retval = []

        try:
            columnnames = [c.name for c in table.columns
                           if c.type not in excluded_fields]

            # NOTE(review): rows are assumed to hold exactly the non-excluded
            # columns, in the same order as `columnnames` -- confirm against
            # getDataForSelectedColumns.
            alldata = self.getDataForSelectedColumns(table)
            numrows = len(alldata)

            # Without rows (or columns) there is no evidence of uniqueness;
            # reporting every column as a key would only be noise.
            if not columnnames or numrows == 0:
                return retval

            # Row position of each column. This must stay tied to the full
            # column list: useless columns are dropped from the search below,
            # but they still occupy a slot in every row.
            position = {name: idx for idx, name in enumerate(columnnames)}

            # Number of distinct (stringified) values per column.
            distinct = {
                name: len({str(row[idx]) for row in alldata})
                for name, idx in position.items()
            }

            useless = set()    # constant columns: cannot help discriminate
            pks = set()        # columns that are unique on their own
            candidates = {}    # combination -> upper bound on distinct tuples
            for name in columnnames:
                if distinct[name] == 1:
                    useless.add(name)
                elif distinct[name] == numrows:
                    candidates[(name,)] = numrows
                    pks.add(name)

            usable = [name for name in columnnames if name not in useless]
            widths = (2, 3) if len(usable) <= 10 else (2,)
            for width in widths:
                for combi in itertools.combinations(usable, width):
                    # A combination containing a single-column key is
                    # trivially unique and therefore not interesting.
                    if any(key in pks for key in combi):
                        continue
                    n = 1
                    for key in combi:
                        n *= distinct[key]
                    # The product of distinct counts bounds the number of
                    # distinct tuples; below numrows it cannot be unique.
                    if n >= numrows:
                        candidates[combi] = n

            # Narrowest combinations first; within one width, the most
            # discriminating (largest bound) first.
            ranked = sorted(candidates,
                            key=lambda combi: (len(combi), -candidates[combi]))

            validCombis = []
            for combi in ranked:
                indexes = [position[key] for key in combi]
                seen = set()
                for row in alldata:
                    values = tuple(str(row[idx]) for idx in indexes)
                    if values in seen:
                        break  # duplicate found: not a key
                    seen.add(values)
                else:
                    validCombis.append(combi)

            for validCombi in validCombis:
                pk = PrimaryKey()
                pk.db_catalog = table.info['db_catalog']
                pk.db_schema = table.info['schemaname']
                pk.tablename = table.name
                pk.db_columns = self.colseparator.join(validCombi)
                pk.keyname = 'detected PK'
                pk.type = 'IMPLICIT'
                pk.score = 1.0
                retval.append(pk)
        except Exception as e:
            # Best effort: one failing table must not abort the whole run.
            print('could not process: ' + table.name)
            print(e)

        return retval