def single(self, pk, statistics=None):
    """Build a ``PrimaryKey`` record for one explicitly declared key.

    Args:
        pk: Mapping describing the declared key. The keys read here are
            ``'db_catalog'``, ``'schemaname'``, ``'tablename'``,
            ``'constrained_columns'`` (an iterable of column-name strings)
            and ``'name'``.
        statistics: Accepted but not used by this method.

    Returns:
        A new ``PrimaryKey`` of type ``'EXPLICIT'`` with score ``1.0``,
        empty comment/tags and ``date_added`` set to the current local time.
    """
    key = PrimaryKey()

    # Where the key lives.
    key.db_catalog = pk['db_catalog']
    # NOTE(review): doOneTable stores the schema under `db_schema`, this
    # method under `schema` -- confirm which attribute PrimaryKey expects.
    key.schema = pk['schemaname']
    key.tablename = pk['tablename']

    # What the key is made of. Column names are pipe-separated here.
    # NOTE(review): doOneTable joins with self.colseparator instead of a
    # literal '|' -- confirm the two are meant to agree.
    key.db_columns = '|'.join(pk['constrained_columns'])
    key.keyname = pk['name']

    # A declared key is taken at face value: explicit, full confidence.
    key.type = 'EXPLICIT'
    key.score = 1.0
    key.comment = ''
    key.tags = ''
    key.date_added = datetime.datetime.now()

    return key
def doOneTable(self, table):
    """Detect implicit primary-key candidates for a single table.

    Column combinations of size 1, 2 and (for at most 10 usable columns) 3
    are first pruned by a cheap cardinality bound and then verified against
    the actual rows: a combination is reported when no two rows share the
    same stringified values for it.

    Args:
        table: Object exposing ``columns`` (each with ``name`` and ``type``),
            ``name`` and an ``info`` mapping with ``'db_catalog'`` and
            ``'schemaname'``.

    Returns:
        A list of ``PrimaryKey`` objects of type ``'IMPLICIT'``, ordered by
        combination size and, within a size, by descending cardinality
        bound. Any exception is printed and an empty list is returned, so
        one bad table does not abort a larger run.
    """
    excluded_fields = ['text', 'decimal', 'float', 'binary']
    retval = []
    try:
        # NOTE(review): this filter only has an effect if `c.type` compares
        # equal to these strings -- confirm against the column type in use.
        columnnames = [
            c.name for c in table.columns if c.type not in excluded_fields
        ]
        # Assumes each row is indexable and ordered like `columnnames`
        # (the original code indexed rows the same way) -- TODO confirm
        # against getDataForSelectedColumns.
        alldata = self.getDataForSelectedColumns(table)
        if not columnnames:
            return retval
        numrows = len(alldata)

        # Row position of every selected column. Captured once, up front,
        # so that dropping constant columns below cannot shift the indices
        # used to read values out of a row.
        position = {name: idx for idx, name in enumerate(columnnames)}

        # Number of distinct (stringified) values per column.
        distinct = {
            name: len({str(row[idx]) for row in alldata})
            for name, idx in position.items()
        }

        useless = set()      # constant columns: cannot help distinguish rows
        pks = set()          # columns whose distinct count equals the row count
        candidateCombi = {}  # combination tuple -> upper bound on distinct tuples
        for name in columnnames:
            cardinality = distinct[name]
            if cardinality == 1:
                useless.add(name)
            if cardinality == numrows:
                candidateCombi[(name,)] = cardinality
                pks.add(name)

        remaining = [name for name in columnnames if name not in useless]

        # Triples are only tried for narrow tables to keep the number of
        # combinations manageable.
        sizes = (2, 3) if len(remaining) <= 10 else (2,)
        for size in sizes:
            for combi in itertools.combinations(remaining, size):
                # A combination containing a single-column key is not minimal.
                if any(name in pks for name in combi):
                    continue
                bound = 1
                for name in combi:
                    bound *= distinct[name]
                # The product of per-column cardinalities bounds the number
                # of distinct tuples; below numrows it cannot be a key.
                if bound >= numrows:
                    candidateCombi[combi] = bound

        by_cardinality = sorted(
            candidateCombi.items(), key=operator.itemgetter(1), reverse=True
        )
        # Shortest combinations first. The upper bound is inclusive so a
        # key spanning all remaining columns (a lone unique column, or a
        # two-column link table) is not silently dropped.
        ranked = [
            combi
            for length in range(1, len(remaining) + 1)
            for combi, _ in by_cardinality
            if len(combi) == length
        ]

        # Verify every candidate against the real rows. Each candidate gets
        # its own pass, so rejecting one never affects how the others are
        # checked.
        validCombis = []
        for combi in ranked:
            indexes = [position[name] for name in combi]
            seen = set()
            for row in alldata:
                values = tuple(str(row[idx]) for idx in indexes)
                if values in seen:
                    break  # duplicate tuple -> not a key
                seen.add(values)
            else:
                validCombis.append(combi)

        for validCombi in validCombis:
            pk = PrimaryKey()
            pk.db_catalog = table.info['db_catalog']
            pk.db_schema = table.info['schemaname']
            pk.tablename = table.name
            pk.db_columns = self.colseparator.join(list(validCombi))
            pk.keyname = 'detected PK'
            pk.type = 'IMPLICIT'
            pk.score = 1.0
            retval.append(pk)
    except Exception as e:
        # Best effort per table: report the failure and carry on.
        print('could not process: ' + table.name)
        print(e)
    return retval