def _isNullBucket(reverseCounts, examples, reverseCount):
    """Return True for a bucket holding exactly one value whose example is None."""
    return reverseCounts[reverseCount] == 1 and examples[reverseCount] is None


def _classifyByCardinality(distinctCount, count):
    """Map a distinct-value count onto a column type ("data" if no rule matches)."""
    if distinctCount == 1:
        return "constant"
    if distinctCount < 10:
        return "smallCategorisation"
    if distinctCount < 40:
        return "categorisation"
    if distinctCount > count / 2:
        return "key"
    return "data"


def analyseTable(tablename, trace=0):
    """Profile every column of a table and return the findings as a dict.

    For each column reported by ``describe <tablename>`` the function counts
    the distinct values and, unless every row holds a different value, looks
    at how often the values repeat in order to label the column.

    Parameters:
        tablename: name of the table; it is concatenated into the SQL text.
        trace: when truthy, progress messages are printed to stdout.

    Returns:
        A dict with the keys ``tableName``, ``tableColumnCount``,
        ``tableRowCount`` and ``tableColumns``.  ``tableColumns`` maps each
        column name to a dict with:

        * ``columnName``
        * ``looksLike`` - one of ``unique``, ``unused``, ``constant``,
          ``smallCategorisation``, ``categorisation``, ``key`` or ``data``
        * ``qualifier`` - space-terminated tags (``sparselyUsed``,
          ``unbalanced``), possibly empty
        * ``information`` - ``log2up(distinctCount)`` for unique columns,
          otherwise element ``[0][2]`` of
          ``collapse(condense(sorted(prepare(reverseCounts))))``
        * ``standouts`` - only for non-unique columns: the NULL count under
          ``"Nulls"`` plus ``highFrequency-<n>`` / ``lowFrequency-<n>``
          entries describing unusually common or rare repeat counts.

    NOTE(review): table and column names are spliced into the SQL text, so
    this must only be called with trusted identifiers.
    """
    # Imported locally so the function does not rely on a `Math` name being
    # provided by the surrounding module.
    import math

    tableDict = {}
    tableDict["tableName"] = tablename
    columns = execute("describe " + tablename)
    tableDict["tableColumnCount"] = len(columns)
    # NOTE(review): eval() on a value coming back from the database is risky;
    # presumably execute() returns numeric strings - confirm and switch to an
    # explicit numeric conversion if so.
    count = eval(execute("select count(*) from " + tablename)[0][0])
    tableDict["tableRowCount"] = count
    tableDict["tableColumns"] = {}

    for column in columns:
        columnName = column[0]
        columnDict = {"columnName": columnName}
        columnType = "data"
        columnQualifier = ""
        if trace:
            print(columnName)

        query = "select count(distinct " + columnName + ") from " + tablename
        distinctCount = eval(execute(query)[0][0])  # NOTE(review): see eval note above

        if distinctCount == count:
            if trace:
                print("Each value is distinct")
            columnType = "unique"
            columnDict["information"] = log2up(distinctCount)
        else:
            distincts = execute(
                "select distinct " + columnName + ", count(*) from "
                + tablename + " group by " + columnName
            )
            # reverseCounts: repeat count -> number of values repeated that
            # often; examples: repeat count -> one sample value.
            reverseCounts, examples = analyseDistinctValues(distincts)

            # Geometric mean of the repeat counts, weighted by how many
            # values share each count; the NULL bucket is left out.
            celltotal = 0
            cells = 0
            for reverseCount in reverseCounts:
                if _isNullBucket(reverseCounts, examples, reverseCount):
                    continue
                valueCount = reverseCounts[reverseCount]
                cells += valueCount
                celltotal += math.log10(reverseCount) * valueCount
            # cells > 0 exactly when a non-NULL bucket exists, so cellavg is
            # always bound before the non-NULL branch below reads it.
            if cells > 0:
                cellavg = 10 ** (celltotal / cells)
                if trace:
                    print("avg occurrence %s" % (cellavg,))

            standouts = {}
            for reverseCount in reverseCounts:
                if _isNullBucket(reverseCounts, examples, reverseCount):
                    standouts["Nulls"] = reverseCount
                    if trace:
                        print("Null occurs %s times." % (reverseCount,))
                    if reverseCount == count:
                        columnType = "unused"
                    elif reverseCount >= (0.9 * count):
                        columnQualifier += "sparselyUsed "
                    continue

                valueCount = reverseCounts[reverseCount]
                example = examples[reverseCount]
                if trace:
                    print(
                        "There is/are %s value(s) that occurs %s times. eg %s"
                        % (valueCount, reverseCount, example)
                    )
                occurrence = {
                    "repeatCount": reverseCount,
                    "valueCount": valueCount,
                    "example": example,
                }

                # Repeat counts far above or below the average stand out.
                standoutKind = None
                if reverseCount > 100 * cellavg:
                    standoutKind = "highFrequency-"
                    if trace:
                        print("*** HIGH OCCURRENCE")
                elif reverseCount < cellavg / 50:
                    standoutKind = "lowFrequency-"
                    if trace:
                        print("*** LOW OCCURRENCE")
                if standoutKind is not None:
                    standouts[standoutKind + str(reverseCount)] = occurrence
                    # Tag the column only once, however many standouts it has.
                    if "unbalanced" not in columnQualifier:
                        columnQualifier += "unbalanced "

            columnDict["standouts"] = standouts
            # Keep "unused" if the NULL check above already decided the type.
            if columnType == "data":
                columnType = _classifyByCardinality(distinctCount, count)
            columnDict["information"] = collapse(
                condense(sorted(prepare(reverseCounts)))
            )[0][2]

        if trace:
            print("%s looks like %s" % (columnName, columnQualifier + columnType))
        columnDict["looksLike"] = columnType
        columnDict["qualifier"] = columnQualifier
        if trace:
            print("")
        tableDict["tableColumns"][columnName] = columnDict

    return tableDict