コード例 #1
0
ファイル: data1.py プロジェクト: andrewcaelliott/acedata
def analyseTable(tablename, trace=0):
    """Profile every column of a table and return the findings as a dict.

    For each column reported by ``describe <tablename>`` the function counts
    the distinct values and, unless every row holds a different value, looks
    at how often each value occurs in order to classify the column.

    Args:
        tablename: Name of the table to analyse. It is concatenated directly
            into the SQL text, so it must come from a trusted source.
        trace: When truthy, progress and findings are printed to stdout.

    Returns:
        A dict with the keys ``tableName``, ``tableColumnCount``,
        ``tableRowCount`` and ``tableColumns``. ``tableColumns`` maps each
        column name to a dict holding ``columnName``, ``information``,
        ``looksLike`` (the column type), ``qualifier`` (space-separated
        flags such as ``sparselyUsed`` / ``unbalanced``) and, for columns
        that are not fully distinct, ``standouts``.

    Relies on ``execute``, ``analyseDistinctValues``, ``log2up``, ``prepare``,
    ``condense`` and ``collapse`` being available in the enclosing module.
    """
    # NOTE(security): table and column names are spliced into the SQL text.
    # Identifiers cannot be bound as query parameters, so callers must only
    # pass trusted table names.
    tableDict = {}
    tableDict["tableName"] = tablename
    columns = execute("describe " + tablename)
    tableDict["tableColumnCount"] = len(columns)
    # NOTE(review): eval() is applied to a value returned by the database.
    # Presumably execute() yields the count as text; if so int() would be the
    # safer conversion -- confirm the result format before replacing it.
    count = eval(execute("select count(*) from " + tablename)[0][0])
    tableDict["tableRowCount"] = count
    tableDict["tableColumns"] = {}

    for column in columns:
        columnName = column[0]
        columnDict = {}
        columnDict["columnName"] = columnName
        columnType = "data"
        columnQualifier = ""
        if trace:
            print(columnName)

        query = "select count(distinct " + columnName + ") from " + tablename
        # NOTE(review): same eval() concern as for the row count above.
        distinctCount = eval(execute(query)[0][0])

        if distinctCount == count:
            if trace:
                print("Each value is distinct")
            columnType = "unique"
            columnDict["information"] = log2up(distinctCount)
        else:
            distincts = execute(
                "select distinct " + columnName + ", count(*) from "
                + tablename + " group by " + columnName
            )
            # reverseCounts: occurrence count -> number of values occurring
            # that often; examples: occurrence count -> one example value.
            reverseCounts, examples = analyseDistinctValues(distincts)

            # Average occurrence count (geometric mean weighted by the number
            # of values per bucket), ignoring the lone NULL bucket.
            # It is reset for every column: without this, a column with no
            # countable bucket (e.g. all NULL) would read an unbound name or
            # the stale average of the previous column.
            cellavg = None
            celltotal = 0
            cells = 0
            for reverseCount, valueCount in reverseCounts.items():
                if valueCount == 1 and examples[reverseCount] is None:
                    continue
                cells += valueCount
                # NOTE(review): `Math` is not defined in this block;
                # presumably imported elsewhere in the module -- confirm.
                celltotal += Math.log10(reverseCount) * valueCount

            if cells > 0:
                cellavg = 10 ** (celltotal / cells)
                if trace:
                    print("avg occurrence %s" % (cellavg,))

            standouts = {}
            for reverseCount, valueCount in reverseCounts.items():
                example = examples[reverseCount]
                if valueCount == 1 and example is None:
                    standouts["Nulls"] = reverseCount
                    if trace:
                        print("Null occurs %s times." % (reverseCount,))

                if reverseCount == count:
                    columnType = "unused"
                elif reverseCount >= (0.9 * count):
                    columnQualifier += "sparselyUsed "
                elif trace:
                    print(
                        "There is/are %s value(s) that occurs %s times. eg %s"
                        % (valueCount, reverseCount, example)
                    )

                if cellavg is None:
                    # No average to compare against, so no bucket can be
                    # called unusually frequent or rare.
                    continue

                if reverseCount > 100 * cellavg:
                    banner = "*** HIGH OCCURRENCE"
                    standoutKey = "highFrequency-" + str(reverseCount)
                elif reverseCount < cellavg / 50:
                    banner = "*** LOW OCCURRENCE"
                    standoutKey = "lowFrequency-" + str(reverseCount)
                else:
                    continue

                if trace:
                    print(banner)
                occurrence = {}
                occurrence["repeatCount"] = reverseCount
                occurrence["valueCount"] = valueCount
                occurrence["example"] = example
                standouts[standoutKey] = occurrence
                # Record the qualifier only once per column.
                if "unbalanced" not in columnQualifier:
                    columnQualifier += "unbalanced "

            columnDict["standouts"] = standouts

            # Only refine the type if nothing above has already decided it.
            if columnType == "data":
                if distinctCount == 1:
                    columnType = "constant"
                elif distinctCount < 10:
                    columnType = "smallCategorisation"
                elif distinctCount < 40:
                    columnType = "categorisation"
                elif distinctCount > count / 2:
                    columnType = "key"

            columnDict["information"] = collapse(
                condense(sorted(prepare(reverseCounts)))
            )[0][2]

        if trace:
            print("%s looks like %s" % (columnName, columnQualifier + columnType))
        columnDict["looksLike"] = columnType
        columnDict["qualifier"] = columnQualifier
        if trace:
            print("")
        tableDict["tableColumns"][columnName] = columnDict
    return tableDict