def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None): if not summaryResult: raise Exception("summaryResult is empty for infoFromSummary") if h2o.beta_features: # names = summaryResult['names'] # means = summaryResult['means'] summaries = summaryResult['summaries'] # what if we didn't get the full # of cols in this summary view? # I guess the test should deal with that if 1==0 and numCols and (len(summaries)!=numCols): raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries))) for column in summaries: colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype)) if stattype == 'Enum': cardinality = stats['cardinality'] h2o_exec.checkForBadFP(cardinality, 'cardinality for colname: %s stattype: %s' % (colname, stattype)) else: mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] pctile = stats['pctile'] # check for NaN/Infinity in some of these # apparently we can get NaN in the mean for a numerica col with all NA? h2o_exec.checkForBadFP(mean, 'mean for colname: %s stattype: %s' % (colname, stattype), nanOkay=True) h2o_exec.checkForBadFP(sd, 'sd for colname: %s stattype %s' % (colname, stattype), nanOkay=True) h2o_exec.checkForBadFP(zeros, 'zeros for colname: %s stattype %s' % (colname, stattype)) if numRows and (nacnt==numRows): print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (colname, stattype) else: if not mins: print h2o.dump_json(column) raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows)) if not maxs: print h2o.dump_json(column) raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows)) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] if not noPrint: print "\n\n************************" print "colname:", colname print "coltype:", coltype print "nacnt:", nacnt print "stattype:", stattype if stattype == 'Enum': print "cardinality:", cardinality else: print "mean:", mean print "sd:", sd print "zeros:", zeros print "mins:", mins print "maxs:", maxs print "pct:", pct print "pctile:", pctile # histogram stuff print "hstart:", hstart print "hstep:", hstep print "hbrk:", hbrk print "hcnt:", hcnt else: summary = summaryResult['summary'] columnList = summary['columns'] # can't get the right number of columns in summary? have to ask for more cols (does va support > 1000) if 1==0 and numCols and (len(columnList)!=numCols): raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(columnList))) for column in columnList: N = column['N'] # self.assertEqual(N, rowCount) name = column['name'] stype = column['type'] histogram = column['histogram'] bin_size = histogram['bin_size'] bin_names = histogram['bin_names'] # if not noPrint: # for b in bin_names: # print "bin_name:", b bins = histogram['bins'] nbins = histogram['bins'] if not noPrint: print "\n\n************************" print "N:", N print "name:", name print "type:", stype print "bin_size:", bin_size print "len(bin_names):", len(bin_names), bin_names print "len(bins):", len(bins), bins print "len(nbins):", len(nbins), nbins # not done if enum if stype != "enum": zeros = column['zeros'] na = column['na'] maxs = column['max'] mins = column['min'] mean = column['mean'] sigma = column['sigma'] if not noPrint: print "zeros:", zeros print "na:", na print "maxs:", maxs print "mins:", mins print "mean:", mean print "sigma:", sigma if numRows and (na==numRows): print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (name, stype) else: if not mins: print h2o.dump_json(column) raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stype, N, na, numRows)) if not maxs: print h2o.dump_json(column) raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stype, N, na, numRows)) # sometimes we don't get percentiles? (if 0 or 1 bins?) if len(bins) >= 2: percentiles = column['percentiles'] thresholds = percentiles['thresholds'] values = percentiles['values'] if not noPrint: # h2o shows 5 of them, ordered print "len(max):", len(maxs), maxs print "len(min):", len(mins), mins print "len(thresholds):", len(thresholds), thresholds print "len(values):", len(values), values for v in values: # 0 is the most max or most min if not v >= mins[0]: m = "Percentile value %s should all be >= the min dataset value %s" % (v, mins[0]) raise Exception(m) if not v <= maxs[0]: m = "Percentile value %s should all be <= the max dataset value %s" % (v, maxs[0]) raise Exception(m)
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None): if not summaryResult: raise Exception("summaryResult is empty for infoFromSummary") summaries = summaryResult['summaries'] # what if we didn't get the full # of cols in this summary view? # I guess the test should deal with that if 1==0 and numCols and (len(summaries)!=numCols): raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries))) coltypeList = [] for column in summaries: colname = column['colname'] # is this always None? unused? coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] coltypeList.append(stattype) h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype)) if stattype == 'Enum': cardinality = stats['cardinality'] h2o_exec.checkForBadFP(cardinality, 'cardinality for colname: %s stattype: %s' % (colname, stattype)) else: mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] pctile = stats['pctile'] # check for NaN/Infinity in some of these # apparently we can get NaN in the mean for a numerica col with all NA? h2o_exec.checkForBadFP(mean, 'mean for colname: %s stattype: %s' % (colname, stattype), nanOkay=True, infOkay=True) h2o_exec.checkForBadFP(sd, 'sd for colname: %s stattype %s' % (colname, stattype), nanOkay=True, infOkay=True) h2o_exec.checkForBadFP(zeros, 'zeros for colname: %s stattype %s' % (colname, stattype)) if numRows and (nacnt==numRows): print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (colname, stattype) else: if not mins: print dump_json(column) # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows)) print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows) if not maxs: # this is failing on maprfs best buy...why? (va only?) print dump_json(column) # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows)) print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] if not noPrint: print "\n\n************************" print "colname:", colname print "coltype:", coltype print "nacnt:", nacnt print "stattype:", stattype if stattype == 'Enum': print "cardinality:", cardinality else: print "mean:", mean print "sd:", sd print "zeros:", zeros print "mins:", mins print "maxs:", maxs print "pct:", pct print "pctile:", pctile # histogram stuff print "hstart:", hstart print "hstep:", hstep print "hbrk:", hbrk print "hcnt:", hcnt return coltypeList
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None): if not summaryResult: raise Exception("summaryResult is empty for infoFromSummary") if h2o.beta_features: # names = summaryResult['names'] # means = summaryResult['means'] summaries = summaryResult['summaries'] # what if we didn't get the full # of cols in this summary view? # I guess the test should deal with that if 1 == 0 and numCols and (len(summaries) != numCols): raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries))) for column in summaries: colname = column['colname'] coltype = column['type'] nacnt = column['nacnt'] stats = column['stats'] stattype = stats['type'] h2o_exec.checkForBadFP( nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype)) if stattype == 'Enum': cardinality = stats['cardinality'] h2o_exec.checkForBadFP( cardinality, 'cardinality for colname: %s stattype: %s' % (colname, stattype)) else: mean = stats['mean'] sd = stats['sd'] zeros = stats['zeros'] mins = stats['mins'] maxs = stats['maxs'] pct = stats['pct'] pctile = stats['pctile'] # check for NaN/Infinity in some of these # apparently we can get NaN in the mean for a numerica col with all NA? h2o_exec.checkForBadFP(mean, 'mean for colname: %s stattype: %s' % (colname, stattype), nanOkay=True, infOkay=True) h2o_exec.checkForBadFP(sd, 'sd for colname: %s stattype %s' % (colname, stattype), nanOkay=True, infOkay=True) h2o_exec.checkForBadFP( zeros, 'zeros for colname: %s stattype %s' % (colname, stattype)) if numRows and (nacnt == numRows): print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % ( colname, stattype) else: if not mins: print h2o.dump_json(column) # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows)) print "Why is min[] empty for a %s col (%s) ? %s %s %s" % ( mins, stattype, colname, nacnt, numRows) if not maxs: # this is failing on maprfs best buy...why? (va only?) print h2o.dump_json(column) # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows)) print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % ( maxs, stattype, colname, nacnt, numRows) hstart = column['hstart'] hstep = column['hstep'] hbrk = column['hbrk'] hcnt = column['hcnt'] if not noPrint: print "\n\n************************" print "colname:", colname print "coltype:", coltype print "nacnt:", nacnt print "stattype:", stattype if stattype == 'Enum': print "cardinality:", cardinality else: print "mean:", mean print "sd:", sd print "zeros:", zeros print "mins:", mins print "maxs:", maxs print "pct:", pct print "pctile:", pctile # histogram stuff print "hstart:", hstart print "hstep:", hstep print "hbrk:", hbrk print "hcnt:", hcnt else: summary = summaryResult['summary'] columnList = summary['columns'] # can't get the right number of columns in summary? have to ask for more cols (does va support > 1000) if 1 == 0 and numCols and (len(columnList) != numCols): raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(columnList))) for column in columnList: N = column['N'] # self.assertEqual(N, rowCount) name = column['name'] stype = column['type'] histogram = column['histogram'] bin_size = histogram['bin_size'] bin_names = histogram['bin_names'] # if not noPrint: # for b in bin_names: # print "bin_name:", b bins = histogram['bins'] nbins = histogram['bins'] if not noPrint: print "\n\n************************" print "N:", N print "name:", name print "type:", stype print "bin_size:", bin_size print "len(bin_names):", len(bin_names), bin_names print "len(bins):", len(bins), bins print "len(nbins):", len(nbins), nbins # not done if enum if stype != "enum": zeros = column['zeros'] na = column['na'] maxs = column['max'] mins = column['min'] mean = column['mean'] sigma = column['sigma'] if not noPrint: print "zeros:", zeros print "na:", na print "maxs:", maxs print "mins:", mins print "mean:", mean print "sigma:", sigma if numRows and (na == numRows): print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % ( name, stype) else: if not mins: print h2o.dump_json(column) raise Exception( "Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stype, N, na, numRows)) if not maxs: print h2o.dump_json(column) # bestbuy dataset in maprfs is failing this ..for va only? not sure why. some nas? print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % ( maxs, stype, N, na, numRows) # sometimes we don't get percentiles? (if 0 or 1 bins?) if len(bins) >= 2: percentiles = column['percentiles'] thresholds = percentiles['thresholds'] values = percentiles['values'] if not noPrint: # h2o shows 5 of them, ordered print "len(max):", len(maxs), maxs print "len(min):", len(mins), mins print "len(thresholds):", len(thresholds), thresholds print "len(values):", len(values), values for v in values: # 0 is the most max or most min if not v >= mins[0]: m = "Percentile value %s should all be >= the min dataset value %s" % ( v, mins[0]) raise Exception(m) if not v <= maxs[0]: m = "Percentile value %s should all be <= the max dataset value %s" % ( v, maxs[0]) raise Exception(m)