Example #1
0
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")
    if h2o.beta_features:
        # names = summaryResult['names']
        # means = summaryResult['means']
        summaries = summaryResult['summaries']

        # what if we didn't get the full # of cols in this summary view? 
        # I guess the test should deal with that
        if 1==0 and numCols and (len(summaries)!=numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries)))

        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype))

            if stattype == 'Enum':
                cardinality = stats['cardinality']
                h2o_exec.checkForBadFP(cardinality, 'cardinality for colname: %s stattype: %s' % (colname, stattype))
                
            else:
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                # check for NaN/Infinity in some of these
                # apparently we can get NaN in the mean for a numerica col with all NA?
                h2o_exec.checkForBadFP(mean, 'mean for colname: %s stattype: %s' % (colname, stattype), nanOkay=True)
                h2o_exec.checkForBadFP(sd, 'sd for colname: %s stattype %s' % (colname, stattype), nanOkay=True)
                h2o_exec.checkForBadFP(zeros, 'zeros for colname: %s stattype %s' % (colname, stattype))

                if numRows and (nacnt==numRows):
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (colname, stattype)
                else:
                    if not mins:
                        print h2o.dump_json(column)
                        raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                    if not maxs:
                        print h2o.dump_json(column)
                        raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            if not noPrint:
                print "\n\n************************"
                print "colname:", colname
                print "coltype:", coltype
                print "nacnt:", nacnt

                print "stattype:", stattype
                if stattype == 'Enum':
                    print "cardinality:", cardinality
                else:
                    print "mean:", mean
                    print "sd:", sd
                    print "zeros:", zeros
                    print "mins:", mins
                    print "maxs:", maxs
                    print "pct:", pct
                    print "pctile:", pctile

                # histogram stuff
                print "hstart:", hstart
                print "hstep:", hstep
                print "hbrk:", hbrk
                print "hcnt:", hcnt

    else:
        summary = summaryResult['summary']
        columnList = summary['columns']
        # can't get the right number of columns in summary? have to ask for more cols (does va support >  1000)
        if 1==0 and numCols and (len(columnList)!=numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(columnList)))
        for column in columnList:
            N = column['N']
            # self.assertEqual(N, rowCount)
            name = column['name']
            stype = column['type']
            histogram = column['histogram']
            bin_size = histogram['bin_size']
            bin_names = histogram['bin_names']
            # if not noPrint:
            #     for b in bin_names:
            #        print "bin_name:", b

            bins = histogram['bins']
            nbins = histogram['bins']
            if not noPrint:
                print "\n\n************************"
                print "N:", N
                print "name:", name
                print "type:", stype
                print "bin_size:", bin_size
                print "len(bin_names):", len(bin_names), bin_names
                print "len(bins):", len(bins), bins
                print "len(nbins):", len(nbins), nbins

            # not done if enum
            if stype != "enum":
                zeros = column['zeros']
                na = column['na']
                maxs = column['max']
                mins = column['min']
                mean = column['mean']
                sigma = column['sigma']
                if not noPrint:
                    print "zeros:", zeros
                    print "na:", na
                    print "maxs:", maxs
                    print "mins:", mins
                    print "mean:", mean
                    print "sigma:", sigma

                if numRows and (na==numRows):
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (name, stype)
                else:
                    if not mins:
                        print h2o.dump_json(column)
                        raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stype, N, na, numRows))
                    if not maxs:
                        print h2o.dump_json(column)
                        raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stype, N, na, numRows))


                # sometimes we don't get percentiles? (if 0 or 1 bins?)
                if len(bins) >= 2:
                    percentiles = column['percentiles']
                    thresholds = percentiles['thresholds']
                    values = percentiles['values']

                    if not noPrint:
                        # h2o shows 5 of them, ordered
                        print "len(max):", len(maxs), maxs
                        print "len(min):", len(mins), mins
                        print "len(thresholds):", len(thresholds), thresholds
                        print "len(values):", len(values), values

                    for v in values:
                        # 0 is the most max or most min
                       if not v >= mins[0]:
                            m = "Percentile value %s should all be >= the min dataset value %s" % (v, mins[0])
                            raise Exception(m)
                       if not v <= maxs[0]:
                            m = "Percentile value %s should all be <= the max dataset value %s" % (v, maxs[0])
                            raise Exception(m)
Example #2
0
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")

    summaries = summaryResult['summaries']
    # what if we didn't get the full # of cols in this summary view? 
    # I guess the test should deal with that
    if 1==0 and numCols and (len(summaries)!=numCols):
        raise Exception("Expected numCols: %s cols in summary. Got %s" % (numCols, len(summaries)))

    coltypeList = []
    for column in summaries:
        colname = column['colname']
        # is this always None? unused?
        coltype = column['type']
        nacnt = column['nacnt']
        stats = column['stats']
        stattype = stats['type']
        coltypeList.append(stattype)
        h2o_exec.checkForBadFP(nacnt, 'nacnt for colname: %s stattype: %s' % (colname, stattype))

        if stattype == 'Enum':
            cardinality = stats['cardinality']
            h2o_exec.checkForBadFP(cardinality, 'cardinality for colname: %s stattype: %s' % (colname, stattype))
            
        else:
            mean = stats['mean']
            sd = stats['sd']
            zeros = stats['zeros']
            mins = stats['mins']
            maxs = stats['maxs']
            pct = stats['pct']
            pctile = stats['pctile']

            # check for NaN/Infinity in some of these
            # apparently we can get NaN in the mean for a numerica col with all NA?
            h2o_exec.checkForBadFP(mean, 'mean for colname: %s stattype: %s' % (colname, stattype), nanOkay=True, infOkay=True)
            h2o_exec.checkForBadFP(sd, 'sd for colname: %s stattype %s' % (colname, stattype), nanOkay=True, infOkay=True)
            h2o_exec.checkForBadFP(zeros, 'zeros for colname: %s stattype %s' % (colname, stattype))

            if numRows and (nacnt==numRows):
                print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (colname, stattype)
            else:
                if not mins:
                    print dump_json(column)
                    # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                    print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows)
                if not maxs:
                    # this is failing on maprfs best buy...why? (va only?)
                    print dump_json(column)
                    # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))
                    print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows)

        hstart = column['hstart']
        hstep = column['hstep']
        hbrk = column['hbrk']
        hcnt = column['hcnt']

        if not noPrint:
            print "\n\n************************"
            print "colname:", colname
            print "coltype:", coltype
            print "nacnt:", nacnt

            print "stattype:", stattype
            if stattype == 'Enum':
                print "cardinality:", cardinality
            else:
                print "mean:", mean
                print "sd:", sd
                print "zeros:", zeros
                print "mins:", mins
                print "maxs:", maxs
                print "pct:", pct
                print "pctile:", pctile

            # histogram stuff
            print "hstart:", hstart
            print "hstep:", hstep
            print "hbrk:", hbrk
            print "hcnt:", hcnt

    return coltypeList
Example #3
0
def infoFromSummary(summaryResult, noPrint=False, numCols=None, numRows=None):
    if not summaryResult:
        raise Exception("summaryResult is empty for infoFromSummary")
    if h2o.beta_features:
        # names = summaryResult['names']
        # means = summaryResult['means']
        summaries = summaryResult['summaries']

        # what if we didn't get the full # of cols in this summary view?
        # I guess the test should deal with that
        if 1 == 0 and numCols and (len(summaries) != numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" %
                            (numCols, len(summaries)))

        for column in summaries:
            colname = column['colname']
            coltype = column['type']
            nacnt = column['nacnt']
            stats = column['stats']
            stattype = stats['type']
            h2o_exec.checkForBadFP(
                nacnt,
                'nacnt for colname: %s stattype: %s' % (colname, stattype))

            if stattype == 'Enum':
                cardinality = stats['cardinality']
                h2o_exec.checkForBadFP(
                    cardinality, 'cardinality for colname: %s stattype: %s' %
                    (colname, stattype))

            else:
                mean = stats['mean']
                sd = stats['sd']
                zeros = stats['zeros']
                mins = stats['mins']
                maxs = stats['maxs']
                pct = stats['pct']
                pctile = stats['pctile']

                # check for NaN/Infinity in some of these
                # apparently we can get NaN in the mean for a numerica col with all NA?
                h2o_exec.checkForBadFP(mean,
                                       'mean for colname: %s stattype: %s' %
                                       (colname, stattype),
                                       nanOkay=True,
                                       infOkay=True)
                h2o_exec.checkForBadFP(sd,
                                       'sd for colname: %s stattype %s' %
                                       (colname, stattype),
                                       nanOkay=True,
                                       infOkay=True)
                h2o_exec.checkForBadFP(
                    zeros,
                    'zeros for colname: %s stattype %s' % (colname, stattype))

                if numRows and (nacnt == numRows):
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (
                        colname, stattype)
                else:
                    if not mins:
                        print h2o.dump_json(column)
                        # raise Exception ("Why is min[] empty for a %s col (%s) ? %s %s %s" % (mins, stattype, colname, nacnt, numRows))
                        print "Why is min[] empty for a %s col (%s) ? %s %s %s" % (
                            mins, stattype, colname, nacnt, numRows)
                    if not maxs:
                        # this is failing on maprfs best buy...why? (va only?)
                        print h2o.dump_json(column)
                        # raise Exception ("Why is max[] empty for a %s col? (%s) ? %s %s %s" % (maxs, stattype, colname, nacnt, numRows))
                        print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (
                            maxs, stattype, colname, nacnt, numRows)

            hstart = column['hstart']
            hstep = column['hstep']
            hbrk = column['hbrk']
            hcnt = column['hcnt']

            if not noPrint:
                print "\n\n************************"
                print "colname:", colname
                print "coltype:", coltype
                print "nacnt:", nacnt

                print "stattype:", stattype
                if stattype == 'Enum':
                    print "cardinality:", cardinality
                else:
                    print "mean:", mean
                    print "sd:", sd
                    print "zeros:", zeros
                    print "mins:", mins
                    print "maxs:", maxs
                    print "pct:", pct
                    print "pctile:", pctile

                # histogram stuff
                print "hstart:", hstart
                print "hstep:", hstep
                print "hbrk:", hbrk
                print "hcnt:", hcnt

    else:
        summary = summaryResult['summary']
        columnList = summary['columns']
        # can't get the right number of columns in summary? have to ask for more cols (does va support >  1000)
        if 1 == 0 and numCols and (len(columnList) != numCols):
            raise Exception("Expected numCols: %s cols in summary. Got %s" %
                            (numCols, len(columnList)))
        for column in columnList:
            N = column['N']
            # self.assertEqual(N, rowCount)
            name = column['name']
            stype = column['type']
            histogram = column['histogram']
            bin_size = histogram['bin_size']
            bin_names = histogram['bin_names']
            # if not noPrint:
            #     for b in bin_names:
            #        print "bin_name:", b

            bins = histogram['bins']
            nbins = histogram['bins']
            if not noPrint:
                print "\n\n************************"
                print "N:", N
                print "name:", name
                print "type:", stype
                print "bin_size:", bin_size
                print "len(bin_names):", len(bin_names), bin_names
                print "len(bins):", len(bins), bins
                print "len(nbins):", len(nbins), nbins

            # not done if enum
            if stype != "enum":
                zeros = column['zeros']
                na = column['na']
                maxs = column['max']
                mins = column['min']
                mean = column['mean']
                sigma = column['sigma']
                if not noPrint:
                    print "zeros:", zeros
                    print "na:", na
                    print "maxs:", maxs
                    print "mins:", mins
                    print "mean:", mean
                    print "sigma:", sigma

                if numRows and (na == numRows):
                    print "%s is all NAs with type: %s. no checking for min/max/mean/sigma" % (
                        name, stype)
                else:
                    if not mins:
                        print h2o.dump_json(column)
                        raise Exception(
                            "Why is min[] empty for a %s col (%s) ? %s %s %s" %
                            (mins, stype, N, na, numRows))
                    if not maxs:
                        print h2o.dump_json(column)
                        # bestbuy dataset in maprfs is failing this ..for va only? not sure why. some nas?
                        print "Why is max[] empty for a %s col? (%s) ? %s %s %s" % (
                            maxs, stype, N, na, numRows)

                # sometimes we don't get percentiles? (if 0 or 1 bins?)
                if len(bins) >= 2:
                    percentiles = column['percentiles']
                    thresholds = percentiles['thresholds']
                    values = percentiles['values']

                    if not noPrint:
                        # h2o shows 5 of them, ordered
                        print "len(max):", len(maxs), maxs
                        print "len(min):", len(mins), mins
                        print "len(thresholds):", len(thresholds), thresholds
                        print "len(values):", len(values), values

                    for v in values:
                        # 0 is the most max or most min
                        if not v >= mins[0]:
                            m = "Percentile value %s should all be >= the min dataset value %s" % (
                                v, mins[0])
                            raise Exception(m)
                        if not v <= maxs[0]:
                            m = "Percentile value %s should all be <= the max dataset value %s" % (
                                v, maxs[0])
                            raise Exception(m)