Beispiel #1
0
def simpleCheckGLMScore(self, glmScore, family='gaussian', allowFailWarning=False, **kwargs):
    warnings = None
    if 'warnings' in glmScore:
        warnings = glmScore['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    validation = glmScore['validation']
    validation['err'] = h2o_util.cleanseInfNan(validation['err'])
    validation['nullDev'] = h2o_util.cleanseInfNan(validation['nullDev'])
    validation['resDev'] = h2o_util.cleanseInfNan(validation['resDev'])
    print "%15s %s" % ("err:\t", validation['err'])
    print "%15s %s" % ("nullDev:\t", validation['nullDev'])
    print "%15s %s" % ("resDev:\t", validation['resDev'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validation['auc'])
        print "%15s %s" % ("threshold:\t", validation['threshold'])

    err = False
    if family=="poisson" or family=="gaussian":
        if 'aic' not in validation:
            print "aic is missing from the glm json response"
            err = True

    if math.isnan(validation['err']):
        print "Why is this err = 'nan'?? %6s %s" % ("err:\t", validation['err'])
        err = True

    if math.isnan(validation['resDev']):
        print "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
        err = True

    if err:
        raise Exception ("How am I supposed to tell that any of these errors should be ignored?")

    # legal?
    if math.isnan(validation['nullDev']):
        ## emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", validation['nullDev'])
        ## raise Exception(emsg)
        pass
Beispiel #2
0
def simpleCheckGLMScore(self, glmScore, family='gaussian', allowFailWarning=False, **kwargs):
    warnings = None
    if 'warnings' in glmScore:
        warnings = glmScore['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    validation = glmScore['validation']
    validation['err'] = h2o_util.cleanseInfNan(validation['err'])
    validation['nullDev'] = h2o_util.cleanseInfNan(validation['nullDev'])
    validation['resDev'] = h2o_util.cleanseInfNan(validation['resDev'])
    print "%15s %s" % ("err:\t", validation['err'])
    print "%15s %s" % ("nullDev:\t", validation['nullDev'])
    print "%15s %s" % ("resDev:\t", validation['resDev'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validation['auc'])
        print "%15s %s" % ("threshold:\t", validation['threshold'])

    err = False
    if family=="poisson" or family=="gaussian":
        if 'aic' not in validation:
            print "aic is missing from the glm json response"
            err = True

    if math.isnan(validation['err']):
        print "Why is this err = 'nan'?? %6s %s" % ("err:\t", validation['err'])
        err = True

    if math.isnan(validation['resDev']):
        print "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation['resDev'])
        err = True

    if err:
        raise Exception ("How am I supposed to tell that any of these errors should be ignored?")

    # legal?
    if math.isnan(validation['nullDev']):
        ## emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", validation['nullDev'])
        ## raise Exception(emsg)
        pass
Beispiel #3
0
def simpleCheckGLMScore(self, glmScore, family="gaussian", allowFailWarning=False, **kwargs):
    warnings = None
    if "warnings" in glmScore:
        warnings = glmScore["warnings"]
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w) and not allowFailWarning:
                if re.search(c, w):
                    # ignore the fail to converge warning now
                    pass
                else:
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    validation = glmScore["validation"]
    validation["err"] = h2o_util.cleanseInfNan(validation["err"])
    validation["nullDev"] = h2o_util.cleanseInfNan(validation["nullDev"])
    validation["resDev"] = h2o_util.cleanseInfNan(validation["resDev"])
    print "%15s %s" % ("err:\t", validation["err"])
    print "%15s %s" % ("nullDev:\t", validation["nullDev"])
    print "%15s %s" % ("resDev:\t", validation["resDev"])

    # threshold only there if binomial?
    # auc only for binomial
    if family == "binomial":
        print "%15s %s" % ("auc:\t", validation["auc"])
        print "%15s %s" % ("threshold:\t", validation["threshold"])

    if family == "poisson" or family == "gaussian":
        print "%15s %s" % ("aic:\t", validation["aic"])

    if math.isnan(validation["err"]):
        emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validation["err"])
        raise Exception(emsg)

    if math.isnan(validation["resDev"]):
        emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validation["resDev"])
        raise Exception(emsg)

    # legal?
    if math.isnan(validation["nullDev"]):
        ## emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", validation['nullDev'])
        ## raise Exception(emsg)
        pass
Beispiel #4
0
def oldSimpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors. 
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    GLMModel = glm['glm_model']
    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" % dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    GLMParams = GLMModel['glm']
    family = GLMParams["family"]

    # number of submodels = number of lambda
    # min of 2. lambda_max is first
    submodels = GLMModel['submodels']
    # since all our tests?? only use one lambda, the best_lamda_idx should = 1
    best_lambda_idx = GLMModel['best_lambda_idx']
    print "best_lambda_idx:", best_lambda_idx
    lambda_max = GLMModel['lambda_max']
    print "lambda_max:", lambda_max

    # currently lambda_max is not set by tomas. ..i.e.not valid
    if 1==0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
        raise Exception("lambda_max %s should always be > the lambda result %s we're checking" % (lambda_max, submodels[best_lambda_idx].lambda_value))

    # submodels0 = submodels[0]
    # submodels1 = submodels[-1] # hackery to make it work when there's just one

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of lambdas (which has len %s)" % (best_lambda_idx, len(submodels)))

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception("best_lambda_idx: %s should point to one of submodels (which has len %s)" % (best_lambda_idx, len(submodels)))

    submodels1 = submodels[best_lambda_idx] # hackery to make it work when there's just one
    iterations = submodels1['iteration']


    print "GLMModel/iterations:", iterations

            # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations  > maxExpectedIterations:
            raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )

    if 'validation' not in submodels1:
        raise Exception("Should be a 'validation' key in submodels1: %s" % dump_json(submodels1))
    validationsList = submodels1['validation']
    validations = validationsList
        
    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    print "GLMModel/validations"        
    validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
    validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])        
    print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
    print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None

        for i,t in enumerate(thresholds):
            if t >= best_threshold: # ends up using next one if not present
                best_index = i
                break
            
        assert best_index!=None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        # FIX! this isn't right if we have multiple lambdas? different submodels?
        cms = submodels[0]['validation']['_cms']
        self.assertEqual(len(thresholds), len(cms), 
            msg="thresholds %s and cm %s should be lists of the same size. %s" % (len(thresholds), len(cms), thresholds))
        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        assert best_index<len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        # pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
        # FIX!
        pctWrong = 0
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        # print h2o_gbm.pp_cm(cm['_arr'])


    if family=="poisson" or family=="gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names)!=len(norm_beta):
    #    print len(coefficients_names), len(norm_beta)
    #    raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
#
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names)!=len(beta):
    #    print len(coefficients_names), len(beta)
    #    raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))


    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1

    for i,b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print  "coefficients:", coefficients
    print  "beta:", beta
    print  "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]
    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-i]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    if interceptName != "Intercept" or abs(beta_used[-1])<1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
            (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one shoudl be 'Intercept' ?
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
    # he gets it? 
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint: 
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint: 
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        # add kwargs to help debug without looking at console log
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) +  "\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" +
        "kwargs:" + dump_json(kwargs)
        ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients)>0):
        maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else: 
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero, 
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients)>1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26\n" +
            "kwargs:" + dump_json(kwargs)
            ))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, cList, intercept)
Beispiel #5
0
def simpleCheckGLM(self,
                   glm,
                   colX,
                   allowFailWarning=False,
                   allowZeroCoeff=False,
                   prettyPrint=False,
                   noPrint=False,
                   maxExpectedIterations=None,
                   doNormalized=False,
                   **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors.
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    if h2o.beta_features:
        GLMModel = glm['glm_model']
    else:
        GLMModel = glm['GLMModel']

    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" %
                        h2o.dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w) and not allowFailWarning:
                if re.search(c, w):
                    # ignore the fail to converge warning now
                    pass
                else:
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    if h2o.beta_features:
        GLMParams = GLMModel['glm']
    else:
        GLMParams = GLMModel["GLMParams"]

    family = GLMParams["family"]

    if h2o.beta_features:
        # number of submodels = number of lambda
        # min of 2. lambda_max is first
        submodels = GLMModel['submodels']
        lambdas = GLMModel['lambdas']
        # since all our tests?? only use one lambda, the best_lamda_idx should = 1
        best_lambda_idx = GLMModel['best_lambda_idx']
        print "best_lambda_idx:", best_lambda_idx
        lambda_max = GLMModel['lambda_max']
        print "lambda_max:", lambda_max

        # currently lambda_max is not set by tomas. ..i.e.not valid
        if 1 == 0 and lambda_max <= lambdas[best_lambda_idx]:
            raise Exception(
                "lambda_max %s should always be > the lambda result %s we're checking"
                % (lambda_max, lambdas[best_lambda_idx]))

        # submodels0 = submodels[0]
        # submodels1 = submodels[-1] # hackery to make it work when there's just one

        if (best_lambda_idx >= len(lambdas)) or (best_lambda_idx < 0):
            raise Exception(
                "best_lambda_idx: %s should point to one of lambdas (which has len %s)"
                % (best_lambda_idx, len(lambdas)))

        if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
            raise Exception(
                "best_lambda_idx: %s should point to one of submodels (which has len %s)"
                % (best_lambda_idx, len(submodels)))

        submodels1 = submodels[
            best_lambda_idx]  # hackery to make it work when there's just one
        iterations = submodels1['iteration']

    else:
        iterations = GLMModel['iterations']

    print "GLMModel/iterations:", iterations

    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations > maxExpectedIterations:
        raise Exception(
            "Convergence issue? GLM did iterations: %d which is greater than expected: %d"
            % (iterations, maxExpectedIterations))

    if h2o.beta_features:
        if 'validation' not in submodels1:
            raise Exception("Should be a 'validation' key in submodels1: %s" %
                            h2o.dump_json(submodels1))
        validationsList = submodels1['validation']
        validations = validationsList

    else:
        # pop the first validation from the list
        if 'validations' not in GLMModel:
            raise Exception("Should be a 'validations' key in GLMModel: %s" %
                            h2o.dump_json(GLMModel))
        validationsList = GLMModel['validations']
        # don't want to modify validationsList in case someone else looks at it
        validations = validationsList[0]

    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    # not checked in v2?
    if not h2o.beta_features:
        if not 'xval_models' in validations:
            if n_folds > 1:
                raise Exception(
                    "No cross validation models returned. Asked for " +
                    n_folds)
        else:
            xval_models = validations['xval_models']
            if n_folds and n_folds > 1:
                if len(xval_models) != n_folds:
                    raise Exception(
                        len(xval_models) +
                        " cross validation models returned. Asked for " +
                        n_folds)
            else:
                # should be default 10?
                if len(xval_models) != 10:
                    raise Exception(
                        str(len(xval_models)) +
                        " cross validation models returned. Default should be 10"
                    )

    if h2o.beta_features:
        print "GLMModel/validations"
        validations['null_deviance'] = h2o_util.cleanseInfNan(
            validations['null_deviance'])
        validations['residual_deviance'] = h2o_util.cleanseInfNan(
            validations['residual_deviance'])
        print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
        print "%15s %s" % ("residual_deviance:\t",
                           validations['residual_deviance'])

    else:
        print "GLMModel/validations"
        validations['err'] = h2o_util.cleanseInfNan(validations['err'])
        validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
        validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
        print "%15s %s" % ("err:\t", validations['err'])
        print "%15s %s" % ("nullDev:\t", validations['nullDev'])
        print "%15s %s" % ("resDev:\t", validations['resDev'])

    # threshold only there if binomial?
    # auc only for binomial
    if family == "binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        if h2o.beta_features:
            best_threshold = validations['best_threshold']
            thresholds = validations['thresholds']
            print "%15s %s" % ("best_threshold:\t", best_threshold)

            # have to look up the index for the cm, from the thresholds list
            best_index = None

            # FIX! best_threshold isn't necessarily in the list. jump out if >=
            for i, t in enumerate(thresholds):
                if t >= best_threshold:  # ends up using next one if not present
                    best_index = i
                    break

            assert best_index != None, "%s %s" % (best_threshold, thresholds)
            print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

            # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
            submodels = glm['glm_model']['submodels']
            cms = submodels[0]['validation']['_cms']
            assert best_index < len(cms), "%s %s" % (best_index, len(cms))
            # if we want 0.5..rounds to int
            # mid = len(cms)/2
            # cm = cms[mid]
            cm = cms[best_index]

            print "cm:", h2o.dump_json(cm['_arr'])
            predErr = cm['_predErr']
            classErr = cm['_classErr']
            # compare to predErr
            pctWrong = h2o_gbm.pp_cm_summary(cm['_arr'])
            print "predErr:", predErr
            print "calculated pctWrong from cm:", pctWrong
            print "classErr:", classErr

            # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

            print "\nTrain\n==========\n"
            print h2o_gbm.pp_cm(cm['_arr'])
        else:
            print "%15s %s" % ("threshold:\t", validations['threshold'])

    if family == "poisson" or family == "gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    if not h2o.beta_features:
        if math.isnan(validations['err']):
            emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t",
                                                         validations['err'])
            raise Exception(emsg)

        if math.isnan(validations['resDev']):
            emsg = "Why is this resDev = 'nan'?? %6s %s" % (
                "resDev:\t", validations['resDev'])
            raise Exception(emsg)

        # legal?
        if math.isnan(validations['nullDev']):
            pass

    # get a copy, so we don't destroy the original when we pop the intercept
    if h2o.beta_features:
        coefficients_names = GLMModel['coefficients_names']
        # print "coefficients_names:", coefficients_names
        idxs = submodels1['idxs']
        print "idxs:", idxs
        column_names = coefficients_names

        # always check both normalized and normal coefficients
        norm_beta = submodels1['norm_beta']
        # if norm_beta and len(column_names)!=len(norm_beta):
        #    print len(column_names), len(norm_beta)
        #    raise Exception("column_names and normalized_norm_beta from h2o json not same length. column_names: %s normalized_norm_beta: %s" % (column_names, norm_beta))
        #
        beta = submodels1['beta']
        # print "beta:", beta
        # if len(column_names)!=len(beta):
        #    print len(column_names), len(beta)
        #    raise Exception("column_names and beta from h2o json not same length. column_names: %s beta: %s" % (column_names, beta))

        # test wants to use normalized?
        if doNormalized:
            beta_used = norm_beta
        else:
            beta_used = beta

        coefficients = {}
        # create a dictionary with name, beta (including intercept) just like v1

        for i, b in zip(idxs, beta_used[:-1]):
            name = coefficients_names[i]
            coefficients[name] = b

        print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
        print "coefficients:", coefficients
        print "beta:", beta
        print "norm_beta:", norm_beta

        coefficients['Intercept'] = beta_used[-1]
        print "intercept demapping info:", \
            "column_names[-i]:", column_names[-1], \
            "idxs[-1]:", idxs[-1], \
            "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
            "beta_used[-1]:", beta_used[-1], \
            "coefficients['Intercept']", coefficients['Intercept']

        # last one is intercept
        interceptName = coefficients_names[idxs[-1]]
        if interceptName != "Intercept" or abs(beta_used[-1]) < 1e-26:
            raise Exception("'Intercept' should be last in coefficient_names and beta %s %s %s" %\
                (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

        # idxs has the order for non-zero coefficients, it's shorter than beta_used and column_names
        # new 5/28/14. glm can point to zero coefficients
        # for i in idxs:
        #     if beta_used[i]==0.0:
        ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
        if len(idxs) > len(beta_used):
            raise Exception("idxs shouldn't be longer than beta_used %s %s" %
                            (len(idxs), len(beta_used)))
        intercept = coefficients.pop('Intercept', None)

        # intercept demapping info: idxs[-1]: 54 coefficient_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
        # the last one shoudl be 'Intercept' ?
        column_names.pop()

    else:
        if doNormalized:
            coefficients = GLMModel['normalized_coefficients'].copy()
        else:
            coefficients = GLMModel['coefficients'].copy()
        column_names = GLMModel['column_names']
        # get the intercept out of there into it's own dictionary
        intercept = coefficients.pop('Intercept', None)
        print "First intercept:", intercept

    # have to skip the output col! get it from kwargs
    # better always be there!
    if h2o.beta_features:
        y = kwargs['response']
    else:
        y = kwargs['y']

    # the dict keys are column headers if they exist...how to order those? new: use the 'column_names'
    # from the response
    # Tomas created 'column_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'column_names' if dropped by GLMQuery before
    # he gets it?
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint:
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # column_names is input only now..same for header or no header, or expanded enums
    for c in column_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint:
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(column_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        self.assertGreater(
            absXCoeff, 1e-26,
            ("abs. value of GLM coefficients['" + str(colX) + "'] is " +
             str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX)))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coefficients['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept"))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients) > 0):
        maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[
            maxKey]
        minKey = min([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[
            minKey]
    else:
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero,
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients) > 1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(
            s, 1e-26, ("sum of abs. value of GLM coefficients/intercept is " +
                       str(s) + ", not >= 1e-26"))

    if h2o.beta_features:
        print "submodels1, run_time (milliseconds):", submodels1['run_time']
    else:

        print "GLMModel model time (milliseconds):", GLMModel['model_time']
        print "GLMModel validation time (milliseconds):", validations[
            'val_time']
        print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()

    return (warnings, cList, intercept)
Beispiel #6
0
def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter

    # h2o GLM will verboseprint the result and print errors. 
    # so don't have to do that
    # different when cross validation  is used? No trainingErrorDetails?
    if h2o.beta_features:
        GLMModel = glm['glm_model']
    else:
        GLMModel = glm['GLMModel']

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x,w) and not allowFailWarning: 
                if re.search(c,w):
                    # ignore the fail to converge warning now
                    pass
                else: 
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    if h2o.beta_features:
        GLMParams = GLMModel['glm']
    else:
        GLMParams = GLMModel["GLMParams"]

    family = GLMParams["family"]

    if h2o.beta_features:
        submodels0 = GLMModel['submodels'][0]
        iterations = submodels0['iteration']
    else:
        iterations = GLMModel['iterations']

    print "GLMModel/iterations:", iterations

            # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations  > maxExpectedIterations:
            raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )

    if h2o.beta_features:
        if 'validation' not in submodels0:
            raise Exception("Should be a 'validation' key in submodels0: %s" % h2o.dump_json(submodels0))
        validationsList = submodels0['validation']
        validations = validationsList
        
    else:
        # pop the first validation from the list
        if 'validations' not in GLMModel:
            raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel))
        validationsList = GLMModel['validations']
        # don't want to modify validationsList in case someone else looks at it
        validations = validationsList[0]

    # xval. compare what we asked for and what we got.
    n_folds = kwargs.setdefault('n_folds', None)

    # not checked in v2?
    if not h2o.beta_features:
        if not 'xval_models' in validations:
            if n_folds > 1:
                raise Exception("No cross validation models returned. Asked for "+n_folds)
        else:
            xval_models = validations['xval_models']
            if n_folds and n_folds > 1:
                if len(xval_models) != n_folds:
                    raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds)
            else:
                # should be default 10?
                if len(xval_models) != 10:
                    raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10")

    if h2o.beta_features:
        print "GLMModel/validations"
        validations['avg_err'] = h2o_util.cleanseInfNan(validations['avg_err'])
        validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
        validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])
        print "%15s %s" % ("avg_err:\t", validations['avg_err'])
        print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
        print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    else:
        print "GLMModel/validations"
        validations['err'] = h2o_util.cleanseInfNan(validations['err'])
        validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
        validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
        print "%15s %s" % ("err:\t", validations['err'])
        print "%15s %s" % ("nullDev:\t", validations['nullDev'])
        print "%15s %s" % ("resDev:\t", validations['resDev'])

    # threshold only there if binomial?
    # auc only for binomial
    if family=="binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        if h2o.beta_features:
            print "%15s %s" % ("best_threshold:\t", validations['best_threshold'])
        else:
            print "%15s %s" % ("threshold:\t", validations['threshold'])

    if family=="poisson" or family=="gaussian":
        print "%15s %s" % ("aic:\t", validations['aic'])

    if not h2o.beta_features:
        if math.isnan(validations['err']):
            emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err'])
            raise Exception(emsg)

        if math.isnan(validations['resDev']):
            emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev'])
            raise Exception(emsg)

        # legal?
        if math.isnan(validations['nullDev']):
            pass

    # get a copy, so we don't destroy the original when we pop the intercept
    if h2o.beta_features:
        coefficients_names = GLMModel['coefficients_names']
        idxs = submodels0['idxs']
        column_names = coefficients_names

        # always check both normalized and normal coefficients
        norm_beta = submodels0['norm_beta']
        if len(column_names)!=len(norm_beta):
            print len(column_names), len(norm_beta)
            raise Exception("column_names and normalized_norm_beta from h2o json not same length. column_names: %s normalized_norm_beta: %s" % (column_names, norm_beta))

        beta = submodels0['beta']
        if len(column_names)!=len(beta):
            print len(column_names), len(beta)
            raise Exception("column_names and beta from h2o json not same length. column_names: %s beta: %s" % (column_names, beta))


        # test wants to use normalized?
        if doNormalized:
            beta_used = norm_beta
        else:
            beta_used = beta

        coefficients = {}
        # create a dictionary with name, beta (including intercept) just like v1

        for n,b in zip(column_names, beta_used):
            coefficients[n] = b

        print  "coefficients:", coefficients
        print  "beta:", beta
        print  "norm_beta:", norm_beta

        print "intercept demapping info:", \
            "column_names[-i]:", column_names[-1], \
            "idxs[-1]:", idxs[-1], \
            "coefficients_names[[idxs[-1]]:", coefficients_names[idxs[-1]], \
            "beta_used[-1]:", beta_used[-1], \
            "coefficients['Intercept']", coefficients['Intercept']

        # idxs has the order for non-zero coefficients, it's shorter than beta_used and column_names
        for i in idxs:
            if beta_used[i]==0.0:
                raise Exception("idxs shouldn't point to any 0 coefficients i: %s beta_used[i]:" (i, beta_used[i]))

        intercept = coefficients.pop('Intercept', None)

        # intercept demapping info: idxs[-1]: 54 coefficient_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
        # the last one shoudl be 'Intercept' ?
        column_names.pop()

    else:
        if doNormalized:
            coefficients = GLMModel['normalized_coefficients'].copy()
        else:
            coefficients = GLMModel['coefficients'].copy()
        column_names = GLMModel['column_names']
        # get the intercept out of there into it's own dictionary
        intercept = coefficients.pop('Intercept', None)
        print "First intercept:", intercept

    # have to skip the output col! get it from kwargs
    # better always be there!
    if h2o.beta_features:
        y = kwargs['response']
    else:
        y = kwargs['y']


    # the dict keys are column headers if they exist...how to order those? new: use the 'column_names'
    # from the response
    # Tomas created 'column_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'column_names' if dropped by GLMQuery before
    # he gets it? 
    def add_to_coefficient_list_and_string(c, cList, cString):
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e   " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                  "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s   " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint: 
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # column_names is input only now..same for header or no header, or expanded enums
    for c in column_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint: 
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(column_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        self.assertGreater(absXCoeff, 1e-26, (
            "abs. value of GLM coefficients['" + str(colX) + "'] is " +
            str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX)
            ))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26, (
        "abs. value of GLM coefficients['Intercept'] is " +
        str(absIntercept) + ", not >= 1e-26 for Intercept"
                ))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients)>0):
        maxKey = max([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[maxKey]
        minKey = min([(abs(coefficients[x]),x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[minKey]
    else: 
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero, 
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value  up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients)>1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(s, 1e-26, (
            "sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26"
            ))

    if h2o.beta_features:
        print "submodels0, run_time (milliseconds):", submodels0['run_time']
    else:

        print "GLMModel model time (milliseconds):", GLMModel['model_time']
        print "GLMModel validation time (milliseconds):", validations['val_time']
        print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']

    # shouldn't have any errors
    h2o.check_sandbox_for_errors()

    return (warnings, cList, intercept)