def rapids(self, timeoutSecs=120, ignoreH2oError=False, **kwargs):
    """POST a Rapids request to 'Rapids.json' and return the JSON result.

    Recognized kwargs are 'ast' and 'funs'; each must be a string (or None).

    Raises Exception when the result carries a truthy 'exception' entry,
    unless ignoreH2oError is set. The sandbox is checked for errors before
    returning.
    """
    # FIX! assume both of these are strings for now, not lists
    for name in ('ast', 'funs'):
        value = kwargs.get(name)
        if value is not None:
            assert isinstance(value, basestring), "only string assumed? %s" % value

    # currently runExec only does one or the other
    params_dict = {
        'ast': None,
        'funs': None,
    }
    check_params_update_kwargs(params_dict, kwargs, 'rapids', True)

    # The request is always sent as a POST; the old GET variant sat in an
    # unreachable `else` branch of `if 1==1` and has been dropped.
    result = self.do_json_request('Rapids.json', cmd='post', timeout=timeoutSecs, postData=params_dict)
    verboseprint("rapids result:", dump_json(result))

    # FIX! maybe add something for ignoring conditionally?
    if 'exception' in result and result['exception'] and not ignoreH2oError:
        exception = result['exception']
        raise Exception('rapids with kwargs:\n%s\ngot exception:\n"%s"\n' % (dump_json(kwargs), exception))

    h2o_sandbox.check_sandbox_for_errors()
    return result
def get_redirect_url(response):
    """Work out where to poll next from a polling response.

    Returns a (url, params) tuple; both are None once the status is 'done'.
    Two response layouts are handled: one keyed by 'response_info' and an
    older one keyed by 'response'. Malformed responses raise Exception.

    NOTE: `self` is not a parameter here; it is resolved from the enclosing
    scope when a redirect has to be turned into a url.
    """
    # StoreView has old style, while beta_features
    if 'response_info' in response:
        info = response['response_info']
        if 'redirect_url' not in info:
            raise Exception("Response during polling must have 'redirect_url'\n%s" % dump_json(response))
        if info['status'] == 'done':
            return (None, None)
        target = info['redirect_url']
        if not target:
            # still running, yet nothing to follow
            raise Exception(
                "'redirect_url' during polling is null but status!='done': \n%s" % dump_json(response))
        return (self.url(target), None)

    if 'response' not in response:
        raise Exception("'response' not in response.\n%s" % dump_json(response))
    legacy = response['response']
    if legacy['status'] == 'done':
        return (None, None)
    if 'redirect_request' not in legacy:
        raise Exception("'redirect_request' not in response. \n%s" % dump_json(response))
    return (self.url(legacy['redirect_request']), legacy['redirect_request_args'])
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False): warnings = '' intercept = model.global_beta[-1] interceptName = model.coefficient_names[-1] coeffs = model.global_beta[:-1] coeffs_names = model.coefficient_names[:-1] assert len(coeffs) == (len(model.coefficient_names)-1) assert len(coeffs) == len(labelListUsed), "%s %s" % (coeffs, labelListUsed) # labelList still has the response column? # ignored columns aren't in model.names, but output response is. # labelListUsed has the response col removed so add 1 assert len(model.names) == (len(labelListUsed)+1), "%s %s" % (model.names, labelList) assert model.threshold!=0 print "len(coeffs)", len(coeffs) print "coeffs:", coeffs # last one is intercept if interceptName != "Intercept" or abs(intercept)<1e-26: raise Exception("'Intercept' should be last in coefficient_names and global_beta %s %s" % (interceptName, intercept)) y = parameters['response_column'] cString = "\n" for i,c in enumerate(coeffs_names): cString += "%s: %.5e " % (coeffs_names[i], coeffs[i]) print cString print "\nH2O intercept:\t\t%.5e" % intercept print "\nTotal # of coeffs:", len(coeffs_names) # intercept is buried in there too absIntercept = abs(float(intercept)) self.assertGreater(absIntercept, 1e-26, ( "abs. value of GLM coeffs['Intercept'] is " + str(absIntercept) + ", not >= 1e-26 for Intercept" + "\n" + "parameters:" + dump_json(parameters) )) if (not allowZeroCoeff) and (len(coeffs)>1): s = 0.0 for c in coeffs: s += abs(float(c)) self.assertGreater(s, 1e-26, ( "sum of abs. value of GLM coeffs/intercept is " + str(s) + ", not >= 1e-26\n" + "parameters:" + dump_json(parameters) )) # shouldn't have any errors check_sandbox_for_errors() return (warnings, coeffs, intercept)
def checkScalarResult(resultExec, resultKey, allowEmptyResult=False, nanOkay=False): # make the common problems easier to debug verboseprint("checkScalarResult resultExec:", dump_json(resultExec)) if 'funstr' not in resultExec: emsg = "checkScalarResult: 'funstr' missing" if 'result' not in resultExec: emsg = "checkScalarResult: 'result' missing" if 'scalar' not in resultExec: emsg = "checkScalarResult: 'scalar' missing" if 'num_cols' not in resultExec: emsg = "checkScalarResult: 'num_cols' missing" if 'num_rows' not in resultExec: emsg = "checkScalarResult: 'num_rows' missing" elif 'cols' not in resultExec: emsg = "checkScalarResult: 'cols' missing" else: emsg = None num_cols = resultExec["num_cols"] num_rows = resultExec["num_rows"] cols = resultExec["cols"] # print "cols:", dump_json(cols) if emsg: print "\nKey: '" + str(resultKey) + "' resultExec:\n", dump_json(resultExec) sys.stdout.flush() raise Exception("exec result (resultExec) missing what we expected. Look at json above. " + emsg) if (cols and (not num_rows or num_rows==0) ) and not allowEmptyResult: print "resultExec[0]:", dump_json(resultExec) raise Exception ("checkScalarResult says 'cols' exist in exec json response,"+\ " but num_rows: %s is 0 or None. Is that an expected 'empty' key state?" % num_rows+\ " Use 'allowEmptyResult if so.") # Cycle thru rows and extract all the meta-data into a dict? # assume "0" and "row" keys exist for each list entry in rows # FIX! the key for the value can be 0 or 1 or ?? (apparently col?) Should change H2O here # cols may not exist..if the result was just scalar? 
if not cols: # just return the scalar result then scalar = resultExec['scalar'] if scalar is None: raise Exception("both cols and scalar are null: %s %s" % (cols, scalar)) checkForBadFP(scalar, json=resultExec, nanOkay=nanOkay) return scalar metaDict = cols[0] for key,value in metaDict.items(): print "Inspect metaDict:", key, value min_value = metaDict['min'] stype = metaDict['type'] # if it's an enum col, it's okay for min to be NaN .. checkForBadFP(min_value, json=metaDict, nanOkay=nanOkay or stype=='Enum') return min_value
def checkForBadFP(value, name='min_value', nanOkay=False, infOkay=False, json=None):
    """Raise if `value` is an infinity or NaN that the caller did not allow.

    Both spellings are caught: the text forms 'Infinity' / 'NaN' anywhere in
    str(value), and real float inf / nan values (whose str() is 'inf' / 'nan'
    and therefore slipped past the text-only check).

    value:   the number (or its string form) to inspect
    name:    label used in the error message
    nanOkay: tolerate NaN
    infOkay: tolerate +/- infinity
    json:    optional object dumped with dump_json before raising, for debug

    Returns None when the value is acceptable; raises Exception otherwise.
    """
    import math

    text = str(value)
    isFloat = isinstance(value, float)
    hasInf = 'Infinity' in text or (isFloat and math.isinf(value))
    hasNan = 'NaN' in text or (isFloat and math.isnan(value))

    if hasInf and not infOkay:
        # if we passed the json, dump it for debug
        if json:
            print(dump_json(json))
        raise Exception("Infinity in inspected %s can't be good for: %s" % (str(value), name))

    if hasNan and not nanOkay:
        if json:
            print(dump_json(json))
        raise Exception("NaN in inspected %s can't be good for: %s" % (str(value), name))
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs): ''' Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out. ''' params_dict = {} # merge kwargs into params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False) start_time = time.time() pollCount = 0 while True: result = self.do_json_request('3/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict) # print 'Job: ', dump_json(result) if key: frames_result = self.frames(key=key) print 'frames_result for key:', key, dump_json(result) jobs = result['jobs'][0] description = jobs['description'] dest = jobs['dest'] dest_name = dest['name'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] print description, \ "dest_name:", dest_name, \ "\tprogress:", "%-10s" % progress, \ "\tstatus:", "%-12s" % status, \ "\tmsec:", msec if status=='DONE' or status=='CANCELLED' or status=='FAILED': h2o_sandbox.check_sandbox_for_errors() return result # what about 'CREATED' # FIX! what are the other legal polling statuses that we should check for? if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs): h2o_sandbox.check_sandbox_for_errors() emsg = "Job:", job_key, "timed out in:", timeoutSecs # for debug a = h2o.nodes[0].get_cloud() print "cloud.json:", dump_json(a) raise Exception(emsg) print emsg return None # check every other poll, for now if (pollCount % 2) == 0: h2o_sandbox.check_sandbox_for_errors() time.sleep(retryDelaySecs) pollCount += 1
def import_parse(node=None, schema='local', bucket=None, path=None, src_key=None, hex_key=None, timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True, importParentDir=True, **kwargs): # FIX! hack all put to local, since h2o-dev doesn't have put yet? # multi-machine put will fail as a result. # if schema=='put': # h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." + # "\nMeans multi-machine with 'put' will fail") # schema = 'local' if not node: node = h2o_nodes.nodes[0] (importResult, importPattern) = import_only(node, schema, bucket, path, timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs) verboseprint("importPattern:", importPattern) verboseprint("importResult", dump_json(importResult)) assert len(importResult['keys']) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path) # print "importResult:", importResult # get rid of parse timing in tests now start = time.time() parseResult = parse_only(node, importPattern, hex_key, importResult['keys'], timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging, noPoll, **kwargs) elapsed = time.time() - start print importPattern, "parsed in", elapsed, "seconds.", "%d pct. of timeout" % ((elapsed*100)/timeoutSecs), "\n" parseResult['python_elapsed'] = elapsed verboseprint("parseResult:", dump_json(parseResult)) # do SummaryPage here too, just to get some coverage # only if not noPoll. otherwise parse isn't done if doSummary and not noPoll: # if parse blows up, we want error isolation ..i.e. 
find stack traces here, rather than the next guy blowing up check_sandbox_for_errors() print "WARNING: not doing inspect/summary for now after parse" ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs) ## numRows = inspect['numRows'] ## numCols = inspect['numCols'] # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma) ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols) # for now, don't worry about error isolating summary else: # isolate a parse from the next thing check_sandbox_for_errors() return parseResult
def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False): if not node: node = h2o_nodes.nodes[0] kwargs = {'ast': execExpr} start = time.time() resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) verboseprint('exec took', time.time() - start, 'seconds') print "exec:", dump_json(resultExec) # when do I get cols? # "result": "1.0351050710011848E-300", # "scalar": 1.0351050710011848e-300, # "funstr": null, # "key": null, # "col_names": null, # "num_cols": 0, # "num_rows": 0, # "exception": null, # echoing? # "string": null # "funs": null, # "ast": "(= !x (xorsum ([ $r1 \"null\" #0) $TRUE))", if 'cols' in resultExec and resultExec['cols']: # not null if 'funstr' in resultExec and resultExec['funstr']: # not null raise Exception("cols and funstr shouldn't both be in resultExec: %s" % dump_json(resultExec)) else: print "Frame return" # if test said to look at a resultKey, it's should be in h2o k/v store # inspect a result key? # Should we get the key name from the exec return? if resultKey is not None: kwargs = {'ast': resultKey} resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) print "exec key result:", dump_json(resultExec) # handles the 1x1 data frame result. Not really interesting if bigger than 1x1? result = resultExec['cols'][0]['min'] else: if 'funstr' in resultExec and resultExec['funstr']: # not null print "function return" result = resultExec['funstr'] else: print "scalar return" result = resultExec['scalar'] return resultExec, result
def cancelAllJobs(timeoutSecs=10, **kwargs): # I guess you could pass pattern # what if jobs had just been dispatched? wait until they get in the queue state correctly time.sleep(2) a = h2o_nodes.nodes[0].jobs(timeoutSecs=120) print "jobs():", dump_json(a) jobsList = a['jobs'] for j in jobsList: if j['end_time'] == '': b = h2o_nodes.nodes[0].jobs_cancel(key=j['key']) print "jobs_cancel():", dump_json(b) # it's possible we could be in a bad state where jobs don't cancel cleanly pollWaitJobs(timeoutSecs=timeoutSecs, **kwargs) # wait for all the cancels to happen. If we missed one, we might timeout here.
def test_simple2(self): # h2o-dev doesn't take ../.. type paths? make find_file return absolute path a_node = h2o.nodes[0] import_result = a_node.import_files(path=find_file("smalldata/logreg/prostate.csv")) print dump_json(import_result) frames = a_node.frames(key=import_result['keys'][0], len=5)['frames'] print dump_json(frames) parse_result = a_node.parse(key=import_result['keys'][0]) hex_key = parse_result['frames'][0]['key']['name'] verboseprint(hex_key, ":", dump_json(parse_result))
def test_b_algo_parameters(self): # for algo in ['kmeans', 'gbm', 'deeplearning', 'glm', 'word2vec', 'example', 'quantile', 'grep']: for algo in ["kmeans", "gbm", "deeplearning", "drf", "glm", "gbm", "pca", "naivebayes"]: paramResult = h2o.n0.model_builders(algo=algo) self.print_params(paramResult) mmResult = h2o.n0.model_metrics(algo=algo) print "mmResult", dump_json(mmResult)
def test_rapids_ifelse_nested(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(2): for execObj, expected in zip(objList, resultList): freshObj = copy(execObj) result = freshObj.do() # do some scalar result checking if expected is not None: # result is a string now?? print "result:", result print "expected:", expected assert float(result)==expected, "%s %s" (result,expected) # rows might be zero! print "freshObj:", dump_json(freshObj.execResult) if 'key' in freshObj.execResult and freshObj.execResult['key']: keys.append(freshObj.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_parse_rand_utf8(self): SYNDATASETS_DIR = h2o.make_syn_dir() print "HACK: reduce rows to 10 for debug" tryList = [ # do two cols to detect bad eol behavior (10, 2, 'cA', 120), (10, 2, 'cG', 120), (10, 2, 'cH', 120), ] print "What about messages to log (INFO) about unmatched quotes (before eol)" # got this ..trying to avoid for now # Exception: rjson error in parse: Argument 'source_key' error: Parser setup appears to be broken, got AUTO for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEED=SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', checkHeader=0, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) print "parseResult:", dump_json(parseResult) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert len(missingList) == 0 # FIX! check type? # print "inspect:", h2o.dump_json(inspect) self.assertEqual(numRows, rowCount, msg='Wrong numRows: %s %s' % (numRows, rowCount)) self.assertEqual(numCols, colCount, msg='Wrong numCols: %s %s' % (numCols, colCount))
def showGBMGridResults(GBMResult, expectedErrorMax, classification=True): # print "GBMResult:", dump_json(GBMResult) jobs = GBMResult['jobs'] print "GBM jobs:", jobs for jobnum, j in enumerate(jobs): _distribution = j['_distribution'] model_key = j['destination_key'] job_key = j['job_key'] # inspect = h2o_cmd.runInspect(key=model_key) # print "jobnum:", jobnum, dump_json(inspect) gbmTrainView = h2o_cmd.runGBMView(model_key=model_key) print "jobnum:", jobnum, dump_json(gbmTrainView) if classification: cms = gbmTrainView['gbm_model']['cms'] cm = cms[-1]['_arr'] # take the last one print "GBM cms[-1]['_predErr']:", cms[-1]['_predErr'] print "GBM cms[-1]['_classErr']:", cms[-1]['_classErr'] pctWrongTrain = pp_cm_summary(cm); if pctWrongTrain > expectedErrorMax: raise Exception("Should have < %s error here. pctWrongTrain: %s" % (expectedErrorMax, pctWrongTrain)) errsLast = gbmTrainView['gbm_model']['errs'][-1] print "\nTrain", jobnum, job_key, "\n==========\n", "pctWrongTrain:", pctWrongTrain, "errsLast:", errsLast print "GBM 'errsLast'", errsLast print pp_cm(cm) else: print "\nTrain", jobnum, job_key, "\n==========\n", "errsLast:", errsLast print "GBMTrainView errs:", gbmTrainView['gbm_model']['errs']
def test_rapids_basic_with_funs_noinc(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for i in range(100): if i==0: # should never see v as a key from the function? execExpr1 = '(= !v1 (c {#0}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5) execExpr2 = '(= !v2 (cbind %v1 ))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5) else: # adding to v shouldn't hurt, but not required cause function output will update it # execExpr1 = '(= !v (+ %v #1))' # execExpr1 = '(+ %v #1)' # add to itself? execExpr1 = '(+ %v %v)' funs = '[(def anon {v} %s;;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True) # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))' # execExpr2 = '(= !v2 (anon %v2))' execExpr2 = '(= !v2 (+ %v2 #1))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=15) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr2) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # FIX! check if v is ever there. # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_xl_oobe(self):
    # Builds three DF keys, assigns 0 to each, then assigns an indexed sum
    # into c[1] and checks the 'ast' recorded in Xbase.lastExecResult against
    # the expected Rapids text. Statement order matters: lastExecResult is
    # read right after the Assign it is meant to describe.
    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1') # known_* key
    b = DF('b1')
    c = DF('c1')
    # look at our secret stash in the base class. Should see the DFInit?

    # a DF is a Key and an Xbase, but none of KeyIndexed / Fcn / Assign
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)

    Assign(a, 0)
    Assign(b, 0)
    Assign(c, 0)
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

    # still Keys after being assigned to
    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)

    print "Referring to non-existent rows causes a problem (AAIOBE)"
    Assign(c[1], (a[2] + b[2]))
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= ([ $c1 #1 #1) (+ ([ $a1 #2 #2) ([ $b1 #2 #2)))"
    assert ast==astExpected, "Actual: %s Expected: %s" % (ast, astExpected)

    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k

    h2o.check_sandbox_for_errors()
def compute_model_metrics(self, model, frame, timeoutSecs=60, **kwargs): """ Score a model on the h2o cluster on the given Frame and return only the model metrics. """ assert model is not None, '"model" parameter is null' assert frame is not None, '"frame" parameter is null' models = self.models(key=model, timeoutSecs=timeoutSecs) assert models is not None, "/Models REST call failed" assert ( models["models"][0]["model_id"]["name"] == model ), "/Models/{0} returned Model {1} rather than Model {2}".format(model, models["models"][0]["key"]["name"], model) # TODO: test this assert, I don't think this is working. . . frames = self.frames(key=frame) assert frames is not None, "/Frames/{0} REST call failed".format(frame) print "frames:", dump_json(frames) # is the name not there? # assert frames['frames'][0]['model_id']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, models['models'][0]['key']['name'], frame) result = self.do_json_request( "/3/ModelMetrics.json/models/" + model + "/frames/" + frame, cmd="post", timeout=timeoutSecs ) mm = result["model_metrics"][0] verboseprint("model metrics: " + repr(mm)) h2o_sandbox.check_sandbox_for_errors() return mm
def runInspect(node=None, key=None, timeoutSecs=30, verbose=False, **kwargs): if not key: raise Exception('No key for Inspect') if not node: node = h2o_nodes.nodes[0] a = node.inspect(key, timeoutSecs=timeoutSecs, **kwargs) if verbose: print "inspect of %s:" % key, dump_json(a) return a
def test_simple2(self):
    # Import the poker-hand data, read the expected fields of the imported
    # frame and each of its columns, then parse it.
    # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
    node = h2o.nodes[0]

    # alternative dataset: "smalldata/logreg/prostate.csv"
    import_result = node.import_files(path=find_file("smalldata/poker/poker-hand-testing.data"))
    # print dump_json(import_result)

    imported_key = import_result['keys'][0]
    frames_result = node.frames(key=imported_key)
    frame = frames_result['frames'][0]

    # The values are not used; the lookups fail if a field is absent.
    for field in ('byteSize', 'rows'):
        frame[field]
    for column in frame['columns']:
        for field in ('label', 'missing', 'type', 'zeros', 'domain'):
            column[field]
    # print dump_json(frame)

    # how do you parse multiple files
    parse_result = node.parse(key=imported_key)
    hex_key = parse_result['frames'][0]['key']['name']
    verboseprint(hex_key, ":", dump_json(parse_result))
def test_rapids_basic(self): bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'p' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for execExpr in exprList: r = re.match ('\(= \!([a-zA-Z0-9_]+) ', execExpr) resultKey = r.group(1) execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) if DO_ROLLUP: h2o_cmd.runInspect(key=resultKey) # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr) else: h2p.yellow_print("\nNo key created?\n", dump_json(execResult)) print "\nExpressions that created keys. Shouldn't all of these expressions create keys" for k in keys: print k h2o.check_sandbox_for_errors()
def test_simple2(self): # h2o-dev doesn't take ../.. type paths? make find_file return absolute path # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data") csvPathname = find_file("smalldata/logreg/prostate.csv") import_result = h2o.n0.import_files(path=csvPathname) # print dump_json(import_result) k = import_result['keys'][0] frames_result = h2o.n0.frames(key=k) frame = frames_result['frames'][0] rows = frame['rows'] columns = frame['columns'] for c in columns: label = c['label'] missing = c['missing_count'] stype = c['type'] domain = c['domain'] # print dump_json(frame) # let's see what ray's util does frames = h2o.n0.frames()['frames'] frames_dict = h2o_util.list_to_dict(frames, 'key/name') # print "frames:", dump_json(frames) # print "frames_dict:", dump_json(frames_dict) for k,v in frames_dict.items(): print "frames_dict key:", k # interesting. we can do dictionary comprehensions # { k:v for k,v in my_dict.items() if 'Peter' in k } # how do you parse multiple files parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS) frame = parse_result['frames'][0] hex_key = frame['key']['name'] colCount = 9 rowCount = 380 # colCount = 11 # rowCount = 1000000 start = time.time() inspect = h2o_cmd.runInspect(None, hex_key) print "Inspect:", hex_key, "took", time.time() - start, "seconds" numCols = len(inspect['frames'][0]['columns']) numRows = inspect['frames'][0]['rows'] print "\n" + csvPathname, \ " rows:", "{:,}".format(numRows), \ " len(columns):", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual(numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (numRows, rowCount)) verboseprint(hex_key, ":", dump_json(parse_result))
def test(n, tries=None, timeoutSecs=14.0): c = n.get_cloud(noExtraErrorCheck=noExtraErrorCheck, timeoutSecs=timeoutSecs) # FIX! unique to h2o-dev for now, because of the port reuse problems (TCP_WAIT) compared to h2o # flag them early rather than after timeout check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) # don't want to check everything. But this will check that the keys are returned! consensus = c["consensus"] locked = c["locked"] cloud_size = c["cloud_size"] cloud_name = c["cloud_name"] if "nodes" not in c: emsg = "\nH2O didn't include a list of nodes in get_cloud response after initial cloud build" raise Exception(emsg) # only print it when you get consensus if cloud_size != node_count: print "\nNodes in cloud while building:" for i, ci in enumerate(c["nodes"]): # 'h2o' disappeared? if "h2o" not in ci: print "ci:", dump_json(ci) # apparently this can happen in cases where I didn't join a cloud because # of a different md5 version. We'll eventually exception out? # raise Exception("What happened to the 'h2o' ci dict entry?, not there") else: print "node %s" % i, ci["h2o"] ### print "node %s" % i, ci['h2o']['node'] if cloud_size > node_count: emsg = ( "\n\nERROR: cloud_size: %d reported via json is bigger than we expect: %d" % (cloud_size, node_count) + "\nLikely have zombie(s) with the same cloud name on the network." + "\nLook at the cloud IP's in 'grep Paxos sandbox/*stdout*' for some IP's you didn't expect." + "\n\nYou probably don't have to do anything, as the cloud shutdown in this test should" + "\nhave sent a Shutdown.json to all in that cloud (you'll see a kill -2 in the *stdout*)." + "\nIf you try again, and it still fails, go to those IPs and kill the zombie h2o's." + "\nIf you think you really have an intermittent cloud build, report it." + "\n" + "\nbuilding cloud size of 2 with 127.0.0.1 may temporarily report 3 incorrectly," + "\nwith no zombie?" 
) for ci in c["nodes"]: emsg += "\n" + ci["h2o"]["node"] raise Exception(emsg) a = (cloud_size == node_count) and consensus if a: verboseprint("\tLocked won't happen until after keys are written") verboseprint("\nNodes in final cloud:") for ci in c["nodes"]: verboseprint("ci", ci) # this isn't in there all the time? # verboseprint(ci['h2o']['node']) return a
def import_files(self, path, timeoutSecs=180):
    """
    Import a file or files into h2o.

    `path` is sent as the 'path' parameter of a 3/ImportFiles.json request.
    Returns the JSON result; the sandbox is checked for errors first.
    """
    request_params = {"path": path}
    imported = self.do_json_request("3/ImportFiles.json", timeout=timeoutSecs, params=request_params)
    verboseprint("\nimport_files result:", dump_json(imported))
    h2o_sandbox.check_sandbox_for_errors()
    return imported
def runInspect(node=None, key=None, verbose=False, **kwargs): if not key: raise Exception("No key for Inspect") if not node: node = h2o_nodes.nodes[0] a = node.frames(key, **kwargs) if verbose: print "inspect of %s:" % key, dump_json(a) return a
def verify_cloud_size(nodeList=None, expectedCloudName=None, verbose=False, timeoutSecs=10, ignoreHealth=False): if not nodeList: nodeList = h2o_nodes.nodes expectedSize = len(nodeList) # cloud size and consensus have to reflect a single grab of information from a node. cloudStatus = [n.get_cloud(timeoutSecs=timeoutSecs) for n in nodeList] # get cloud_name from all cloudSizes = [c['cloud_size'] for c in cloudStatus] cloudConsensus = [c['consensus'] for c in cloudStatus] cloudHealthy = [c['cloud_healthy'] for c in cloudStatus] cloudName = [c['cloud_name'] for c in cloudStatus] if not all(cloudHealthy): msg = "Some node reported cloud_healthy not true: %s" % cloudHealthy if not ignoreHealth: raise Exception(msg) # gather up all the node_healthy status too for i, c in enumerate(cloudStatus): nodesHealthy = [n['node_healthy'] for n in c['nodes']] if not all(nodesHealthy): print "node %s cloud status: %s" % (i, dump_json(c)) msg = "node %s says some node is not reporting node_healthy: %s" % (c['node_name'], nodesHealthy) if not ignoreHealth: raise Exception(msg) if expectedSize == 0 or len(cloudSizes) == 0 or len(cloudConsensus) == 0: print "\nexpectedSize:", expectedSize print "cloudSizes:", cloudSizes print "cloudConsensus:", cloudConsensus raise Exception("Nothing in cloud. Can't verify size") for s in cloudSizes: consensusStr = (",".join(map(str, cloudConsensus))) sizeStr = (",".join(map(str, cloudSizes))) if (s != expectedSize): raise Exception("Inconsistent cloud size." + "nodeList report size: %s consensus: %s instead of %d." % \ (sizeStr, consensusStr, expectedSize)) # check that all cloud_names are right if expectedCloudName: for i, cn in enumerate(cloudName): if cn != expectedCloudName: # tear everyone down, in case of zombies. so we don't have to kill -9 manually print "node %s has the wrong cloud name: %s expectedCloudName: %s." 
% (i, cn, expectedCloudName) # print "node %s cloud status: %s" % (i, dump_json(cloudStatus[i])) print "tearing cloud down" tear_down_cloud(nodeList=nodeList, sandboxIgnoreErrors=False) raise Exception("node %s has the wrong cloud name: %s expectedCloudName: %s" % \ (i, cn, expectedCloudName)) return (sizeStr, consensusStr, expectedSize)
def typeahead(self, timeoutSecs=10, **kwargs):
    """Send a 3/Typeahead.json/files request and return the JSON result.

    Recognized kwargs are 'src' and 'limit'.
    """
    params_dict = dict.fromkeys(('src', 'limit'))
    check_params_update_kwargs(params_dict, kwargs, 'typeahead', print_params=True)

    # odd ...needs /files
    suggestions = self.do_json_request('3/Typeahead.json/files', params=params_dict, timeout=timeoutSecs)
    verboseprint("\ntypeahead result:", dump_json(suggestions))
    return suggestions
def test_xl_seq_A(self):
    # Builds three DF keys and runs a sequence of Assign calls (scalar, list,
    # tuple, indexed sums), calling checkAst with the expected Rapids text
    # after each one. Statement order matters: each checkAst follows the
    # Assign it refers to.
    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1') # known_* key
    b = DF('b1')
    c = DF('c1')
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)
    # look at our secret stash in the base class. Should see the DFInit?

    # DF does a kv store init. Key doesn't
    # DF inherits from Key. KeyIndexed inherits from Key
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)

    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)

    # scalar right-hand sides
    Assign(a, 0)
    checkAst("(= !a1 #0)")
    Assign(b, 0)
    checkAst("(= !b1 #0)")
    Assign(c, 0)
    checkAst("(= !c1 #0)")

    # list right-hand sides
    Assign(a, [0])
    checkAst("(= !a1 (c {#0}))")
    Assign(b, [0,1])
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, [0,1,2])
    checkAst("(= !c1 (c {#0;#1;#2}))")

    # tuple right-hand sides; the expected text matches the list forms above
    Assign(a, (0,)) # make sure it's a tuple with comma
    checkAst("(= !a1 (c {#0}))")
    Assign(b, (0,1))
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, (0,1,2))
    checkAst("(= !c1 (c {#0;#1;#2}))")

    # indexed operands, first into a whole key and then into an indexed target
    Assign(c, a[0] + b[1])
    checkAst("(= !c1 (+ ([ $a1 #0 #0) ([ $b1 #1 #0)))")

    Assign(c[0], (a[0] + b[1]))
    checkAst("(= ([ $c1 #0 #0) (+ ([ $a1 #0 #0) ([ $b1 #1 #0)))")

    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k

    h2o.check_sandbox_for_errors()
def test_rapids_ddply_with_funs(self): if 1==0: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' else: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # get rid of the enum response cole execExpr2 = '(= !r2 ([ $r1 "null" {#0;#1;#2;#3}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=15) keys = [] for execExpr1 in initList: # ddply function can only return one row. Just use expressions above as nose # some of the expressions above use $v, but v won't be created as key outside any more with ddply funs = '[(def anon {v} %s;;(sum $v $TRUE);;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, doFuns=True, resultKey=None, timeoutSecs=5) execExpr2 = '(h2o.ddply $r2 {#2;#3} $anon)' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, doFuns=False, resultKey=None, timeoutSecs=120) # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr1) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def jobs_admin(self, timeoutSecs=120, **kwargs):
    """Request 'Jobs.json' with the caller's kwargs as parameters.

    There is no fixed parameter template (an 'expression' entry is listed
    only as a commented-out placeholder), so kwargs are forwarded unchecked.
    Returns whatever do_json_request returns.
    """
    request_params = dict(kwargs)
    verboseprint("\njobs_admin:", request_params)

    reply = self.do_json_request(
        'Jobs.json',
        timeout=timeoutSecs,
        params=request_params)
    verboseprint("\njobs_admin result:", dump_json(reply))
    # print "WARNING: faking jobs admin"
    # a = { 'jobs': {} }
    return reply
def import_files(self, path, timeoutSecs=180):
    '''
    Ask h2o to import whatever is found at 'path' and return the decoded reply.

    The request goes to '2/ImportFiles.json' with 'path' as its only
    parameter; per the earlier note on this method, 'path' may name a
    directory or a single file.
    '''
    query = {"path": path}
    reply = self.do_json_request('2/ImportFiles.json', timeout=timeoutSecs, params=query)
    verboseprint("\nimport_files result:", dump_json(reply))
    return reply
def test_rapids_basic_with_funs_inc(self): bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for i in range(2): if i==0: # should never see v as a key from the function? execExpr1 = '(= !v1 (c {#0;#0}))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr1, resultKey='v1', timeoutSecs=5) execExpr2 = '(= !v2 (cbind %v1 %v1 ))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey='v2', timeoutSecs=5) else: # adding to v shouldn't hurt, but not required cause function output will update it # execExpr1 = '(= !v (+ %v #1))' # execExpr1 = '(+ %v #1)' # add to itself? execExpr1 = '(+ %v %v)' funs = '[(def anon { v } %s;;;)]' % execExpr1 execResult, result = h2e.exec_expr(h2o.nodes[0], funs, resultKey=None, timeoutSecs=5, doFuns=True) # execExpr2 = '(= !v2 (anon ([ %v2 "null" #0)))' # execExpr2 = '(= !v2 (anon %v2))' execExpr2 = '(= !v1 (anon %v1))' execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr2, resultKey=None, timeoutSecs=5) print "result:", result # see if the execExpr had a lhs assign. If so, it better be in the storeview r = re.search('![a-zA-Z0-9]+', execExpr2) if r: lhs = r.group(0)[1:] print "Found key lhs assign", lhs # FIX! check if v is ever there. # KeyIndexeds gets too many rollup stats problems. Don't use for now if 1==0: inspect = h2o_cmd.runInspect(key=lhs) missingList, labelList, numRows, numCols = infoFromInspect(inspect) storeview = h2o_cmd.runStoreView() print "\nstoreview:", dump_json(storeview) if not k in storeView['keys']: raise Exception("Expected to find %s in %s", (k, storeView['keys'])) else: print "No key lhs assign" # rows might be zero! 
if execResult['num_rows'] or execResult['num_cols']: keys.append(execExpr2) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_DL_covtype(self): h2o.nodes[0].remove_all_keys() csvPathname_train = 'standard/covtype.data' csvPathname_test = 'standard/covtype.data' hex_key = 'covtype.hex' validation_key = 'covtype_v.hex' timeoutSecs = 60 parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_train, hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False, columnTypeDict={54: 'Enum'}) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList parseResultV = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname_test, hex_key=validation_key, timeoutSecs=timeoutSecs, doSummary=False) pV = h2o_cmd.ParseObj(parseResult) iV = h2o_cmd.InspectObj(pV.parse_key) parse_keyV = pV.parse_key numRowsV = iV.numRows numColsV = iV.numCols labelListV = iV.labelList response = numCols - 1 #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'deeplearning_' + identifier + '.hex' parameters = { # loss enum True None [u'MeanSquare', u'CrossEntropy'] 'loss': 'CrossEntropy', 'validation_frame': validation_key, # KeyIndexed None 'ignored_columns': None, # string[] None 'response_column': labelList[response], # string None 'balance_classes': None, # boolean false 'max_after_balance_size': None, # float Infinity 'keep_cross_validation_splits': None, # boolean false 'checkpoint': None, # Key None 'overwrite_with_best_model': None, # boolean true 'expert_mode': None, # boolean false 'autoencoder': None, # boolean false # 'use_all_factor_levels': None, # boolean true # [u'Tanh', u'TanhWithDropout', u'Rectifier', u'RectifierWithDropout', u'Maxout', u'MaxoutWithDropout'] 'activation': 'Tanh', # enum Rectifier 'hidden': '[100,100,100]', # int[] [200, 200] 'epochs': 0.7, # double 10.0 'train_samples_per_iteration': 100000, # long -2 'target_ratio_comm_to_comp': None, # double 0.02 'seed': None, # long 1679194146842485659 
'adaptive_rate': True, # boolean true 'rho': None, # double 0.99 'epsilon': None, # double 1.0E-8 'rate': None, # double 0.005 'rate_annealing': None, # double 1.0E-6 'rate_decay': None, # double 1.0 'momentum_start': None, # double 0.0 'momentum_ramp': None, # double 1000000.0 'momentum_stable': None, # double 0.0 'nesterov_accelerated_gradient': None, # boolean true 'input_dropout_ratio': 0.0, # double 0.0 'hidden_dropout_ratios': None, # double[] None 'l1': 1e-5, # double 0.0 'l2': 1e-7, # double 0.0 'max_w2': None, # float Infinity 'initial_weight_distribution': None, # enum UniformAdaptive [u'UniformAdaptive', u'Uniform', u'Normal'] 'initial_weight_scale': None, # double 1.0 'loss': 'CrossEntropy', # enum MeanSquare [u'Automatic', u'MeanSquare', u'CrossEntropy'] 'score_interval': None, # double 5.0 'score_training_samples': None, # long 10000 'score_validation_samples': None, # long 0 'score_duty_cycle': None, # double 0.1 'classification_stop': None, # double 0.0 'regression_stop': None, # double 1.0E-6 'quiet_mode': None, # boolean false 'max_confusion_matrix_size': None, # int 20 'max_hit_ratio_k': None, # int 10 'balance_classes': None, # boolean false 'class_sampling_factors': None, # float[] None 'max_after_balance_size': None, # float Infinity 'score_validation_sampling': None, # enum Uniform [u'Uniform', u'Stratified'] 'diagnostics': None, # boolean true 'variable_importances': None, # boolean false 'fast_mode': None, # boolean true 'ignore_const_cols': None, # boolean true 'force_load_balance': None, # boolean true 'replicate_training_data': None, # boolean false 'single_node_mode': None, # boolean false 'shuffle_training_data': None, # boolean false 'missing_values_handling': None, # enum MeanImputation [u'Skip', u'MeanImputation'] 'sparse': None, # boolean false 'col_major': None, # boolean false 'average_activation': None, # double 0.0 'sparsity_beta': None, # double 0.0 } expectedErr = 0.20 ## expected validation error for the above model relTol = 
0.20 ## 15% rel. error tolerance due to Hogwild! timeoutSecs = 300 start = time.time() bmResult = h2o.n0.build_model(algo='deeplearning', model_id=model_key, training_frame=hex_key, parameters=parameters, timeoutSecs=timeoutSecs) bm = OutputObj(bmResult, 'bm') print 'deep learning took', time.time() - start, 'seconds' modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') # print "model:", dump_json(model) cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=validation_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=validation_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=validation_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') ## h2o_cmd.runStoreView() # FIX! should be the scored error print "model", dump_json(model) actualErr = model['validation_metrics']['MSE'] print "expected error: " + format(expectedErr) print "actual error: " + format(actualErr) if actualErr != expectedErr and abs( (expectedErr - actualErr) / expectedErr) > relTol: raise Exception( "error of %s is not within %s %% relative error of %s" % (actualErr, float(relTol) * 100, expectedErr))
def oldSimpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False,
    prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, **kwargs):
    """Sanity-check an (old-style) GLM json response and print a summary.

    Reads glm['glm_model'], picks the submodel at 'best_lambda_idx', prints
    deviances (plus auc/threshold/confusion-matrix info for family
    "binomial", AIC for "poisson"/"gaussian"), rebuilds a name->beta
    dictionary from 'idxs' and the beta vector, and asserts that the
    intercept and coefficients are not (all) effectively zero.

    Parameters:
        self: object providing assertEqual/assertGreater (a TestCase-like
            object, judging by the method names -- confirm at call sites).
        glm: decoded GLM response; must contain 'glm_model'.
        colX: if not None (and allowZeroCoeff is false), the coefficient
            named str(colX) must have abs value > 1e-26.
        allowFailWarning: when true, warnings matching "failed" don't raise.
        allowZeroCoeff: when true, skips the colX and sum-of-coefficients checks.
        prettyPrint: print one coefficient per line.
        noPrint: suppresses the condensed intercept/coefficient print
            (only consulted when prettyPrint is false).
        maxExpectedIterations: if not None, upper bound on the submodel's
            'iteration' value.
        doNormalized: use 'norm_beta' instead of 'beta'.
        kwargs: must contain 'response'; 'n_folds' is defaulted to None in
            place; the whole dict is dumped into assertion messages.

    Returns:
        (warnings, cList, intercept) where warnings is the model's warning
        list or None, cList holds one value (or None) per remaining entry of
        coefficients_names, and intercept is the last beta value.

    Raises:
        Exception on a missing model, a non-convergence "failed" warning,
        an out-of-range best_lambda_idx, too many iterations, a missing
        'validation' entry, or a bad intercept mapping.
        KeyError if kwargs has no 'response'.

    Side effects: prints extensively; replaces null_deviance and
    residual_deviance in the validation dict with cleansed values; pops the
    last element of GLMModel['coefficients_names'] (the list inside glm).
    """
    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    # h2o GLM will verboseprint the result and print errors.
    # so don't have to do that
    # different when cross validation is used? No trainingErrorDetails?
    GLMModel = glm['glm_model']
    if not GLMModel:
        raise Exception("GLMModel didn't exist in the glm response? %s" % dump_json(glm))

    warnings = None
    if 'warnings' in GLMModel and GLMModel['warnings']:
        warnings = GLMModel['warnings']
        # stop on failed
        x = re.compile("failed", re.IGNORECASE)
        # don't stop if fail to converge
        c = re.compile("converge", re.IGNORECASE)
        for w in warnings:
            print "\nwarning:", w
            if re.search(x, w) and not allowFailWarning:
                if re.search(c, w):
                    # ignore the fail to converge warning now
                    pass
                else:
                    # stop on other 'fail' warnings (are there any? fail to solve?
                    raise Exception(w)

    # for key, value in glm.iteritems(): print key
    # not in GLMGrid?

    # FIX! don't get GLMParams if it can't solve?
    GLMParams = GLMModel['glm']
    family = GLMParams["family"]

    # number of submodels = number of lambda
    # min of 2. lambda_max is first
    submodels = GLMModel['submodels']
    # since all our tests?? only use one lambda, the best_lamda_idx should = 1
    best_lambda_idx = GLMModel['best_lambda_idx']
    print "best_lambda_idx:", best_lambda_idx
    lambda_max = GLMModel['lambda_max']
    print "lambda_max:", lambda_max

    # currently lambda_max is not set by tomas. ..i.e.not valid
    # ('1 == 0 and ...' short-circuits, so this check never runs)
    if 1 == 0 and (lambda_max <= submodels[best_lambda_idx].lambda_value):
        raise Exception(
            "lambda_max %s should always be > the lambda result %s we're checking"
            % (lambda_max, submodels[best_lambda_idx].lambda_value))

    # submodels0 = submodels[0]
    # submodels1 = submodels[-1] # hackery to make it work when there's just one

    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception(
            "best_lambda_idx: %s should point to one of lambdas (which has len %s)"
            % (best_lambda_idx, len(submodels)))

    # same condition as above; cannot fire once the first check has passed
    if (best_lambda_idx >= len(submodels)) or (best_lambda_idx < 0):
        raise Exception(
            "best_lambda_idx: %s should point to one of submodels (which has len %s)"
            % (best_lambda_idx, len(submodels)))

    submodels1 = submodels[
        best_lambda_idx]  # hackery to make it work when there's just one
    iterations = submodels1['iteration']

    print "GLMModel/iterations:", iterations

    # if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
    if maxExpectedIterations is not None and iterations > maxExpectedIterations:
        raise Exception(
            "Convergence issue? GLM did iterations: %d which is greater than expected: %d"
            % (iterations, maxExpectedIterations))

    if 'validation' not in submodels1:
        raise Exception("Should be a 'validation' key in submodels1: %s" % dump_json(submodels1))
    validationsList = submodels1['validation']
    validations = validationsList

    # xval. compare what we asked for and what we got.
    # (n_folds is not read again; setdefault does add the key to kwargs)
    n_folds = kwargs.setdefault('n_folds', None)

    print "GLMModel/validations"
    validations['null_deviance'] = h2o_util.cleanseInfNan(
        validations['null_deviance'])
    validations['residual_deviance'] = h2o_util.cleanseInfNan(
        validations['residual_deviance'])
    print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
    print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

    # threshold only there if binomial?
    # auc only for binomial
    if family == "binomial":
        print "%15s %s" % ("auc:\t", validations['auc'])
        best_threshold = validations['best_threshold']
        thresholds = validations['thresholds']
        print "%15s %s" % ("best_threshold:\t", best_threshold)

        # have to look up the index for the cm, from the thresholds list
        best_index = None

        # first threshold that is >= best_threshold wins
        for i, t in enumerate(thresholds):
            if t >= best_threshold:  # ends up using next one if not present
                best_index = i
                break

        assert best_index != None, "%s %s" % (best_threshold, thresholds)
        print "Now printing the right 'best_threshold' %s from '_cms" % best_threshold

        # cm = glm['glm_model']['submodels'][0]['validation']['_cms'][-1]
        submodels = glm['glm_model']['submodels']
        # FIX! this isn't right if we have multiple lambdas? different submodels?
        cms = submodels[0]['validation']['_cms']
        self.assertEqual(
            len(thresholds),
            len(cms),
            msg="thresholds %s and cm %s should be lists of the same size. %s"
            % (len(thresholds), len(cms), thresholds))
        # FIX! best_threshold isn't necessarily in the list. jump out if >=
        assert best_index < len(cms), "%s %s" % (best_index, len(cms))
        # if we want 0.5..rounds to int
        # mid = len(cms)/2
        # cm = cms[mid]
        cm = cms[best_index]

        print "cm:", dump_json(cm['_arr'])
        predErr = cm['_predErr']
        classErr = cm['_classErr']
        # compare to predErr
        # pctWrong = h2o_gbm.pp_cm_summary(cm['_arr']);
        # FIX!
        pctWrong = 0
        print "predErr:", predErr
        print "calculated pctWrong from cm:", pctWrong
        print "classErr:", classErr

        # self.assertLess(pctWrong, 9,"Should see less than 9% error (class = 4)")

        print "\nTrain\n==========\n"
        # print h2o_gbm.pp_cm(cm['_arr'])

    if family == "poisson" or family == "gaussian":
        print "%15s %s" % ("AIC:\t", validations['AIC'])

    coefficients_names = GLMModel['coefficients_names']
    # print "coefficients_names:", coefficients_names
    idxs = submodels1['idxs']
    print "idxs:", idxs
    coefficients_names = coefficients_names

    # always check both normalized and normal coefficients
    norm_beta = submodels1['norm_beta']
    # if norm_beta and len(coefficients_names)!=len(norm_beta):
    #    print len(coefficients_names), len(norm_beta)
    #    raise Exception("coefficients_names and normalized_norm_beta from h2o json not same length. coefficients_names: %s normalized_norm_beta: %s" % (coefficients_names, norm_beta))
    #
    beta = submodels1['beta']
    # print "beta:", beta
    # if len(coefficients_names)!=len(beta):
    #    print len(coefficients_names), len(beta)
    #    raise Exception("coefficients_names and beta from h2o json not same length. coefficients_names: %s beta: %s" % (coefficients_names, beta))

    # test wants to use normalized?
    if doNormalized:
        beta_used = norm_beta
    else:
        beta_used = beta

    coefficients = {}
    # create a dictionary with name, beta (including intercept) just like v1
    # (the last beta is left out here; it is added as 'Intercept' below)
    for i, b in zip(idxs, beta_used[:-1]):
        name = coefficients_names[i]
        coefficients[name] = b

    print "len(idxs)", len(idxs), "len(beta_used)", len(beta_used)
    print "coefficients:", coefficients
    print "beta:", beta
    print "norm_beta:", norm_beta

    coefficients['Intercept'] = beta_used[-1]
    print "len(coefficients_names)", len(coefficients_names)
    print "len(idxs)", len(idxs)
    print "idxs[-1]", idxs[-1]
    print "intercept demapping info:", \
        "coefficients_names[-i]:", coefficients_names[-1], \
        "idxs[-1]:", idxs[-1], \
        "coefficients_names[idxs[-1]]:", coefficients_names[idxs[-1]], \
        "beta_used[-1]:", beta_used[-1], \
        "coefficients['Intercept']", coefficients['Intercept']

    # last one is intercept
    interceptName = coefficients_names[idxs[-1]]
    if interceptName != "Intercept" or abs(beta_used[-1]) < 1e-26:
        raise Exception("'Intercept' should be last in coefficients_names and beta %s %s %s" %\
            (idxs[-1], beta_used[-1], "-"+interceptName+"-"))

    # idxs has the order for non-zero coefficients, it's shorter than beta_used and coefficients_names
    # new 5/28/14. glm can point to zero coefficients
    # for i in idxs:
    #     if beta_used[i]==0.0:
    ##        raise Exception("idxs shouldn't point to any 0 coefficients i: %s %s:" % (i, beta_used[i]))
    if len(idxs) > len(beta_used):
        raise Exception("idxs shouldn't be longer than beta_used %s %s" % (len(idxs), len(beta_used)))
    intercept = coefficients.pop('Intercept', None)

    # intercept demapping info: idxs[-1]: 54 coefficients_names[[idxs[-1]]: Intercept beta_used[-1]: -6.6866753099
    # the last one shoudl be 'Intercept' ?
    # (this pop mutates the list object taken from GLMModel)
    coefficients_names.pop()

    # have to skip the output col! get it from kwargs
    # better always be there!
    y = kwargs['response']

    # the dict keys are column headers if they exist...how to order those? new: use the 'coefficients_names'
    # from the response
    # Tomas created 'coefficients_names which is the coefficient list in order.
    # Just use it to index coefficients! works for header or no-header cases
    # I guess now we won't print the "None" cases for dropped columns (constant columns!)
    # Because Tomas doesn't get everything in 'coefficients_names' if dropped by GLMQuery before
    # he gets it?
    def add_to_coefficient_list_and_string(c, cList, cString):
        # Appends the value for name c (or None if absent) to cList in place
        # and returns cString extended with the formatted "name: value" text.
        if c in coefficients:
            cValue = coefficients[c]
            cValueString = "%s: %.5e " % (c, cValue)
        else:
            print "Warning: didn't see '" + c + "' in json coefficient response.",\
                "Inserting 'None' with assumption it was dropped due to constant column)"
            cValue = None
            cValueString = "%s: %s " % (c, cValue)

        cList.append(cValue)
        # we put each on newline for easy comparison to R..otherwise keep condensed
        if prettyPrint:
            cValueString = "H2O coefficient " + cValueString + "\n"
        # not mutable?
        return cString + cValueString

    # creating both a string for printing and a list of values
    cString = ""
    cList = []
    # print in order using col_names
    # coefficients_names is input only now..same for header or no header, or expanded enums
    for c in coefficients_names:
        cString = add_to_coefficient_list_and_string(c, cList, cString)

    if prettyPrint:
        print "\nH2O intercept:\t\t%.5e" % intercept
        print cString
    else:
        if not noPrint:
            print "\nintercept:", intercept, cString

    print "\nTotal # of coefficients:", len(coefficients_names)

    # pick out the coefficent for the column we enabled for enhanced checking. Can be None.
    # FIX! temporary hack to deal with disappearing/renaming columns in GLM
    if (not allowZeroCoeff) and (colX is not None):
        absXCoeff = abs(float(coefficients[str(colX)]))
        # add kwargs to help debug without looking at console log
        self.assertGreater(
            absXCoeff, 1e-26,
            ("abs. value of GLM coefficients['" + str(colX) + "'] is " +
             str(absXCoeff) + ", not >= 1e-26 for X=" + str(colX) + "\n" +
             "kwargs:" + dump_json(kwargs)))

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coefficients['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept" +
                        "\n" + "kwargs:" + dump_json(kwargs)))

    # this is good if we just want min or max
    # maxCoeff = max(coefficients, key=coefficients.get)
    # for more, just invert the dictionary and ...
    if (len(coefficients) > 0):
        maxKey = max([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Largest abs. coefficient value:", maxKey, coefficients[
            maxKey]
        minKey = min([(abs(coefficients[x]), x) for x in coefficients])[1]
        print "H2O Smallest abs. coefficient value:", minKey, coefficients[
            minKey]
    else:
        print "Warning, no coefficients returned. Must be intercept only?"

    # many of the GLM tests aren't single column though.
    # quick and dirty check: if all the coefficients are zero,
    # something is broken
    # intercept is in there too, but this will get it okay
    # just sum the abs value up..look for greater than 0

    # skip this test if there is just one coefficient. Maybe pointing to a non-important coeff?
    if (not allowZeroCoeff) and (len(coefficients) > 1):
        s = 0.0
        for c in coefficients:
            v = coefficients[c]
            s += abs(float(v))

        self.assertGreater(
            s, 1e-26,
            ("sum of abs. value of GLM coefficients/intercept is " + str(s) +
             ", not >= 1e-26\n" + "kwargs:" + dump_json(kwargs)))

    print "submodels1, run_time (milliseconds):", submodels1['run_time']

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, cList, intercept)
def simpleCheckGLM(self, model, parameters, labelList, labelListUsed, allowFailWarning=False, allowZeroCoeff=False, prettyPrint=False, noPrint=False, maxExpectedIterations=None, doNormalized=False, allowNaN=False):
    """Check a GLM model object -- currently short-circuited.

    NOTE(review): the bare 'return' directly below exits before anything is
    checked, so as written this function always returns None and every
    statement after it is unreachable. Confirm this is still intended before
    relying on the checks.

    The unreachable remainder reads metrics off 'model', validates them with
    check_obj_has_good_numbers, converts the coefficient table to floats
    (None/"" become 0), requires the last coefficient to be a non-zero
    'Intercept', prints the coefficients, and would return
    (warnings, coeffs, intercept).
    """
    # FIX! the structure is all different
    return

    warnings = ''
    # binomial = model.binomial
    residual_deviance = model.training_metrics.residual_deviance

    threshold = model.training_metrics.threshold
    check_obj_has_good_numbers(threshold, 'threshold', allowNaN=allowNaN)

    auc = model.AUC
    # NaN if not logistic
    # check_obj_has_good_numbers(auc, 'model.AUC')

    best_lambda_idx = model.best_lambda_idx
    model_category = model.model_category
    name = model.name
    residual_degrees_of_freedom = model.residual_degrees_of_freedom

    # is this no longer used?
    coefficients_magnitude = model.coefficients_magnitude

    null_deviance = model.null_deviance
    check_obj_has_good_numbers(null_deviance, 'model.null_deviance', allowNaN=allowNaN)

    null_degrees_of_freedom = model.null_degrees_of_freedom
    check_obj_has_good_numbers(null_degrees_of_freedom, 'model.null_degrees_of_freedom', allowNaN=allowNaN)

    domains = model.domains

    # when is is this okay to be NaN?
    AIC = model.AIC
    check_obj_has_good_numbers(AIC, 'model.AIC', allowNaN=allowNaN)

    names = model.names

    # row 0 of the coefficients table holds the names, row 1 the values
    coeffs_names = model.coefficients_table.data[0]

    # these are returned as quoted strings. Turn them into numbers
    temp = model.coefficients_table.data[1]
    assert len(coeffs_names) == len(temp), "%s %s" % (len(coeffs_names), len(temp))

    # we need coefficients to be floats or empty
    check_obj_has_good_numbers(temp, 'model.coeffs', allowNaN=False)
    # print "temp", temp[0:10]
    # print "temp[5489:5500]", temp[5489:5500]

    # UPDATE: None (null json) is legal for coeffs
    coeffs = map(lambda x: float(x) if (x is not None and str(x) != "") else 0, temp)

    intercept = coeffs[-1]
    interceptName = coeffs_names[-1]
    assert interceptName == 'Intercept'

    assert len(coeffs) == len(coeffs_names), "%s %s" % (len(coeffs), len(coeffs_names))
    # FIX! if a coeff is zeroed/ignored, it doesn't show up?
    # get rid of intercept in glm response
    # assert (len(coeffs)-1) == len(labelListUsed, \
    #     "%s %s %s %s" % (len(coeffs), len(labelListUsed), coeffs, labelListUsed)

    # labelList still has the response column?
    # ignored columns aren't in model.names, but output response is.
    # labelListUsed has the response col removed so add 1
    # Hmm..dropped coefficients again? can't do this check?
    # assert len(model.names) == len(labelListUsed), \
    #     "%s %s %s %s" % (len(model.names), len(labelListUsed), model.names, labelList)

    # this is no longer true!
    # assert model.threshold!=0

    print "len(coeffs)", len(coeffs)
    print "coeffs:", coeffs

    # last one is intercept
    if interceptName != "Intercept" or abs(intercept) < 1e-26:
        raise Exception("'Intercept' should be last in coeffs_names %s %s" % (interceptName, intercept))

    y = parameters['response_column']

    cString = "\n"
    for i, c in enumerate(coeffs_names):
        cString += "%s: %.5e " % (coeffs_names[i], coeffs[i])

    print cString
    print "\nH2O intercept:\t\t%.5e" % intercept
    print "\nTotal # of coeffs:", len(coeffs_names)

    # intercept is buried in there too
    absIntercept = abs(float(intercept))
    self.assertGreater(absIntercept, 1e-26,
                       ("abs. value of GLM coeffs['Intercept'] is " +
                        str(absIntercept) + ", not >= 1e-26 for Intercept" +
                        "\n" + "parameters:" + dump_json(parameters)))

    if (not allowZeroCoeff) and (len(coeffs) > 1):
        s = 0.0
        for c in coeffs:
            s += abs(float(c))

        self.assertGreater(
            s, 1e-26,
            ("sum of abs. value of GLM coeffs/intercept is " + str(s) +
             ", not >= 1e-26\n" + "parameters:" + dump_json(parameters)))

    # shouldn't have any errors
    check_sandbox_for_errors()

    return (warnings, coeffs, intercept)
def test_xl_ast_assert_ZZ(self):
    # Checks the Rapids AST produced by the h2o_xl wrappers for arithmetic
    # and comparison expressions on an indexed key, in three forms:
    #   1. Assign(target, expr)
    #   2. the in-place 'target <<= expr' operator
    #   3. Expr(...).result / Assign(None, ...).result, whose value is
    #      asserted to equal 1.0
    # Each checkAst() refers to the operation immediately before it.

    #*****************************************
    a = DF('a1') # inits to -1
    checkAst(astForInit(a))
    # I suppose use of the h2o inspect request is deprecated
    # h2o_cmd.runInspect uses Frames?
    if 1 == 0:
        inspect = h2o.n0.inspect(
            key=a
        )  # str(a) becomes 'a1'. so this param should take type Key for key=
        print "a/a1:", dump_json(inspect)

    # let's use runSummary for fun..returns OutputObj for the col
    # will get from column 0, since column not specified
    summaryResult = h2o_cmd.runSummary(key=a)
    co = h2o_cmd.infoFromSummary(summaryResult)
    print "co.label:", co.label
    print "co.data:", co.data

    # how can we get a bunch of data?
    b = DF('b1') # inits to -1
    checkAst(astForInit(b))
    c = DF('c1') # inits to -1
    checkAst(astForInit(c))
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

    # form 1: explicit Assign; '==' maps to 'n' and '!=' to 'N' in the
    # expected AST strings below
    h2p.yellow_print("Assign compare1")
    Assign(c[0], c[0] + 0)
    checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare2")
    Assign(c[0], c[0] - 0)
    checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare3")
    Assign(c[0], c[0] == 0)
    checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare4")
    Assign(c[0], c[0] != 0)
    checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

    # h2o_xl.debugPrintEnable = True

    #*****************************************
    # form 2: '<<=' is expected to yield the same AST as the Assign form
    c = DF('c1')

    h2p.yellow_print("<<= compare1")
    c[0] <<= (c[0] + 0)
    checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

    h2p.yellow_print("<<= compare2")
    c[0] <<= (c[0] - 0)
    checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

    h2p.yellow_print("<<= compare3")
    c[0] <<= (c[0] == 0)
    checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

    #*****************************************
    # form 3a: bare expression
    c = DF('c1') # inits to -1
    h2p.yellow_print("compare1")
    # doesn't assign result to a key?, gets result if scalar, otherwise gets a list or ???
    # .result can give us scalar, list, Key, None

    # .result could be a property that triggers a csv download, if we didn't cache the scalar/list result because it was small?
    # i.e. check if .result_cached was None, when .result property is used (property to avoid the need for ()
    result = Expr(c[0] == -1).result
    checkAst("(n ([ %c1 #0 #0) #-1)")
    h2p.yellow_print(
        "Expr result..Desire: python datatype/value if scalar or list,.else Key: %s %s"
        % (type(result), result))
    assert result == 1.0, "%s %s" % (type(result), result)  # real result?

    if result:
        print "true for if of result", type(result), result
    else:
        print "else for if of result", type(result), result

    #*****************************************
    # form 3b: Assign with no explicit target
    # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
    result = Assign(None, c[0] == -1).result
    checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
    h2p.yellow_print(
        "Assign result..Desire: python datatype/value if scalar or list,.else Key: %s %s"
        % (type(result), result))
    assert result == 1.0, "%s %s" % (type(result), result)  # real result?

    if result:
        print "true if of result", result
    else:
        print "false if of result", result
def test_xl_ast_assert_X(self):
    # Exercises the h2o_xl wrappers: builds three DF keys (checking the AST
    # of each init), verifies their class relationships, then performs
    # scalar, unary-operator, list, tuple and indexed Assign() calls, each
    # followed by checkAst() with the Rapids AST string expected for it.
    # Statement order matters: every checkAst() refers to the Assign() right
    # before it.

    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1')
    checkAst(astForInit(a))
    b = DF('b1')
    checkAst(astForInit(b))
    c = DF('c1')
    checkAst(astForInit(c))
    # look at our secret stash in the base class. Should see the DFInit?
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

    # DF does a kv store init. Key doesn't
    # DF inherits from Key. KeyIndexed inherits from Key
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)
    assert isinstance(a, Key)
    assert isinstance(b, Key)
    assert isinstance(c, Key)

    # scalar assignment
    Assign(a, 2)
    checkAst("(= !a1 #2)")
    Assign(b, 2)
    checkAst("(= !b1 #2)")
    Assign(c, 2)
    checkAst("(= !c1 #2)")

    # unary operators: '~' is expected as (^ %c1 #1), '-' as (_ %c1)
    # - doesn't exist? multiply by -1?
    Assign(c, ~c)
    checkAst("(= !c1 (^ %c1 #1))") # not right if more than 1 col?
    Assign(c, -c)
    checkAst("(= !c1 (_ %c1))")
    Assign(c, abs(c))
    checkAst("(= !c1 (abs %c1))")

    # this needs to be an h2o int? because it expects int return
    # Assign(c, int(c))
    # checkAst("(= !c1 (trunc c1 ))")

    # list assignment: expected AST wraps the values in (c {...})
    Assign(a, [0])
    checkAst("(= !a1 (c {#0}))")
    Assign(b, [0, 1])
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, [0, 1, 2])
    checkAst("(= !c1 (c {#0;#1;#2}))")

    # tuple assignment: same expected AST as the list form
    Assign(a, (0, )) # make sure it's a tuple with comma
    checkAst("(= !a1 (c {#0}))")
    Assign(b, (0, 1))
    checkAst("(= !b1 (c {#0;#1}))")
    Assign(c, (0, 1, 2))
    checkAst("(= !c1 (c {#0;#1;#2}))")

    # indexed operands on the right, then an indexed target on the left
    Assign(c, a[0] + b[1])
    checkAst("(= !c1 (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

    Assign(c[0], (a[0] + b[1]))
    checkAst("(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #1 #0)))")

    # print "\nDoes the keyWriteHistoryList work?"
    # dump every entry recorded in Xbase.keyWriteHistoryList
    for k in Xbase.keyWriteHistoryList:
        print k

    h2o.check_sandbox_for_errors()
def simpleCheckRFView(node=None, rfv=None, checkScoringOnly=False, noPrint=False, **kwargs): if not node: node = h2o_nodes.nodes[0] if 'warnings' in rfv: warnings = rfv['warnings'] # catch the 'Failed to converge" for now for w in warnings: if not noPrint: print "\nwarning:", w if ('Failed' in w) or ('failed' in w): raise Exception(w) #**************************** # if we are checking after confusion_matrix for predict, the jsonschema is different if 'cm' in rfv: cm = rfv['cm'] # only one else: if 'drf_model' in rfv: rf_model = rfv['drf_model'] elif 'speedrf_model' in rfv: rf_model = rfv['speedrf_model'] elif 'rf_model' in rfv: rf_model = rfv['rf_model'] else: raise Exception("no rf_model in rfv? %s" % dump_json(rfv)) cms = rf_model['cms'] print "number of cms:", len(cms) print "FIX! need to add reporting of h2o's _perr per class error" # FIX! what if regression. is rf only classification? print "cms[-1]['_arr']:", cms[-1]['_arr'] print "cms[-1]['_predErr']:", cms[-1]['_predErr'] print "cms[-1]['_classErr']:", cms[-1]['_classErr'] ## print "cms[-1]:", dump_json(cms[-1]) ## for i,c in enumerate(cms): ## print "cm %s: %s" % (i, c['_arr']) cm = cms[-1]['_arr'] # take the last one scoresList = cm if not checkScoringOnly: used_trees = rf_model['N'] errs = rf_model['errs'] print "errs[0]:", errs[0] print "errs[-1]:", errs[-1] print "errs:", errs # if we got the ntree for comparison. Not always there in kwargs though! param_ntrees = kwargs.get('ntrees', None) if (param_ntrees is not None and used_trees != param_ntrees): raise Exception("used_trees should == param_ntree. 
used_trees: %s" % used_trees) if (used_trees+1)!=len(cms) or (used_trees+1)!=len(errs): raise Exception("len(cms): %s and len(errs): %s should be one more than N %s trees" % (len(cms), len(errs), used_trees)) #**************************** totalScores = 0 totalRight = 0 # individual scores can be all 0 if nothing for that output class # due to sampling classErrorPctList = [] predictedClassDict = {} # may be missing some? so need a dict? for classIndex,s in enumerate(scoresList): classSum = sum(s) if classSum == 0 : # why would the number of scores for a class be 0? does RF CM have entries for non-existent classes # in a range??..in any case, tolerate. (it shows up in test.py on poker100) if not noPrint: print "class:", classIndex, "classSum", classSum, "<- why 0?" else: # H2O should really give me this since it's in the browser, but it doesn't classRightPct = ((s[classIndex] + 0.0)/classSum) * 100 totalRight += s[classIndex] classErrorPct = round(100 - classRightPct, 2) classErrorPctList.append(classErrorPct) ### print "s:", s, "classIndex:", classIndex if not noPrint: print "class:", classIndex, "classSum", classSum, "classErrorPct:", "%4.2f" % classErrorPct # gather info for prediction summary for pIndex,p in enumerate(s): if pIndex not in predictedClassDict: predictedClassDict[pIndex] = p else: predictedClassDict[pIndex] += p totalScores += classSum #**************************** if not noPrint: print "Predicted summary:" # FIX! Not sure why we weren't working with a list..hack with dict for now for predictedClass,p in predictedClassDict.items(): print str(predictedClass)+":", p # this should equal the num rows in the dataset if full scoring? 
(minus any NAs) print "totalScores:", totalScores print "totalRight:", totalRight if totalScores != 0: pctRight = 100.0 * totalRight/totalScores else: pctRight = 0.0 pctWrong = 100 - pctRight print "pctRight:", "%5.2f" % pctRight print "pctWrong:", "%5.2f" % pctWrong if checkScoringOnly: check_sandbox_for_errors() classification_error = pctWrong return (round(classification_error,2), classErrorPctList, totalScores) # it's legal to get 0's for oobe error # if sample_rate = 1 sample_rate = kwargs.get('sample_rate', None) validation = kwargs.get('validation', None) print "kevin:", sample_rate, validation if (sample_rate==1 and not validation): pass elif (totalScores<=0 or totalScores>5e9): raise Exception("scores in RFView seems wrong. scores:", scoresList) varimp = rf_model['varimp'] if 'importance' in kwargs and kwargs['importance']: max_var = varimp['max_var'] variables = varimp['variables'] varimpSD = varimp['varimpSD'] varimp2 = varimp['varimp'] # what is max_var? it's 100 while the length of the others is 54 for covtype if not max_var: raise Exception("varimp.max_var is None? %s" % max_var) # if not variables: # raise Exception("varimp.variables is None? %s" % variables) if not varimpSD: raise Exception("varimp.varimpSD is None? %s" % varimpSD) if not varimp2: raise Exception("varimp.varimp is None? %s" % varimp2) # check that they all have the same length and that the importance is not all zero # if len(varimpSD)!=max_var or len(varimp2)!=max_var or len(variables)!=max_var: # raise Exception("varimp lists seem to be wrong length: %s %s %s" % \ # (max_var, len(varimpSD), len(varimp2), len(variables))) # not checking maxvar or variables. 
Don't know what they should be if len(varimpSD) != len(varimp2): raise Exception("varimp lists seem to be wrong length: %s %s" % \ (len(varimpSD), len(varimp2))) h2o_util.assertApproxEqual(sum(varimp2), 0.0, tol=1e-5, msg="Shouldn't have all 0's in varimp %s" % varimp2) treeStats = rf_model['treeStats'] if not treeStats: raise Exception("treeStats not right?: %s" % dump_json(treeStats)) # print "json:", dump_json(rfv) data_key = rf_model['_dataKey'] model_key = rf_model['_key'] classification_error = pctWrong if not noPrint: if 'minLeaves' not in treeStats or not treeStats['minLeaves']: raise Exception("treeStats seems to be missing minLeaves %s" % dump_json(treeStats)) print """ Leaves: {0} / {1} / {2} Depth: {3} / {4} / {5} Err: {6:0.2f} % """.format( treeStats['minLeaves'], treeStats['meanLeaves'], treeStats['maxLeaves'], treeStats['minDepth'], treeStats['meanDepth'], treeStats['maxDepth'], classification_error, ) ### modelInspect = node.inspect(model_key) dataInspect = h2o_cmd.runInspect(key=data_key) check_sandbox_for_errors() return (round(classification_error,2), classErrorPctList, totalScores)
def log_view(self, timeoutSecs=10, **kwargs):
    '''
    Request LogView.json from this node and return what do_json_request gives back.
    timeoutSecs is passed through as the request timeout. kwargs is accepted but not used.
    '''
    log_view_result = self.do_json_request('LogView.json', timeout=timeoutSecs)
    verboseprint("\nlog_view result:", dump_json(log_view_result))
    return log_view_result
def parse(self, key, hex_key=None, columnTypeDict=None, timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs): ''' Parse an imported raw file or files into a Frame. ''' # these should override what parse setup gets below params_dict = { 'source_keys': None, 'destination_key': hex_key, 'parse_type': None, # file type 'separator': None, 'single_quotes': None, 'check_header': None, # forces first line to be seen as column names 'number_columns': None, 'column_names': None, # a list 'column_types': None, # a list. or can use columnTypeDict param (see below) 'na_strings' : None, # a list 'chunk_size': None, # are these two no longer supported? 'delete_on_done': None, 'blocking': None, } # if key is a list, create a comma separated string # list or tuple but not string if not isinstance(key, basestring): # it's a list of some kind (tuple ok?) # if len(key) > 1: # print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key # len 1 is ok here. 0 not. what if None or [None] here if not key: raise Exception("key seems to be bad in parse. Should be list or string. %s" % key) # have to put double quotes around the individual list items (single not legal) source_keys = "[" + ",".join(map((lambda x: '"' + x + '"'), key)) + "]" else: # what if None here source_keys = '["' + key + '"]' # quotes required on key params_dict['source_keys'] = source_keys # merge kwargs into params_dict # =None overwrites params_dict # columnTypeDict not used here h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False) # Call ParseSetup?source_keys=[keys] . . . 
# if benchmarkLogging: # cloudPerfH2O.get_log_save(initOnly=True) # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True) params_setup = {'source_keys': source_keys} setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup) h2o_sandbox.check_sandbox_for_errors() verboseprint("ParseSetup result:", dump_json(setup_result)) # this should match what we gave as input? if setup_result['source_keys']: # should these be quoted? source_keysStr = "[" + ",".join([('"%s"' % src['name']) for src in setup_result['source_keys'] ]) + "]" else: source_keysStr = None # I suppose we need a way for parameters to parse() to override these # should it be an array or a dict? if setup_result['column_names']: # single quotes not legal..need double quotes columnNamesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), setup_result['column_names'])) + "]" else: columnNamesStr = None columnTypes = setup_result['column_types'] assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes) if setup_result['na_strings']: # single quotes not legal..need double quotes naStrings = "[" + ",".join(map((lambda x: '"' + x + '"' if x != None else '""'), setup_result['na_strings'])) + "]" else: naStrings = None # dict parameter to update columnTypeDict? # but we don't pass columnNames like this? ct = setup_result['column_types'] if columnTypeDict: for k,v in columnTypeDict.iteritems(): if isinstance(k, int): # if a column index if k>=0 and k<len(ct): ct[k] = v else: raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict)) # if a column name elif isinstance(k, basestring): # find the index if k not in columnNames: raise Exception("bad col name %s in columnTypeDict param %s. 
columnNames: %s" % (k, columnTypeDict, columnNames)) ci = columnNames.index(k) ct[ci] = v else: raise Exception("%s %s should be int or string" % (k, type(k))) columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]" parse_params = { 'source_keys': source_keysStr, 'destination_key': setup_result['destination_key'], 'parse_type': setup_result['parse_type'], 'separator': setup_result['separator'], 'single_quotes': setup_result['single_quotes'], 'check_header': setup_result['check_header'], 'number_columns': setup_result['number_columns'], 'column_names': columnNamesStr, 'column_types': columnTypesStr, 'na_strings': naStrings, 'chunk_size': setup_result['chunk_size'], # No longer supported? how come these aren't in setup_result? 'delete_on_done': params_dict['delete_on_done'], 'blocking': params_dict['blocking'], } # HACK: if there are too many column names..don't print! it is crazy output # just check the output of parse setup. Don't worry about columnNames passed as params here. tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000 if tooManyColNamesToPrint: h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") h2p.yellow_print("See sandbox/commands.log") # merge params_dict into parse_params # don't want =None to overwrite parse_params h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', print_params=not tooManyColNamesToPrint, ignoreNone=True) print "parse source_keys is length:", len(parse_params['source_keys']) # This can be null now? parseSetup doesn't return default colnames? # print "parse column_names is length:", len(parse_params['column_names']) # none of the kwargs passed to here! 
parse_result = self.do_json_request( jsonRequest="3/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs) verboseprint("Parse result:", dump_json(parse_result)) job_key = parse_result['job']['key']['name'] hex_key = parse_params['destination_key'] # TODO: dislike having different shapes for noPoll and poll if noPoll: # ?? h2o_sandbox.check_sandbox_for_errors() # return self.jobs(job_key) return parse_result # does Frame also, while polling if intermediateResults: key = hex_key else: key = None job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] dest_key = dest['name'] # can condition this with a parameter if some FAILED are expected by tests. if status=='FAILED': print dump_json(job_result) raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \ (status, progress, msec, dest_key, description)) return self.frames(dest_key) else: # ? we should always get a job_json result raise Exception("parse didn't get a job_result when it expected one")
def test_simple2(self): # h2o-dev doesn't take ../.. type paths? make find_file return absolute path # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data") csvPathname = find_file("smalldata/logreg/prostate.csv") import_result = h2o.n0.import_files(path=csvPathname) # print dump_json(import_result) k = import_result['destination_frames'][0] frames_result = h2o.n0.frames(key=k) frame = frames_result['frames'][0] rows = frame['rows'] columns = frame['columns'] for c in columns: label = c['label'] missing = c['missing_count'] stype = c['type'] domain = c['domain'] # print dump_json(frame) # let's see what ray's util does frames = h2o.n0.frames()['frames'] frames_dict = h2o_util.list_to_dict(frames, 'frame_id/name') # print "frames:", dump_json(frames) # print "frames_dict:", dump_json(frames_dict) for k, v in frames_dict.items(): print "frames_dict key:", k # interesting. we can do dictionary comprehensions # { k:v for k,v in my_dict.items() if 'Peter' in k } # how do you parse multiple files parse_result = h2o.n0.parse( key=k, intermediateResults=DO_INTERMEDIATE_RESULTS) frame = parse_result['frames'][0] hex_key = frame['frame_id']['name'] colCount = 9 rowCount = 380 # colCount = 11 # rowCount = 1000000 start = time.time() inspect = h2o_cmd.runInspect(None, hex_key) print "Inspect:", hex_key, "took", time.time() - start, "seconds" numCols = len(inspect['frames'][0]['columns']) numRows = inspect['frames'][0]['rows'] print "\n" + csvPathname, \ " rows:", "{:,}".format(numRows), \ " len(columns):", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (numRows, rowCount)) verboseprint(hex_key, ":", dump_json(parse_result))
def test_b_algo_parameters(self): for algo in ['kmeans', 'gbm', 'deeplearning', 'glm', 'word2vec', 'example', 'quantile', 'grep']: paramResult = h2o.n0.model_builders(algo=algo) self.print_params(paramResult) mmResult = h2o.n0.model_metrics(algo=algo) print "mmResult", dump_json(mmResult)
def test_xl_real(self):
    # Purpose: import/parse iris_wheader.csv, then exercise h2o_xl DF objects with
    # Assign() and the <<= operator, comparing h2o_xl.Xbase.lastExecResult['ast']
    # against the expected Rapids AST string for the first few assignments.
    # Statement order matters: lastExecResult is read right after the operation
    # it is expected to describe.
    bucket = 'smalldata'
    csvPathname = 'iris/iris_wheader.csv'
    hexDF = 'v'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexDF)

    # uses h2o_xl to do magic with Rapids
    # does this DFInit to rows=0 now?
    a = DF('a1') # knon_* key
    assert isinstance(a, DF)
    assert isinstance(a, Key)
    assert isinstance(a, Xbase)
    assert not isinstance(a, KeyIndexed)
    assert not isinstance(a, Fcn)
    assert not isinstance(a, Assign)

    # look at our secret stash in the base class. Should see the DFInit?
    print "Does the lastExecResult stash work?", dump_json(
        h2o_xl.Xbase.lastExecResult)

    # this should work if str(DF) returns DF.frame
    inspect = h2o_cmd.runInspect(key=a)
    # print "inspect a", dump_json(inspect)

    b = DF('b1')
    assert isinstance(b, DF)
    inspect = h2o_cmd.runInspect(key=b)
    # print "inspect b", dump_json(inspect)

    # same kind of list assignment two ways: Assign() call and the <<= operator
    Assign(a, [0.0, 1.0, 2.0])
    assert isinstance(a, Key)
    b <<= [3.1, 4.1, 5.1]
    assert isinstance(b, Key)

    # FIX! how come I have to create c here first for python
    # see here
    # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
    # is it too much to require c to exist first?
    # c = DF()
    # c <<= a + b

    # this will trigger ok?
    c = DF('c1')
    c <<= [6.2, 7.2, 8.2]
    assert isinstance(c, Key)
    # c[0] <<= a + b
    # Assign(lhs=c[0], rhs=(a + b))

    # whole-key add
    rhs = a + b
    Assign(c, rhs)
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= !c1 (+ %a1 %b1))"
    assert ast == astExpected, "Actual: %s Expected: %s" % (ast, astExpected)

    # indexed operands and indexed target
    rhs = a[0] + b[0]
    Assign(c[0], rhs)
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= ([ %c1 #0 #0) (+ ([ %a1 #0 #0) ([ %b1 #0 #0)))"
    assert ast == astExpected, "Actual: %s Expected: %s" % (ast, astExpected)

    Assign(c[1], (a[2] + b[2]))
    ast = h2o_xl.Xbase.lastExecResult['ast']
    astExpected = "(= ([ %c1 #1 #0) (+ ([ %a1 #2 #0) ([ %b1 #2 #0)))"
    assert ast == astExpected, "Actual: %s Expected: %s" % (ast, astExpected)

    # assert ast = "(= !b1 (is.na (c {#0})))"

    assert isinstance(c, Key), type(c)

    inspect = h2o_cmd.runInspect(key=c)
    # # print "inspect c", dump_json(inspect)

    # from here on the ASTs are not compared; only the type of c is asserted
    # and the key is inspected

    # DF inits the frame
    # if you just want an existing Key, say existing=True
    a = DF('a2') # named data frame
    assert isinstance(a, DF)

    b = DF('b2')
    c = DF('c2')
    inspect = h2o_cmd.runInspect(key=c)
    # # print "inspect c", dump_json(inspect)

    a <<= 3
    b <<= 3
    c <<= 3
    c[0] <<= a[0] + b[0]
    assert isinstance(c, Key)
    inspect = h2o_cmd.runInspect(key=c)
    # print "inspect c", dump_json(inspect)

    a = DF('a3') # named data frame
    b = DF('b3')
    c = DF('c3')
    a <<= 4
    b <<= 4
    c <<= 4

    c[0] <<= a[0] - b[0]
    assert isinstance(c, Key)
    c[0] <<= a[0] * b[0]
    assert isinstance(c, Key)

    a = DF('a4') # named data frame
    b = DF('b4')
    c = DF('c4')
    a <<= 5
    b <<= 5
    c <<= 5
    c[0] <<= (a[0] - b[0])
    assert isinstance(c, Key)
    inspect = h2o_cmd.runInspect(key=c)
    # print "inspect c", dump_json(inspect)

    c[0] <<= (a[0] & b[0]) | a[0]
    assert isinstance(c, Key)
    inspect = h2o_cmd.runInspect(key=c)
    # print "inspect c", dump_json(inspect)

    # print "\nDoes the keyWriteHistoryList work?"
    for k in Xbase.keyWriteHistoryList:
        print k

    h2o.check_sandbox_for_errors()
def build_model(self, algo, training_frame, parameters, destination_key=None, timeoutSecs=60, noPoll=False, **kwargs): ''' Build a model on the h2o cluster using the given algorithm, training Frame and model parameters. ''' assert algo is not None, '"algo" parameter is null' assert training_frame is not None, '"training_frame" parameter is null' assert parameters is not None, '"parameters" parameter is null' # why always check that the algo is in here? model_builders = self.model_builders(timeoutSecs=timeoutSecs) assert model_builders is not None, "/ModelBuilders REST call failed" assert algo in model_builders['model_builders'], "%s %s" % (algo, [k for k in model_builders['model_builders']]) builder = model_builders['model_builders'][algo] # TODO: test this assert, I don't think this is working. . . frames = self.frames(key=training_frame) assert frames is not None, "/Frames/{0} REST call failed".format(training_frame) key_name = frames['frames'][0]['key']['name'] assert key_name==training_frame, \ "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame) parameters['training_frame'] = training_frame if destination_key is not None: parameters['destination_key'] = destination_key print "build_model parameters", parameters start = time.time() result1 = self.do_json_request('/3/ModelBuilders.json/' + algo, cmd='post', timeout=timeoutSecs, postData=parameters) # make get overwritten after polling elapsed = time.time() - start verboseprint("build_model result", dump_json(result1)) if noPoll: result = result1 elif 'validation_error_count' in result1: h2p.yellow_print("parameter error in model_builders: %s") # parameters validation failure # TODO: add schema_type and schema_version into all the schemas to make this clean to check result = result1 # don't bother printing a time message elif 'exception_msg' in result1: h2p.yellow_print("exception msg in model_builders: %s" % result1['exception_msg']) result = result1 else: 
job_result = result1['jobs'][0] job_key = job_result['key']['name'] verboseprint("build_model job_key: " + repr(job_key)) job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs) verboseprint(job_result) elapsed = time.time() - start print "ModelBuilders", algo, "end on", training_frame, 'took', time.time() - start, 'seconds' print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] # can condition this with a parameter if some FAILED are expected by tests. if status=='FAILED': print dump_json(job_result) raise Exception("Taking exception on build_model job status: %s %s %s %s" % \ (status, progress, msec, description)) result = job_result else: # ? we should always get a job_json result raise Exception("build_model didn't get a job_result when it expected one") # return None verboseprint("result:", result) h2o_sandbox.check_sandbox_for_errors() result['python_elapsed'] = elapsed return result
def verify_cloud_size(nodeList=None, expectedCloudName=None, expectedLocked=None, verbose=False,
    timeoutSecs=10, ignoreHealth=True):
    """Check that every node reports the same, expected cloud.

    Each node in nodeList (default: h2o_nodes.nodes) is asked once for its cloud
    status via get_cloud(). All checks are done on that single set of responses.

    Args:
        nodeList: nodes to query; anything with a get_cloud(timeoutSecs=...) method.
        expectedCloudName: if truthy, every node must report this cloud_name.
        expectedLocked: if truthy, every node must report this 'locked' value.
        verbose: accepted but not used here.
        timeoutSecs: passed to each get_cloud() call.
        ignoreHealth: if False, missing/false cloud_healthy or any unhealthy node
            raises; if True unhealthy nodes are only printed.

    Returns:
        (comma separated cloud sizes, comma separated consensus values, expected size)

    Raises:
        Exception: on version, health, size, cloud name or locked mismatches, or if
            there is nothing to check. For cloud name / locked mismatches the cloud
            is torn down first.
    """
    if not nodeList:
        nodeList = h2o_nodes.nodes
    expectedSize = len(nodeList)

    # cloud size and consensus have to reflect a single grab of information from a node.
    cloudStatus = [n.get_cloud(timeoutSecs=timeoutSecs) for n in nodeList]

    cloudSizes = [c['cloud_size'] for c in cloudStatus]
    cloudConsensus = [c['consensus'] for c in cloudStatus]
    cloudName = [c['cloud_name'] for c in cloudStatus]
    cloudLocked = [c['locked'] for c in cloudStatus]
    cloudVersion = [c['version'] for c in cloudStatus]

    # checked before cloudVersion[0] is touched, so an empty cloud gets this message
    # rather than an IndexError
    if expectedSize == 0 or len(cloudSizes) == 0 or len(cloudConsensus) == 0:
        print("\nexpectedSize: %s" % expectedSize)
        print("cloudSizes: %s" % (cloudSizes,))
        print("cloudConsensus: %s" % (cloudConsensus,))
        raise Exception("Nothing in cloud. Can't verify size")

    # all match 0?
    # if "(unknown)" starts appearing in version..go to h2o1 h2o_bc.py/h2o_fc.py/h2o_methods.py and copy allowing.
    expectedVersion = cloudVersion[0]
    # check to see if it's a h2o-dev version? (common problem when mixing h2o1/h2o-dev testing with --usecloud
    # local builds have (unknown) in h2o if you build.sh (instead of make)
    # gradle builds should always be right with version?
    if not expectedVersion.startswith('0'):
        raise Exception("h2o version at node[0] doesn't look like h2o-dev version. (start with 0) %s" %
            expectedVersion)

    for i, v in enumerate(cloudVersion):
        if v != expectedVersion:
            versionStr = (",".join(map(str, cloudVersion)))
            raise Exception("node %s. Inconsistent cloud version. nodeList report version: %s" % (i, versionStr))

    if not ignoreHealth:
        for c in cloudStatus:
            if 'cloud_healthy' not in c:
                raise Exception("cloud_healthy missing: %s" % dump_json(c))

        cloudHealthy = [c['cloud_healthy'] for c in cloudStatus]
        if not all(cloudHealthy):
            msg = "Some node reported cloud_healthy not true: %s" % cloudHealthy
            raise Exception(msg)

    # gather up all the node_healthy status too
    for i, c in enumerate(cloudStatus):
        nodesHealthy = [n['healthy'] for n in c['nodes']]
        if not all(nodesHealthy):
            print("node %s cloud status: %s" % (i, dump_json(c)))
            msg = "node %s says some node is not reporting node_healthy: %s" % (c['cloud_name'], nodesHealthy)
            if not ignoreHealth:
                raise Exception(msg)

    consensusStr = (",".join(map(str, cloudConsensus)))
    sizeStr = (",".join(map(str, cloudSizes)))
    for s in cloudSizes:
        if s != expectedSize:
            raise Exception("Inconsistent cloud size. nodeList report size: %s consensus: %s instead of %d." %
                (sizeStr, consensusStr, expectedSize))

    # check that all cloud_names are right
    if expectedCloudName:
        for i, cn in enumerate(cloudName):
            if cn != expectedCloudName:
                print("node %s has the wrong cloud name: %s expectedCloudName: %s." % (i, cn, expectedCloudName))
                # print "node %s cloud status: %s" % (i, dump_json(cloudStatus[i]))
                # tear everyone down, in case of zombies. so we don't have to kill -9 manually
                print("tearing cloud down")
                tear_down_cloud(nodeList=nodeList, sandboxIgnoreErrors=False)
                raise Exception("node %s has the wrong cloud name: %s expectedCloudName: %s" % \
                    (i, cn, expectedCloudName))

    # check that all locked are right
    if expectedLocked:
        for i, cl in enumerate(cloudLocked):
            if cl != expectedLocked:
                print("node %s has the wrong locked: %s expectedLocked: %s." % (i, cl, expectedLocked))
                # print "node %s cloud status: %s" % (i, dump_json(cloudStatus[i]))
                # tear everyone down, in case of zombies. so we don't have to kill -9 manually
                print("tearing cloud down")
                tear_down_cloud(nodeList=nodeList, sandboxIgnoreErrors=False)
                # report this node's locked value (cl), not a cloud name left over from the loop above
                raise Exception("node %s has the wrong locked: %s expectedLocked: %s" % (i, cl, expectedLocked))

    return (sizeStr, consensusStr, expectedSize)