def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None,
    returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs):
    # if url param is used, use it as full url. otherwise create from the jsonRequest
    if fullUrl:
        url = fullUrl
    else:
        url = self.url(jsonRequest)

    # remove any params that are 'None'
    # need to copy dictionary, since can't delete while iterating
    if params is not None:
        params2 = params.copy()
        for k in params2:
            if params2[k] is None:
                del params[k]
        paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    extraComment2 = " " + str(postData) + ";" if cmd == 'post' else ""
    extraComment2 += extraComment if extraComment else ""

    if len(extraComment2) > 0:
        log('Start ' + url + paramsStr, comment=extraComment2)
    else:
        log('Start ' + url + paramsStr)

    # file get passed thru kwargs here
    if h2o_args.no_timeout:
        timeout = None  # infinite

    try:
        if 'post' == cmd:
            # NOTE: for now, since we don't have deserialization from JSON in h2o-dev,
            # we use form-encoded POST. This is temporary.
            #
            # This following does application/json (aka, posting JSON in the body):
            # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(postData), **kwargs)
            #
            # This does form-encoded, which doesn't allow POST of nested structures
            r = requests.post(url, timeout=timeout, params=params, data=postData, **kwargs)
        elif 'delete' == cmd:
            r = requests.delete(url, timeout=timeout, params=params, **kwargs)
        elif 'get' == cmd:
            r = requests.get(url, timeout=timeout, params=params, **kwargs)
        else:
            raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd)

    except Exception, e:
        # rethrow the exception after we've checked for stack trace from h2o
        # out of memory errors maybe don't show up right away? so we should wait for h2o
        # to get it out to h2o stdout. We don't want to rely on cloud teardown to check
        # because there's no delay, and we don't want to delay all cloud teardowns by waiting.
        exc_info = sys.exc_info()
        # use this to ignore the initial connection errors during build cloud when h2o is coming up
        if not noExtraErrorCheck:
            h2p.red_print(
                "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr))
            time.sleep(2)
            check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise exc_info[1], None, exc_info[2]
def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, returnFast=False,
    cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs):
    # if url param is used, use it as full url. otherwise create from the jsonRequest
    if fullUrl:
        url = fullUrl
    else:
        url = self.url(jsonRequest)

    # remove any params that are 'None'
    # need to copy dictionary, since can't delete while iterating
    if params is not None:
        params2 = params.copy()
        for k in params2:
            if params2[k] is None:
                del params[k]
        paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    if extraComment:
        log('Start ' + url + paramsStr, comment=extraComment)
    else:
        log('Start ' + url + paramsStr)

    log_rest("")
    log_rest("----------------------------------------------------------------------\n")
    if extraComment:
        log_rest("# Extra comment info about this request: " + extraComment)
    if cmd == 'get':
        log_rest("GET")
    else:
        log_rest("POST")
    log_rest(url + paramsStr)

    # file get passed thru kwargs here
    try:
        if cmd == 'post':
            r = requests.post(url, timeout=timeout, params=params, **kwargs)
        else:
            r = requests.get(url, timeout=timeout, params=params, **kwargs)

    except Exception, e:
        # rethrow the exception after we've checked for stack trace from h2o
        # out of memory errors maybe don't show up right away? so we should wait for h2o
        # to get it out to h2o stdout. We don't want to rely on cloud teardown to check
        # because there's no delay, and we don't want to delay all cloud teardowns by waiting.
        # (this is new/experimental)
        exc_info = sys.exc_info()
        # use this to ignore the initial connection errors during build cloud when h2o is coming up
        if not noExtraErrorCheck:
            h2p.red_print(
                "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr))
            time.sleep(2)
            check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        log_rest("")
        log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message))
        raise exc_info[1], None, exc_info[2]
def build_cloud_with_json(h2o_nodes_json='h2o-nodes.json'):
    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud_with_json()")
    log("#*********************************************************************")

    print "This only makes sense if h2o is running as defined by", h2o_nodes_json
    print "For now, assuming it's a cloud on this machine, and here's info on h2o processes running here"
    print "No output means no h2o here! Some other info about stuff on the system is printed first though."
    import h2o_os_util

    if not os.path.exists(h2o_nodes_json):
        raise Exception("build_cloud_with_json: Can't find " + h2o_nodes_json + " file")

    # h2o_os_util.show_h2o_processes()

    with open(h2o_nodes_json, 'rb') as f:
        cloneJson = json.load(f)

    # These are supposed to be in the file.
    # Just check the first one. if not there, the file must be wrong
    if not 'cloud_start' in cloneJson:
        raise Exception("Can't find 'cloud_start' in %s, wrong file? h2o-nodes.json?" % h2o_nodes_json)
    else:
        cs = cloneJson['cloud_start']
        print "Info on how the cloud we're cloning was started (info from %s)" % h2o_nodes_json
        # required/legal values in 'cloud_start'. A robust check is good for easy debug when we add stuff
        valList = ['time', 'cwd', 'python_test_name', 'python_cmd_line', 'config_json', 'username', 'ip']
        for v in valList:
            if v not in cs:
                raise Exception("Can't find %s in %s, wrong file or version change?" % (v, h2o_nodes_json))
            print "cloud_start['%s']: %s" % (v, cs[v])

    # this is the internal node state for python..nodes rebuild
    nodeStateList = cloneJson['h2o_nodes']

    nodeList = []
    if not nodeStateList:
        raise Exception("nodeStateList is empty. %s file must be empty/corrupt" % h2o_nodes_json)

    for nodeState in nodeStateList:
        print "Cloning state for node", nodeState['node_id'], 'from', h2o_nodes_json
        newNode = ExternalH2O(nodeState)
        nodeList.append(newNode)

    print ""
    h2p.red_print("Ingested from json:", nodeList[0].java_heap_GB, "GB java heap(s) with",
        len(nodeList), "total nodes")
    print ""

    # put the test start message in the h2o log, to create a marker
    nodeList[0].h2o_log_msg()

    # save it to a global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
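# --- Hedged usage sketch (added for illustration; not part of the original source). ---
# Assuming some earlier test built a cloud with build_cloud(create_json=True), which writes
# h2o-nodes.json, a follow-on test can attach to that cloud instead of starting its own JVMs.
# The example_ function name is hypothetical.
def example_clone_existing_cloud():
    nodeList = build_cloud_with_json(h2o_nodes_json='h2o-nodes.json')
    # the cloned ExternalH2O nodes answer REST requests just like locally-built ones
    print "cloned", len(nodeList), "nodes; first node is", nodeList[0].http_addr, nodeList[0].port
    # don't tear this cloud down here; the test that built it owns its lifetime
    return nodeList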
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")

    while (totalTime < maxTime):  # die after 4 hours
        time.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        ### h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will Check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die(thru psutil terminate, paramiko channel death or h2o shutdown...")

    while (totalTime < maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def do_json_request(addr=None, port=None, jsonRequest=None, params=None, timeout=7, **kwargs):
    if params is not None:
        paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    url = create_url(addr, port, jsonRequest)
    print 'Start ' + url + paramsStr

    try:
        r = requests.get(url, timeout=timeout, params=params, **kwargs)
        # the requests json decoder might fail if we didn't get something good
        rjson = r.json()
        emsg = "ERROR: Probing claimed existing cloud with Cloud.json"
        if not isinstance(rjson, (list, dict)):  # probably good
            raise Exception(emsg + "h2o json responses should always be lists or dicts. Got %s" %
                dump_json(rjson))
        elif r.status_code != requests.codes.ok:
            rjson = None
            raise Exception(emsg + "Couldn't decode. Status: %s" % r.status_code)

    except requests.ConnectionError, e:
        rjson = None
        emsg = "ERROR: json got ConnectionError or other exception"
        # Rethrow the exception after we've checked for stack trace from h2o.
        # Out of memory errors maybe don't show up right away?
        # so we should wait for h2o to get it out to h2o stdout.
        # Don't want to rely on cloud teardown to check because there's no delay,
        # and we don't want to delay all cloud teardowns by waiting.
        exc_info = sys.exc_info()
        # we don't expect to have connection errors, so any exception is a bad thing.
        h2p.red_print("%s\n %s\n %s\nGoing to check sandbox, then rethrow.." % (emsg, exc_info, url + paramsStr))
        time.sleep(2)
        check_sandbox_for_errors()
        raise exc_info[1], None, exc_info[2]

    return rjson
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4 * 3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime + 0.0) / 3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")

    while (totalTime < maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print(" http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")

    while (totalTime < maxTime):  # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def do_json_request(addr=None, port=None, jsonRequest=None, params=None, timeout=7, **kwargs):
    if params is not None:
        paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    else:
        paramsStr = ''

    url = create_url(addr, port, jsonRequest)
    print 'Start ' + url + paramsStr

    try:
        r = requests.get(url, timeout=timeout, params=params, **kwargs)
        # the requests json decoder might fail if we didn't get something good
        rjson = r.json()
        emsg = "ERROR: Probing claimed existing cloud with Cloud.json"
        if not isinstance(rjson, (list, dict)):  # probably good
            raise Exception(emsg + "h2o json responses should always be lists or dicts. Got %s" %
                dump_json(rjson))
        elif r.status_code != requests.codes.ok:
            rjson = None
            raise Exception(emsg + "Couldn't decode. Status: %s" % r.status_code)

    except requests.ConnectionError, e:
        rjson = None
        emsg = "ERROR: json got ConnectionError or other exception"
        # Rethrow the exception after we've checked for stack trace from h2o.
        # Out of memory errors maybe don't show up right away?
        # so we should wait for h2o to get it out to h2o stdout.
        # Don't want to rely on cloud teardown to check because there's no delay,
        # and we don't want to delay all cloud teardowns by waiting.
        exc_info = sys.exc_info()
        # we don't expect to have connection errors, so any exception is a bad thing.
        h2p.red_print(
            "%s\n %s\n %s\nGoing to check sandbox, then rethrow.." % (emsg, exc_info, url + paramsStr))
        time.sleep(2)
        check_sandbox_for_errors()
        raise exc_info[1], None, exc_info[2]

    return rjson
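# --- Hedged usage sketch (added for illustration; not part of the original source). ---
# Assuming create_url(addr, port, jsonRequest) forms something like
# http://<addr>:<port>/<jsonRequest>, probing a claimed-existing cloud might look like this.
# The ip/port values and the example_ function name are hypothetical; the fields read here
# are ones a Cloud.json response typically carries.
def example_probe_existing_cloud():
    cloudJson = do_json_request(addr='127.0.0.1', port=54321, jsonRequest='Cloud.json', timeout=7)
    # if the node didn't answer, do_json_request already checked the sandbox and re-raised
    if cloudJson:
        print "cloud_size:", cloudJson.get('cloud_size'), "consensus:", cloudJson.get('consensus')
    return cloudJson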
def do_h2o_glm(self, bucket, csvPathname, L, family="binomial"):
    h2p.red_print("\nNow doing h2o")
    h2o.beta_features = True
    parseResult = h2i.import_parse(bucket="smalldata", path=csvPathname, schema="local", timeoutSecs=180)

    # save the resolved pathname for use in the sklearn csv read below
    inspect = h2o_cmd.runInspect(None, parseResult["destination_key"])
    print inspect
    print "\n" + csvPathname, " numRows:", "{:,}".format(inspect["numRows"]), \
        " numCols:", "{:,}".format(inspect["numCols"])

    x = "ID"
    y = "CAPSULE"
    family = family
    alpha = "0"
    lambda_ = L
    nfolds = "0"
    f = "prostate"
    modelKey = "GLM_" + f

    kwargs = {
        "response": y,
        "ignored_cols": x,
        "family": family,
        "lambda": lambda_,
        "alpha": alpha,
        "n_folds": nfolds,  # passes if 0, fails otherwise
        "destination_key": modelKey,
    }

    timeoutSecs = 60
    start = time.time()
    glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

    # this stuff was left over from when we got the result after polling the jobs list
    # okay to do it again
    # GLM2: when it redirects to the model view, we no longer have the job_key!
    # (unlike the first response and polling)
    (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs)
    cstring = "".join([("%.5e " % c) for c in clist])
    h2p.green_print("h2o alpha ", alpha)
    h2p.green_print("h2o lambda ", lambda_)
    h2p.green_print("h2o coefficient list:", cstring)
    h2p.green_print("h2o intercept", "%.5e " % intercept)

    # other stuff in the json response
    glm_model = glmResult["glm_model"]
    _names = glm_model["_names"]
    coefficients_names = glm_model["coefficients_names"]

    # the first submodel is the right one, if only one lambda is provided as a parameter above
    submodels = glm_model["submodels"][0]

    beta = submodels["beta"]
    h2p.red_print("beta:", beta)
    norm_beta = submodels["norm_beta"]
    iteration = submodels["iteration"]

    validation = submodels["validation"]
    avg_err = validation["avg_err"]
    auc = validation["auc"]
    aic = validation["aic"]
    null_deviance = validation["null_deviance"]
    residual_deviance = validation["residual_deviance"]

    print "_names", _names
    print "coefficients_names", coefficients_names
    # did beta get shortened? the simple check confirms names/beta/norm_beta are same length
    print "beta", beta
    print "iteration", iteration
    print "avg_err", avg_err
    print "auc", auc
def build_cloud(node_count=1, base_port=None, hosts=None,
    timeoutSecs=30, retryDelaySecs=1, cleanup=True, rand_shuffle=True,
    conservative=False, create_json=False, clone_cloud=None,
    init_sandbox=True, usecloud=False, usecloud_size=None, **kwargs):

    # expectedSize is only used if usecloud
    # usecloud can be passed thru build_cloud param, or command line
    # not in config json though so no build_cloud_with_hosts path.

    # redirect to build_cloud_with_json if a command line arg
    # wants to force a test to ignore it's build_cloud/build_cloud_with_hosts
    # (both come thru here)
    # clone_cloud is just another way to get the effect (maybe ec2 config file thru
    # build_cloud_with_hosts?
    global stdout_wrapped
    if not h2o_args.disable_time_stamp and not stdout_wrapped:
        sys.stdout = OutWrapper(sys.stdout)
        stdout_wrapped = True

    if h2o_args.usecloud or usecloud:
        # for now, just have fixed name in local file. (think of this as a temp or debug file)
        # eventually we'll pass the json object instead for speed?
        nodesJsonPathname = "h2o_fc-nodes.json"
    elif h2o_args.clone_cloud_json:
        nodesJsonPathname = h2o_args.clone_cloud_json
    elif clone_cloud:
        nodesJsonPathname = clone_cloud
    else:
        # normal build_cloud() doesn't use
        nodesJsonPathname = None

    # usecloud dominates over all
    if (h2o_args.clone_cloud_json or clone_cloud) or (h2o_args.usecloud or usecloud):
        # then build_cloud_with_json with json object
        # we don't need to specify these defaults, but leave here to show that we can pass
        # I suppose kwargs will have it
        if h2o_args.usecloud:
            ip_port = h2o_args.usecloud
        elif usecloud:
            ip_port = usecloud
        else:
            ip_port = None

        # h2o_args dominates
        if h2o_args.usecloud_size:
            # only used for expected size
            useCloudExpectedSize = h2o_args.usecloud_size
        else:
            useCloudExpectedSize = usecloud_size

        if (h2o_args.usecloud or usecloud):
            nodesJsonObject = h2o_fc.find_cloud(ip_port=ip_port,
                expectedSize=useCloudExpectedSize, nodesJsonPathname=nodesJsonPathname, **kwargs)
                # potentially passed in kwargs
                # hdfs_version='cdh4', hdfs_config=None, hdfs_name_node='172.16.1.176',
        else:
            if h2o_args.clone_cloud_json:
                nodesJsonPathname = h2o_args.clone_cloud_json
            else:
                nodesJsonPathname = clone_cloud

        nodeList = build_cloud_with_json(h2o_nodes_json=nodesJsonPathname)
        return nodeList

    # else
    # moved to here from unit_main. so will run with nosetests too!
    # Normally do this.
    # Don't if build_cloud_with_hosts() did and put a flatfile in there already!
    if init_sandbox:
        clean_sandbox()

    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud() ")
    log("#*********************************************************************")

    # start up h2o to report the java version (once). output to python stdout
    # only do this for regression testing
    # temporarily disable this, to go a little faster
    # if getpass.getuser() == 'jenkins':
    #     check_h2o_version()

    ports_per_node = 2
    nodeList = []
    # shift the port used to run groups of tests on the same machine at the same time?
    base_port = get_base_port(base_port)

    try:
        # if no hosts list, use psutil method on local host.
        totalNodes = 0
        # doing this list outside the loops so we can shuffle for better test variation
        # this jvm startup shuffle is independent from the flatfile shuffle
        portList = [base_port + ports_per_node * i for i in range(node_count)]
        if hosts is None:
            # if use_flatfile, we should create it
            # because tests will just call build_cloud with use_flatfile=True
            # best to just create it all the time..may or may not be used
            write_flatfile(node_count=node_count, base_port=base_port)
            hostCount = 1
            if rand_shuffle:
                random.shuffle(portList)
            for p in portList:
                verboseprint("psutil starting node", totalNodes)
                newNode = LocalH2O(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1
        else:
            # if hosts, the flatfile was created and uploaded to hosts already
            # I guess don't recreate it, don't overwrite the one that was copied beforehand.
            # we don't always use the flatfile (use_flatfile=False)
            # Suppose we could dispatch from the flatfile to match it's contents
            # but sometimes we want to test with a bad/different flatfile then we invoke h2o?
            hostCount = len(hosts)
            hostPortList = []
            for h in hosts:
                for port in portList:
                    hostPortList.append((h, port))
            if rand_shuffle:
                random.shuffle(hostPortList)
            for (h, p) in hostPortList:
                verboseprint('ssh starting node', totalNodes, 'via', h)
                newNode = h.remote_h2o(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1

        verboseprint("Attempting Cloud stabilize of", totalNodes, "nodes on", hostCount, "hosts")
        start = time.time()
        # UPDATE: best to stabilize on the last node!
        # FIX! for now, always check sandbox, because h2oddev has TIME_WAIT port problems
        stabilize_cloud(nodeList[0], nodeList,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, noExtraErrorCheck=False)
        stabilizeTime = time.time() - start
        verboseprint(len(nodeList), "Last added node stabilized in ", stabilizeTime, " secs")

        # assume all the heap sizes are the same as node zero
        if nodeList[0].java_heap_GB:
            heapSize = str(nodeList[0].java_heap_GB) + " GB"
        elif nodeList[0].java_heap_MB:
            heapSize = str(nodeList[0].java_heap_MB) + " MB"
        else:
            heapSize = "(unknown)"

        h2p.red_print("Built cloud: %s java heap(s) with %d nodes on %d hosts, stabilizing in %d secs" % \
            (heapSize, len(nodeList), hostCount, stabilizeTime))

        # FIX! using "consensus" in node[-1] should mean this is unnecessary?
        # maybe there's a bug. For now do this. long term: don't want?
        # UPDATE: do it for all cases now 2/14/13
        if conservative:  # still needed?
            for n in nodeList:
                # FIX! for now, always check sandbox, because h2oddev has TIME_WAIT port problems
                stabilize_cloud(n, nodeList, timeoutSecs=timeoutSecs, noExtraErrorCheck=False)

        # this does some extra checking now
        # verifies cloud name too if param is not None
        verify_cloud_size(nodeList, expectedCloudName=nodeList[0].cloud_name, expectedLocked=0)

        # FIX! should probably check that the cloud's lock=0. It will go to 1 later.
        # but if it's an existing cloud, it may already be locked.
        # That will be in build_cloud_with_json, though

        # best to check for any errors due to cloud building right away?
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

        # put the test start message in the h2o log, to create a marker
        nodeList[0].h2o_log_msg()

    except:
        # nodeList might be empty in some exception cases?
        # no shutdown issued first, though
        if cleanup and nodeList:
            for n in nodeList:
                n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise

    print len(nodeList), "total jvms in H2O cloud"

    if h2o_args.config_json:
        # like cp -p. Save the config file, to sandbox
        print "Saving the ", h2o_args.config_json, "we used to", LOG_DIR
        shutil.copy(h2o_args.config_json, LOG_DIR + "/" + os.path.basename(h2o_args.config_json))

    if create_json:
        # Figure out some stuff about how this test was run
        cs_time = str(datetime.datetime.now())
        cs_cwd = os.getcwd()
        cs_python_cmd_line = "python %s %s" % (h2o_args.python_test_name, h2o_args.python_cmd_args)
        cs_python_test_name = h2o_args.python_test_name
        if h2o_args.config_json:
            cs_config_json = os.path.abspath(h2o_args.config_json)
        else:
            cs_config_json = None
        cs_username = h2o_args.python_username
        cs_ip = h2o_args.python_cmd_ip

        # dump the nodes state to a json file
        # include enough extra info to have someone
        # rebuild the cloud if a test fails that was using that cloud.
        q = {
            'cloud_start': {
                'time': cs_time,
                'cwd': cs_cwd,
                'python_test_name': cs_python_test_name,
                'python_cmd_line': cs_python_cmd_line,
                'config_json': cs_config_json,
                'username': cs_username,
                'ip': cs_ip,
            },
            'h2o_nodes': h2o_util.json_repr(nodeList),
        }
        with open('h2o-nodes.json', 'w+') as f:
            f.write(json.dumps(q, indent=4))

    # save it to a local global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
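# --- Hedged illustration (added; not from the original source). ---
# A reader of the h2o-nodes.json written above only needs its two top-level sections.
# This mirrors the checks build_cloud_with_json() does; the pathname default is the one used above.
def example_read_nodes_json(pathname='h2o-nodes.json'):
    with open(pathname, 'rb') as f:
        cloneJson = json.load(f)
    cs = cloneJson['cloud_start']           # who/when/where built the cloud
    nodeStateList = cloneJson['h2o_nodes']  # per-node state used to rebuild Node objects
    print "cloud built by", cs['username'], "at", cs['time'], "with", len(nodeStateList), "node(s)"
    return nodeStateList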
def do_scipy_glm(self, bucket, csvPathname, L, family='binomial'):
    h2p.red_print("Now doing sklearn")
    h2p.red_print("\nsee http://scikit-learn.org/0.11/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression")

    import numpy as np
    import scipy as sp
    from sklearn.linear_model import LogisticRegression
    from numpy import loadtxt

    csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)

    # make sure it does fp divide
    C = 1/(L+0.0)
    print "C regularization:", C
    dataset = np.loadtxt(
        open(csvPathnameFull, 'r'),
        skiprows=1,  # skip the header
        delimiter=',',
        dtype='float')
    print "\ncsv read for training, done"

    n_features = len(dataset[0]) - 1
    print "n_features:", n_features

    # don't want ID (col 0) or CAPSULE (col 1)
    # get CAPSULE
    target = [x[1] for x in dataset]
    # slice off the first 2
    train = np.array([x[2:] for x in dataset])

    n_samples, n_features = train.shape
    print "n_samples:", n_samples, "n_features:", n_features

    print "histogram of target"
    print sp.histogram(target, 3)

    print "len(train):", len(train)
    print "len(target):", len(target)
    print "dataset shape:", dataset.shape

    if family != 'binomial':
        raise Exception("Only have binomial logistic for scipy")

    print "\nTrying l2"
    clf2 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l2',
        tol=0.0001)

    # train the classifier
    start = time.time()
    clf2.fit(train, target)
    print "L2 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf2.coef_
    cstring = "".join([("%.5e " % c) for c in clf2.coef_[0]])
    h2p.green_print("sklearn L2 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf2.intercept_[0])
    h2p.green_print("sklearn score:", clf2.score(train, target))

    print "\nTrying l1"
    clf1 = LogisticRegression(
        C=C,
        dual=False,
        fit_intercept=True,
        intercept_scaling=1,
        penalty='l1',
        tol=0.0001)

    # train the classifier
    start = time.time()
    clf1.fit(train, target)
    print "L1 fit took", time.time() - start, "seconds"

    # print "coefficients:", clf1.coef_
    cstring = "".join([("%.5e " % c) for c in clf1.coef_[0]])
    h2p.green_print("sklearn L1 C", C)
    h2p.green_print("sklearn coefficients:", cstring)
    h2p.green_print("sklearn intercept:", "%.5e" % clf1.intercept_[0])
    h2p.green_print("sklearn score:", clf1.score(train, target))

    # attributes are accessed in the normal python way
    dx = clf1.__dict__
    dx.keys()
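# --- Hedged note (added for illustration). ---
# The only parameter translated between the h2o run and the sklearn run above is the
# regularization strength: h2o GLM takes lambda, sklearn LogisticRegression takes C = 1/lambda.
# A tiny sanity check of that mapping (the example lambda value is arbitrary):
def example_lambda_to_C(L=1e-4):
    C = 1 / (L + 0.0)  # same fp-divide trick used in do_scipy_glm
    print "lambda", L, "-> C", C
    return C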
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float',
    h2oSummary2=None, h2oSummary2MaxErr=None, h2oQuantilesApprox=None, h2oQuantilesExact=None,
    h2oExecQuantiles=None, interpolate='linear', quantile=0.50, use_genfromtxt=False):

    SCIPY_INSTALLED = True
    try:
        import scipy as sp
        import numpy as np
        print "Both numpy and scipy are installed. Will do extra checks"
    except ImportError:
        print "numpy or scipy is not installed. Will only do sort-based checking"
        SCIPY_INSTALLED = False

    if use_genfromtxt and SCIPY_INSTALLED:
        print "Using numpy.genfromtxt. Better handling of null bytes"
        target = np.genfromtxt(
            open(csvPathname, 'r'),
            delimiter=',',
            skip_header=1 if skipHeader else 0,
            dtype=None)  # guess!
        # print "shape:", target.shape()
    else:
        print "Using python csv reader"
        target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype,
            skipHeader=skipHeader, preview=5)

    if datatype == 'float':
        # to make irene's R runif files first col work (quoted row numbers, integers)
        # shouldn't hurt anyone else?
        # strip " from left (ignore leading whitespace)
        # strip " from right (ignore leading whitespace)
        targetFP = map(float, target)
        # targetFP = np.array(tFP, np.float)
    if datatype == 'int':
        targetFP = map(int, target)

    if SCIPY_INSTALLED:
        # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html
        # numpy.percentile has simple linear interpolate and midpoint
        # need numpy 1.9 for interpolation. numpy 1.8 doesn't have
        # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint')  # 1.8
        p = np.percentile(targetFP, quantile*100)
        h2p.red_print("numpy.percentile", p)

        # per = [100 * t for t in thresholds]
        from scipy import stats
        s1 = stats.scoreatpercentile(targetFP, quantile*100)
        h2p.red_print("scipy stats.scoreatpercentile", s1)

        # scipy apparently doesn't have the use of means (type 2)
        # http://en.wikipedia.org/wiki/Quantile
        # it has median (R-8) with 1/3, 1/3
        if 1==0:
            # type 6
            alphap = 0
            betap = 0
            # type 5 okay but not perfect
            alphap = 0.5
            betap = 0.5
            # type 8
            alphap = 1/3.0
            betap = 1/3.0

        if interpolate == 'mean':
            # an approx? (was good when comparing to h2o type 2)
            alphap = 0.4
            betap = 0.4

        if interpolate == 'linear':
            # this is type 7
            alphap = 1
            betap = 1

        s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
        s2 = s2List[0]
        # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
        # type 7
        # alphap=0.4, betap=0.4,
        # type 2 not available? (mean)
        # alphap=1/3.0, betap=1/3.0 is approx median?
        h2p.red_print("scipy stats.mstats.mquantiles:", s2)

    # also get the median with a painful sort (h2o_summ.percentileOnSortedlist()
    # inplace sort
    targetFP.sort()

    # this matches scipy type 7 (linear)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    # this matches h2o type 2 (mean)
    # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean')
    b = percentileOnSortedList(targetFP, quantile, interpolate='linear')
    label = str(quantile * 100) + '%'
    h2p.blue_print(label, "from sort:", b)

    if SCIPY_INSTALLED:
        h2p.blue_print(label, "from numpy:", p)
        h2p.blue_print(label, "from scipy 1:", s1)
        h2p.blue_print(label, "from scipy 2:", s2)

    h2p.blue_print(label, "from h2o summary:", h2oSummary2)
    h2p.blue_print(label, "from h2o multipass:", h2oQuantilesExact)
    h2p.blue_print(label, "from h2o singlepass:", h2oQuantilesApprox)
    h2p.blue_print(label, "from h2o exec:", h2oExecQuantiles)

    # they should be identical. keep a tight absolute tolerance
    # Note the comparisons have different tolerances, some are relative, some are absolute
    if h2oQuantilesExact:
        if math.isnan(float(h2oQuantilesExact)):
            raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact)
        h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002,
            msg='h2o quantile multipass is not approx. same as sort algo')

    if h2oQuantilesApprox:
        # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN
        if math.isnan(float(h2oQuantilesApprox)):
            raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox)
        if h2oSummary2MaxErr:
            h2o_util.assertApproxEqual(h2oQuantilesApprox, b, tol=h2oSummary2MaxErr,
                msg='h2o quantile singlepass is not approx. same as sort algo')
        else:
            h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.1,
                msg='h2o quantile singlepass is not approx. same as sort algo')

    if h2oSummary2:
        if math.isnan(float(h2oSummary2)):
            raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2)
        if h2oSummary2MaxErr:
            # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2))
            h2o_util.assertApproxEqual(h2oSummary2, b, tol=h2oSummary2MaxErr,
                msg='h2o summary2 is not approx. same as sort algo (calculated expected max error)')
        else:
            # bounds are way off, since it depends on the min/max of the col, not the expected value
            h2o_util.assertApproxEqual(h2oSummary2, b, rel=1.0,
                msg='h2o summary2 is not approx. same as sort algo (sloppy compare)')

    if h2oQuantilesApprox and h2oSummary2:
        # they should both get the same answer. Currently they have different code, but same algo
        # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases.
        # not sure why..maybe some subtle algo diff.
        h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04,
            msg='h2o summary2 is not approx. same as h2o singlepass.'+\
            ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation')

    if h2oExecQuantiles:
        if math.isnan(float(h2oExecQuantiles)):
            raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles)
        # bounds are way off
        h2o_util.assertApproxEqual(h2oExecQuantiles, b, rel=1.0,
            msg='h2o summary2 is not approx. same as sort algo')

    if SCIPY_INSTALLED:
        if h2oQuantilesExact:
            h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002,
                msg='h2o quantile multipass is not same as numpy.percentile')
            h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002,
                msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile')

        # give us some slack compared to the scipy use of median (instead of desired mean)
        # since we don't have bounds here like above, just stop this test for now
        if h2oQuantilesApprox and 1==0:
            if interpolate == 'mean':
                h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5,
                    msg='h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles')
            else:
                h2o_util.assertApproxEqual(h2oQuantilesApprox, s2, rel=0.5,
                    msg='h2o quantile singlepass is not same as scipy stats.mstats.mquantiles')

        # see if scipy changes. nope. it doesn't
        if 1==0:
            a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap)
            h2p.red_print("after sort")
            h2p.red_print("scipy stats.mstats.mquantiles:", a)
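# --- Hedged, self-contained demo (added; not part of the test framework). ---
# Shows on a tiny list what quantile_comparisons() checks at scale: numpy.percentile,
# scipy's two percentile flavors (type 7 / linear), and a plain sort-based median should
# all agree. Assumes numpy and scipy are installed; the data values are made up.
def example_tiny_quantile_check():
    import numpy as np
    from scipy import stats
    data = [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0]
    p = np.percentile(data, 50)                                          # numpy, linear interpolation
    s1 = stats.scoreatpercentile(data, 50)
    s2 = stats.mstats.mquantiles(data, prob=0.5, alphap=1, betap=1)[0]   # R type 7 (linear)
    dataSorted = sorted(data)
    b = 0.5 * (dataSorted[3] + dataSorted[4])                            # sort-based median, even-length list
    print "numpy:", p, "scipy scoreatpercentile:", s1, "scipy mquantiles:", s2, "sort:", b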
def generate_scipy_comparison(csvPathname, col=0, h2oMedian=None, h2oMedian2=None):
    # this is some hack code for reading the csv and doing some percentile stuff in scipy
    # from numpy import loadtxt, genfromtxt, savetxt
    import numpy as np
    import scipy as sp

    dataset = np.genfromtxt(
        open(csvPathname, 'r'),
        delimiter=',',
        # skip_header=1,
        dtype=None)  # guess!

    print "csv read for training, done"
    # we're going to strip just the last column for percentile work
    # used below
    NUMCLASSES = 10
    print "csv read for training, done"

    # data is last column
    # drop the output
    print dataset.shape
    if len(dataset.shape) > 1:
        target = [x[col] for x in dataset]
    else:
        target = dataset

    # we may have read it in as a string. coerce to number
    targetFP = np.array(target, np.float)

    if 1==0:
        n_features = len(dataset[0]) - 1
        print "n_features:", n_features

    # get the end
    # target = [x[-1] for x in dataset]
    # get the 2nd col

    print "histogram of target"
    print target
    print sp.histogram(target, bins=NUMCLASSES)

    print target[0]
    print target[1]

    thresholds = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999]
    print "scipy per:", thresholds
    from scipy import stats
    # a = stats.scoreatpercentile(target, per=per)
    a = stats.mstats.mquantiles(targetFP, prob=thresholds)
    a2 = ["%.2f" % v for v in a]
    h2p.red_print("scipy stats.mstats.mquantiles:", a2)

    # also get the median with a painful sort (h2o_summ.percentileOnSortedlist()
    # inplace sort
    targetFP.sort()
    b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear')
    label = '50%' if DO_MEDIAN else '99.9%'
    h2p.blue_print(label, "from sort:", b)

    s = a[5 if DO_MEDIAN else 10]
    h2p.blue_print(label, "from scipy:", s)
    h2p.blue_print(label, "from h2o summary2:", h2oMedian)
    h2p.blue_print(label, "from h2o quantile multipass:", h2oMedian2)

    # recompute after the sort to see if scipy changes. nope. it doesn't
    a = stats.mstats.mquantiles(targetFP, prob=thresholds)
    a2 = ["%.2f" % v for v in a]
    h2p.red_print("after sort")
    h2p.red_print("scipy stats.mstats.mquantiles:", a2)
def test_exec_enums_rand_cut(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    tryList = [
        (ROWS, 3, 2, 'cE', 300),
    ]

    # create key names to use for exec
    eKeys = ['e%s' % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        hex_key = 'p'
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        for j in range(CUT_EXPR_CNT):
            print "Creating", CUT_EXPR_CNT, 'cut expressions'
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression
            cols = random.sample(range(iColCount), random.randint(1, iColCount))
            for c in cols:
                # possible choices within the column
                # cel = colEnumList[c]
                cel = colEnumList
                # for now the cutValues are numbers for the enum mappings
                if 1==1:
                    # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                    celChoice = str(random.choice(range(len(cel))))
                else:
                    celChoice = random.choice(cel)
                cutValue[c] = celChoice

            cutExprList = []
            for i, c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    cutExprList.append('p$C' + str(i+1) + '==' + c)

            cutExpr = ' && '.join(cutExprList)
            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

            print "j:", j

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key,
            timeoutSecs=30, doSummary=False, header=0)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        # print h2o.dump_json(inspect)

        rSummary = h2o_cmd.runSummary(key=parseResult['destination_key'])
        h2o_cmd.infoFromSummary(rSummary)

        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        # error if any col has constant values
        if len(constantValuesDict) != 0:
            raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!
        # is this needed?
        if 1==1:
            a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)])
            print a
            for eKey in eKeys:
                # build up the columns
                e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False)
                ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            randICol = random.randint(0, iColCount-1)
            randOCol = random.randint(iColCount, iColCount+oColCount-1)

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            if 1==0:
                start = time.time()
                e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1))
                elapsed = time.time() - start
                print "exec 1 took", elapsed, "seconds."
                execTime = elapsed

            if 1==1:
                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList)))
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

            if 1==0:
                gKey = random.choice(eKeys)
                # do a 2nd random to see if things blow up
                start = time.time()
                h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey))
                elapsed = time.time() - start
                print "exec 3 took", elapsed, "seconds."

            if 1==1:
                inspect = h2o_cmd.runInspect(key=fKey)
                h2o_cmd.infoFromInspect(inspect, fKey)
                numRows = inspect['numRows']
                numCols = inspect['numCols']

            if numRows==0 or numCols!=colCount:
                h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols,
                    "cols. Quantile will abort")

            # QUANTILE*******************************************************
            quantile = 0.5 if DO_MEDIAN else .999
            # first output col. always fed by an exec cut, so 0?
            column = iColCount
            start = time.time()
            q = h2o.nodes[0].quantiles(source_key=fKey, column=column,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
            h2p.red_print("quantile", quantile, q['result'])
            elapsed = time.time() - start
            print "quantile end on ", fKey, 'took', elapsed, 'seconds.'
            quantileTime = elapsed

            # remove all keys*******************************************************
            # what about hex_key?
            if 1==0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = 'trial'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def build_cloud(node_count=1, base_port=None, hosts=None,
    timeoutSecs=30, retryDelaySecs=1, cleanup=True, rand_shuffle=True,
    conservative=False, create_json=False, clone_cloud=None,
    init_sandbox=True, usecloud=False, usecloud_size=None, **kwargs):

    # expectedSize is only used if usecloud
    # usecloud can be passed thru build_cloud param, or command line
    # not in config json though so no build_cloud_with_hosts path.

    # redirect to build_cloud_with_json if a command line arg
    # wants to force a test to ignore it's build_cloud/build_cloud_with_hosts
    # (both come thru here)
    # clone_cloud is just another way to get the effect (maybe ec2 config file thru
    # build_cloud_with_hosts?
    global stdout_wrapped
    if not h2o_args.disable_time_stamp and not stdout_wrapped:
        sys.stdout = OutWrapper(sys.stdout)
        stdout_wrapped = True

    if h2o_args.usecloud or usecloud:
        # for now, just have fixed name in local file. (think of this as a temp or debug file)
        # eventually we'll pass the json object instead for speed?
        nodesJsonPathname = "h2o_fc-nodes.json"
    elif h2o_args.clone_cloud_json:
        nodesJsonPathname = h2o_args.clone_cloud_json
    elif clone_cloud:
        nodesJsonPathname = clone_cloud
    else:
        # normal build_cloud() doesn't use
        nodesJsonPathname = None

    # usecloud dominates over all
    if (h2o_args.clone_cloud_json or clone_cloud) or (h2o_args.usecloud or usecloud):
        # then build_cloud_with_json with json object
        # we don't need to specify these defaults, but leave here to show that we can pass
        # I suppose kwargs will have it
        if h2o_args.usecloud:
            ip_port = h2o_args.usecloud
        elif usecloud:
            ip_port = usecloud
        else:
            ip_port = None

        # h2o_args dominates
        if h2o_args.usecloud_size:
            # only used for expected size
            useCloudExpectedSize = h2o_args.usecloud_size
        else:
            useCloudExpectedSize = usecloud_size

        nodesJsonObject = h2o_fc.find_cloud(ip_port=ip_port,
            expectedSize=useCloudExpectedSize, nodesJsonPathname=nodesJsonPathname, **kwargs)
            # potentially passed in kwargs
            # hdfs_version='cdh4', hdfs_config=None, hdfs_name_node='172.16.1.176',

        nodeList = build_cloud_with_json(h2o_nodes_json=nodesJsonPathname)
        return nodeList

    # else
    # moved to here from unit_main. so will run with nosetests too!
    # Normally do this.
    # Don't if build_cloud_with_hosts() did and put a flatfile in there already!
    if init_sandbox:
        clean_sandbox()

    log("#*********************************************************************")
    log("Starting new test: " + h2o_args.python_test_name + " at build_cloud() ")
    log("#*********************************************************************")

    # start up h2o to report the java version (once). output to python stdout
    # only do this for regression testing
    # temporarily disable this, to go a little faster
    # if getpass.getuser() == 'jenkins':
    #     check_h2o_version()

    ports_per_node = 2
    nodeList = []
    # shift the port used to run groups of tests on the same machine at the same time?
    base_port = get_base_port(base_port)

    try:
        # if no hosts list, use psutil method on local host.
        totalNodes = 0
        # doing this list outside the loops so we can shuffle for better test variation
        # this jvm startup shuffle is independent from the flatfile shuffle
        portList = [base_port + ports_per_node * i for i in range(node_count)]
        if hosts is None:
            # if use_flatfile, we should create it
            # because tests will just call build_cloud with use_flatfile=True
            # best to just create it all the time..may or may not be used
            write_flatfile(node_count=node_count, base_port=base_port)
            hostCount = 1
            if rand_shuffle:
                random.shuffle(portList)
            for p in portList:
                verboseprint("psutil starting node", totalNodes)
                newNode = LocalH2O(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1
        else:
            # if hosts, the flatfile was created and uploaded to hosts already
            # I guess don't recreate it, don't overwrite the one that was copied beforehand.
            # we don't always use the flatfile (use_flatfile=False)
            # Suppose we could dispatch from the flatfile to match it's contents
            # but sometimes we want to test with a bad/different flatfile then we invoke h2o?
            hostCount = len(hosts)
            hostPortList = []
            for h in hosts:
                for port in portList:
                    hostPortList.append((h, port))
            if rand_shuffle:
                random.shuffle(hostPortList)
            for (h, p) in hostPortList:
                verboseprint('ssh starting node', totalNodes, 'via', h)
                newNode = h.remote_h2o(port=p, node_id=totalNodes, **kwargs)
                nodeList.append(newNode)
                totalNodes += 1

        verboseprint("Attempting Cloud stabilize of", totalNodes, "nodes on", hostCount, "hosts")
        start = time.time()
        # UPDATE: best to stabilize on the last node!
        stabilize_cloud(nodeList[0], nodeList,
            timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, noSandboxErrorCheck=True)
        verboseprint(len(nodeList), "Last added node stabilized in ", time.time() - start, " secs")
        verboseprint("Built cloud: %d nodes on %d hosts, in %d s" % \
            (len(nodeList), hostCount, (time.time() - start)))
        h2p.red_print("Built cloud:", nodeList[0].java_heap_GB, "GB java heap(s) with",
            len(nodeList), "total nodes")

        # FIX! using "consensus" in node[-1] should mean this is unnecessary?
        # maybe there's a bug. For now do this. long term: don't want?
        # UPDATE: do it for all cases now 2/14/13
        if conservative:  # still needed?
            for n in nodeList:
                stabilize_cloud(n, nodeList, timeoutSecs=timeoutSecs, noSandboxErrorCheck=True)

        # this does some extra checking now
        # verifies cloud name too if param is not None
        verify_cloud_size(nodeList, expectedCloudName=nodeList[0].cloud_name)

        # best to check for any errors due to cloud building right away?
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)

    except:
        # nodeList might be empty in some exception cases?
        # no shutdown issued first, though
        if cleanup and nodeList:
            for n in nodeList:
                n.terminate()
        check_sandbox_for_errors(python_test_name=h2o_args.python_test_name)
        raise

    print len(nodeList), "total jvms in H2O cloud"

    # put the test start message in the h2o log, to create a marker
    nodeList[0].h2o_log_msg()

    if h2o_args.config_json:
        LOG_DIR = get_sandbox_name()
        # like cp -p. Save the config file, to sandbox
        print "Saving the ", h2o_args.config_json, "we used to", LOG_DIR
        shutil.copy(h2o_args.config_json, LOG_DIR + "/" + os.path.basename(h2o_args.config_json))

    # Figure out some stuff about how this test was run
    cs_time = str(datetime.datetime.now())
    cs_cwd = os.getcwd()
    cs_python_cmd_line = "python %s %s" % (h2o_args.python_test_name, h2o_args.python_cmd_args)
    cs_python_test_name = h2o_args.python_test_name
    if h2o_args.config_json:
        cs_config_json = os.path.abspath(h2o_args.config_json)
    else:
        cs_config_json = None
    cs_username = h2o_args.python_username
    cs_ip = h2o_args.python_cmd_ip

    # dump the nodes state to a json file
    # include enough extra info to have someone
    # rebuild the cloud if a test fails that was using that cloud.
    if create_json:
        q = {
            'cloud_start': {
                'time': cs_time,
                'cwd': cs_cwd,
                'python_test_name': cs_python_test_name,
                'python_cmd_line': cs_python_cmd_line,
                'config_json': cs_config_json,
                'username': cs_username,
                'ip': cs_ip,
            },
            'h2o_nodes': h2o_util.json_repr(nodeList),
        }
        with open('h2o-nodes.json', 'w+') as f:
            f.write(json.dumps(q, indent=4))

    # save it to a local global copy, in case it's needed for tearDown
    h2o_nodes.nodes[:] = nodeList
    return nodeList
def test_exec2_enums_rand_cut(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()

    n = ROWS
    tryList = [
        (n, 10, 9, 'cE', 300),
    ]

    # create key names to use for exec
    eKeys = ['e%s' % i for i in range(10)]

    # h2b.browseTheCloud()
    trial = 0
    for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        colCount = iColCount + oColCount

        hex_key = 'p'
        colEnumList = create_col_enum_list(iColCount)

        # create 100 possible cut expressions here, so we don't waste time below
        rowExprList = []
        print "Creating", CUT_EXPR_CNT, 'cut expressions'
        for j in range(CUT_EXPR_CNT):
            # init cutValue. None means no compare
            cutValue = [None for i in range(iColCount)]
            # build up a random cut expression
            cols = random.sample(range(iColCount), random.randint(1, iColCount))
            for c in cols:
                # possible choices within the column
                cel = colEnumList[c]
                # for now the cutValues are numbers for the enum mappings
                # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like
                # celChoice = str(random.choice(range(len(cel))))
                celChoice = random.choice(range(len(cel)))
                cutValue[c] = celChoice

            cutExprList = []
            pKey = Key('p')
            for i, c in enumerate(cutValue):
                if c is None:
                    continue
                else:
                    # new ...ability to reference cols
                    # src[ src$age<17 && src$zip=95120 && ... , ]
                    # cutExprList.append('p$C'+str(i+1)+'=='+c)
                    # all column indexing in h2o-dev is with number
                    e = Fcn('==', c, pKey[:, i])
                    cutExprList.append(e)

            cutExpr = None
            for ce in cutExprList:
                if cutExpr:
                    cutExpr = Fcn('&', cutExpr, ce)
                else:
                    cutExpr = ce
            print "cutExpr:", cutExpr

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            # rowExpr = '%s[%s,];' % (hex_key, cutExpr)
            hKey = Key(hex_key)
            rowExpr = hKey[cutExpr, :]
            print "rowExpr:", rowExpr
            rowExprList.append(rowExpr)

        # CREATE DATASET*******************************************
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList)

        # PARSE*******************************************************
        parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)

        inspect = h2o_cmd.runInspect(key=parse_key)
        missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        # print h2o.dump_json(inspect)

        # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
        #     h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False)

        # error if any col has constant values
        # if len(constantValuesDict) != 0:
        #     raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

        # INIT all possible key names used***************************
        # remember. 1 indexing!

        # build up the columns
        Assign('b', [1, 2, 3])
        # could also append 1 col at a time, by assigning to the next col number?
        Assign('a', Cbind(['b' for i in range(colCount)]))
        for eKey in eKeys:
            Assign(eKey, 'a')
            ## print h2o.dump_json(e)

        xList = []
        eList = []
        fList = []
        for repeat in range(200):
            # EXEC*******************************************************
            # don't use exec_expr to avoid issues with Inspect following etc.
            randICol = random.randint(0, iColCount - 1)
            randOCol = random.randint(iColCount, iColCount + oColCount - 1)

            # should be two different keys in the sample
            e = random.sample(eKeys, 2)
            fKey = e[0]
            eKey = e[1]

            if 1 == 1:
                start = time.time()
                Assign(fKey, random.choice(rowExprList)).do()
                elapsed = time.time() - start
                execTime = elapsed
                print "exec 2 took", elapsed, "seconds."

            inspect = h2o_cmd.runInspect(key=fKey)
            missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)

            if numRows == 0 or numCols != colCount:
                h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols,
                    "cols. Quantile will abort")

            # FIX! put quantile back in?
            quantileTime = 0

            # remove all keys*******************************************************
            # what about hex_key?
            if 1 == 0:
                start = time.time()
                h2o.nodes[0].remove_all_keys()
                elapsed = time.time() - start
                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

            trial += 1
            xList.append(trial)
            eList.append(execTime)
            fList.append(quantileTime)

    # just get a plot of the last one (biggest)
    if DO_PLOT:
        xLabel = 'trial'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
def test_quant_cols(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()

    if getpass.getuser() == 'kevin':
        tryList = [
            ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300),
            (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300),
        ]
    else:
        tryList = [
            ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300),
        ]

    # h2b.browseTheCloud()
    trial = 0
    for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
        xList = []
        eList = []
        fList = []

        # PARSE*******************************************************
        parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=200, doSummary=False)
        csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
        print "Parse result['destination_key']:", parseResult['destination_key']

        inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        numRows = inspect['numRows']
        numCols = inspect['numCols']

        if not iColCount:
            iColCount = 0
        if not oColCount:
            oColCount = numCols
        colCount = iColCount + oColCount

        for i in range(0, numCols):
            print "Column", i, "summary"
            h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i)

        # print h2o.dump_json(inspect)
        levels = h2o.nodes[0].levels(source=hex_key)
        print "levels result:", h2o.dump_json(levels)

        (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
            h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        # error if any col has constant values
        if len(constantValuesDict) != 0:
            # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
            print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict

        # start after the last input col
        levels = h2o.nodes[0].levels(source=hex_key)
        l = levels['levels']
        for column in range(iColCount, iColCount + oColCount):
            if l[column]:
                print "Skipping", column, "because it's enum (says levels)"
                continue

            # QUANTILE*******************************************************
            quantile = 0.5 if DO_MEDIAN else .999
            # first output col. always fed by an exec cut, so 0?
            start = time.time()
            # file has headers. use col index
            q = h2o.nodes[0].quantiles(source_key=hex_key, column=column,
                quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1)
            qresult = q['result']
            h2p.red_print("result:", q['result'], "quantile", quantile,
                "interpolated:", q['interpolated'], "iterations", q['iterations'])
            elapsed = time.time() - start
            print "quantile end on ", hex_key, 'took', elapsed, 'seconds.'
            quantileTime = elapsed

            # don't do for enums
            # also get the median with a sort (h2o_summ.percentileOnSortedlist()
            if 1==0:
                h2o_summ.quantile_comparisons(
                    csvPathnameFull,
                    skipHeader=True,
                    col=column,  # what col to extract from the csv
                    datatype='float',
                    quantile=0.5 if DO_MEDIAN else 0.999,
                    # h2oSummary2=pctile[5 if DO_MEDIAN else 10],
                    # h2oQuantilesApprox=qresult_single,
                    h2oQuantilesExact=qresult,
                    use_genfromtxt=True,
                )

            trial += 1
            execTime = 0
            xList.append(column)
            eList.append(execTime)
            fList.append(quantileTime)

        # remove all keys*******************************************************
        # what about hex_key?
        if 1==0:
            start = time.time()
            h2o.nodes[0].remove_all_keys()
            elapsed = time.time() - start
            print "remove all keys end on took", elapsed, 'seconds.'

    #****************************************************************
    # PLOTS. look for eplot.jpg and fplot.jpg in local dir?
    if DO_PLOT:
        xLabel = 'column (0 is first)'
        eLabel = 'exec cut time'
        fLabel = 'quantile time'
        eListTitle = ""
        fListTitle = ""
        h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def find_folder_and_filename(bucket, pathWithRegex, schema='put', returnFullPath=False): checkPath = True # strip the common mistake of leading "/" in path, if bucket is specified too giveUpAndSearchLocally = False if bucket is not None and re.match("/", pathWithRegex): h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", pathWithRegex) pathWithRegex = pathWithRegex.lstrip('/') if bucket is None: # good for absolute path name bucketPath = "" elif bucket == ".": bucketPath = os.getcwd() # only use if the build_cloud was for remote H2O # Never use the var for remote, if you're doing a put! (which always sources local) elif h2o.nodes[0].remoteH2O and schema!='put' and \ (os.environ.get('H2O_REMOTE_BUCKETS_ROOT') or h2o.nodes[0].h2o_remote_buckets_root): if (bucket=='smalldata' or bucket=='datasets') and schema=='local': msg1 = "\nWARNING: you're using remote nodes, and 'smalldata' or 'datasets' git buckets, with schema!=put" msg2 = "\nThose aren't git pull'ed by the test. Since they are user-maintained, not globally-maintained-by-0xdata," msg3 = "\nthey may be out of date at those remote nodes?" msg4 = "\nGoing to assume we find a path to them locally, and remote path will be the same" h2p.red_print(msg1, msg2, msg3, msg4) giveUpAndSearchLocally = True else: if os.environ.get('H2O_REMOTE_BUCKETS_ROOT'): rootPath = os.environ.get('H2O_REMOTE_BUCKETS_ROOT') print "Found H2O_REMOTE_BUCKETS_ROOT:", rootPath else: rootPath = h2o.nodes[0].h2o_remote_buckets_root print "Found h2o_nodes[0].h2o_remote_buckets_root:", rootPath bucketPath = os.path.join(rootPath, bucket) checkPath = False # does it work to use bucket "." to get current directory # this covers reote with put too elif os.environ.get('H2O_BUCKETS_ROOT'): rootPath = os.environ.get('H2O_BUCKETS_ROOT') print "Using H2O_BUCKETS_ROOT environment variable:", rootPath if not (os.path.exists(rootPath)): raise Exception("H2O_BUCKETS_ROOT in env but %s doesn't exist." % rootPath) bucketPath = os.path.join(rootPath, bucket) if not (os.path.exists(bucketPath)): raise Exception("H2O_BUCKETS_ROOT and path used to form %s which doesn't exist." % bucketPath) else: giveUpAndSearchLocally = True #****************************************************************************************** if giveUpAndSearchLocally: # if we run remotely, we're assuming the import folder path on the remote machine # matches what we find on our local machine. But maybe the local user doesn't exist remotely # so using his path won't work. # Resolve by looking for special state in the config. If user = 0xdiag, just force the bucket location # This is a lot like knowing about fixed paths with s3 and hdfs # Otherwise the remote path needs to match the local discovered path. # want to check the username being used remotely first. should exist here too if going to use username = getpass.getuser() h2oUsername = h2o.nodes[0].username h2o.verboseprint("username:"******"h2oUsername:"******"datasets" is special. Don't want to find it in /home/0xdiag/datasets # needs to be the git clone 'datasets'. Find it by walking upwards below # disable it from this looking in home dir. Could change priority order? # resolved in order, looking for bucket (ln -s will work) in these home dirs. 
if bucket=='datasets': # special case possibleUsers = [] elif h2oUsername != username: possibleUsers = [username, h2oUsername, "0xdiag"] else: possibleUsers = [username, "0xdiag"] for u in possibleUsers: rootPath = os.path.expanduser("~" + u) bucketPath = os.path.join(rootPath, bucket) h2o.verboseprint("Checking bucketPath:", bucketPath, 'assuming home is', rootPath) if os.path.exists(bucketPath): h2o.verboseprint("search A did find", bucket, "at", rootPath) break else: # last chance to find it by snooping around rootPath = os.getcwd() h2o.verboseprint("find_bucket looking upwards from", rootPath, "for", bucket) # don't spin forever levels = 0 while not (os.path.exists(os.path.join(rootPath, bucket))): h2o.verboseprint("Didn't find", bucket, "at", rootPath) rootPath = os.path.split(rootPath)[0] levels += 1 if (levels==6): raise Exception("unable to find bucket: %s. Maybe missing link in /home/0xdiag or /home/0xcustomer or jenkins ~? or whatever user is running the python or the h2o?" % bucket) h2o.verboseprint("search B did find", bucket, "at", rootPath) bucketPath = os.path.join(rootPath, bucket) #****************************************************************************************** # if there's no path, just return the bucketPath # but what about cases with a header in the folder too? (not putfile) if pathWithRegex is None: if returnFullPath: return bucketPath else: return (bucketPath, None) # if there is a "/" in the path, that means it's not just a pattern # split it # otherwise it is a pattern. use it to search for files in python first? # FIX! do that later elif "/" in pathWithRegex: (head, tail) = os.path.split(pathWithRegex) folderPath = os.path.abspath(os.path.join(bucketPath, head)) # accept all 0xcustomer-datasets without checking..since the current python user # may not have permission, but h2o will # try a couple times with os.stat in between, in case it's not automounting if '/mnt/0xcustomer-datasets' in folderPath: pass else: retry = 0 while checkPath and (not os.path.exists(folderPath)) and retry<5: # we can't stat an actual file, because we could have a regex at the end of the pathname print "Retrying", folderPath, "in case there's a autofs mount problem" os.stat(folderPath) retry += 1 time.sleep(1) if checkPath and not os.path.exists(folderPath): raise Exception("%s doesn't exist. %s under %s may be wrong?" % (folderPath, head, bucketPath)) else: folderPath = bucketPath tail = pathWithRegex h2o.verboseprint("folderPath:", folderPath, "tail:", tail) if returnFullPath: return os.path.join(folderPath, tail) else: return (folderPath, tail)
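# Illustrative usage sketch (not from the original tests) for find_folder_and_filename above.
# It assumes the function is exposed as h2i.find_folder_and_filename, as the tests in this
# file call it, and that a 'home-0xdiag-datasets' bucket can actually be found locally
# (a home dir, H2O_BUCKETS_ROOT, or by walking up from the cwd); otherwise it raises, by design.
import h2o_import as h2i  # assumption: the module that exports find_folder_and_filename

# returnFullPath=True gives one absolute path string, handy for local reads (numpy, csv, ...)
csvPathnameFull = h2i.find_folder_and_filename(
    'home-0xdiag-datasets', 'airlines/year2013.csv', schema='put', returnFullPath=True)
print "full path:", csvPathnameFull

# returnFullPath=False gives (folderPath, tail); tail may still be a pattern h2o expands on import
(folderPath, pattern) = h2i.find_folder_and_filename(
    'home-0xdiag-datasets', 'airlines/year*.csv', schema='put', returnFullPath=False)
print "folder:", folderPath, "pattern:", pattern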
def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, raiseIfNon200=True, suppressErrorMsg=False, **kwargs): H2O.verboseprint("__do_json_request, timeout: " + str(timeout)) # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.__url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params_serialized = params.copy() for k in params_serialized: if params_serialized[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' # The requests package takes array parameters and explodes them: ['f00', 'b4r'] becomes "f00,b4r". # NOTE: this handles 1D arrays only; if we need ND this needs to be recursive. # NOTE: we currently don't need to do this for GET, so that's not implemented. if postData is not None: munged_postData = {} for k, v in postData.iteritems(): if type(v) is list: if len(v) == 0: munged_postData[k] = '[]' else: first = True array_str = '[' for val in v: if not first: array_str += ', ' if val is None: array_str += 'null' elif isinstance(val, basestring): array_str += "\"" + str(val) + "\"" else: array_str += str(val) first = False array_str += ']' munged_postData[k] = array_str elif type(v) is dict: if len(v) == 0: munged_postData[k] = '{}' else: first = True map_str = '{' for key, val in v.iteritems(): if not first: map_str += ', ' if val is None: map_str += "\"" + key + "\"" + ': null' elif isinstance(val, basestring): map_str += "\"" + str(key) + "\"" + ":" + "\"" + str(val) + "\"" else: map_str += "\"" + key + "\"" + ':' + str(val) first = False map_str += '}' munged_postData[k] = map_str else: # not list: munged_postData[k] = v else: # None munged_postData = postData # print("munged_postData: " + repr(munged_postData)) if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest("----------------------------------------------------------------------\n") if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(munged_postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=munged_postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception as e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. 
# (this is new/experimental) exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) H2O.check_sandbox_for_errors(python_test_name=H2O.python_test_name); log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise (exc_info[1], None, exc_info[2]) H2O.verboseprint("r: " + repr(r)) if 200 != r.status_code: pp = pprint.PrettyPrinter(indent=4) msg = "JSON call returned non-200 status: " + url json = r.json() if None != json and 'dev_msg' in json: msg += "\ndev_msg: " msg += str(json['dev_msg']) msg += "\nr.status_code: " + str(r.status_code) msg += "\nr.headers: " + repr(r.headers) if None == json: msg += '\nERROR: the error output from H2O is not JSON!' msg += "\nr.text: " + r.text else: msg += "\nr.json: " msg += pp.pformat(json) if raiseIfNon200: pass # we'll pass msg up with the exception elif not suppressErrorMsg: print(msg) log_rest(msg) log_rest("") try: if r is None: log_rest("r is None") else: log_rest("HTTP status code: " + str(r.status_code)) # The following accesses to r.text were taking most of the runtime: log_text = False if log_text: if hasattr(r, 'text'): if r.text is None: log_rest("r.text is None") else: log_rest(r.text) else: log_rest("r does not have attr text") except Exception as e: # Paranoid exception catch. # Ignore logging exceptions in the case that the above error checking isn't sufficient. print("Caught exception from result logging: ", e, "; result: ", repr(r)) # fatal if no response if raiseIfNon200 and not r: raise Exception("Maybe bad url? no r in __do_json_request in %s:" % inspect.stack()[1][3] + "\n\n" + msg) # this is used to open a browser on results, or to redo the operation in the browser # we don't' have that may urls flying around, so let's keep them all H2O.json_url_history.append(r.url) # if r.json(): # raise Exception("Maybe bad url? no r.json in __do_json_request in %s:" % inspect.stack()[1][3]) rjson = None if returnFast: return try: rjson = r.json() except: print(h2o_test_utils.dump_json(r.text)) if not isinstance(r, (list, dict)): raise Exception("h2o json responses should always be lists or dicts, see previous for text") raise Exception("Could not decode any json from the request.") # TODO # TODO # TODO # TODO: we should really only look in the response object. This check # prevents us from having a field called "error" (e.g., for a scoring result). for e in ['error', 'Error', 'errors', 'Errors']: # error can be null (python None). This happens in exec2 if e in rjson and rjson[e]: H2O.verboseprint("rjson:" + h2o_test_utils.dump_json(rjson)) emsg = 'rjson %s in %s: %s' % (e, inspect.stack()[1][3], rjson[e]) if ignoreH2oError: # well, we print it..so not totally ignore. test can look at rjson returned print(emsg) else: print(emsg) raise Exception(emsg) for w in ['warning', 'Warning', 'warnings', 'Warnings']: # warning can be null (python None). if w in rjson and rjson[w]: H2O.verboseprint(dump_json(rjson)) print('rjson %s in %s: %s' % (w, inspect.stack()[1][3], rjson[w])) # Allow the caller to check things like __http_request.status_code. # The response object is not JSON-serializable, so we capture the fields we want here: response = {} # response['headers'] = r.headers response['url'] = r.url response['status_code'] = r.status_code response['text'] = r.text rjson['__http_response'] = response return rjson
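# Small standalone sketch (not part of the H2O class) of the postData "munging" done in
# __do_json_request above: because the request is form-encoded rather than JSON-in-the-body,
# list and dict values are flattened into bracketed strings that the h2o-dev side parses back.
# The helper name and exact spacing here are illustrative only.
def munge_post_value(v):
    # lists become "[a, b, ...]" and dicts become '{"k": v, ...}'; scalars pass through
    if isinstance(v, list):
        return '[' + ', '.join(
            'null' if x is None else '"%s"' % x if isinstance(x, basestring) else str(x)
            for x in v) + ']'
    if isinstance(v, dict):
        return '{' + ', '.join(
            '"%s": %s' % (k, 'null' if x is None
                          else '"%s"' % x if isinstance(x, basestring) else str(x))
            for k, x in v.iteritems()) + '}'
    return v

print munge_post_value([3, None, 'abc'])           # [3, null, "abc"]
print munge_post_value({'ignored_columns': 'C1'})  # {"ignored_columns": "C1"}
print munge_post_value(42)                         # 42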
def do_statsmodels_glm(self, bucket, csvPathname, L, family='gaussian'): h2p.red_print("Now doing statsmodels") h2p.red_print( "http://statsmodels.sourceforge.net/devel/glm.html#module-reference") h2p.red_print( "http://statsmodels.sourceforge.net/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html" ) import numpy as np import scipy as sp from numpy import loadtxt import statsmodels.api as sm_api csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if 1 == 1: dataset = np.loadtxt( open(csvPathnameFull, 'r'), skiprows=1, # skip the header delimiter=',', dtype='float') # skipping cols from the beginning... (ID is col 1) # In newer versions of Numpy, np.genfromtxt can take an iterable argument, # so you can wrap the file you're reading in a generator that generates lines, # skipping the first N columns. If your numbers are comma-separated, that's something like if 1 == 0: f = open(csvPathnameFull, 'r') np.genfromtxt( (",".join(ln.split()[1:]) for ln in f), skiprows=1, # skip the header delimiter=',', dtype='float') print "\ncsv read for training, done" # data is last column # drop the output n_features = len(dataset[0]) - 1 print "n_features:", n_features # don't want ID (col 0) or CAPSULE (col 1) # get CAPSULE target = [x[1] for x in dataset] # slice off the first 2 train = np.array([x[2:] for x in dataset]) n_samples, n_features = train.shape print "n_samples:", n_samples, "n_features:", n_features print "histogram of target" print sp.histogram(target, 3) print "len(train):", len(train) print "len(target):", len(target) print "dataset shape:", dataset.shape if family != 'gaussian': raise Exception("Only gaussian family is implemented here (statsmodels GLM)") # train the classifier gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian( sm_api.families.links.log)) start = time.time() gauss_log_results = gauss_log.fit() print "sm_api.GLM took", time.time() - start, "seconds" print gauss_log_results.summary()

def test_quant_cols(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() if getpass.getuser() == 'kevin': tryList = [ (None, '/home/kevin/Downloads/t.csv', 15, 11, 'cE', 300), ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] else: tryList = [ ('home-0xdiag-datasets', 'airlines/year2013.csv', None, None, 'cE', 300), ] # h2b.browseTheCloud() trial = 0 for (bucket, csvPathname, iColCount, oColCount, hex_key, timeoutSecs) in tryList: xList = [] eList = [] fList = [] # PARSE******************************************************* parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=200, doSummary=False) csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) numRows = inspect['numRows'] numCols = inspect['numCols'] if not oColCount: iColCount = 0 if not oColCount: oColCount = numCols colCount = iColCount + oColCount for i in range(0, numCols): print "Column", i, "summary" h2o_cmd.runSummary(key=hex_key, max_qbins=1, cols=i) # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) # print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) print "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict # start after the last input col levels = h2o.nodes[0].levels(source=hex_key) l = levels['levels'] for column in range(iColCount, iColCount + oColCount): if l[column]: print "Skipping", column, "because it's enum (says levels)" continue # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? start = time.time() # file has headers. use col index q = h2o.nodes[0].quantiles(source_key=hex_key, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=1) qresult = q['result'] h2p.red_print("result:", q['result'], "quantile", quantile, "interpolated:", q['interpolated'], "iterations", q['iterations']) elapsed = time.time() - start print "quantile end on ", hex_key, 'took', elapsed, 'seconds.' quantileTime = elapsed # don't do for enums # also get the median with a sort (h2o_summ.percentileOnSortedlist() if 1 == 1: h2o_summ.quantile_comparisons( csvPathnameFull, skipHeader=True, col=column, # what col to extract from the csv datatype='float', quantile=0.5 if DO_MEDIAN else 0.999, # h2oSummary2=pctile[5 if DO_MEDIAN else 10], # h2oQuantilesApprox=qresult_single, h2oQuantilesExact=qresult, use_genfromtxt=True, ) trial += 1 execTime = 0 xList.append(column) eList.append(execTime) fList.append(quantileTime) # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on took", elapsed, 'seconds.' #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'column (0 is first)' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def build_cloud_with_json(h2o_nodes_json="h2o-nodes.json"): # local sandbox may not exist. Don't clean if it does, just append if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) log("#*********************************************************************") log("Starting new test: " + h2o_args.python_test_name + " at build_cloud_with_json()") log("#*********************************************************************") print "This only makes sense if h2o is running as defined by", h2o_nodes_json print "For now, assuming it's a cloud on this machine, and here's info on h2o processes running here" print "No output means no h2o here! Some other info about stuff on the system is printed first though." import h2o_os_util if not os.path.exists(h2o_nodes_json): raise Exception("build_cloud_with_json: Can't find " + h2o_nodes_json + " file") ## h2o_os_util.show_h2o_processes() with open(h2o_nodes_json, "rb") as f: cloneJson = json.load(f) # These are supposed to be in the file. # Just check the first one. if not there, the file must be wrong if not "cloud_start" in cloneJson: raise Exception("Can't find 'cloud_start' in %s, wrong file? h2o-nodes.json?" % h2o_nodes_json) else: cs = cloneJson["cloud_start"] print "Info on the how the cloud we're cloning was started (info from %s)" % h2o_nodes_json # required/legal values in 'cloud_start'. A robust check is good for easy debug when we add stuff valList = ["time", "cwd", "python_test_name", "python_cmd_line", "config_json", "username", "ip"] for v in valList: if v not in cs: raise Exception("Can't find %s in %s, wrong file or version change?" % (v, h2o_nodes_json)) print "cloud_start['%s']: %s" % (v, cs[v]) # this is the internal node state for python..nodes rebuild nodeStateList = cloneJson["h2o_nodes"] nodeList = [] if not nodeStateList: raise Exception("nodeStateList is empty. %s file must be empty/corrupt" % h2o_nodes_json) try: for nodeState in nodeStateList: print "Cloning state for node", nodeState["node_id"], "from", h2o_nodes_json newNode = ExternalH2O(nodeState) nodeList.append(newNode) # If it's an existing cloud, it may already be locked. so never check. # we don't have the cloud name in the -ccj since it may change (and the file be static?) # so don't check expectedCloudName verify_cloud_size(nodeList, expectedCloudName=None, expectedLocked=None) # best to check for any errors right away? # (we won't report errors from prior tests due to marker stuff? ## check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) # put the test start message in the h2o log, to create a marker nodeList[0].h2o_log_msg() except: # nodeList might be empty in some exception cases? # no shutdown issued first, though ## if cleanup and nodeList: ## for n in nodeList: n.terminate() check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) raise # like cp -p. Save the config file, to sandbox print "Saving the ", h2o_nodes_json, "we used to", LOG_DIR shutil.copy(h2o_nodes_json, LOG_DIR + "/" + os.path.basename(h2o_nodes_json)) print "" h2p.red_print("Ingested from json:", nodeList[0].java_heap_GB, "GB java heap(s) with", len(nodeList), "total nodes") print "" # save it to a global copy, in case it's needed for tearDown h2o_nodes.nodes[:] = nodeList return nodeList
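# Hedged sketch of the minimal h2o-nodes.json shape that build_cloud_with_json above will
# accept: it only insists on a "cloud_start" section containing the keys in valList and a
# non-empty "h2o_nodes" list. The per-node fields shown here (other than node_id, which the
# loader prints) are illustrative guesses; ExternalH2O may require more state than this.
import json

example_nodes_json = {
    "cloud_start": {
        "time": "2014-01-01 12:00:00",
        "cwd": "/home/0xdiag/h2o/py",
        "python_test_name": "test_quant_cols",
        "python_cmd_line": "python test_quant_cols.py",
        "config_json": None,
        "username": "0xdiag",
        "ip": "192.168.1.20",
    },
    "h2o_nodes": [
        {"node_id": 0, "ip": "192.168.1.20", "port": 54321},
        {"node_id": 1, "ip": "192.168.1.21", "port": 54321},
    ],
}
print json.dumps(example_nodes_json, indent=2)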
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oSummary2MaxErr=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, h2oExecQuantiles=None, interpolate='linear', quantile=0.50, use_genfromtxt=False): SCIPY_INSTALLED = True try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = False if not SCIPY_INSTALLED: return if use_genfromtxt: print "Using numpy.genfromtxt. Better handling of null bytes" target = np.genfromtxt(open(csvPathname, 'r'), delimiter=',', skip_header=1 if skipHeader else 0, dtype=None) # guess! # print "shape:", target.shape() else: print "Using python csv reader" target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype == 'float': # to make irene's R runif files first col work (quoted row numbers, integers #shouldn't hurt anyone else? # strip " from left (ignore leading whitespace # strip " from right (ignore leading whitespace targetFP = map(float, target) # targetFP= np.array(tFP, np.float) if datatype == 'int': targetFP = map(int, target) # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 p = np.percentile(targetFP, quantile * 100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile * 100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1 == 0: # type 6 alphap = 0 betap = 0 # type 5 okay but not perfect alphap = 0.5 betap = 0.5 # type 8 alphap = 1 / 3.0 betap = 1 / 3.0 if interpolate == 'mean': # an approx? (was good when comparing to h2o type 2) alphap = 0.4 betap = 0.4 if interpolate == 'linear': # this is type 7 alphap = 1 betap = 1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedlist() # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:"******"from h2o singlepass:"******"from h2o exec:", h2oExecQuantiles) # they should be identical. 
keep a tight absolute tolerance # Note the comparisons have different tolerances, some are relative, some are absolute if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual( h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo') if h2oQuantilesApprox: # this can be NaN if we didn't calculate it. turn the NaN string into a float NaN if math.isnan(float(h2oQuantilesApprox)): raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) if h2oSummary2MaxErr: h2o_util.assertApproxEqual( h2oQuantilesApprox, b, tol=h2oSummary2MaxErr, msg='h2o quantile singlepass is not approx. same as sort algo') else: h2o_util.assertApproxEqual( h2oQuantilesApprox, b, rel=0.1, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) if h2oSummary2MaxErr: # maxErr absolute was calculated in the test from 0.5*(max-min/(max_qbins-2)) h2o_util.assertApproxEqual( h2oSummary2, b, tol=h2oSummary2MaxErr, msg= 'h2o summary2 is not approx. same as sort algo (calculated expected max error)' ) else: # bounds are way off, since it depends on the min/max of the col, not the expected value h2o_util.assertApproxEqual( h2oSummary2, b, rel=1.0, msg= 'h2o summary2 is not approx. same as sort algo (sloppy compare)' ) if h2oQuantilesApprox and h2oSummary2: # they should both get the same answer. Currently they have different code, but same algo # FIX! ...changing to a relative tolerance, since we're getting a miscompare in some cases. # not sure why..maybe some subtle algo diff. h2o_util.assertApproxEqual(h2oSummary2, h2oQuantilesApprox, rel=0.04, msg='h2o summary2 is not approx. same as h2o singlepass.'+\ ' Check that max_qbins is 1000 (summary2 is fixed) and type 7 interpolation') if h2oExecQuantiles: if math.isnan(float(h2oExecQuantiles)): raise Exception("h2oExecQuantiles is unexpectedly NaN %s" % h2oExecQuantiles) # bounds are way off h2o_util.assertApproxEqual( h2oExecQuantiles, b, rel=1.0, msg='h2o summary2 is not approx. same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesExact: h2o_util.assertApproxEqual( h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual( h2oQuantilesExact, s1, tol=0.0000002, msg= 'h2o quantile multipass is not same as scipy stats.scoreatpercentile' ) # give us some slack compared to the scipy use of median (instead of desired mean) # since we don't have bounds here like above, just stop this test for now if h2oQuantilesApprox and 1 == 0: if interpolate == 'mean': h2o_util.assertApproxEqual( h2oQuantilesApprox, s2, rel=0.5, msg= 'h2o quantile singlepass is not approx. same as scipy stats.mstats.mquantiles' ) else: h2o_util.assertApproxEqual( h2oQuantilesApprox, s2, rel=0.5, msg= 'h2o quantile singlepass is not same as scipy stats.mstats.mquantiles' ) # see if scipy changes. nope. it doesn't if 1 == 0: a = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
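# Standalone sketch (not part of quantile_comparisons) of the interpolation variants that
# function compares. The alphap/betap pairs follow the comments above: (1, 1) is the R/numpy
# type 7 linear interpolation, while (0.4, 0.4) is used there as a rough stand-in for the
# type-2 "mean" behavior. Requires numpy and scipy, same as the guarded import above.
import numpy as np
from scipy import stats

targetFP = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
quantile = 0.5

print "numpy.percentile (type 7):", np.percentile(targetFP, quantile * 100)
print "scoreatpercentile        :", stats.scoreatpercentile(targetFP, quantile * 100)
print "mquantiles type 7        :", stats.mstats.mquantiles(targetFP, prob=quantile, alphap=1, betap=1)[0]
print "mquantiles ~mean         :", stats.mstats.mquantiles(targetFP, prob=quantile, alphap=0.4, betap=0.4)[0]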
thresholds = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] #***************************************************************** # h2o #***************************************************************** d = target # target = dataset dmin = min(d) dmax = max(d) thresholdList = [OTHER_T] quantiles = findQuantileList(d, dmin, dmax, thresholdList) h2p.red_print('\nthis b result:', quantiles) #***************************************************************** # for comparison #***************************************************************** # perPrint = ["%.2f" % v for v in a] # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 # type 6 alphap=0 betap=0 # type 5 okay but not perfect alphap=0.5 betap=0.5
def findQuantile(d, dmin, dmax, threshold): # return the value at the threshold, or the mean of the two rows that bound it. # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer maxIterations = 30 # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere) totalRows = len(d) # Used to have desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # initialize newValStart = dmin newValEnd = dmax newValRange = newValEnd - newValStart desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine. newValBinSize = newValRange / (desiredBinCnt + 0.0) newLowCount = 0 # count of rows below the bins # yes there is no newHighCount. Created during the pass, though. # state shared by each pass assert maxBinCnt > 0 hcnt2 = [None for b in range(maxBinCnt)] hcnt2_min = [None for b in range(maxBinCnt)] hcnt2_max = [None for b in range(maxBinCnt)] hcnt2_low = 0 hcnt2_high = 0 assert newValBinSize != 0 # can be negative assert newValEnd > newValStart assert newValRange > 0 # break out on stopping condition # reuse the histogram array hcnt2[] iteration = 0 done = False # append to a list of best guesses per pass best_result = [] def htot2(): return sum(hcnt2) + hcnt2_low + hcnt2_high while iteration <= maxIterations and not done: h2p.green_print("newValStart", newValStart) h2p.green_print("newValEnd", newValEnd) h2p.green_print("newValRange", newValRange) h2p.green_print("newValBinSize", newValBinSize) h2p.green_print("newLowCount", newLowCount) h2p.green_print("threshold", threshold) valStart = newValStart valEnd = newValEnd valRange = newValRange valBinSize = newValBinSize lowCount = newLowCount desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # playing with creating relative NUDGE values to make sure bin range # is always inclusive of target. # ratio it down from valBinSize. # It doesn't need to be as big as valBinSize. # implicitly, it shouldn't need to be as large as valBinSize # can't seem to make it work yet. leave NUDGE=0 NUDGE = 0 # init to zero for each pass for b in range(maxBinCnt): hcnt2[b] = 0.0 # Init counts outside of the bins hcnt2_low = 0 hcnt2_high = 0 # minimum value for higher than the bin. Needed for interpolation hcnt2_high_min = None for val in d: # Need to count the stuff outside the bin-gathering, # since threshold compare is based on total row compare # on first pass, shouldn't see anything exceed the start/end bounds # since those are min/max for the column? (shouldn't be any fp precision issue? or ??) # oh wait, this valOffset math creates possible precision issue? # maybe we should address it with the NUDGE value below? but what about first pass? valOffset = val - valStart # where are we zeroing in? (start) binIdx2 = int(math.floor(valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide? # do some close looking for possible fp arith issues cA = valOffset < 0 cB = binIdx2 < 0 t = {True: 1, False: 0} # we get the 10 case if ((cA and not cB) or (not cA and cB)): h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffSet", valOffSet, \ "binIdx2", binIdx2) cC = val > valEnd cD = binIdx2 >= (maxBinCnt-1) # tighten the compare for printing if ((cC and not cD) or (not cC and cD)): h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cB", cD, "val", val, "valEnd", valEnd, \ "binIdx2", binIdx2, "maxBinCnt", maxBinCnt) # example hits this case..i.e. 
the max value # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3 if valOffset < 0 or binIdx2<0: # if valOffset < 0: # if binIdx2<0: hcnt2_low += 1 # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure! # have to use both compares, since can wrap the index (due to start/end shift) # elif val > valEnd or binIdx2>=(maxBinCnt-1): # should this really be a valOffset compare? elif val > valEnd or binIdx2 >= maxBinCnt: # elif val > valEnd: # elif binIdx2>=(maxBinCnt-1): if (hcnt2_high==0) or (val < hcnt2_high_min): hcnt2_high_min = val; print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd hcnt2_high += 1 else: # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \ (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize) if hcnt2[binIdx2]==0 or (val < hcnt2_min[binIdx2]): hcnt2_min[binIdx2] = val; if hcnt2[binIdx2]==0 or (val > hcnt2_max[binIdx2]): hcnt2_max[binIdx2] = val; hcnt2[binIdx2] += 1 # check if we went into the magic extra bin if binIdx2 == (maxBinCnt-1): print "\nFP! val went into the extra maxBinCnt bin:", \ binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n" # check the legal states for these two # we don't have None for checking hcnt2_high_min in java assert hcnt2_high==0 or (hcnt2_high_min is not None) assert (hcnt2_high_min is None) or hcnt2_high!=0 # everything should either be in low, the bins, or high totalBinnedRows = htot2() print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) # now walk thru and find out what bin to look inside currentCnt = hcnt2_low targetCntFull = threshold * (totalRows-1) # zero based indexing targetCntInt = int(math.floor(threshold * (totalRows-1))) targetCntFract = targetCntFull - targetCntInt assert targetCntFract>=0 and targetCntFract<=1 print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract k = 0 while ((currentCnt + hcnt2[k]) <= targetCntInt): # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] currentCnt += hcnt2[k] # ugly but have to break out if we'd cycle along with == adding h0's until we go too far # are we supposed to advance to a none zero bin? k += 1 # goes over in the equal case? # if currentCnt >= targetCntInt: # break if k==maxBinCnt: break assert k<maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % (k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k-1]) # format string to match java Log.info() in Quantiles.java print "Found k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] assert hcnt2[k]!=1 or hcnt2_min[k]==hcnt2_max[k] # some possibily interpolating guesses first, in guess we have to iterate (best guess) done = False guess = (hcnt2_max[k] - hcnt2_min[k]) / 2 if currentCnt==targetCntInt: if hcnt2[k]>2 and (hcnt2_min[k]==hcnt2_max[k]): guess = hcnt2_min[k] print "Guess A", guess, k, hcnt2[k] if hcnt2[k]==2: print "\nTwo values in this bin but we could be aligned to the 2nd. 
so can't stop" # no mattter what size the fraction it would be on this number guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 # no mattter what size the fraction it would be on this number if INTERPOLATION_TYPE==2: # type 2 (mean) guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 else: # default to type 7 (linear interpolation) # Unlike mean, which just depends on two adjacent values, this adjustment # adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[k] # two adjacent..as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_min[k] + (pctDiff * dDiff) done = False print "Guess B", guess if hcnt2[k]==1 and targetCntFract==0: assert hcnt2_min[k]==hcnt2_max[k] guess = hcnt2_min[k] done = True print "k", k print "Guess C", guess if hcnt2[k]==1 and targetCntFract!=0: assert hcnt2_min[k]==hcnt2_max[k] print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero" if k<maxBinCnt: nextK = k + 1 # could put it over maxBinCnt else: nextK = k while nextK<maxBinCnt and hcnt2[nextK]==0: nextK += 1 # have the "extra bin" for this if nextK >= maxBinCnt: assert hcnt2_high!=0 print "Using hcnt2_high_min for interpolate:", hcnt2_high_min nextVal = hcnt2_high_min else: print "Using nextK for interpolate:", nextK assert hcnt2[nextK]!=0 nextVal = hcnt2_min[nextK] guess = (hcnt2_max[k] + nextVal) / 2.0 # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK if INTERPOLATION_TYPE==2: # type 2 (mean) guess = (hcnt2_max[k] + nextVal) / 2.0 pctDiff = 0.5 else: # default to type 7 (linear interpolation) dDiff = nextVal - hcnt2_max[k] # two adjacent, as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_max[k] + (pctDiff * dDiff) done = True # has to be one above us when needed. (or we're at end) print 'k', 'hcnt2_max[k]', 'nextVal' print "hello3:", k, hcnt2_max[k], nextVal print "\nInterpolating result using nextK: %s nextVal: %s" % (nextK, nextVal) print "Guess D", guess if not done: print "Not done, setting new range",\ "k: ", k,\ "currentCnt: ", currentCnt,\ "hcnt2_min[k]: ", hcnt2_min[k],\ "hcnt2_max[k]: ", hcnt2_max[k] # possible bin leakage at start/end edges due to fp arith. # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you. # Just need to check the one bin below and above k, if they exist. if k > 0 and hcnt2[k-1]>0 and (hcnt2_max[k-1]<hcnt2_min[k]): newValStart = hcnt2_max[k-1] else: newValStart = hcnt2_min[k] # subtle. we do put stuff in the extra end bin (see the print above that happens) # k might be pointing to one less than that (like k=0 for 1 bin case) if k < maxBinCnt and hcnt2[k+1]>0 and (hcnt2_min[k+1]>hcnt2_max[k]): print "hello" newValEnd = hcnt2_min[k+1] else: newValEnd = hcnt2_max[k] newValRange = newValEnd - newValStart # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues? newValBinSize = newValRange / (desiredBinCnt + 0.0) # the start/end should never change if we're just using one bin # this is a bin leakage test, if you use one bin. 
(we should never resolve exactly stop at max iterations # assumes NUDGE is 0 if NUDGE == 0.0: assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\ "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd) newLowCount = currentCnt if newValBinSize==0: # assert done or newValBinSize!=0 and live with current guess print "Assuming done because newValBinSize is 0." print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\ (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k]) guess = newValStart print "Guess E", guess done = True # if we have to interpolate # if it falls into this bin, interpolate to this bin means one answer? # cover the case above with multiple entris in a bin, all the same value # will be zero on the last pass? # assert newValBinSize != 0 or done # need the count up to but not including newValStart best_result.append(guess) iteration += 1 h2p.blue_print("Ending Pass", iteration) h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k]) print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high print "was", valStart, valEnd, valRange, valBinSize print "next", newValStart, newValEnd, newValRange, newValBinSize return best_result[-1]
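# Hedged usage sketch for findQuantile above. It assumes the module globals the function
# reads (BIN_COUNT, INTERPOLATION_TYPE) are set, e.g. BIN_COUNT = 1000 and
# INTERPOLATION_TYPE = 7, and that h2p is importable for the colored prints. The comparison
# against a plain sorted-list median is only a sanity check, not part of the original code.
import random

def sanity_check_median(rows=10001, seed=42):
    random.seed(seed)
    d = [random.uniform(0, 100) for _ in range(rows)]
    d.sort()
    exact = d[(rows - 1) / 2]          # odd row count: true median, no interpolation needed
    approx = findQuantile(d, d[0], d[-1], 0.5)
    print "sorted-list median:", exact, "findQuantile:", approx
    return abs(exact - approx)

# sanity_check_median()   # expected to agree to within fp noise for threshold=0.5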
def do_statsmodels_glm(self, bucket, csvPathname, L, family='gaussian'): h2p.red_print("Now doing statsmodels") h2p.red_print("http://statsmodels.sourceforge.net/devel/glm.html#module-reference") h2p.red_print("http://statsmodels.sourceforge.net/devel/generated/statsmodels.genmod.generalized_linear_model.GLM.html") import numpy as np import scipy as sp from numpy import loadtxt import statsmodels.api as sm_api csvPathnameFull = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True) if 1==1: dataset = np.loadtxt( open(csvPathnameFull,'r'), skiprows=1, # skip the header delimiter=',', dtype='float'); # skipping cols from the beginning... (ID is col 1) # In newer versions of Numpy, np.genfromtxt can take an iterable argument, # so you can wrap the file you're reading in a generator that generates lines, # skipping the first N columns. If your numbers are comma-separated, that's something like if 1==0: f = open(csvPathnameFull,'r') np.genfromtxt( (",".join(ln.split()[1:]) for ln in f), skiprows=1, # skip the header delimiter=',', dtype='float'); print "\ncsv read for training, done" # data is last column # drop the output n_features = len(dataset[0]) - 1; print "n_features:", n_features # don't want ID (col 0) or CAPSULE (col 1) # get CAPSULE target = [x[1] for x in dataset] # slice off the first 2 train = np.array ( [x[2:] for x in dataset] ) n_samples, n_features = train.shape print "n_samples:", n_samples, "n_features:", n_features print "histogram of target" print sp.histogram(target,3) print "len(train):", len(train) print "len(target):", len(target) print "dataset shape:", dataset.shape if family!='gaussian': raise Exception("Only gaussian family is implemented here (statsmodels GLM)") # train the classifier gauss_log = sm_api.GLM(target, train, family=sm_api.families.Gaussian(sm_api.families.links.log)) start = time.time() gauss_log_results = gauss_log.fit() print "sm_api.GLM took", time.time() - start, "seconds" print gauss_log_results.summary()
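# Synthetic-data sketch (assumption: statsmodels is installed) of the same GLM call pattern
# do_statsmodels_glm uses: a Gaussian family with a log link, mirroring the call above.
# The data here is made up purely to show the fit/summary flow on something small.
import numpy as np
import statsmodels.api as sm_api

np.random.seed(0)
n_samples, n_features = 200, 3
train = np.random.uniform(1, 2, size=(n_samples, n_features))
beta = np.array([0.5, -0.25, 0.1])
# the conditional mean is exp(X*beta) because of the log link; add a little gaussian noise
target = np.exp(train.dot(beta)) + np.random.normal(0, 0.01, n_samples)

gauss_log = sm_api.GLM(target, train,
                       family=sm_api.families.Gaussian(sm_api.families.links.log))
gauss_log_results = gauss_log.fit()
print gauss_log_results.summary()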
def test_exec_enums_rand_cut2(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1,MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0,1)==0: cutExprList.append('p$C'+str(i+1)+'!='+c) else: cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount+1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I'+src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J'+src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*'+src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", 
h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C'+str(iColCount+1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print(hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount+1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_exec2_enums_rand_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ (n, 10, 9, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like # celChoice = str(random.choice(range(len(cel)))) celChoice = random.choice(range(len(cel))) cutValue[c] = celChoice cutExprList = [] pKey = Key('p') for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # cutExprList.append('p$C'+str(i+1)+'=='+c) # all column indexing in h2o-dev is with number e = Fcn('==', c, pKey[:,i]) cutExprList.append(e) cutExpr = None for ce in cutExprList: if cutExpr: cutExpr = Fcn('&', cutExpr, ce) else: cutExpr = ce print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] # rowExpr = '%s[%s,];' % (hex_key, cutExpr) hKey = Key(hex_key) rowExpr = hKey[cutExpr, :] print "rowExpr:", rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspect = h2o_cmd.runInspect(key=parse_key) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) # print h2o.dump_json(inspect) # (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ # h2o_cmd.columnInfoFromInspect(parse_key, exceptionOnMissingValues=False) # error if any col has constant values # if len(constantValuesDict) != 0: # raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # build up the columns Assign('b', [1,2,3]) # could also append 1 col at a time, by assigning to the next col number? Assign('a', Cbind(['b' for i in range(colCount)])) for eKey in eKeys: Assign(eKey, 'a') ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. 
randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==1: start = time.time() Assign(fKey, random.choice(rowExprList)).do() elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) missingList, valueList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # FIX! put quantile back in? quantileTime = 0 # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
thresholds = [0.001, 0.01, 0.1, 0.25, 0.33, 0.5, 0.66, 0.75, 0.9, 0.99, 0.999] #***************************************************************** # h2o #***************************************************************** d = target # target = dataset dmin = min(d) dmax = max(d) thresholdList = [OTHER_T] quantiles = findQuantileList(d, dmin, dmax, thresholdList) h2p.red_print('\nthis b result:', quantiles) #***************************************************************** # for comparison #***************************************************************** # perPrint = ["%.2f" % v for v in a] # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 # type 6 alphap = 0 betap = 0 # type 5 okay but not perfect alphap = 0.5 betap = 0.5
def find_folder_and_filename(bucket, pathWithRegex, schema='put', returnFullPath=False): checkPath = True # strip the common mistake of leading "/" in path, if bucket is specified too giveUpAndSearchLocally = False if bucket is not None and re.match("/", pathWithRegex): h2o.verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", pathWithRegex) pathWithRegex = pathWithRegex.lstrip('/') if bucket is None: # good for absolute path name bucketPath = "" elif bucket == ".": bucketPath = os.getcwd() # only use if the build_cloud was for remote H2O # Never use the var for remote, if you're doing a put! (which always sources local) elif h2o.nodes[0].remoteH2O and schema!='put' and \ (os.environ.get('H2O_REMOTE_BUCKETS_ROOT') or h2o.nodes[0].h2o_remote_buckets_root): if (bucket=='smalldata' or bucket=='datasets') and schema=='local': msg1 = "\nWARNING: you're using remote nodes, and 'smalldata' or 'datasets' git buckets, with schema!=put" msg2 = "\nThose aren't git pull'ed by the test. Since they are user-maintained, not globally-maintained-by-0xdata," msg3 = "\nthey may be out of date at those remote nodes?" msg4 = "\nGoing to assume we find a path to them locally, and remote path will be the same" h2p.red_print(msg1, msg2, msg3, msg4) giveUpAndSearchLocally = True else: if os.environ.get('H2O_REMOTE_BUCKETS_ROOT'): rootPath = os.environ.get('H2O_REMOTE_BUCKETS_ROOT') print "Found H2O_REMOTE_BUCKETS_ROOT:", rootPath else: rootPath = h2o.nodes[0].h2o_remote_buckets_root print "Found h2o_nodes[0].h2o_remote_buckets_root:", rootPath bucketPath = os.path.join(rootPath, bucket) checkPath = False # does it work to use bucket "." to get current directory # this covers reote with put too elif os.environ.get('H2O_BUCKETS_ROOT'): rootPath = os.environ.get('H2O_BUCKETS_ROOT') print "Using H2O_BUCKETS_ROOT environment variable:", rootPath if not (os.path.exists(rootPath)): raise Exception("H2O_BUCKETS_ROOT in env but %s doesn't exist." % rootPath) bucketPath = os.path.join(rootPath, bucket) if not (os.path.exists(bucketPath)): raise Exception("H2O_BUCKETS_ROOT and path used to form %s which doesn't exist." % bucketPath) else: giveUpAndSearchLocally = True #****************************************************************************************** if giveUpAndSearchLocally: # if we run remotely, we're assuming the import folder path on the remote machine # matches what we find on our local machine. But maybe the local user doesn't exist remotely # so using his path won't work. # Resolve by looking for special state in the config. If user = 0xdiag, just force the bucket location # This is a lot like knowing about fixed paths with s3 and hdfs # Otherwise the remote path needs to match the local discovered path. # want to check the username being used remotely first. should exist here too if going to use username = getpass.getuser() h2oUsername = h2o.nodes[0].username h2o.verboseprint("username:"******"h2oUsername:"******"datasets" is special. Don't want to find it in /home/0xdiag/datasets # needs to be the git clone 'datasets'. Find it by walking upwards below # disable it from this looking in home dir. Could change priority order? # resolved in order, looking for bucket (ln -s will work) in these home dirs. 
if bucket=='datasets': # special case possibleUsers = [] elif h2oUsername != username: possibleUsers = [username, h2oUsername, "0xdiag"] else: possibleUsers = [username, "0xdiag"] for u in possibleUsers: rootPath = os.path.expanduser("~" + u) bucketPath = os.path.join(rootPath, bucket) h2o.verboseprint("Checking bucketPath:", bucketPath, 'assuming home is', rootPath) if os.path.exists(bucketPath): h2o.verboseprint("search A did find", bucket, "at", rootPath) break else: # last chance to find it by snooping around rootPath = os.getcwd() h2o.verboseprint("find_bucket looking upwards from", rootPath, "for", bucket) # don't spin forever levels = 0 while not (os.path.exists(os.path.join(rootPath, bucket))): h2o.verboseprint("Didn't find", bucket, "at", rootPath) rootPath = os.path.split(rootPath)[0] levels += 1 if (levels==6): raise Exception("unable to find bucket: %s" % bucket) h2o.verboseprint("search B did find", bucket, "at", rootPath) bucketPath = os.path.join(rootPath, bucket) #****************************************************************************************** # if there's no path, just return the bucketPath # but what about cases with a header in the folder too? (not putfile) if pathWithRegex is None: if returnFullPath: return bucketPath else: return (bucketPath, None) # if there is a "/" in the path, that means it's not just a pattern # split it # otherwise it is a pattern. use it to search for files in python first? # FIX! do that later elif "/" in pathWithRegex: (head, tail) = os.path.split(pathWithRegex) folderPath = os.path.abspath(os.path.join(bucketPath, head)) # accept all 0xcustomer-datasets without checking..since the current python user # may not have permission, but h2o will # try a couple times with os.stat in between, in case it's not automounting if '/mnt/0xcustomer-datasets' in folderPath: pass else: retry = 0 while checkPath and (not os.path.exists(folderPath)) and retry<5: # we can't stat an actual file, because we could have a regex at the end of the pathname print "Retrying", folderPath, "in case there's a autofs mount problem" os.stat(folderPath) retry += 1 time.sleep(1) if checkPath and not os.path.exists(folderPath): raise Exception("%s doesn't exist. %s under %s may be wrong?" % (folderPath, head, bucketPath)) else: folderPath = bucketPath tail = pathWithRegex h2o.verboseprint("folderPath:", folderPath, "tail:", tail) if returnFullPath: return os.path.join(folderPath, tail) else: return (folderPath, tail)
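# A minimal usage sketch for find_folder_and_filename(). The bucket name and
# relative path below are illustrative; on a machine without a 'smalldata'
# checkout this reports the failure instead of raising out of the example.
try:
    (folderPath, pattern) = find_folder_and_filename('smalldata', 'logreg/prostate.csv', schema='put')
    print "resolved folder:", folderPath, "pattern/tail:", pattern
    # returnFullPath=True joins the two, which is what a 'put' upload ultimately uses
    fullPath = find_folder_and_filename('smalldata', 'logreg/prostate.csv', schema='put', returnFullPath=True)
    print "resolved full path:", fullPath
except Exception, e:
    print "couldn't resolve the example bucket/path on this machine:", e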
class H2O(object): def __init__(self, use_this_ip_addr=None, port=54321, capture_output=True, force_ip=False, network=None, use_debugger=None, classpath=None, use_hdfs=False, use_maprfs=False, hdfs_version=None, hdfs_name_node=None, hdfs_config=None, aws_credentials=None, use_flatfile=False, java_heap_GB=None, java_heap_MB=None, java_extra_args=None, use_home_for_ice=False, node_id=None, username=None, random_udp_drop=False, force_tcp=False, redirect_import_folder_to_s3_path=None, redirect_import_folder_to_s3n_path=None, disable_h2o_log=False, enable_benchmark_log=False, h2o_remote_buckets_root=None, delete_keys_at_teardown=False, cloud_name=None, disable_assertions=None, sandbox_ignore_errors=False, ): if use_hdfs: # see if we can touch a 0xdata machine try: # long timeout in ec2...bad a = requests.get('http://172.16.2.176:80', timeout=1) hdfs_0xdata_visible = True except: hdfs_0xdata_visible = False # different defaults, depending on where we're running if hdfs_name_node is None: if hdfs_0xdata_visible: hdfs_name_node = "172.16.2.176" else: # ec2 hdfs_name_node = "10.78.14.235:9000" if hdfs_version is None: if hdfs_0xdata_visible: hdfs_version = "cdh4" else: # ec2 hdfs_version = "0.20.2" self.redirect_import_folder_to_s3_path = redirect_import_folder_to_s3_path self.redirect_import_folder_to_s3n_path = redirect_import_folder_to_s3n_path self.aws_credentials = aws_credentials self.port = port # None is legal for self.h2o_addr. # means we won't give an ip to the jar when we start. # Or we can say use use_this_ip_addr=127.0.0.1, or the known address # if use_this_addr is None, use 127.0.0.1 for urls and json # Command line arg 'ip_from_cmd_line' dominates: # ip_from_cmd_line and use_this_ip_addr shouldn't be used for mutli-node if h2o_args.ip_from_cmd_line: self.h2o_addr = h2o_args.ip_from_cmd_line else: self.h2o_addr = use_this_ip_addr self.force_ip = force_ip or (self.h2o_addr!=None) if self.h2o_addr: self.http_addr = self.h2o_addr else: self.http_addr = h2o_args.python_cmd_ip if h2o_args.network_from_cmd_line: self.network = h2o_args.network_from_cmd_line else: self.network = network # command line should always dominate for enabling if h2o_args.debugger: use_debugger = True self.use_debugger = use_debugger self.classpath = classpath self.capture_output = capture_output self.use_hdfs = use_hdfs self.use_maprfs = use_maprfs self.hdfs_name_node = hdfs_name_node self.hdfs_version = hdfs_version self.hdfs_config = hdfs_config self.use_flatfile = use_flatfile self.java_heap_GB = java_heap_GB self.java_heap_MB = java_heap_MB self.java_extra_args = java_extra_args self.use_home_for_ice = use_home_for_ice self.node_id = node_id if username: self.username = username else: self.username = getpass.getuser() # don't want multiple reports from tearDown and tearDownClass # have nodes[0] remember (0 always exists) self.sandbox_error_was_reported = False self.sandbox_ignore_errors = sandbox_ignore_errors self.random_udp_drop = random_udp_drop self.force_tcp = force_tcp self.disable_h2o_log = disable_h2o_log # this dumps stats from tests, and perf stats while polling to benchmark.log self.enable_benchmark_log = enable_benchmark_log self.h2o_remote_buckets_root = h2o_remote_buckets_root self.delete_keys_at_teardown = delete_keys_at_teardown self.disable_assertions = disable_assertions if cloud_name: self.cloud_name = cloud_name else: self.cloud_name = 'pytest-%s-%s' % (getpass.getuser(), os.getpid()) def __str__(self): return '%s - http://%s:%d/' % (type(self), self.http_addr, self.port) def url(self, 
loc, port=None): # always use the new api port if port is None: port = self.port if loc.startswith('/'): delim = '' else: delim = '/' u = 'http://%s:%d%s%s' % (self.http_addr, port, delim, loc) return u def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs): # if url param is used, use it as full url. otherwise create from the jsonRequest if fullUrl: url = fullUrl else: url = self.url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params2 = params.copy() for k in params2: if params2[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' extraComment2 = " " + str(postData)+";" if cmd=='post' else "" extraComment2 += extraComment if extraComment else "" if len(extraComment2) > 0: log('Start ' + url + paramsStr, comment=extraComment2) else: log('Start ' + url + paramsStr) # file get passed thru kwargs here if h2o_args.no_timeout: timeout = None # infinite try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) check_sandbox_for_errors(python_test_name=h2o_args.python_test_name); raise exc_info[1], None, exc_info[2] if 200 != r.status_code: print "JSON call returned non-200 status with ", (url + paramsStr) print "r.status_code: " + str(r.status_code) print "r.headers: " + repr(r.headers) print "r.text: " + r.text # fatal if no response # FIX! why is this not working on bad response to GLM # if not r: # raise Exception("Maybe bad url? no r in do_json_request in %s:" % inspect.stack()[1][3]) # this is used to open a browser on results, or to redo the operation in the browser # we don't' have that may urls flying around, so let's keep them all # FIX! this doesn't work now with all the extra post data required? h2o_nodes.json_url_history.append(r.url) # if r.json(): # raise Exception("Maybe bad url? no r.json in do_json_request in %s:" % inspect.stack()[1][3]) rjson = None if returnFast: return try: # h2o-dev sometimes is returning ISO-8859-2, Latin-2? 
## print "apparent_coding", r.apparent_encoding r.encoding = 'utf-8' rjson = r.json() except: h2p.red_print("r.text:", r.text.encode('utf8')) try: # try to decode the r.text? if not isinstance(json.loads(r.text), (list, dict)): raise Exception("h2o json responses should always be lists or dicts, see previous for text") except: raise Exception("Could not decode any json from the request %s." % r.text) # TODO: we should really only look in the response object. This check # prevents us from having a field called "error" (e.g., for a scoring result). for e in ['error', 'Error', 'errors', 'Errors']: # error can be null (python None). This happens in exec2 if e in rjson and rjson[e]: print "rjson:", dump_json(rjson) emsg = 'rjson %s in %s: %s' % (e, inspect.stack()[1][3], rjson[e]) if ignoreH2oError: # well, we print it..so not totally ignore. test can look at rjson returned print emsg else: print emsg raise Exception(emsg) for w in ['warning', 'Warning', 'warnings', 'Warnings']: # warning can be null (python None). if w in rjson and rjson[w]: verboseprint(dump_json(rjson)) print 'rjson %s in %s: %s' % (w, inspect.stack()[1][3], rjson[w]) return rjson
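# A minimal sketch of calling do_json_request() on a node from h2o_nodes.nodes[].
# The endpoint is illustrative (Cloud.json is the classic H2O cloud status page, and
# 'cloud_size' is the field it is expected to carry; adjust if the REST schema differs).
# The made-up param below is set to None just to show that None params are stripped
# before the URL is built, so optional arguments can be passed unconditionally.
node = h2o_nodes.nodes[0]
cloudStatus = node.do_json_request('Cloud.json', params={'made_up_optional_param': None}, timeout=10)
print "cloud_size:", cloudStatus.get('cloud_size')

# POSTs are form-encoded (not a JSON body) for now, so nested postData won't work;
# a flat dict like this illustrative (commented-out) call is the expected shape:
# node.do_json_request('ImportFiles.json', cmd='post', postData={'path': '/tmp/some.csv'}, timeout=60)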
def do_h2o_glm(self, bucket, csvPathname, L, family='binomial'): h2p.red_print("\nNow doing h2o") parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', timeoutSecs=180) # save the resolved pathname for use in the sklearn csv read below inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print inspect print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) x = 'ID' y = 'CAPSULE' family = family alpha = '0' lambda_ = L nfolds = '0' f = 'prostate' modelKey = 'GLM_' + f kwargs = { 'response' : y, 'ignored_cols' : x, 'family' : family, 'lambda' : lambda_, 'alpha' : alpha, 'n_folds' : nfolds, # passes if 0, fails otherwise 'destination_key' : modelKey, } timeoutSecs = 60 start = time.time() glmResult = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) # this stuff was left over from when we got the result after polling the jobs list # okay to do it again # GLM2: when it redirects to the model view, we no longer have the job_key! (unlike the first response and polling) (warnings, clist, intercept) = h2o_glm.simpleCheckGLM(self, glmResult, None, **kwargs) cstring = "".join([("%.5e " % c) for c in clist]) h2p.green_print("h2o alpha ", alpha) h2p.green_print("h2o lambda ", lambda_) h2p.green_print("h2o coefficient list:", cstring) h2p.green_print("h2o intercept", "%.5e " % intercept) # other stuff in the json response glm_model = glmResult['glm_model'] _names = glm_model['_names'] coefficients_names = glm_model['coefficients_names'] # the first submodel is the right one, if onely one lambda is provided as a parameter above submodels = glm_model['submodels'][0] beta = submodels['beta'] h2p.red_print("beta:", beta) norm_beta = submodels['norm_beta'] iteration = submodels['iteration'] validation = submodels['validation'] auc = validation['auc'] aic = validation['aic'] null_deviance = validation['null_deviance'] residual_deviance = validation['residual_deviance'] print '_names', _names print 'coefficients_names', coefficients_names # did beta get shortened? the simple check confirms names/beta/norm_beta are same length print 'beta', beta print 'iteration', iteration print 'auc', auc
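# A hedged sketch of the comparison side hinted at above ("sklearn csv read below"):
# fit the same prostate data with sklearn's LogisticRegression and eyeball the
# coefficients against the h2o GLM betas. The column layout (ID, CAPSULE, then the
# predictors) is assumed from the smalldata prostate file, and the regularization is
# only roughly comparable, so treat this as a sanity check rather than an exact match.
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression

def do_sklearn_glm(csvFullPath, C=1e6):
    rows = list(csv.reader(open(csvFullPath, 'rb')))
    header = rows[0]
    data = np.array(rows[1:], dtype=float)
    X = data[:, 2:]    # drop ID and CAPSULE
    y = data[:, 1]     # CAPSULE is the response
    clf = LogisticRegression(C=C)    # large C ~= weak regularization, closer to lambda=0
    clf.fit(X, y)
    print "sklearn coefficient list:", clf.coef_[0]
    print "sklearn intercept:", clf.intercept_[0]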
def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, postData=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, raiseIfNon200=True, **kwargs): H2O.verboseprint("__do_json_request, timeout: " + str(timeout)) # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.__url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params_serialized = params.copy() for k in params_serialized: if params_serialized[k] is None: del params[k] paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' # The requests package takes array parameters and explodes them: ['f00', 'b4r'] becomes "f00,b4r". # NOTE: this handles 1D arrays only; if we need ND this needs to be recursive. # NOTE: we currently don't need to do this for GET, so that's not implemented. if postData is not None: munged_postData = {} for k, v in postData.iteritems(): if type(v) is list: if len(v) == 0: munged_postData[k] = '[]' else: first = True array_str = '[' for val in v: if not first: array_str += ', ' if val is None: array_str += 'null' elif isinstance(val, basestring): array_str += "\"" + str(val) + "\"" else: array_str += str(val) first = False array_str += ']' munged_postData[k] = array_str else: # not list: munged_postData[k] = v else: # None munged_postData = postData if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest("----------------------------------------------------------------------\n") if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if 'post' == cmd: # NOTE == cmd: for now, since we don't have deserialization from JSON in h2o-dev, we use form-encoded POST. # This is temporary. # # This following does application/json (aka, posting JSON in the body): # r = requests.post(url, timeout=timeout, params=params, data=json.dumps(munged_postData), **kwargs) # # This does form-encoded, which doesn't allow POST of nested structures r = requests.post(url, timeout=timeout, params=params, data=munged_postData, **kwargs) elif 'delete' == cmd: r = requests.delete(url, timeout=timeout, params=params, **kwargs) elif 'get' == cmd: r = requests.get(url, timeout=timeout, params=params, **kwargs) else: raise ValueError("Unknown HTTP command (expected 'get', 'post' or 'delete'): " + cmd) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. # (this is new/experimental) exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) H2O.check_sandbox_for_errors(python_test_name=H2O.python_test_name); log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise exc_info[1], None, exc_info[2] H2O.verboseprint("r: " + repr(r))
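# The list-flattening above turns 1D Python lists in postData into the bracketed
# string form that the form-encoded POST to h2o-dev expects. A standalone sketch of
# the same transform, pulled out of the method so the output can be eyeballed directly:
def munge_post_value(v):
    if type(v) is not list:
        return v
    if len(v) == 0:
        return '[]'
    parts = []
    for val in v:
        if val is None:
            parts.append('null')
        elif isinstance(val, basestring):
            parts.append('"' + str(val) + '"')
        else:
            parts.append(str(val))
    return '[' + ', '.join(parts) + ']'

print munge_post_value(['f00', 'b4r'])    # -> ["f00", "b4r"]
print munge_post_value([1, None, 2.5])    # -> [1, null, 2.5]
print munge_post_value([])                # -> []
print munge_post_value('unchanged')       # -> unchanged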
def do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=None, returnFast=False, cmd='get', extraComment=None, ignoreH2oError=False, noExtraErrorCheck=False, **kwargs): # if url param is used, use it as full url. otherwise crate from the jsonRequest if fullUrl: url = fullUrl else: url = self.url(jsonRequest) # remove any params that are 'None' # need to copy dictionary, since can't delete while iterating if params is not None: params2 = params.copy() for k in params2: if params2[k] is None: del params[k] paramsStr = '?' + '&'.join( ['%s=%s' % (k, v) for (k, v) in params.items()]) else: paramsStr = '' if extraComment: log('Start ' + url + paramsStr, comment=extraComment) else: log('Start ' + url + paramsStr) log_rest("") log_rest( "----------------------------------------------------------------------\n" ) if extraComment: log_rest("# Extra comment info about this request: " + extraComment) if cmd == 'get': log_rest("GET") else: log_rest("POST") log_rest(url + paramsStr) # file get passed thru kwargs here try: if cmd == 'post': r = requests.post(url, timeout=timeout, params=params, **kwargs) else: r = requests.get(url, timeout=timeout, params=params, **kwargs) except Exception, e: # rethrow the exception after we've checked for stack trace from h2o # out of memory errors maybe don't show up right away? so we should wait for h2o # to get it out to h2o stdout. We don't want to rely on cloud teardown to check # because there's no delay, and we don't want to delay all cloud teardowns by waiting. exc_info = sys.exc_info() # use this to ignore the initial connection errors during build cloud when h2o is coming up if not noExtraErrorCheck: h2p.red_print( "ERROR: got exception on %s to h2o. \nGoing to check sandbox, then rethrow.." % (url + paramsStr)) time.sleep(2) check_sandbox_for_errors( python_test_name=h2o_args.python_test_name) log_rest("") log_rest("EXCEPTION CAUGHT DOING REQUEST: " + str(e.message)) raise exc_info[1], None, exc_info[2]
def build_cloud_with_json(h2o_nodes_json='h2o-nodes.json'): # local sandbox may not exist. Don't clean if it does, just append if not os.path.exists(LOG_DIR): os.mkdir(LOG_DIR) log("#*********************************************************************") log("Starting new test: " + h2o_args.python_test_name + " at build_cloud_with_json()") log("#*********************************************************************") print "This only makes sense if h2o is running as defined by", h2o_nodes_json print "For now, assuming it's a cloud on this machine, and here's info on h2o processes running here" print "No output means no h2o here! Some other info about stuff on the system is printed first though." import h2o_os_util if not os.path.exists(h2o_nodes_json): raise Exception("build_cloud_with_json: Can't find " + h2o_nodes_json + " file") ## h2o_os_util.show_h2o_processes() with open(h2o_nodes_json, 'rb') as f: cloneJson = json.load(f) # These are supposed to be in the file. # Just check the first one. if not there, the file must be wrong if not 'cloud_start' in cloneJson: raise Exception("Can't find 'cloud_start' in %s, wrong file? h2o-nodes.json?" % h2o_nodes_json) else: cs = cloneJson['cloud_start'] print "Info on the how the cloud we're cloning was started (info from %s)" % h2o_nodes_json # required/legal values in 'cloud_start'. A robust check is good for easy debug when we add stuff valList = ['time', 'cwd', 'python_test_name', 'python_cmd_line', 'config_json', 'username', 'ip'] for v in valList: if v not in cs: raise Exception("Can't find %s in %s, wrong file or version change?" % (v, h2o_nodes_json)) print "cloud_start['%s']: %s" % (v, cs[v]) # this is the internal node state for python..nodes rebuild nodeStateList = cloneJson['h2o_nodes'] nodeList = [] if not nodeStateList: raise Exception("nodeStateList is empty. %s file must be empty/corrupt" % h2o_nodes_json) try: for nodeState in nodeStateList: print "Cloning state for node", nodeState['node_id'], 'from', h2o_nodes_json newNode = ExternalH2O(nodeState) nodeList.append(newNode) # If it's an existing cloud, it may already be locked. so never check. # we don't have the cloud name in the -ccj since it may change (and the file be static?) # so don't check expectedCloudName verify_cloud_size(nodeList, expectedCloudName=None, expectedLocked=None) # best to check for any errors right away? # (we won't report errors from prior tests due to marker stuff? ## check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) # put the test start message in the h2o log, to create a marker nodeList[0].h2o_log_msg() except: # nodeList might be empty in some exception cases? # no shutdown issued first, though ## if cleanup and nodeList: ## for n in nodeList: n.terminate() check_sandbox_for_errors(python_test_name=h2o_args.python_test_name) raise # like cp -p. Save the config file, to sandbox print "Saving the ", h2o_nodes_json, "we used to", LOG_DIR shutil.copy(h2o_nodes_json, LOG_DIR + "/" + os.path.basename(h2o_nodes_json)) print "" h2p.red_print("Ingested from json:", nodeList[0].java_heap_GB, "GB java heap(s) with", len(nodeList), "total nodes") print "" # save it to a global copy, in case it's needed for tearDown h2o_nodes.nodes[:] = nodeList return nodeList
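# A minimal sketch of what build_cloud_with_json() expects to find in h2o-nodes.json,
# based on the checks above. Values are illustrative; the only hard requirements the
# code verifies are the 'cloud_start' keys listed in valList and a non-empty
# 'h2o_nodes' list. The per-node fields shown here are assumptions about what
# ExternalH2O(nodeState) consumes.
exampleCloneJson = {
    'cloud_start': {
        'time': '2014-01-01 00:00:00',
        'cwd': '/home/0xdiag/h2o/py',
        'python_test_name': 'test_example.py',
        'python_cmd_line': 'python test_example.py',
        'config_json': 'pytest_config-0xdiag.json',
        'username': '0xdiag',
        'ip': '192.168.1.10',
    },
    'h2o_nodes': [
        {'node_id': 0, 'port': 54321, 'http_addr': '192.168.1.10'},
        {'node_id': 1, 'port': 54323, 'http_addr': '192.168.1.11'},
    ],
}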
def findQuantile(d, dmin, dmax, threshold): # return the value at the threshold, or the mean of the two rows that bound it. # fixed bin count per pass. Stops at maxIterations if not resolved to one true answer maxIterations = 30 # totalRows should be cleansed of NAs. assume d doesn't have NAs (cleaned elsewhere) totalRows = len(d) # Used to have desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # initialize newValStart = dmin newValEnd = dmax newValRange = newValEnd - newValStart desiredBinCnt = BIN_COUNT # Could do per-pass adjustment, but fixed works fine. newValBinSize = newValRange / (desiredBinCnt + 0.0) newLowCount = 0 # count of rows below the bins # yes there is no newHighCount. Created during the pass, though. # state shared by each pass assert maxBinCnt > 0 hcnt2 = [None for b in range(maxBinCnt)] hcnt2_min = [None for b in range(maxBinCnt)] hcnt2_max = [None for b in range(maxBinCnt)] hcnt2_low = 0 hcnt2_high = 0 assert newValBinSize != 0 # can be negative assert newValEnd > newValStart assert newValRange > 0 # break out on stopping condition # reuse the histogram array hcnt2[] iteration = 0 done = False # append to a list of best guesses per pass best_result = [] def htot2(): return sum(hcnt2) + hcnt2_low + hcnt2_high while iteration <= maxIterations and not done: h2p.green_print("newValStart", newValStart) h2p.green_print("newValEnd", newValEnd) h2p.green_print("newValRange", newValRange) h2p.green_print("newValBinSize", newValBinSize) h2p.green_print("newLowCount", newLowCount) h2p.green_print("threshold", threshold) valStart = newValStart valEnd = newValEnd valRange = newValRange valBinSize = newValBinSize lowCount = newLowCount desiredBinCnt = BIN_COUNT maxBinCnt = desiredBinCnt + 1 # might go one over due to FP issues # playing with creating relative NUDGE values to make sure bin range # is always inclusive of target. # ratio it down from valBinSize. # It doesn't need to be as big as valBinSize. # implicitly, it shouldn't need to be as large as valBinSize # can't seem to make it work yet. leave NUDGE=0 NUDGE = 0 # init to zero for each pass for b in range(maxBinCnt): hcnt2[b] = 0.0 # Init counts outside of the bins hcnt2_low = 0 hcnt2_high = 0 # minimum value for higher than the bin. Needed for interpolation hcnt2_high_min = None for val in d: # Need to count the stuff outside the bin-gathering, # since threshold compare is based on total row compare # on first pass, shouldn't see anything exceed the start/end bounds # since those are min/max for the column? (shouldn't be any fp precision issue? or ??) # oh wait, this valOffset math creates possible precision issue? # maybe we should address it with the NUDGE value below? but what about first pass? valOffset = val - valStart # where are we zeroing in? (start) binIdx2 = int(math.floor( valOffset / (valBinSize + 0.0))) # make sure it's always an fp divide? 
# do some close looking for possible fp arith issues cA = valOffset < 0 cB = binIdx2 < 0 t = {True: 1, False: 0} # we get the 10 case if ((cA and not cB) or (not cA and cB)): h2p.red_print("AB Interesting lower bin edge case %s%s" % (t[cA], t[cB]), "cA", cA, "cB", cB, "valOffset", valOffset, \ "binIdx2", binIdx2) cC = val > valEnd cD = binIdx2 >= (maxBinCnt - 1) # tighten the compare for printing if ((cC and not cD) or (not cC and cD)): h2p.red_print("CD Interesting upper bin edge case %s%s" % (t[cC], t[cD]), "cC", cC, "cD", cD, "val", val, "valEnd", valEnd, \ "binIdx2", binIdx2, "maxBinCnt", maxBinCnt) # example hits this case..i.e. the max value # CD Interesting upper bin edge case 01 cC False cB True val 100.995097486 valEnd 100.995097486 binIdx2 2 maxBinCnt 3 if valOffset < 0 or binIdx2 < 0: # if valOffset < 0: # if binIdx2<0: hcnt2_low += 1 # prevent the extra bin from being used..i.e. eliminate the fuzziness for sure! # have to use both compares, since can wrap the index (due to start/end shift) # elif val > valEnd or binIdx2>=(maxBinCnt-1): # should this really be a valOffset compare? elif val > valEnd or binIdx2 >= maxBinCnt: # elif val > valEnd: # elif binIdx2>=(maxBinCnt-1): if (hcnt2_high == 0) or (val < hcnt2_high_min): hcnt2_high_min = val print "hcnt2_high_min update:", hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd hcnt2_high += 1 else: # print "(multi) val: ",val," valOffset: ",valOffset," valBinSize: ",valBinSize assert binIdx2 >=0 and binIdx2<=(maxBinCnt-1), "val %s %s %s %s binIdx2: %s maxBinCnt: %s valBinSize: %s" % \ (val, valStart, valEnd, valOffset, binIdx2, maxBinCnt, valBinSize) if hcnt2[binIdx2] == 0 or (val < hcnt2_min[binIdx2]): hcnt2_min[binIdx2] = val if hcnt2[binIdx2] == 0 or (val > hcnt2_max[binIdx2]): hcnt2_max[binIdx2] = val hcnt2[binIdx2] += 1 # check if we went into the magic extra bin if binIdx2 == (maxBinCnt - 1): print "\nFP! val went into the extra maxBinCnt bin:", \ binIdx2, hcnt2_high_min, valOffset, val, valStart, hcnt2_high, val, valEnd,"\n" # check the legal states for these two # we don't have None for checking hcnt2_high_min in java assert hcnt2_high == 0 or (hcnt2_high_min is not None) assert (hcnt2_high_min is None) or hcnt2_high != 0 # everything should either be in low, the bins, or high totalBinnedRows = htot2() print "totalRows check: %s htot2(): %s should be equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) assert totalRows==totalBinnedRows, "totalRows: %s htot2() %s not equal. hcnt2_low: %s hcnt2_high: %s" % \ (totalRows, totalBinnedRows, hcnt2_low, hcnt2_high) # now walk thru and find out what bin to look inside currentCnt = hcnt2_low targetCntFull = threshold * (totalRows - 1) # zero based indexing targetCntInt = int(math.floor(threshold * (totalRows - 1))) targetCntFract = targetCntFull - targetCntInt assert targetCntFract >= 0 and targetCntFract <= 1 print "targetCntInt:", targetCntInt, "targetCntFract", targetCntFract k = 0 while ((currentCnt + hcnt2[k]) <= targetCntInt): # print "looping for k (multi): ",k," ",currentCnt," ",targetCntInt," ",totalRows," ",hcnt2[k]," ",hcnt2_min[k]," ",hcnt2_max[k] currentCnt += hcnt2[k] # ugly but have to break out if we'd cycle along with == adding h0's until we go too far # are we supposed to advance to a non-zero bin? k += 1 # goes over in the equal case? 
# if currentCnt >= targetCntInt: # break if k == maxBinCnt: break assert k < maxBinCnt, "k too large, k: %s maxBinCnt %s %s %s %s" % ( k, maxBinCnt, currentCnt, targetCntInt, hcnt2[k - 1]) # format string to match java Log.info() in Quantiles.java print "Found k (multi): ", k, " ", currentCnt, " ", targetCntInt, " ", totalRows, " ", hcnt2[ k], " ", hcnt2_min[k], " ", hcnt2_max[k] assert hcnt2[k] != 1 or hcnt2_min[k] == hcnt2_max[k] # some possibily interpolating guesses first, in guess we have to iterate (best guess) done = False guess = (hcnt2_max[k] - hcnt2_min[k]) / 2 # we maight not have gottent all the way if currentCnt == targetCntInt: if hcnt2[k] > 2 and (hcnt2_min[k] == hcnt2_max[k]): guess = hcnt2_min[k] print "Guess A", guess, k, hcnt2[k] if hcnt2[k] == 2: print "hello" print "\nTwo values in this bin but we could be aligned to the 2nd. so can't stop" # no mattter what size the fraction it would be on this number guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 # no mattter what size the fraction it would be on this number if INTERPOLATION_TYPE == 2: # type 2 (mean) guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0 else: # default to type 7 (linear interpolation) # Unlike mean, which just depends on two adjacent values, this adjustment # adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[ k] # two adjacent..as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_min[k] + (pctDiff * dDiff) done = False print "Guess B", guess if hcnt2[k] == 1 and targetCntFract == 0: assert hcnt2_min[k] == hcnt2_max[k] guess = hcnt2_min[k] done = True print "k", k print "Guess C", guess if hcnt2[k] == 1 and targetCntFract != 0: assert hcnt2_min[k] == hcnt2_max[k] print "\nSingle value in this bin, but fractional means we need to interpolate to next non-zero" if k < maxBinCnt: nextK = k + 1 # could put it over maxBinCnt else: nextK = k while nextK < maxBinCnt and hcnt2[nextK] == 0: nextK += 1 # have the "extra bin" for this if nextK >= maxBinCnt: assert hcnt2_high != 0 print "Using hcnt2_high_min for interpolate:", hcnt2_high_min nextVal = hcnt2_high_min else: print "Using nextK for interpolate:", nextK assert hcnt2[nextK] != 0 nextVal = hcnt2_min[nextK] guess = (hcnt2_max[k] + nextVal) / 2.0 # OH! fixed bin as opposed to sort. Of course there are gaps between k and nextK if INTERPOLATION_TYPE == 2: # type 2 (mean) guess = (hcnt2_max[k] + nextVal) / 2.0 pctDiff = 0.5 else: # default to type 7 (linear interpolation) dDiff = nextVal - hcnt2_max[ k] # two adjacent, as if sorted! pctDiff = targetCntFract # This is the fraction of total rows guess = hcnt2_max[k] + (pctDiff * dDiff) done = True # has to be one above us when needed. (or we're at end) print 'k', 'hcnt2_max[k]', 'nextVal' print "hello3:", k, hcnt2_max[k], nextVal print "\nInterpolating result using nextK: %s nextVal: %s" % ( nextK, nextVal) print "Guess D", guess if not done: print "%s %s %s %s Not done, setting new range" % (hcnt2[k], currentCnt, targetCntInt, targetCntFract),\ "k: ", k,\ "currentCnt: ", currentCnt,\ "hcnt2_min[k]: ", hcnt2_min[k],\ "hcnt2_max[k]: ", hcnt2_max[k] # possible bin leakage at start/end edges due to fp arith. # the bin index arith may resolve OVER the boundary created by the compare for hcnt2_high compare # rather than using NUDGE, see if there's a non-zero bin below (min) or above (max) you. # Just need to check the one bin below and above k, if they exist. 
if k > 0 and hcnt2[k - 1] > 0 and (hcnt2_max[k - 1] < hcnt2_min[k]): print "1" newValStart = hcnt2_max[k - 1] else: print "2" newValStart = hcnt2_min[k] # subtle. we do put stuff in the extra end bin (see the print above that happens) # k might be pointing to one less than that (like k=0 for 1 bin case) if k < maxBinCnt and hcnt2[k + 1] > 0 and (hcnt2_min[k + 1] > hcnt2_max[k]): print "3" newValEnd = hcnt2_min[k + 1] else: print "4" newValEnd = hcnt2_max[k] newValRange = newValEnd - newValStart # maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues? newValBinSize = newValRange / (desiredBinCnt + 0.0) # the start/end should never change if we're just using one bin # this is a bin leakage test, if you use one bin. (we should never resolve exactly stop at max iterations # assumes NUDGE is 0 if NUDGE == 0.0: assert desiredBinCnt>1 or (valStart==newValStart and valEnd==newValEnd),\ "if 1 bin, should be no per-pass edge leakage %s %s %s %s %s %s" % (k, hcnt2_high, valStart, newValStart, valEnd, newValEnd) newLowCount = currentCnt if newValBinSize == 0: # assert done or newValBinSize!=0 and live with current guess print "Assuming done because newValBinSize is 0." print "newValRange: %s, hcnt2[k]: %s hcnt2_min[k]: %s hcnt2_max[k]: %s" %\ (newValRange, hcnt2[k], hcnt2_min[k], hcnt2_max[k]) guess = newValStart print "Guess E", guess # was done = True 3/20/14 done = True # if we have to interpolate # if it falls into this bin, interpolate to this bin means one answer? # cover the case above with multiple entries in a bin, all the same value # will be zero on the last pass? # assert newValBinSize != 0 or done # need the count up to but not including newValStart best_result.append(guess) iteration += 1 h2p.blue_print("Ending Pass", iteration) h2p.blue_print("best_result:", best_result, "done:", done, "hcnt2[k]", hcnt2[k]) print "currentCnt", currentCnt, "targetCntInt", targetCntInt, "hcnt2_low", hcnt2_low, "hcnt2_high", hcnt2_high print "was", valStart, valEnd, valRange, valBinSize print "next", newValStart, newValEnd, newValRange, newValBinSize return best_result[-1]
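# A small usage sketch for findQuantile(): make up some data, run the fixed-bin
# multi-pass search for the median, and compare against a plain sorted-list lookup.
# findQuantile() reads the module-level knobs BIN_COUNT and INTERPOLATION_TYPE, so
# the values below are assumptions that only take effect if this sketch lives in the
# same module as the function.
import math
import random

BIN_COUNT = 4              # deliberately tiny, so several passes are needed
INTERPOLATION_TYPE = 7     # linear interpolation, like R type 7

d = [random.uniform(0, 100) for i in range(10000)]
threshold = 0.5
guess = findQuantile(d, min(d), max(d), threshold)

dSorted = sorted(d)
lowNeighbor = dSorted[int(math.floor(threshold * (len(d) - 1)))]
highNeighbor = dSorted[int(math.ceil(threshold * (len(d) - 1)))]
print "findQuantile guess:", guess
print "sorted-list neighbors around the target index:", lowNeighbor, highNeighbor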
def quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float', h2oSummary2=None, h2oQuantilesApprox=None, h2oQuantilesExact=None, interpolate='linear', quantile=0.50): SCIPY_INSTALLED = True try: import scipy as sp import numpy as np print "Both numpy and scipy are installed. Will do extra checks" except ImportError: print "numpy or scipy is not installed. Will only do sort-based checking" SCIPY_INSTALLED = False target = h2o_util.file_read_csv_col(csvPathname, col=col, datatype=datatype, skipHeader=skipHeader, preview=5) if datatype=='float': # to make irene's R runif files first col work (quoted row numbers, integers) # shouldn't hurt anyone else? # strip " from left (ignore leading whitespace) # strip " from right (ignore trailing whitespace) targetFP= map(float, target) # targetFP= np.array(tFP, np.float) if datatype=='int': targetFP= map(int, target) # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.percentile.html # numpy.percentile has simple linear interpolate and midpoint # need numpy 1.9 for interpolation. numpy 1.8 doesn't have # p = np.percentile(targetFP, 50 if DO_MEDIAN else 99.9, interpolation='midpoint') # 1.8 if SCIPY_INSTALLED: p = np.percentile(targetFP, quantile*100) h2p.red_print("numpy.percentile", p) # per = [100 * t for t in thresholds] from scipy import stats s1 = stats.scoreatpercentile(targetFP, quantile*100) h2p.red_print("scipy stats.scoreatpercentile", s1) # scipy apparently doesn't have the use of means (type 2) # http://en.wikipedia.org/wiki/Quantile # it has median (R-8) with 1/3, 1/3 if 1==0: # type 6 alphap=0 betap=0 # type 5 okay but not perfect alphap=0.5 betap=0.5 # type 8 alphap=1/3.0 betap=1/3.0 if interpolate=='mean': # an approx? (was good when comparing to h2o type 2) alphap=0.4 betap=0.4 if interpolate=='linear': # this is type 7 alphap=1 betap=1 s2List = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) s2 = s2List[0] # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html # type 7 # alphap=0.4, betap=0.4, # type 2 not available? (mean) # alphap=1/3.0, betap=1/3.0 is approx median? h2p.red_print("scipy stats.mstats.mquantiles:", s2) # also get the median with a painful sort (h2o_summ.percentileOnSortedList()) # inplace sort targetFP.sort() # this matches scipy type 7 (linear) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='linear') # this matches h2o type 2 (mean) # b = h2o_summ.percentileOnSortedList(targetFP, 0.50 if DO_MEDIAN else 0.999, interpolate='mean') b = percentileOnSortedList(targetFP, quantile, interpolate='linear') label = str(quantile * 100) + '%' h2p.blue_print(label, "from sort:", b) if SCIPY_INSTALLED: h2p.blue_print(label, "from numpy:", p) h2p.blue_print(label, "from scipy 1:", s1) h2p.blue_print(label, "from scipy 2:", s2) h2p.blue_print(label, "from h2o summary:", h2oSummary2) h2p.blue_print(label, "from h2o multipass:", h2oQuantilesExact) h2p.blue_print(label, "from h2o singlepass:", h2oQuantilesApprox) if h2oQuantilesApprox: if math.isnan(float(h2oQuantilesApprox)): raise Exception("h2oQuantilesApprox is unexpectedly NaN %s" % h2oQuantilesApprox) h2o_util.assertApproxEqual(h2oQuantilesApprox, b, rel=0.5, msg='h2o quantile singlepass is not approx. same as sort algo') if h2oQuantilesExact: if math.isnan(float(h2oQuantilesExact)): raise Exception("h2oQuantilesExact is unexpectedly NaN %s" % h2oQuantilesExact) h2o_util.assertApproxEqual(h2oQuantilesExact, b, tol=0.0000002, msg='h2o quantile multipass is not approx. same as sort algo')
if h2oSummary2: if math.isnan(float(h2oSummary2)): raise Exception("h2oSummary2 is unexpectedly NaN %s" % h2oSummary2) h2o_util.assertApproxEqual(h2oSummary2, b, rel=0.5, msg='h2o summary2 is not approx. same as sort algo') if SCIPY_INSTALLED: if h2oQuantilesApprox: h2o_util.assertApproxEqual(h2oQuantilesExact, p, tol=0.0000002, msg='h2o quantile multipass is not same as numpy.percentile') h2o_util.assertApproxEqual(h2oQuantilesExact, s1, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.scoreatpercentile') # give us some slack compared to the scipy use of median (instead of desired mean) if h2oQuantilesExact: if interpolate=='mean': h2o_util.assertApproxEqual(h2oQuantilesExact, s2, rel=0.01, msg='h2o quantile multipass is not approx. same as scipy stats.mstats.mquantiles') else: h2o_util.assertApproxEqual(h2oQuantilesExact, s2, tol=0.0000002, msg='h2o quantile multipass is not same as scipy stats.mstats.mquantiles') # see if scipy changes. nope. it doesn't if 1==0: s3 = stats.mstats.mquantiles(targetFP, prob=quantile, alphap=alphap, betap=betap) h2p.red_print("after sort") h2p.red_print("scipy stats.mstats.mquantiles:", s3)
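# A hedged usage sketch for quantile_comparisons(): write a one-column csv and
# cross-check the sort-based percentile against numpy/scipy (when installed). The
# file path is illustrative, the h2o* arguments are left as None because no cloud is
# involved, and the call assumes it runs in the module where quantile_comparisons()
# and its helpers (h2o_util, h2p, percentileOnSortedList) live.
import csv
import random

csvPathname = '/tmp/quantile_check.csv'
with open(csvPathname, 'w') as f:
    w = csv.writer(f)
    for i in range(1000):
        w.writerow([random.gauss(0, 1)])

quantile_comparisons(csvPathname, skipHeader=False, col=0, datatype='float',
    h2oSummary2=None, h2oQuantilesApprox=None, h2oQuantilesExact=None,
    interpolate='linear', quantile=0.50)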
def test_exec_enums_rand_cut2(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() n = ROWS tryList = [ # (n, 10, 9, 'cE', 300), (n, 1, 1, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] print "Creating", CUT_EXPR_CNT, 'cut expressions' for j in range(CUT_EXPR_CNT): # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression MAX_COLS_IN_EXPR = iColCount cols = random.sample(range(MAX_COLS_IN_EXPR), random.randint(1, MAX_COLS_IN_EXPR)) for c in cols: # possible choices within the column cel = colEnumList[c] # for now the cutValues are numbers for the enum mappings if 1 == 1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i, c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] # randomly pick == or != if random.randint(0, 1) == 0: cutExprList.append('p$C' + str(i + 1) + '!=' + c) else: cutExprList.append('p$C' + str(i + 1) + '==' + c) cutExpr = ' & '.join(cutExprList) # print "cutExpr:", cutExpr # just extract one output col (the first one) rowExpr = '%s[%s,%s];' % (hex_key, cutExpr, iColCount + 1) # print "rowExpr:", rowExpr print rowExpr rowExprList.append(rowExpr) # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str( colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* src_key = csvFilename parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='A' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='B' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='C' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='D' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='E' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='F' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='G' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='H' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='I' + src_key, timeoutSecs=200) parseResult = h2i.import_only(path=csvPathname, schema='put', src_key='J' + src_key, timeoutSecs=200) parseResult = h2i.parse_only(pattern='*' + src_key, hex_key=hex_key, timeoutSecs=800) print "Parse result['destination_key']:", parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) pNumRows = inspect['numRows'] pNumCols = inspect['numCols'] # print h2o.dump_json(inspect) 
levels = h2o.nodes[0].levels(source=hex_key) print "levels result:", h2o.dump_json(levels) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception( "Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1 == 1: a = 'a=c(1,2,3);' + ';'.join( ['a[,%s]=a[,%s-1]' % (i, i) for i in range(2, colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(CUT_LOOP_CNT): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. randICol = random.randint(0, iColCount - 1) randOCol = random.randint(iColCount, iColCount + oColCount - 1) # should be two different keys in the sample e = random.sample(eKeys, 2) fKey = e[0] eKey = e[1] start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows == 0 or numCols != colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount column = 0 start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1 == 0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) #**************************************************************** # QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET print "QUANTILE APPROX. BASELINE FOR SINGLE COL WALK FULL DATASET. Although it's a real col, not an enum col" quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=hex_key, column='C' + str(iColCount + 1), quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=0) elapsed = time.time() - start h2p.red_print( hex_key, pNumRows, "rows Baseline: quantile single col (C" + str(iColCount + 1) + ")", "one iteration", elapsed, "secs. threshold:", quantile, q['result']) print "quantile single col 1 iteration end on", hex_key, "took", elapsed, 'seconds.' quantileTime = elapsed #**************************************************************** # PLOTS. look for eplot.jpg and fplot.jpg in local dir? 
if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel, server=True)
def test_exec_enums_rand_cut(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (ROWS, 3, 2, 'cE', 300), ] # create key names to use for exec eKeys = ['e%s' % i for i in range(10)] # h2b.browseTheCloud() trial = 0 for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList: colCount = iColCount + oColCount hex_key = 'p' colEnumList = create_col_enum_list(iColCount) # create 100 possible cut expressions here, so we don't waste time below rowExprList = [] for j in range(CUT_EXPR_CNT): print "Creating", CUT_EXPR_CNT, 'cut expressions' # init cutValue. None means no compare cutValue = [None for i in range(iColCount)] # build up a random cut expression cols = random.sample(range(iColCount), random.randint(1,iColCount)) for c in cols: # possible choices within the column # cel = colEnumList[c] cel = colEnumList # for now the cutValues are numbers for the enum mappings if 1==1: # FIX! hack. don't use encoding 0, maps to NA here? h2o doesn't like celChoice = str(random.choice(range(len(cel)))) else: celChoice = random.choice(cel) cutValue[c] = celChoice cutExprList = [] for i,c in enumerate(cutValue): if c is None: continue else: # new ...ability to reference cols # src[ src$age<17 && src$zip=95120 && ... , ] cutExprList.append('p$C'+str(i+1)+'=='+c) cutExpr = ' && '.join(cutExprList) print "cutExpr:", cutExpr # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] rowExpr = '%s[%s,];' % (hex_key, cutExpr) print "rowExpr:", rowExpr rowExprList.append(rowExpr) print "j:", j # CREATE DATASET******************************************* SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE, colEnumList=colEnumList) # PARSE******************************************************* parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30, doSummary=False, header=0) print "Parse result['destination_key']:", parseResult['destination_key'] inspect = h2o_cmd.runInspect(key=parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) # print h2o.dump_json(inspect) rSummary = h2o_cmd.runSummary(key=parseResult['destination_key']) h2o_cmd.infoFromSummary(rSummary) (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \ h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # error if any col has constant values if len(constantValuesDict) != 0: raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict) # INIT all possible key names used*************************** # remember. 1 indexing! # is this needed? if 1==1: a = 'a=c(1,2,3);' + ';'.join(['a[,%s]=a[,%s-1]'% (i,i) for i in range(2,colCount)]) print a for eKey in eKeys: # build up the columns e = h2o.nodes[0].exec_query(str='%s;%s=a' % (a, eKey), print_params=False) ## print h2o.dump_json(e) xList = [] eList = [] fList = [] for repeat in range(200): # EXEC******************************************************* # don't use exec_expr to avoid issues with Inspect following etc. 
randICol = random.randint(0,iColCount-1) randOCol = random.randint(iColCount, iColCount+oColCount-1) # should be two different keys in the sample e = random.sample(eKeys,2) fKey = e[0] eKey = e[1] if 1==0: start = time.time() e = h2o.nodes[0].exec_query(str='%s=%s[,%s]' % (fKey, hex_key, randOCol+1)) elapsed = time.time() - start print "exec 1 took", elapsed, "seconds." execTime = elapsed if 1==1: start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (fKey, random.choice(rowExprList))) elapsed = time.time() - start execTime = elapsed print "exec 2 took", elapsed, "seconds." if 1==0: gKey = random.choice(eKeys) # do a 2nd random to see if things blow up start = time.time() h2o.nodes[0].exec_query(str="%s=%s" % (gKey, fKey)) elapsed = time.time() - start print "exec 3 took", elapsed, "seconds." if 1==1: inspect = h2o_cmd.runInspect(key=fKey) h2o_cmd.infoFromInspect(inspect, fKey) numRows = inspect['numRows'] numCols = inspect['numCols'] if numRows==0 or numCols!=colCount: h2p.red_print("Warning: Cut resulted in", numRows, "rows and", numCols, "cols. Quantile will abort") # QUANTILE******************************************************* quantile = 0.5 if DO_MEDIAN else .999 # first output col. always fed by an exec cut, so 0? column = iColCount start = time.time() q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS) h2p.red_print("quantile", quantile, q['result']) elapsed = time.time() - start print "quantile end on ", fKey, 'took', elapsed, 'seconds.' quantileTime = elapsed # remove all keys******************************************************* # what about hex_key? if 1==0: start = time.time() h2o.nodes[0].remove_all_keys() elapsed = time.time() - start print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.' trial += 1 xList.append(trial) eList.append(execTime) fList.append(quantileTime) # just get a plot of the last one (biggest) if DO_PLOT: xLabel = 'trial' eLabel = 'exec cut time' fLabel = 'quantile time' eListTitle = "" fListTitle = "" h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)