def test_rapids_basic(self):
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard/covtype.data'
    hexKey = 'p'
    parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey)

    keys = []
    for execExpr in exprList:
        # pull the result key name out of the rapids assignment, e.g. "(= !r1 ...)"
        r = re.match(r'\(= \!([a-zA-Z0-9_]+) ', execExpr)
        resultKey = r.group(1)
        execResult, result = h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4)
        if DO_ROLLUP:
            h2o_cmd.runInspect(key=resultKey)
        # rows might be zero!
        if execResult['num_rows'] or execResult['num_cols']:
            keys.append(execExpr)
        else:
            h2p.yellow_print("\nNo key created?\n", dump_json(execResult))

    print "\nExpressions that created keys. Shouldn't all of these expressions create keys?"
    for k in keys:
        print k

    h2o.check_sandbox_for_errors()
def parse_only(node=None, pattern=None, hex_key=None, importKeyList=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180,
    noise=None, benchmarkLogging=None, noPoll=False, **kwargs):

    if not node: node = h2o_nodes.nodes[0]

    # Get the list of all keys and use those that match the pattern
    # FIX! this can be slow. Can we use h2o to filter the list for us?
    # HACK. to avoid the costly frames, pass the imported key list during import_parse
    # won't work for cases where we do multiple import_only, then parse (for multi-dir import)
    matchingList = []
    if importKeyList:
        # the pattern is a full path/key name, so no false matches
        for key_name in importKeyList:
            if fnmatch.fnmatch(key_name, pattern):
                matchingList.append(key_name)
    else:
        h2p.yellow_print("WARNING: using frames to look up key names for possible parse regex")
        framesResult = node.frames(timeoutSecs=timeoutSecs)
        for frame in framesResult['frames']:
            key_name = frame['key']['name']
            if fnmatch.fnmatch(key_name, pattern):
                matchingList.append(key_name)

    parseResult = node.parse(key=matchingList, hex_key=hex_key,
        timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, initialDelaySecs=initialDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs, noise=noise,
        benchmarkLogging=benchmarkLogging, noPoll=noPoll, **kwargs)

    parseResult['python_source'] = pattern
    return parseResult
def parse_only(node=None, pattern=None, hex_key=None, importKeyList=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180,
    noise=None, benchmarkLogging=None, noPoll=False, **kwargs):

    if not node: node = h2o_nodes.nodes[0]

    # Get the list of all keys and use those that match the pattern
    # FIX! this can be slow. Can we use h2o to filter the list for us?
    # HACK. to avoid the costly frames, pass the imported key list during import_parse
    # won't work for cases where we do multiple import_only, then parse (for multi-dir import)
    matchingList = []
    if importKeyList:
        # the pattern is a full path/key name, so no false matches
        for key_name in importKeyList:
            if fnmatch.fnmatch(str(key_name), pattern):
                matchingList.append(key_name)
    else:
        h2p.yellow_print("WARNING: using frames to look up key names for possible parse regex")
        framesResult = node.frames(timeoutSecs=timeoutSecs)
        for frame in framesResult['frames']:
            key_name = frame['key']['name']
            if fnmatch.fnmatch(str(key_name), pattern):
                matchingList.append(key_name)

    if len(matchingList) == 0:
        raise Exception("Didn't find %s in key list %s or Frames result" % (pattern, importKeyList))

    start = time.time()
    # put quotes on all keys
    parseResult = node.parse(key=matchingList, hex_key=hex_key,
        timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, initialDelaySecs=initialDelaySecs,
        pollTimeoutSecs=pollTimeoutSecs, noise=noise,
        benchmarkLogging=benchmarkLogging, noPoll=noPoll, **kwargs)

    # FIX! extract and print the result key name (from parseResult)
    print "\nparse took", time.time() - start, "seconds"

    parseResult['python_source'] = pattern
    return parseResult
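# A minimal usage sketch (not part of the original suite): import_only() returns
# the imported key list, which can be handed to parse_only() so it skips the
# costly Frames lookup. The bucket/path and hex_key here are illustrative.
def example_parse_only():
    (importResult, importPattern) = import_only(
        bucket='home-0xdiag-datasets', path='standard/covtype.data', schema='local')
    parseResult = parse_only(pattern=importPattern, hex_key='covtype.hex',
        importKeyList=importResult['keys'], timeoutSecs=120)
    print "parsed from pattern:", parseResult['python_source']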
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary)
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("    http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die (thru psutil terminate, paramiko channel death or h2o shutdown...)")

    while totalTime < maxTime: # die after 4 hours
        time.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        ### h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o_args.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary)
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("    http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")

    h2p.blue_print("Will check cloud status every %s secs and kill cloud if wrong or no answer" % incrTime)
    if CHECK_WHILE_SLEEPING:
        h2p.blue_print("Will also look at redirected stdout/stderr logs in sandbox every %s secs" % incrTime)
    h2p.red_print("No checking of logs while sleeping, or check of cloud status")
    h2p.yellow_print("So if H2O stack traces, it's up to you to kill me if 4 hours is too long")
    h2p.yellow_print("ctrl-c will cause all jvms to die (thru psutil terminate, paramiko channel death or h2o shutdown...)")

    while totalTime < maxTime: # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        if CHECK_WHILE_SLEEPING:
            print "Checking sandbox log files"
            h2o.check_sandbox_for_errors(cloudShutdownIsError=True)
        else:
            print str(datetime.datetime.now()), h2o.python_cmd_line, "still here", totalTime, maxTime, incrTime

    # don't do this, as the cloud may be hung?
    if 1==0:
        print "Shutting down cloud, but first delete all keys"
        start = time.time()
        h2i.delete_keys_at_all_nodes()
        elapsed = time.time() - start
        print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def build_model(self, algo, training_frame, parameters, destination_key=None,
                timeoutSecs=60, asynchronous=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders']
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)
    key_name = frames['frames'][0]['key']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_key is not None:
        parameters['destination_key'] = destination_key
    result1 = self.do_json_request('/2/ModelBuilders.json/' + algo, cmd='post',
        timeout=timeoutSecs, postData=parameters)

    if asynchronous:
        result = result1
    elif 'validation_error_count' in result1:
        h2p.yellow_print("parameter error in model_builders")
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
    else:
        job = result1['jobs'][0]
        job_key = job['key']['name']
        verboseprint("model building job_key: " + repr(job_key))
        result = self.poll_job(job_key, timeoutSecs=timeoutSecs)

    verboseprint("result:", result)
    return result
def import_parse(node=None, schema='local', bucket=None, path=None,
    src_key=None, hex_key=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, noPrint=True,
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if not node: node = h2o_nodes.nodes[0]
    (importResult, importPattern) = import_only(node, schema, bucket, path,
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, doSummary, src_key, noPrint, importParentDir, **kwargs)

    verboseprint("importPattern:", importPattern)
    verboseprint("importResult", dump_json(importResult))

    assert len(importResult['keys']) >= 1, "No keys imported, maybe bad bucket %s or path %s" % (bucket, path)

    parseResult = parse_only(node, importPattern, hex_key, importResult['keys'],
        timeoutSecs, retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
        benchmarkLogging, noPoll, **kwargs)
    verboseprint("parseResult:", dump_json(parseResult))

    # do SummaryPage here too, just to get some coverage
    # only if not noPoll. otherwise parse isn't done
    if doSummary and not noPoll:
        # if parse blows up, we want error isolation ..i.e. find stack traces here, rather than the next guy blowing up
        check_sandbox_for_errors()
        print "WARNING: not doing inspect/summary for now after parse"
        ## inspect = node.inspect(parseResult['destination_key'], timeoutSecs=timeoutSecs)
        ## numRows = inspect['numRows']
        ## numCols = inspect['numCols']
        # we pass numCols, for detecting whether the na cnt means a col is all NAs, (for ignoring min/max/mean/sigma)
        ## node.summary_page(parseResult['destination_key'], timeoutSecs=timeoutSecs, noPrint=noPrint, numRows=numRows, numCols=numCols)
        # for now, don't worry about error isolating summary
    else:
        # isolate a parse from the next thing
        check_sandbox_for_errors()

    return parseResult
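# A hedged usage sketch (not part of the original suite): import_parse() is the
# one-call wrapper that chains import_only() and parse_only() above. The dataset
# path is illustrative.
def example_import_parse():
    parseResult = import_parse(bucket='home-0xdiag-datasets',
        path='standard/covtype.data', schema='local', hex_key='covtype.hex',
        timeoutSecs=300)
    # the returned dict carries the pattern that was used for key matching
    print "source pattern:", parseResult['python_source']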
def test_build_for_clone(self):
    # python gets confused about which 'start' if I used start here
    elapsed = time.time() - beginning
    print "\n%0.2f seconds to get here from start" % elapsed

    # might as well open a browser on it? (because the ip/port will vary)
    # maybe just print the ip/port for now
    ## h2b.browseTheCloud()

    maxTime = 4*3600
    totalTime = 0
    incrTime = 60
    h2p.purple_print("\nSleeping for total of", (maxTime+0.0)/3600, "hours.")
    print "Will check h2o logs every", incrTime, "seconds"
    print "Should be able to run another test using h2o-nodes.json to clone cloud"
    print "i.e. h2o.build_cloud_with_json()"
    print "Bad test if a running test shuts down the cloud. I'm supposed to!\n"

    h2p.green_print("To watch cloud in browser follow address:")
    h2p.green_print("    http://{0}:{1}/Cloud.html".format(h2o.nodes[0].http_addr, h2o.nodes[0].port))
    h2p.blue_print("You can start a test (or tests) now!")
    h2p.blue_print("Will spin looking at redirected stdout/stderr logs in sandbox for h2o errors every %s secs" % incrTime)
    h2p.red_print("This is just for fun")
    h2p.yellow_print("So is this")

    while totalTime < maxTime: # die after 4 hours
        h2o.sleep(incrTime)
        totalTime += incrTime
        # good to touch all the nodes to see if they're still responsive
        # give them up to 120 secs to respond (each individually)
        h2o.verify_cloud_size(timeoutSecs=120)
        print "Checking sandbox log files"
        h2o.check_sandbox_for_errors(cloudShutdownIsError=True)

    start = time.time()
    h2i.delete_keys_at_all_nodes()
    elapsed = time.time() - start
    print "delete_keys_at_all_nodes(): took", elapsed, "secs"
def build_model(self, algo, training_frame, parameters, destination_frame=None, model_id=None,
                timeoutSecs=60, noPoll=False, **kwargs):
    if 'destination_key' in kwargs:
        raise Exception('Change destination_key in build_model() to model_id')
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)
    key_name = frames['frames'][0]['frame_id']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters['model_id'] = destination_frame

    if model_id is not None:
        parameters['model_id'] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request('/3/ModelBuilders.json/' + algo, cmd='post',
        timeout=timeoutSecs, postData=parameters)
    # may get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ('validation_error_count' in result1) and (result1['validation_error_count'] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif 'exception_msg' in result1:
        h2p.yellow_print("exception msg in model_builders: %s" % result1['exception_msg'])
        result = result1
    else:
        job_result = result1['job']
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))
        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, 'took', elapsed, 'seconds'
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            # can condition this with a parameter if some FAILED are expected by tests.
            if status=='FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))
            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result['python_elapsed'] = elapsed
    return result
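# A hedged usage sketch (not part of the original suite): driving the v3
# build_model() above for a GBM on an already-parsed frame. The algo name and
# parameter values are illustrative, not a definitive parameter list.
def example_build_model_v3(node):
    parameters = {'response_column': 'C55', 'ntrees': 10}
    result = node.build_model(algo='gbm', training_frame='covtype.hex',
        parameters=parameters, model_id='gbm_covtype', timeoutSecs=300)
    print "model build took", result['python_elapsed'], "secs"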
def parse(self, key, hex_key=None, columnTypeDict=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_frames': None,
        'destination_frame': hex_key,
        'parse_type': None, # file type
        'separator': None,
        'single_quotes': None,
        'check_header': None, # forces first line to be seen as column names
        'number_columns': None,
        'column_names': None, # a list
        'column_types': None, # a list. or can use columnTypeDict param (see below)
        'na_strings': None, # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put double quotes around the individual list items (single not legal)
        source_frames = "[" + ",".join(map((lambda x: '"' + x + '"'), key)) + "]"
    else:
        # what if None here
        source_frames = '["' + key + '"]' # quotes required on key

    params_dict['source_frames'] = source_frames

    # merge kwargs into params_dict
    # =None overwrites params_dict
    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?source_frames=[keys] . . .
    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)
    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_frames': source_frames}
    setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_frames']:
        # should these be quoted?
        source_framesStr = "[" + ",".join([('"%s"' % src['name']) for src in setup_result['source_frames']]) + "]"
    else:
        source_framesStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    columnNames = setup_result['column_names']
    if columnNames:
        # single quotes not legal..need double quotes
        columnNamesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), columnNames)) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        naStrings = "[" + ",".join(map((lambda x: '"' + x + '"' if x != None else '""'), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            elif isinstance(k, basestring):
                # if a column name, find the index
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]"

    parse_params = {
        'source_frames': source_framesStr,
        'destination_frame': setup_result['destination_frame'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_frames is length:", len(parse_params['source_frames'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="3/Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_frame']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
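# A hedged usage sketch (not part of the original suite): overriding the column
# types that ParseSetup guessed, by column index or by column name, via the
# columnTypeDict parameter of the v3 parse() above. Key names and type strings
# are illustrative.
def example_parse_with_column_types(node):
    framesResult = node.parse(key='covtype.data', hex_key='covtype.hex',
        columnTypeDict={0: 'Numeric', 'C55': 'Enum'}, timeoutSecs=300)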
def parse(self, key, hex_key=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None, # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None, # how is this used
        'singleQuotes': None,
        'columnNames': None, # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?srcs=[keys] . . .
    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)
    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]
    if setup_result['srcs']:
        setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs']]) + "]"
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        return self.jobs(job_key)

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
def build_model(self, algo, training_frame, parameters, destination_key=None,
                timeoutSecs=60, asynchronous=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)
    key_name = frames['frames'][0]['key']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_key is not None:
        parameters['destination_key'] = destination_key
    print "build_model parameters", parameters
    result1 = self.do_json_request('/2/ModelBuilders.json/' + algo, cmd='post',
        timeout=timeoutSecs, postData=parameters)
    verboseprint("build_model result", dump_json(result1))

    if asynchronous:
        result = result1
    elif 'validation_error_count' in result1:
        h2p.yellow_print("parameter error in model_builders")
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))
        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            # can condition this with a parameter if some FAILED are expected by tests.
            if status=='FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))
            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    return result
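# A hedged usage sketch (not part of the original suite): the older v2 builder
# above takes destination_key and an asynchronous flag rather than model_id and
# noPoll. The algo and parameters are illustrative.
def example_build_model_v2(node):
    result = node.build_model(algo='kmeans', training_frame='covtype.hex',
        parameters={'k': 3}, destination_key='kmeans_covtype',
        timeoutSecs=120, asynchronous=False)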
def parse(self, key, hex_key=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None, # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None, # how is this used
        'singleQuotes': None,
        'columnNames': None, # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?srcs=[keys] . . .
    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)
    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]
    if setup_result['srcs']:
        setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs']]) + "]"
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
def import_only(node=None, schema='local', bucket=None, path=None,
    timeoutSecs=30, retryDelaySecs=0.1, initialDelaySecs=0, pollTimeoutSecs=180, noise=None,
    benchmarkLogging=None, noPoll=False, doSummary=True, src_key=None, noPrint=False,
    importParentDir=True, **kwargs):

    # FIX! hack all put to local, since h2o-dev doesn't have put yet?
    # multi-machine put will fail as a result.
    if schema=='put':
        h2p.yellow_print("WARNING: hacking schema='put' to 'local'..h2o-dev doesn't support upload." +
            "\nMeans multi-machine with 'put' will fail")
        schema = 'local'

    if src_key and schema!='put':
        raise Exception("can only specify a 'src_key' param for schema='put'. You have %s %s" % (schema, src_key))

    # no bucket is sometimes legal (fixed path)
    if not node: node = h2o_nodes.nodes[0]

    if path is None:
        raise Exception("import_only: path parameter needs to be specified")

    if "/" in path:
        (head, pattern) = os.path.split(path)
    else:
        (head, pattern) = ("", path)

    verboseprint("head:", head)
    verboseprint("pattern:", pattern)

    # to train users / okay here
    # normally we import the folder above, but if we import exactly, the path can't have regex
    # the folder can't have regex in any case
    if importParentDir:
        if re.search(r"[\*<>{}[\]~`]", head):
            raise Exception("h2o folder path %s can't be regex. path= was %s" % (head, path))
    else:
        if re.search(r"[\*<>{}[\]~`]", path):
            raise Exception("h2o path %s can't be regex. path= was %s" % (head, path))

    if schema=='put':
        # to train users
        if re.search(r"[/\*<>{}[\]~`]", pattern):
            raise Exception("h2o putfile basename %s can't be regex. path= was %s" % (pattern, path))

        if not path:
            raise Exception("path= didn't say what file to put")

        (folderPath, filename) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, filename)
        verboseprint("put filename:", filename, "folderPath:", folderPath, "filePath:", filePath)

        if not noPrint:
            h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses put:/%s" % filePath)
            h2p.green_print("Local path to file that will be uploaded: %s" % filePath)
            h2p.blue_print("That path resolves as:", os.path.realpath(filePath))

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        key = node.put_file(filePath, key=src_key, timeoutSecs=timeoutSecs)

        # hmm.. what should importResult be in the put case
        # set it to None. No import is done, and shouldn't be used if you're doing schema='put'
        importResult = None
        return (None, key)

    if schema=='local' and not \
            (node.redirect_import_folder_to_s3_path or node.redirect_import_folder_to_s3n_path):
        (folderPath, pattern) = find_folder_and_filename(bucket, path, schema)
        filePath = os.path.join(folderPath, pattern)
        h2p.green_print("\nimport_only:", h2o_args.python_test_name, "uses local:/%s" % filePath)
        h2p.green_print("Path h2o will be told to use: %s" % filePath)
        h2p.blue_print("If local jvms, path resolves locally as:", os.path.realpath(filePath))
        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        # FIX! why are we returning importPattern here..it's different than finalImportString if we import a folder?
        # is it used for key matching by others?
        # FIX! hack ..h2o-dev is creating key names with the absolute path, not the sym link path
        # messes up for import folders that go thru /home/<user>/home-0xdiag-datasets
        # importPattern = folderURI + "/" + pattern
        # could include this on the entire importPattern if we no longer have regex basename in h2o-dev?
        # folderURI = 'nfs:/' + folderPath
        folderURI = 'nfs:/' + os.path.realpath(folderPath)
        if importParentDir:
            finalImportString = folderPath
        else:
            finalImportString = folderPath + "/" + pattern
        importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

    else:
        if bucket is not None and re.match("/", head):
            verboseprint("You said bucket:", bucket, "so stripping incorrect leading '/' from", head)
            head = head.lstrip('/')

        # strip leading / in head if present
        if bucket and head!="":
            folderOffset = bucket + "/" + head
        elif bucket:
            folderOffset = bucket
        else:
            folderOffset = head

        if h2o_args.abort_after_import:
            raise Exception("Aborting due to abort_after_import (-aai) argument's effect in import_only()")

        n = h2o_nodes.nodes[0]
        if schema=='s3' or node.redirect_import_folder_to_s3_path:
            # this is just like s3n now? i.e. we can point down inside the s3 bucket like s3n?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            folderURI = "s3://" + folderOffset
            if not n.aws_credentials:
                print "aws_credentials: %s" % n.aws_credentials
                # raise Exception("Something was missing for s3 on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3 on the java -jar cmd line when the cloud was built"

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='s3n' or node.redirect_import_folder_to_s3n_path:
            # FIX! hack for now...when we change import folder to import s3, point to unique bucket name for h2o
            # should probably deal with this up in the bucket resolution
            # this may change other cases, but smalldata should only exist as a "bucket" for us?
            folderOffset = re.sub("smalldata", "h2o-smalldata", folderOffset)
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for s3n on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for s3n on the java -jar cmd line when the cloud was built"

            folderURI = "s3n://" + folderOffset
            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='maprfs':
            if not n.use_maprfs:
                print "use_maprfs: %s" % n.use_maprfs
                # raise Exception("Something was missing for maprfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for maprfs on the java -jar cmd line when the cloud was built"

            # if I use the /// and default, the key names that get created by h2o only have 1 slash
            # so the parse doesn't find the key name
            if n.hdfs_name_node:
                folderURI = "maprfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                # folderURI = "maprfs:///" + folderOffset
                folderURI = "maprfs:/" + folderOffset

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        elif schema=='hdfs':
            # check that some state from the cloud building time was right
            # the requirements for this may change and require updating
            if not (n.use_hdfs and ((n.hdfs_version and n.hdfs_name_node) or n.hdfs_config)):
                print "use_hdfs: %s hdfs_version: %s hdfs_name_node: %s" % (n.use_hdfs, n.hdfs_version, n.hdfs_name_node)
                if n.hdfs_config:
                    print "hdfs_config: %s" % n.hdfs_config
                # raise Exception("Something was missing for hdfs on the java -jar cmd line when the cloud was built")
                print "ERROR: Something was missing for hdfs on the java -jar cmd line when the cloud was built"

            if n.hdfs_name_node:
                folderURI = "hdfs://" + n.hdfs_name_node + "/" + folderOffset
            else:
                # this is different than maprfs? normally we specify the name though
                folderURI = "hdfs://" + folderOffset

            if importParentDir:
                finalImportString = folderURI
            else:
                finalImportString = folderURI + "/" + pattern
            importResult = node.import_files(finalImportString, timeoutSecs=timeoutSecs)

        else:
            raise Exception("schema not understood: %s" % schema)

    print "\nimport_only:", h2o_args.python_test_name, schema, "uses", finalImportString
    importPattern = folderURI + "/" + pattern
    return (importResult, importPattern)
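# A hedged usage sketch (not part of the original suite): importing from hdfs
# instead of a local folder. The bucket/path are illustrative; the cloud must
# have been built with matching hdfs flags, otherwise import_only() prints the
# ERROR lines above.
def example_import_hdfs():
    (importResult, importPattern) = import_only(schema='hdfs',
        bucket='datasets', path='standard/covtype.data', timeoutSecs=60)
    print "pattern for later parse key matching:", importPattern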
def test_xl_ast_assert_ZZ(self):
    #*****************************************
    a = DF('a1') # inits to -1
    checkAst(astForInit(a))
    # I suppose use of the h2o inspect request is deprecated
    # h2o_cmd.runInspect uses Frames?
    if 1==0:
        inspect = h2o.n0.inspect(key=a) # str(a) becomes 'a1'. so this param should take type Key for key=
        print "a/a1:", dump_json(inspect)

    # let's use runSummary for fun..returns OutputObj for the col
    # will get from column 0, since column not specified
    summaryResult = h2o_cmd.runSummary(key=a)
    co = h2o_cmd.infoFromSummary(summaryResult)
    print "co.label:", co.label
    print "co.data:", co.data

    # how can we get a bunch of data?
    b = DF('b1') # inits to -1
    checkAst(astForInit(b))
    c = DF('c1') # inits to -1
    checkAst(astForInit(c))
    print "lastExecResult:", dump_json(h2o_xl.Xbase.lastExecResult)

    h2p.yellow_print("Assign compare1")
    Assign(c[0], c[0] + 0)
    checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare2")
    Assign(c[0], c[0] - 0)
    checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare3")
    Assign(c[0], c[0] == 0)
    checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

    h2p.yellow_print("Assign compare4")
    Assign(c[0], c[0] != 0)
    checkAst("(= ([ %c1 #0 #0) (N ([ %c1 #0 #0) #0))")

    # h2o_xl.debugPrintEnable = True

    #*****************************************
    c = DF('c1')

    h2p.yellow_print("<<= compare1")
    c[0] <<= (c[0] + 0)
    checkAst("(= ([ %c1 #0 #0) (+ ([ %c1 #0 #0) #0))")

    h2p.yellow_print("<<= compare2")
    c[0] <<= (c[0] - 0)
    checkAst("(= ([ %c1 #0 #0) (- ([ %c1 #0 #0) #0))")

    h2p.yellow_print("<<= compare3")
    c[0] <<= (c[0] == 0)
    checkAst("(= ([ %c1 #0 #0) (n ([ %c1 #0 #0) #0))")

    #*****************************************
    c = DF('c1') # inits to -1

    h2p.yellow_print("compare1")
    # doesn't assign result to a key?, gets result if scalar, otherwise gets a list or ???
    # .result can give us scalar, list, Key, None
    # .result could be a property that triggers a csv download, if we didn't cache the scalar/list result
    # because it was small?
    # i.e. check if .result_cached was None, when .result property is used (property to avoid the need for ())
    result = Expr(c[0] == -1).result
    checkAst("(n ([ %c1 #0 #0) #-1)")
    h2p.yellow_print("Expr result..Desire: python datatype/value if scalar or list, else Key: %s %s" % (type(result), result))
    assert result == 1.0, "%s %s" % (type(result), result) # real result?

    if result:
        print "true for if of result", type(result), result
    else:
        print "else for if of result", type(result), result

    #*****************************************
    # difference is this goes to a temp key, so if not scalar, you can still get the results by looking at the key
    result = Assign(None, c[0]==-1).result
    checkAst("(= !knon_0x1a34250 (n ([ %c1 #0 #0) #-1))")
    h2p.yellow_print("Assign result..Desire: python datatype/value if scalar or list, else Key: %s %s" % (type(result), result))
    assert result == 1.0, "%s %s" % (type(result), result) # real result?

    if result:
        print "true if of result", result
    else:
        print "false if of result", result
def parse(self, key, hex_key=None, columnTypeDict=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_keys': None,
        'destination_key': hex_key,
        'parse_type': None, # file type
        'separator': None,
        'single_quotes': None,
        'check_header': None, # forces first line to be seen as column names
        'number_columns': None,
        'column_names': None, # a list
        'column_types': None, # a list. or can use columnTypeDict param (see below)
        'na_strings': None, # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put quotes around the individual list items
        source_keys = "[" + ",".join(map((lambda x: "'" + x + "'"), key)) + "]"
    else:
        # what if None here
        source_keys = "['" + key + "']" # quotes required on key

    params_dict['source_keys'] = source_keys

    # merge kwargs into params_dict
    # =None overwrites params_dict
    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?source_keys=[keys] . . .
    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)
    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_keys': source_keys}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_keys']:
        # should these be quoted?
        source_keysStr = "[" + ",".join([("'%s'" % src['name']) for src in setup_result['source_keys']]) + "]"
    else:
        source_keysStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    columnNames = setup_result['column_names']
    if columnNames:
        columnNamesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), columnNames)) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        naStrings = "[" + ",".join(map((lambda x: "'" + x + "'" if x != None else "''"), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            elif isinstance(k, basestring):
                # if a column name, find the index
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), ct)) + "]"

    parse_params = {
        'source_keys': source_keysStr,
        'destination_key': setup_result['destination_key'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_keys is length:", len(parse_params['source_keys'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_key']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")