def frames(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return a single Frame or all of the Frames in the h2o cluster.

    The frames are contained in a list called "frames" at the top level of
    the result. Currently the list is unordered.
    TODO: When find_compatible_models is implemented then the top level
    dict will also contain a "models" list.

    key: plain string frame key, or an h2o_xl Key object (its .frame is used).
    Raises Exception for any other key type.
    '''
    # FIX: the docstring used to sit as a dead string literal after params_dict;
    # moved to the top so it is an actual docstring.
    if not (key is None or isinstance(key, (basestring, Key))):
        raise Exception("frames: key should be string or Key type %s %s" % (type(key), key))

    params_dict = {
        'find_compatible_models': 0,
        'offset': 0,  # is offset working yet?
        'len': 5,
    }
    # merge kwargs into params_dict; unknown params are rejected
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'frames', False)

    # key can be type Key? (from h2o_xl) str(key) should return
    if key:
        if isinstance(key, Key):
            keyStr = key.frame
        else:
            keyStr = key
        result = self.do_json_request('3/Frames.json/' + keyStr, timeout=timeoutSecs, params=params_dict)
    else:
        result = self.do_json_request('3/Frames.json', timeout=timeoutSecs, params=params_dict)
    return result
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Return a model builder or all of the model builders known to
    the h2o cluster. The model builders are contained in a dictionary
    called "model_builders" at the top level of the result. The
    dictionary maps algorithm names to parameters lists. Each of the
    parameters contains all the metdata required by a client to
    present a model building interface to the user.

    if parameters = True, return the parameters?
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'model_builders', False)

    # one builder when algo is given, otherwise the whole dictionary
    if algo:
        request = '3/ModelBuilders.json' + "/" + algo
    else:
        request = '3/ModelBuilders.json'

    result = self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
    # verboseprint(request, "result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
def frames(self, key=None, timeoutSecs=60, **kwargs):
    '''
    Return a single Frame or all of the Frames in the h2o cluster.

    The frames are contained in a list called "frames" at the top level of
    the result. Currently the list is unordered.
    TODO: When find_compatible_models is implemented then the top level
    dict will also contain a "models" list.

    key: plain string frame key, or an h2o_xl Key object (its .frame is used).
    Raises Exception for any other key type.
    '''
    # FIX: the docstring used to sit as a dead string literal after params_dict;
    # moved to the top so it is an actual docstring.
    if not (key is None or isinstance(key, (basestring, Key))):
        raise Exception("frames: key should be string or Key type %s %s" % (type(key), key))

    params_dict = {
        'find_compatible_models': 0,
        'row_offset': 0,  # is offset working yet?
        'row_count': 5,
    }
    # merge kwargs into params_dict; unknown params are rejected
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'frames', False)

    # key can be type Key? (from h2o_xl) str(key) should return
    if key:
        if isinstance(key, Key):
            keyStr = key.frame
        else:
            keyStr = key
        result = self.do_json_request('3/Frames.json/' + keyStr, timeout=timeoutSecs, params=params_dict)
    else:
        result = self.do_json_request('3/Frames.json', timeout=timeoutSecs, params=params_dict)
    return result
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return all of the models in the h2o cluster, or a single model given its key.
    The models are contained in a list called "models" at the top level of the
    result. Currently the list is unordered.
    TODO: When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    '''
    params_dict = {'find_compatible_models': 0} if False else {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'models', True)

    # append the key path component only when a single model is requested
    request = '3/Models.json'
    if key:
        request = request + '/' + key
    result = self.do_json_request(request, timeout=timeoutSecs, params=params_dict)

    verboseprint("models result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs): ''' Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out. ''' params_dict = {} # merge kwargs into params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False) start_time = time.time() pollCount = 0 while True: result = self.do_json_request('2/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict) # print 'Job: ', dump_json(result) if key: frames_result = self.frames(key=key) print 'frames_result for key:', key, dump_json(result) jobs = result['jobs'][0] description = jobs['description'] dest = jobs['dest'] dest_name = dest['name'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] print description, \ "dest_name:", dest_name, \ "\tprogress:", "%-10s" % progress, \ "\tstatus:", "%-12s" % status, \ "\tmsec:", msec if status == 'DONE' or status == 'CANCELLED' or status == 'FAILED': h2o_sandbox.check_sandbox_for_errors() return result # FIX! what are the other legal polling statuses that we should check for? if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs): h2o_sandbox.check_sandbox_for_errors() emsg = "Job:", job_key, "timed out in:", timeoutSecs raise Exception(emsg) print emsg return None # check every other poll, for now if (pollCount % 2) == 0: h2o_sandbox.check_sandbox_for_errors() time.sleep(retryDelaySecs) pollCount += 1
def column(self, key, column, timeoutSecs=10, **kwargs):
    """Fetch one named column of a Frame from the h2o cluster."""
    params_dict = {"offset": 0, "len": 100}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, "column", True)
    request = "3/Frames.json/" + key + "/columns/" + column
    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
def column(self, key, column, timeoutSecs=10, **kwargs):
    '''Return a single column of a single Frame in the h2o cluster.'''
    params_dict = {
        'offset': 0,
        'len': 100,
    }
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'column', True)
    result = self.do_json_request('3/Frames.json/%s/columns/%s' % (key, column),
        timeout=timeoutSecs, params=params_dict)
    return result
def column(self, key, column, timeoutSecs=10, **kwargs):
    '''Get one column of a Frame via the Frames endpoint.'''
    params_dict = dict(offset=0, len=100)
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'column', True)
    url = '3/Frames.json/' + key + '/columns/' + column
    result = self.do_json_request(url, timeout=timeoutSecs, params=params_dict)
    return result
def columns(self, key, timeoutSecs=10, **kwargs):
    """
    Return the columns for a single Frame in the h2o cluster.
    """
    params_dict = {"offset": 0, "len": 100}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, "columns", True)
    request = "3/Frames.json/" + key + "/columns"
    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
def jobs(self, job_key=None, timeoutSecs=10, **kwargs):
    """
    Fetch all the jobs or a single job from the /Jobs endpoint.

    job_key: optional; when given, only that job is fetched.
    """
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, "jobs", True)

    # FIX: job_key was accepted but silently ignored (it was commented out of
    # params_dict). Use it as a path component, matching poll_job's usage.
    request = "3/Jobs.json"
    if job_key:
        request += "/" + job_key

    result = self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
    return result
def jobs(self, job_key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch all the jobs or a single job from the /Jobs endpoint.
    '''
    # job_key is forwarded as a query parameter (None fetches every job)
    params_dict = {'job_key': job_key}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'jobs', True)
    return self.do_json_request('2/Jobs.json', timeout=timeoutSecs, params=params_dict)
def jobs(self, job_key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch all the jobs or a single job from the /Jobs endpoint.
    '''
    # NOTE: job_key is currently not forwarded to the request
    # (it was commented out of the params in the original).
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'jobs', True)
    return self.do_json_request('3/Jobs.json', timeout=timeoutSecs, params=params_dict)
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs): ''' Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out. ''' params_dict = {} # merge kwargs into params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False) start_time = time.time() pollCount = 0 while True: result = self.do_json_request('3/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict) # print 'Job: ', dump_json(result) if key: frames_result = self.frames(key=key) print 'frames_result for key:', key, dump_json(result) jobs = result['jobs'][0] description = jobs['description'] dest = jobs['dest'] dest_name = dest['name'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] print description, \ "dest_name:", dest_name, \ "\tprogress:", "%-10s" % progress, \ "\tstatus:", "%-12s" % status, \ "\tmsec:", msec if status=='DONE' or status=='CANCELLED' or status=='FAILED': h2o_sandbox.check_sandbox_for_errors() return result # what about 'CREATED' # FIX! what are the other legal polling statuses that we should check for? if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs): h2o_sandbox.check_sandbox_for_errors() emsg = "Job:", job_key, "timed out in:", timeoutSecs # for debug a = h2o.nodes[0].get_cloud() print "cloud.json:", dump_json(a) raise Exception(emsg) print emsg return None # check every other poll, for now if (pollCount % 2) == 0: h2o_sandbox.check_sandbox_for_errors() time.sleep(retryDelaySecs) pollCount += 1
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Return the summary for a single column for a single Frame in the h2o cluster.
    '''
    params_dict = {'offset': 0, 'len': 100}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'summary', True)
    request = '3/Frames.json/%s/columns/%s/summary' % (key, column)
    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
def columns(self, key, timeoutSecs=10, **kwargs):
    '''
    Return the columns for a single Frame in the h2o cluster.
    '''
    params_dict = dict(offset=0, len=100)
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'columns', True)
    result = self.do_json_request('3/Frames.json/%s/columns' % key,
        timeout=timeoutSecs, params=params_dict)
    return result
def columns(self, key, timeoutSecs=10, **kwargs):
    '''
    Fetch the column list of one Frame in the h2o cluster.
    '''
    params_dict = {
        'offset': 0,
        'len': 100,
    }
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'columns', True)
    url = '3/Frames.json/' + key + '/columns'
    return self.do_json_request(url, timeout=timeoutSecs, params=params_dict)
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Return the summary for a single column for a single Frame in the h2o cluster.
    '''
    # offset/len are intentionally not sent (they were commented out)
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'summary', True)
    result = self.do_json_request('3/Frames.json/' + key + '/columns/' + column + '/summary',
        timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    """
    Return the summary for a single column for a single Frame in the h2o cluster.
    """
    # offset/len are intentionally not sent (they were commented out)
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, "summary", True)
    request = "3/Frames.json/%s/columns/%s/summary" % (key, column)
    result = self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return all of the models in the h2o cluster, or a single model given its key.
    The models are contained in a list called "models" at the top level of the
    result. Currently the list is unordered.
    TODO: When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    '''
    params_dict = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'models', True)
    request = ('3/Models.json/' + key) if key else '3/Models.json'
    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
def frames(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return a single Frame or all of the Frames in the h2o cluster.

    The frames are contained in a list called "frames" at the top level of
    the result. Currently the list is unordered.
    TODO: When find_compatible_models is implemented then the top level
    dict will also contain a "models" list.
    '''
    # FIX: the docstring used to sit as a dead string literal after params_dict;
    # moved to the top so it is an actual docstring.
    params_dict = {
        'find_compatible_models': 0,
        'offset': 0,  # is offset working yet?
        'len': 5,
    }
    # merge kwargs into params_dict; unknown params are rejected
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'frames', False)

    if key:
        result = self.do_json_request('3/Frames.json/' + key, timeout=timeoutSecs, params=params_dict)
    else:
        result = self.do_json_request('3/Frames.json', timeout=timeoutSecs, params=params_dict)
    return result
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Return a model builder or all of the model builders known to
    the h2o cluster. The model builders are contained in a dictionary
    called "model_builders" at the top level of the result. The
    dictionary maps algorithm names to parameters lists. Each of the
    parameters contains all the metdata required by a client to
    present a model building interface to the user.

    if parameters = True, return the parameters?
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'model_builders', False)
    request = ('2/ModelBuilders.json' + "/" + algo) if algo else '2/ModelBuilders.json'
    result = self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
    # verboseprint(request, "result:", dump_json(result))
    return result
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs): ''' Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out. ''' params_dict = {} # merge kwargs into params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False) start_time = time.time() while True: result = self.do_json_request('2/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict) # print 'Job: ', dump_json(result) if key: frames_result = self.frames(key=key) print 'frames_result for key:', key, dump_json(result) jobs = result['jobs'][0] description = jobs['description'] dest = jobs['dest'] dest_name = dest['name'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] print description, \ "dest_name:", dest_name, \ "\tprogress:", "%-10s" % progress, \ "\tstatus:", "%-12s" % status, \ "\tmsec:", msec if status=='DONE' or status=='CANCELLED' or status=='FAILED': return result # FIX! what are the other legal polling statuses that we should check for? if time.time() - start_time > timeoutSecs: print "Job:", job_key, "timed out in:", timeoutSecs return None time.sleep(retryDelaySecs)
def models(self, key=None, timeoutSecs=10, **kwargs):
    """
    Return all of the models in the h2o cluster, or a single model given its key.
    The models are contained in a list called "models" at the top level of the
    result. Currently the list is unordered.
    TODO: When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    """
    params_dict = {"find_compatible_frames": False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, "models", True)

    request = "3/Models.json"
    if key:
        request = request + "/" + key
    result = self.do_json_request(request, timeout=timeoutSecs, params=params_dict)

    verboseprint("models result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
def parse(self, key, hex_key=None, columnTypeDict=None,
        timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
        noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame (v3 REST API).

    key: one source key string, or a list/tuple of source key strings.
    hex_key: destination frame name (defaults to whatever ParseSetup picks).
    columnTypeDict: optional {col_index_or_name: type} overrides applied to
        the column types returned by ParseSetup.
    noPoll: return the raw Parse result without polling the job.
    intermediateResults: also fetch the destination frame on every poll.
    Returns frames(dest_key) on success; raises on FAILED job or timeout.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_frames': None,
        'destination_frame': hex_key,
        'parse_type': None,  # file type
        'separator': None,
        'single_quotes': None,
        'check_header': None,  # forces first line to be seen as column names
        'number_columns': None,
        'column_names': None,  # a list
        'column_types': None,  # a list. or can use columnTypeDict param (see below)
        'na_strings': None,  # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put double quotes around the individual list items (single not legal)
        source_frames = "[" + ",".join(map((lambda x: '"' + x + '"'), key)) + "]"
    else:
        # what if None here
        source_frames = '["' + key + '"]'  # quotes required on key

    params_dict['source_frames'] = source_frames

    # merge kwargs into params_dict
    # =None overwrites params_dict
    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)
    # Call ParseSetup?source_frames=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_frames': source_frames}
    setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_frames']:
        # should these be quoted?
        source_framesStr = "[" + ",".join([('"%s"' % src['name']) for src in setup_result['source_frames']]) + "]"
    else:
        source_framesStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        # single quotes not legal..need double quotes
        columnNamesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        # single quotes not legal..need double quotes
        naStrings = "[" + ",".join(map((lambda x: '"' + x + '"' if x != None else '""'), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index
                # NOTE(review): 'columnNames' is never defined in this scope (only
                # columnNamesStr exists) — a string key here raises NameError. Confirm
                # whether this should be setup_result['column_names'].
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]"

    parse_params = {
        'source_frames': source_framesStr,
        'destination_frame': setup_result['destination_frame'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_frames is length:", len(parse_params['source_frames'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="3/Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_frame']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ?? h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']
        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))
        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
def parse(self, key, hex_key=None, timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs): ''' Parse an imported raw file or files into a Frame. ''' # these should override what parse setup gets below params_dict = { 'srcs': None, 'hex': hex_key, 'pType': None, # This is a list? 'sep': None, 'ncols': None, 'checkHeader': None, # how is this used 'singleQuotes': None, 'columnNames': None, # list? 'delete_on_done': None, 'blocking': None, } # if key is a list, create a comma separated string # list or tuple but not string if not isinstance(key, basestring): # it's a list of some kind (tuple ok?) # if len(key) > 1: # print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key # len 1 is ok here. 0 not. what if None or [None] here if not key: raise Exception("key seems to be bad in parse. Should be list or string. %s" % key) srcs = "[" + ",".join(key) + "]" else: # what if None here srcs = "[" + key + "]" params_dict['srcs'] = srcs # merge kwargs into params_dict # =None overwrites params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False) # Call ParseSetup?srcs=[keys] . . . 
# if benchmarkLogging: # cloudPerfH2O.get_log_save(initOnly=True) # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True) params_setup = {'srcs': srcs} setup_result = self.do_json_request(jsonRequest="ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup) h2o_sandbox.check_sandbox_for_errors() verboseprint("ParseSetup result:", dump_json(setup_result)) # and then Parse?srcs=<keys list> and params from the ParseSetup result # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON] if setup_result['srcs']: setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs'] ]) + "]" else: setupSrcs = None # I suppose we need a way for parameters to parse() to override these if setup_result['columnNames']: ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]" else: ascii_column_names = None parse_params = { 'srcs': setupSrcs, 'hex': setup_result['hexName'], 'pType': setup_result['pType'], 'sep': setup_result['sep'], 'ncols': setup_result['ncols'], 'checkHeader': setup_result['checkHeader'], 'singleQuotes': setup_result['singleQuotes'], 'columnNames': ascii_column_names, # how come these aren't in setup_result? 'delete_on_done': params_dict['delete_on_done'], 'blocking': params_dict['blocking'], } # HACK: if there are too many column names..don't print! it is crazy output # just check the output of parse setup. Don't worry about columnNames passed as params here. 
tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000 if tooManyColNamesToPrint: h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") h2p.yellow_print("See sandbox/commands.log") # merge params_dict into parse_params # don't want =None to overwrite parse_params h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', print_params=not tooManyColNamesToPrint, ignoreNone=True) print "parse srcs is length:", len(parse_params['srcs']) print "parse columnNames is length:", len(parse_params['columnNames']) # none of the kwargs passed to here! parse_result = self.do_json_request( jsonRequest="Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs) verboseprint("Parse result:", dump_json(parse_result)) job_key = parse_result['job']['name'] hex_key = parse_params['hex'] # TODO: dislike having different shapes for noPoll and poll if noPoll: # ?? h2o_sandbox.check_sandbox_for_errors() return this.jobs(job_key) # does Frame also, while polling if intermediateResults: key = hex_key else: key = None job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] dest_key = dest['name'] # can condition this with a parameter if some FAILED are expected by tests. if status=='FAILED': print dump_json(job_result) raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \ (status, progress, msec, dest_key, description)) return self.frames(dest_key) else: # ? we should always get a job_json result raise Exception("parse didn't get a job_result when it expected one")
def parse(self, key, hex_key=None, columnTypeDict=None,
        timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
        noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame (v2 REST API).

    key: one source key string, or a list/tuple of source key strings.
    hex_key: destination key name (defaults to whatever ParseSetup picks).
    columnTypeDict: optional {col_index_or_name: type} overrides applied to
        the column types returned by ParseSetup.
    noPoll: return the raw Parse result without polling the job.
    intermediateResults: also fetch the destination frame on every poll.
    Returns frames(dest_key) on success; raises on FAILED job or timeout.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_keys': None,
        'destination_key': hex_key,
        'parse_type': None,  # file type
        'separator': None,
        'single_quotes': None,
        'check_header': None,  # forces first line to be seen as column names
        'number_columns': None,
        'column_names': None,  # a list
        'column_types': None,  # a list. or can use columnTypeDict param (see below)
        'na_strings': None,  # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put quotes around the individual list items
        source_keys = "[" + ",".join(map((lambda x: "'" + x + "'"), key)) + "]"
    else:
        # what if None here
        source_keys = "['" + key + "']"  # quotes required on key

    params_dict['source_keys'] = source_keys

    # merge kwargs into params_dict
    # =None overwrites params_dict
    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)
    # Call ParseSetup?source_keys=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_keys': source_keys}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post',
        timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_keys']:
        # should these be quoted?
        source_keysStr = "[" + ",".join([("'%s'" % src['name']) for src in setup_result['source_keys']]) + "]"
    else:
        source_keysStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        columnNamesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        naStrings = "[" + ",".join(map((lambda x: "'" + x + "'" if x != None else "''"), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index
                # NOTE(review): 'columnNames' is never defined in this scope (only
                # columnNamesStr exists) — a string key here raises NameError. Confirm
                # whether this should be setup_result['column_names'].
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), ct)) + "]"

    parse_params = {
        'source_keys': source_keysStr,
        'destination_key': setup_result['destination_key'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_keys is length:", len(parse_params['source_keys'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post',
        postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_key']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ?? h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)
    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']
        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))
        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
def parse(self, key, hex_key=None, timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180, noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs): ''' Parse an imported raw file or files into a Frame. ''' # these should override what parse setup gets below params_dict = { 'srcs': None, 'hex': hex_key, 'pType': None, # This is a list? 'sep': None, 'ncols': None, 'checkHeader': None, # how is this used 'singleQuotes': None, 'columnNames': None, # list? 'delete_on_done': None, 'blocking': None, } # if key is a list, create a comma separated string # list or tuple but not string if not isinstance(key, basestring): # it's a list of some kind (tuple ok?) # if len(key) > 1: # print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key # len 1 is ok here. 0 not. what if None or [None] here if not key: raise Exception( "key seems to be bad in parse. Should be list or string. %s" % key) srcs = "[" + ",".join(key) + "]" else: # what if None here srcs = "[" + key + "]" params_dict['srcs'] = srcs # merge kwargs into params_dict # =None overwrites params_dict h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False) # Call ParseSetup?srcs=[keys] . . . 
# if benchmarkLogging: # cloudPerfH2O.get_log_save(initOnly=True) # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True) params_setup = {'srcs': srcs} setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup) h2o_sandbox.check_sandbox_for_errors() verboseprint("ParseSetup result:", dump_json(setup_result)) # and then Parse?srcs=<keys list> and params from the ParseSetup result # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON] if setup_result['srcs']: setupSrcs = "[" + ",".join( [src['name'] for src in setup_result['srcs']]) + "]" else: setupSrcs = None # I suppose we need a way for parameters to parse() to override these if setup_result['columnNames']: ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]" else: ascii_column_names = None parse_params = { 'srcs': setupSrcs, 'hex': setup_result['hexName'], 'pType': setup_result['pType'], 'sep': setup_result['sep'], 'ncols': setup_result['ncols'], 'checkHeader': setup_result['checkHeader'], 'singleQuotes': setup_result['singleQuotes'], 'columnNames': ascii_column_names, # how come these aren't in setup_result? 'delete_on_done': params_dict['delete_on_done'], 'blocking': params_dict['blocking'], } # HACK: if there are too many column names..don't print! it is crazy output # just check the output of parse setup. Don't worry about columnNames passed as params here. tooManyColNamesToPrint = setup_result['columnNames'] and len( setup_result['columnNames']) > 2000 if tooManyColNamesToPrint: h2p.yellow_print( "Not printing the parameters to Parse because the columnNames are too lengthy." 
) h2p.yellow_print("See sandbox/commands.log") # merge params_dict into parse_params # don't want =None to overwrite parse_params h2o_methods.check_params_update_kwargs( parse_params, params_dict, 'parse after merge into parse setup', print_params=not tooManyColNamesToPrint, ignoreNone=True) print "parse srcs is length:", len(parse_params['srcs']) print "parse columnNames is length:", len(parse_params['columnNames']) # none of the kwargs passed to here! parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs) verboseprint("Parse result:", dump_json(parse_result)) job_key = parse_result['job']['key']['name'] hex_key = parse_params['hex'] # TODO: dislike having different shapes for noPoll and poll if noPoll: # ?? h2o_sandbox.check_sandbox_for_errors() # return self.jobs(job_key) return parse_result # does Frame also, while polling if intermediateResults: key = hex_key else: key = None job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key) if job_result: jobs = job_result['jobs'][0] description = jobs['description'] dest = jobs['dest'] msec = jobs['msec'] status = jobs['status'] progress = jobs['progress'] dest_key = dest['name'] # can condition this with a parameter if some FAILED are expected by tests. if status == 'FAILED': print dump_json(job_result) raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \ (status, progress, msec, dest_key, description)) return self.frames(dest_key) else: # ? we should always get a job_json result raise Exception("parse didn't get a job_result when it expected one")