コード例 #1
0
ファイル: h2o_ray.py プロジェクト: JMR-b/h2o-dev
def frames(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return a single Frame or all of the Frames in the h2o cluster.  The
    frames are contained in a list called "frames" at the top level of the
    result.  Currently the list is unordered.

    key: None (or empty) to request all frames, otherwise a string or a
        Key (from h2o_xl) naming one frame.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters
        (find_compatible_models, offset, len).

    Returns the result of do_json_request.
    Raises Exception if key is neither None, a string nor a Key.

    TODO:
    When find_compatible_models is implemented then the top level 
    dict will also contain a "models" list.
    '''
    if not (key is None or isinstance(key, (basestring, Key))):
        raise Exception("frames: key should be string or Key type %s %s" % (type(key), key))

    params_dict = {
        'find_compatible_models': 0,
        'offset': 0, # is offset working yet?
        'len': 5,
    }
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'frames', False)

    if key:
        # a Key carries the frame name in its .frame attribute
        keyStr = key.frame if isinstance(key, Key) else key
        request = '3/Frames.json/' + keyStr
    else:
        request = '3/Frames.json'

    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
コード例 #2
0
ファイル: h2o_ray.py プロジェクト: patchlog/h2o-3
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Fetch one model builder (when algo is given) or every model builder
    known to the h2o cluster.

    Per the original notes, the builders arrive in a dictionary called
    "model_builders" at the top level of the result, mapping algorithm
    names to parameter lists that carry the metadata a client needs to
    present a model building interface.

    algo: optional algorithm name appended to the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the (initially empty) query parameters.
    '''
    query = {}
    h2o_methods.check_params_update_kwargs(query, kwargs,
                                           'model_builders', False)

    if algo:
        endpoint = '3/ModelBuilders.json' + "/" + algo
    else:
        endpoint = '3/ModelBuilders.json'

    response = self.do_json_request(endpoint,
                                    timeout=timeoutSecs,
                                    params=query)
    # sandbox check runs after the request, before handing the result back
    h2o_sandbox.check_sandbox_for_errors()
    return response
コード例 #3
0
ファイル: h2o_ray.py プロジェクト: patchlog/h2o-3
def frames(self, key=None, timeoutSecs=60, **kwargs):
    '''
    Return a single Frame or all of the Frames in the h2o cluster.  The
    frames are contained in a list called "frames" at the top level of the
    result.  Currently the list is unordered.

    key: None (or empty) to request all frames, otherwise a string or a
        Key (from h2o_xl) naming one frame.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters
        (find_compatible_models, row_offset, row_count).

    Returns the result of do_json_request.
    Raises Exception if key is neither None, a string nor a Key.

    TODO:
    When find_compatible_models is implemented then the top level 
    dict will also contain a "models" list.
    '''
    if not (key is None or isinstance(key, (basestring, Key))):
        raise Exception("frames: key should be string or Key type %s %s" %
                        (type(key), key))

    params_dict = {
        'find_compatible_models': 0,
        'row_offset': 0,  # is offset working yet?
        'row_count': 5,
    }
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'frames',
                                           False)

    if key:
        # a Key carries the frame name in its .frame attribute
        keyStr = key.frame if isinstance(key, Key) else key
        request = '3/Frames.json/' + keyStr
    else:
        request = '3/Frames.json'

    return self.do_json_request(request,
                                timeout=timeoutSecs,
                                params=params_dict)
コード例 #4
0
ファイル: h2o_ray.py プロジェクト: patchlog/h2o-3
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch every model in the h2o cluster, or only the model named by key.

    Per the original notes, the models arrive in an unordered list called
    "models" at the top level of the result.

    key: optional model key appended to the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters
        (find_compatible_frames=False).

    TODO:
    When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    '''
    query = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'models', True)

    endpoint = '3/Models.json'
    if key:
        endpoint = '3/Models.json/' + key

    response = self.do_json_request(endpoint,
                                    timeout=timeoutSecs,
                                    params=query)

    verboseprint("models result:", dump_json(response))
    h2o_sandbox.check_sandbox_for_errors()
    return response
コード例 #5
0
def poll_job(self,
             job_key,
             timeoutSecs=10,
             retryDelaySecs=0.5,
             key=None,
             **kwargs):
    '''
    Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out.

    job_key: job key string appended to the request path.
    timeoutSecs: used both as the per-request timeout and as the overall
        polling budget (ignored for the budget when h2o_args.no_timeout).
    retryDelaySecs: sleep between polls.
    key: optional frame key; when set, the frame is fetched and printed
        on every poll.
    kwargs: merged into the (initially empty) query parameters.

    Returns the last Jobs result once the job reaches a terminal status.
    Raises Exception when the polling budget is exceeded.
    '''
    params_dict = {}
    # merge kwargs into params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job',
                                           False)

    start_time = time.time()
    pollCount = 0
    while True:
        result = self.do_json_request('2/Jobs.json/' + job_key,
                                      timeout=timeoutSecs,
                                      params=params_dict)

        if key:
            frames_result = self.frames(key=key)
            # print the frame we just fetched (previously printed the job result)
            print('frames_result for key: %s %s' %
                  (key, dump_json(frames_result)))

        job = result['jobs'][0]
        status = job['status']
        print("%s dest_name: %s \tprogress: %-10s \tstatus: %-12s \tmsec: %s" %
              (job['description'], job['dest']['name'], job['progress'],
               status, job['msec']))

        if status in ('DONE', 'CANCELLED', 'FAILED'):
            h2o_sandbox.check_sandbox_for_errors()
            return result

        # FIX! what are the other legal polling statuses that we should check for?

        if not h2o_args.no_timeout and (time.time() - start_time >
                                        timeoutSecs):
            h2o_sandbox.check_sandbox_for_errors()
            # build a real message string; a tuple made the exception unreadable
            raise Exception("Job: %s timed out in: %s" %
                            (job_key, timeoutSecs))

        # check every other poll, for now
        if (pollCount % 2) == 0:
            h2o_sandbox.check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        pollCount += 1
コード例 #6
0
ファイル: h2o_ray.py プロジェクト: zxsted/h2o-3
def column(self, key, column, timeoutSecs=10, **kwargs):
    """
    Request the "/columns/<column>" resource of one Frame.

    key: frame key, concatenated into the request path.
    column: column name, concatenated into the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters (offset, len).

    Returns the result of do_json_request.
    """
    query = {"offset": 0, "len": 100}
    h2o_methods.check_params_update_kwargs(query, kwargs, "column", True)

    endpoint = "3/Frames.json/" + key + "/columns/" + column
    return self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
コード例 #7
0
ファイル: h2o_ray.py プロジェクト: krishnatray/h2o-dev
def column(self, key, column, timeoutSecs=10, **kwargs):
    '''
    Request the "/columns/<column>" resource of one Frame.

    key and column are concatenated into the request path; kwargs are
    merged into the default query parameters (offset, len).
    Returns the result of do_json_request.
    '''
    query = {'offset': 0, 'len': 100}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'column', True)

    frame_path = '3/Frames.json/' + key
    endpoint = frame_path + '/columns/' + column
    response = self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
    return response
コード例 #8
0
ファイル: h2o_ray.py プロジェクト: patchlog/h2o-3
def column(self, key, column, timeoutSecs=10, **kwargs):
    '''
    Request the "/columns/<column>" resource of one Frame.

    key: frame key, concatenated into the request path.
    column: column name, concatenated into the request path.
    kwargs: merged into the default query parameters (offset, len).
    '''
    defaults = {
        'offset': 0,
        'len': 100,
    }
    h2o_methods.check_params_update_kwargs(defaults, kwargs, 'column', True)

    endpoint = '3/Frames.json/' + key + '/columns/' + column
    return self.do_json_request(endpoint,
                                timeout=timeoutSecs,
                                params=defaults)
コード例 #9
0
ファイル: h2o_ray.py プロジェクト: zxsted/h2o-3
def columns(self, key, timeoutSecs=10, **kwargs):
    """
    Return the columns for a single Frame in the h2o cluster.

    key: frame key, concatenated into the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters (offset, len).
    """
    query = {
        "offset": 0,
        "len": 100,
    }
    h2o_methods.check_params_update_kwargs(query, kwargs, "columns", True)

    endpoint = "3/Frames.json/" + key + "/columns"
    return self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
コード例 #10
0
ファイル: h2o_ray.py プロジェクト: zxsted/h2o-3
def jobs(self, job_key=None, timeoutSecs=10, **kwargs):
    """
    Fetch all the jobs or a single job from the /Jobs endpoint.

    job_key: when given, it is appended to the request path (the same
        "3/Jobs.json/<job_key>" form used for polling a job) so only that
        job is requested; when None or empty, all jobs are requested.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the (initially empty) query parameters.

    Returns the result of do_json_request.
    """
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, "jobs", True)

    request = "3/Jobs.json"
    if job_key:
        # previously job_key was accepted but silently ignored
        request += "/" + job_key

    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
コード例 #11
0
ファイル: h2o_ray.py プロジェクト: krishnatray/h2o-dev
def jobs(self, job_key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch all the jobs or a single job from the /Jobs endpoint.

    job_key: sent as the 'job_key' query parameter (None by default).
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the query parameters.
    '''
    query = dict(job_key=job_key)
    h2o_methods.check_params_update_kwargs(query, kwargs, 'jobs', True)
    return self.do_json_request('2/Jobs.json', timeout=timeoutSecs, params=query)
コード例 #12
0
def jobs(self, job_key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch all the jobs or a single job from the /Jobs endpoint.

    job_key: when given, it is appended to the request path
        ('3/Jobs.json/<job_key>') so only that job is requested; when
        None or empty, all jobs are requested.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the (initially empty) query parameters.

    Returns the result of do_json_request.
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'jobs', True)

    request = '3/Jobs.json'
    if job_key:
        # previously job_key was accepted but silently ignored
        request += '/' + job_key

    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
コード例 #13
0
ファイル: h2o_ray.py プロジェクト: AllCodeNoGyaan/h2o-3
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs):
    '''
    Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out.

    job_key: job key string appended to the request path.
    timeoutSecs: used both as the per-request timeout and as the overall
        polling budget (ignored for the budget when h2o_args.no_timeout).
    retryDelaySecs: sleep between polls.
    key: optional frame key; when set, the frame is fetched and printed
        on every poll.
    kwargs: merged into the (initially empty) query parameters.

    Returns the last Jobs result once the job reaches a terminal status.
    Raises Exception when the polling budget is exceeded.
    '''
    params_dict = {}
    # merge kwargs into params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False)

    start_time = time.time()
    pollCount = 0
    while True:
        result = self.do_json_request('3/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict)

        if key:
            frames_result = self.frames(key=key)
            # print the frame we just fetched (previously printed the job result)
            print('frames_result for key: %s %s' % (key, dump_json(frames_result)))

        job = result['jobs'][0]
        status = job['status']
        print("%s dest_name: %s \tprogress: %-10s \tstatus: %-12s \tmsec: %s" %
              (job['description'], job['dest']['name'], job['progress'], status, job['msec']))

        if status in ('DONE', 'CANCELLED', 'FAILED'):
            h2o_sandbox.check_sandbox_for_errors()
            return result

        # what about 'CREATED'
        # FIX! what are the other legal polling statuses that we should check for?

        if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs):
            h2o_sandbox.check_sandbox_for_errors()

            # for debug
            a = h2o.nodes[0].get_cloud()
            print("cloud.json: %s" % dump_json(a))
            # build a real message string; a tuple made the exception unreadable
            raise Exception("Job: %s timed out in: %s" % (job_key, timeoutSecs))

        # check every other poll, for now
        if (pollCount % 2) == 0:
            h2o_sandbox.check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        pollCount += 1
コード例 #14
0
ファイル: h2o_ray.py プロジェクト: narayana1208/h2o-dev
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Return the summary for a single column for a single Frame in the h2o cluster.

    key: frame key, formatted into the request path.
    column: column name, formatted into the request path (default "C1").
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters (offset, len).
    '''
    query = {'offset': 0, 'len': 100}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'summary', True)

    endpoint = '3/Frames.json/%s/columns/%s/summary' % (key, column)
    return self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
コード例 #15
0
ファイル: h2o_ray.py プロジェクト: patchlog/h2o-3
def columns(self, key, timeoutSecs=10, **kwargs):
    '''
    Return the columns for a single Frame in the h2o cluster.

    key: frame key, concatenated into the request path.
    kwargs: merged into the default query parameters (offset, len).
    Returns the result of do_json_request.
    '''
    query = dict(offset=0, len=100)
    h2o_methods.check_params_update_kwargs(query, kwargs, 'columns',
                                           True)

    endpoint = '3/Frames.json/' + key + '/columns'
    response = self.do_json_request(endpoint,
                                    timeout=timeoutSecs,
                                    params=query)
    return response
コード例 #16
0
ファイル: h2o_ray.py プロジェクト: krishnatray/h2o-dev
def columns(self, key, timeoutSecs=10, **kwargs):
    '''
    Return the columns for a single Frame in the h2o cluster.

    key: frame key, concatenated into the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters (offset, len).
    '''
    query = {'offset': 0, 'len': 100}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'columns', True)

    frame_path = '3/Frames.json/' + key
    return self.do_json_request(frame_path + '/columns', timeout=timeoutSecs, params=query)
コード例 #17
0
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Return the summary for a single column for a single Frame in the h2o cluster.

    key: frame key, formatted into the request path.
    column: column name, formatted into the request path (default "C1").
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the (initially empty) query parameters.
    '''
    query = {}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'summary', True)

    endpoint = '3/Frames.json/%s/columns/%s/summary' % (key, column)
    response = self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
    # sandbox check runs after the request, before handing the result back
    h2o_sandbox.check_sandbox_for_errors()
    return response
コード例 #18
0
ファイル: h2o_ray.py プロジェクト: zxsted/h2o-3
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    """
    Return the summary for a single column for a single Frame in the h2o cluster.

    key and column are formatted into the request path; kwargs are merged
    into the (initially empty) query parameters.  The sandbox is checked
    for errors before the result of do_json_request is returned.
    """
    query = dict()
    h2o_methods.check_params_update_kwargs(query, kwargs, "summary", True)

    path_parts = (key, column)
    endpoint = "3/Frames.json/%s/columns/%s/summary" % path_parts
    response = self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
    h2o_sandbox.check_sandbox_for_errors()
    return response
コード例 #19
0
ファイル: h2o_ray.py プロジェクト: ansonism/h2o-dev
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch every model in the h2o cluster, or only the model named by key.

    Per the original notes, the models arrive in an unordered list called
    "models" at the top level of the result.

    key: optional model key appended to the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters
        (find_compatible_frames=False).

    TODO:
    When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    '''
    query = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'models', True)

    endpoint = '3/Models.json'
    if key:
        endpoint = '3/Models.json/' + key
    return self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
コード例 #20
0
ファイル: h2o_ray.py プロジェクト: krishnatray/h2o-dev
def frames(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Return a single Frame or all of the Frames in the h2o cluster.  The
    frames are contained in a list called "frames" at the top level of the
    result.  Currently the list is unordered.

    key: None (or empty) to request all frames, otherwise the frame key,
        which is concatenated into the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters
        (find_compatible_models, offset, len).

    Returns the result of do_json_request.

    TODO:
    When find_compatible_models is implemented then the top level 
    dict will also contain a "models" list.
    '''
    params_dict = {
        'find_compatible_models': 0,
        'offset': 0, # is offset working yet?
        'len': 5,
    }
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'frames', False)

    if key:
        request = '3/Frames.json/' + key
    else:
        request = '3/Frames.json'

    return self.do_json_request(request, timeout=timeoutSecs, params=params_dict)
コード例 #21
0
ファイル: h2o_ray.py プロジェクト: narayana1208/h2o-dev
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Fetch one model builder (when algo is given) or every model builder
    known to the h2o cluster.

    Per the original notes, the builders arrive in a dictionary called
    "model_builders" at the top level of the result, mapping algorithm
    names to parameter lists that carry the metadata a client needs to
    present a model building interface.

    algo: optional algorithm name appended to the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the (initially empty) query parameters.
    '''
    query = {}
    h2o_methods.check_params_update_kwargs(query, kwargs, 'model_builders', False)

    if algo:
        endpoint = '2/ModelBuilders.json' + "/" + algo
    else:
        endpoint = '2/ModelBuilders.json'

    return self.do_json_request(endpoint, timeout=timeoutSecs, params=query)
コード例 #22
0
ファイル: h2o_ray.py プロジェクト: narayana1208/h2o-dev
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs):
    '''
    Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out.

    job_key: job key string appended to the request path.
    timeoutSecs: used both as the per-request timeout and as the overall
        polling budget.
    retryDelaySecs: sleep between polls.
    key: optional frame key; when set, the frame is fetched and printed
        on every poll.
    kwargs: merged into the (initially empty) query parameters.

    Returns the last Jobs result once the job reaches a terminal status,
    or None (after printing a message) when the polling budget is exceeded.
    '''
    params_dict = {}
    # merge kwargs into params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False)

    start_time = time.time()
    while True:
        result = self.do_json_request('2/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict)

        if key:
            frames_result = self.frames(key=key)
            # print the frame we just fetched (previously printed the job result)
            print('frames_result for key: %s %s' % (key, dump_json(frames_result)))

        job = result['jobs'][0]
        status = job['status']
        print("%s dest_name: %s \tprogress: %-10s \tstatus: %-12s \tmsec: %s" %
              (job['description'], job['dest']['name'], job['progress'], status, job['msec']))

        if status in ('DONE', 'CANCELLED', 'FAILED'):
            return result

        # FIX! what are the other legal polling statuses that we should check for?

        if time.time() - start_time > timeoutSecs:
            print("Job: %s timed out in: %s" % (job_key, timeoutSecs))
            return None

        time.sleep(retryDelaySecs)
コード例 #23
0
ファイル: h2o_ray.py プロジェクト: zxsted/h2o-3
def models(self, key=None, timeoutSecs=10, **kwargs):
    """
    Fetch every model in the h2o cluster, or only the model named by key.

    Per the original notes, the models arrive in an unordered list called
    "models" at the top level of the result.

    key: optional model key appended to the request path.
    timeoutSecs: timeout handed to do_json_request.
    kwargs: merged into the default query parameters
        (find_compatible_frames=False).

    TODO:
    When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    """
    query = dict(find_compatible_frames=False)
    h2o_methods.check_params_update_kwargs(query, kwargs, "models", True)

    endpoint = "3/Models.json/" + key if key else "3/Models.json"
    response = self.do_json_request(endpoint, timeout=timeoutSecs, params=query)

    verboseprint("models result:", dump_json(response))
    h2o_sandbox.check_sandbox_for_errors()
    return response
コード例 #24
0
ファイル: h2o_ray.py プロジェクト: patchlog/h2o-3
def parse(self,
          key,
          hex_key=None,
          columnTypeDict=None,
          timeoutSecs=300,
          retryDelaySecs=0.2,
          initialDelaySecs=None,
          pollTimeoutSecs=180,
          noise=None,
          benchmarkLogging=None,
          noPoll=False,
          intermediateResults=False,
          **kwargs):
    '''
    Parse an imported raw file or files into a Frame.

    Posts ParseSetup for the source key(s), overlays caller-supplied
    parameters on the setup result, posts Parse and (unless noPoll) polls
    the parse job until it finishes.

    key: a source key string, or a non-empty list/tuple of them.
    hex_key: requested destination frame name (None keeps the value
        returned by ParseSetup).
    columnTypeDict: optional {column index or column name: type}
        overrides applied to the column types returned by ParseSetup.
    timeoutSecs: timeout for each request and for polling the job.
    noPoll: if True, return the raw Parse result without polling.
    intermediateResults: if True, the destination frame key is passed to
        poll_job so the frame is fetched while polling.
    kwargs: parse parameters; non-None values override the ParseSetup
        values.
    retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
    benchmarkLogging: accepted but not used by this function.

    Returns the Parse result when noPoll is set, otherwise the result of
    self.frames() for the job's destination.
    Raises Exception on a bad key, a bad columnTypeDict entry, a FAILED
    parse job, or when polling returns no job result.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_frames': None,
        'destination_frame': hex_key,
        'parse_type': None,  # file type
        'separator': None,
        'single_quotes': None,
        'check_header': None,  # forces first line to be seen as column names
        'number_columns': None,
        'column_names': None,  # a list
        'column_types': None,  # a list. or can use columnTypeDict param (see below)
        'na_strings': None,  # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            # (key,) so that an empty tuple is formatted instead of
            # being consumed as the format arguments
            raise Exception(
                "key seems to be bad in parse. Should be list or string. %s" %
                (key,))
        # have to put double quotes around the individual list items (single not legal)
        source_frames = "[" + ",".join(['"' + x + '"' for x in key]) + "]"
    else:
        # what if None here
        source_frames = '["' + key + '"]'  # quotes required on key

    params_dict['source_frames'] = source_frames

    # merge kwargs into params_dict
    # =None overwrites params_dict
    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict,
                                           kwargs,
                                           'parse before setup merge',
                                           print_params=False)

    # Call ParseSetup?source_frames=[keys] . . .
    params_setup = {'source_frames': source_frames}
    setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json",
                                        cmd='post',
                                        timeout=timeoutSecs,
                                        postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_frames']:
        # should these be quoted?
        source_framesStr = "[" + ",".join(
            ['"%s"' % src['name'] for src in setup_result['source_frames']]) + "]"
    else:
        source_framesStr = None

    # ParseSetup may return no column names; treat that as an empty list
    column_names = setup_result['column_names'] or []

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if column_names:
        # single quotes not legal..need double quotes
        columnNamesStr = "[" + ",".join(
            ['"' + x + '"' for x in column_names]) + "]"
    else:
        columnNamesStr = None

    # NOTE: ct aliases the list inside setup_result, so the overrides
    # below modify setup_result['column_types'] in place
    ct = setup_result['column_types']
    assert ct is not None, "%s %s" % ("column_types:", ct)

    if setup_result['na_strings']:
        # single quotes not legal..need double quotes
        naStrings = "[" + ",".join(
            ['"' + x + '"' if x is not None else '""'
             for x in setup_result['na_strings']]) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    if columnTypeDict:
        for k, v in columnTypeDict.items():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception(
                        "bad col index %s in columnTypeDict param %s" %
                        (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index among the names ParseSetup returned
                # (this used an undefined name and raised NameError)
                if k not in column_names:
                    raise Exception(
                        "bad col name %s in columnTypeDict param %s. columnNames: %s"
                        % (k, columnTypeDict, column_names))
                ct[column_names.index(k)] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(['"' + x + '"' for x in ct]) + "]"

    parse_params = {
        'source_frames': source_framesStr,
        'destination_frame': setup_result['destination_frame'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = len(column_names) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print(
            "Not printing the parameters to Parse because the columnNames are too lengthy."
        )
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(
        parse_params,
        params_dict,
        'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint,
        ignoreNone=True)

    print("parse source_frames is length: %s" %
          len(parse_params['source_frames']))
    # This can be null now? parseSetup doesn't return default colnames?

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="3/Parse.json",
                                        cmd='post',
                                        postData=parse_params,
                                        timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_frame']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return parse_result

    # does Frame also, while polling
    key = hex_key if intermediateResults else None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if not job_result:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")

    jobs = job_result['jobs'][0]
    description = jobs['description']
    dest = jobs['dest']
    msec = jobs['msec']
    status = jobs['status']
    progress = jobs['progress']
    dest_key = dest['name']

    # can condition this with a parameter if some FAILED are expected by tests.
    if status == 'FAILED':
        print(dump_json(job_result))
        raise Exception("Taking exception on parse job status: %s %s %s %s %s" %
                        (status, progress, msec, dest_key, description))

    return self.frames(dest_key)
コード例 #25
0
ファイル: h2o_ray.py プロジェクト: krishnatray/h2o-dev
def parse(self, key, hex_key=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.

    Posts ParseSetup for the source key(s), overlays caller-supplied
    parameters on the setup result, posts Parse and (unless noPoll) polls
    the parse job until it finishes.

    key: a source key string, or a non-empty list/tuple of them.
    hex_key: requested destination name (None keeps the value returned
        by ParseSetup).
    timeoutSecs: timeout for each request and for polling the job.
    noPoll: if True, return self.jobs(job_key) without polling.
    intermediateResults: if True, the destination key is passed to
        poll_job so the frame is fetched while polling.
    kwargs: parse parameters; non-None values override the ParseSetup
        values.
    retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise,
    benchmarkLogging: accepted but not used by this function.

    Returns the jobs() result when noPoll is set, otherwise the result of
    self.frames() for the job's destination.
    Raises Exception on a bad key, a FAILED parse job, or when polling
    returns no job result.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None, # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None, # how is this used
        'singleQuotes': None,
        'columnNames': None, # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            # (key,) so that an empty tuple is formatted instead of
            # being consumed as the format arguments
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % (key,))
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?srcs=[keys] . . .
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]

    if setup_result['srcs']:
        setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs']]) + "]"
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print("parse srcs is length: %s" % len(parse_params['srcs']))
    # columnNames stays None when ParseSetup returned no names and the
    # caller supplied none; len(None) used to raise TypeError here
    if parse_params['columnNames'] is not None:
        print("parse columnNames is length: %s" % len(parse_params['columnNames']))

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        # was `this.jobs(...)`, an undefined name that raised NameError
        return self.jobs(job_key)

    # does Frame also, while polling
    key = hex_key if intermediateResults else None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if not job_result:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")

    jobs = job_result['jobs'][0]
    description = jobs['description']
    dest = jobs['dest']
    msec = jobs['msec']
    status = jobs['status']
    progress = jobs['progress']
    dest_key = dest['name']

    # can condition this with a parameter if some FAILED are expected by tests.
    if status == 'FAILED':
        print(dump_json(job_result))
        raise Exception("Taking exception on parse job status: %s %s %s %s %s" %
                        (status, progress, msec, dest_key, description))

    return self.frames(dest_key)
コード例 #26
0
ファイル: h2o_ray.py プロジェクト: bikash/h2o-dev
def parse(self, key, hex_key=None, columnTypeDict=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.

    Posts the source keys to 2/ParseSetup.json, builds the 2/Parse.json
    parameters from the setup result (overridden by any non-None values
    supplied here), posts the parse and, unless noPoll is set, polls the job
    and returns the resulting frame.

    key: a source key string, or a non-empty list/tuple of source key strings.
    hex_key: requested destination key ('destination_key' parameter).
    columnTypeDict: optional dict overriding the column types guessed by
        ParseSetup. Each dict key is either a column index (int) or a column
        name (string); each value is the column type to use.
    timeoutSecs: timeout given to both requests and to poll_job.
    retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging:
        accepted for signature compatibility; not used in this function.
    noPoll: if true, return the Parse.json response without polling the job.
    intermediateResults: if true, the destination key is passed to poll_job.
    kwargs: extra parse parameters, merged into the defaults by
        h2o_methods.check_params_update_kwargs.

    Returns the Parse.json response when noPoll is set, otherwise the result
    of self.frames() on the job's destination key.

    Raises Exception on a bad key, a bad columnTypeDict entry, a FAILED parse
    job, or a missing job result.
    '''
    def quoted(items):
        # "['a','b']" form: each item individually quoted. Items must be strings.
        return "[" + ",".join(["'" + item + "'" for item in items]) + "]"

    # these should override what parse setup gets below
    params_dict = {
        'source_keys': None,
        'destination_key': hex_key,
        'parse_type': None,  # file type
        'separator': None,
        'single_quotes': None,
        'check_header': None,  # forces first line to be seen as column names
        'number_columns': None,
        'column_names': None,  # a list
        'column_types': None,  # a list. or can use columnTypeDict param (see below)
        'na_strings': None,  # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # key is either a single string or a list/tuple of strings.
    if not isinstance(key, basestring):
        # len 1 is ok here. 0 (or None) is not.
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        source_keys = quoted(key)
    else:
        source_keys = quoted([key])

    params_dict['source_keys'] = source_keys

    # merge kwargs into params_dict; here =None does overwrite params_dict
    # (columnTypeDict is not part of this merge)
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # ParseSetup only gets the source keys
    params_setup = {'source_keys': source_keys}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_keys']:
        source_keysStr = "[" + ",".join([("'%s'" % src['name']) for src in setup_result['source_keys']]) + "]"
    else:
        source_keysStr = None

    # ParseSetup may return no column names; keep a list for the lookups below.
    columnNames = setup_result['column_names'] or []
    if columnNames:
        columnNamesStr = quoted(columnNames)
    else:
        columnNamesStr = None

    ct = setup_result['column_types']
    assert ct is not None, "%s %s" % ("column_types:", ct)

    if setup_result['na_strings']:
        # a None entry is sent as an empty quoted string
        naStrings = "[" + ",".join(
            [("'" + x + "'") if x is not None else "''" for x in setup_result['na_strings']]) + "]"
    else:
        naStrings = None

    # Apply the caller's per-column type overrides on top of what ParseSetup guessed.
    if columnTypeDict:
        for k, v in columnTypeDict.items():
            if isinstance(k, int):
                # a column index
                if 0 <= k < len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            elif isinstance(k, basestring):
                # a column name: resolve it against the names ParseSetup returned
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ct[columnNames.index(k)] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = quoted(ct)

    parse_params = {
        'source_keys': source_keysStr,
        'destination_key': setup_result['destination_key'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = len(columnNames) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.")
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print("parse source_keys is length: %s" % len(parse_params['source_keys']))
    # column_names can be None here, so its length is deliberately not printed

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_key']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return parse_result

    # poll_job also looks at the Frame while polling when it is given a key
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if not job_result:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")

    jobs = job_result['jobs'][0]
    description = jobs['description']
    dest = jobs['dest']
    msec = jobs['msec']
    status = jobs['status']
    progress = jobs['progress']
    dest_key = dest['name']

    # can condition this with a parameter if some FAILED are expected by tests.
    if status == 'FAILED':
        print(dump_json(job_result))
        raise Exception("Taking exception on parse job status: %s %s %s %s %s" %
            (status, progress, msec, dest_key, description))

    return self.frames(dest_key)
コード例 #27
0
def parse(self,
          key,
          hex_key=None,
          timeoutSecs=300,
          retryDelaySecs=0.2,
          initialDelaySecs=None,
          pollTimeoutSecs=180,
          noise=None,
          benchmarkLogging=None,
          noPoll=False,
          intermediateResults=False,
          **kwargs):
    '''
    Parse an imported raw file or files into a Frame.

    Posts the source keys to 2/ParseSetup.json, builds the 2/Parse.json
    parameters from the setup result (overridden by any non-None values
    supplied here), posts the parse and, unless noPoll is set, polls the job
    and returns the resulting frame.

    key: a source key string, or a non-empty list/tuple of source key strings.
    hex_key: requested destination key ('hex' parameter).
    timeoutSecs: timeout given to both requests and to poll_job.
    retryDelaySecs, initialDelaySecs, pollTimeoutSecs, noise, benchmarkLogging:
        accepted for signature compatibility; not used in this function.
    noPoll: if true, return the Parse.json response without polling the job.
    intermediateResults: if true, the destination key is passed to poll_job.
    kwargs: extra parse parameters, merged into the defaults by
        h2o_methods.check_params_update_kwargs.

    Returns the Parse.json response when noPoll is set, otherwise the result
    of self.frames() on the job's destination key.

    Raises Exception on a bad key, a FAILED parse job, or a missing job result.
    '''
    def bracketed(items):
        # "[a,b]" form: comma separated, no quoting of the individual items
        return "[" + ",".join(items) + "]"

    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None,  # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None,  # how is this used
        'singleQuotes': None,
        'columnNames': None,  # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # key is either a single string or a list/tuple of strings.
    if not isinstance(key, basestring):
        # len 1 is ok here. 0 (or None) is not.
        if not key:
            raise Exception(
                "key seems to be bad in parse. Should be list or string. %s" %
                key)
        srcs = bracketed(key)
    else:
        srcs = bracketed([key])

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict; here =None does overwrite params_dict
    h2o_methods.check_params_update_kwargs(params_dict,
                                           kwargs,
                                           'parse before setup merge',
                                           print_params=False)

    # ParseSetup only gets the source keys
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json",
                                        cmd='post',
                                        timeout=timeoutSecs,
                                        postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]

    if setup_result['srcs']:
        setupSrcs = bracketed([src['name'] for src in setup_result['srcs']])
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these
    setup_column_names = setup_result['columnNames']
    if setup_column_names:
        ascii_column_names = bracketed(setup_column_names)
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }

    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_column_names and len(
        setup_column_names) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print(
            "Not printing the parameters to Parse because the columnNames are too lengthy."
        )
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(
        parse_params,
        params_dict,
        'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint,
        ignoreNone=True)

    # Diagnostic only. columnNames stays None when ParseSetup returned no
    # names and the caller gave none, so skip None instead of crashing on
    # len(None) before the Parse request is even sent.
    for param_name in ('srcs', 'columnNames'):
        param_value = parse_params[param_name]
        if param_value is not None:
            print("parse %s is length: %s" % (param_name, len(param_value)))

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json",
                                        cmd='post',
                                        postData=parse_params,
                                        timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return parse_result

    # poll_job also looks at the Frame while polling when it is given a key
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if not job_result:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")

    jobs = job_result['jobs'][0]
    description = jobs['description']
    dest = jobs['dest']
    msec = jobs['msec']
    status = jobs['status']
    progress = jobs['progress']
    dest_key = dest['name']

    # can condition this with a parameter if some FAILED are expected by tests.
    if status == 'FAILED':
        print(dump_json(job_result))
        raise Exception("Taking exception on parse job status: %s %s %s %s %s" %
            (status, progress, msec, dest_key, description))

    return self.frames(dest_key)