Beispiel #1
0
def rapids(self, timeoutSecs=120, ignoreH2oError=False, **kwargs):
    '''
    Execute a Rapids expression on the h2o cluster via POST to Rapids.json.

    Exactly one of the 'ast' or 'funs' kwargs is expected (runExec only
    does one or the other).  Raises if h2o reports an exception, unless
    ignoreH2oError is True.
    '''
    # FIX! assume both of these are strings for now, not lists
    if 'ast' in kwargs and kwargs['ast'] is not None:
        assert isinstance(kwargs['ast'], basestring), "only string assumed? %s" % kwargs['ast']
    if 'funs' in kwargs and kwargs['funs'] is not None:
        assert isinstance(kwargs['funs'], basestring), "only string assumed? %s" % kwargs['funs']

    # currently runExec only does one or the other
    params_dict = {
        'ast': None,
        'funs': None,
    }

    check_params_update_kwargs(params_dict, kwargs, 'rapids', True)
    # bug fix: this was wrapped in 'if 1==1:' with an unreachable GET branch
    # in the else; Rapids is always POSTed, so the dead code is removed.
    result = self.do_json_request('Rapids.json', cmd='post', timeout=timeoutSecs, postData=params_dict)

    verboseprint("rapids result:", dump_json(result))

    # FIX! maybe add something for ignoring conditionally?
    if 'exception' in result and result['exception'] and not ignoreH2oError:
        exception = result['exception']
        raise Exception('rapids with kwargs:\n%s\ngot exception:\n"%s"\n' % (dump_json(kwargs), exception))

    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #2
0
def model_metrics(self, timeoutSecs=60, **kwargs):
    '''
    Fetch the full list of ModelMetrics known to the h2o cluster.
    '''
    result = self.do_json_request('/3/ModelMetrics.json',
                                  cmd='get',
                                  timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #3
0
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Fetch one model builder (when algo is given) or all model builders
    known to the h2o cluster.

    The model builders live in a top-level "model_builders" dictionary
    of the result, mapping algorithm names to parameter lists; each
    parameter carries the metadata a client needs to render a
    model-building UI.

    if parameters = True, return the parameters?
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs,
                                           'model_builders', False)

    endpoint = '3/ModelBuilders.json'
    if algo:
        endpoint = endpoint + "/" + algo

    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #4
0
def split_frame(self, timeoutSecs=120, noPoll=False, **kwargs):
    params_dict = {
        'dataset': None,
        'ratios': None,
        'destKeys': None,  # ['bigger', 'smaller']
    }
    check_params_update_kwargs(params_dict,
                               kwargs,
                               'split_frame',
                               print_params=True)
    firstResult = self.do_json_request('3/SplitFrame.json',
                                       cmd='post',
                                       timeout=timeoutSecs,
                                       params=params_dict)
    print "firstResult:", dump_json(firstResult)
    # FIX! what is ['dest']['name'] ..It's not there at the beginning?
    job_key = firstResult['key']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return firstResult

    # is it polllable while it's in the CREATED state? msec looks wrong. start_time is 0
    time.sleep(2)
    result = self.poll_job(job_key)
    verboseprint("split_frame result:", dump_json(result))
    return result
Beispiel #5
0
def rapids_iseval(self, timeoutSecs=120, ignoreH2oError=False, **kwargs):
    '''
    GET 3/Rapids.json/isEval for the given 'ast_key' kwarg.

    Raises if h2o reports an exception, unless ignoreH2oError is True.
    '''
    # FIX! assume both of these are strings for now, not lists
    if 'ast_key' in kwargs and kwargs['ast_key'] is not None:
        assert isinstance(
            kwargs['ast_key'],
            basestring), "only string assumed? %s" % kwargs['ast_key']

    # currently runExec only does one or the other
    params_dict = {
        'ast_key': None,
    }

    check_params_update_kwargs(params_dict, kwargs, 'rapids_iseval', True)
    # doesn't like 'put' here?
    # doesn't like empty key
    result = self.do_json_request('3/Rapids.json/isEval',
                                  cmd='get',
                                  timeout=timeoutSecs,
                                  params=params_dict)
    verboseprint("rapids_iseval result:", dump_json(result))

    # FIX! maybe add something for ignoring conditionally?
    if 'exception' in result and result['exception'] and not ignoreH2oError:
        exception = result['exception']
        # bug fix: message said 'rapids' although this is rapids_iseval
        raise Exception('rapids_iseval with kwargs:\n%s\ngot exception:\n"%s"\n' %
                        (dump_json(kwargs), exception))

    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #6
0
def model_metrics(self, timeoutSecs=60, **kwargs):
    # Retrieve the ModelMetrics list from the cluster with a plain GET.
    result = self.do_json_request(
        "/3/ModelMetrics.json", cmd="get", timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #7
0
def compute_model_metrics(self, model, frame, timeoutSecs=60, **kwargs):
    """
    Score a model on the h2o cluster on the given Frame and return only
    the model metrics (the first entry of the result's "model_metrics"
    list).
    """
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    # sanity check: the model we were asked to score must exist on the cluster
    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    # NOTE(review): the check reads ['model_id']['name'] but the message reads
    # ['key']['name'] -- confirm which schema key is current for this API version
    assert (
        models["models"][0]["model_id"]["name"] == model
    ), "/Models/{0} returned Model {1} rather than Model {2}".format(model, models["models"][0]["key"]["name"], model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)

    print "frames:", dump_json(frames)
    # is the name not there?
    # assert frames['frames'][0]['model_id']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, models['models'][0]['key']['name'], frame)

    # POST triggers the scoring; the metrics come back in the response
    result = self.do_json_request(
        "/3/ModelMetrics.json/models/" + model + "/frames/" + frame, cmd="post", timeout=timeoutSecs
    )

    mm = result["model_metrics"][0]
    verboseprint("model metrics: " + repr(mm))
    h2o_sandbox.check_sandbox_for_errors()
    return mm
Beispiel #8
0
def compute_model_metrics(self, model, frame, timeoutSecs=60, **kwargs):
    '''
    Score a model on the h2o cluster on the given Frame and return only
    the model metrics (the first entry of the result's "model_metrics"
    list).
    '''
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    # sanity check: the model we were asked to score must exist on the cluster
    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    # NOTE(review): the check reads ['model_id']['name'] but the message reads
    # ['key']['name'] -- confirm which schema key is current for this API version
    assert models['models'][0]['model_id'][
        'name'] == model, "/Models/{0} returned Model {1} rather than Model {2}".format(
            model, models['models'][0]['key']['name'], model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)

    print "frames:", dump_json(frames)
    # is the name not there?
    # assert frames['frames'][0]['model_id']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, models['models'][0]['key']['name'], frame)

    # POST triggers the scoring; the metrics come back in the response
    result = self.do_json_request('/3/ModelMetrics.json/models/' + model +
                                  '/frames/' + frame,
                                  cmd='post',
                                  timeout=timeoutSecs)

    mm = result['model_metrics'][0]
    verboseprint("model metrics: " + repr(mm))
    h2o_sandbox.check_sandbox_for_errors()
    return mm
Beispiel #9
0
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch every model in the h2o cluster, or a single model by key.
    The models come back in an unordered "models" list at the top level
    of the result.
    TODO:
    When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    '''
    params_dict = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'models', True)

    # one endpoint for all models, or key-suffixed for a single one
    endpoint = '3/Models.json/' + key if key else '3/Models.json'
    result = self.do_json_request(endpoint,
                                  timeout=timeoutSecs,
                                  params=params_dict)

    verboseprint("models result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #10
0
def model_metrics(self, timeoutSecs=60, **kwargs):
    '''
    Return the ModelMetrics list for the whole cluster.
    '''
    result = self.do_json_request('/3/ModelMetrics.json',
                                  cmd='get',
                                  timeout=timeoutSecs)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #11
0
def import_files(self, path, timeoutSecs=180):
    """
    Import a file (or a whole directory of files) into h2o.
    192.168.0.37:54323/ImportFiles.html?file=%2Fhome%2F0xdiag%2Fdatasets
    """
    result = self.do_json_request("3/ImportFiles.json",
                                  timeout=timeoutSecs,
                                  params={"path": path})
    verboseprint("\nimport_files result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #12
0
def import_files(self, path, timeoutSecs=180):
    '''
    Import a file (or a whole directory of files) into h2o.
    192.168.0.37:54323/ImportFiles.html?file=%2Fhome%2F0xdiag%2Fdatasets
    '''
    reply = self.do_json_request('3/ImportFiles.json', timeout=timeoutSecs, params={"path": path})
    verboseprint("\nimport_files result:", dump_json(reply))
    h2o_sandbox.check_sandbox_for_errors()
    return reply
Beispiel #13
0
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Fetch the summary for a single column of a single Frame in the h2o cluster.
    '''
    params_dict = {
        # 'offset': 0,
        # 'len': 100
    }
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'summary', True)

    endpoint = '3/Frames.json/%s/columns/%s/summary' % (key, column)
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #14
0
def summary(self, key, column="C1", timeoutSecs=10, **kwargs):
    '''
    Fetch the summary for a single column of a single Frame in the h2o cluster.
    '''
    params_dict = {'offset': 0, 'len': 100}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'summary', True)

    result = self.do_json_request(
        '3/Frames.json/%s/columns/%s/summary' % (key, column),
        timeout=timeoutSecs,
        params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #15
0
def quantiles(self, timeoutSecs=300, print_params=True, **kwargs):
    # Run the v3 Quantiles endpoint; all params default to None.
    params_dict = dict.fromkeys([
        'destination_key',
        'training_frame',
        'validation_frame',
        'ignored_columns',
        'score_each_iteration',
        'probs',
    ])
    check_params_update_kwargs(params_dict, kwargs, 'quantiles', print_params)
    result = self.do_json_request('3/Quantiles.json', timeout=timeoutSecs, params=params_dict)
    verboseprint("\nquantiles result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #16
0
def quantiles(self, timeoutSecs=300, print_params=True, **kwargs):
    # Run the (unversioned) Quantiles endpoint; all params default to None.
    params_dict = dict.fromkeys([
        'source_key',
        'column',
        'quantile',
        'max_qbins',
        'interpolation_type',
        'multiple_pass',
    ])
    check_params_update_kwargs(params_dict, kwargs, 'quantiles', print_params)
    result = self.do_json_request('Quantiles.json', timeout=timeoutSecs, params=params_dict)
    verboseprint("\nquantiles result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #17
0
def quantiles(self, timeoutSecs=300, print_params=True, **kwargs):
    # Quantiles with the v3-style parameter names against the
    # unversioned endpoint; all params default to None.
    params_dict = dict.fromkeys([
        'destination_key',
        'training_frame',
        'validation_frame',
        'ignored_columns',
        'score_each_iteration',
        'probs',
    ])
    check_params_update_kwargs(params_dict, kwargs, 'quantiles', print_params)
    result = self.do_json_request('Quantiles.json', timeout=timeoutSecs, params=params_dict)
    verboseprint("\nquantiles result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #18
0
def frame_split(self, timeoutSecs=120, noPoll=False, **kwargs):
    # Start a SplitFrame job; poll to completion unless noPoll is set.
    params_dict = dict.fromkeys(['training_frame', 'ratios'])
    check_params_update_kwargs(params_dict, kwargs, 'frame_split', print_params=True)

    first = self.do_json_request('SplitFrame.json', timeout=timeoutSecs, params=params_dict)
    job_key = first['job']['key']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return first

    result = self.poll_job(job_key)
    verboseprint("frame_split result:", dump_json(result))
    return result
Beispiel #19
0
def interaction(self, timeoutSecs=120, noPoll=False, **kwargs):
    # FIX! have to add legal params
    params_dict = {}
    check_params_update_kwargs(params_dict, kwargs, 'interaction', print_params=True)

    first = self.do_json_request('3/Interaction.json', cmd='post',
                                 timeout=timeoutSecs, params=params_dict)
    job_key = first['dest']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return first

    result = self.poll_job(job_key)
    verboseprint("interaction result:", dump_json(result))
    return result
Beispiel #20
0
def create_frame(self, timeoutSecs=120, noPoll=False, **kwargs):
    # FIX! have to add legal params
    params_dict = {}
    check_params_update_kwargs(params_dict, kwargs, 'create_frame', print_params=True)

    first = self.do_json_request('3/CreateFrame.json', cmd='post',
                                 timeout=timeoutSecs, params=params_dict)
    job_key = first['dest']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return first

    result = self.poll_job(job_key)
    verboseprint("create_frame result:", dump_json(result))
    return result
Beispiel #21
0
def poll_job2(self,
              firstResult,
              algo=None,
              timeoutSecs=60,
              noPoll=False,
              **kwargs):
    if noPoll:
        result = firstResult
    elif ('validation_error_count'
          in firstResult) and (firstResult['validation_error_count'] > 0):
        h2p.yellow_print("parameter error in %s" % algo)
        result = firstResult
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("%s job_key: %s" % (algo, job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print algo, " end on ", training_frame, 'took', time.time(
        ) - start, 'seconds'
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            if status == 'FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on %s job status: %s %s %s %s" % \
                    (algo, status, progress, msec, description))
            result = job_result

        else:
            raise Exception(
                "build_model didn't get a job_result when it expected one")

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #22
0
def predict(self, model, frame, timeoutSecs=60, **kwargs):
    '''
    Run predictions for 'model' on 'frame' via POST to /3/Predictions.

    Verifies both the model and the frame exist on the cluster first.
    '''
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    # bug fix: compared the whole ['key'] dict to the model name string
    # (always False-ish); use ['key']['name'] like the sibling predict()
    assert models['models'][0]['key']['name'] == model, "/Models/{0} returned Model {1} rather than Model {2}".format(model, models['models'][0]['key']['name'], model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)
    assert frames['frames'][0]['key']['name'] == frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, frames['frames'][0]['key']['name'], frame)

    result = self.do_json_request('/3/Predictions.json/models/' + model + '/frames/' + frame, cmd='post', timeout=timeoutSecs)

    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #23
0
def predict(self, model, frame, timeoutSecs=60, **kwargs):
    '''
    Run predictions for 'model' on 'frame' via POST to /3/Predictions.
    Verifies both the model and the frame exist on the cluster first.
    '''
    assert model is not None, '"model" parameter is null'
    assert frame is not None, '"frame" parameter is null'

    models = self.models(key=model, timeoutSecs=timeoutSecs)
    assert models is not None, "/Models REST call failed"
    model_name = models['models'][0]['key']['name']
    assert model_name == model, \
        "/Models/{0} returned Model {1} rather than Model {2}".format(model, model_name, model)

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(frame)
    frame_name = frames['frames'][0]['key']['name']
    assert frame_name == frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(frame, frame_name, frame)

    result = self.do_json_request('/3/Predictions.json/models/' + model + '/frames/' + frame,
                                  cmd='post',
                                  timeout=timeoutSecs)

    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #24
0
def split_frame(self, timeoutSecs=120, noPoll=False, **kwargs):
    params_dict = {
        'dataset': None,
        'ratios': None,
        'destKeys': None, # ['bigger', 'smaller']
    }
    check_params_update_kwargs(params_dict, kwargs, 'split_frame', print_params=True)
    firstResult = self.do_json_request('3/SplitFrame.json', cmd='post', timeout=timeoutSecs, params=params_dict)
    print "firstResult:", dump_json(firstResult)
    # FIX! what is ['dest']['name'] ..It's not there at the beginning?
    job_key = firstResult['key']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return firstResult

    # is it polllable while it's in the CREATED state? msec looks wrong. start_time is 0
    time.sleep(2)
    result = self.poll_job(job_key)
    verboseprint("split_frame result:", dump_json(result))
    return result
Beispiel #25
0
def frame_split(self, timeoutSecs=120, noPoll=False, **kwargs):
    # Start a SplitFrame job and return either the immediate response
    # (noPoll) or the polled-to-completion job result.
    params_dict = {
        'training_frame': None,
        'ratios': None,
    }
    check_params_update_kwargs(params_dict, kwargs, 'frame_split', print_params=True)

    firstResult = self.do_json_request('SplitFrame.json', timeout=timeoutSecs, params=params_dict)
    job_key = firstResult['job']['key']['name']

    if noPoll:
        h2o_sandbox.check_sandbox_for_errors()
        return firstResult

    result = self.poll_job(job_key)
    verboseprint("frame_split result:", dump_json(result))
    return result
Beispiel #26
0
def model_builders(self, algo=None, timeoutSecs=10, **kwargs):
    '''
    Fetch one model builder (when algo is given) or all model builders
    known to the h2o cluster, from the v2 endpoint.

    The model builders live in a top-level "model_builders" dictionary
    of the result, mapping algorithm names to parameter lists; each
    parameter carries the metadata a client needs to render a
    model-building UI.

    if parameters = True, return the parameters?
    '''
    params_dict = {}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'model_builders', False)

    endpoint = '2/ModelBuilders.json'
    if algo:
        endpoint = endpoint + "/" + algo

    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #27
0
def models(self, key=None, timeoutSecs=10, **kwargs):
    '''
    Fetch every model in the h2o cluster, or a single model by key.
    The models come back in an unordered "models" list at the top level
    of the result.
    TODO:
    When find_compatible_frames is implemented then the top level
    dict will also contain a "frames" list.
    '''
    params_dict = {'find_compatible_frames': False}
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'models', True)

    endpoint = '3/Models.json'
    if key:
        endpoint = '3/Models.json/' + key
    result = self.do_json_request(endpoint, timeout=timeoutSecs, params=params_dict)

    verboseprint("models result:", dump_json(result))
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #28
0
def poll_job(self,
             job_key,
             timeoutSecs=10,
             retryDelaySecs=0.5,
             key=None,
             **kwargs):
    '''
    Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out.
    '''
    params_dict = {}
    # merge kwargs into params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job',
                                           False)

    start_time = time.time()
    pollCount = 0
    while True:
        result = self.do_json_request('2/Jobs.json/' + job_key,
                                      timeout=timeoutSecs,
                                      params=params_dict)
        # print 'Job: ', dump_json(result)

        if key:
            frames_result = self.frames(key=key)
            print 'frames_result for key:', key, dump_json(result)

        jobs = result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        dest_name = dest['name']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        print description, \
            "dest_name:", dest_name, \
            "\tprogress:", "%-10s" % progress, \
            "\tstatus:", "%-12s" % status, \
            "\tmsec:", msec

        if status == 'DONE' or status == 'CANCELLED' or status == 'FAILED':
            h2o_sandbox.check_sandbox_for_errors()
            return result

        # FIX! what are the other legal polling statuses that we should check for?

        if not h2o_args.no_timeout and (time.time() - start_time >
                                        timeoutSecs):
            h2o_sandbox.check_sandbox_for_errors()
            emsg = "Job:", job_key, "timed out in:", timeoutSecs
            raise Exception(emsg)
            print emsg
            return None

        # check every other poll, for now
        if (pollCount % 2) == 0:
            h2o_sandbox.check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        pollCount += 1
Beispiel #29
0
def poll_job2(self, firstResult, algo=None, timeoutSecs=60, noPoll=False, **kwargs):
    if noPoll:
        result = firstResult
    elif 'validation_error_count' in firstResult:
        h2p.yellow_print("parameter error in %s" % algo)
        result = firstResult
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("%s job_key: %s" % (algo, job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print algo, " end on ", training_frame, 'took', time.time() - start, 'seconds'
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            if status=='FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on %s job status: %s %s %s %s" % \
                    (algo, status, progress, msec, description))
            result = job_result

        else:
            raise Exception("build_model didn't get a job_result when it expected one")

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #30
0
def csv_download(self, key, csvPathname, timeoutSecs=60, **kwargs):
    params = {
        'key': key
    }

    paramsStr = '?' + '&'.join(['%s=%s' % (k, v) for (k, v) in params.items()])
    url = self.url('DownloadDataset.json')
    log('Start ' + url + paramsStr, comment=csvPathname)

    # do it (absorb in 1024 byte chunks)
    r = requests.get(url, params=params, timeout=timeoutSecs)
    print "csv_download r.headers:", r.headers
    if r.status_code == 200:
        f = open(csvPathname, 'wb')
        for chunk in r.iter_content(1024):
            f.write(chunk)
    else:
        raise Exception("unexpected status for DownloadDataset: %s" % r.status_code)

    print csvPathname, "size:", h2o_util.file_size_formatted(csvPathname)
    h2o_sandbox.check_sandbox_for_errors()

    # FIX! we're skipping all the checks in do_json_request. And no json return?
    return 
Beispiel #31
0
def poll_job(self, job_key, timeoutSecs=10, retryDelaySecs=0.5, key=None, **kwargs):
    '''
    Poll a single job from the /Jobs endpoint until it is "status": "DONE" or "CANCELLED" or "FAILED" or we time out.
    '''
    params_dict = {}
    # merge kwargs into params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'poll_job', False)

    start_time = time.time()
    pollCount = 0
    while True:
        result = self.do_json_request('3/Jobs.json/' + job_key, timeout=timeoutSecs, params=params_dict)
        # print 'Job: ', dump_json(result)

        if key:
            frames_result = self.frames(key=key)
            print 'frames_result for key:', key, dump_json(result)

        jobs = result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        dest_name = dest['name']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        print description, \
            "dest_name:", dest_name, \
            "\tprogress:", "%-10s" % progress, \
            "\tstatus:", "%-12s" % status, \
            "\tmsec:", msec
        
        if status=='DONE' or status=='CANCELLED' or status=='FAILED':
            h2o_sandbox.check_sandbox_for_errors()
            return result

        # what about 'CREATED'
        # FIX! what are the other legal polling statuses that we should check for?

        if not h2o_args.no_timeout and (time.time() - start_time > timeoutSecs):
            h2o_sandbox.check_sandbox_for_errors()
            emsg = "Job:", job_key, "timed out in:", timeoutSecs

            # for debug
            a = h2o.nodes[0].get_cloud()
            print "cloud.json:", dump_json(a)
            raise Exception(emsg)
            print emsg
            return None

        # check every other poll, for now
        if (pollCount % 2) == 0:
            h2o_sandbox.check_sandbox_for_errors()

        time.sleep(retryDelaySecs)
        pollCount += 1
Beispiel #32
0
def check_sandbox_for_errors(cloudShutdownIsError=False, sandboxIgnoreErrors=False, python_test_name=''):
    # Report a sandbox error at most once: if tearDown already reported it,
    # don't have tearDownClass report the same error again.
    if nodes and nodes[0].sandbox_error_report():  # gets current state
        return

    # A cloud can be built to ignore everything that would normally be fatal.
    # Kludge: a test may set sandbox_ignore_errors directly rather than thru
    # the build_cloud parameter; teardown_cloud also needs it since the
    # state disappears.
    ignore = sandboxIgnoreErrors or (nodes and nodes[0].sandbox_ignore_errors)
    found = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR=LOG_DIR,
        sandboxIgnoreErrors=ignore,
        cloudShutdownIsError=cloudShutdownIsError,
        python_test_name=python_test_name)

    if found and nodes:
        nodes[0].sandbox_error_report(True)  # latch the reported state
Beispiel #33
0
def check_sandbox_for_errors(cloudShutdownIsError=False, sandboxIgnoreErrors=False, python_test_name=''):
    # Report a sandbox error at most once: if tearDown already reported it,
    # don't have tearDownClass report the same error again.
    global sandbox_error_was_reported
    if sandbox_error_was_reported:  # gets current state
        return

    # A cloud can be built to ignore everything that would normally be fatal.
    # Kludge: a test may set sandbox_ignore_errors directly rather than thru
    # the build_cloud parameter; teardown_cloud also needs it since the
    # state disappears.
    ignore = sandboxIgnoreErrors or (h2o_nodes.nodes and h2o_nodes.nodes[0].sandbox_ignore_errors)
    found = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR=LOG_DIR,
        sandboxIgnoreErrors=ignore,
        cloudShutdownIsError=cloudShutdownIsError,
        python_test_name=python_test_name)

    if found:
        sandbox_error_was_reported = True
Beispiel #34
0
def build_model(self, algo, training_frame, parameters, destination_key=None, 
    timeoutSecs=60, asynchronous=False, **kwargs):
    '''
    Build a model on the h2o cluster using the given algorithm, training
    Frame and model parameters.

    If asynchronous, return the first response without polling the job;
    otherwise poll to completion and return the final job result (or the
    raw response when h2o reports parameter validation errors).  Raises
    if the job ends with status FAILED or no job result comes back.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]
    
    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)

    key_name = frames['frames'][0]['key']['name'] 
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_key is not None:
        parameters['destination_key'] = destination_key

    print "build_model parameters", parameters
    # POST to the v2 ModelBuilders endpoint kicks off the build job
    result1 = self.do_json_request('/2/ModelBuilders.json/' + algo, cmd='post', 
        timeout=timeoutSecs, postData=parameters)
    verboseprint("build_model result", dump_json(result1))

    if asynchronous:
        result = result1
    elif 'validation_error_count' in result1:
        h2p.yellow_print("parameter error in model_builders")
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
    else:
        job_result = result1['jobs'][0]
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            # can condition this with a parameter if some FAILED are expected by tests.
            if status=='FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))

            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    return result
Beispiel #35
0
def build_model(
    self,
    algo,
    training_frame,
    parameters,
    destination_frame=None,
    model_id=None,
    timeoutSecs=60,
    noPoll=False,
    **kwargs
):

    if "destination_key" in kwargs:
        raise Exception("Change destination_key in build_model() to model_id")

    """
    Build a model on the h2o cluster using the given algorithm, training 
    Frame and model parameters.
    """
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders["model_builders"], "%s %s" % (algo, [k for k in model_builders["model_builders"]])
    builder = model_builders["model_builders"][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(training_frame)

    key_name = frames["frames"][0]["frame_id"]["name"]
    assert key_name == training_frame, "/Frames/{0} returned Frame {1} rather than Frame {2}".format(
        training_frame, key_name, training_frame
    )
    parameters["training_frame"] = training_frame

    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters["model_id"] = destination_frame

    if model_id is not None:
        parameters["model_id"] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request(
        "/3/ModelBuilders.json/" + algo, cmd="post", timeout=timeoutSecs, postData=parameters
    )
    # make get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ("validation_error_count" in result1) and (result1["validation_error_count"] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif "exception_msg" in result1:
        h2p.yellow_print("exception msg in model_builders: %s" % result1["exception_msg"])
        result = result1
    else:
        job_result = result1["job"]
        job_key = job_result["key"]["name"]
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, "took", time.time() - start, "seconds"
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result["jobs"][0]
            description = jobs["description"]
            dest = jobs["dest"]
            msec = jobs["msec"]
            status = jobs["status"]
            progress = jobs["progress"]

            # can condition this with a parameter if some FAILED are expected by tests.
            if status == "FAILED":
                print dump_json(job_result)
                raise Exception(
                    "Taking exception on build_model job status: %s %s %s %s" % (status, progress, msec, description)
                )

            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception("build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result["python_elapsed"] = elapsed
    return result
Beispiel #36
0
def build_model(self,
                algo,
                training_frame,
                parameters,
                destination_frame=None,
                model_id=None,
                timeoutSecs=60,
                noPoll=False,
                **kwargs):

    if 'destination_key' in kwargs:
        raise Exception('Change destination_key in build_model() to model_id')
    '''
    Build a model on the h2o cluster using the given algorithm, training 
    Frame and model parameters.
    '''
    assert algo is not None, '"algo" parameter is null'
    assert training_frame is not None, '"training_frame" parameter is null'
    assert parameters is not None, '"parameters" parameter is null'

    # why always check that the algo is in here?
    model_builders = self.model_builders(timeoutSecs=timeoutSecs)
    assert model_builders is not None, "/ModelBuilders REST call failed"
    assert algo in model_builders['model_builders'], "%s %s" % (
        algo, [k for k in model_builders['model_builders']])
    builder = model_builders['model_builders'][algo]

    # TODO: test this assert, I don't think this is working. . .
    frames = self.frames(key=training_frame)
    assert frames is not None, "/Frames/{0} REST call failed".format(
        training_frame)

    key_name = frames['frames'][0]['frame_id']['name']
    assert key_name==training_frame, \
        "/Frames/{0} returned Frame {1} rather than Frame {2}".format(training_frame, key_name, training_frame)
    parameters['training_frame'] = training_frame

    if destination_frame is not None:
        print "destination_frame should be replaced by model_id now"
        parameters['model_id'] = destination_frame

    if model_id is not None:
        parameters['model_id'] = model_id

    print "build_model parameters", parameters
    start = time.time()
    result1 = self.do_json_request('/3/ModelBuilders.json/' + algo,
                                   cmd='post',
                                   timeout=timeoutSecs,
                                   postData=parameters)
    # make get overwritten after polling
    elapsed = time.time() - start
    verboseprint("build_model result", dump_json(result1))

    if noPoll:
        result = result1
    elif ('validation_error_count'
          in result1) and (result1['validation_error_count'] > 0):
        h2p.yellow_print("parameter error in model_builders: %s" % result1)
        # parameters validation failure
        # TODO: add schema_type and schema_version into all the schemas to make this clean to check
        result = result1
        # don't bother printing a time message
    elif 'exception_msg' in result1:
        h2p.yellow_print("exception msg in model_builders: %s" %
                         result1['exception_msg'])
        result = result1
    else:
        job_result = result1['job']
        job_key = job_result['key']['name']
        verboseprint("build_model job_key: " + repr(job_key))

        job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs)
        verboseprint(job_result)

        elapsed = time.time() - start
        print "ModelBuilders", algo, "end on", training_frame, 'took', time.time(
        ) - start, 'seconds'
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)

        if job_result:
            jobs = job_result['jobs'][0]
            description = jobs['description']
            dest = jobs['dest']
            msec = jobs['msec']
            status = jobs['status']
            progress = jobs['progress']

            # can condition this with a parameter if some FAILED are expected by tests.
            if status == 'FAILED':
                print dump_json(job_result)
                raise Exception("Taking exception on build_model job status: %s %s %s %s" % \
                    (status, progress, msec, description))

            result = job_result
        else:
            # ? we should always get a job_json result
            raise Exception(
                "build_model didn't get a job_result when it expected one")
            # return None

    verboseprint("result:", result)
    h2o_sandbox.check_sandbox_for_errors()
    result['python_elapsed'] = elapsed
    return result
Beispiel #37
0
#!/usr/bin/python
# Standalone utility: scan every file in ./sandbox for H2O error patterns,
# treating each one as a stdout/stderr log file.
import sys
sys.path.extend(['.', '..', 'py'])  # so h2o_sandbox is importable when run from a test dir
import h2o_sandbox
print "Will look at all the files in ./sandbox assuming they are stdout/stderr log files"
h2o_sandbox.check_sandbox_for_errors(pattern='*')
Beispiel #38
0
def parse(self,
          key,
          hex_key=None,
          columnTypeDict=None,
          timeoutSecs=300,
          retryDelaySecs=0.2,
          initialDelaySecs=None,
          pollTimeoutSecs=180,
          noise=None,
          benchmarkLogging=None,
          noPoll=False,
          intermediateResults=False,
          **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_frames': None,
        'destination_frame': hex_key,
        'parse_type': None,  # file type 
        'separator': None,
        'single_quotes': None,
        'check_header': None,  # forces first line to be seen as column names 
        'number_columns': None,
        'column_names': None,  # a list
        'column_types':
        None,  # a list. or can use columnTypeDict param (see below)
        'na_strings': None,  # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception(
                "key seems to be bad in parse. Should be list or string. %s" %
                key)
        # have to put double quotes around the individual list items (single not legal)
        source_frames = "[" + ",".join(map(
            (lambda x: '"' + x + '"'), key)) + "]"

    else:
        # what if None here
        source_frames = '["' + key + '"]'  # quotes required on key

    params_dict['source_frames'] = source_frames

    # merge kwargs into params_dict
    # =None overwrites params_dict

    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict,
                                           kwargs,
                                           'parse before setup merge',
                                           print_params=False)
    # Call ParseSetup?source_frames=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_frames': source_frames}
    setup_result = self.do_json_request(jsonRequest="3/ParseSetup.json",
                                        cmd='post',
                                        timeout=timeoutSecs,
                                        postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_frames']:
        # should these be quoted?
        source_framesStr = "[" + ",".join([
            ('"%s"' % src['name']) for src in setup_result['source_frames']
        ]) + "]"
    else:
        source_framesStr = None

    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        # single quotes not legal..need double quotes
        columnNamesStr = "[" + ",".join(
            map((lambda x: '"' + x + '"'), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
        # single quotes not legal..need double quotes
        naStrings = "[" + ",".join(
            map((lambda x: '"' + x + '"' if x != None else '""'),
                setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict:
        for k, v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k >= 0 and k < len(ct):
                    ct[k] = v
                else:
                    raise Exception(
                        "bad col index %s in columnTypeDict param %s" %
                        (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index
                if k not in columnNames:
                    raise Exception(
                        "bad col name %s in columnTypeDict param %s. columnNames: %s"
                        % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: '"' + x + '"'), ct)) + "]"

    parse_params = {
        'source_frames': source_framesStr,
        'destination_frame': setup_result['destination_frame'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings,
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['column_names'] and len(
        setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print(
            "Not printing the parameters to Parse because the columnNames are too lengthy."
        )
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(
        parse_params,
        params_dict,
        'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint,
        ignoreNone=True)

    print "parse source_frames is length:", len(parse_params['source_frames'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="3/Parse.json",
                                        cmd='post',
                                        postData=parse_params,
                                        timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_frame']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Beispiel #39
0
#!/usr/bin/python
# Standalone utility: scan every file in ./sandbox for H2O error patterns,
# treating each one as a stdout/stderr log file.
import sys
sys.path.extend(['.','..','py'])  # so h2o_sandbox is importable when run from a test dir
import h2o_sandbox
print "Will look at all the files in ./sandbox assuming they are stdout/stderr log files"
h2o_sandbox.check_sandbox_for_errors(pattern='*')

Beispiel #40
0
def sh2junit(name='NoName', cmd_string='/bin/ls', timeout=300, shdir=None, **kwargs):
    """
    Run cmd_string as a subprocess, echo its stdout/stderr to our stdout and
    into sandbox log files, then scan ./sandbox logs for errors and emit a
    junit xml result via create_junit_xml().

    name: test name used for the junit xml and the sandbox log file names.
    cmd_string: command split on whitespace and handed to psutil.Popen.
    timeout: seconds before the subprocess is treated as hung.
    shdir: if set, start the subprocess in this directory.
    kwargs: passed through to psutil.Popen.

    Returns (errors, outpath, errpath) when no errors were seen; otherwise
    raises an Exception carrying a dump of the subprocess output.
    NOTE(review): relies on module-level helpers (sandbox_tmp_file,
    terminate_process_tree, rc_if_exists_and_done, create_junit_xml).
    """
    # split by arbitrary strings of whitespace characters (space, tab, newline, return, formfeed)
    print "cmd_string:", cmd_string
    cmdList = cmd_string.split()
    # these are absolute paths
    outfd, outpath = sandbox_tmp_file(prefix=name + '.stdout.', suffix='.log')
    errfd, errpath = sandbox_tmp_file(prefix=name + '.stderr.', suffix='.log')

    # make outpath and errpath full paths, so we can redirect
    print "outpath:", outpath
    print "errpath:", errpath

    start = time.time()
    print "psutil.Popen:", cmdList, outpath, errpath
    import subprocess
    # start the process in the target dir, if desired
    if shdir:
        currentDir = os.getcwd()
        os.chdir(shdir)
    ps = psutil.Popen(cmdList, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
    if shdir:
        os.chdir(currentDir)

    comment = 'PID %d, stdout %s, stderr %s' % (
        ps.pid, os.path.basename(outpath), os.path.basename(errpath))
    print "spawn_cmd", cmd_string, comment

    # Reads the subprocess stdout until it is closed and 
    # ...echo it our python stdout and also the R stdout file in sandbox
    # Then wait for the program to exit. 
    # Read before wait so that you don't risk the pipe filling up and hanging the program. 
    # You wait after read for the final program exit and return code. 
    # If you don't wait, you'll get a zombie process (at least on linux)

    # this might not do what we want..see:
    # http://stackoverflow.com/questions/2804543/read-subprocess-stdout-line-by-line
    # I suppose we'll stop early?

    # shouldn't need a delay before checking this?
    if not ps.is_running():
        raise Exception("sh2junit: not immediate ps.is_running after start")

    # Until we get the rc, it can be a zombie process.
    # A zombie process is not a real process. 
    # it's just a remaining entry in the process table until the parent process requests the child's return code. 
    # The actual process has ended and requires no other resources but said process table entry.
    linesMayExist = True
    errors = 0 
    timeoutError = False
    while linesMayExist:
        # get whatever accumulated, up to nothing returned 
        # only do up to 20 lines before we check timeout again
        linesMayExist = ps.is_running() and not ps.status() == psutil.STATUS_ZOMBIE
        lineBurstCnt = 0
        # stdout from subprocess
        line = ps.stdout.readline()

        # R apparently uses stderr a lot, so want to mix that in. We don't grab it until we hit a stall in R stdout though.
        while line:
            lineBurstCnt += 1
            # maybe I should use p.communicate() instead. have to keep it to stdout? or do stdout+stderr here
            sys.stdout.write("R->" + line) # to our python stdout, with a prefix so it's obviously from R
            os.write(outfd, line) # to sandbox R stdout
            elapsed = time.time() - start
            if elapsed > timeout:
                timeoutError = True
                errors += 1
                print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
                #kill R subprocess but don't kill me
                terminate_process_tree(ps.pid, including_parent=False)
                break
            line = ps.stdout.readline()
        if timeoutError:
            print "\n\n\nERROR: timeout"
            break
        # stderr from subprocess
        line = ps.stderr.readline()
        while line:
            lineBurstCnt += 1
            sys.stdout.write("Re->" + line) # to our python stdout, with a prefix so it's obviously from R stderr
            os.write(errfd, line) # to sandbox R stderr
            line = ps.stderr.readline()
        print "lineBurstCnt:", lineBurstCnt

        # Check. may have flipped to not running, and we just got the last bit.
        # shouldn't be a race on a transition here, if ps.wait(0) completion syncs the transition
        if linesMayExist:
            print "ps.is_running():", ps.is_running(), ps.pid, ps.name, ps.status, ps.create_time
            # unload the return code without waiting..so we don't have a zombie!

        (lastrc, error) = rc_if_exists_and_done(ps)
        errors += error

        elapsed = time.time() - start
        # forever if timeout is None
        #if timeout and elapsed > timeout:
        if elapsed > timeout:
            timeoutError = True
            errors += 1
            # we don't want to exception here, because we're going to print the xml that says there's an error
            # I guess we'll end up terminating the R process down below
            # could we have lines in stdout we didn't catch up on? maybe, but do we care?
            print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
            #kill R subprocess but don't kill me
            #terminate_process_tree(ps.pid, including_parent=False)
            break
        # wait for some more output to accumulate
        time.sleep(0.25)
        
    # It shouldn't be running now?

    # timeout=None waits forever. timeout=0 returns immediately.
    # default above is 5 minutes
    # Wait for process termination. Since child:  return the exit code. 
    # If the process is already terminated does not raise NoSuchProcess exception 
    # but just return None immediately. 
    # If timeout is specified and process is still alive raises psutil.TimeoutExpired() exception. 
    # old
    # rc = ps.wait(timeout)
    (lastrc, error) = rc_if_exists_and_done(ps)
    errors += error
    elapsed = time.time() - start

    # Prune h2o logs to interesting lines and detect errors.
    # Error lines are returned. warning/info are printed to our (python stdout)
    # so that's always printed/saved?
    # None if no error
    sandboxErrorMessage = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR='./sandbox', 
        python_test_name=name, 
        cloudShutdownIsError=True, 
        sandboxIgnoreErrors=True) # don't take exception on error

    if sandboxErrorMessage:
        errors += 1

    out = file(outpath).read()
    err = file(errpath).read()
    create_junit_xml(name, out, err, sandboxErrorMessage, errors=errors, elapsed=elapsed)

    if not errors:
        return (errors, outpath, errpath)
    else:
        # dump all the info as part of the exception? maybe too much
        # is this bad to do in all cases? do we need it? 
        hline = "\n===========================================BEGIN DUMP=============================================================\n"
        hhline = "\n===========================================END DUMP=============================================================\n"
        out = '[stdout->err]: '.join(out.splitlines(True))
        err = '[sterr->err]: '.join(err.splitlines(True))
        if ps.is_running():
            print "Before terminate:", ps.pid, ps.is_running()
            terminate_process_tree(ps.pid, including_parent=True)
        if sandboxErrorMessage:
            print "\n\n\nError in Sandbox. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tErrors found in ./sandbox log files?.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" % 
                (name, cmd_string, lastrc, errors, out, err, hhline))
        # could have already terminated?
        elif timeoutError:
            print "\n\n\nTimeout Error. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\ttimed out after %d secs. \nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, timeout or 0, out, err, hhline))
        else:
            print "\n\n\nCaught exception. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tLikely non-zero exit code from R.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" % 
                (name, cmd_string, lastrc, errors, out, err, hhline))
Beispiel #41
0
def sh2junit(name='NoName', cmd_string='/bin/ls', timeout=300, shdir=None, **kwargs):
    """
    Run cmd_string as a subprocess, echo its stdout/stderr to our stdout and
    into sandbox log files, then scan ./sandbox logs for errors and emit a
    junit xml result via create_junit_xml().

    Variant of the other sh2junit: flushes sys.stdout after each echoed
    line and loops on ps.is_running() alone (the zombie-status check is
    commented out below).

    name: test name used for the junit xml and the sandbox log file names.
    cmd_string: command split on whitespace and handed to psutil.Popen.
    timeout: seconds before the subprocess is treated as hung.
    shdir: if set, start the subprocess in this directory.
    kwargs: passed through to psutil.Popen.

    Returns (errors, outpath, errpath) when no errors were seen; otherwise
    raises an Exception carrying a dump of the subprocess output.
    """
    # split by arbitrary strings of whitespace characters (space, tab, newline, return, formfeed)
    print "cmd_string:", cmd_string
    cmdList = cmd_string.split()
    # these are absolute paths
    outfd, outpath = sandbox_tmp_file(prefix=name + '.stdout.', suffix='.log')
    errfd, errpath = sandbox_tmp_file(prefix=name + '.stderr.', suffix='.log')

    # make outpath and errpath full paths, so we can redirect
    print "outpath:", outpath
    print "errpath:", errpath

    start = time.time()
    print "psutil.Popen:", cmdList, outpath, errpath
    import subprocess
    # start the process in the target dir, if desired
    if shdir:
        currentDir = os.getcwd()
        os.chdir(shdir)
    ps = psutil.Popen(cmdList, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
    if shdir:
        os.chdir(currentDir)

    comment = 'PID %d, stdout %s, stderr %s' % (
        ps.pid, os.path.basename(outpath), os.path.basename(errpath))
    print "spawn_cmd", cmd_string, comment

    # Reads the subprocess stdout until it is closed and 
    # ...echo it our python stdout and also the R stdout file in sandbox
    # Then wait for the program to exit. 
    # Read before wait so that you don't risk the pipe filling up and hanging the program. 
    # You wait after read for the final program exit and return code. 
    # If you don't wait, you'll get a zombie process (at least on linux)

    # this might not do what we want..see:
    # http://stackoverflow.com/questions/2804543/read-subprocess-stdout-line-by-line
    # I suppose we'll stop early?

    # shouldn't need a delay before checking this?
    if not ps.is_running():
        raise Exception("sh2junit: not immediate ps.is_running after start")

    # Until we get the rc, it can be a zombie process.
    # A zombie process is not a real process. 
    # it's just a remaining entry in the process table until the parent process requests the child's return code. 
    # The actual process has ended and requires no other resources but said process table entry.
    linesMayExist = True
    errors = 0 
    timeoutError = False
    while linesMayExist:
        # get whatever accumulated, up to nothing returned 
        # only do up to 20 lines before we check timeout again
        # why was R processes not completing on centos?
        # linesMayExist = ps.is_running() and not ps.status() == psutil.STATUS_ZOMBIE
        linesMayExist = ps.is_running()
        lineBurstCnt = 0
        # stdout from subprocess
        line = ps.stdout.readline()

        # R apparently uses stderr a lot, so want to mix that in. We don't grab it until we hit a stall in R stdout though.
        while line:
            lineBurstCnt += 1
            # maybe I should use p.communicate() instead. have to keep it to stdout? or do stdout+stderr here
            sys.stdout.write("R->" + line) # to our python stdout, with a prefix so it's obviously from R
            sys.stdout.flush()
            os.write(outfd, line) # to sandbox R stdout
            elapsed = time.time() - start
            if elapsed > timeout:
                timeoutError = True
                errors += 1
                print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
                #kill R subprocess but don't kill me
                terminate_process_tree(ps.pid, including_parent=False)
                break
            line = ps.stdout.readline()
        if timeoutError:
            print "\n\n\nERROR: timeout"
            break
        # stderr from subprocess
        line = ps.stderr.readline()
        while line:
            lineBurstCnt += 1
            sys.stdout.write("Re->" + line) # to our python stdout, with a prefix so it's obviously from R stderr
            sys.stdout.flush()
            os.write(errfd, line) # to sandbox R stderr
            line = ps.stderr.readline()
        print "lineBurstCnt:", lineBurstCnt

        # Check. may have flipped to not running, and we just got the last bit.
        # shouldn't be a race on a transition here, if ps.wait(0) completion syncs the transition
        if linesMayExist:
            print "ps.is_running():", ps.is_running(), ps.pid, ps.name, ps.status, ps.create_time
            # unload the return code without waiting..so we don't have a zombie!

        (lastrc, error) = rc_if_exists_and_done(ps)
        errors += error

        elapsed = time.time() - start
        # forever if timeout is None
        #if timeout and elapsed > timeout:
        if elapsed > timeout:
            timeoutError = True
            errors += 1
            # we don't want to exception here, because we're going to print the xml that says there's an error
            # I guess we'll end up terminating the R process down below
            # could we have lines in stdout we didn't catch up on? maybe, but do we care?
            print "ERROR: sh2junit: elapsed: %0.2f timeout: %s (secs) while echoing subprocess stdout" % (elapsed, timeout)
            #kill R subprocess but don't kill me
            #terminate_process_tree(ps.pid, including_parent=False)
            break
        # wait for some more output to accumulate
        time.sleep(0.25)
        
    # It shouldn't be running now?

    # timeout=None waits forever. timeout=0 returns immediately.
    # default above is 5 minutes
    # Wait for process termination. Since child:  return the exit code. 
    # If the process is already terminated does not raise NoSuchProcess exception 
    # but just return None immediately. 
    # If timeout is specified and process is still alive raises psutil.TimeoutExpired() exception. 
    # old
    # rc = ps.wait(timeout)
    (lastrc, error) = rc_if_exists_and_done(ps)
    errors += error
    elapsed = time.time() - start

    # Prune h2o logs to interesting lines and detect errors.
    # Error lines are returned. warning/info are printed to our (python stdout)
    # so that's always printed/saved?
    # None if no error
    sandboxErrorMessage = h2o_sandbox.check_sandbox_for_errors(
        LOG_DIR='./sandbox', 
        python_test_name=name, 
        cloudShutdownIsError=True, 
        sandboxIgnoreErrors=True) # don't take exception on error

    if sandboxErrorMessage:
        errors += 1

    out = file(outpath).read()
    err = file(errpath).read()
    create_junit_xml(name, out, err, sandboxErrorMessage, errors=errors, elapsed=elapsed)

    if not errors:
        return (errors, outpath, errpath)
    else:
        # dump all the info as part of the exception? maybe too much
        # is this bad to do in all cases? do we need it? 
        hline = "\n===========================================BEGIN DUMP=============================================================\n"
        hhline = "\n===========================================END DUMP=============================================================\n"
        out = '[stdout->err]: '.join(out.splitlines(True))
        err = '[sterr->err]: '.join(err.splitlines(True))
        if ps.is_running():
            print "Before terminate:", ps.pid, ps.is_running()
            terminate_process_tree(ps.pid, including_parent=True)
        if sandboxErrorMessage:
            print "\n\n\nError in Sandbox. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tErrors found in ./sandbox log files?.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" % 
                (name, cmd_string, lastrc, errors, out, err, hhline))
        # could have already terminated?
        elif timeoutError:
            print "\n\n\nTimeout Error. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\ttimed out after %d secs. \nR stdout:\n%s\n\nR stderr:\n%s\n%s" %
                (name, cmd_string, lastrc, errors, timeout or 0, out, err, hhline))
        else:
            print "\n\n\nCaught exception. Ending test. Dumping sub-process output.\n"
            print hline
            raise Exception("%s %s \n\tlastrc:%s \n\terrors:%s \n\tLikely non-zero exit code from R.\nR stdout:\n%s\n\nR stderr:\n%s\n%s" % 
                (name, cmd_string, lastrc, errors, out, err, hhline))
Beispiel #42
0
def parse(self, key, hex_key=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'srcs': None,
        'hex': hex_key, 
        'pType': None, # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None, # how is this used
        'singleQuotes': None,
        'columnNames': None, # list?
        'delete_on_done': None,
        'blocking': None,
    }
        
    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)

    # Call ParseSetup?srcs=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]

    if setup_result['srcs']:
        setupSrcs = "[" + ",".join([src['name'] for src in setup_result['srcs'] ]) + "]"
    else:
        setupSrcs = None
    
    # I suppose we need a way for parameters to parse() to override these
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None


    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here. 
    tooManyColNamesToPrint = setup_result['columnNames'] and len(setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") 
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', 
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request( jsonRequest="Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        return this.jobs(job_key)

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Beispiel #43
0
def parse(self,
          key,
          hex_key=None,
          timeoutSecs=300,
          retryDelaySecs=0.2,
          initialDelaySecs=None,
          pollTimeoutSecs=180,
          noise=None,
          benchmarkLogging=None,
          noPoll=False,
          intermediateResults=False,
          **kwargs):
    '''
    Parse an imported raw file or files into a Frame, via the
    2/ParseSetup and 2/Parse REST endpoints.

    key: source key string, or a list of source key strings.
    hex_key: destination Frame key (sent as the 'hex' parameter).
    noPoll: if True, return the raw Parse response without waiting for
        the parse job to complete.
    intermediateResults: if True, poll_job also fetches the destination
        Frame while waiting.
    Returns self.frames(dest_key), or parse_result when noPoll is True.
    Raises Exception on a bad key argument or a FAILED parse job.
    NOTE(review): retryDelaySecs, initialDelaySecs, pollTimeoutSecs,
    noise and benchmarkLogging are accepted but not used in this body.
    '''
    # these should override what parse setup gets below.
    # None entries are placeholders filled from the ParseSetup response.
    params_dict = {
        'srcs': None,
        'hex': hex_key,
        'pType': None,  # This is a list?
        'sep': None,
        'ncols': None,
        'checkHeader': None,  # how is this used
        'singleQuotes': None,
        'columnNames': None,  # list?
        'delete_on_done': None,
        'blocking': None,
    }

    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception(
                "key seems to be bad in parse. Should be list or string. %s" %
                key)
        srcs = "[" + ",".join(key) + "]"
    else:
        # what if None here
        srcs = "[" + key + "]"

    params_dict['srcs'] = srcs

    # merge kwargs into params_dict
    # =None overwrites params_dict
    h2o_methods.check_params_update_kwargs(params_dict,
                                           kwargs,
                                           'parse before setup merge',
                                           print_params=False)

    # Call ParseSetup?srcs=[keys] so h2o guesses the parse parameters.

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'srcs': srcs}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json",
                                        cmd='post',
                                        timeout=timeoutSecs,
                                        postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # and then Parse?srcs=<keys list> and params from the ParseSetup result
    # Parse?srcs=[nfs://Users/rpeck/Source/h2o2/smalldata/logreg/prostate.csv]&hex=prostate.hex&pType=CSV&sep=44&ncols=9&checkHeader=0&singleQuotes=false&columnNames=[ID,%20CAPSULE,%20AGE,%20RACE,%20DPROS,%20DCAPS,%20PSA,%20VOL,%20GLEASON]

    if setup_result['srcs']:
        setupSrcs = "[" + ",".join(
            [src['name'] for src in setup_result['srcs']]) + "]"
    else:
        setupSrcs = None

    # I suppose we need a way for parameters to parse() to override these.
    # Flatten guessed column names into the "[a,b,...]" string form.
    if setup_result['columnNames']:
        ascii_column_names = "[" + ",".join(setup_result['columnNames']) + "]"
    else:
        ascii_column_names = None

    parse_params = {
        'srcs': setupSrcs,
        'hex': setup_result['hexName'],
        'pType': setup_result['pType'],
        'sep': setup_result['sep'],
        'ncols': setup_result['ncols'],
        'checkHeader': setup_result['checkHeader'],
        'singleQuotes': setup_result['singleQuotes'],
        'columnNames': ascii_column_names,
        # how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here.
    tooManyColNamesToPrint = setup_result['columnNames'] and len(
        setup_result['columnNames']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print(
            "Not printing the parameters to Parse because the columnNames are too lengthy."
        )
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params (ignoreNone=True)
    h2o_methods.check_params_update_kwargs(
        parse_params,
        params_dict,
        'parse after merge into parse setup',
        print_params=not tooManyColNamesToPrint,
        ignoreNone=True)

    print "parse srcs is length:", len(parse_params['srcs'])
    # NOTE(review): raises TypeError if columnNames ends up None --
    # confirm ParseSetup always returns column names on this REST version
    print "parse columnNames is length:", len(parse_params['columnNames'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request(jsonRequest="2/Parse.json",
                                        cmd='post',
                                        postData=parse_params,
                                        timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['hex']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status == 'FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")
Beispiel #44
0
def parse(self, key, hex_key=None, columnTypeDict=None,
          timeoutSecs=300, retryDelaySecs=0.2, initialDelaySecs=None, pollTimeoutSecs=180,
          noise=None, benchmarkLogging=None, noPoll=False, intermediateResults=False, **kwargs):
    '''
    Parse an imported raw file or files into a Frame.
    '''
    # these should override what parse setup gets below
    params_dict = {
        'source_keys': None,
        'destination_key': hex_key, 
        'parse_type': None, # file type 
        'separator': None,
        'single_quotes': None,
        'check_header': None, # forces first line to be seen as column names 
        'number_columns': None,
        'column_names': None, # a list
        'column_types': None, # a list. or can use columnTypeDict param (see below)
	'na_strings' : None, # a list
        'chunk_size': None,
        # are these two no longer supported?
        'delete_on_done': None,
        'blocking': None,
    }
        
    # if key is a list, create a comma separated string
    # list or tuple but not string
    if not isinstance(key, basestring):
        # it's a list of some kind (tuple ok?)
        # if len(key) > 1:
        #     print "I noticed you're giving me a list of > 1 keys %s to parse:" % len(key), key

        # len 1 is ok here. 0 not. what if None or [None] here
        if not key:
            raise Exception("key seems to be bad in parse. Should be list or string. %s" % key)
        # have to put quotes around the individual list items
        source_keys = "[" + ",".join(map((lambda x: "'" + x + "'"), key)) + "]"

    else:
        # what if None here
        source_keys = "['" + key + "']" # quotes required on key

    params_dict['source_keys'] = source_keys

    # merge kwargs into params_dict
    # =None overwrites params_dict

    # columnTypeDict not used here
    h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse before setup merge', print_params=False)
    # Call ParseSetup?source_keys=[keys] . . .

    # if benchmarkLogging:
    #     cloudPerfH2O.get_log_save(initOnly=True)

    # h2o_methods.check_params_update_kwargs(params_dict, kwargs, 'parse_setup', print_params=True)
    params_setup = {'source_keys': source_keys}
    setup_result = self.do_json_request(jsonRequest="2/ParseSetup.json", cmd='post', timeout=timeoutSecs, postData=params_setup)
    h2o_sandbox.check_sandbox_for_errors()
    verboseprint("ParseSetup result:", dump_json(setup_result))

    # this should match what we gave as input?
    if setup_result['source_keys']:
        # should these be quoted?
        source_keysStr = "[" + ",".join([("'%s'" % src['name']) for src in setup_result['source_keys'] ]) + "]"
    else:
        source_keysStr = None
    
    # I suppose we need a way for parameters to parse() to override these
    # should it be an array or a dict?
    if setup_result['column_names']:
        columnNamesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), setup_result['column_names'])) + "]"
    else:
        columnNamesStr = None

    columnTypes = setup_result['column_types']
    assert columnTypes is not None, "%s %s" % ("column_types:", columnTypes)

    if setup_result['na_strings']:
	naStrings = "[" + ",".join(map((lambda x: "'" + x + "'" if x != None else "''"), setup_result['na_strings'])) + "]"
    else:
        naStrings = None

    # dict parameter to update columnTypeDict?
    # but we don't pass columnNames like this?
    ct = setup_result['column_types']
    if columnTypeDict: 
        for k,v in columnTypeDict.iteritems():
            if isinstance(k, int):
                # if a column index
                if k>=0 and k<len(ct):
                    ct[k] = v
                else:
                    raise Exception("bad col index %s in columnTypeDict param %s" % (k, columnTypeDict))
            # if a column name
            elif isinstance(k, basestring):
                # find the index
                if k not in columnNames:
                    raise Exception("bad col name %s in columnTypeDict param %s. columnNames: %s" % (k, columnTypeDict, columnNames))
                ci = columnNames.index(k)
                ct[ci] = v
            else:
                raise Exception("%s %s should be int or string" % (k, type(k)))

    columnTypesStr = "[" + ",".join(map((lambda x: "'" + x + "'"), ct)) + "]"


    parse_params = {
        'source_keys': source_keysStr,
        'destination_key': setup_result['destination_key'],
        'parse_type': setup_result['parse_type'],
        'separator': setup_result['separator'],
        'single_quotes': setup_result['single_quotes'],
        'check_header': setup_result['check_header'],
        'number_columns': setup_result['number_columns'],
        'column_names': columnNamesStr,
        'column_types': columnTypesStr,
        'na_strings': naStrings, 
        'chunk_size': setup_result['chunk_size'],
        # No longer supported? how come these aren't in setup_result?
        'delete_on_done': params_dict['delete_on_done'],
        'blocking': params_dict['blocking'],
    }
    # HACK: if there are too many column names..don't print! it is crazy output
    # just check the output of parse setup. Don't worry about columnNames passed as params here. 
    tooManyColNamesToPrint = setup_result['column_names'] and len(setup_result['column_names']) > 2000
    if tooManyColNamesToPrint:
        h2p.yellow_print("Not printing the parameters to Parse because the columnNames are too lengthy.") 
        h2p.yellow_print("See sandbox/commands.log")

    # merge params_dict into parse_params
    # don't want =None to overwrite parse_params
    h2o_methods.check_params_update_kwargs(parse_params, params_dict, 'parse after merge into parse setup', 
        print_params=not tooManyColNamesToPrint, ignoreNone=True)

    print "parse source_keys is length:", len(parse_params['source_keys'])
    # This can be null now? parseSetup doesn't return default colnames?
    # print "parse column_names is length:", len(parse_params['column_names'])

    # none of the kwargs passed to here!
    parse_result = self.do_json_request( jsonRequest="2/Parse.json", cmd='post', postData=parse_params, timeout=timeoutSecs)
    verboseprint("Parse result:", dump_json(parse_result))

    job_key = parse_result['job']['key']['name']
    hex_key = parse_params['destination_key']

    # TODO: dislike having different shapes for noPoll and poll
    if noPoll:
        # ??
        h2o_sandbox.check_sandbox_for_errors()
        # return self.jobs(job_key)
        return parse_result

    # does Frame also, while polling
    if intermediateResults:
        key = hex_key
    else:
        key = None

    job_result = self.poll_job(job_key, timeoutSecs=timeoutSecs, key=key)

    if job_result:
        jobs = job_result['jobs'][0]
        description = jobs['description']
        dest = jobs['dest']
        msec = jobs['msec']
        status = jobs['status']
        progress = jobs['progress']
        dest_key = dest['name']

        # can condition this with a parameter if some FAILED are expected by tests.
        if status=='FAILED':
            print dump_json(job_result)
            raise Exception("Taking exception on parse job status: %s %s %s %s %s" % \
                (status, progress, msec, dest_key, description))

        return self.frames(dest_key)
    else:
        # ? we should always get a job_json result
        raise Exception("parse didn't get a job_result when it expected one")