def testCheckWithModelAPI(self):
        """Check model/frame compatibility discovery in both directions:
        the one-model API with find_compatible_frames, and the one-frame
        API with find_compatible_models.
        """
        ######################################################################
        # Look for kmeans_model_name using the one-model API and
        # find_compatible_frames, and check the parsed frame is listed.
        # (Removed the unused 'found_kmeans' local and switched the
        # nonstandard assert_true helper to plain asserts for consistency
        # with the sibling version of this test.)
        model = self.a_node.models(key=self.kmeans_model_name,
                                   find_compatible_frames=True)
        h2o_util.assertKeysExist(model['models'][0], '', ['compatible_frames'])
        assert self.prostate_key in model['models'][0]['compatible_frames'], \
            "Failed to find " + self.prostate_key + " in compatible_frames list."

        ######################################################################
        # Look for prostate_key using the one-frame API and
        # find_compatible_models, and check both models are listed.
        result = self.a_node.frames(key='prostate.hex',
                                    find_compatible_models=True)
        frames = result['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        assert 'prostate.hex' in frames_dict, \
            "Failed to find prostate.hex in Frames list."

        compatible_models = result['compatible_models']
        models_dict = h2o_util.list_to_dict(compatible_models, 'key')
        assert self.dl_prostate_model_name in models_dict, \
            "Failed to find " + self.dl_prostate_model_name + \
            " in compatible models list."

        assert self.dl_prostate_model_name in frames[0]['compatible_models']
        assert self.kmeans_model_name in frames[0]['compatible_models']
def validate_predictions(result, model_name, frame_key, expected_rows):
    '''
    Validate a /Predictions result.

    result -- parsed JSON response from the /Predictions endpoint
    model_name -- model that was scored (used only in failure messages)
    frame_key -- frame that was scored (used only in failure messages)
    expected_rows -- expected row count of the predictions frame
    '''
    # BUG FIX: the body referred to an undefined name 'p'; it is the
    # 'result' parameter, aliased here so the assertions work unchanged.
    p = result
    assert p is not None, "FAIL: Got a null result for scoring: " + model_name + " on: " + frame_key
    assert 'model_metrics' in p, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a model_metrics object."
    mm = p['model_metrics'][0]
    h2o.H2O.verboseprint('mm: ', repr(mm))
    assert 'auc' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an AUC."
    assert 'cm' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a CM."
    assert 'predictions' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an predictions section."
    assert 'key' in mm['predictions'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key."
    assert 'name' in mm['predictions']['key'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key name."

    # The predictions frame should now be visible through the /Frames API.
    # NOTE(review): 'a_node' is a module-global H2O node handle defined
    # elsewhere in this file.
    predictions_key = mm['predictions']['key']['name']
    # BUG FIX: don't clobber the 'result' parameter with the /Frames
    # response; use a distinct local instead.
    frames_response = a_node.frames(key=predictions_key, find_compatible_models=True, len=5)
    frames = frames_response['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert predictions_key in frames_dict, "FAIL: Failed to find predictions key" + predictions_key + " in Frames list."

    # Check the shape/content of the predictions block itself.
    predictions = mm['predictions']
    h2o.H2O.verboseprint('p: ', repr(p))
    assert 'columns' in predictions, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an columns section."
    assert len(predictions['columns']) > 0, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain any columns."
    assert 'label' in predictions['columns'][0], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 has no label element."
    assert 'predict' == predictions['columns'][0]['label'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 is not 'predict'."
    assert expected_rows == predictions['rows'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " has an unexpected number of rows."
def validate_predictions(result, model_name, frame_key, expected_rows, destination_key=None):
    '''
    Validate a /Predictions result.

    result -- parsed JSON response from the /Predictions endpoint
    model_name -- model that was scored (used only in failure messages)
    frame_key -- frame that was scored (used only in failure messages)
    expected_rows -- expected row count of the predictions frame
    destination_key -- if given, the destination key the caller requested;
        verified against the key reported in the result
    '''
    # BUG FIX: the body referred to an undefined name 'p'; it is the
    # 'result' parameter, aliased here so the assertions work unchanged.
    p = result
    assert p is not None, "FAIL: Got a null result for scoring: " + model_name + " on: " + frame_key
    assert 'model_metrics' in p, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a model_metrics object."
    mm = p['model_metrics'][0]
    h2o.H2O.verboseprint('mm: ', repr(mm))
    # AUC/CM checks disabled: not all model categories produce them.
    #assert 'auc' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an AUC."
    #assert 'cm' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a CM."
    assert 'predictions' in mm, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an predictions section."
    assert 'key' in mm['predictions'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key."
    assert 'name' in mm['predictions']['key'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain a key name."

    # The predictions frame should now be visible through the /Frames API.
    # NOTE(review): 'a_node' is a module-global H2O node handle defined
    # elsewhere in this file.
    predictions_key = mm['predictions']['key']['name']
    f = a_node.frames(key=predictions_key, find_compatible_models=True, row_count=5)
    frames = f['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')
    assert predictions_key in frames_dict, "FAIL: Failed to find predictions key" + predictions_key + " in Frames list."

    # Check the shape/content of the predictions block itself.
    predictions = mm['predictions']
    h2o.H2O.verboseprint('p: ', repr(p))
    assert 'columns' in predictions, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain an columns section."
    assert len(predictions['columns']) > 0, "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " does not contain any columns."
    assert 'label' in predictions['columns'][0], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 has no label element."
    assert 'predict' == predictions['columns'][0]['label'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " column 0 is not 'predict'."
    assert expected_rows == predictions['rows'], "FAIL: Predictions for scoring: " + model_name + " on: " + frame_key + " has an unexpected number of rows."

    # The top-level result must advertise where the predictions were written.
    assert 'destination_key' in result, "FAIL: failed to find 'destination_key' in predict result:" + h2o_util.dump_json(result)
    assert 'name' in result['destination_key'], "FAIL: failed to find name in 'destination_key' in predict result:" + h2o_util.dump_json(result)

    if destination_key is not None:
        assert destination_key == result['destination_key']['name'], "FAIL: bad value for 'destination_key' in predict result; expected: " + destination_key + ", got: " + result['destination_key']['name']
# ---- Beispiel #4 (scraped example separator; score 0) ----
    def test_simple2(self):
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
        csvPathname = find_file("smalldata/logreg/prostate.csv")
        import_result = h2o.n0.import_files(path=csvPathname)
        # print dump_json(import_result)

        k = import_result['keys'][0]
        frames_result = h2o.n0.frames(key=k)

        frame = frames_result['frames'][0]
        rows = frame['rows']
        columns = frame['columns']
        for c in columns:
            label = c['label']
            missing = c['missing_count']
            stype = c['type']
            domain = c['domain']

        # print dump_json(frame)

        # let's see what ray's util does
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)
        for k,v in frames_dict.items():
            print "frames_dict key:", k

        # interesting. we can do dictionary comprehensions
        # { k:v for k,v in my_dict.items() if 'Peter' in k }

        # how do you parse multiple files
        parse_result = h2o.n0.parse(key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

        frame = parse_result['frames'][0]
        hex_key = frame['key']['name']

        colCount = 9
        rowCount = 380
        # colCount = 11
        # rowCount = 1000000
        start = time.time()
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        numCols = len(inspect['frames'][0]['columns'])
        numRows = inspect['frames'][0]['rows']
        print "\n" + csvPathname, \
            "    rows:", "{:,}".format(numRows), \
            "    len(columns):", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (numRows, rowCount))

        verboseprint(hex_key, ":", dump_json(parse_result))
    def testCheckWithModelAPI(self):
        """Check model/frame compatibility discovery in both directions:
        the one-model API with find_compatible_frames, and the one-frame
        API with find_compatible_models.
        """
        ######################################################################
        # Look for kmeans_model_name using the one-model API and
        # find_compatible_frames, and check the parsed frame is listed.
        # (Removed the unused 'found_kmeans' local with its stray semicolon.)
        model = self.a_node.models(key=self.kmeans_model_name, find_compatible_frames=True)
        h2o_util.assertKeysExist(model['models'][0], '', ['compatible_frames'])
        assert self.prostate_key in model['models'][0]['compatible_frames'], \
            "Failed to find " + self.prostate_key + " in compatible_frames list."

        ######################################################################
        # Look for prostate_key using the one-frame API and
        # find_compatible_models, and check both models are listed.
        result = self.a_node.frames(key='prostate.hex', find_compatible_models=True)
        frames = result['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."

        compatible_models = result['compatible_models']
        models_dict = h2o_util.list_to_dict(compatible_models, 'key')
        assert self.dl_prostate_model_name in models_dict, "Failed to find " + \
                    self.dl_prostate_model_name + " in compatible models list."

        assert self.dl_prostate_model_name in frames[0]['compatible_models']
        assert self.kmeans_model_name in frames[0]['compatible_models']
 def testImportProstate(self):
     """Import and parse prostate.csv, then sanity-check the /Frames,
     /Columns, /Column and /Summary endpoints for the parsed frame.
     """
     cleanup(self.a_node)
     # NOTE(review): hard-coded absolute path -- only works on the original
     # author's machine; should come from configuration.
     import_result = self.a_node.import_files(path="/Users/radu/h2o-dev/smalldata/logreg/prostate.csv")
     parse_result = self.a_node.parse(key=import_result['keys'][0]) # TODO: handle multiple files
     self.prostate_key = parse_result['frames'][0]['key']['name']
     # Test /Frames for prostate.csv
     frames = self.a_node.frames()['frames']
     frames_dict = h2o_util.list_to_dict(frames, 'key/name')
     assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."
     # Test /Frames/{key} for prostate.csv
     frames = self.a_node.frames(key='prostate.hex')['frames']
     frames_dict = h2o_util.list_to_dict(frames, 'key/name')
     assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'CAPSULE' in columns_dict, "Failed to find CAPSULE in Frames/prostate.hex."
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
     assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
     assert None is columns_dict['AGE']['bins'], "Failed to clear bins field." # should be cleared except for /summary
     # Test /Frames/{key}/columns
     frames = self.a_node.columns(key='prostate.hex')['frames']
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'ID' in columns_dict, "Failed to find ID in Frames/prostate.hex/columns."
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
     assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
     assert None is columns_dict['AGE']['bins'], "Failed to clear bins field." # should be cleared except for /summary
     # Test /Frames/{key}/columns/{column}
     frames = self.a_node.column(key='prostate.hex', column='AGE')['frames']
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
     assert 'bins' in columns_dict['AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
     assert None is columns_dict['AGE']['bins'], "Failed to clear bins field." # should be cleared except for /summary
     # Test /Frames/{key}/columns/{column}/summary -- here 'bins' must be
     # populated, and the summary statistics are checked against the known
     # values for the AGE column of prostate.csv.
     frames = self.a_node.summary(key='prostate.hex', column='AGE')['frames']
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns/AGE/summary."
     col = columns_dict['AGE']
     h2o_util.assertKeysExistAndNonNull(col, '', ['label', 'missing', 'zeros', 'pinfs', 'ninfs', 'mins',
             'maxs', 'mean', 'sigma', 'type', 'data', 'precision', 'bins', 'base', 'stride', 'pctiles'])
     h2o_util.assertKeysExist(col, '', ['domain', 'str_data'])
     assert col['mins'][0] == 43, 'Failed to find 43 as the first min for AGE.'
     assert col['maxs'][0] == 79, 'Failed to find 79 as the first max for AGE.'
     assert col['mean'] == 66.03947368421052, 'Failed to find 66.03947368421052 as the mean for AGE.'
     assert col['sigma'] == 6.527071269173308, 'Failed to find 6.527071269173308 as the sigma for AGE.'
     assert col['type'] == 'int', 'Failed to find int as the type for AGE.'
     assert col['data'][0] == 65, 'Failed to find 65 as the first data for AGE.'
     assert col['precision'] == -1, 'Failed to find -1 as the precision for AGE.'
     assert col['bins'][0] == 1, 'Failed to find 1 as the first bin for AGE.'
     assert col['base'] == 43, 'Failed to find 43 as the base for AGE.'
     assert col['stride'] == 1, 'Failed to find 1 as the stride for AGE.'
     assert col['pctiles'][0] == 50.5, 'Failed to find 50.5 as the first pctile for AGE.'
# ---- Beispiel #7 (scraped example separator; score 0) ----
def find_key(pattern=None):
    '''
    Return the first imported frame key whose name matches the given regex
    pattern, or None if nothing matches.  Warns (via verboseprint) when
    there is no match or more than one match.
    '''
    try:
        patternObj = re.compile(pattern)
    # BUG FIX: was a bare 'except:'; narrow it to the two things
    # re.compile can actually raise here (bad pattern / non-string).
    except (re.error, TypeError):
        # BUG FIX: the message used a comma ("... %s", pattern), which
        # passed a tuple instead of interpolating; use % formatting.
        raise Exception("Need legal pattern in find_key, not %s" % pattern)

    frames = h2o_nodes.nodes[0].frames()['frames']
    frames_dict = h2o_util.list_to_dict(frames, 'key/name')

    matches = [key for key in frames_dict if patternObj.search(key)]

    if len(matches) == 0:
        verboseprint("Warning: No match for %s" % pattern)
        return None

    if len(matches) > 1:
        verboseprint("Warning: multiple imported keys match the key pattern %s, Using: %s" % (pattern, matches[0]))

    return matches[0]
    DatasetSpec('airlines_binomial', '../../smalldata/airlines/allyears2k_headers.zip', 43978, 'Binomial', 'IsDepDelayed', ['IsArrDelayed', 'ArrDelay', 'DepDelay']), # TODO: more ignored?

    DatasetSpec('iris_multinomial', '../../smalldata/iris/iris_wheader.csv', 150, 'Multinomial', 'class', []),
]

# Import every dataset listed in datasets_to_import and keep each spec,
# keyed by its destination key, for later lookup.
datasets = {} # the dataset spec
for dataset_spec in datasets_to_import:
    dataset = dataset_spec.import_and_validate_dataset(a_node) # it's also stored in dataset_spec['dataset']
    datasets[dataset_spec['dest_key']] = dataset_spec


################################################
# Test /Frames for prostate.csv
# NOTE(review): 'len=5' presumably limits the per-frame data returned --
# see the TODO below; the parameter wasn't working at the time.
frames = a_node.frames(len=5)['frames']
frames_dict = h2o_util.list_to_dict(frames, 'key/name')

# TODO: remove:
if h2o.H2O.verbose:
    print "frames: "
    pp.pprint(frames)

if h2o.H2O.verbose:
    print "frames_dict: "
    pp.pprint(frames_dict)

# TODO: test len and offset (they aren't working yet)
assert 'prostate_binomial' in frames_dict, "FAIL: Failed to find " + 'prostate_binomial' + " in Frames list."
assert not frames_dict['prostate_binomial']['isText'], "FAIL: Parsed Frame is isText"

 def testImportProstate(self):
     """Import and parse prostate.csv (path taken from self.cfg), then
     sanity-check the /Frames, /Columns, /Column and /Summary endpoints
     for the parsed frame.
     """
     cleanup(self.a_node)
     # Config entry is a (source_type, path) tuple; only local files are
     # supported here.
     prostate_tuple = self.cfg.data['prostate']
     if (prostate_tuple[0] == "file"):
         import_result = self.a_node.import_files(
             path=os.path.abspath(prostate_tuple[1]))
     else:
         raise RuntimeError("Unsupported file type specified")
     parse_result = self.a_node.parse(
         key=import_result['keys'][0])  # TODO: handle multiple files
     self.prostate_key = parse_result['frames'][0]['key']['name']
     # Test /Frames for prostate.csv
     frames = self.a_node.frames()['frames']
     frames_dict = h2o_util.list_to_dict(frames, 'key/name')
     assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."
     # Test /Frames/{key} for prostate.csv
     frames = self.a_node.frames(key='prostate.hex')['frames']
     frames_dict = h2o_util.list_to_dict(frames, 'key/name')
     assert 'prostate.hex' in frames_dict, "Failed to find prostate.hex in Frames list."
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'CAPSULE' in columns_dict, "Failed to find CAPSULE in Frames/prostate.hex."
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
     assert 'bins' in columns_dict[
         'AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
     assert None is columns_dict['AGE'][
         'bins'], "Failed to clear bins field."  # should be cleared except for /summary
     # Test /Frames/{key}/columns
     frames = self.a_node.columns(key='prostate.hex')['frames']
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'ID' in columns_dict, "Failed to find ID in Frames/prostate.hex/columns."
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
     assert 'bins' in columns_dict[
         'AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
     assert None is columns_dict['AGE'][
         'bins'], "Failed to clear bins field."  # should be cleared except for /summary
     # Test /Frames/{key}/columns/{column}
     frames = self.a_node.column(key='prostate.hex', column='AGE')['frames']
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns."
     assert 'bins' in columns_dict[
         'AGE'], "Failed to find bins in Frames/prostate.hex/columns/AGE."
     assert None is columns_dict['AGE'][
         'bins'], "Failed to clear bins field."  # should be cleared except for /summary
     # Test /Frames/{key}/columns/{column}/summary -- here 'bins' must be
     # populated, and the summary statistics are checked against the known
     # values for the AGE column of prostate.csv.
     frames = self.a_node.summary(key='prostate.hex',
                                  column='AGE')['frames']
     columns_dict = h2o_util.list_to_dict(frames[0]['columns'], 'label')
     assert 'AGE' in columns_dict, "Failed to find AGE in Frames/prostate.hex/columns/AGE/summary."
     col = columns_dict['AGE']
     h2o_util.assertKeysExistAndNonNull(col, '', [
         'label', 'missing', 'zeros', 'pinfs', 'ninfs', 'mins', 'maxs',
         'mean', 'sigma', 'type', 'data', 'precision', 'bins', 'base',
         'stride', 'pctiles'
     ])
     h2o_util.assertKeysExist(col, '', ['domain', 'str_data'])
     assert col['mins'][
         0] == 43, 'Failed to find 43 as the first min for AGE.'
     assert col['maxs'][
         0] == 79, 'Failed to find 79 as the first max for AGE.'
     assert col[
         'mean'] == 66.03947368421052, 'Failed to find 66.03947368421052 as the mean for AGE.'
     assert col[
         'sigma'] == 6.527071269173308, 'Failed to find 6.527071269173308 as the sigma for AGE.'
     assert col['type'] == 'int', 'Failed to find int as the type for AGE.'
     assert col['data'][
         0] == 65, 'Failed to find 65 as the first data for AGE.'
     assert col[
         'precision'] == -1, 'Failed to find -1 as the precision for AGE.'
     assert col['bins'][
         0] == 1, 'Failed to find 1 as the first bin for AGE.'
     assert col['base'] == 43, 'Failed to find 43 as the base for AGE.'
     assert col['stride'] == 1, 'Failed to find 1 as the stride for AGE.'
     assert col['pctiles'][
         0] == 50.5, 'Failed to find 50.5 as the first pctile for AGE.'
# ---- Beispiel #10 (scraped example separator; score 0) ----
    def test_parse_multi_header_single(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
        ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs,
             dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1  # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 rowCount,
                                                 headerData=None,
                                                 rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(
                    SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname,
                                                 dataRowsWithHeader,
                                                 headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                                schema='put',
                                noPrint=True)
                print f

            # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
            # frames = h2o.nodes[0].frames()['frames']
            frames = h2o.n0.frames()['frames']
            frames_dict = h2o_util.list_to_dict(frames, 'key/name')

            # print "frames:", dump_json(frames)
            # print "frames_dict:", dump_json(frames_dict)

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception(
                        "Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()

            # does h2o-dev take a regex? or do we need to glob
            parseResult = h2i.parse_only(
                pattern='*' + rowxcol + '*',
                hex_key=hex_key,
                timeoutSecs=timeoutSecs,
                checkHeader="1")  # header_from_file=header

            pA = h2o_cmd.ParseObj(parseResult,
                                  expectedNumRows=totalDataRows,
                                  expectedNumCols=totalCols)
            print pA.numRows
            print pA.numCols
            print pA.parse_key

            expectedLabelList = headerData.split(",")
            iA = h2o_cmd.InspectObj(pA.parse_key,
                                    expectedNumRows=totalDataRows,
                                    expectedNumCols=totalCols,
                                    expectedMissinglist=[],
                                    expectedLabelList=expectedLabelList)

            if DO_RF:
                # put in an ignore param, that will fail unless headers were parsed correctly
                if HEADER:
                    kwargs = {
                        'sample_rate': 0.75,
                        'max_depth': 25,
                        'ntrees': 1,
                        'ignored_cols_by_name': 'ID,CAPSULE'
                    }
                else:
                    kwargs = {
                        'sample_rate': 0.75,
                        'max_depth': 25,
                        'ntrees': 1
                    }

                rfv = h2o_cmd.runRF(parseResult=parseResult,
                                    timeoutSecs=timeoutSecs,
                                    **kwargs)

            h2o.check_sandbox_for_errors()
# ---- Beispiel #11 (scraped example separator; score 0) ----
    DatasetSpec('airlines_binomial', '../../smalldata/airlines/allyears2k_headers.zip', 43978, 'Binomial', 'IsDepDelayed', ['IsArrDelayed', 'ArrDelay', 'DepDelay']), # TODO: more ignored?

    DatasetSpec('iris_multinomial', '../../smalldata/iris/iris_wheader.csv', 150, 'Multinomial', 'class', []),
]

# Import every dataset listed in datasets_to_import and keep each spec,
# keyed by its destination key, for later lookup.
datasets = {} # the dataset spec
for dataset_spec in datasets_to_import:
    dataset = dataset_spec.import_and_validate_dataset(a_node) # it's also stored in dataset_spec['dataset']
    datasets[dataset_spec['dest_key']] = dataset_spec


################################################
# Test /Frames for prostate.csv
# NOTE(review): 'row_count=5' presumably limits the per-frame data
# returned -- verify against the /Frames API schema for this build.
frames = a_node.frames(row_count=5)['frames']
frames_dict = h2o_util.list_to_dict(frames, 'key/name')

# TODO: remove:
if h2o.H2O.verbose:
    print "frames: "
    pp.pprint(frames)

if h2o.H2O.verbose:
    print "frames_dict: "
    pp.pprint(frames_dict)

assert 'prostate_binomial' in frames_dict, "FAIL: Failed to find " + 'prostate_binomial' + " in Frames list."
assert not frames_dict['prostate_binomial']['is_text'], "FAIL: Parsed Frame is is_text"


# Test /Frames/{key} for prostate.csv
    def test_parse_multi_header_single(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_ints.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

        # cols must be 9 to match the header above, otherwise a different bug is hit
        # extra output is added, so it's 10 total
        tryList = [
            (57, 300, 9, 'cA', 60, 0),
            # try with 1-3 data lines in the header file too
            (57, 300, 9, 'cB', 60, 1),
            (57, 300, 9, 'cC', 60, 2),
            (57, 300, 9, 'cD', 60, 3),
            ]

        trial = 0
        for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
            trial += 1
            # FIX! should we add a header to them randomly???
            print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
            rowxcol = str(rowCount) + 'x' + str(colCount)
            totalCols = colCount + 1 # 1 extra for output
            totalDataRows = 0
            for fileN in range(fileNum):
                csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                rList = rand_rowData(colCount)
                dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
                totalDataRows += dataRowsDone

            # create the header file
            # can make it pass by not doing this
            if HEADER:
                csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
                csvPathname = SYNDATASETS_DIR + '/' + csvFilename
                dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
                totalDataRows += dataRowsDone

            # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
            src_key = "syn_" + str(trial)
            hex_key = "syn_" + str(trial) + ".hex"

            # DON"T get redirected to S3! (EC2 hack in config, remember!)
            # use it at the node level directly (because we gen'ed the files.
            # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
            # put them, rather than using import files, so this works if remote h2o is used
            # and python creates the files locally
            fileList = os.listdir(SYNDATASETS_DIR)
            for f in fileList:
                h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
                print f

            # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node?
            # frames = h2o.nodes[0].frames()['frames']
            frames = h2o.n0.frames()['frames']
            frames_dict = h2o_util.list_to_dict(frames, 'key/name')

            # print "frames:", dump_json(frames)
            # print "frames_dict:", dump_json(frames_dict)

            if HEADER:
                header = h2i.find_key('syn_header')
                if not header:
                    raise Exception("Didn't find syn_header* key in the import")

            # use regex. the only files in the dir will be the ones we just created with  *fileN* match
            print "Header Key = " + header
            start = time.time()

            # does h2o-dev take a regex? or do we need to glob
            parseResult = h2i.parse_only(pattern='*'+rowxcol+'*',
                hex_key=hex_key, timeoutSecs=timeoutSecs, checkHeader="1") # header_from_file=header

            pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
            print pA.numRows
            print pA.numCols
            print pA.parse_key

            expectedLabelList = headerData.split(",")
            iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols,
                expectedMissinglist=[], expectedLabelList=expectedLabelList)

            if DO_RF:
                # put in an ignore param, that will fail unless headers were parsed correctly
                if HEADER:
                    kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE'}
                else:
                    kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

                rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

            h2o.check_sandbox_for_errors()
# ---- Beispiel #13 (scraped example separator; score 0) ----
    def test_simple2(self):
        # h2o-dev doesn't take ../.. type paths? make find_file return absolute path
        # csvPathname = find_file("bigdata/laptop/poker-hand-testing.data")
        csvPathname = find_file("smalldata/logreg/prostate.csv")
        import_result = h2o.n0.import_files(path=csvPathname)
        # print dump_json(import_result)

        k = import_result['keys'][0]
        frames_result = h2o.n0.frames(key=k)

        frame = frames_result['frames'][0]
        rows = frame['rows']
        columns = frame['columns']
        for c in columns:
            label = c['label']
            missing = c['missing_count']
            stype = c['type']
            domain = c['domain']

        # print dump_json(frame)

        # let's see what ray's util does
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)
        for k, v in frames_dict.items():
            print "frames_dict key:", k

        # interesting. we can do dictionary comprehensions
        # { k:v for k,v in my_dict.items() if 'Peter' in k }

        # how do you parse multiple files
        parse_result = h2o.n0.parse(
            key=k, intermediateResults=DO_INTERMEDIATE_RESULTS)

        frame = parse_result['frames'][0]
        hex_key = frame['key']['name']

        colCount = 9
        rowCount = 380
        # colCount = 11
        # rowCount = 1000000
        start = time.time()
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "Inspect:", hex_key, "took", time.time() - start, "seconds"
        numCols = len(inspect['frames'][0]['columns'])
        numRows = inspect['frames'][0]['rows']
        print "\n" + csvPathname, \
            "    rows:", "{:,}".format(numRows), \
            "    len(columns):", "{:,}".format(numCols)

        # should match # of cols in header or ??
        self.assertEqual(
            numCols, colCount,
            "parse created result with the wrong number of cols %s %s" %
            (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (numRows, rowCount))

        verboseprint(hex_key, ":", dump_json(parse_result))