Example #1
    def test_NN_mnist_multi(self):
        # h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = "mnist/train.csv.gz"
        csvPathname_test = "mnist/test.csv.gz"
        hex_key = "mnist_train.hex"
        validation_key = "mnist_test.hex"
        timeoutSecs = 60
        parseResult = h2i.import_parse(
            bucket="smalldata", path=csvPathname_train, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs
        )
        parseResultV = h2i.import_parse(
            bucket="smalldata", path=csvPathname_test, schema="put", hex_key=validation_key, timeoutSecs=timeoutSecs
        )
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, "    numRows:", "{:,}".format(
            inspect["numRows"]
        ), "    numCols:", "{:,}".format(inspect["numCols"])
        response = inspect["numCols"] - 1

        modes = [
            ###'SingleThread', ### too slow (and slightly less accurate)
            "SingleNode",  ### wastes N-1 nodes, since their weight matrices are updated but never looked at...
            ###'MapReduce' ### TODO: enable, once implemented
        ]

        for mode in modes:

            # Making random id
            identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = "nn_" + identifier + ".hex"

            kwargs = {
                "ignored_cols": None,
                "response": response,
                "classification": 1,
                "mode": mode,
                "activation": "RectifierWithDropout",
                "input_dropout_ratio": 0.2,
                "hidden": "117,131,129",
                "rate": 0.005,
                "rate_annealing": 1e-6,
                "momentum_start": 0.5,
                "momentum_ramp": 100000,
                "momentum_stable": 0.9,
                "l1": 0.00001,
                "l2": 0.0000001,
                "seed": 98037452452,
                "loss": "CrossEntropy",
                "max_w2": 15,
                "warmup_samples": 0,
                "initial_weight_distribution": "UniformAdaptive",
                #'initial_weight_scale'         : 0.01,
                "epochs": 20.0,
                "destination_key": model_key,
                "validation": validation_key,
            }
            ###expectedErr = 0.0362 ## from single-threaded mode
            expectedErr = 0.03  ## observed actual value with Hogwild

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds"

            relTol = 0.02 if mode == "SingleThread" else 0.10  ### 10% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(
                self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs
            )

            ### Now score using the model, and check the validation error
            kwargs = {
                "source": validation_key,
                "max_rows": 0,
                "response": response,
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "destination_key": "score_" + identifier + ".hex",
                "model": model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

            if mode != "MapReduce":
                print "WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results."

        h2o.beta_features = False
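
All of these examples are test methods lifted from the legacy h2o-2 Python test suite; they assume harness modules (h2o, h2o_cmd, h2i, h2o_nn) plus standard-library imports (random, string, time), and an H2O cloud that is already running. The sketch below shows the kind of scaffolding such a method would sit inside; the module names, the h2o_import-as-h2i alias, and the build_cloud/tear_down_cloud/unit_main calls follow common h2o-2 test conventions and are assumptions here, not part of the examples themselves.

# Minimal harness sketch (Python 2, matching the snippets above). The module
# names, the h2o_import-as-h2i alias, and the build_cloud/tear_down_cloud/
# unit_main calls are assumptions based on typical h2o-2 test conventions.
import unittest, random, string, time
import h2o, h2o_cmd, h2o_nn
import h2o_import as h2i

class TestNNBasic(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Build a small local H2O cloud before any test_NN_* method runs.
        h2o.build_cloud(1, java_heap_GB=2)

    @classmethod
    def tearDownClass(cls):
        # Shut the cloud down once all test methods have finished.
        h2o.tear_down_cloud()

    # The test_NN_* methods shown in these examples would go here.

if __name__ == '__main__':
    h2o.unit_main()
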
Example #2
    def test_NN_covtype(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'covtype/covtype.20k.data'
        csvPathname_test  = 'covtype/covtype.20k.data'
        hex_key = 'covtype.hex'
        validation_key = hex_key
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs)
        ###No need - use training as validation
        ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread',
            'SingleNode',
            ]

        for mode in modes:

            #Making random id
            identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'mode'                         : mode,
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.1,
                'hidden'                       : '200,200',
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-5,
                'momentum_start'               : 0.1,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.3,
                'l1'                           : 0.0000,
                'l2'                           : 0.0000,
                'seed'                         : 28372348842,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 10,
                'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'Normal',
                'initial_weight_scale'         : 1,
                'epochs'                       : 2.0,
                'destination_key'              : model_key,
                'validation'                   : validation_key,
            }
            expectedErr = 0.3413 if mode == 'SingleThread' else 0.3 ## expected validation error for the above model

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

            relTol = 0.03 if mode == 'SingleThread' else 0.20 ### 20% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source' : validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
                }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

        h2o.beta_features = False
Example #3
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread', 
            'SingleNode',
            ###'MapReduce' ### TODO: enable, once implemented
            ]

        for mode in modes:

            #Making random id
            identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'mode'                         : mode,
                'activation'                   : 'RectifierWithDropout',
                'input_dropout_ratio'          : 0.2,
                'hidden'                       : '117,131,129',
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0.5,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.9,
                'l1'                           : 0.00001,
                'l2'                           : 0.0000001,
                'seed'                         : 98037452452,
                'loss'                         : 'CrossEntropy',
                'max_w2'                       : 15,
                'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 2.0,
                'destination_key'              : model_key,
                'validation'                   : validation_key,
            }
            expectedErr = 0.0565 ## expected validation error for the above model on 1 thread

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

            #### Look at model progress, and check the last reported validation error
            relTol = 0.3 if mode == 'SingleThread' else 0.15
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

            #### Now score using the model, and check the validation error
            kwargs = {
                'source' : validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

        h2o.beta_features = False
Example #4
    def test_NN_covtype(self):
        # h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = "covtype/covtype.20k.data"
        csvPathname_test = "covtype/covtype.20k.data"
        hex_key = "covtype.hex"
        validation_key = hex_key
        timeoutSecs = 30
        parseResult = h2i.import_parse(
            bucket="smalldata", path=csvPathname_train, schema="local", hex_key=hex_key, timeoutSecs=timeoutSecs
        )
        ###No need - use training as validation
        ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, "    numRows:", "{:,}".format(
            inspect["numRows"]
        ), "    numCols:", "{:,}".format(inspect["numCols"])
        response = inspect["numCols"] - 1

        modes = ["SingleThread", "SingleNode"]

        for mode in modes:

            # Making random id
            identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = "nn_" + identifier + ".hex"

            kwargs = {
                "ignored_cols": None,
                "response": response,
                "classification": 1,
                "mode": mode,
                "activation": "Tanh",
                #'input_dropout_ratio'          : 0.1,
                "hidden": "200,200",
                "rate": 0.005,
                "rate_annealing": 1e-5,
                "momentum_start": 0.1,
                "momentum_ramp": 100000,
                "momentum_stable": 0.3,
                "l1": 0.0000,
                "l2": 0.0000,
                "seed": 28372348842,
                "loss": "CrossEntropy",
                #'max_w2'                       : 10,
                "warmup_samples": 0,
                "initial_weight_distribution": "Normal",
                "initial_weight_scale": 1,
                "epochs": 2.0,
                "destination_key": model_key,
                "validation": validation_key,
            }
            expectedErr = 0.3413 if mode == "SingleThread" else 0.3  ## expected validation error for the above model

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds"

            relTol = 0.03 if mode == "SingleThread" else 0.15  ### 15% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(
                self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs
            )

            ### Now score using the model, and check the validation error
            kwargs = {
                "source": validation_key,
                "max_rows": 0,
                "response": response,
                "ignored_cols": None,  # this is not consistent with ignored_cols_by_name
                "classification": 1,
                "destination_key": "score_" + identifier + ".hex",
                "model": model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

        h2o.beta_features = False
Example #5
    def test_NN_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            ###'SingleThread', ### too slow (and slightly less accurate)
            'SingleNode',  ### wastes N-1 nodes, since their weight matrices are updated but never looked at...
            ###'MapReduce' ### TODO: enable, once implemented
        ]

        for mode in modes:

            #Making random id
            identifier = ''.join(
                random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols': None,
                'response': response,
                'classification': 1,
                'mode': mode,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117,131,129',
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                'loss': 'CrossEntropy',
                'max_w2': 15,
                'warmup_samples': 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 20.0,
                'destination_key': model_key,
                'validation': validation_key,
            }
            ###expectedErr = 0.0362 ## from single-threaded mode
            expectedErr = 0.03  ## observed actual value with Hogwild

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
            ) - start, 'seconds'

            relTol = 0.02 if mode == 'SingleThread' else 0.10  ### 10% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                            inspect['numRows'], expectedErr,
                                            relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source': validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

            if mode != 'MapReduce':
                print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.'

        h2o.beta_features = False
Example #6
    def test_NN_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue    = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse   = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" 

        twoValueList = [
            ('A','B',0, 14),
            ('A','B',1, 14),
            (0,1,0, 12),
            (0,1,1, 12),
            (0,1,'NaN', 12),
            (1,0,'NaN', 12),
            (-1,1,0, 12),
            (-1,1,1, 12),
            (-1e1,1e1,1e1, 12),
            (-1e1,1e1,-1e1, 12),
            ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, 
                rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'

            parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : 'C' + str(response),
                'classification'               : 1,
                'mode'                         : 'SingleThread',
                'activation'                   : 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden'                       : '500',
                'rate'                         : 0.01,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0,
                'momentum_ramp'                : 0,
                'momentum_stable'              : 0,
                'l1'                           : 0.0,
                'l2'                           : 1e-4,
                'seed'                         : 80023842348,
                'loss'                         : 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 1.0,
                'destination_key'              : model_key,
                'validation'                   : hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o.beta_features = True
            h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "trial #", trial, "NN end on ", csvFilename, ' took', time.time() - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.0
            relTol = 0.01
            kwargs = {
                'source' : hex_key,
                'max_rows': 0,
                'response': 'C' + str(response),
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score' + str(trial) + '.hex',
                'model': model_key
            }

            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

            h2o.check_sandbox_for_errors()

            trial += 1
Example #7
    def test_NN_mnist(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata',
                                        path=csvPathname_test,
                                        schema='put',
                                        hex_key=validation_key,
                                        timeoutSecs=timeoutSecs)

        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread',
            'SingleNode',
            ###'MapReduce' ### TODO: enable, once implemented
        ]

        for mode in modes:

            #Making random id
            identifier = ''.join(
                random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols': None,
                'response': response,
                'classification': 1,
                'mode': mode,
                'activation': 'RectifierWithDropout',
                'input_dropout_ratio': 0.2,
                'hidden': '117,131,129',
                'rate': 0.005,
                'rate_annealing': 1e-6,
                'momentum_start': 0.5,
                'momentum_ramp': 100000,
                'momentum_stable': 0.9,
                'l1': 0.00001,
                'l2': 0.0000001,
                'seed': 98037452452,
                'loss': 'CrossEntropy',
                'max_w2': 15,
                'warmup_samples': 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 2.0,
                'destination_key': model_key,
                'validation': validation_key,
            }
            expectedErr = 0.0565  ## expected validation error for the above model on 1 thread

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
            ) - start, 'seconds'

            #### Look at model progress, and check the last reported validation error
            relTol = 0.3 if mode == 'SingleThread' else 0.15
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                            inspect['numRows'], expectedErr,
                                            relTol, **kwargs)

            #### Now score using the model, and check the validation error
            kwargs = {
                'source': validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

        h2o.beta_features = False
Example #8
    def test_NN_twovalues(self):
        h2o.beta_features = True
        SYNDATASETS_DIR = h2o.make_syn_dir()
        csvFilename = "syn_twovalues.csv"
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4"
        rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4"

        twoValueList = [
            ('A', 'B', 0, 14),
            ('A', 'B', 1, 14),
            (0, 1, 0, 12),
            (0, 1, 1, 12),
            (0, 1, 'NaN', 12),
            (1, 0, 'NaN', 12),
            (-1, 1, 0, 12),
            (-1, 1, 1, 12),
            (-1e1, 1e1, 1e1, 12),
            (-1e1, 1e1, -1e1, 12),
        ]

        trial = 0
        for (outputTrue, outputFalse, case, coeffNum) in twoValueList:
            write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse,
                              str(outputTrue), str(outputFalse))

            start = time.time()
            hex_key = csvFilename + "_" + str(trial)
            model_key = 'trial_' + str(trial) + '.hex'

            parseResult = h2i.import_parse(path=csvPathname,
                                           schema='put',
                                           hex_key=hex_key)
            print "using outputTrue: %s outputFalse: %s" % (outputTrue,
                                                            outputFalse)

            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
            print "\n" + csvPathname, \
                "    numRows:", "{:,}".format(inspect['numRows']), \
                "    numCols:", "{:,}".format(inspect['numCols'])
            response = inspect['numCols'] - 1

            kwargs = {
                'ignored_cols': None,
                'response': 'C' + str(response),
                'classification': 1,
                'mode': 'SingleThread',
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.2,
                'hidden': '500',
                'rate': 0.01,
                'rate_annealing': 1e-6,
                'momentum_start': 0,
                'momentum_ramp': 0,
                'momentum_stable': 0,
                'l1': 0.0,
                'l2': 1e-4,
                'seed': 80023842348,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 15,
                #'warmup_samples'               : 0,
                'initial_weight_distribution': 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs': 1.0,
                'destination_key': model_key,
                'validation': hex_key,
            }

            timeoutSecs = 60
            start = time.time()
            h2o.beta_features = True
            h2o_cmd.runNNet(parseResult=parseResult,
                            timeoutSecs=timeoutSecs,
                            **kwargs)
            print "trial #", trial, "NN end on ", csvFilename, ' took', time.time(
            ) - start, 'seconds'

            #### Now score using the model, and check the validation error
            expectedErr = 0.0
            relTol = 0.01
            kwargs = {
                'source': hex_key,
                'max_rows': 0,
                'response': 'C' + str(response),
                'ignored_cols': None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score' + str(trial) + '.hex',
                'model': model_key
            }

            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

            h2o.check_sandbox_for_errors()

            trial += 1
Example #9
    def test_NN_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 30
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            ###'SingleThread', ### too slow (and slightly less accurate)
            'SingleNode',  ### wastes N-1 nodes, since their weight matrices are updated but never looked at...
            ###'MapReduce' ### TODO: enable, once implemented
            ]

        for mode in modes:

            #Making random id
            identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols'                 : None,
                'response'                     : response,
                'classification'               : 1,
                'mode'                         : mode,
                'activation'                   : 'RectifierWithDropout',
                'input_dropout_ratio'          : 0.2,
                'hidden'                       : '117,131,129',
                'rate'                         : 0.005,
                'rate_annealing'               : 1e-6,
                'momentum_start'               : 0.5,
                'momentum_ramp'                : 100000,
                'momentum_stable'              : 0.9,
                'l1'                           : 0.00001,
                'l2'                           : 0.0000001,
                'seed'                         : 98037452452,
                'loss'                         : 'CrossEntropy',
                'max_w2'                       : 15,
                'warmup_samples'               : 0,
                'initial_weight_distribution'  : 'UniformAdaptive',
                #'initial_weight_scale'         : 0.01,
                'epochs'                       : 20.0,
                'destination_key'              : model_key,
                'validation'                   : validation_key,
            }
            ###expectedErr = 0.0362 ## from single-threaded mode
            expectedErr = 0.0331 ## observed actual value with Hogwild

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

            relTol = 0.02 if mode == 'SingleThread' else 0.10 ### 10% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source' : validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None, # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)

            if mode != 'MapReduce':
                print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.'

        h2o.beta_features = False
Example #10
    def test_NN2_mnist_multi(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'mnist/train.csv.gz'
        csvPathname_test  = 'mnist/test.csv.gz'
        hex_key = 'mnist_train.hex'
        validation_key = 'mnist_test.hex'
        timeoutSecs = 60
        parseResult  = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs)
        parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1


        #Making random id
        identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10))
        model_key = 'nn_' + identifier + '.hex'

        kwargs = {
            'ignored_cols'                 : None,
            'response'                     : response,
            'classification'               : 1,
            'activation'                   : 'RectifierWithDropout',
            'input_dropout_ratio'          : 0.2,
            'hidden'                       : '117,131,129',
            'rate'                         : 0.005,
            'rate_annealing'               : 1e-6,
            'momentum_start'               : 0.5,
            'momentum_ramp'                : 100000,
            'momentum_stable'              : 0.9,
            'l1'                           : 0.00001,
            'l2'                           : 0.0000001,
            'seed'                         : 98037452452,
            'loss'                         : 'CrossEntropy',
            'max_w2'                       : 15,
            'initial_weight_distribution'  : 'UniformAdaptive',
            #'initial_weight_scale'         : 0.01,
            'epochs'                       : 20.0,
            'destination_key'              : model_key,
            'validation'                   : validation_key,
        }
        ###expectedErr = 0.0362 ## from single-threaded mode
        expectedErr = 0.03 ## observed actual value with Hogwild
        relTol = 0.10 ## needed by checkScoreResult below; 10% relative error is acceptable for Hogwild

        timeoutSecs = 600
        start = time.time()
        nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds'

        ### Now score using the model, and check the validation error
        kwargs = {
            'source' : validation_key,
            'max_rows': 0,
            'response': response,
            'ignored_cols': None, # this is not consistent with ignored_cols_by_name
            'classification': 1,
            'destination_key': 'score_' + identifier + '.hex',
            'model': model_key,
        }
        nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs)
        h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)
Example #11
    def test_NN_covtype(self):
        #h2b.browseTheCloud()
        h2o.beta_features = True
        csvPathname_train = 'covtype/covtype.20k.data'
        csvPathname_test = 'covtype/covtype.20k.data'
        hex_key = 'covtype.hex'
        validation_key = hex_key
        timeoutSecs = 30
        parseResult = h2i.import_parse(bucket='smalldata',
                                       path=csvPathname_train,
                                       schema='put',
                                       hex_key=hex_key,
                                       timeoutSecs=timeoutSecs)
        ###No need - use training as validation
        ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs)
        inspect = h2o_cmd.runInspect(None, hex_key)
        print "\n" + csvPathname_train, \
            "    numRows:", "{:,}".format(inspect['numRows']), \
            "    numCols:", "{:,}".format(inspect['numCols'])
        response = inspect['numCols'] - 1

        modes = [
            'SingleThread',
            'SingleNode',
        ]

        for mode in modes:

            #Making random id
            identifier = ''.join(
                random.sample(string.ascii_lowercase + string.digits, 10))
            model_key = 'nn_' + identifier + '.hex'

            kwargs = {
                'ignored_cols': None,
                'response': response,
                'classification': 1,
                'mode': mode,
                'activation': 'Tanh',
                #'input_dropout_ratio'          : 0.1,
                'hidden': '200,200',
                'rate': 0.005,
                'rate_annealing': 1e-5,
                'momentum_start': 0.1,
                'momentum_ramp': 100000,
                'momentum_stable': 0.3,
                'l1': 0.0000,
                'l2': 0.0000,
                'seed': 28372348842,
                'loss': 'CrossEntropy',
                #'max_w2'                       : 10,
                'warmup_samples': 0,
                'initial_weight_distribution': 'Normal',
                'initial_weight_scale': 1,
                'epochs': 2.0,
                'destination_key': model_key,
                'validation': validation_key,
            }
            expectedErr = 0.35195 if mode == 'SingleThread' else 0.3  ## expected validation error for the above model

            timeoutSecs = 600
            start = time.time()
            nn = h2o_cmd.runNNet(parseResult=parseResult,
                                 timeoutSecs=timeoutSecs,
                                 **kwargs)
            print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time(
            ) - start, 'seconds'

            relTol = 0.03 if mode == 'SingleThread' else 0.20  ### 20% relative error is acceptable for Hogwild
            h2o_nn.checkLastValidationError(self, nn['neuralnet_model'],
                                            inspect['numRows'], expectedErr,
                                            relTol, **kwargs)

            ### Now score using the model, and check the validation error
            kwargs = {
                'source': validation_key,
                'max_rows': 0,
                'response': response,
                'ignored_cols': None,  # this is not consistent with ignored_cols_by_name
                'classification': 1,
                'destination_key': 'score_' + identifier + '.hex',
                'model': model_key,
            }
            nnScoreResult = h2o_cmd.runNNetScore(
                key=parseResult['destination_key'],
                timeoutSecs=timeoutSecs,
                **kwargs)
            h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol,
                                    **kwargs)

        h2o.beta_features = False