def test_NN_mnist_multi(self): # h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = "mnist/train.csv.gz" csvPathname_test = "mnist/test.csv.gz" hex_key = "mnist_train.hex" validation_key = "mnist_test.hex" timeoutSecs = 60 parseResult = h2i.import_parse( bucket="smalldata", path=csvPathname_train, schema="put", hex_key=hex_key, timeoutSecs=timeoutSecs ) parseResultV = h2i.import_parse( bucket="smalldata", path=csvPathname_test, schema="put", hex_key=validation_key, timeoutSecs=timeoutSecs ) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"]) response = inspect["numCols"] - 1 modes = [ ###'SingleThread', ### too slow (and slightly less accurate) "SingleNode", ### wastes N-1 nodes, since their weight matrices are updated but never looked at... ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: # Making random id identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = "nn_" + identifier + ".hex" kwargs = { "ignored_cols": None, "response": response, "classification": 1, "mode": mode, "activation": "RectifierWithDropout", "input_dropout_ratio": 0.2, "hidden": "117,131,129", "rate": 0.005, "rate_annealing": 1e-6, "momentum_start": 0.5, "momentum_ramp": 100000, "momentum_stable": 0.9, "l1": 0.00001, "l2": 0.0000001, "seed": 98037452452, "loss": "CrossEntropy", "max_w2": 15, "warmup_samples": 0, "initial_weight_distribution": "UniformAdaptive", #'initial_weight_scale' : 0.01, "epochs": 20.0, "destination_key": model_key, "validation": validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds" relTol = 0.02 if mode == "SingleThread" else 0.10 ### 10% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError( self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs ) ### Now score using the model, and check the validation error kwargs = { "source": validation_key, "max_rows": 0, "response": response, "ignored_cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "destination_key": "score_" + identifier + ".hex", "model": model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) if mode != "MapReduce": print "WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results." h2o.beta_features = False
def test_NN_covtype(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'covtype/covtype.20k.data' csvPathname_test = 'covtype/covtype.20k.data' hex_key = 'covtype.hex' validation_key = hex_key timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='local', hex_key=hex_key, timeoutSecs=timeoutSecs) ###No need - use training as validation ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ] for mode in modes: #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'mode' : mode, 'activation' : 'Tanh', #'input_dropout_ratio' : 0.1, 'hidden' : '200,200', 'rate' : 0.005, 'rate_annealing' : 1e-5, 'momentum_start' : 0.1, 'momentum_ramp' : 100000, 'momentum_stable' : 0.3, 'l1' : 0.0000, 'l2' : 0.0000, 'seed' : 28372348842, 'loss' : 'CrossEntropy', #'max_w2' : 10, 'warmup_samples' : 0, 'initial_weight_distribution' : 'Normal', 'initial_weight_scale' : 1, 'epochs' : 2.0, 'destination_key' : model_key, 'validation' : validation_key, } expectedErr = 0.3413 if mode == 'SingleThread' else 0.3 ## expected validation error for the above model timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' relTol = 0.03 if mode == 'SingleThread' else 0.20 ### 20% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_mnist(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'mode' : mode, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'warmup_samples' : 0, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 2.0, 'destination_key' : model_key, 'validation' : validation_key, } expectedErr = 0.0565 ## expected validation error for the above model on 1 thread timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' #### Look at model progress, and check the last reported validation error relTol = 0.3 if mode == 'SingleThread' else 0.15 h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) #### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_covtype(self): # h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = "covtype/covtype.20k.data" csvPathname_test = "covtype/covtype.20k.data" hex_key = "covtype.hex" validation_key = hex_key timeoutSecs = 30 parseResult = h2i.import_parse( bucket="smalldata", path=csvPathname_train, schema="local", hex_key=hex_key, timeoutSecs=timeoutSecs ) ###No need - use training as validation ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, " numRows:", "{:,}".format( inspect["numRows"] ), " numCols:", "{:,}".format(inspect["numCols"]) response = inspect["numCols"] - 1 modes = ["SingleThread", "SingleNode"] for mode in modes: # Making random id identifier = "".join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = "nn_" + identifier + ".hex" kwargs = { "ignored_cols": None, "response": response, "classification": 1, "mode": mode, "activation": "Tanh", #'input_dropout_ratio' : 0.1, "hidden": "200,200", "rate": 0.005, "rate_annealing": 1e-5, "momentum_start": 0.1, "momentum_ramp": 100000, "momentum_stable": 0.3, "l1": 0.0000, "l2": 0.0000, "seed": 28372348842, "loss": "CrossEntropy", #'max_w2' : 10, "warmup_samples": 0, "initial_weight_distribution": "Normal", "initial_weight_scale": 1, "epochs": 2.0, "destination_key": model_key, "validation": validation_key, } expectedErr = 0.3413 if mode == "SingleThread" else 0.3 ## expected validation error for the above model timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, "took", time.time() - start, "seconds" relTol = 0.03 if mode == "SingleThread" else 0.15 ### 15% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError( self, nn["neuralnet_model"], inspect["numRows"], expectedErr, relTol, **kwargs ) ### Now score using the model, and check the validation error kwargs = { "source": validation_key, "max_rows": 0, "response": response, "ignored_cols": None, # this is not consistent with ignored_cols_by_name "classification": 1, "destination_key": "score_" + identifier + ".hex", "model": model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult["destination_key"], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 60 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ ###'SingleThread', ### too slow (and slightly less accurate) 'SingleNode', ### wastes N-1 nodes, since their weight matrices are updated but never looked at... ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'mode': mode, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'warmup_samples': 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 20.0, 'destination_key': model_key, 'validation': validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' relTol = 0.02 if mode == 'SingleThread' else 0.10 ### 10% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source': validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) if mode != 'MapReduce': print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.' h2o.beta_features = False
def test_NN_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A','B',0, 14), ('A','B',1, 14), (0,1,0, 12), (0,1,1, 12), (0,1,'NaN', 12), (1,0,'NaN', 12), (-1,1,0, 12), (-1,1,1, 12), (-1e1,1e1,1e1, 12), (-1e1,1e1,-1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols' : None, 'response' : 'C' + str(response), 'classification' : 1, 'mode' : 'SingleThread', 'activation' : 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden' : '500', 'rate' : 0.01, 'rate_annealing' : 1e-6, 'momentum_start' : 0, 'momentum_ramp' : 0, 'momentum_stable' : 0, 'l1' : 0.0, 'l2' : 1e-4, 'seed' : 80023842348, 'loss' : 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 1.0, 'destination_key' : model_key, 'validation' : hex_key, } timeoutSecs = 60 start = time.time() h2o.beta_features = True h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "NN end on ", csvFilename, ' took', time.time() - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.0 relTol = 0.01 kwargs = { 'source' : hex_key, 'max_rows': 0, 'response': 'C' + str(response), 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score' + str(trial) + '.hex', 'model': model_key } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.check_sandbox_for_errors() trial += 1
def test_NN_mnist(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'mode': mode, 'activation': 'RectifierWithDropout', 'input_dropout_ratio': 0.2, 'hidden': '117,131,129', 'rate': 0.005, 'rate_annealing': 1e-6, 'momentum_start': 0.5, 'momentum_ramp': 100000, 'momentum_stable': 0.9, 'l1': 0.00001, 'l2': 0.0000001, 'seed': 98037452452, 'loss': 'CrossEntropy', 'max_w2': 15, 'warmup_samples': 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 2.0, 'destination_key': model_key, 'validation': validation_key, } expectedErr = 0.0565 ## expected validation error for the above model on 1 thread timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' #### Look at model progress, and check the last reported validation error relTol = 0.3 if mode == 'SingleThread' else 0.15 h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) #### Now score using the model, and check the validation error kwargs = { 'source': validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False
def test_NN_twovalues(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_twovalues.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename rowDataTrue = "1, 0, 65, 1, 2, 1, 1, 4, 1, 4, 1, 4" rowDataFalse = "0, 1, 0, -1, -2, -1, -1, -4, -1, -4, -1, -4" twoValueList = [ ('A', 'B', 0, 14), ('A', 'B', 1, 14), (0, 1, 0, 12), (0, 1, 1, 12), (0, 1, 'NaN', 12), (1, 0, 'NaN', 12), (-1, 1, 0, 12), (-1, 1, 1, 12), (-1e1, 1e1, 1e1, 12), (-1e1, 1e1, -1e1, 12), ] trial = 0 for (outputTrue, outputFalse, case, coeffNum) in twoValueList: write_syn_dataset(csvPathname, 20, rowDataTrue, rowDataFalse, str(outputTrue), str(outputFalse)) start = time.time() hex_key = csvFilename + "_" + str(trial) model_key = 'trial_' + str(trial) + '.hex' parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key) print "using outputTrue: %s outputFalse: %s" % (outputTrue, outputFalse) inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 kwargs = { 'ignored_cols': None, 'response': 'C' + str(response), 'classification': 1, 'mode': 'SingleThread', 'activation': 'Tanh', #'input_dropout_ratio' : 0.2, 'hidden': '500', 'rate': 0.01, 'rate_annealing': 1e-6, 'momentum_start': 0, 'momentum_ramp': 0, 'momentum_stable': 0, 'l1': 0.0, 'l2': 1e-4, 'seed': 80023842348, 'loss': 'CrossEntropy', #'max_w2' : 15, #'warmup_samples' : 0, 'initial_weight_distribution': 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs': 1.0, 'destination_key': model_key, 'validation': hex_key, } timeoutSecs = 60 start = time.time() h2o.beta_features = True h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "trial #", trial, "NN end on ", csvFilename, ' took', time.time( ) - start, 'seconds' #### Now score using the model, and check the validation error expectedErr = 0.0 relTol = 0.01 kwargs = { 'source': hex_key, 'max_rows': 0, 'response': 'C' + str(response), 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score' + str(trial) + '.hex', 'model': model_key } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.check_sandbox_for_errors() trial += 1
def test_NN_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ ###'SingleThread', ### too slow (and slightly less accurate) 'SingleNode', ### wastes N-1 nodes, since their weight matrices are updated but never looked at... ###'MapReduce' ### TODO: enable, once implemented ] for mode in modes: #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'mode' : mode, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'warmup_samples' : 0, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 20.0, 'destination_key' : model_key, 'validation' : validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.0331 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' relTol = 0.02 if mode == 'SingleThread' else 0.10 ### 10% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) if mode != 'MapReduce': print 'WARNING: Running in non-MapReduce mode on multiple nodes! Only one node contributes to results.' h2o.beta_features = False
def test_NN2_mnist_multi(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'mnist/train.csv.gz' csvPathname_test = 'mnist/test.csv.gz' hex_key = 'mnist_train.hex' validation_key = 'mnist_test.hex' timeoutSecs = 60 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='put', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 #Making random id identifier = ''.join(random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols' : None, 'response' : response, 'classification' : 1, 'activation' : 'RectifierWithDropout', 'input_dropout_ratio' : 0.2, 'hidden' : '117,131,129', 'rate' : 0.005, 'rate_annealing' : 1e-6, 'momentum_start' : 0.5, 'momentum_ramp' : 100000, 'momentum_stable' : 0.9, 'l1' : 0.00001, 'l2' : 0.0000001, 'seed' : 98037452452, 'loss' : 'CrossEntropy', 'max_w2' : 15, 'initial_weight_distribution' : 'UniformAdaptive', #'initial_weight_scale' : 0.01, 'epochs' : 20.0, 'destination_key' : model_key, 'validation' : validation_key, } ###expectedErr = 0.0362 ## from single-threaded mode expectedErr = 0.03 ## observed actual value with Hogwild timeoutSecs = 600 start = time.time() nn = h2o_cmd.runDeepLearning(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time() - start, 'seconds' ### Now score using the model, and check the validation error kwargs = { 'source' : validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runDeepLearningScore(key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs)
def test_NN_covtype(self): #h2b.browseTheCloud() h2o.beta_features = True csvPathname_train = 'covtype/covtype.20k.data' csvPathname_test = 'covtype/covtype.20k.data' hex_key = 'covtype.hex' validation_key = hex_key timeoutSecs = 30 parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname_train, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs) ###No need - use training as validation ###parseResultV = h2i.import_parse(bucket='smalldata', path=csvPathname_test, schema='local', hex_key=validation_key, timeoutSecs=timeoutSecs) inspect = h2o_cmd.runInspect(None, hex_key) print "\n" + csvPathname_train, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) response = inspect['numCols'] - 1 modes = [ 'SingleThread', 'SingleNode', ] for mode in modes: #Making random id identifier = ''.join( random.sample(string.ascii_lowercase + string.digits, 10)) model_key = 'nn_' + identifier + '.hex' kwargs = { 'ignored_cols': None, 'response': response, 'classification': 1, 'mode': mode, 'activation': 'Tanh', #'input_dropout_ratio' : 0.1, 'hidden': '200,200', 'rate': 0.005, 'rate_annealing': 1e-5, 'momentum_start': 0.1, 'momentum_ramp': 100000, 'momentum_stable': 0.3, 'l1': 0.0000, 'l2': 0.0000, 'seed': 28372348842, 'loss': 'CrossEntropy', #'max_w2' : 10, 'warmup_samples': 0, 'initial_weight_distribution': 'Normal', 'initial_weight_scale': 1, 'epochs': 2.0, 'destination_key': model_key, 'validation': validation_key, } expectedErr = 0.35195 if mode == 'SingleThread' else 0.3 ## expected validation error for the above model timeoutSecs = 600 start = time.time() nn = h2o_cmd.runNNet(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs) print "neural net end on ", csvPathname_train, " and ", csvPathname_test, 'took', time.time( ) - start, 'seconds' relTol = 0.03 if mode == 'SingleThread' else 0.20 ### 20% relative error is acceptable for Hogwild h2o_nn.checkLastValidationError(self, nn['neuralnet_model'], inspect['numRows'], expectedErr, relTol, **kwargs) ### Now score using the model, and check the validation error kwargs = { 'source': validation_key, 'max_rows': 0, 'response': response, 'ignored_cols': None, # this is not consistent with ignored_cols_by_name 'classification': 1, 'destination_key': 'score_' + identifier + '.hex', 'model': model_key, } nnScoreResult = h2o_cmd.runNNetScore( key=parseResult['destination_key'], timeoutSecs=timeoutSecs, **kwargs) h2o_nn.checkScoreResult(self, nnScoreResult, expectedErr, relTol, **kwargs) h2o.beta_features = False