def test_exec2_runif(self): print "in h2o-dev, params are column, min, max, seed" bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' hexKey = 'r.hex' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) # work up to the failing case incrementally execExprList = [ # hack to make them keys? (not really needed but interesting) # params for h2o-dev runif are: column, min, max, seed AssignObj('r0.hex', KeyIndexed('r.hex', col=0)), AssignObj('s0.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=0), 1)), AssignObj('s1.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=1), -1)), AssignObj('s2.hex', Fcn("h2o.runif", KeyIndexed('r.hex', col=54), -1)), ] results = [] for execExpr in execExprList: start = time.time() result = execExpr.do(timeoutSecs=30) results.append(result) execResult = execExpr.execResult print "exec took", time.time() - start, "seconds" print "exec result:", result print "exec result (full):", h2o.dump_json(execResult) h2o.check_sandbox_for_errors() rSummary = h2o_cmd.runSummary(key='r0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) rSummary = h2o_cmd.runSummary(key='s0.hex', cols='0') # h2o_cmd.infoFromSummary(rSummary) sSummary = h2o_cmd.runSummary(key='s1.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) sSummary = h2o_cmd.runSummary(key='s2.hex', cols='0') # h2o_cmd.infoFromSummary(sSummary) # since there are no NAs in covtype, r.hex and s.hex should be identical? if 1 == 0: print "Comparing summary of r.hex to summary of s.hex" df = h2o_util.JsonDiff(rSummary, sSummary, with_values=True) # time can be different print "df.difference:", h2o.dump_json(df.difference) self.assertLess(len(df.difference), 2) print "results from the individual exec expresssions (ignore last which was an apply)" print "results:", results self.assertEqual(results, [ 0.0, 0.0, 0.0, 1859.0, 581012.0, 581012.0, 2959.365300544567, 1859.0, 1859.0 ])
def test_rapids_mean(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 5, 'cA', 200), ] for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) data_key = hex_key data_key2 = hex_key + "_2" for trial in range(4): result_key = data_key + "_" + str(trial) # copy the key Assign(data_key2, data_key) Assign(result_key, Fcn('mean', KeyIndexed(data_key2, col=0), 0, False)) trial += 1
def test_rapids_funs_basic3(self): DO_FAIL = False if DO_FAIL: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' else: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] # works for 1 pass..why is execExpr set for 2nd pass? should be new instance? # if we reuse the same object in the list, it has state? # do we need to copy the object...hmm for trial in range(1): for execObj in funsList: freshObj = copy(execObj) result = freshObj.do() # rapids doesn't like complicated params right now? if DO_FAIL: a = Assign('junk', Fcn('anon', KeyIndexed('r1', col=0)), do=False) else: a = Assign('junk', Fcn('anon', 'r1'), do=False) result = a.do(timeoutSecs=60) # rows might be zero! if a.execResult['num_rows'] or a.execResult['num_cols']: keys.append(a.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
def test_rapids_funs_1000_stmnt(self): DO_FAIL = False if DO_FAIL: bucket = 'home-0xdiag-datasets' csvPathname = 'standard/covtype.data' else: bucket = 'smalldata' csvPathname = 'iris/iris_wheader.csv' hexKey = 'r1' parseResult = h2i.import_parse(bucket=bucket, path=csvPathname, schema='put', hex_key=hexKey) keys = [] for trial in range(3): for execObj in funsList: freshObj = copy(execObj) print "ast length:", len(str(freshObj)) result = freshObj.do() # rapids doesn't like complicated params right now? if DO_FAIL: a = Assign('junk', Fcn('anon', KeyIndexed('r1',col=0))) else: a = Assign('junk', Fcn('anon', 'r1')) result = a.do(timeoutSecs=60) # rows might be zero! if a.execResult['num_rows'] or a.execResult['num_cols']: keys.append(a.execExpr) print "\nExpressions that created keys" for k in keys: print k # for execExpr in exprList: # h2e.exec_expr(execExpr=execExpr, resultKey=None, timeoutSecs=10) h2o.check_sandbox_for_errors()
Assign('q', Fcn('var', 'x', None, False, None), do=False), Assign('r', Fcn('var', 'x', None, False, None), do=False), Assign('s', Fcn('var', 'x', None, False, None), do=False), Assign('t', Fcn('var', 'x', None, False, None), do=False), Assign('u', Fcn('var', 'x', None, False, None), do=False), Assign('v', Fcn('var', 'x', None, False, None), do=False), Assign('w', Fcn('var', 'x', None, False, None), do=False), Assign('x', Fcn('var', 'x', None, False, None), do=False), Assign('y', Fcn('var', 'x', None, False, None), do=False), Assign('z', Fcn('var', 'x', None, False, None), do=False), Fcn('var', 'x', None, False, None), ), Def('anon', 'x', [Assign(key, Fcn('var', 'x', None, False, None), do=False) for key in 'abdefghijklmnopqrstuvz'], [Assign(key, Fcn('sum', KeyIndexed('x',col=0), False), do=False) for key in 'abdefghijklmnopqrstuvz'], Fcn('var', 'x', None, False, None), ), ] class Basic(unittest.TestCase): def tearDown(self): h2o.check_sandbox_for_errors() @classmethod def setUpClass(cls): global SEED SEED = h2o.setup_random_seed() h2o.init(1, base_port=54333) @classmethod
from h2o_xl import Def, Fcn, Assign, KeyIndexed from copy import copy print "Trying a different way, listing Rapids objects, rather than .ast() strings" # 'c' allowed # should be able to take a list of statements keyString = 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz' keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz' keyString += 'abdefghijklmnopqrstuvzabdefghijklmnopqrstuvz' funsList = [ Def('anon', 'x', [Assign(key, Fcn('var', 'x', None, False, None), do=False) for key in keyString], [Assign(key, Fcn('sum', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('max', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('min', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('xorsum', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('sd', KeyIndexed('x',col=0), False), do=False) for key in keyString], [Assign(key, Fcn('ncol', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('is.factor', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('any.factor', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('length', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('sin', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('asin', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('sinh', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('cos', KeyIndexed('x',col=0)), do=False) for key in keyString], [Assign(key, Fcn('acos', KeyIndexed('x',col=0)), do=False) for key in keyString],
def test_rapids_row_range(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (1000000, 5, 'cA', 200), (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) # Xbase.debugOnly = True REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) # Assign('s1', Seq(range(5)) ).do Assign('s1', Seq(range(5))) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created # first try as object, then method Assign('s2', Fcn('c', Seq(range(5)))) print dump_json(Xbase.lastExecResult) print dump_json(Xbase.lastResult) # just combine Assign('s3', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='s3') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 5 assert numCols == 1 Assign('s2', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 5 assert numCols == 1 # can't have sequence of sequences? # make sure key is created with c() f = Fcn( 'c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52))) Assign('s1', f) f = Col( Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52))) Assign('s2', f) inspect = h2o_cmd.runInspect(key='s2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) assert numRows == 313 assert numCols == 1 print "z1" Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5)))) print "z2" Assign( 's1', KeyIndexed(data_key, row=Seq(Colon(99, 400), "#2", 1, range(1, 5)))) print "z3" Assign(result_key, KeyIndexed(data_key, row='#1')).do print "z4" Assign(result_key, KeyIndexed(data_key, row=Colon('#1', '#100'))) print "z5" Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100))) # this should fail rapids because of reverse msb/lsb # illegal, detected # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1'))) print "z6" Assign(result_key, KeyIndexed(data_key, row=Colon('#-2', '#-1'))) print "z7" Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1))) # illegal, detected # execResult, Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2'))) # take advantage of number to string conversion print "z8" Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10))) print "z9" Assign(result_key, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, ))) # no assign print "z10" result = KeyIndexed(data_key, row=Colon('#1', rowCount - 10)).do() print "z11" # result = KeyIndexed(data_key, col=Colon('#1', colCount-1,)).do() # do some function translation print "z12" # result = Fcn('==', 1, KeyIndexed(data_key, col=Colon('#1', colCount-1,))).do() print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
def test_rapids_cut(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual( numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual( numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) REPEAT = 1 data_key = hex_key for i in range(REPEAT): result_key = data_key + "_" + str(i) Assign('seq1', Seq(range(5))) # take advantage of default params for row/col (None) # need the 'c' function, to make sure the key is created Assign('seq2', Fcn('c', Seq(range(5)))) inspect = h2o_cmd.runInspect(key='seq1') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) Assign('seq3', Col(Seq(range(5)))) inspect = h2o_cmd.runInspect(key='seq2') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) # can't have sequence of sequences? # make sure key is created with c() Assign( 'seq4', Fcn( 'c', Seq(Colon(99, 400), "#2", 1, range(1, 5), range(7, 10), range(50, 52)))) inspect = h2o_cmd.runInspect(key='seq1') missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect( inspect) Assign(result_key, KeyIndexed(data_key, row=Seq(range(1, 5)))) Assign( 'seq5', KeyIndexed(data_key, row=Seq(Colon(99, 400), "#2", 1, range(1, 5)))) # they need to be same size # Assign('seq6', Key('seq5') + Key('seq4') + Key('seq3')) # doesn't like my cut? complains on FALSE # Assign(result_key, Cut(KeyIndexed(data_key, col=0))) # Assign(result_key, Cut(KeyIndexed(data_key, col=1), breaks=3)) Assign(result_key, Fcn('min', KeyIndexed(data_key, col=1), True)) Assign(result_key, Fcn('max', KeyIndexed(data_key, col=1), True)) Assign(result_key, Fcn('mean', KeyIndexed(data_key, col=1), 0, False)) Assign(result_key, KeyIndexed(data_key, row='#1')) Assign(result_key, KeyIndexed(data_key, row=Colon('#1', '#100'))) Assign(result_key, KeyIndexed(data_key, row=Colon(1, 100))) # this should fail rapids because of reverse msb/lsb # illegal, detected # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#100', '#1'))) Assign(result_key, KeyIndexed(data_key, row=Colon('#-2', '#-1'))) Assign(result_key, KeyIndexed(data_key, row=Colon(-2, -1))) # illegal, detected # resultExpr, result = Assign(result_key, KeyIndexed(data_key, row=Colon('#-1', '#-2'))) # take advantage of number to string conversion Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10))) Assign(result_key, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, ))) # no assign. Expr() complains when result has no key? Assign(result_key, KeyIndexed(data_key, row=Colon('#1', rowCount - 10))) Assign(result_key, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, ))) # do some function translation Assign( result_key, Fcn('==', 1, KeyIndexed(data_key, col=Colon( '#1', colCount - 1, )))) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols)
def test_rapids_funs_1op(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # (1000000, 5, 'cA', 200), (1000, 5, 'cA', 200), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "\nCreating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=False) inspect = h2o_cmd.runInspect(key=hex_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) # should match # of cols in header or ?? self.assertEqual(numCols, colCount, "parse created result with the wrong number of cols %s %s" % (numCols, colCount)) self.assertEqual(numRows, rowCount, "parse created result with the wrong number of rows %s %s" % (numRows, rowCount)) # Xbase.debugOnly = True REPEAT = 1 data_key = hex_key data_key2 = hex_key + "_2" trial = 0 good = [] bad = [] both = h2o_xl.xFcnOp1Set.union(h2o_xl.xFcnOp3Set) both = h2o_xl.xFcnOp1Set for fun in both: a = None try: result_key = data_key + "_" + str(trial) # copy the key Assign(data_key2, data_key) # a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True)) # a = Assign(result_key, Fcn('sum', KeyIndexed(data_key2, col=0), True)) # a = Assign(result_key, Fcn('xorsum', KeyIndexed(data_key2, col=0), True)) # a = Assign(result_key, Fcn('sqrt', KeyIndexed(data_key2, col=0))) # a = Assign(result_key, Fcn('ncol', KeyIndexed(data_key2, col=0))) # what's wrong with mean? if fun in ['ncol', 'asin', 'any.factor', 'sin', 'atan', 'tan', 'sign', 'log', 'exp', 'sqrt', 'abs', 'floor', 'ceiling', 'trunc','is.factor', 'is.na', 'any.na', 'nrow', 'tanh', 'length', 'acos', 'cos', 'sinh', 'cosh']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0))) good.append(fun) elif fun in ['sum', 'max', 'min', 'xorsum', 'sd']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), True)) good.append(fun) elif fun in ['scale']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False)) good.append(fun) elif fun in ['round', 'signif']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1)) good.append(fun) elif fun in ['seq_len', 'rep_len']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 4)) good.append(fun) elif fun in ['seq']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 1, 5, 1)) good.append(fun) elif fun in ['mean']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), 0, False)) good.append(fun) elif fun in ['var']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, False, False)) good.append(fun) elif fun in ['match']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), KeyIndexed(data_key2, col=0), 1, None)) good.append(fun) elif fun in ['unique']: a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), False, 10, 1)) good.append(fun) else: # bad functions kill h2o? a = Assign(result_key, Fcn(fun, KeyIndexed(data_key2, col=0), None)) bad.append(fun) # a = Fcn(fun, KeyIndexed(data_key, col=0), '%FALSE ') # a = Fcn(fun, data_key, '%FALSE') # a = Fcn(fun, data_key) # scalars? if 1==0: inspect = h2o_cmd.runInspect(key=result_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect) assert numRows==1000, numRows assert numCols==1, numCols print "\n" + csvPathname, \ " numRows:", "{:,}".format(numRows), \ " numCols:", "{:,}".format(numCols) except: if not a: # print dump_json(a.execResult) bad.append(fun) trial += 1 print "good:", good print "bad:", bad
Assign('v', Fcn('var', 'x', None, False, None), do=False), Assign('w', Fcn('var', 'x', None, False, None), do=False), Assign('x', Fcn('var', 'x', None, False, None), do=False), Assign('y', Fcn('var', 'x', None, False, None), do=False), Assign('z', Fcn('var', 'x', None, False, None), do=False), Fcn('var', 'x', None, False, None), ), Def( 'anon', 'x', [ Assign(key, Fcn('var', 'x', None, False, None), do=False) for key in 'abdefghijklmnopqrstuvz' ], [ Assign(key, Fcn('sum', KeyIndexed('x', col=0), False), do=False) for key in 'abdefghijklmnopqrstuvz' ], Fcn('var', 'x', None, False, None), ), ] class Basic(unittest.TestCase): def tearDown(self): h2o.check_sandbox_for_errors() @classmethod def setUpClass(cls): global SEED SEED = h2o.setup_random_seed()