def test_GLM_hastie(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)

    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one for perf compare"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
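# h2o_util.file_gunzip and h2o_util.file_cat are used throughout these tests
# but not defined here. A minimal sketch of what they plausibly do, assuming
# file_cat simply streams two inputs into one output and file_gunzip expands a
# .gz file; this is an illustration, not the actual h2o_util implementation:
import gzip
import shutil

def file_gunzip(gzPathname, dstPathname):
    with gzip.open(gzPathname, 'rb') as src:
        with open(dstPathname, 'wb') as dst:
            shutil.copyfileobj(src, dst)

def file_cat(srcPathname1, srcPathname2, dstPathname):
    # binary-safe copy, so the same helper also works for .gz inputs
    with open(dstPathname, 'wb') as dst:
        for srcPathname in (srcPathname1, srcPathname2):
            with open(srcPathname, 'rb') as src:
                shutil.copyfileobj(src, dst)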
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=30)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, pathname2xShuf, timeoutSecs=45)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    glm_doit(self, filename4x, pathname4x, timeoutSecs=120)
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    kmeans_doit(self, csvFilename, csvPathname, num_rows=1000000, timeoutSecs=60)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    kmeans_doit(self, filename2xShuf, pathname2xShuf, num_rows=2000000, timeoutSecs=90)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    kmeans_doit(self, filename4x, pathname4x, num_rows=4000000, timeoutSecs=120)
def test_1mx10_hastie_10_2_cat_and_shuffle(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    bucket = 'datasets'
    csvPathname = 'logreg' + '/' + csvFilename
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    glm_doit(self, filename4x, None, pathname4x, timeoutSecs=120)
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=30)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, pathname2x, timeoutSecs=45)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one for perf compare"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_doit(self, filename4x, pathname4x, timeoutSecs=60)
def test_GLM_hastie_shuffle(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    bucket = 'home-0xdiag-datasets'
    csvPathname = 'standard' + '/' + csvFilename
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=30)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    glm_doit(self, filename2xShuf, None, pathname2xShuf, timeoutSecs=45)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    glm_doit(self, filename4x, None, pathname4x, timeoutSecs=120)
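# The shuffle variants above also assume an h2o_util.file_shuffle helper. A
# minimal sketch, assuming it permutes whole lines in memory (which would
# explain the "too big to shuffle?" note on the 4x file); illustrative only,
# not the actual h2o_util code:
import random

def file_shuffle(srcPathname, dstPathname):
    with open(srcPathname, 'r') as src:
        lines = src.readlines()
    random.shuffle(lines)  # in-place random permutation of row order
    with open(dstPathname, 'w') as dst:
        dst.writelines(lines)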
def test_GLM_hastie(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=75)

    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=75)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one for perf compare"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_doit(self, filename4x, None, pathname4x, timeoutSecs=150)
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=75)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, pathname2x, timeoutSecs=75)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one for perf compare"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_doit(self, filename4x, pathname4x, timeoutSecs=150)
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)

    y = "10"
    x = ""
    kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5}
    (modelKey, validations1) = glm_doit(self, csvFilename, csvPathname,
        timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
    print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_score(self, filename2x, pathname2x, modelKey, thresholds="0.5",
        timeoutSecs=60, pollTimeoutSecs=60)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_score(self, filename4x, pathname4x, modelKey, thresholds="0.5",
            timeoutSecs=60, pollTimeoutSecs=60)
def test_GLM2_score_same(self):
    h2o.beta_features = True
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename

    y = "10"
    kwargs = {'response': y, 'alpha': 0, 'family': 'gaussian'}
    (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname,
        timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
    print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
    bucket = None
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_score(self, filename2x, bucket, pathname2x, modelKey, modelPathname,
        timeoutSecs=60, pollTimeoutSecs=60)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_score(self, filename4x, bucket, pathname4x, modelKey, modelPathname,
            timeoutSecs=60, pollTimeoutSecs=60)
def test_KMeans_hastie_shuffle_fvec(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    # This test also adds file shuffling, to see that row order doesn't matter
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard/' + csvFilename
    bucket = 'home-0xdiag-datasets'
    kmeans_doit(self, csvFilename, bucket, csvPathname, numRows=1000000, timeoutSecs=60)

    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename1xShuf = "hastie_1x.data_shuf"
    pathname1xShuf = SYNDATASETS_DIR + '/' + filename1xShuf
    h2o_util.file_shuffle(pathname1x, pathname1xShuf)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1xShuf, pathname1xShuf, pathname2x)

    filename2xShuf = "hastie_2x.data_shuf"
    pathname2xShuf = SYNDATASETS_DIR + '/' + filename2xShuf
    h2o_util.file_shuffle(pathname2x, pathname2xShuf)
    kmeans_doit(self, filename2xShuf, None, pathname2xShuf, numRows=2000000, timeoutSecs=90)

    # too big to shuffle?
    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2xShuf, pathname2xShuf, pathname4x)
    kmeans_doit(self, filename4x, None, pathname4x, numRows=4000000, timeoutSecs=120)
def test_GLM2_score_same(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename

    y = "10"
    kwargs = {'response': y, 'alpha': 0, 'family': 'gaussian'}
    (modelKey, validation1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname,
        timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
    print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    modelPathname = SYNDATASETS_DIR + '/model_' + filename2x
    bucket = None
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_score(self, filename2x, bucket, pathname2x, modelKey, modelPathname,
        timeoutSecs=60, pollTimeoutSecs=60)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    modelPathname = SYNDATASETS_DIR + '/model_' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_score(self, filename4x, bucket, pathname4x, modelKey, modelPathname,
            timeoutSecs=60, pollTimeoutSecs=60)
def write_syn_dataset(csvPathname, rowCount, inCount=1, outCount=1, SEED='12345678',
        colSepChar=",", rowSepChar="\n", quoteChars="", colEnumList=None):
    r1 = random.Random(SEED)

    if CAT_ITERATE == 0:
        dsf = open(csvPathname, "w+")
    else:
        tmpFd, tmpPathname = h2o.tmp_file("cat", ".csv", tmp_dir="/tmp")
        dsf = open(tmpPathname, "w+")

    global WRITE_REPEAT
    if not (WRITE_REPEAT >= 1 and WRITE_REPEAT <= 100):
        print "Forcing WRITE_REPEAT to 1"
        WRITE_REPEAT = 1

    for row in range(rowCount):
        if ((WRITE_REPEAT * row) % 100000) == 0:
            print "Wrote", WRITE_REPEAT * row, "lines"
        # doesn't guarantee that 10000 rows have 10000 unique enums in a column
        # essentially sampling with replacement
        rowData = []
        for iCol in range(inCount):
            # FIX! we should add some random NA?
            ri = random.choice(colEnumList[iCol])
            rowData.append(ri)

        # output columns. always 0-10e6 with 2 digits of fp precision
        for oCol in range(outCount):
            ri = "%.2f" % random.uniform(0, 10e6)
            rowData.append(ri)

        # use the new Hive separator
        rowDataCsv = colSepChar.join(map(str, rowData)) + rowSepChar
        ### sys.stdout.write(rowDataCsv)
        # faster for creating big files? doesn't need to be fully random
        for i in range(WRITE_REPEAT):
            dsf.write(rowDataCsv)
    dsf.close()

    if CAT_ITERATE > 0:
        for c in range(CAT_ITERATE + 1):
            if c == CAT_ITERATE:
                print "Doubling", tmpPathname, "into", csvPathname
                h2o_util.file_cat(tmpPathname, tmpPathname, csvPathname)
            else:
                tmp2Fd, tmp2Pathname = h2o.tmp_file()
                print "Doubling", tmpPathname, "into", tmp2Pathname
                h2o_util.file_cat(tmpPathname, tmpPathname, tmp2Pathname)
                tmpPathname = tmp2Pathname
    return colEnumList
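# Sizing check for write_syn_dataset above (a worked example, not part of the
# suite): the base file holds rowCount * WRITE_REPEAT data lines, and each pass
# of the CAT_ITERATE loop doubles the file, so the final output holds
# rowCount * WRITE_REPEAT * 2**(CAT_ITERATE + 1) lines when CAT_ITERATE > 0.
def expected_line_count(rowCount, writeRepeat, catIterate):
    base = rowCount * writeRepeat
    return base if catIterate == 0 else base * 2 ** (catIterate + 1)

# e.g. 10000 rows, WRITE_REPEAT=10, CAT_ITERATE=3 -> 10000 * 10 * 16 lines
assert expected_line_count(10000, 10, 3) == 1600000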
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount,
        FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseKey['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" %
            (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect['num_rows'], totalRows))

    # hack it in! for test purposes only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey
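# make_datasetgz_and_parse replicates the dataset by concatenating .gz files
# directly. That works because gzip is a multi-member format: appending one
# gzip stream after another yields a valid file that decompresses to the
# concatenation of the members. A minimal sketch of the file_gzip/file_append
# helpers it assumes (illustrative, not the actual h2o_util code):
import gzip
import shutil

def file_gzip(srcPathname, dstPathname):
    with open(srcPathname, 'rb') as src:
        with gzip.open(dstPathname, 'wb') as dst:
            shutil.copyfileobj(src, dst)

def file_append(srcPathname, dstPathname):
    # append the raw gzip bytes as another member; no recompression needed
    with open(srcPathname, 'rb') as src:
        with open(dstPathname, 'ab') as dst:
            shutil.copyfileobj(src, dst)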
def test_exec2_sum(self):
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
        'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname

        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(
            lenNodes, exprList, hex_key, maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare,
                'compare is not equal to good (first try * resultMult)')
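# Toy illustration of the invariant test_exec2_sum asserts (not part of the
# suite): replicating every row k times scales each column sum by k, which is
# what dividing by resultMult undoes before comparing against the 1x sums.
rows = [[1.0, 2.5], [3.0, 4.5]]
k = 2
sum1x = [sum(col) for col in zip(*rows)]          # column sums of the 1x data
sum2x = [sum(col) for col in zip(*(rows * k))]    # column sums after k-fold replication
assert [s / k for s in sum2x] == sum1x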
def write_syn_dataset(csvPathname, rowCount, inCount=1, outCount=1, SEED='12345678',
        colSepChar=",", rowSepChar="\n", quoteChars="", colEnumList=None):
    r1 = random.Random(SEED)

    if CAT_ITERATE == 0:
        dsf = open(csvPathname, "w+")
    else:
        tmpFd, tmpPathname = h2o.tmp_file("cat", ".csv", tmp_dir="/tmp")
        dsf = open(tmpPathname, "w+")

    global WRITE_REPEAT
    if not (WRITE_REPEAT >= 1 and WRITE_REPEAT <= 100):
        print "Forcing WRITE_REPEAT to 1"
        WRITE_REPEAT = 1

    for row in range(rowCount):
        if ((WRITE_REPEAT * row) % 100000) == 0:
            print "Wrote", WRITE_REPEAT * row, "lines"
        # doesn't guarantee that 10000 rows have 10000 unique enums in a column
        # essentially sampling with replacement
        rowData = []
        for iCol in range(inCount):
            # FIX! we should add some random NA?
            ri = random.choice(colEnumList[iCol])
            rowData.append(ri)

        # output columns. always 0-10e6 with 2 digits of fp precision
        for oCol in range(outCount):
            ri = "%.2f" % random.uniform(0, 10e6)
            rowData.append(ri)

        # use the new Hive separator
        rowDataCsv = colSepChar.join(map(str, rowData)) + rowSepChar
        ### sys.stdout.write(rowDataCsv)
        # faster for creating big files? doesn't need to be fully random
        for i in range(WRITE_REPEAT):
            dsf.write(rowDataCsv)
    dsf.close()

    if CAT_ITERATE > 0:
        for c in range(CAT_ITERATE + 1):
            if c == CAT_ITERATE:
                print "Doubling", tmpPathname, "into", csvPathname
                h2o_util.file_cat(tmpPathname, tmpPathname, csvPathname)
            else:
                tmp2Fd, tmp2Pathname = h2o.tmp_file()
                print "Doubling", tmpPathname, "into", tmp2Pathname
                h2o_util.file_cat(tmpPathname, tmpPathname, tmp2Pathname)
                tmpPathname = tmp2Pathname
    return colEnumList
def test_exec2_sum(self):
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
        'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        iA = h2o_cmd.InspectObj(pA.parse_key)

        k = Key(hex_key)
        colResultList = []
        for i in range(pA.numCols):
            result = Expr(Fcn('sum', k[:, i], True)).result
            colResultList.append(result)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=300)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, pathname2x, timeoutSecs=300)
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = h2o.find_dataset('logreg' + '/' + csvFilename)
    glm_doit(self, csvFilename, csvPathname, timeoutSecs=300)

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(csvPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, pathname2x, timeoutSecs=300)
def test_sum(self):
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2o.find_dataset('UCI/UCI-large/covtype' + '/' + filename1x)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, key2, timeoutSecs, resultMult) in csvAll:
        parseKey = h2o_cmd.parseFile(csvPathname=csvPathname, key2=key2, timeoutSecs=2000)
        print "Parse result['Key']:", parseKey['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
        print "\n" + csvPathname

        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(
            lenNodes, exprList, key2, maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = 'home-0xdiag-datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'standard' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)

    fullPathname = h2i.find_folder_and_filename('home-0xdiag-datasets', csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
def test_A_1mx10_hastie_10_2(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    # FIX! eventually we'll compare the 1x, 2x and 4x results like we do
    # in other tests. (catdata?)
    bucket = 'datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'logreg' + '/' + csvFilename
    glm_doit(self, csvFilename, bucket, csvPathname, timeoutSecs=300)

    fullPathname = h2i.find_folder_and_filename('datasets', csvPathname, returnFullPath=True)
    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_doit(self, filename2x, None, pathname2x, timeoutSecs=300)
def test_exec2_sum(self):
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('home-0xdiag-datasets',
        'standard/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    # h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResultA = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key)
        pA = h2o_cmd.ParseObj(parseResultA)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        iA = h2o_cmd.InspectObj(pA.parse_key)

        k = Key(hex_key)
        colResultList = []
        for i in range(pA.numCols):
            result = Expr(Fcn('sum', k[:, i], True)).result
            colResultList.append(result)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_exec2_sum(self):
    h2o.beta_features = True
    print "Replicating covtype.data by 2x for results comparison to 1x"
    filename1x = 'covtype.data'
    pathname1x = h2i.find_folder_and_filename('datasets',
        'UCI/UCI-large/covtype/covtype.data', returnFullPath=True)
    filename2x = "covtype_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)

    csvAll = [
        (pathname1x, "cA", 5, 1),
        (pathname2x, "cB", 5, 2),
        (pathname2x, "cC", 5, 2),
    ]

    h2b.browseTheCloud()
    lenNodes = len(h2o.nodes)

    firstDone = False
    for (csvPathname, hex_key, timeoutSecs, resultMult) in csvAll:
        parseResult = h2i.import_parse(path=csvPathname, schema='put',
            hex_key=hex_key, timeoutSecs=2000)
        print "Parse result['Key']:", parseResult['destination_key']

        # We should be able to see the parse result?
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print "\n" + csvPathname

        h2o_exec.exec_zero_list(zeroList)
        colResultList = h2o_exec.exec_expr_list_across_cols(
            lenNodes, exprList, hex_key, maxCol=54, timeoutSecs=timeoutSecs)
        print "\ncolResultList", colResultList

        if not firstDone:
            colResultList0 = list(colResultList)
            good = [float(x) for x in colResultList0]
            firstDone = True
        else:
            print "\n", colResultList0, "\n", colResultList
            # create the expected answer...i.e. N * first
            compare = [float(x) / resultMult for x in colResultList]
            print "\n", good, "\n", compare
            self.assertEqual(good, compare,
                'compare is not equal to good (first try * resultMult)')
def test_GLM_score_same(self):
    # gunzip it and cat it to create 2x and 4x replications in SYNDATASETS_DIR
    bucket = 'datasets'
    csvFilename = "1mx10_hastie_10_2.data.gz"
    csvPathname = 'logreg' + '/' + csvFilename

    y = "10"
    x = ""
    kwargs = {'x': x, 'y': y, 'case': -1, 'thresholds': 0.5}
    (modelKey, validations1, parseResult) = glm_doit(self, csvFilename, bucket, csvPathname,
        timeoutSecs=60, pollTimeoutSecs=60, **kwargs)
    print "Use", modelKey, "model on 2x and 4x replications and compare results to 1x"

    filename1x = "hastie_1x.data"
    pathname1x = SYNDATASETS_DIR + '/' + filename1x
    fullPathname = h2i.find_folder_and_filename(bucket, csvPathname, returnFullPath=True)
    h2o_util.file_gunzip(fullPathname, pathname1x)

    filename2x = "hastie_2x.data"
    pathname2x = SYNDATASETS_DIR + '/' + filename2x
    bucket = None
    h2o_util.file_cat(pathname1x, pathname1x, pathname2x)
    glm_score(self, filename2x, bucket, pathname2x, modelKey, thresholds="0.5",
        timeoutSecs=60, pollTimeoutSecs=60)

    filename4x = "hastie_4x.data"
    pathname4x = SYNDATASETS_DIR + '/' + filename4x
    h2o_util.file_cat(pathname2x, pathname2x, pathname4x)
    print "Iterating 3 times on this last one"
    for i in range(3):
        print "\nTrial #", i, "of", filename4x
        glm_score(self, filename4x, bucket, pathname4x, modelKey, thresholds="0.5",
            timeoutSecs=60, pollTimeoutSecs=60)
def test_parse_syn_gz_cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # summary fails with 100000 cols
        (10, 5000, 'cE', 600),
        (10, 10000, 'cF', 600),
        (10, 50000, 'cF', 600),
    ]

    FILEREPL = 200
    DOSUMMARY = True
    # h2b.browseTheCloud()

    for (rowCount, colCount, key2, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        csvFilenamegz = csvFilename + ".gz"
        csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
        h2o_util.file_gzip(csvPathname, csvPathnamegz)

        csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
        csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz

        start = time.time()
        print "Replicating", csvFilenamegz, "into", csvFilenameReplgz
        h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
        # no header? should we add a header? would have to be a separate gz?
        totalRows = 2 * rowCount
        for i in range(FILEREPL - 2):
            h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
            totalRows += rowCount
        print "Replication took:", time.time() - start, "seconds"

        start = time.time()
        print "Parse start:", csvPathnameReplgz
        parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2,
            timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
        print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
        if DOSUMMARY:
            algo = "Parse and Summary:"
        else:
            algo = "Parse:"
        print algo, parseKey['destination_key'], "took", time.time() - start, "seconds"

        print "Inspecting.."
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
        print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " num_rows:", "{:,}".format(inspect['num_rows']), \
            " num_cols:", "{:,}".format(inspect['num_cols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['num_cols'], colCount,
            "parse created result with the wrong number of cols %s %s" %
            (inspect['num_cols'], colCount))
        self.assertEqual(inspect['num_rows'], totalRows,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect['num_rows'], totalRows))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount,
        FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz

    # experiment to see if the gz is causing it to fail
    if NO_GZ:
        csvPathnameReplgz = csvPathname
        totalRows = rowCount
    # hack experiment
    if NO_REPL:
        h2o_util.file_gzip(csvPathname, csvPathnameReplgz)
        totalRows = rowCount

    parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING)
    if DO_SUMMARY:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseResult['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    time.sleep(5)
    start = time.time()
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
    print "\n" + csvPathnameReplgz, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    # there is an extra response variable
    if inspect['numCols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" %
            (inspect['numCols'], colCount))
    if inspect['numRows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect['numRows'], totalRows))

    # hack it in! for test purposes only
    parseResult['numRows'] = inspect['numRows']
    parseResult['numCols'] = inspect['numCols']
    parseResult['byteSize'] = inspect['byteSize']
    return parseResult
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount,
        FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz

    # experiment to see if the gz is causing it to fail
    if NO_GZ:
        csvPathnameReplgz = csvPathname
        totalRows = rowCount
    # hack experiment
    if NO_REPL:
        h2o_util.file_gzip(csvPathname, csvPathnameReplgz)
        totalRows = rowCount

    parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING)
    if DO_SUMMARY:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseResult['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    time.sleep(5)
    start = time.time()
    inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
    print "\n" + csvPathnameReplgz, \
        " numRows:", "{:,}".format(inspect['numRows']), \
        " numCols:", "{:,}".format(inspect['numCols'])

    # there is an extra response variable
    if inspect['numCols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" %
            (inspect['numCols'], colCount))
    if inspect['numRows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect['numRows'], totalRows))

    # hack it in! for test purposes only
    parseResult['numRows'] = inspect['numRows']
    parseResult['numCols'] = inspect['numCols']
    parseResult['byteSize'] = inspect['byteSize']
    return parseResult
def test_parse_syn_gz_cat(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # summary fails with 100000 cols
        # overwrite the key each time to save space?
        (100, 40000, 'cF', 600),
        (100, 20000, 'cF', 600),
        (100, 10000, 'cF', 600),
        (100, 5000, 'cF', 600),
    ]

    FILEREPL = 200
    DOSUMMARY = True
    # h2b.browseTheCloud()

    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename

        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        csvFilenamegz = csvFilename + ".gz"
        csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
        h2o_util.file_gzip(csvPathname, csvPathnamegz)

        csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
        csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz

        start = time.time()
        print "Replicating", csvFilenamegz, "into", csvFilenameReplgz
        h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
        # no header? should we add a header? would have to be a separate gz?
        totalRows = 2 * rowCount
        for i in range(FILEREPL - 2):
            h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
            totalRows += rowCount
        print "Replication took:", time.time() - start, "seconds"

        start = time.time()
        print "Parse start:", csvPathnameReplgz
        parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
        print csvFilenameReplgz, 'parse time:', parseResult['response']['time']
        if DOSUMMARY:
            algo = "Parse and Summary:"
        else:
            algo = "Parse:"
        print algo, parseResult['destination_key'], "took", time.time() - start, "seconds"

        print "Inspecting.."
        start = time.time()
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs)
        print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds"

        num_rows = inspect['num_rows']
        num_cols = inspect['num_cols']
        value_size_bytes = inspect['value_size_bytes']
        h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz)
        print "\n" + csvPathnameReplgz, \
            "\n num_rows:", "{:,}".format(num_rows), \
            "\n num_cols:", "{:,}".format(num_cols), \
            "\n value_size_bytes:", "{:,}".format(value_size_bytes)

        # should match # of cols in header or ??
        self.assertEqual(inspect['num_cols'], colCount,
            "parse created result with the wrong number of cols %s %s" %
            (inspect['num_cols'], colCount))
        self.assertEqual(inspect['num_rows'], totalRows,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect['num_rows'], totalRows))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount,
        FILEREPL, SEEDPERFILE, timeoutSecs):
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    print "Creating random", csvPathname
    write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

    csvFilenamegz = csvFilename + ".gz"
    csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
    h2o_util.file_gzip(csvPathname, csvPathnamegz)

    csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz"
    csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz
    print "Replicating", csvFilenamegz, "into", csvFilenameReplgz

    start = time.time()
    h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz)
    # no header? should we add a header? would have to be a separate gz?
    totalRows = 2 * rowCount
    for i in range(FILEREPL - 2):
        h2o_util.file_append(csvPathnamegz, csvPathnameReplgz)
        totalRows += rowCount
    print "Replication took:", time.time() - start, "seconds"

    start = time.time()
    print "Parse start:", csvPathnameReplgz
    doSummary = False
    parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2,
        timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary)
    print csvFilenameReplgz, 'parse time:', parseKey['response']['time']
    if doSummary:
        algo = "Parse and Summary:"
    else:
        algo = "Parse:"
    print algo, parseKey['destination_key'], "took", time.time() - start, "seconds"

    print "Inspecting.."
    start = time.time()
    inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs)
    print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds"
    h2o_cmd.infoFromInspect(inspect, csvPathname)
    print "\n" + csvPathname, \
        " num_rows:", "{:,}".format(inspect['num_rows']), \
        " num_cols:", "{:,}".format(inspect['num_cols'])

    # there is an extra response variable
    if inspect['num_cols'] != (colCount + 1):
        raise Exception("parse created result with the wrong number of cols %s %s" %
            (inspect['num_cols'], colCount))
    if inspect['num_rows'] != totalRows:
        raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" %
            (inspect['num_rows'], totalRows))

    # hack it in! for test purposes only
    parseKey['python_source_key'] = csvFilenameReplgz
    parseKey['num_rows'] = inspect['num_rows']
    parseKey['num_cols'] = inspect['num_cols']
    parseKey['value_size_bytes'] = inspect['value_size_bytes']
    return parseKey