def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz doSummary = False parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary) print csvFilenameReplgz, 'parse time:', parseKey['response']['time'] if doSummary: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseKey['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # there is an extra response variable if inspect['num_cols'] != (colCount + 1): raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) if inspect['num_rows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount)) # hack it in! for test purposees only parseKey['python_source_key'] = csvFilenameReplgz parseKey['num_rows'] = inspect['num_rows'] parseKey['num_cols'] = inspect['num_cols'] parseKey['value_size_bytes'] = inspect['value_size_bytes'] return parseKey
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz # experiment to see if the gz is causing it to fail if NO_GZ: csvPathnameReplgz = csvPathname totalRows = rowCount # hack experiment if NO_REPL: h2o_util.file_gzip(csvPathname, csvPathnameReplgz) totalRows = rowCount parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING) if DO_SUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseResult['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." time.sleep(5) start = time.time() inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz) print "\n" + csvPathnameReplgz, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # there is an extra response variable if inspect['numCols'] != (colCount + 1): raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) if inspect['numRows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalRows)) # hack it in! for test purposees only parseResult['numRows'] = inspect['numRows'] parseResult['numCols'] = inspect['numCols'] parseResult['byteSize'] = inspect['byteSize'] return parseResult
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols (10, 5000, 'cE', 600), (10, 10000, 'cF', 600), (10, 50000, 'cF', 600), ] FILEREPL = 200 DOSUMMARY = True # h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz start = time.time() print "Replicating", csvFilenamegz, "into", csvFilenameReplgz h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL - 2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) print csvFilenameReplgz, 'parse time:', parseKey['response'][ 'time'] if DOSUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseKey['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? self.assertEqual( inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], totalRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols # overwrite the key each time to save space? (100, 40000, 'cF', 600), (100, 20000, 'cF', 600), (100, 10000, 'cF', 600), (100, 5000, 'cF', 600), ] FILEREPL = 200 DOSUMMARY = True # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz start = time.time() print "Replicating", csvFilenamegz, "into", csvFilenameReplgz h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) print csvFilenameReplgz, 'parse time:', parseResult['response']['time'] if DOSUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseResult['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] value_size_bytes = inspect['value_size_bytes'] h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz) print "\n" + csvPathnameReplgz, \ "\n num_rows:", "{:,}".format(num_rows), \ "\n num_cols:", "{:,}".format(num_cols), \ "\n value_size_bytes:", "{:,}".format(value_size_bytes) # should match # of cols in header or ?? self.assertEqual(inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], totalRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL - 2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz # experiment to see if the gz is causing it to fail if NO_GZ: csvPathnameReplgz = csvPathname totalRows = rowCount # hack experiment if NO_REPL: h2o_util.file_gzip(csvPathname, csvPathnameReplgz) totalRows = rowCount parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING) if DO_SUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseResult['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." time.sleep(5) start = time.time() inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz) print "\n" + csvPathnameReplgz, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # there is an extra response variable if inspect['numCols'] != (colCount + 1): raise Exception( "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) if inspect['numRows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalRows)) # hack it in! for test purposees only parseResult['numRows'] = inspect['numRows'] parseResult['numCols'] = inspect['numCols'] parseResult['byteSize'] = inspect['byteSize'] return parseResult
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL - 2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz doSummary = False parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary) print csvFilenameReplgz, 'parse time:', parseKey['response']['time'] if doSummary: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseKey['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # there is an extra response variable if inspect['num_cols'] != (colCount + 1): raise Exception( "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) if inspect['num_rows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount)) # hack it in! for test purposees only parseKey['python_source_key'] = csvFilenameReplgz parseKey['num_rows'] = inspect['num_rows'] parseKey['num_cols'] = inspect['num_cols'] parseKey['value_size_bytes'] = inspect['value_size_bytes'] return parseKey