def test_parse_syn_gz_cat(self):
    """Generate random CSVs of increasing width, gzip each, parse, and verify shape.

    For each (rowCount, colCount, hex_key, timeoutSecs) case: write a synthetic
    CSV, gzip it, parse the .gz via schema='put', inspect the result, and assert
    no missing values plus the expected row/col counts.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # summary fails with 100000 cols
        # overwrite the key each time to save space?
        (100, 100, 'cF', 600),
        (100, 5000, 'cF', 600),
        (100, 10000, 'cF', 600),
        # (100, 12000, 'cF', 600),
        # (100, 15000, 'cF', 600),
        # (100, 17000, 'cF', 600),
        (100, 20000, 'cF', 600),
        (100, 40000, 'cF', 600),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # fresh seed per file so each generated dataset (and its filename) is unique
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # gzip the plain CSV; the .gz is what gets parsed
        csvFilenamegz = csvFilename + ".gz"
        csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
        h2o_util.file_gzip(csvPathname, csvPathnamegz)

        start = time.time()
        print "Parse start:", csvPathnamegz
        parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
        numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult)
        if DOSUMMARY:
            algo = "Parse and Summary:"
        else:
            algo = "Parse:"
        print algo, parse_key, "took", time.time() - start, "seconds"

        print "Inspecting.."
        start = time.time()
        inspect = h2o_cmd.runInspect(key=parse_key, timeoutSecs=timeoutSecs)
        print "Inspect:", parse_key, "took", time.time() - start, "seconds"
        # re-read counts from the inspect result (overwrites the parse-reported counts)
        missingValuesList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspect)
        print "\n" + csvPathnamegz, \
            "\n numRows:", "{:,}".format(numRows), \
            "\n numCols:", "{:,}".format(numCols)

        self.assertEqual(len(missingValuesList), 0,
            "Don't expect any missing values. These cols had some: %s" % missingValuesList)
        # should match # of cols in header or ??
        self.assertEqual(numCols, colCount,
            "parse created result with the wrong number of cols %s %s" % (numCols, colCount))
        self.assertEqual(numRows, rowCount,
            "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \
            (numRows, rowCount))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz doSummary = False parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary) print csvFilenameReplgz, 'parse time:', parseKey['response']['time'] if doSummary: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseKey['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." 
start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # there is an extra response variable if inspect['num_cols'] != (colCount + 1): raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) if inspect['num_rows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount)) # hack it in! for test purposees only parseKey['python_source_key'] = csvFilenameReplgz parseKey['num_rows'] = inspect['num_rows'] parseKey['num_cols'] = inspect['num_cols'] parseKey['value_size_bytes'] = inspect['value_size_bytes'] return parseKey
def test_parse_syn_gz_cat(self):
    """Generate random CSVs, gzip them, parse, and verify via ParseObj/InspectObj.

    Same flow as the assert-based variant, but the shape checks are delegated to
    the ParseObj/InspectObj wrappers' expectedNumRows/expectedNumCols arguments.
    """
    SYNDATASETS_DIR = h2o.make_syn_dir()
    tryList = [
        # summary fails with 100000 cols
        # overwrite the key each time to save space?
        (100, 100, 'cF', 600),
        (100, 5000, 'cF', 600),
        (100, 10000, 'cF', 600),
        # (100, 12000, 'cF', 600),
        # (100, 15000, 'cF', 600),
        # (100, 17000, 'cF', 600),
        (100, 20000, 'cF', 600),
        (100, 40000, 'cF', 600),
    ]
    # h2b.browseTheCloud()
    for (rowCount, colCount, hex_key, timeoutSecs) in tryList:
        # fresh seed per file so each generated dataset (and its filename) is unique
        SEEDPERFILE = random.randint(0, sys.maxint)
        csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv'
        csvPathname = SYNDATASETS_DIR + '/' + csvFilename
        print "Creating random", csvPathname
        write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE)

        # gzip the plain CSV; the .gz is what gets parsed
        csvFilenamegz = csvFilename + ".gz"
        csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz
        h2o_util.file_gzip(csvPathname, csvPathnamegz)

        parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key,
            timeoutSecs=timeoutSecs, doSummary=DOSUMMARY)
        # ParseObj asserts the expected shape internally
        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        # this guy can take json object as first thing, or re-read with key
        iA = h2o_cmd.InspectObj(pA.parse_key,
            expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols # overwrite the key each time to save space? (100, 100, 'cF', 600), (100, 5000, 'cF', 600), (100, 10000, 'cF', 600), # (100, 12000, 'cF', 600), # (100, 15000, 'cF', 600), # (100, 17000, 'cF', 600), (100, 20000, 'cF', 600), (100, 40000, 'cF', 600), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=rowCount, expectedNumCols=colCount) print pA.numRows print pA.numCols print pA.parse_key # this guy can take json object as first thing, or re-read with key iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=rowCount, expectedNumCols=colCount, expectedMissinglist=[])
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz # experiment to see if the gz is causing it to fail if NO_GZ: csvPathnameReplgz = csvPathname totalRows = rowCount # hack experiment if NO_REPL: h2o_util.file_gzip(csvPathname, csvPathnameReplgz) totalRows = rowCount parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING) if DO_SUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseResult['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." 
time.sleep(5) start = time.time() inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz) print "\n" + csvPathnameReplgz, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # there is an extra response variable if inspect['numCols'] != (colCount + 1): raise Exception("parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) if inspect['numRows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalRows)) # hack it in! for test purposees only parseResult['numRows'] = inspect['numRows'] parseResult['numCols'] = inspect['numCols'] parseResult['byteSize'] = inspect['byteSize'] return parseResult
def test_parse_multi_header_rand_fvec(self):
    """Fuzz h2o's header handling: random separators, gzip, and header_from_file combos.

    Generates a pool of unique random column names, then for 20 trials creates
    several data files plus a separate header file (each optionally gzipped,
    with randomly chosen separators and header placement), parses them as one
    pattern, and checks the resulting column count. The row-count check is
    currently disabled (if 1==0) pending FIX.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    allowedLetters = 'abcdeABCDE01234[]'
    headerChoices = []
    for n in range(500): # max # of cols below is 500
        done = False
        while not done:
            l = random.randint(1,64) # random length headers
            headerName = ''.join([random.choice(allowedLetters) for _ in range(l)])
            # we keep trying if we already have that header name. Has to be unique.
            done = headerName not in headerChoices
        headerChoices.append(headerName)

    tryList = [
        (3, 5, 9, 'cA', 60, 0),
        # (3, 5, 25, 'cA', 60, 0),
        # (10, 100, 500, 'cA', 60, 0),
    ]

    for trial in range(20):
        (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList)
        print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1 # 1 extra for output
        totalDataRows = 0
        totalHeaderRows = 0

        # random selection of parse param choices
        # HEADER_HAS_HDR_ROW = random.randint(0,1)
        HEADER_HAS_HDR_ROW = 1
        DATA_HAS_HDR_ROW = random.randint(0,1)
        PARSE_PATTERN_INCLUDES_HEADER = random.randint(0,1)
        # DATA_FIRST_IS_COMMENT = random.randint(0,1)
        # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
        # FIX! doesn't seem to like just comment in the header file
        DATA_FIRST_IS_COMMENT = 0
        HEADER_FIRST_IS_COMMENT = 0
        GZIP_DATA = random.randint(0,1)
        GZIP_HEADER = random.randint(0,1)
        SEP_CHAR_GEN = random.choice(paramsDict['separator'])
        HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
        if HEADER_SEP_CHAR_GEN == 'same':
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # don't put a header in a data file with a different separator?
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # Hack: if both data and header files have a header, then, just in case
        # the header and data files should have the same separator
        # if they don't, make header match data
        # (NOTE(review): this duplicates the assignment just above)
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # New for fvec? if separators are not the same, then the header separator needs to be comma
        if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
            HEADER_SEP_CHAR_GEN = ','

        # screw it. make them always match
        HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN
        if HEADER_SEP_CHAR_GEN in (',', ' '):
            pass
            # extra spaces? Don't add any
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

        # randomize every parse param from its choice list
        kwargs = {}
        for k,v in paramsDict.items():
            kwargs[k] = random.choice(v)

        kwargs['separator'] = SEP_CHAR_GEN
        # parse doesn't auto-detect tab. will autodetect space and comma
        if SEP_CHAR_GEN==" " or SEP_CHAR_GEN==",":
            del kwargs['separator']
        else:
            kwargs['separator'] = ord(SEP_CHAR_GEN)

        # randomly add leading and trailing white space
        # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
        if SEP_CHAR_GEN in (',', ' '):
            if random.randint(0,1):
                SEP_CHAR_GEN = " " + SEP_CHAR_GEN
            if random.randint(0,1):
                SEP_CHAR_GEN = SEP_CHAR_GEN + " "

        print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
        print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
        print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
        print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
        print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
        print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
        print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
        print 'GZIP_DATA:', GZIP_DATA
        print 'GZIP_HEADER:', GZIP_HEADER

        # they need to both use the same separator (h2o rule)
        # can't have duplicates
        hfhList = random.sample(headerChoices, colCount) + ["output"]
        # UPDATE: always use comma or space for header separator?? it should work no matter what
        # separator the data uses?
        headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
        print "headerForHeader:", headerForHeader

        # make these different
        # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
        # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
        hfdList = hfhList
        headerForData = SEP_CHAR_GEN.join(hfdList)

        # create data files
        for fileN in range(fileNum):
            csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
            csvFilename = 'syn_data_' + csvFilenameSuffix
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
            (headerRowsDone, dataRowsDone) = write_syn_dataset(csvPathname, rowCount,
                headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList,
                commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_DATA:
                csvPathnamegz = csvPathname + ".gz"
                print "gzipping to", csvPathnamegz
                h2o_util.file_gzip(csvPathname, csvPathnamegz)
                os.rename(csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                # pattern match should find the right key with csvPathname

        # create the header file
        hdrFilenameSuffix = str(SEED) + "_" + str(trial) + "_" + rowxcol + '_csv'
        hdrFilename = 'syn_header_' + hdrFilenameSuffix
        hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
        # dataRowsWithHeader = 0 # temp hack
        (headerRowsDone, dataRowsDone) = write_syn_dataset(hdrPathname, dataRowsWithHeader,
            headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList,
            commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN)
        # only include header file data rows if the parse pattern includes it
        if PARSE_PATTERN_INCLUDES_HEADER:
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
        if GZIP_HEADER:
            hdrPathnamegz = hdrPathname + ".gz"
            print "gzipping to", hdrPathnamegz
            h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
            os.rename(hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
            # pattern match should find the right key with hdrPathnameh

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = "syn_dst" + str(trial) + ".hex"

        # DON"T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files.
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
        h2o_cmd.runStoreView()
        headerKey = h2i.find_key(hdrFilename)
        dataKey = h2i.find_key(csvFilename)
        # use regex. the only files in the dir will be the ones we just created
        # with *fileN* match
        print "Header Key =", headerKey

        # put the right name in
        if kwargs['header_from_file'] == 'header':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = headerKey
        # use one of the data files?
        elif kwargs['header_from_file'] == 'data':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = dataKey

        # if there's no header in the header file, turn off the header_from_file
        if not HEADER_HAS_HDR_ROW:
            kwargs['header_from_file'] = None

        # pick the column name RF should later ignore, based on which header wins
        if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
            ignoreForRf = hfhList[0]
        elif DATA_HAS_HDR_ROW:
            ignoreForRf = hfdList[0]
        else:
            ignoreForRf = None

        print "If header_from_file= , required to force header=1 for h2o"
        if kwargs['header_from_file']:
            kwargs['header'] = 1
        # if we have a header in a data file, tell h2o (for now)
        elif DATA_HAS_HDR_ROW:
            kwargs['header'] = 1
        else:
            kwargs['header'] = 0

        # may have error if h2o doesn't get anything!
        start = time.time()
        if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
            pattern = 'syn_*'+str(trial)+"_"+rowxcol+'*'
        else:
            pattern = 'syn_data_*'+str(trial)+"_"+rowxcol+'*'

        # don't pass to parse
        kwargs.pop('hdr_separator', None)
        parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs)
        print "parseResult['destination_key']: " + parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # more reporting: (we can error here if extra col in header,
        # causes all NA for missing col of data)
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False)

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], totalCols, \
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

        # do we end up parsing one data rows as a header because of mismatch in gen/param
        h2oLosesOneData = (headerRowsDone==0) and (kwargs['header']==1) and not DATA_HAS_HDR_ROW
        # header in data file gets treated as data
        h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
            DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
        # NOTE(review): the line below forces the gains case off, making the expression above dead
        h2oGainsOneData = False
        print "h2oLosesOneData:", h2oLosesOneData
        print "h2oGainsOneData:", h2oGainsOneData
        if h2oLosesOneData:
            totalDataRows -= 1
        if h2oGainsOneData:
            totalDataRows += 1

        if 1==0: # FIX! don't check for now
            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                (inspect['numRows'], totalDataRows))

        # put in an ignore param, that will fail unless headers were parsed correctly
        # doesn't matter if the header got a comment, should see it
        kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
        start = time.time()
        # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed/timeoutSecs) * 100)
        print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        h2o.check_sandbox_for_errors()
        # NOTE(review): cleanup placement inside the trial loop reconstructed from
        # mangled formatting — confirm against the original file
        h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols (10, 5000, 'cE', 600), (10, 10000, 'cF', 600), (10, 50000, 'cF', 600), ] FILEREPL = 200 DOSUMMARY = True # h2b.browseTheCloud() for (rowCount, colCount, key2, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz start = time.time() print "Replicating", csvFilenamegz, "into", csvFilenameReplgz h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL - 2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) print csvFilenameReplgz, 'parse time:', parseKey['response'][ 'time'] if DOSUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseKey['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." 
start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # should match # of cols in header or ?? self.assertEqual( inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], totalRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def test_parse_multi_header_rand_fvec(self):
    """Fuzz h2o's header handling: random separators, gzip, and header_from_file combos.

    Generates a pool of unique random column names, then for 20 trials creates
    several data files plus a separate header file (each optionally gzipped,
    with randomly chosen separators and header placement), parses them as one
    pattern, and checks the resulting column count. The row-count check is
    currently disabled (if 1 == 0) pending FIX.
    """
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    allowedLetters = 'abcdeABCDE01234[]'
    headerChoices = []
    for n in range(500):  # max # of cols below is 500
        done = False
        while not done:
            l = random.randint(1, 64)  # random length headers
            headerName = ''.join(
                [random.choice(allowedLetters) for _ in range(l)])
            # we keep trying if we already have that header name. Has to be unique.
            done = headerName not in headerChoices
        headerChoices.append(headerName)

    tryList = [
        (3, 5, 9, 'cA', 60, 0),
        # (3, 5, 25, 'cA', 60, 0),
        # (10, 100, 500, 'cA', 60, 0),
    ]

    for trial in range(20):
        (fileNum, rowCount, colCount, hex_key, timeoutSecs,
         dataRowsWithHeader) = random.choice(tryList)
        print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        totalHeaderRows = 0

        # random selection of parse param choices
        # HEADER_HAS_HDR_ROW = random.randint(0,1)
        HEADER_HAS_HDR_ROW = 1
        DATA_HAS_HDR_ROW = random.randint(0, 1)
        PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1)
        # DATA_FIRST_IS_COMMENT = random.randint(0,1)
        # HEADER_FIRST_IS_COMMENT = random.randint(0,1)
        # FIX! doesn't seem to like just comment in the header file
        DATA_FIRST_IS_COMMENT = 0
        HEADER_FIRST_IS_COMMENT = 0
        GZIP_DATA = random.randint(0, 1)
        GZIP_HEADER = random.randint(0, 1)
        SEP_CHAR_GEN = random.choice(paramsDict['separator'])
        HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator'])
        if HEADER_SEP_CHAR_GEN == 'same':
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # don't put a header in a data file with a different separator?
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # Hack: if both data and header files have a header, then, just in case
        # the header and data files should have the same separator
        # if they don't, make header match data
        # (NOTE(review): this duplicates the assignment just above)
        if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW:
            HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN

        # New for fvec? if separators are not the same, then the header separator needs to be comma
        if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN:
            HEADER_SEP_CHAR_GEN = ','

        # screw it. make them always match
        HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN
        if HEADER_SEP_CHAR_GEN in (',', ' '):
            pass
            # extra spaces? Don't add any
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN
            # if random.randint(0,1):
            #     HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " "

        # randomize every parse param from its choice list
        kwargs = {}
        for k, v in paramsDict.items():
            kwargs[k] = random.choice(v)

        kwargs['separator'] = SEP_CHAR_GEN
        # parse doesn't auto-detect tab. will autodetect space and comma
        if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",":
            del kwargs['separator']
        else:
            kwargs['separator'] = ord(SEP_CHAR_GEN)

        # randomly add leading and trailing white space
        # we have to do this after we save the single char HEADER_SEP_CHAR_GEN
        if SEP_CHAR_GEN in (',', ' '):
            if random.randint(0, 1):
                SEP_CHAR_GEN = " " + SEP_CHAR_GEN
            if random.randint(0, 1):
                SEP_CHAR_GEN = SEP_CHAR_GEN + " "

        print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW
        print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW
        print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER
        print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT
        print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT
        print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-"
        print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-"
        print 'GZIP_DATA:', GZIP_DATA
        print 'GZIP_HEADER:', GZIP_HEADER

        # they need to both use the same separator (h2o rule)
        # can't have duplicates
        hfhList = random.sample(headerChoices, colCount) + ["output"]
        # UPDATE: always use comma or space for header separator?? it should work no matter what
        # separator the data uses?
        headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList)
        print "headerForHeader:", headerForHeader

        # make these different
        # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"]
        # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF
        hfdList = hfhList
        headerForData = SEP_CHAR_GEN.join(hfdList)

        # create data files
        for fileN in range(fileNum):
            csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str(
                trial) + "_" + rowxcol + '_csv'
            csvFilename = 'syn_data_' + csvFilenameSuffix
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN)
            (headerRowsDone, dataRowsDone) = write_syn_dataset(
                csvPathname,
                rowCount,
                headerString=(headerForData if DATA_HAS_HDR_ROW else None),
                rList=rList,
                commentFirst=DATA_FIRST_IS_COMMENT,
                sepChar=SEP_CHAR_GEN)
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
            if GZIP_DATA:
                csvPathnamegz = csvPathname + ".gz"
                print "gzipping to", csvPathnamegz
                h2o_util.file_gzip(csvPathname, csvPathnamegz)
                os.rename(
                    csvPathname,
                    SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix)
                # pattern match should find the right key with csvPathname

        # create the header file
        hdrFilenameSuffix = str(SEED) + "_" + str(
            trial) + "_" + rowxcol + '_csv'
        hdrFilename = 'syn_header_' + hdrFilenameSuffix
        hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename
        # dataRowsWithHeader = 0 # temp hack
        (headerRowsDone, dataRowsDone) = write_syn_dataset(
            hdrPathname,
            dataRowsWithHeader,
            headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None),
            rList=rList,
            commentFirst=HEADER_FIRST_IS_COMMENT,
            sepChar=SEP_CHAR_GEN)
        # only include header file data rows if the parse pattern includes it
        if PARSE_PATTERN_INCLUDES_HEADER:
            totalDataRows += dataRowsDone
            totalHeaderRows += headerRowsDone
        if GZIP_HEADER:
            hdrPathnamegz = hdrPathname + ".gz"
            print "gzipping to", hdrPathnamegz
            h2o_util.file_gzip(hdrPathname, hdrPathnamegz)
            os.rename(
                hdrPathname,
                SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix)
            # pattern match should find the right key with hdrPathnameh

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        hex_key = "syn_dst" + str(trial) + ".hex"

        # DON"T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files.
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f,
                            schema='put',
                            noPrint=True)
        h2o_cmd.runStoreView()
        headerKey = h2i.find_key(hdrFilename)
        dataKey = h2i.find_key(csvFilename)
        # use regex. the only files in the dir will be the ones we just created
        # with *fileN* match
        print "Header Key =", headerKey

        # put the right name in
        if kwargs['header_from_file'] == 'header':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = headerKey
        # use one of the data files?
        elif kwargs['header_from_file'] == 'data':
            # do we need to add the .hex suffix we know h2o will append
            kwargs['header_from_file'] = dataKey

        # if there's no header in the header file, turn off the header_from_file
        if not HEADER_HAS_HDR_ROW:
            kwargs['header_from_file'] = None

        # pick the column name RF should later ignore, based on which header wins
        if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey):
            ignoreForRf = hfhList[0]
        elif DATA_HAS_HDR_ROW:
            ignoreForRf = hfdList[0]
        else:
            ignoreForRf = None

        print "If header_from_file= , required to force header=1 for h2o"
        if kwargs['header_from_file']:
            kwargs['header'] = 1
        # if we have a header in a data file, tell h2o (for now)
        elif DATA_HAS_HDR_ROW:
            kwargs['header'] = 1
        else:
            kwargs['header'] = 0

        # may have error if h2o doesn't get anything!
        start = time.time()
        if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW:
            pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*'
        else:
            pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*'

        # don't pass to parse
        kwargs.pop('hdr_separator', None)
        parseResult = h2i.parse_only(pattern=pattern,
                                     hex_key=hex_key,
                                     timeoutSecs=timeoutSecs,
                                     **kwargs)
        print "parseResult['destination_key']: " + parseResult[
            'destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # more reporting: (we can error here if extra col in header,
        # causes all NA for missing col of data)
        h2o_cmd.columnInfoFromInspect(parseResult['destination_key'],
                                      exceptionOnMissingValues=False)

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], totalCols, \
            "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))

        # do we end up parsing one data rows as a header because of mismatch in gen/param
        h2oLosesOneData = (headerRowsDone == 0) and (
            kwargs['header'] == 1) and not DATA_HAS_HDR_ROW
        # header in data file gets treated as data
        h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \
            DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None)
        # NOTE(review): the line below forces the gains case off, making the expression above dead
        h2oGainsOneData = False
        print "h2oLosesOneData:", h2oLosesOneData
        print "h2oGainsOneData:", h2oGainsOneData
        if h2oLosesOneData:
            totalDataRows -= 1
        if h2oGainsOneData:
            totalDataRows += 1

        if 1 == 0:  # FIX! don't check for now
            self.assertEqual(inspect['numRows'], totalDataRows,
                "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
                (inspect['numRows'], totalDataRows))

        # put in an ignore param, that will fail unless headers were parsed correctly
        # doesn't matter if the header got a comment, should see it
        kwargs = {
            'sample': 100,
            'depth': 25,
            'ntree': 2,
            'ignore': ignoreForRf
        }
        start = time.time()
        # h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        h2o.check_sandbox_for_errors()
        # NOTE(review): cleanup placement inside the trial loop reconstructed from
        # mangled formatting — confirm against the original file
        h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols # overwrite the key each time to save space? (100, 40000, 'cF', 600), (100, 20000, 'cF', 600), (100, 10000, 'cF', 600), (100, 5000, 'cF', 600), ] FILEREPL = 200 DOSUMMARY = True # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str(rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz start = time.time() print "Replicating", csvFilenamegz, "into", csvFilenameReplgz h2o_util.file_cat(csvPathnamegz, csvPathnamegz , csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL-2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) print csvFilenameReplgz, 'parse time:', parseResult['response']['time'] if DOSUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo , parseResult['destination_key'], "took", time.time() - start, "seconds" print "Inspecting.." 
start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" num_rows = inspect['num_rows'] num_cols = inspect['num_cols'] value_size_bytes = inspect['value_size_bytes'] h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz) print "\n" + csvPathnameReplgz, \ "\n num_rows:", "{:,}".format(num_rows), \ "\n num_cols:", "{:,}".format(num_cols), \ "\n value_size_bytes:", "{:,}".format(value_size_bytes) # should match # of cols in header or ?? self.assertEqual(inspect['num_cols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) self.assertEqual(inspect['num_rows'], totalRows, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount))
def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols # overwrite the key each time to save space? (100, 100, 'cF', 600), (100, 5000, 'cF', 600), (100, 10000, 'cF', 600), # (100, 12000, 'cF', 600), # (100, 15000, 'cF', 600), # (100, 17000, 'cF', 600), (100, 20000, 'cF', 600), (100, 40000, 'cF', 600), ] # h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: SEEDPERFILE = random.randint(0, sys.maxint) csvFilename = 'syn_' + str(SEEDPERFILE) + "_" + str( rowCount) + 'x' + str(colCount) + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) start = time.time() print "Parse start:", csvPathnamegz parseResult = h2i.import_parse(path=csvPathnamegz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, doSummary=DOSUMMARY) if DOSUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseResult['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." start = time.time() inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult[ 'destination_key'], "took", time.time() - start, "seconds" numRows = inspect['numRows'] numCols = inspect['numCols'] missingValuesList = h2o_cmd.infoFromInspect(inspect, csvPathnamegz) print "\n" + csvPathnamegz, \ "\n numRows:", "{:,}".format(numRows), \ "\n numCols:", "{:,}".format(numCols) self.assertEqual( len(missingValuesList), 0, "Don't expect any missing values. These cols had some: %s" % missingValuesList) # should match # of cols in header or ?? 
self.assertEqual( inspect['numCols'], colCount, "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) self.assertEqual(inspect['numRows'], rowCount, "parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], rowCount))
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, hex_key, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL - 2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz # experiment to see if the gz is causing it to fail if NO_GZ: csvPathnameReplgz = csvPathname totalRows = rowCount # hack experiment if NO_REPL: h2o_util.file_gzip(csvPathname, csvPathnameReplgz) totalRows = rowCount parseResult = h2i.import_parse(path=csvPathnameReplgz, schema='put', hex_key=hex_key, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=DO_SUMMARY, blocking=DO_BLOCKING) if DO_SUMMARY: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseResult['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." 
time.sleep(5) start = time.time() inspect = h2o_cmd.runInspect(key=parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathnameReplgz) print "\n" + csvPathnameReplgz, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # there is an extra response variable if inspect['numCols'] != (colCount + 1): raise Exception( "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], colCount)) if inspect['numRows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['numRows'], totalRows)) # hack it in! for test purposees only parseResult['numRows'] = inspect['numRows'] parseResult['numCols'] = inspect['numCols'] parseResult['byteSize'] = inspect['byteSize'] return parseResult
def make_datasetgz_and_parse(SYNDATASETS_DIR, csvFilename, key2, rowCount, colCount, FILEREPL, SEEDPERFILE, timeoutSecs): csvPathname = SYNDATASETS_DIR + '/' + csvFilename print "Creating random", csvPathname write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE) csvFilenamegz = csvFilename + ".gz" csvPathnamegz = SYNDATASETS_DIR + '/' + csvFilenamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) csvFilenameReplgz = csvFilename + "_" + str(FILEREPL) + "x.gz" csvPathnameReplgz = SYNDATASETS_DIR + '/' + csvFilenameReplgz print "Replicating", csvFilenamegz, "into", csvFilenameReplgz start = time.time() h2o_util.file_cat(csvPathnamegz, csvPathnamegz, csvPathnameReplgz) # no header? should we add a header? would have to be a separate gz? totalRows = 2 * rowCount for i in range(FILEREPL - 2): h2o_util.file_append(csvPathnamegz, csvPathnameReplgz) totalRows += rowCount print "Replication took:", time.time() - start, "seconds" start = time.time() print "Parse start:", csvPathnameReplgz doSummary = False parseKey = h2o_cmd.parseFile(None, csvPathnameReplgz, key2=key2, timeoutSecs=timeoutSecs, pollTimeoutSecs=120, doSummary=doSummary) print csvFilenameReplgz, 'parse time:', parseKey['response']['time'] if doSummary: algo = "Parse and Summary:" else: algo = "Parse:" print algo, parseKey['destination_key'], "took", time.time( ) - start, "seconds" print "Inspecting.." 
start = time.time() inspect = h2o_cmd.runInspect(None, parseKey['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseKey['destination_key'], "took", time.time( ) - start, "seconds" h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " num_rows:", "{:,}".format(inspect['num_rows']), \ " num_cols:", "{:,}".format(inspect['num_cols']) # there is an extra response variable if inspect['num_cols'] != (colCount + 1): raise Exception( "parse created result with the wrong number of cols %s %s" % (inspect['num_cols'], colCount)) if inspect['num_rows'] != totalRows: raise Exception("parse created result with the wrong number of rows (header shouldn't count) %s %s" % \ (inspect['num_rows'], rowCount)) # hack it in! for test purposees only parseKey['python_source_key'] = csvFilenameReplgz parseKey['num_rows'] = inspect['num_rows'] parseKey['num_cols'] = inspect['num_cols'] parseKey['value_size_bytes'] = inspect['value_size_bytes'] return parseKey