def test_H_Basic(self):
    # maybe best to extract the key from an import first?
    # this isn't used much, maybe we don't care about this
    h2i.import_only(path="testdir_multi_jvm/syn_test/syn_header.csv")
    headerKey = h2i.find_key('syn_header.csv')

    # comma (44) is the separator
    h2i.import_parse(path="testdir_multi_jvm/syn_test/syn[1-2].csv",
        header=1, header_from_file=headerKey, separator=44)

    # symbolic links work
    # ln -s /home/0xdiag/datasets home-0xdiag-datasets
    # lrwxrwxrwx 1 kevin kevin 21 Aug 26 22:05 home-0xdiag-datasets -> /home/0xdiag/datasets
    h2i.import_parse(path="standard/covtype.data", bucket="home-0xdiag-datasets")
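# A minimal sketch of the pattern used above: import a header-only csv, look up the
# key h2o assigned to it, then parse the data files against that header. The paths
# and key names below are hypothetical; separator=44 is just ord(','), i.e. comma.
def parse_with_external_header_sketch():
    h2i.import_only(path="some_dir/my_header.csv")       # header-only file
    headerKey = h2i.find_key('my_header.csv')            # key h2o created for it
    return h2i.import_parse(
        path="some_dir/my_data_[1-9].csv",               # data files, no header row
        header=1,                 # header=1 is forced whenever header_from_file is used
        header_from_file=headerKey,
        separator=44)             # 44 == ord(',')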
def test_w2v_basic_2(self): global SYNDATASETS_DIR SYNDATASETS_DIR = h2o.make_syn_dir() n = 100 tryList = [ # (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: csvPathname = create_file_with_seps(rowCount, colCount) hex_key = "not_used.hex" # just parse to make sure it's good parseResult = h2i.import_parse(path=csvPathname, check_header=1, delete_on_done=0, timeoutSecs=180, doSummary=False) pA = h2o_cmd.ParseObj(parseResult) iA = h2o_cmd.InspectObj(pA.parse_key) parse_key = pA.parse_key numRows = iA.numRows numCols = iA.numCols labelList = iA.labelList src_key = h2i.find_key('syn_.*csv') # no cols ignored labelListUsed = list(labelList) numColsUsed = numCols for trial in range(1): parameters = { 'validation_frame': parse_key, # KeyIndexed False [] 'ignored_columns': None, # string[] None [] 'minWordFreq': 1, # int 5 [] 'wordModel': 'CBOW', # enum [u'CBOW', u'SkipGram'] 'normModel': 'NegSampling', # enum # [u'HSM', u'NegSampling'] 'negSampleCnt': 1, # int 5 [] 'vecSize': 10, # int 100 'windowSize': 2, # int 5 'sentSampleRate': 0.001, # float 0.001 'initLearningRate': 0.05, # float 0.05 'epochs': 1, # int 5 } model_key = 'benign_w2v.hex' bmResult = h2o.n0.build_model(algo='word2vec', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
def test_parse_multi_header_rand_fvec(self): h2o.beta_features = True SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename allowedLetters = 'abcdeABCDE01234[]' headerChoices = [] for n in range(500): # max # of cols below is 500 done = False while not done: l = random.randint(1, 64) # random length headers headerName = ''.join( [random.choice(allowedLetters) for _ in range(l)]) # we keep trying if we already have that header name. Has to be unique. done = headerName not in headerChoices headerChoices.append(headerName) tryList = [ (3, 5, 9, 'cA', 60, 0), # (3, 5, 25, 'cA', 60, 0), # (10, 100, 500, 'cA', 60, 0), ] for trial in range(20): (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) = random.choice(tryList) print fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 totalHeaderRows = 0 # random selection of parse param choices # HEADER_HAS_HDR_ROW = random.randint(0,1) HEADER_HAS_HDR_ROW = 1 DATA_HAS_HDR_ROW = random.randint(0, 1) PARSE_PATTERN_INCLUDES_HEADER = random.randint(0, 1) # DATA_FIRST_IS_COMMENT = random.randint(0,1) # HEADER_FIRST_IS_COMMENT = random.randint(0,1) # FIX! doesn't seem to like just comment in the header file DATA_FIRST_IS_COMMENT = 0 HEADER_FIRST_IS_COMMENT = 0 GZIP_DATA = random.randint(0, 1) GZIP_HEADER = random.randint(0, 1) SEP_CHAR_GEN = random.choice(paramsDict['separator']) HEADER_SEP_CHAR_GEN = random.choice(paramsDict['hdr_separator']) if HEADER_SEP_CHAR_GEN == 'same': HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # don't put a header in a data file with a different separator? if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # Hack: if both data and header files have a header, then, just in case # the header and data files should have the same separator # if they don't, make header match data if DATA_HAS_HDR_ROW and HEADER_HAS_HDR_ROW: HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN # New for fvec? if separators are not the same, then the header separator needs to be comma if HEADER_SEP_CHAR_GEN != SEP_CHAR_GEN: HEADER_SEP_CHAR_GEN = ',' # screw it. make them always match HEADER_SEP_CHAR_GEN = SEP_CHAR_GEN if HEADER_SEP_CHAR_GEN in (',', ' '): pass # extra spaces? Don't add any # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = " " + HEADER_SEP_CHAR_GEN # if random.randint(0,1): # HEADER_SEP_CHAR_GEN = HEADER_SEP_CHAR_GEN + " " kwargs = {} for k, v in paramsDict.items(): kwargs[k] = random.choice(v) kwargs['separator'] = SEP_CHAR_GEN # parse doesn't auto-detect tab. 
will autodetect space and comma if SEP_CHAR_GEN == " " or SEP_CHAR_GEN == ",": del kwargs['separator'] else: kwargs['separator'] = ord(SEP_CHAR_GEN) # randomly add leading and trailing white space # we have to do this after we save the single char HEADER_SEP_CHAR_GEN if SEP_CHAR_GEN in (',', ' '): if random.randint(0, 1): SEP_CHAR_GEN = " " + SEP_CHAR_GEN if random.randint(0, 1): SEP_CHAR_GEN = SEP_CHAR_GEN + " " print '\nHEADER_HAS_HDR_ROW:', HEADER_HAS_HDR_ROW print 'DATA_HAS_HDR_ROW:', DATA_HAS_HDR_ROW print 'PARSE_PATTERN_INCLUDES_HEADER', PARSE_PATTERN_INCLUDES_HEADER print 'DATA_FIRST_IS_COMMENT:', DATA_FIRST_IS_COMMENT print 'HEADER_FIRST_IS_COMMENT:', HEADER_FIRST_IS_COMMENT print 'SEP_CHAR_GEN:', "->" + SEP_CHAR_GEN + "<-" print 'HEADER_SEP_CHAR_GEN:', "->" + HEADER_SEP_CHAR_GEN + "<-" print 'GZIP_DATA:', GZIP_DATA print 'GZIP_HEADER:', GZIP_HEADER # they need to both use the same separator (h2o rule) # can't have duplicates hfhList = random.sample(headerChoices, colCount) + ["output"] # UPDATE: always use comma or space for header separator?? it should work no matter what # separator the data uses? headerForHeader = HEADER_SEP_CHAR_GEN.join(hfhList) print "headerForHeader:", headerForHeader # make these different # hfdList = [random.choice(headerChoices) for h in range(colCount)] + ["output"] # FIX! keep them the same for now to avoid some odd cases on what header gets used to RF hfdList = hfhList headerForData = SEP_CHAR_GEN.join(hfdList) # create data files for fileN in range(fileNum): csvFilenameSuffix = str(fileN) + "_" + str(SEED) + "_" + str( trial) + "_" + rowxcol + '_csv' csvFilename = 'syn_data_' + csvFilenameSuffix csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount, sepChar=SEP_CHAR_GEN) (headerRowsDone, dataRowsDone) = write_syn_dataset( csvPathname, rowCount, headerString=(headerForData if DATA_HAS_HDR_ROW else None), rList=rList, commentFirst=DATA_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_DATA: csvPathnamegz = csvPathname + ".gz" print "gzipping to", csvPathnamegz h2o_util.file_gzip(csvPathname, csvPathnamegz) os.rename( csvPathname, SYNDATASETS_DIR + "/not_used_data_" + csvFilenameSuffix) # pattern match should find the right key with csvPathname # create the header file hdrFilenameSuffix = str(SEED) + "_" + str( trial) + "_" + rowxcol + '_csv' hdrFilename = 'syn_header_' + hdrFilenameSuffix hdrPathname = SYNDATASETS_DIR + '/' + hdrFilename # dataRowsWithHeader = 0 # temp hack (headerRowsDone, dataRowsDone) = write_syn_dataset( hdrPathname, dataRowsWithHeader, headerString=(headerForHeader if HEADER_HAS_HDR_ROW else None), rList=rList, commentFirst=HEADER_FIRST_IS_COMMENT, sepChar=SEP_CHAR_GEN) # only include header file data rows if the parse pattern includes it if PARSE_PATTERN_INCLUDES_HEADER: totalDataRows += dataRowsDone totalHeaderRows += headerRowsDone if GZIP_HEADER: hdrPathnamegz = hdrPathname + ".gz" print "gzipping to", hdrPathnamegz h2o_util.file_gzip(hdrPathname, hdrPathnamegz) os.rename( hdrPathname, SYNDATASETS_DIR + "/not_used_header_" + hdrFilenameSuffix) # pattern match should find the right key with hdrPathnameh # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) hex_key = "syn_dst" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. 
# I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?:w # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) h2o_cmd.runStoreView() headerKey = h2i.find_key(hdrFilename) dataKey = h2i.find_key(csvFilename) # use regex. the only files in the dir will be the ones we just created # with *fileN* match print "Header Key =", headerKey # put the right name in if kwargs['header_from_file'] == 'header': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = headerKey # use one of the data files? elif kwargs['header_from_file'] == 'data': # do we need to add the .hex suffix we know h2o will append kwargs['header_from_file'] = dataKey # if there's no header in the header file, turn off the header_from_file if not HEADER_HAS_HDR_ROW: kwargs['header_from_file'] = None if HEADER_HAS_HDR_ROW and (kwargs['header_from_file'] == headerKey): ignoreForRf = hfhList[0] elif DATA_HAS_HDR_ROW: ignoreForRf = hfdList[0] else: ignoreForRf = None print "If header_from_file= , required to force header=1 for h2o" if kwargs['header_from_file']: kwargs['header'] = 1 # if we have a header in a data file, tell h2o (for now) elif DATA_HAS_HDR_ROW: kwargs['header'] = 1 else: kwargs['header'] = 0 # may have error if h2o doesn't get anything! start = time.time() if PARSE_PATTERN_INCLUDES_HEADER and HEADER_HAS_HDR_ROW: pattern = 'syn_*' + str(trial) + "_" + rowxcol + '*' else: pattern = 'syn_data_*' + str(trial) + "_" + rowxcol + '*' # don't pass to parse kwargs.pop('hdr_separator', None) parseResult = h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs, **kwargs) print "parseResult['destination_key']: " + parseResult[ 'destination_key'] inspect = h2o_cmd.runInspect(None, parseResult['destination_key']) h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ " numRows:", "{:,}".format(inspect['numRows']), \ " numCols:", "{:,}".format(inspect['numCols']) # more reporting: (we can error here if extra col in header, # causes all NA for missing col of data) h2o_cmd.columnInfoFromInspect(parseResult['destination_key'], exceptionOnMissingValues=False) # should match # of cols in header or ?? self.assertEqual(inspect['numCols'], totalCols, \ "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols)) # do we end up parsing one data rows as a header because of mismatch in gen/param h2oLosesOneData = (headerRowsDone == 0) and (kwargs['header'] == 1) and not DATA_HAS_HDR_ROW # header in data file gets treated as data h2oGainsOneData = (headerRowsDone!=0) and (kwargs['header']==1) and \ DATA_HAS_HDR_ROW and (kwargs['header_from_file'] is not None) h2oGainsOneData = False print "h2oLosesOneData:", h2oLosesOneData print "h2oGainsOneData:", h2oGainsOneData if h2oLosesOneData: totalDataRows -= 1 if h2oGainsOneData: totalDataRows += 1 if 1 == 0: # FIX! 
# don't check for now
self.assertEqual(inspect['numRows'], totalDataRows,
    "parse created result with the wrong number of rows h2o %s gen'ed: %s" % \
    (inspect['numRows'], totalDataRows))

# put in an ignore param, that will fail unless headers were parsed correctly
# doesn't matter if the header got a comment, should see it
kwargs = {'sample': 100, 'depth': 25, 'ntree': 2, 'ignore': ignoreForRf}
start = time.time()
# h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=10, **kwargs)
elapsed = time.time() - start
print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
    'took', time.time() - start, 'seconds'

h2o.check_sandbox_for_errors()
h2i.delete_keys_at_all_nodes(pattern='syn_datasets')
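# A small standalone sketch of the row-count bookkeeping above, under the same
# assumptions as the test: if header=1 was passed to parse but the data files were
# generated without a header row, h2o treats the first data row as a header and we
# "lose" one row. The gain case is forced off above, so it is left out here.
def expected_rows_sketch(totalDataRows, headerRowsDone, headerParam, dataHasHdrRow):
    h2oLosesOneData = (headerRowsDone == 0) and (headerParam == 1) and not dataHasHdrRow
    return totalDataRows - 1 if h2oLosesOneData else totalDataRows

# e.g. 300 generated rows, no header written, but header=1 passed to parse:
# expected_rows_sketch(300, 0, 1, False) -> 299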
def doGBM(f, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row): debug = False bench = "bench" if debug: print "Doing GBM DEBUG" bench = "bench/debug" #date = '-'.join([str(x) for x in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' gbmbenchcsv = 'benchmarks/'+build+'/'+pre+'gbmbench.csv' if not os.path.exists(gbmbenchcsv): output = open(gbmbenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(gbmbenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() h2o.beta_features = False #ensure this is false! if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 16000, retryDelaySecs = 5, pollTimeoutSecs = 16000, noPoll = True, doSummary = False ) h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=16000, retryDelaySecs=5) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." h2o.beta_features = False #make sure false for the inspect as well! 
inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=16000) inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=16000) h2o.beta_features = True #ok, can be true again nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row.update( {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nTrainRows' : inspect_train['num_rows'], 'nTestRows' : inspect_test['num_rows'], 'nCols' : inspect_train['num_cols'], 'trainParseWallTime' : parseWallTime, 'nTrees' : ntrees, 'minRows' : minrows, 'maxDepth' : depth, 'learnRate' : learnRate, 'classification' : classification, }) params = {'destination_key' : 'GBM('+f+')', 'response' : response, 'ignored_cols_by_name' : ignored_cols, 'classification' : classification, 'validation' : testFilehex, 'ntrees' : ntrees, 'max_depth' : depth, 'min_rows' : minrows, 'nbins' : nbins, 'learn_rate' : learnRate, } parseResult = {'destination_key' : hex_key} kwargs = params.copy() gbmStart = time.time() #TODO(spencer): Uses jobs to poll for gbm completion gbm = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=120, retryDelaySecs=5) gbmTime = time.time() - gbmStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) row.update( {'gbmBuildTime' : gbmTime, }) gbmTrainView = h2o_cmd.runGBMView(model_key='GBM('+f+')') if classification: cm = gbmTrainView['gbm_model']['cm'] err = 1.0*(cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]) else: err = gbmTrainView['gbm_model']['errs'][-1] row.update({'Error' : err}) csvWrt.writerow(row) finally: output.close()
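# The classification error computed above is just off-diagonal counts divided by all
# cells of the 2x2 confusion matrix in gbmTrainView['gbm_model']['cm']. A tiny sketch
# with made-up counts (the 1.0 * keeps Python 2 from doing integer division):
def binary_cm_error(cm):
    return 1.0 * (cm[0][1] + cm[1][0]) / (cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1])

# binary_cm_error([[90, 10], [5, 95]]) -> 15.0 / 200 = 0.075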
def doGLM2(f, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, row, case_mode, case_val): debug = False bench = "bench" if debug: print "DOING GLM2 DEBUG" bench = "bench/debug" date = '-'.join([str(z) for z in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = "DEBUG" glm2benchcsv = 'benchmarks/' + build + '/' + pre + 'glm2bench.csv' if not os.path.exists(glm2benchcsv): output = open(glm2benchcsv, 'w') output.write(','.join(csv_header) + '\n') else: output = open(glm2benchcsv, 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in [ 'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x' ]): csvPathname = importFolderPath + "/" + f + '.csv' else: #print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..." #continue csvPathname = importFolderPath + "/" + f + "/*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, header=1, header_from_file=headerKey, separator=44, timeoutSecs=7200, retryDelaySecs=5, pollTimeoutSecs=7200, noPoll=True, doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5) parseResult = {'destination_key': hex_key} parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime, " seconds." 
h2o.beta_features = True inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=7200) inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row.update({ 'h2o_build': build, 'nMachines': nMachines, 'Xmx/JVM': java_heap_GB, 'nJVMs': len(h2o.nodes), 'dataset': f, 'nTrainRows': inspect_train['numRows'], 'nTestRows': inspect_test['numRows'], 'nCols': inspect_train['numCols'], 'trainParseWallTime': parseWallTime, 'nfolds': nfolds, 'family': family, }) params = { 'vresponse': y, 'ignored_cols': x, 'family': family, 'lambda': lambda_, 'alpha': alpha, 'n_folds': nfolds, #'case_mode' : case_mode, #'case_val' : case_val, 'destination_key': "GLM(" + f + ")", } h2o.beta_features = True kwargs = params.copy() glmStart = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=1800, noPoll=True, **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5) glmTime = time.time() - glmStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) #glm = h2o.nodes[0].inspect("GLM("+f+")") row.update({ 'glm2BuildTime': glmTime, #'AverageErrorOver10Folds' : glm['glm_model']['validations'][0]['err'], }) #if "Bedrooms" in f: #print "Sleeping 30" #time.sleep(30) glmView = h2o_cmd.runGLMView(modelKey="GLM(" + f + ")", timeoutSecs=380) #glmScoreStart = time.time() #glmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key']) #scoreTime = time.time() - glmScoreStart row.update({ 'AIC': glmView['glm_model']['validation']['aic'], 'nIterations': glmView['glm_model']['iteration'], 'nPredictors': len(glmView['glm_model']['beta']), #'AverageError' : glmView['glm_model']['validation']['avg_err'], }) if family == "binomial": #Scrape html of 2/glmmodelview to get best threshold, #then, multiply by 100 and cast to int... #then ask for the coresponding CM from _cms inside glmView url = 'http://%s:%d/2/GLMModelView.html?_modelKey=%s' % ( h2o.nodes[0].http_addr, 55555, 'GLM(' + f + ')') r = requests.get(url).text p1 = re.compile('threshold[:<>/a-z]*[0-9]\.[0-9]*') p2 = re.compile('[0-9]\.[0-9]*') best = int(float(p2.search(p1.search(r).group()).group()) * 100) best_cm = glmView['glm_model']['validation']['_cms'][best]['_arr'] avg_err = 1.0 * (best_cm[0][1] + best_cm[1][0] + 0.0) / (sum( [i for sublist in best_cm for i in sublist])) row.update( {#'scoreTime' : scoreTime, 'AUC' : glmView['glm_model']['validation']['auc'], 'AverageError' : avg_err, }) else: row.update( {#'scoreTime' : scoreTime, 'AUC' : 'NA', 'AverageError' : glmView['glm_model']['validation']['avg_err'], }) csvWrt.writerow(row) finally: output.close()
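# The binomial branch above recovers the "best threshold" by scraping the
# GLMModelView HTML. A minimal illustration of that regex pair on a made-up page
# fragment (the real markup may differ; this only shows the mechanics):
import re

p1 = re.compile('threshold[:<>/a-z]*[0-9]\.[0-9]*')
p2 = re.compile('[0-9]\.[0-9]*')

sample = "<td>threshold</td><td>0.35</td>"   # hypothetical fragment of the page text
best = int(float(p2.search(p1.search(sample).group()).group()) * 100)
# best == 35, which is then used to index glm_model['validation']['_cms']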
    finally:
        output.close()

if __name__ == '__main__':
    debug = sys.argv.pop(-1)
    build = sys.argv.pop(-1)
    h2o.parse_our_args()
    h2o_hosts.build_cloud_with_hosts(enable_benchmark_log=False)

    # AIRLINES
    airlinesTestParseStart = time.time()
    hK = "AirlinesHeader.csv"
    headerPathname = "bench/Airlines" + "/" + hK
    h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
    headerKey = h2i.find_key(hK)
    testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path='bench/Airlines/AirlinesTest.csv',
        schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44,
        timeoutSecs=4800, retryDelaySecs=5, pollTimeoutSecs=4800)
    elapsedAirlinesTestParse = time.time() - airlinesTestParseStart
    row = {'testParseWallTime': elapsedAirlinesTestParse}
    response = 'IsDepDelayed'
    ignored = None
    doGBM(files['Airlines'],
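# The __main__ block above expects the build id and a debug flag to be appended as
# the last two command-line arguments, and pops them off before h2o.parse_our_args()
# sees argv. A sketch of that convention (the invocation and values shown are
# hypothetical):
#
#   python benchmark_script.py <usual h2o args> 1912 true
#
import sys

def pop_trailing_args(argv):
    debug = argv.pop(-1)   # e.g. "true"
    build = argv.pop(-1)   # e.g. "1912"
    return build, debug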
def test_w2v_basic_2(self): global SYNDATASETS_DIR SYNDATASETS_DIR = h2o.make_syn_dir() n = 100 tryList = [ # (n, 1, 'cD', 300), (n, 2, 'cE', 300), (n, 3, 'cF', 300), (n, 4, 'cG', 300), (n, 5, 'cH', 300), (n, 6, 'cI', 300), (n, 7, 'cJ', 300), (n, 9, 'cK', 300), ] ### h2b.browseTheCloud() for (rowCount, colCount, hex_key, timeoutSecs) in tryList: csvPathname = create_file_with_seps(rowCount, colCount) hex_key = "not_used.hex" # just parse to make sure it's good parseResult = h2i.import_parse(path=csvPathname, checkHeader=1, delete_on_done = 0, timeoutSecs=180, doSummary=False) numRows, numCols, parse_key = h2o_cmd.infoFromParse(parseResult) inspectResult = h2o_cmd.runInspect(key=parse_key) missingList, labelList, numRows, numCols = h2o_cmd.infoFromInspect(inspectResult) src_key = h2i.find_key('syn_.*csv') # no cols ignored labelListUsed = list(labelList) numColsUsed = numCols for trial in range(1): parameters = { 'validation_frame': parse_key, # Frame False [] 'ignored_columns': None, # string[] None [] 'score_each_iteration': None, # boolean false [] 'minWordFreq': 1, # int 5 [] 'wordModel': 'CBOW', # enum [u'CBOW', u'SkipGram'] 'normModel': 'NegSampling', # enum # [u'HSM', u'NegSampling'] 'negSampleCnt': 1,# int 5 [] 'vecSize': 10, # int 100 'windowSize': 2, # int 5 'sentSampleRate': 0.001, # float 0.001 'initLearningRate': 0.05, # float 0.05 'epochs': 1, # int 5 } model_key = 'benign_w2v.hex' bmResult = h2o.n0.build_model( algo='word2vec', destination_key=model_key, training_frame=parse_key, parameters=parameters, timeoutSecs=10) bm = OutputObj(bmResult, 'bm') modelResult = h2o.n0.models(key=model_key) model = OutputObj(modelResult['models'][0]['output'], 'model') cmmResult = h2o.n0.compute_model_metrics( model=model_key, frame=parse_key, timeoutSecs=60) cmm = OutputObj(cmmResult, 'cmm') mmResult = h2o.n0.model_metrics(model=model_key, frame=parse_key, timeoutSecs=60) mm = OutputObj(mmResult['model_metrics'][0], 'mm') prResult = h2o.n0.predict(model=model_key, frame=parse_key, timeoutSecs=60) pr = OutputObj(prResult['model_metrics'][0]['predictions'], 'pr') h2o_cmd.runStoreView()
def doKMeans(f, folderPath): debug = False bench = "bench" if debug: print "Debugging KMEANS" bench = "bench/debug" #date = '-'.join([str(x) for x in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = "DEBUG" kmeansbenchcsv = 'benchmarks/'+build+'/'+pre+'kmeansbench.csv' if not os.path.exists(kmeansbenchcsv): output = open(kmeansbenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(kmeansbenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB #Train File Parsing# importFolderPath = bench + "/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 7200, retryDelaySecs = 5, pollTimeoutSecs = 7200, doSummary = False ) parseWallTime = time.time() - trainParseWallStart #End Train File Parse# print "Parsing training file took ", parseWallTime ," seconds." inspect = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row = {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nRows' : inspect['num_rows'], 'nCols' : inspect['num_cols'], 'parseWallTime' : parseWallTime, 'k' : 6, 'max_iter' : 100, 'init' : 'Furthest', } params = {'source_key' : hex_key, 'k' : 6, 'initialization' : 'Furthest', 'max_iter' : 100, 'seed' : 1234567, 'normalize' : 0, #'cols' : , 'destination_key' : "KMeans("+f+")", } kwargs = params.copy() kmeansStart = time.time() kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=7200, **kwargs) kmeansTime = time.time() - kmeansStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) row.update({'kmeansBuildTime' : kmeansTime}) csvWrt.writerow(row) finally: output.close()
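# Each do* benchmark function above appends one row to a per-build csv, writing the
# header only when the file does not exist yet. A condensed sketch of that pattern
# (the path and fieldnames are placeholders); extrasaction='ignore' lets a row dict
# carry extra keys without breaking the writer:
import csv, os

def open_benchmark_writer(path, fieldnames):
    newFile = not os.path.exists(path)
    output = open(path, 'a')
    if newFile:
        output.write(','.join(fieldnames) + '\n')
    writer = csv.DictWriter(output, fieldnames=fieldnames, restval=None,
        dialect='excel', extrasaction='ignore', delimiter=',')
    return output, writer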
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row): bench = "bench" if debug: print "Doing GBM DEBUG" bench = "bench/debug" date = '-'.join([str(x) for x in list(time.localtime())][0:3]) for f in fs['train']: overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' gbmbenchcsv = 'benchmarks/'+build+'/'+date+'/'+pre+'gbmbench.csv' if not os.path.exists(gbmbenchcsv): output = open(gbmbenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(gbmbenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x','CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 7200, retryDelaySecs = 5, pollTimeoutSecs = 7200 ) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." inspect_train = h2o.nodes[0].inspect(parseResult['destination_key']) inspect_test = h2o.nodes[0].inspect(testFilehex) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row.update( {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nTrainRows' : inspect_train['numRows'], 'nTestRows' : inspect_test['numRows'], 'nCols' : inspect_train['numCols'], 'trainParseWallTime' : parseWallTime, 'classification' : classification, }) params = {'destination_key' : 'GBM('+f+')', 'response' : response, 'ignored_cols_by_name' : ignored_cols, 'classification' : classification, 'validation' : testFilehex, 'ntrees' : ntrees, 'max_depth' : depth, 'min_rows' : minrows, 'nbins' : nbins, 'learn_rate' : learnRate, } kwargs = params.copy() gbmStart = time.time() #TODO(spencer): Uses jobs to poll for gbm completion h2o.beta_features = True gbm = h2o_cmd.runGBM(parseResult = parseResult, noPoll=True, timeoutSecs=4800, **kwargs) h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5) h2o.beta_features = False gbmTime = time.time() - gbmStart row.update( {'gbmBuildTime' : gbmTime, }) #TODO(spencer): Add in gbm scoring #gbmScoreStart = time.time() #gbmScore = h2o_cmd.runGLMScore(key=testFilehex,model_key=params['destination_key']) #scoreTime = time.time() - gbmScoreStart csvWrt.writerow(row) finally: output.close()
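# The date prefix used in the benchmark paths above is just year-month-day pulled
# from time.localtime(), with no zero padding. A standalone check of what it yields:
import time

date = '-'.join([str(x) for x in list(time.localtime())][0:3])
# time.localtime()[0:3] is (tm_year, tm_mon, tm_mday), so date looks like e.g. "2014-5-17"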
def doGLM2(f, folderPath, family, lambda_, alpha, nfolds, y, x, testFilehex, row, case_mode, case_val): debug = False bench = "bench" if debug: print "DOING GLM2 DEBUG" bench = "bench/debug" date = '-'.join([str(z) for z in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = "DEBUG" glm2benchcsv = 'benchmarks/'+build+'/'+date+'/'+pre+'glm2bench.csv' if not os.path.exists(glm2benchcsv): output = open(glm2benchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(glm2benchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench+"/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: #print "Not doing Airlines10x and 100x for Parse2, regex seems to be broken..." #continue csvPathname = importFolderPath + "/" + f + "/*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() if f in (['AirlinesTrain10x', 'AirlinesTrain100x']): h2o.beta_features = False #regex parsing acting weird when not using browser, use VA -> FVEC converter parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 7200, retryDelaySecs = 5, pollTimeoutSecs = 7200, noPoll = True, doSummary = False ) h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5) parseResult = {'destination_key':hex_key} parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." 
        h2o.beta_features = True
        inspect_train = h2o.nodes[0].inspect(hex_key, timeoutSecs=7200)
        inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)
        row.update( {'h2o_build'          : build,
                     'java_heap_GB'       : java_heap_GB,
                     'dataset'            : f,
                     'nTrainRows'         : inspect_train['numRows'],
                     'nTestRows'          : inspect_test['numRows'],
                     'nCols'              : inspect_train['numCols'],
                     'trainParseWallTime' : parseWallTime,
                     'nfolds'             : nfolds,
                    })
        params = {'vresponse'       : y,
                  'ignored_cols'    : x,
                  'family'          : family,
                  'lambda'          : lambda_,
                  'alpha'           : alpha,
                  'n_folds'         : nfolds,
                  #'case_mode'      : case_mode,
                  #'case_val'       : case_val,
                  'destination_key' : "GLM("+f+")",
                 }
        h2o.beta_features = True
        kwargs = params.copy()
        glmStart = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=1800, noPoll=True, **kwargs)
        h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=7200, retryDelaySecs=5)
        glmTime = time.time() - glmStart
        #glm = h2o.nodes[0].inspect("GLM("+f+")")
        row.update( {'glmBuildTime' : glmTime,
                     #'AverageErrorOver10Folds' : glm['glm_model']['validations'][0]['err'],
                    })
        #if "Bedrooms" in f:
        #    print "Sleeping 30"
        #    time.sleep(30)
        glmView = h2o_cmd.runGLMView(modelKey="GLM("+f+")", timeoutSecs=380)
        #glmScoreStart = time.time()
        #glmScore = h2o_cmd.runGLMScore(key=testFilehex, model_key=params['destination_key'])
        #scoreTime = time.time() - glmScoreStart
        row.update( {'AIC'         : glmView['glm_model']['validation']['aic'],
                     'nIterations' : glmView['glm_model']['iteration'],
                     'nPredictors' : len(glmView['glm_model']['beta']),
                     #'AverageError' : glmView['glm_model']['validation']['avg_err'],
                    })
        if family == "binomial":
            # Scrape html of 2/glmmodelview to get best threshold,
            # then multiply by 100 and cast to int...
            # then ask for the corresponding CM from _cms inside glmView
            url = 'http://%s:%d/2/GLMModelView.html?_modelKey=%s' % (h2o.nodes[0].http_addr, 55555, 'GLM('+f+')')
            r = requests.get(url).text
            p1 = re.compile('threshold[:<>/a-z]*[0-9]\.[0-9]*')
            p2 = re.compile('[0-9]\.[0-9]*')
            # search the fetched page text (r), not an undefined name
            best = int(float(p2.search(p1.search(r).group()).group()) * 100)
            best_cm = glmView['glm_model']['validation']['_cms'][best]['_arr']
            # force float division (Python 2 would truncate the int division to 0)
            avg_err = 1.0 * (best_cm[0][1] + best_cm[1][0]) / (sum([i for sublist in best_cm for i in sublist]))
            row.update( {#'scoreTime'   : scoreTime,
                         'AUC'          : glmView['glm_model']['validation']['auc'],
                         'AverageError' : avg_err,
                        })
        else:
            row.update( {#'scoreTime'   : scoreTime,
                         'AUC'          : 'NA',
                         'AverageError' : glmView['glm_model']['validation']['avg_err'],
                        })
        csvWrt.writerow(row)
    finally:
        output.close()
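# Why the 1.0 * factor in avg_err above matters: Python 2 truncates integer
# division, so dividing confusion-matrix counts directly would silently give 0.
off_diag, total = 15, 200            # made-up counts
truncated = off_diag / total         # 0 under Python 2
avg_err_ok = 1.0 * off_diag / total  # 0.075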
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row): debug = False bench = "bench" if debug: print "DOING GLM DEBUG" bench = "bench/debug" date = '-'.join([str(z) for z in list(time.localtime())][0:3]) overallWallStart = time.time() pre = "" if debug: pre = "DEBUG" glmbenchcsv = 'benchmarks/' + build + '/' + pre + 'glmbench.csv' if not os.path.exists(glmbenchcsv): output = open(glmbenchcsv, 'w') output.write(','.join(csv_header) + '\n') else: output = open(glmbenchcsv, 'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore', delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in [ 'AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x' ]): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) trainParseWallStart = time.time() parseResult = h2i.import_parse(bucket='home-0xdiag-datasets', path=csvPathname, schema='local', hex_key=hex_key, header=1, header_from_file=headerKey, separator=44, timeoutSecs=7200, retryDelaySecs=5, pollTimeoutSecs=7200, doSummary=False) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime, " seconds." inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row.update({ 'h2o_build': build, 'nMachines': nMachines, 'nJVMs': len(h2o.nodes), 'Xmx/JVM': java_heap_GB, 'dataset': f, 'nTrainRows': inspect_train['num_rows'], 'nTestRows': inspect_test['num_rows'], 'nCols': inspect_train['num_cols'], 'trainParseWallTime': parseWallTime, 'nfolds': nfolds, 'family': family, }) params = { 'y': y, 'x': x, 'family': family, 'link': link, 'lambda': lambda_, 'alpha': alpha, 'n_folds': nfolds, 'case_mode': "n/a", 'destination_key': "GLM(" + f + ")", 'expert_settings': 0, } kwargs = params.copy() glmStart = time.time() glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=7200, **kwargs) glmTime = time.time() - glmStart row.update({ 'glmBuildTime': glmTime, #'AverageErrorOver10Folds' : glm['GLMModel']['validations'][0]['err'], }) glmScoreStart = time.time() glmScore = h2o_cmd.runGLMScore(key=testFilehex, model_key=params['destination_key'], timeoutSecs=1800) scoreTime = time.time() - glmScoreStart cmd = 'bash startloggers.sh ' + json + ' stop_' os.system(cmd) if family == "binomial": row.update({ 'scoreTime': scoreTime, 'AUC': glmScore['validation']['auc'], 'AIC': glmScore['validation']['aic'], 'error': glmScore['validation']['err'], }) else: row.update({ 'scoreTime': scoreTime, 'AIC': glmScore['validation']['aic'], 'AUC': 'NA', 'error': glmScore['validation']['err'], }) csvWrt.writerow(row) finally: output.close()
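# nMachines above falls back to 1 when no remote hosts are configured. It is written
# with "is 0", which only works because CPython interns small ints; the equivalent,
# safer form uses numeric equality:
nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)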
def test_parse_multi_header_single(self): SYNDATASETS_DIR = h2o.make_syn_dir() csvFilename = "syn_ints.csv" csvPathname = SYNDATASETS_DIR + '/' + csvFilename headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output" # cols must be 9 to match the header above, otherwise a different bug is hit # extra output is added, so it's 10 total tryList = [ (57, 300, 9, 'cA', 60, 0), # try with 1-3 data lines in the header file too (57, 300, 9, 'cB', 60, 1), (57, 300, 9, 'cC', 60, 2), (57, 300, 9, 'cD', 60, 3), ] trial = 0 for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList: trial += 1 # FIX! should we add a header to them randomly??? print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR rowxcol = str(rowCount) + 'x' + str(colCount) totalCols = colCount + 1 # 1 extra for output totalDataRows = 0 for fileN in range(fileNum): csvFilename = 'syn_' + str(fileN) + "_" + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename rList = rand_rowData(colCount) dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList) totalDataRows += dataRowsDone # create the header file # can make it pass by not doing this if HEADER: csvFilename = 'syn_header_' + str( SEED) + "_" + rowxcol + '.csv' csvPathname = SYNDATASETS_DIR + '/' + csvFilename dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList) totalDataRows += dataRowsDone # make sure all key names are unique, when we re-put and re-parse (h2o caching issues) src_key = "syn_" + str(trial) hex_key = "syn_" + str(trial) + ".hex" # DON"T get redirected to S3! (EC2 hack in config, remember!) # use it at the node level directly (because we gen'ed the files. # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead? # put them, rather than using import files, so this works if remote h2o is used # and python creates the files locally fileList = os.listdir(SYNDATASETS_DIR) for f in fileList: h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True) print f # fix. should we have a h2o.n0 for brevity? or h2o.n. ? so we can change it around if multi-node? # frames = h2o.nodes[0].frames()['frames'] frames = h2o.n0.frames()['frames'] frames_dict = h2o_util.list_to_dict(frames, 'key/name') # print "frames:", dump_json(frames) # print "frames_dict:", dump_json(frames_dict) if HEADER: header = h2i.find_key('syn_header') if not header: raise Exception( "Didn't find syn_header* key in the import") # use regex. the only files in the dir will be the ones we just created with *fileN* match print "Header Key = " + header start = time.time() # does h2o-dev take a regex? 
# ... or do we need to glob?
parseResult = h2i.parse_only(pattern='*' + rowxcol + '*', hex_key=hex_key,
    timeoutSecs=timeoutSecs, checkHeader="1") # header_from_file=header

pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
print pA.numRows
print pA.numCols
print pA.parse_key

expectedLabelList = headerData.split(",")
iA = h2o_cmd.InspectObj(pA.parse_key, expectedNumRows=totalDataRows, expectedNumCols=totalCols,
    expectedMissinglist=[], expectedLabelList=expectedLabelList)

if DO_RF:
    # put in an ignore param, that will fail unless headers were parsed correctly
    if HEADER:
        kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1, 'ignored_cols_by_name': 'ID,CAPSULE'}
    else:
        kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}
    rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

h2o.check_sandbox_for_errors()
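# The tests above generate files locally, upload each one with schema='put' (so the
# flow also works when h2o runs on a remote host), then parse whatever keys match a
# glob pattern. A condensed sketch of that sequence (directory and pattern arguments
# are illustrative):
import os

def put_and_parse_sketch(syn_dir, pattern, hex_key, timeoutSecs=60):
    for f in os.listdir(syn_dir):
        h2i.import_only(path=syn_dir + "/" + f, schema='put', noPrint=True)
    return h2i.parse_only(pattern=pattern, hex_key=hex_key, timeoutSecs=timeoutSecs)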
def doPCA(f, folderPath): debug = False bench = "bench" if debug: print "Doing PCA DEBUG" bench = "bench/debug" #date = '-'.join([str(x) for x in list(time.localtime())][0:3]) retryDelaySecs = 5 #if f == 'AirlinesTrain1x' else 30 overallWallStart = time.time() pre = "" if debug: pre = 'DEBUG' pcabenchcsv = 'benchmarks/'+build+'/'+pre+'pcabench.csv' if not os.path.exists(pcabenchcsv): output = open(pcabenchcsv,'w') output.write(','.join(csv_header)+'\n') else: output = open(pcabenchcsv,'a') csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None, dialect='excel', extrasaction='ignore',delimiter=',') try: java_heap_GB = h2o.nodes[0].java_heap_GB importFolderPath = bench + "/" + folderPath if (f in ['AirlinesTrain1x','AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']): csvPathname = importFolderPath + "/" + f + '.csv' else: csvPathname = importFolderPath + "/" + f + "/*linked*" hex_key = f + '.hex' trainParseWallStart = time.time() hK = folderPath + "Header.csv" headerPathname = importFolderPath + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) parseResult = h2i.import_parse(bucket = 'home-0xdiag-datasets', path = csvPathname, schema = 'local', hex_key = hex_key, header = 1, header_from_file = headerKey, separator = 44, timeoutSecs = 7200, retryDelaySecs = retryDelaySecs, pollTimeoutSecs = 7200, doSummary = False ) parseWallTime = time.time() - trainParseWallStart print "Parsing training file took ", parseWallTime ," seconds." inspect = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200) nMachines = 1 if len(h2o_hosts.hosts) is 0 else len(h2o_hosts.hosts) row = {'h2o_build' : build, 'nMachines' : nMachines, 'nJVMs' : len(h2o.nodes), 'Xmx/JVM' : java_heap_GB, 'dataset' : f, 'nRows' : inspect['num_rows'], 'nCols' : inspect['num_cols'], 'parseWallTime' : parseWallTime, } params = {'destination_key' : "python_PCA_key", 'tolerance' : 0.0, 'standardize' : 1, } kwargs = params.copy() pcaStart = time.time() #h2o.beta_features = True pcaResult = h2o_cmd.runPCA(parseResult = parseResult, noPoll = True, timeoutSecs = 7200, **kwargs) h2j.pollWaitJobs(timeoutSecs=4800, pollTimeoutSecs=4800, retryDelaySecs=2) pcaTime = time.time() - pcaStart cmd = 'bash startloggers.sh ' + json + ' stop_' #stop all loggers os.system(cmd) row.update({'pcaBuildTime' : pcaTime}) csvWrt.writerow(row) finally: output.close()
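# The long-running builds above are launched with noPoll=True and then waited on via
# the jobs API, so the wall-clock timing brackets the whole job rather than a single
# blocking request. A sketch of that idiom; the job starter and poller are passed in
# here only to keep the snippet self-contained (the scripts call runPCA/runGBM/runGLM
# and pollWaitJobs directly), and the timeouts are illustrative:
import time

def timed_async_job_sketch(start_job, pollWaitJobs, timeoutSecs=4800):
    t0 = time.time()
    start_job()                                  # e.g. runPCA(..., noPoll=True)
    pollWaitJobs(timeoutSecs=timeoutSecs, pollTimeoutSecs=timeoutSecs, retryDelaySecs=2)
    return time.time() - t0                      # elapsed build time for the csv row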
def test_parse_multi_header_single(self):
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),  # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        # fix. should we have a h2o.n0 for brevity? or h2o.n.? so we can change it around if multi-node?
        # frames = h2o.nodes[0].frames()['frames']
        frames = h2o.n0.frames()['frames']
        frames_dict = h2o_util.list_to_dict(frames, 'key/name')
        # print "frames:", dump_json(frames)
        # print "frames_dict:", dump_json(frames_dict)

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")
            # use regex. the only files in the dir will be the ones we just created with *fileN* match
            print "Header Key = " + header

        start = time.time()
        # does h2o-dev take a regex? or do we need to glob
        parseResult = h2i.parse_only(pattern='*' + rowxcol + '*', hex_key=hex_key,
                                     timeoutSecs=timeoutSecs, checkHeader="1")  # header_from_file=header

        pA = h2o_cmd.ParseObj(parseResult, expectedNumRows=totalDataRows, expectedNumCols=totalCols)
        print pA.numRows
        print pA.numCols
        print pA.parse_key
        expectedLabelList = headerData.split(",")
        iA = h2o_cmd.InspectObj(pA.parse_key,
                                expectedNumRows=totalDataRows, expectedNumCols=totalCols,
                                expectedMissinglist=[], expectedLabelList=expectedLabelList)

        if DO_RF:
            # put in an ignore param, that will fail unless headers were parsed correctly
            if HEADER:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1,
                          'ignored_cols_by_name': 'ID,CAPSULE'}
            else:
                kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}
            rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)

        h2o.check_sandbox_for_errors()
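The test above relies on rand_rowData and write_syn_dataset from the test module to build headerless data files plus a separate header file, counting only data rows toward the expected total. Below is a rough, self-contained stand-in for that setup; write_rows and the file names are illustrative, not the harness helpers.

# Illustrative stand-in for the data-files-plus-header-file setup (not the harness's write_syn_dataset).
import os
import random
import tempfile

def write_rows(path, row_count, header=None, col_count=9):
    # returns the number of *data* rows written; a header line is not counted
    with open(path, 'w') as f:
        if header is not None:
            f.write(header + '\n')
        for _ in range(row_count):
            f.write(','.join(str(random.randint(0, 9)) for _ in range(col_count + 1)) + '\n')
    return row_count

syn_dir = tempfile.mkdtemp()
total_data_rows = 0
for fileN in range(3):
    total_data_rows += write_rows(os.path.join(syn_dir, 'syn_%d.csv' % fileN), 300)
# header file: one header line plus a few data lines, like dataRowsWithHeader above
total_data_rows += write_rows(os.path.join(syn_dir, 'syn_header.csv'), 2,
                              header="ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output")
print total_data_rows  # rows the parse should report (header line excluded)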
def test_parse_multi_header_single_fvec(self):
    h2o.beta_features = True
    SYNDATASETS_DIR = h2o.make_syn_dir()
    csvFilename = "syn_ints.csv"
    csvPathname = SYNDATASETS_DIR + '/' + csvFilename
    headerData = "ID,CAPSULE,AGE,RACE,DPROS,DCAPS,PSA,VOL,GLEASON,output"

    # cols must be 9 to match the header above, otherwise a different bug is hit
    # extra output is added, so it's 10 total
    tryList = [
        (57, 300, 9, 'cA', 60, 0),  # try with 1-3 data lines in the header file too
        (57, 300, 9, 'cB', 60, 1),
        (57, 300, 9, 'cC', 60, 2),
        (57, 300, 9, 'cD', 60, 3),
    ]

    trial = 0
    for (fileNum, rowCount, colCount, hex_key, timeoutSecs, dataRowsWithHeader) in tryList:
        trial += 1
        # FIX! should we add a header to them randomly???
        print "Wait while", fileNum, "synthetic files are created in", SYNDATASETS_DIR
        rowxcol = str(rowCount) + 'x' + str(colCount)
        totalCols = colCount + 1  # 1 extra for output
        totalDataRows = 0
        for fileN in range(fileNum):
            csvFilename = 'syn_' + str(fileN) + "_" + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            rList = rand_rowData(colCount)
            dataRowsDone = write_syn_dataset(csvPathname, rowCount, headerData=None, rList=rList)
            totalDataRows += dataRowsDone

        # create the header file
        # can make it pass by not doing this
        if HEADER:
            csvFilename = 'syn_header_' + str(SEED) + "_" + rowxcol + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            dataRowsDone = write_syn_dataset(csvPathname, dataRowsWithHeader, headerData, rList)
            totalDataRows += dataRowsDone

        # make sure all key names are unique, when we re-put and re-parse (h2o caching issues)
        src_key = "syn_" + str(trial)
        hex_key = "syn_" + str(trial) + ".hex"

        # DON'T get redirected to S3! (EC2 hack in config, remember!)
        # use it at the node level directly (because we gen'ed the files).
        # I suppose we could force the redirect state bits in h2o.nodes[0] to False, instead?
        # put them, rather than using import files, so this works if remote h2o is used
        # and python creates the files locally
        fileList = os.listdir(SYNDATASETS_DIR)
        for f in fileList:
            h2i.import_only(path=SYNDATASETS_DIR + "/" + f, schema='put', noPrint=True)
            print f

        if HEADER:
            header = h2i.find_key('syn_header')
            if not header:
                raise Exception("Didn't find syn_header* key in the import")
            # use regex. the only files in the dir will be the ones we just created with *fileN* match
            print "Header Key = " + header

        start = time.time()
        parseResult = h2i.parse_only(pattern='*' + rowxcol + '*', hex_key=hex_key,
                                     timeoutSecs=timeoutSecs, header="1", header_from_file=header)
        print "parseResult['destination_key']: " + parseResult['destination_key']

        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        h2o_cmd.infoFromInspect(inspect, csvPathname)
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # should match # of cols in header or ??
        self.assertEqual(inspect['numCols'], totalCols,
                         "parse created result with the wrong number of cols %s %s" % (inspect['numCols'], totalCols))
        self.assertEqual(inspect['numRows'], totalDataRows,
                         "parse created result with the wrong number of rows (header shouldn't count) %s %s" %
                         (inspect['numRows'], totalDataRows))

        # put in an ignore param, that will fail unless headers were parsed correctly
        if HEADER:
            kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1,
                      'ignored_cols_by_name': 'ID,CAPSULE'}
        else:
            kwargs = {'sample_rate': 0.75, 'max_depth': 25, 'ntrees': 1}

        start = time.time()
        rfv = h2o_cmd.runRF(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        elapsed = time.time() - start
        print "%d pct. of timeout" % ((elapsed / timeoutSecs) * 100)
        print "trial #", trial, "totalDataRows:", totalDataRows, "parse end on ", csvFilename, \
            'took', time.time() - start, 'seconds'

        h2o.check_sandbox_for_errors()
def doSUM(f, folderPath):
    debug = False
    bench = "bench"
    if debug:
        print "Doing SUM DEBUG"
        bench = "bench/debug"
    # date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    retryDelaySecs = 5  # if f == 'AirlinesTrain1x' else 30
    overallWallStart = time.time()
    pre = ""
    if debug:
        pre = 'DEBUG'
    sumbenchcsv = 'benchmarks/' + build + '/' + pre + 'summarybench.csv'
    if not os.path.exists(sumbenchcsv):
        output = open(sumbenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(sumbenchcsv, 'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                            dialect='excel', extrasaction='ignore', delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if f in ['AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']:
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        trainParseWallStart = time.time()
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=retryDelaySecs,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        inspect = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row = {'h2o_build': build,
               'nMachines': nMachines,
               'nJVMs': len(h2o.nodes),
               'Xmx/JVM': java_heap_GB,
               'dataset': f,
               'nRows': inspect['num_rows'],
               'nCols': inspect['num_cols'],
               'parseWallTime': parseWallTime,
               }
        sumStart = time.time()
        sumResult = h2o_cmd.runSummary(key=hex_key, timeoutSecs=7200)
        sumTime = time.time() - sumStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'  # stop all loggers
        os.system(cmd)
        row.update({'summaryBuildTime': sumTime})
        csvWrt.writerow(row)
    finally:
        output.close()
bench = "bench/debug" if dat == 'Air1x' : fs = files['Airlines']['train'][0] if dat == 'Air10x' : fs = files['Airlines']['train'][1] if dat == 'Air100x' : fs = files['Airlines']['train'][2] if dat == 'AllB1x' : fs = files['AllBedrooms']['train'][0] if dat == 'AllB10x' : fs = files['AllBedrooms']['train'][1] if dat == 'AllB100x' : fs = files['AllBedrooms']['train'][2] if fp == "Airlines": #AIRLINES airlinesTestParseStart = time.time() hK = "AirlinesHeader.csv" headerPathname = bench+"/Airlines" + "/" + hK h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname) headerKey = h2i.find_key(hK) testFile = h2i.import_parse(bucket='home-0xdiag-datasets', path=bench+'/Airlines/AirlinesTest.csv', schema='local', hex_key="atest.hex", header=1, header_from_file=headerKey, separator=44, noPoll=True,doSummary=False) h2o_jobs.pollWaitJobs(timeoutSecs=16000, pollTimeoutSecs=16000, retryDelaySecs=5) elapsedAirlinesTestParse = time.time() - airlinesTestParseStart row = {'testParseWallTime' : elapsedAirlinesTestParse} response = 'IsDepDelayed' ignored = None doGBM(fs, fp, ignored_cols = ignored, classification = 1, testFilehex = 'atest.hex', ntrees = 100, depth = 5, minrows = 10, nbins = 100, learnRate = 0.01,
def doGBM(fs, folderPath, ignored_cols, classification, testFilehex, ntrees, depth, minrows, nbins, learnRate, response, row):
    debug = False  # assumed default: the flattened source used 'debug' without defining it here (doPCA/doSUM/doGLM set it to False)
    bench = "bench"
    if debug:
        print "Doing GBM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(x) for x in list(time.localtime())][0:3])
    for f in fs['train']:
        overallWallStart = time.time()
        pre = ""
        if debug:
            pre = 'DEBUG'
        gbmbenchcsv = 'benchmarks/' + build + '/' + date + '/' + pre + 'gbmbench.csv'
        if not os.path.exists(gbmbenchcsv):
            output = open(gbmbenchcsv, 'w')
            output.write(','.join(csv_header) + '\n')
        else:
            output = open(gbmbenchcsv, 'a')
        csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                                dialect='excel', extrasaction='ignore', delimiter=',')
        try:
            java_heap_GB = h2o.nodes[0].java_heap_GB
            importFolderPath = bench + "/" + folderPath  # '/' added to match the other bench functions
            if f in ['AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x',
                     'CovTypeTrain1x', 'CovTypeTrain10x', 'CovTypeTrain100x']:
                csvPathname = importFolderPath + "/" + f + '.csv'
            else:
                csvPathname = importFolderPath + "/" + f + "/*linked*"
            hex_key = f + '.hex'
            hK = folderPath + "Header.csv"
            headerPathname = importFolderPath + "/" + hK
            h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
            headerKey = h2i.find_key(hK)
            trainParseWallStart = time.time()
            parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                           path=csvPathname,
                                           schema='local',
                                           hex_key=hex_key,
                                           header=1,
                                           header_from_file=headerKey,
                                           separator=44,
                                           timeoutSecs=7200,
                                           retryDelaySecs=5,
                                           pollTimeoutSecs=7200)
            parseWallTime = time.time() - trainParseWallStart
            print "Parsing training file took ", parseWallTime, " seconds."
            inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'])
            inspect_test = h2o.nodes[0].inspect(testFilehex)
            nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
            row.update({
                'h2o_build': build,
                'nMachines': nMachines,
                'nJVMs': len(h2o.nodes),
                'Xmx/JVM': java_heap_GB,
                'dataset': f,
                'nTrainRows': inspect_train['numRows'],
                'nTestRows': inspect_test['numRows'],
                'nCols': inspect_train['numCols'],
                'trainParseWallTime': parseWallTime,
                'classification': classification,
            })
            params = {
                'destination_key': 'GBM(' + f + ')',
                'response': response,
                'ignored_cols_by_name': ignored_cols,
                'classification': classification,
                'validation': testFilehex,
                'ntrees': ntrees,
                'max_depth': depth,
                'min_rows': minrows,
                'nbins': nbins,
                'learn_rate': learnRate,
            }
            kwargs = params.copy()
            gbmStart = time.time()
            # TODO(spencer): Uses jobs to poll for gbm completion
            h2o.beta_features = True
            gbm = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True, timeoutSecs=4800, **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=7200, pollTimeoutSecs=120, retryDelaySecs=5)
            h2o.beta_features = False
            gbmTime = time.time() - gbmStart
            row.update({
                'gbmBuildTime': gbmTime,
            })
            # TODO(spencer): Add in gbm scoring
            # gbmScoreStart = time.time()
            # gbmScore = h2o_cmd.runGLMScore(key=testFilehex, model_key=params['destination_key'])
            # scoreTime = time.time() - gbmScoreStart
            csvWrt.writerow(row)
        finally:
            output.close()
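doGBM launches the model build with noPoll=True and then blocks in h2o_jobs.pollWaitJobs. Below is a generic, self-contained sketch of that kick-off-then-poll pattern; poll_wait and job_is_done are placeholder names, not harness calls.

# Sketch of the kick-off-then-poll pattern (illustrative; not the h2o_jobs API).
import time

def poll_wait(job_is_done, timeoutSecs=7200, retryDelaySecs=5):
    # poll until the job reports completion or we run out of time; return elapsed seconds
    start = time.time()
    while not job_is_done():
        if time.time() - start > timeoutSecs:
            raise Exception("job did not finish within %s seconds" % timeoutSecs)
        time.sleep(retryDelaySecs)
    return time.time() - start

# trivial demo: a "job" that finishes after about one second
t0 = time.time()
print poll_wait(lambda: time.time() - t0 > 1, timeoutSecs=10, retryDelaySecs=0.5)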
def doGLM(f, folderPath, family, link, lambda_, alpha, nfolds, y, x, testFilehex, row):
    debug = False
    bench = "bench"
    if debug:
        print "DOING GLM DEBUG"
        bench = "bench/debug"
    date = '-'.join([str(z) for z in list(time.localtime())][0:3])
    overallWallStart = time.time()
    pre = ""
    if debug:
        pre = "DEBUG"
    glmbenchcsv = 'benchmarks/' + build + '/' + pre + 'glmbench.csv'
    if not os.path.exists(glmbenchcsv):
        output = open(glmbenchcsv, 'w')
        output.write(','.join(csv_header) + '\n')
    else:
        output = open(glmbenchcsv, 'a')
    csvWrt = csv.DictWriter(output, fieldnames=csv_header, restval=None,
                            dialect='excel', extrasaction='ignore', delimiter=',')
    try:
        java_heap_GB = h2o.nodes[0].java_heap_GB
        importFolderPath = bench + "/" + folderPath
        if f in ['AirlinesTrain1x', 'AllBedroomsTrain1x', 'AllBedroomsTrain10x', 'AllBedroomsTrain100x']:
            csvPathname = importFolderPath + "/" + f + '.csv'
        else:
            csvPathname = importFolderPath + "/" + f + "/*linked*"
        hex_key = f + '.hex'
        hK = folderPath + "Header.csv"
        headerPathname = importFolderPath + "/" + hK
        h2i.import_only(bucket='home-0xdiag-datasets', path=headerPathname)
        headerKey = h2i.find_key(hK)
        trainParseWallStart = time.time()
        parseResult = h2i.import_parse(bucket='home-0xdiag-datasets',
                                       path=csvPathname,
                                       schema='local',
                                       hex_key=hex_key,
                                       header=1,
                                       header_from_file=headerKey,
                                       separator=44,
                                       timeoutSecs=7200,
                                       retryDelaySecs=5,
                                       pollTimeoutSecs=7200,
                                       doSummary=False)
        parseWallTime = time.time() - trainParseWallStart
        print "Parsing training file took ", parseWallTime, " seconds."
        inspect_train = h2o.nodes[0].inspect(parseResult['destination_key'], timeoutSecs=7200)
        inspect_test = h2o.nodes[0].inspect(testFilehex, timeoutSecs=7200)
        nMachines = 1 if len(h2o_hosts.hosts) == 0 else len(h2o_hosts.hosts)
        row.update({'h2o_build': build,
                    'nMachines': nMachines,
                    'nJVMs': len(h2o.nodes),
                    'Xmx/JVM': java_heap_GB,
                    'dataset': f,
                    'nTrainRows': inspect_train['num_rows'],
                    'nTestRows': inspect_test['num_rows'],
                    'nCols': inspect_train['num_cols'],
                    'trainParseWallTime': parseWallTime,
                    'nfolds': nfolds,
                    'family': family,
                    })
        params = {'y': y,
                  'x': x,
                  'family': family,
                  'link': link,
                  'lambda': lambda_,
                  'alpha': alpha,
                  'n_folds': nfolds,
                  'case_mode': "n/a",
                  'destination_key': "GLM(" + f + ")",
                  'expert_settings': 0,
                  }
        kwargs = params.copy()
        glmStart = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=7200, **kwargs)
        glmTime = time.time() - glmStart
        row.update({'glmBuildTime': glmTime,
                    # 'AverageErrorOver10Folds': glm['GLMModel']['validations'][0]['err'],
                    })
        glmScoreStart = time.time()
        glmScore = h2o_cmd.runGLMScore(key=testFilehex, model_key=params['destination_key'], timeoutSecs=1800)
        scoreTime = time.time() - glmScoreStart
        cmd = 'bash startloggers.sh ' + json + ' stop_'
        os.system(cmd)
        if family == "binomial":
            row.update({'scoreTime': scoreTime,
                        'AUC': glmScore['validation']['auc'],
                        'AIC': glmScore['validation']['aic'],
                        'error': glmScore['validation']['err'],
                        })
        else:
            row.update({'scoreTime': scoreTime,
                        'AIC': glmScore['validation']['aic'],
                        'AUC': 'NA',
                        'error': glmScore['validation']['err'],
                        })
        csvWrt.writerow(row)
    finally:
        output.close()
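The per-family branch at the end of doGLM only changes whether AUC is reported. A compact restatement of that logic is sketched below; the values in the demo calls are made up for illustration.

# Sketch of how the score metrics row differs by family (demo values are made up).
def metrics_row(family, scoreTime, auc, aic, err):
    row = {'scoreTime': scoreTime, 'AIC': aic, 'error': err}
    row['AUC'] = auc if family == "binomial" else 'NA'
    return row

print metrics_row("binomial", 12.3, 0.71, 4200.0, 0.28)
print metrics_row("poisson", 12.3, None, 4200.0, 0.28)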