def test_GLM_both(self):
    """Run the same GLM through H2O and through R for each listed dataset.

    Each dataset entry is a 5-tuple:
    (folder under 'smalldata' or None, filename, GLM family,
    zero-based response column, GLM timeout in seconds).

    Per dataset the file is passed through h2o_util.file_clean_for_R,
    parsed into H2O and inspected for its column count.  From that count an
    R formula ("V<y+1> ~ V..+V..") and the matching zero-based column list
    ``x`` are built, the H2O GLM is run and checked with simpleCheckGLM, and
    the H2O results are handed to glm_R_and_compare with the formula.
    """
    # Manual toggle kept from the original: only the first list runs.
    if 1 == 1:
        csvFilenameList = [
            ('logreg', 'benign.csv', 'binomial', 3, 10),
            # col is zero based
            # FIX! what's wrong here? index error
            ## ('uis.dat', 'binomial', 8, 5, False),
            ## ('pros.dat', 'binomial', 1, 10, False),
            ## ('chdage.dat', 'binomial', 2, 5, True),
            ## ('icu.dat', 'binomial', 1, 10, False),
            # how to ignore 6? '1,2,3,4,5', False),
            ## ('clslowbwt.dat', 'binomial', 7, 10, False),
            # ('cgd.dat', 'gaussian', 12, 5, False),
            # ('meexp.dat', 'gaussian', 3, 10, None),
        ]
    else:
        csvFilenameList = [
            # leave out ID and birth weight
            ('logreg', 'benign.csv', 'gaussian', 3, 10),
            (None, 'icu.dat', 'binomial', 1, 10),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            (None, 'nhanes3.dat', 'binomial', 15, 10),
            (None, 'lowbwt.dat', 'binomial', 1, 10),
            (None, 'lowbwtm11.dat', 'binomial', 1, 10),
            (None, 'meexp.dat', 'gaussian', 3, 10),
            # FIX! does this one hang in R?
            (None, 'nhanes3.dat', 'binomial', 15, 10),
            (None, 'pbc.dat', 'gaussian', 1, 10),
            (None, 'pharynx.dat', 'gaussian', 12, 10),
            (None, 'uis.dat', 'binomial', 8, 10),
        ]

    # Columns left out of the model, per dataset.  One table drives both the
    # R formula / x list and H2O's 'ignored_cols', so the two engines are
    # always given the same predictors.  (Previously the clslowbwt.dat check
    # only printed a message and column 6 still ended up in the formula.)
    ignoredColsByFile = {
        'benign.csv': (0, 1),
        'clslowbwt.dat': (6,),
    }

    # NOTE: every print below takes one pre-formatted string so the output is
    # the same whether print is the Python 2 statement or a function.
    trial = 0
    for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:
        # FIX! do something about this file munging
        if offset:
            csvPathname1 = offset + "/" + csvFilename
        else:
            csvPathname1 = 'logreg/umass_statdata/' + csvFilename

        fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
        h2o_util.file_clean_for_R(fullPathname, csvPathname2)

        # we can inspect this to get the number of cols in the dataset (trust H2O here)
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
        # we could specify key2 above but this is fine
        destination_key = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, destination_key)
        num_cols = inspect['numCols']
        num_rows = inspect['numRows']
        print("num_cols %s num_rows %s" % (num_cols, num_rows))
        ## print h2o.dump_json(inspect)

        ignoredCols = ignoredColsByFile.get(csvFilename, ())
        ignoredColsStr = ",".join(str(c) for c in ignoredCols)

        # create formula and the x for H2O GLM
        includedCols = []
        for c in range(num_cols):
            if c in ignoredCols:
                print("Not including col " + ignoredColsStr + " for this dataset from x")
            else:
                includedCols.append(c)

        # don't add the output col to the RHS of formula
        predictorCols = [c for c in includedCols if c != y]

        # R names columns V1..Vn (one-based); H2O's x uses zero-based indices.
        formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictorCols)
        # join() keeps the separators right even when the response is the
        # first included column (the old "x is None" test produced "V1V2").
        col_names = ",".join("V" + str(c + 1) for c in includedCols)
        # x stays None when there are no predictors, as before.
        x = ",".join(str(c) for c in predictorCols) if predictorCols else None

        print("formula: %s" % (formula,))
        print("col_names: %s" % (col_names,))
        print("x: %s" % (x,))

        kwargs = {
            'n_folds': 0,
            'response': y,
            # what about x?
            'family': family,
            'alpha': 0,
            'lambda': 0,
            'beta_epsilon': 1.0E-4,
            'max_iter': 50
        }
        if ignoredCols:
            kwargs['ignored_cols'] = ignoredColsStr

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, time.time() - start))
        h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)

        # now do it thru R and compare
        (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y,
            h2oResults=h2oResults)

        trial += 1
        print("\nTrial # %s" % (trial,))
def test_GLM_both(self):
    """Run the same GLM through H2O and through R for each listed dataset.

    Sets ``h2o.beta_features = True`` first; that flag selects which inspect
    keys ('numCols'/'numRows' vs 'num_cols'/'num_rows') and which GLM
    keyword set ('response' vs 'y'/'x') are used below.

    Each dataset entry is a 5-tuple:
    (folder under 'smalldata' or None, filename, GLM family,
    zero-based response column, GLM timeout in seconds).

    Per dataset the file is passed through h2o_util.file_clean_for_R,
    parsed into H2O and inspected for its column count.  From that count an
    R formula ("V<y+1> ~ V..+V..") and the matching zero-based column list
    ``x`` are built, the H2O GLM is run and checked with simpleCheckGLM, and
    the H2O results are handed to glm_R_and_compare with the formula.
    """
    h2o.beta_features = True

    # Manual toggle kept from the original: only the first list runs.
    if 1 == 1:
        csvFilenameList = [
            ('logreg', 'benign.csv', 'binomial', 3, 10),
            # col is zero based
            # FIX! what's wrong here? index error
            ## ('uis.dat', 'binomial', 8, 5, False),
            ## ('pros.dat', 'binomial', 1, 10, False),
            ## ('chdage.dat', 'binomial', 2, 5, True),
            ## ('icu.dat', 'binomial', 1, 10, False),
            # how to ignore 6? '1,2,3,4,5', False),
            ## ('clslowbwt.dat', 'binomial', 7, 10, False),
            # ('cgd.dat', 'gaussian', 12, 5, False),
            # ('meexp.dat', 'gaussian', 3, 10, None),
        ]
    else:
        csvFilenameList = [
            # leave out ID and birth weight
            ('logreg', 'benign.csv', 'gaussian', 3, 10),
            (None, 'icu.dat', 'binomial', 1, 10),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            (None, 'nhanes3.dat', 'binomial', 15, 10),
            (None, 'lowbwt.dat', 'binomial', 1, 10),
            (None, 'lowbwtm11.dat', 'binomial', 1, 10),
            (None, 'meexp.dat', 'gaussian', 3, 10),
            # FIX! does this one hang in R?
            (None, 'nhanes3.dat', 'binomial', 15, 10),
            (None, 'pbc.dat', 'gaussian', 1, 10),
            (None, 'pharynx.dat', 'gaussian', 12, 10),
            (None, 'uis.dat', 'binomial', 8, 10),
        ]

    # Columns left out of the model, per dataset.  One table drives both the
    # R formula / x list and H2O's 'ignored_cols', so the two engines are
    # always given the same predictors.  (Previously the clslowbwt.dat check
    # only printed a message and column 6 still ended up in the formula.)
    ignoredColsByFile = {
        'benign.csv': (0, 1),
        'clslowbwt.dat': (6,),
    }

    # NOTE: every print below takes one pre-formatted string so the output is
    # the same whether print is the Python 2 statement or a function.
    trial = 0
    for (offset, csvFilename, family, y, timeoutSecs) in csvFilenameList:
        # FIX! do something about this file munging
        if offset:
            csvPathname1 = offset + "/" + csvFilename
        else:
            csvPathname1 = 'logreg/umass_statdata/' + csvFilename

        fullPathname = h2i.find_folder_and_filename('smalldata', csvPathname1, returnFullPath=True)

        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
        h2o_util.file_clean_for_R(fullPathname, csvPathname2)

        # we can inspect this to get the number of cols in the dataset (trust H2O here)
        parseResult = h2i.import_parse(path=csvPathname2, schema='put', hex_key=csvFilename, timeoutSecs=10)
        # we could specify key2 above but this is fine
        destination_key = parseResult['destination_key']
        inspect = h2o_cmd.runInspect(None, destination_key)
        if h2o.beta_features:
            num_cols = inspect['numCols']
            num_rows = inspect['numRows']
        else:
            num_cols = inspect['num_cols']
            num_rows = inspect['num_rows']
        print("num_cols %s num_rows %s" % (num_cols, num_rows))
        ## print h2o.dump_json(inspect)

        ignoredCols = ignoredColsByFile.get(csvFilename, ())
        ignoredColsStr = ",".join(str(c) for c in ignoredCols)

        # create formula and the x for H2O GLM
        includedCols = []
        for c in range(num_cols):
            if c in ignoredCols:
                print("Not including col " + ignoredColsStr + " for this dataset from x")
            else:
                includedCols.append(c)

        # don't add the output col to the RHS of formula
        predictorCols = [c for c in includedCols if c != y]

        # R names columns V1..Vn (one-based); H2O's x uses zero-based indices.
        formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictorCols)
        # join() keeps the separators right even when the response is the
        # first included column (the old "x is None" test produced "V1V2").
        col_names = ",".join("V" + str(c + 1) for c in includedCols)
        # x stays None when there are no predictors, as before.
        x = ",".join(str(c) for c in predictorCols) if predictorCols else None

        print("formula: %s" % (formula,))
        print("col_names: %s" % (col_names,))
        print("x: %s" % (x,))

        if h2o.beta_features:
            kwargs = {
                'n_folds': 0,
                'response': y,
                # what about x?
                'family': family,
                'alpha': 0,
                'lambda': 0,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50
            }
        else:
            kwargs = {
                'n_folds': 0,
                'y': y,
                'x': x,
                'family': family,
                'alpha': 0,
                'lambda': 1e-4,
                'beta_epsilon': 1.0E-4,
                'max_iter': 50
            }

        if ignoredCols:
            kwargs['ignored_cols'] = ignoredColsStr

        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, **kwargs)
        print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, time.time() - start))
        h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)

        # now do it thru R and compare
        (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y,
            h2oResults=h2oResults)

        trial += 1
        print("\nTrial # %s" % (trial,))
def test_GLM_umass(self):
    """Run the same GLM through H2O and through R for each listed dataset.

    Each dataset entry is a 5-tuple:
    (filename under smalldata/logreg/umass_statdata, GLM family,
    zero-based response column, GLM timeout in seconds,
    ``header`` value forwarded to glm_R_and_compare).

    Per dataset the file is passed through h2o_util.file_clean_for_R,
    parsed into H2O and inspected for its column count.  From that count an
    R formula ("V<y+1> ~ V..+V..") and the matching zero-based column list
    ``x`` are built, the H2O GLM is run and checked with simpleCheckGLM, and
    the H2O results are handed to glm_R_and_compare with the formula.
    """
    # Manual toggle kept from the original: only the first list runs.
    if 1 == 1:
        csvFilenameList = [
            # col is zero based
            # FIX! what's wrong here? index error
            ("uis.dat", "binomial", 8, 5, False),
            # ('cgd.dat', 'gaussian', 12, 5, False),
            # ('meexp.dat', 'gaussian', 3, 10, None),
            ("pros.dat", "binomial", 1, 10, False),
            ("chdage.dat", "binomial", 2, 5, True),
            ("icu.dat", "binomial", 1, 10, False),
            # how to ignore 6? '1,2,3,4,5', False),
            ("clslowbwt.dat", "binomial", 7, 10, False),
        ]
    else:
        csvFilenameList = [
            # leave out ID and birth weight
            ("icu.dat", "binomial", 1, 10, None),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            # (this entry had only four fields; the loop unpacks five, so
            # enabling this list would have raised ValueError)
            ("nhanes3.dat", "binomial", 15, 10, None),
            ("lowbwt.dat", "binomial", 1, 10, "2,3,4,5,6,7,8,9"),
            ("lowbwtm11.dat", "binomial", 1, 10, None),
            ("meexp.dat", "gaussian", 3, 10, None),
            # FIX! does this one hang in R?
            ("nhanes3.dat", "binomial", 15, 10, None),
            ("pbc.dat", "gaussian", 1, 10, None),
            ("pharynx.dat", "gaussian", 12, 10, None),
            ("uis.dat", "binomial", 8, 10, None),
        ]

    # NOTE: every print below takes one pre-formatted string so the output is
    # the same whether print is the Python 2 statement or a function.
    trial = 0
    for (csvFilename, family, y, timeoutSecs, header) in csvFilenameList:
        # FIX! do something about this file munging
        csvPathname1 = h2o.find_file("smalldata/logreg/umass_statdata/" + csvFilename)
        csvPathname2 = SYNDATASETS_DIR + "/" + csvFilename + "_2.csv"
        h2o_util.file_clean_for_R(csvPathname1, csvPathname2)

        # we can inspect this to get the number of cols in the dataset (trust H2O here)
        parseKey = h2o_cmd.parseFile(None, csvPathname2, key=csvFilename, timeoutSecs=10)
        # we could specify key2 above but this is fine
        destination_key = parseKey["destination_key"]
        inspect = h2o_cmd.runInspect(None, destination_key)
        num_cols = inspect["num_cols"]
        num_rows = inspect["num_rows"]
        print("num_cols %s num_rows %s" % (num_cols, num_rows))
        ## print h2o.dump_json(inspect)

        # create formula and the x for H2O GLM
        allCols = list(range(num_cols))
        # don't add the output col to the RHS of formula
        predictorCols = [c for c in allCols if c != y]

        # R names columns V1..Vn (one-based); H2O's x uses zero-based indices.
        formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictorCols)
        # join() keeps the separators right even when the response is column 0
        # (the old "x is None" test produced "V1V2" in that case).
        col_names = ",".join("V" + str(c + 1) for c in allCols)
        # x stays None when there are no predictors, as before.
        x = ",".join(str(c) for c in predictorCols) if predictorCols else None

        print("formula: %s" % (formula,))
        print("col_names: %s" % (col_names,))
        print("x: %s" % (x,))

        kwargs = {
            "n_folds": 0,
            "y": y,
            "x": x,
            "family": family,
            "link": "familyDefault",
            "alpha": 0,
            "lambda": 0,
            "case_mode": "=",
            "case": 1,
            "beta_eps": 1.0e-4,
            "max_iter": 50,
        }

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, time.time() - start))
        h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)

        # now do it thru R and compare
        (warningsR, cListR, interceptR) = glm_R_and_compare(
            self, csvPathname2, family, formula, y, header=header, h2oResults=h2oResults
        )

        trial += 1
        print("\nTrial # %s" % (trial,))
def test_GLM_umass(self):
    """Run the same GLM through H2O and through R for each listed dataset.

    Each dataset entry is a 5-tuple:
    (filename under smalldata/logreg/umass_statdata, GLM family,
    zero-based response column, GLM timeout in seconds,
    ``header`` value forwarded to glm_R_and_compare).

    Per dataset the file is passed through h2o_util.file_clean_for_R,
    parsed into H2O and inspected for its column count.  From that count an
    R formula ("V<y+1> ~ V..+V..") and the matching zero-based column list
    ``x`` are built (column 6 of clslowbwt.dat is left out of both), the
    H2O GLM is run and checked with simpleCheckGLM, and the H2O results are
    handed to glm_R_and_compare with the formula.
    """
    # Manual toggle kept from the original: only the first list runs.
    if 1 == 1:
        csvFilenameList = [
            # col is zero based
            # FIX! what's wrong here? index error
            ('uis.dat', 'binomial', 8, 5, False),
            # ('cgd.dat', 'gaussian', 12, 5, False),
            # ('meexp.dat', 'gaussian', 3, 10, None),
            ('pros.dat', 'binomial', 1, 10, False),
            ('chdage.dat', 'binomial', 2, 5, True),
            ('icu.dat', 'binomial', 1, 10, False),
            # how to ignore 6? '1,2,3,4,5', False),
            ('clslowbwt.dat', 'binomial', 7, 10, False),
        ]
    else:
        csvFilenameList = [
            # leave out ID and birth weight
            ('icu.dat', 'binomial', 1, 10, None),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            # (this entry had only four fields; the loop unpacks five, so
            # enabling this list would have raised ValueError)
            ('nhanes3.dat', 'binomial', 15, 10, None),
            ('lowbwt.dat', 'binomial', 1, 10, '2,3,4,5,6,7,8,9'),
            ('lowbwtm11.dat', 'binomial', 1, 10, None),
            ('meexp.dat', 'gaussian', 3, 10, None),
            # FIX! does this one hang in R?
            ('nhanes3.dat', 'binomial', 15, 10, None),
            ('pbc.dat', 'gaussian', 1, 10, None),
            ('pharynx.dat', 'gaussian', 12, 10, None),
            ('uis.dat', 'binomial', 8, 10, None),
        ]

    # NOTE: every print below takes one pre-formatted string so the output is
    # the same whether print is the Python 2 statement or a function.
    trial = 0
    for (csvFilename, family, y, timeoutSecs, header) in csvFilenameList:
        # FIX! do something about this file munging
        csvPathname1 = h2o.find_file("smalldata/logreg/umass_statdata/" + csvFilename)
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
        h2o_util.file_clean_for_R(csvPathname1, csvPathname2)

        # we can inspect this to get the number of cols in the dataset (trust H2O here)
        parseKey = h2o_cmd.parseFile(None, csvPathname2, key=csvFilename, timeoutSecs=10)
        # we could specify key2 above but this is fine
        destination_key = parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, destination_key)
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        print("num_cols %s num_rows %s" % (num_cols, num_rows))
        ## print h2o.dump_json(inspect)

        # create formula and the x for H2O GLM
        includedCols = []
        for c in range(num_cols):
            if csvFilename == 'clslowbwt.dat' and c == 6:
                print("Not including col 6 for this dataset from x")
            else:
                includedCols.append(c)

        # don't add the output col to the RHS of formula
        predictorCols = [c for c in includedCols if c != y]

        # R names columns V1..Vn (one-based); H2O's x uses zero-based indices.
        formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictorCols)
        # join() keeps the separators right even when the response is the
        # first included column (the old "x is None" test produced "V1V2").
        col_names = ",".join("V" + str(c + 1) for c in includedCols)
        # x stays None when there are no predictors, as before.
        x = ",".join(str(c) for c in predictorCols) if predictorCols else None

        print("formula: %s" % (formula,))
        print("col_names: %s" % (col_names,))
        print("x: %s" % (x,))

        kwargs = {
            'n_folds': 0,
            'y': y,
            'x': x,
            'family': family,
            'link': 'familyDefault',
            'alpha': 0,
            'lambda': 0,
            'case_mode': '=',
            'case': 1,
            'beta_epsilon': 1.0E-4,
            'max_iter': 50
        }

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, time.time() - start))
        h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)

        # now do it thru R and compare
        (warningsR, cListR, interceptR) = glm_R_and_compare(self, csvPathname2, family, formula, y,
            header=header, h2oResults=h2oResults)

        trial += 1
        print("\nTrial # %s" % (trial,))
def test_GLM_umass(self):
    """Run the same GLM through H2O and through R for each listed dataset.

    Each dataset entry is a 5-tuple:
    (filename under smalldata/logreg/umass_statdata, GLM family,
    zero-based response column, GLM timeout in seconds,
    ``header`` value forwarded to glm_R_and_compare).

    Per dataset the file is passed through h2o_util.file_clean_for_R,
    parsed into H2O and inspected for its column count.  From that count an
    R formula ("V<y+1> ~ V..+V..") and the matching zero-based column list
    ``x`` are built, the H2O GLM is run and checked with simpleCheckGLM, and
    the H2O results are handed to glm_R_and_compare with the formula.
    """
    # Manual toggle kept from the original: only the first list runs.
    if 1 == 1:
        csvFilenameList = [
            # col is zero based
            # FIX! what's wrong here? index error
            ('uis.dat', 'binomial', 8, 5, False),
            # ('cgd.dat', 'gaussian', 12, 5, False),
            # ('meexp.dat', 'gaussian', 3, 10, None),
            ('pros.dat', 'binomial', 1, 10, False),
            ('chdage.dat', 'binomial', 2, 5, True),
            ('icu.dat', 'binomial', 1, 10, False),
            # how to ignore 6? '1,2,3,4,5', False),
            ('clslowbwt.dat', 'binomial', 7, 10, False),
        ]
    else:
        csvFilenameList = [
            # leave out ID and birth weight
            ('icu.dat', 'binomial', 1, 10, None),
            # need to exclude col 0 (ID) and col 10 (bwt)
            # but -x doesn't work..so do 2:9...range doesn't work? FIX!
            # (this entry had only four fields; the loop unpacks five, so
            # enabling this list would have raised ValueError)
            ('nhanes3.dat', 'binomial', 15, 10, None),
            ('lowbwt.dat', 'binomial', 1, 10, '2,3,4,5,6,7,8,9'),
            ('lowbwtm11.dat', 'binomial', 1, 10, None),
            ('meexp.dat', 'gaussian', 3, 10, None),
            # FIX! does this one hang in R?
            ('nhanes3.dat', 'binomial', 15, 10, None),
            ('pbc.dat', 'gaussian', 1, 10, None),
            ('pharynx.dat', 'gaussian', 12, 10, None),
            ('uis.dat', 'binomial', 8, 10, None),
        ]

    # NOTE: every print below takes one pre-formatted string so the output is
    # the same whether print is the Python 2 statement or a function.
    trial = 0
    for (csvFilename, family, y, timeoutSecs, header) in csvFilenameList:
        # FIX! do something about this file munging
        csvPathname1 = h2o.find_file("smalldata/logreg/umass_statdata/" + csvFilename)
        csvPathname2 = SYNDATASETS_DIR + '/' + csvFilename + '_2.csv'
        h2o_util.file_clean_for_R(csvPathname1, csvPathname2)

        # we can inspect this to get the number of cols in the dataset (trust H2O here)
        parseKey = h2o_cmd.parseFile(None, csvPathname2, key=csvFilename, timeoutSecs=10)
        # we could specify key2 above but this is fine
        destination_key = parseKey['destination_key']
        inspect = h2o_cmd.runInspect(None, destination_key)
        num_cols = inspect['num_cols']
        num_rows = inspect['num_rows']
        print("num_cols %s num_rows %s" % (num_cols, num_rows))
        ## print h2o.dump_json(inspect)

        # create formula and the x for H2O GLM
        allCols = list(range(num_cols))
        # don't add the output col to the RHS of formula
        predictorCols = [c for c in allCols if c != y]

        # R names columns V1..Vn (one-based); H2O's x uses zero-based indices.
        formula = "V" + str(y + 1) + " ~ " + "+".join("V" + str(c + 1) for c in predictorCols)
        # join() keeps the separators right even when the response is column 0
        # (the old "x is None" test produced "V1V2" in that case).
        col_names = ",".join("V" + str(c + 1) for c in allCols)
        # x stays None when there are no predictors, as before.
        x = ",".join(str(c) for c in predictorCols) if predictorCols else None

        print("formula: %s" % (formula,))
        print("col_names: %s" % (col_names,))
        print("x: %s" % (x,))

        kwargs = {
            'num_cross_validation_folds': 0,
            'y': y,
            'x': x,
            'family': family,
            'link': 'familyDefault',
            'alpha': 0,
            'lambda': 0,
            'case_mode': '=',
            'case': 1,
            'beta_epsilon': 1.0E-4,
            'max_iter': 50
        }

        start = time.time()
        glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, **kwargs)
        print("glm end (w/check) on  %s took %s seconds" % (csvPathname2, time.time() - start))
        h2oResults = h2o_glm.simpleCheckGLM(self, glm, None, prettyPrint=True, **kwargs)

        # now do it thru R and compare
        # NOTE(review): the test case (self) is not passed here - confirm this
        # matches the signature of glm_R_and_compare defined in this file.
        (warningsR, cListR, interceptR) = glm_R_and_compare(csvPathname2, family, formula, y,
            header=header, h2oResults=h2oResults)

        trial += 1
        print("\nTrial # %s" % (trial,))