def import_svmlight(path, headers=""):
    """Import an SVMLight file into an H2O frame, optionally applying column headers.

    H2O 3.8 rejects a header list whose length differs from the parsed column
    count, and the SVMLight parser always imports an extra "pseudotarget"
    column, so names are applied manually after parsing instead of via
    ``parse_setup(column_names=...)``.

    :param path: path (or URL) of the SVMLight file to import.
    :param headers: sequence of column names to apply; may be empty.
    :return: the parsed H2OFrame with the pseudotarget column removed.
    """
    raw = h2o.lazy_import(path)
    if settings.debug and len(headers) < 100:
        print(utils.time() + "import with headers: " + str(headers))
    parsesetup = h2o.parse_setup(raw)
    parsesetup['parse_type'] = 'SVMLight'
    loaded_frame = h2o.parse_raw(parsesetup)
    if settings.debug:
        print("......Header length: " + str(len(headers)))
        print("......Frame imported: " + str(loaded_frame.ncol))
    if len(headers) > loaded_frame.ncol:
        n = len(headers) - loaded_frame.ncol
        print("Remove last " + str(n) + " header entries")
        # Truncate a copy instead of `del headers[-n:]`: the original mutated
        # the caller's list and raised TypeError for string inputs.
        headers = headers[:-n]
    loaded_frame.set_names(headers)  # workaround: set names now
    # Touch a name to force H2O's lazy name assignment to materialize.
    print("First column: " + loaded_frame.names[0])
    if settings.debug and len(headers) < 100:
        loaded_frame.head(show=True)
    loaded_frame.pop(0)  # remove first ('pseudotarget') column
    if settings.debug and len(headers) < 100:
        loaded_frame.head(show=True)
    return loaded_frame
def parse_false():
    """Import a CSV without parsing, then parse it explicitly and summarize it."""
    unparsed = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(unparsed, list)
    parsed = h2o.parse_raw(h2o.parse_setup(unparsed))
    parsed.summary()
    assert parsed.__class__.__name__ == "H2OFrame"
def h2oparse_raw():
    """
    Python API test: h2o.parse_raw(setup, id=None, first_line_is_header=0)

    copied from pyunit_hexdev_29_parse_false.py
    """
    unparsed = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"),
                               parse=False)
    assert isinstance(unparsed, list)
    frame = h2o.parse_raw(h2o.parse_setup(unparsed),
                          id='hexdev_29.hex',
                          first_line_is_header=0)
    frame.summary()
    assert_is_type(frame, H2OFrame)
def continuous_or_categorical():
    """Force the first three columns to ENUM and verify they parse as factors."""
    raw = h2o.lazy_import(h2o.locate("smalldata/jira/hexdev_29.csv"))
    setup = h2o.parse_setup(raw)
    for idx in (0, 1, 2):
        setup["column_types"][idx] = "ENUM"
    frame = h2o.parse_raw(setup)
    frame.summary()
    for col in ('h1', 'h2', 'h3'):
        assert frame[col].isfactor()
def continuous_or_categorical():
    """Parse hexdev_29.csv with the first three columns forced to ENUM factors."""
    unparsed = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
    parse_cfg = h2o.parse_setup(unparsed)
    parse_cfg["column_types"][0] = "ENUM"
    parse_cfg["column_types"][1] = "ENUM"
    parse_cfg["column_types"][2] = "ENUM"
    result = h2o.parse_raw(parse_cfg)
    result.summary()
    assert result['h1'].isfactor() and result['h2'].isfactor() and result['h3'].isfactor()
def hexdev_394():
    """Parse covtype with three forced ENUM columns and fit GBMs on two splits."""
    data_path = tests.locate("smalldata/covtype/covtype.20k.data")
    raw = h2o.lazy_import(data_path)
    setup = h2o.parse_setup(raw)
    for idx in (10, 11, 12):
        setup["column_types"][idx] = "ENUM"
    train = h2o.parse_raw(setup)
    cols = train.col_names  # This returned space for first column name
    x_cols = [name for name in cols if name != "C55"]
    x_cols

    # First split: index into the returned list.
    parts = train.split_frame()
    tr_frame = parts[0]
    va_frame = parts[1]
    tr_x = tr_frame[x_cols]
    tr_y = tr_frame[54].asfactor()
    va_x = va_frame[x_cols]
    va_y = va_frame[54].asfactor()
    my_gbm = h2o.gbm(y=tr_y, validation_y=va_y, x=tr_x, validation_x=va_x,
                     distribution="multinomial", ntrees=100, learn_rate=0.1,
                     max_depth=6)

    # Second split: tuple-unpack directly.
    half_a, half_b = train.split_frame()
    tr_x = half_a[x_cols]
    tr_y = half_a[54].asfactor()
    va_x = half_b[x_cols]
    va_y = half_b[54].asfactor()
    my_gbm = h2o.gbm(y=tr_y, validation_y=va_y, x=tr_x, validation_x=va_x,
                     distribution="multinomial", ntrees=100, learn_rate=0.1,
                     max_depth=6)
    print("KEEPING FRAME???")
    print(train._keep)
def continuous_or_categorical(ip, port):
    """Verify columns forced to ENUM during parse setup come back as factors."""
    imported = h2o.import_file(h2o.locate("smalldata/jira/hexdev_29.csv"))
    cfg = h2o.parse_setup(imported)
    for position in range(3):
        cfg["column_types"][position] = "ENUM"
    hex_frame = h2o.parse_raw(cfg)
    hex_frame.summary()
    assert hex_frame["h1"].isfactor()
    assert hex_frame["h2"].isfactor()
    assert hex_frame["h3"].isfactor()
def hexdev_394():
    """Train two multinomial GBMs on covtype splits after forcing ENUM columns."""
    covtype_path = tests.locate("smalldata/covtype/covtype.20k.data")
    unparsed = h2o.lazy_import(covtype_path)
    cfg = h2o.parse_setup(unparsed)
    cfg["column_types"][10] = "ENUM"
    cfg["column_types"][11] = "ENUM"
    cfg["column_types"][12] = "ENUM"
    train = h2o.parse_raw(cfg)
    cols = train.col_names  # This returned space for first column name
    x_cols = [c for c in cols if c != "C55"]
    x_cols

    pieces = train.split_frame()
    left = pieces[0]
    right = pieces[1]
    feat_train = left[x_cols]
    resp_train = left[54].asfactor()
    feat_valid = right[x_cols]
    resp_valid = right[54].asfactor()
    my_gbm = h2o.gbm(y=resp_train, validation_y=resp_valid, x=feat_train,
                     validation_x=feat_valid, distribution="multinomial",
                     ntrees=100, learn_rate=0.1, max_depth=6)

    left, right = train.split_frame()
    feat_train = left[x_cols]
    resp_train = left[54].asfactor()
    feat_valid = right[x_cols]
    resp_valid = right[54].asfactor()
    my_gbm = h2o.gbm(y=resp_train, validation_y=resp_valid, x=feat_train,
                     validation_x=feat_valid, distribution="multinomial",
                     ntrees=100, learn_rate=0.1, max_depth=6)
    print("KEEPING FRAME???")
    print(train._keep)
def get_data(self, src_bucket="cargo.ml.training", obj_name="training_sample.csv"):
    """Load the training sample CSV into an H2O frame using configured column metadata.

    :param src_bucket: S3 bucket name (currently unused; the S3 download is disabled).
    :param obj_name: S3 object key (currently unused; the S3 download is disabled).
    :return: parsed H2OFrame built with ``self.col_headers`` / ``self.col_types``.
    """
    # S3 download is disabled; the file is read from the local input path.
    # boto3.setup_default_session(region_name='us-west-2')
    # s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KID, aws_secret_access_key=ACCESS_KEY)
    input_path = os.path.join(INPUT_PATH, 'training_sample_input/training_sample.csv')
    # s3_client.download_file(src_bucket, obj_name, input_path)
    df_raw = h2o.import_file(input_path, parse=False)
    setup = h2o.parse_setup(df_raw,
                            destination_frame="training.hex",
                            header=1,
                            column_names=self.col_headers,
                            column_types=self.col_types)
    # Bug fix: previously a fresh default parse_setup(df_raw) was passed to
    # parse_raw, silently discarding the configured column names/types above.
    df = h2o.parse_raw(setup, id='training.csv', first_line_is_header=1)
    print("Input dataframe: ", df)
    return df