def pubdev_6339():
    cluster = h2o.cluster()
    # number of nodes
    cloud_size = cluster.cloud_size
    # number of CPUs across the cluster
    cores = sum(node["num_cpus"] for node in cluster.nodes)

    # paths to the test files
    file_paths = [
        pyunit_utils.locate("smalldata/arcene/arcene_train.data"),
        pyunit_utils.locate("smalldata/census_income/adult_data.csv"),
        pyunit_utils.locate("smalldata/chicago/chicagoAllWeather.csv"),
        pyunit_utils.locate("smalldata/gbm_test/alphabet_cattest.csv"),
        pyunit_utils.locate("smalldata/wa_cannabis/raw/Dashboard_Usable_Sales_w_Weight_Daily.csv")
    ]

    for file_path in file_paths:
        # import the data unparsed, then run parse setup to get the column count
        data_raw = h2o.import_file(path=file_path, parse=False)
        setup = h2o.parse_setup(data_raw)
        # number of columns reported by parse setup
        num_cols = setup['number_columns']
        # chunk size computed locally
        chunk_size = calculate_chunk_size(file_path, num_cols, cores, cloud_size)
        # chunk size reported by parse setup, used to verify the calculation
        result_size = setup['chunk_size']
        assert chunk_size == result_size, "Calculated chunk size is incorrect!"
        print("chunk size for file", file_path, "is:", chunk_size)

    data_raw = h2o.import_file(path=file_paths[1], parse=False)
    setup = h2o.parse_setup(data_raw)
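These pyunit-style tests are normally launched through H2O's test harness. A minimal runner for the function above, assuming the standard pyunit_utils module is importable, would follow the usual pattern:

if __name__ == "__main__":
    pyunit_utils.standalone_test(pubdev_6339)
else:
    pubdev_6339()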
def import_svmlight(path, headers=""):
    raw = h2o.lazy_import(path)
    if settings.debug and len(headers) < 100:
        print utils.time() + "import with headers: " + str(headers)
    # parsesetup = h2o.parse_setup(raw, column_names=headers)
    # Issue: H2O 3.8 tests the header length against the columns, but still
    # imports the "pseudotarget" as an additional column.
    parsesetup = h2o.parse_setup(raw)
    parsesetup['parse_type'] = 'SVMLight'
    loaded_frame = h2o.parse_raw(parsesetup)
    if settings.debug:
        print "......Header length: " + str(len(headers))
        print "......Frame imported: " + str(loaded_frame.ncol)
    if len(headers) > loaded_frame.ncol:
        n = len(headers) - loaded_frame.ncol
        print "Remove last " + str(n) + " header entries"
        del headers[-n:]
    loaded_frame.set_names(headers)  # workaround: set the names now
    print "First column: " + loaded_frame.names[0]  # needed because of lazy name setting
    if settings.debug and len(headers) < 100:
        loaded_frame.head(show=True)
    loaded_frame.pop(0)  # remove the first ('pseudotarget') column
    # if loaded_frame.ncol > len(headers) - 1:  # workaround: H2O reads info from svmlight into
    #     delete = []                           # columns -> remove everything not in headers
    #     for i in xrange(len(headers) - 1, loaded_frame.ncol):
    #         delete.append(loaded_frame.names[i])
    #     loaded_frame = remove_vecs(loaded_frame, delete)
    if settings.debug and len(headers) < 100:
        loaded_frame.head(show=True)
    return loaded_frame
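A hypothetical call into import_svmlight above. The path and header list are placeholders, settings.debug is assumed to be configured by the surrounding module, and the first header entry names the 'pseudotarget' column that the helper pops before returning:

headers = ["pseudotarget", "f1", "f2", "f3"]
frame = import_svmlight("data/example.svm", headers=headers)  # placeholder path
print(frame.names)  # expected: ["f1", "f2", "f3"]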
def _handle_raw_fname(self, raw_fname, column_names=None):
    """
    Handle the result of upload_file.
    :param raw_fname: A raw key.
    :param column_names: Optional column names to use when the parse result has none.
    :return: Part of the H2OFrame constructor.
    """
    # perform the parse setup
    setup = h2o.parse_setup(raw_fname)
    # blocking parse; the first line is always a header (since "we" wrote the data out)
    parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)
    # a hack to get the column names correct, since "parse" does not provide them
    if column_names and not parse["columnNames"]:
        cols = column_names
    else:
        cols = parse['columnNames']
    # set the rows
    rows = parse['rows']
    # set the vector keys
    veckeys = parse['vecKeys']
    # create a new vec[] array
    self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
    # print some information on the *uploaded* data
    print "Uploaded", raw_fname, "into cluster with", \
        rows, "rows and", len(cols), "cols"
    print
def h2oparse_setup():
    """
    Python API test: h2o.parse_setup(raw_frames, destination_frame=None, header=0, separator=None,
    column_names=None, column_types=None, na_strings=None)
    """
    try:
        col_types = ['enum', 'numeric', 'enum', 'enum', 'enum', 'numeric', 'numeric', 'numeric']
        col_headers = ["CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
        hex_key = "training_data.hex"
        fraw = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), parse=False)
        setup = h2o.parse_setup(fraw, destination_frame=hex_key, header=1, separator=',',
                                column_names=col_headers, column_types=col_types, na_strings=["NA"])
        assert_is_type(setup, H2OResponse)
        assert setup["number_columns"] == len(col_headers), "h2o.parse_setup() command is not working."
    except Exception as e:
        assert False, "h2o.parse_setup() command is not working: " + str(e)
def parse_false():
    fraw = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(fraw, list)

    fhex = h2o.parse_raw(h2o.parse_setup(fraw))
    fhex.summary()
    assert fhex.__class__.__name__ == "H2OFrame"
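The import-with-parse=False, parse_setup, parse_raw round trip above is the pattern the rest of these examples build on. A minimal sketch of inspecting and editing the setup dictionary before parsing, using only keys that already appear in this section:

fraw = h2o.import_file(tests.locate("smalldata/jira/hexdev_29.csv"), parse=False)
setup = h2o.parse_setup(fraw)
print(setup["number_columns"])     # column count guessed by the parser
print(setup["parse_type"])         # e.g. "CSV"; the SVMLight helper above overrides this
setup["column_types"][0] = "ENUM"  # override a guessed column type
fhex = h2o.parse_raw(setup)        # parse with the edited setup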
def get_data(self, src_bucket="cargo.ml.training", obj_name="training_sample.csv"):
    # boto3.setup_default_session(region_name='us-west-2')
    # s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KID, aws_secret_access_key=ACCESS_KEY)
    input_path = os.path.join(INPUT_PATH, 'training_sample_input/training_sample.csv')
    # s3_client.download_file(src_bucket, obj_name, input_path)
    df_raw = h2o.import_file(input_path, parse=False)
    setup = h2o.parse_setup(df_raw, destination_frame="training.hex", header=1,
                            column_names=self.col_headers, column_types=self.col_types)
    # parse with the configured setup so the column names and types apply
    df = h2o.parse_raw(setup, id='training.csv', first_line_is_header=1)
    print("Input dataframe: ", df)
    return df
def fromRawText(text_key, check_header=None):
    """
    Handle the result of upload_file.
    :param text_key: A key pointing to raw text to be parsed.
    :param check_header: Optional header flag forwarded to the parse setup.
    :return: Part of the H2OFrame constructor.
    """
    setup = h2o.parse_setup(text_key)
    if check_header is not None:
        setup["check_header"] = check_header
    res = H2OFrame._parse_raw(setup)
    print "Uploaded {} into cluster with {:,} rows and {:,} cols".format(text_key, res.nrow, res.ncol)
    return res
def h2oparse_raw():
    """
    Python API test: h2o.parse_raw(setup, id=None, first_line_is_header=0)
    Copied from pyunit_hexdev_29_parse_false.py.
    """
    fraw = h2o.import_file(pyunit_utils.locate("smalldata/jira/hexdev_29.csv"), parse=False)
    assert isinstance(fraw, list)

    fhex = h2o.parse_raw(h2o.parse_setup(fraw), id='hexdev_29.hex', first_line_is_header=0)
    fhex.summary()
    assert_is_type(fhex, H2OFrame)
def hexdev_394():
    path = tests.locate("smalldata/covtype/covtype.20k.data")
    trainraw = h2o.lazy_import(path)
    tsetup = h2o.parse_setup(trainraw)
    tsetup["column_types"][10] = "ENUM"
    tsetup["column_types"][11] = "ENUM"
    tsetup["column_types"][12] = "ENUM"

    train = h2o.parse_raw(tsetup)

    cols = train.col_names  # this returned a space for the first column name
    x_cols = [colname for colname in cols if colname != "C55"]

    splits = train.split_frame()
    newtrain = splits[0]
    newvalid = splits[1]
    newtrain_x = newtrain[x_cols]
    newtrain_y = newtrain[54].asfactor()
    newvalid_x = newvalid[x_cols]
    newvalid_y = newvalid[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y, validation_y=newvalid_y, x=newtrain_x,
                     validation_x=newvalid_x, distribution="multinomial",
                     ntrees=100, learn_rate=0.1, max_depth=6)

    split1, split2 = train.split_frame()
    newtrain_x = split1[x_cols]
    newtrain_y = split1[54].asfactor()
    newvalid_x = split2[x_cols]
    newvalid_y = split2[54].asfactor()

    my_gbm = h2o.gbm(y=newtrain_y, validation_y=newvalid_y, x=newtrain_x,
                     validation_x=newvalid_x, distribution="multinomial",
                     ntrees=100, learn_rate=0.1, max_depth=6)

    print "KEEPING FRAME???"
    print train._keep
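h2o.gbm is the legacy model-building entry point used above; on current h2o-3 clients the same model is normally built through the estimator API. A hedged sketch of the equivalent call, reusing the frames from the function above and assuming the response column is named C55:

from h2o.estimators.gbm import H2OGradientBoostingEstimator

# keep the response in the frame and mark it categorical for multinomial
newtrain["C55"] = newtrain["C55"].asfactor()
newvalid["C55"] = newvalid["C55"].asfactor()

gbm = H2OGradientBoostingEstimator(distribution="multinomial",
                                   ntrees=100, learn_rate=0.1, max_depth=6)
gbm.train(x=x_cols, y="C55", training_frame=newtrain, validation_frame=newvalid)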
def continuous_or_categorical(ip, port):
    fraw = h2o.import_file(h2o.locate("smalldata/jira/hexdev_29.csv"))
    fsetup = h2o.parse_setup(fraw)
    fsetup["column_types"][0] = "ENUM"
    fsetup["column_types"][1] = "ENUM"
    fsetup["column_types"][2] = "ENUM"
    df_hex = h2o.parse_raw(fsetup)

    df_hex.summary()

    assert df_hex["h1"].isfactor()
    assert df_hex["h2"].isfactor()
    assert df_hex["h3"].isfactor()
def continuous_or_categorical():
    fraw = h2o.lazy_import(tests.locate("smalldata/jira/hexdev_29.csv"))
    fsetup = h2o.parse_setup(fraw)
    fsetup["column_types"][0] = "ENUM"
    fsetup["column_types"][1] = "ENUM"
    fsetup["column_types"][2] = "ENUM"
    df_hex = h2o.parse_raw(fsetup)

    df_hex.summary()

    assert df_hex['h1'].isfactor()
    assert df_hex['h2'].isfactor()
    assert df_hex['h3'].isfactor()
def h2oparse_setup():
    """
    Python API test: h2o.parse_setup(raw_frames, destination_frame=None, header=0, separator=None,
    column_names=None, column_types=None, na_strings=None)
    """
    col_types = ['enum', 'numeric', 'enum', 'enum', 'enum', 'numeric', 'numeric', 'numeric']
    col_headers = ["CAPSULE", "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    hex_key = "training_data.hex"
    fraw = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"), parse=False)
    setup = h2o.parse_setup(fraw, destination_frame=hex_key, header=1, separator=',',
                            column_names=col_headers, column_types=col_types, na_strings=["NA"])
    assert_is_type(setup, H2OResponse)
    assert setup["number_columns"] == len(col_headers), "h2o.parse_setup() command is not working."
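The test above only validates the setup response itself. A natural follow-up, sketched here under the same assumptions rather than taken from the original test, is to feed the configured setup to h2o.parse_raw and check the resulting frame:

fhex = h2o.parse_raw(setup, id=hex_key, first_line_is_header=1)
assert fhex.ncol == len(col_headers)
assert fhex.names == col_headers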
def continuous_or_categorical():
    fraw = h2o.lazy_import(h2o.locate("smalldata/jira/hexdev_29.csv"))
    fsetup = h2o.parse_setup(fraw)
    fsetup["column_types"][0] = "ENUM"
    fsetup["column_types"][1] = "ENUM"
    fsetup["column_types"][2] = "ENUM"
    df_hex = h2o.parse_raw(fsetup)

    df_hex.summary()

    assert df_hex['h1'].isfactor()
    assert df_hex['h2'].isfactor()
    assert df_hex['h3'].isfactor()
def _handle_text_key(self, text_key, column_names):
    """
    Handle the result of upload_file.
    :param text_key: A key pointing to raw text to be parsed.
    :param column_names: Currently unused.
    :return: Part of the H2OFrame constructor.
    """
    # perform the parse setup
    setup = h2o.parse_setup(text_key)
    # blocking parse; the first line is always a header (since "we" wrote the data out)
    parse = h2o.parse(setup, H2OFrame.py_tmp_key(), first_line_is_header=1)
    # a hack to get the column names correct, since "parse" does not provide them
    cols = parse['column_names'] if parse["column_names"] \
        else ["C" + str(x) for x in range(1, len(parse['vec_ids']) + 1)]
    # set the rows
    rows = parse['rows']
    # set the vector keys
    veckeys = parse['vec_ids']
    # create a new vec[] array
    self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
    # print some information on the *uploaded* data
    print "Uploaded", text_key, "into cluster with", rows, "rows and", len(cols), "cols"
def _parse(rawkey, destination_frame="", header=None, separator=None,
           column_names=None, column_types=None, na_strings=None):
    setup = h2o.parse_setup(rawkey, destination_frame, header, separator,
                            column_names, column_types, na_strings)
    return H2OFrame._parse_raw(setup)
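A hypothetical call into the _parse helper above, assuming a raw key produced by an unparsed import; the path and frame name are placeholders:

raw = h2o.import_file("smalldata/jira/hexdev_29.csv", parse=False)
frame = _parse(raw, destination_frame="hexdev_29.hex", header=1, separator=",")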
def __init__(self, python_obj=None, local_fname=None, remote_fname=None, vecs=None, text_key=None):
    """
    Create a new H2OFrame object by passing a file path or a list of H2OVecs.

    If `remote_fname` is not None, then a REST call will be made to import the
    data specified at the location `remote_fname`. This path is relative to the
    H2O cluster, NOT the local Python process.

    If `local_fname` is not None, then the data is not imported into the H2O
    cluster at the time of object creation.

    If `python_obj` is not None, then an attempt to upload the python object to
    H2O will be made. A valid python object has type `list` or `dict`.

    For more information on the structure of the input for the various native
    python data types ("native" meaning non-H2O), please see the general
    documentation for this object.

    :param python_obj: A "native" python object - list, dict, tuple.
    :param local_fname: A local path to a data source. Data is python-process-local.
    :param remote_fname: A remote path to a data source. Data is cluster-local.
    :param vecs: A list of H2OVec objects.
    :param text_key: A raw key resulting from an upload_file.
    :return: An instance of an H2OFrame object.
    """
    self.local_fname = local_fname
    self.remote_fname = remote_fname
    self._vecs = None

    if python_obj is not None:  # avoids "the truth value of an array is ambiguous" error
        self._upload_python_object(python_obj)
        return

    # Import the data into the H2O cluster
    if remote_fname:
        rawkey = h2o.import_file(remote_fname)
        setup = h2o.parse_setup(rawkey)
        parse = h2o.parse(setup, H2OFrame.py_tmp_key())  # create a new key
        veckeys = parse['vec_ids']
        rows = parse['rows']
        cols = parse['column_names'] if parse["column_names"] \
            else ["C" + str(x) for x in range(1, len(veckeys) + 1)]
        self._vecs = H2OVec.new_vecs(zip(cols, veckeys), rows)
        print "Imported", remote_fname, "into cluster with", rows, "rows and", len(cols), "cols"

    # Read data locally into the python process
    elif local_fname:
        with open(local_fname, 'rb') as csvfile:
            self._vecs = []
            for name in csvfile.readline().split(','):
                self._vecs.append(H2OVec(name.rstrip(), Expr([])))
            for row in csv.reader(csvfile):
                for i, data in enumerate(row):
                    self._vecs[i].append(data)
        print "Imported", local_fname, "into local python process"

    # Construct from an array of Vecs already passed in
    elif vecs:
        vlen = len(vecs[0])
        for v in vecs:
            if not isinstance(v, H2OVec):
                raise ValueError("Not a list of Vecs")
            if len(v) != vlen:
                raise ValueError("Vecs not the same size: " + str(vlen) + " != " + str(len(v)))
        self._vecs = vecs

    elif text_key:
        self._handle_text_key(text_key, None)

    else:
        raise ValueError("Frame made from CSV file or an array of Vecs only")
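Per the docstring, the constructor is driven by exactly one of its mutually exclusive sources. Two hedged examples with placeholder data; the remote path must be visible to the H2O cluster, not just the local Python process:

# import a cluster-visible file (REST call; parsing happens in the cluster)
remote_frame = H2OFrame(remote_fname="smalldata/iris/iris.csv")

# upload a native python object (here a dict of columns)
local_frame = H2OFrame(python_obj={"a": [1, 2, 3], "b": [4, 5, 6]})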