def raw_to_h5():
    '''
    Transform the raw dataset into one of HDF5 type.
    '''
    X_raw = np.zeros((n_all, num_features), dtype=dtype_X)
    y_raw = np.zeros((n_all, num_labels), dtype=dtype_y)
    print("Preparation: {}".format(data_name))

    ## Read in the raw data.
    with open(toread, newline="") as f_table:
        print("Read {}.".format(toread))
        f_reader = csv.reader(f_table, delimiter=" ")

        ## Populate the placeholder numpy arrays.
        idx = 0
        for line in f_reader:
            if len(line) == 0:
                continue # do nothing for blank lines.

            ## Numpy arrays for individual instance.
            x, y = parse_line(x=line[0:-1], y=line[-1])
            if x is None:
                continue # skip instances with missing values.
            else:
                X_raw[idx, :] = x
                y_raw[idx, 0] = y

            ## Update the index (also counts the clean data points).
            idx += 1

        ## Check that the number of *clean* instances is as expected.
        print("Number of clean instances: {}. Note n_all = {}.".format(idx, n_all))

    ## Create and populate the HDF5 file.
    makedir_safe(newdir)
    with tables.open_file(towrite, mode="w", title=title) as myh5:
        myh5.create_array(where=myh5.root, name="X", obj=X_raw,
                          atom=atom_X, title=title_X)
        myh5.create_array(where=myh5.root, name="y", obj=y_raw,
                          atom=atom_y, title=title_y)
        print(myh5)
        print("Wrote {}.".format(towrite))

    ## Exit all context managers before returning.
    print("Done ({}).".format(data_name))
    return None
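## "parse_line" is defined elsewhere in this script; the loop above only
## relies on it returning (None, None) for rows with missing values and
## numeric values otherwise. A minimal sketch of such behavior is given
## below; the missing-value marker "?" is an assumption, not taken from the
## original code.

import numpy as np

def parse_line(x, y):
    '''Parse one CSV row into (features, label), or (None, None) if any
    feature value is missing.'''
    if any(val in ("", "?") for val in x):
        return None, None
    return np.array(x, dtype=np.float64), np.float64(y)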
def raw_to_h5():
    '''
    Transform the raw dataset into one of HDF5 type.
    '''
    X_raw = np.zeros((n_all, num_features), dtype=dtype_X)
    y_raw = np.zeros((n_all, num_labels), dtype=dtype_y)
    print("Preparation: {}".format(data_name))

    ## Read in the raw data.
    with open(toread, newline="") as f_table:
        print("Read {}.".format(toread))
        f_reader = csv.reader(f_table, delimiter=",")

        ## Populate the placeholder numpy arrays.
        i = 0
        for line in f_reader:
            if len(line) > 0:
                X_raw[i, :] = np.array(line[0:-1], dtype=X_raw.dtype)
                y_raw[i, 0] = np.array(line[-1], dtype=y_raw.dtype) - 1 # shift labels to start from zero.
                i += 1

    ## Create and populate the HDF5 file.
    makedir_safe(newdir)
    with tables.open_file(towrite, mode="w", title=title) as myh5:
        myh5.create_array(where=myh5.root, name="X", obj=X_raw,
                          atom=atom_X, title=title_X)
        myh5.create_array(where=myh5.root, name="y", obj=y_raw,
                          atom=atom_y, title=title_y)
        print(myh5)
        print("Wrote {}.".format(towrite))

    ## Exit all context managers before returning.
    print("Done ({}).".format(data_name))
    return None
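## Once written, the HDF5 file can be checked by re-opening it in read mode;
## the path below is a placeholder for whatever "towrite" points to.

import tables

with tables.open_file("data.h5", mode="r") as myh5:
    X = myh5.root.X.read()  # numpy array of shape (n_all, num_features).
    y = myh5.root.y.read()  # numpy array of shape (n_all, num_labels).
    print("X:", X.shape, X.dtype)
    print("y:", y.shape, y.dtype)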
'''A simple helper which picks up the master directory for saving data.'''

## External modules.
import os
import sys

## Internal modules.
from mml.config import dir_data_toread
from mml.utils import makedir_safe


###############################################################################


if __name__ == "__main__":

    try:
        newdir = os.path.join(dir_data_toread, sys.argv[1])
        makedir_safe(newdir)
        print(newdir)
    except IndexError:
        print("Please pass a dataset name to this script.")


###############################################################################
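## The helper above is intended to be run from the command line with a single
## dataset name argument; a rough stand-alone equivalent of what it does is
## sketched below. The directory value and dataset name are placeholders, and
## os.makedirs(..., exist_ok=True) stands in here for makedir_safe.

import os

dir_data_toread = "/path/to/data"  # placeholder for the mml.config value.
dataset_name = "iris"              # stands in for sys.argv[1].
newdir = os.path.join(dir_data_toread, dataset_name)
os.makedirs(newdir, exist_ok=True)
print(newdir)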
def raw_to_h5():
    '''
    Transform the raw dataset into one of HDF5 type.
    '''
    X_raw_tr = np.zeros((n_tr, num_features), dtype=dtype_X)
    X_raw_te = np.zeros((n_te, num_features), dtype=dtype_X)
    y_raw_tr = np.zeros((n_tr, num_labels), dtype=dtype_y)
    y_raw_te = np.zeros((n_te, num_labels), dtype=dtype_y)
    i_tr = 0
    print("Preparation: {}".format(data_name))

    ## Loop over batches, and populate *_raw_tr.
    for num_batch in range(num_batches):
        toread = get_toread_tr(num=num_batch)
        with open(toread, mode="rb") as f_bin:
            print("Read {}.".format(toread))
            for i in range(n_tr_perbatch):
                if i_tr % 5000 == 0:
                    print("(tr) Working... image {}.".format(i_tr))
                y_raw_tr[i_tr, 0] = int.from_bytes(f_bin.read(1),
                                                   byteorder="big",
                                                   signed=False)
                for j in range(num_features):
                    X_raw_tr[i_tr, j] = int.from_bytes(f_bin.read(1),
                                                       byteorder="big",
                                                       signed=False)
                i_tr += 1

    ## Populate *_raw_te.
    with open(toread_te, mode="rb") as f_bin:
        print("Read {}.".format(toread_te))
        for i in range(n_te):
            if i % 1000 == 0:
                print("(te) Working... image {}.".format(i))
            y_raw_te[i, 0] = int.from_bytes(f_bin.read(1),
                                            byteorder="big",
                                            signed=False)
            for j in range(num_features):
                X_raw_te[i, j] = int.from_bytes(f_bin.read(1),
                                                byteorder="big",
                                                signed=False)

    ## Concatenate.
    X_raw = np.vstack((X_raw_tr, X_raw_te))
    y_raw = np.vstack((y_raw_tr, y_raw_te))

    ## Create and populate the HDF5 file.
    makedir_safe(newdir)
    with tables.open_file(towrite, mode="w", title=title) as myh5:
        myh5.create_array(where=myh5.root, name="X", obj=X_raw,
                          atom=atom_X, title=title_X)
        myh5.create_array(where=myh5.root, name="y", obj=y_raw,
                          atom=atom_y, title=title_y)
        print(myh5)
        print("Wrote {}.".format(towrite))

    ## Exit all context managers before returning.
    print("Done ({}).".format(data_name))
    return None
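## The byte-at-a-time loops above are easy to follow but slow in Python. If
## each record is one unsigned label byte followed by num_features unsigned
## feature bytes (a CIFAR-10-style binary batch layout, which is an assumption
## here rather than something stated in the original code), a vectorized
## alternative using np.fromfile could look like the sketch below; the
## function name is illustrative only.

import numpy as np

def read_binary_batch(path, n_records, num_features):
    '''Read one binary batch into (X, y) uint8 arrays.'''
    record_len = 1 + num_features
    raw = np.fromfile(path, dtype=np.uint8, count=n_records*record_len)
    raw = raw.reshape((n_records, record_len))
    y = raw[:, 0].reshape((n_records, 1))  # first byte of each record.
    X = raw[:, 1:]                         # remaining feature bytes.
    return X, y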
rg = np.random.default_rng(seed=ss)

## Parse the arguments passed via command line.
args = parser.parse_args()
if args.data is None:
    raise TypeError("Given --data=None, should be a string.")

## Name to be used identifying the results etc. of this experiment.
towrite_name = args.task_name+"-"+"_".join([args.model, args.algo])

## Model class must be initialized here, to ensure all sub-procs get access.
Model_class, paras_todo = get_model(model_class=args.model)

## Prepare a directory to save results.
towrite_dir = os.path.join(results_dir, "torch", args.data)
makedir_safe(towrite_dir)


## Main process.

if __name__ == "__main__":

    ## Device settings.
    use_cuda = args.cuda and torch.cuda.is_available()
    dev = torch.device("cuda" if use_cuda else "cpu")
    print("cuda.is_available():", torch.cuda.is_available())
    print("args.cuda:", args.cuda)
    print("use_cuda:", use_cuda, "\n")

    ## Arguments for the data loaders.
    dl_kwargs = {"batch_size": args.batch_size, "shuffle": True}
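## A minimal sketch of how "dl_kwargs" and "dev" are typically consumed once
## a dataset is available; the TensorDataset built from random tensors below
## is purely illustrative and not part of the original script.

import torch
from torch.utils.data import TensorDataset, DataLoader

dl_kwargs = {"batch_size": 32, "shuffle": True}
dev = torch.device("cpu")

dataset = TensorDataset(torch.randn(128, 10),
                        torch.randint(low=0, high=2, size=(128,)))
loader = DataLoader(dataset, **dl_kwargs)
for X_batch, y_batch in loader:
    X_batch, y_batch = X_batch.to(dev), y_batch.to(dev)
    break  # forward/backward passes would follow here.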
w_star = np.ones(d).reshape((d, 1))
w_init = np.copy(w_star)
w_init += rg.uniform(low=-init_range, high=init_range, size=d).reshape((d, 1))

## Data setup is somewhat specialized here.
cov_X = np.eye(d) # covariance matrix of the inputs.
_var_noise, _mean_noise, _gen = subtask["get_gen"](level=dist_level, nval=n)
var_noise = _var_noise[subtask_name]
mean_noise = _mean_noise[subtask_name]
gen_epsilon = _gen[subtask_name]

## Prepare a directory to write the results.
towrite = os.path.join("results", task_name, subtask_name)
makedir_safe(towrite)

## Prepare the method itinerary.
itin = parse_itin(task_name)
method_keys = itin.methods.keys()

## Print the experiment information, and write it to disk.
write_expinfo(task_name=task_name, subtask_name=subtask_name,
              details=subtask, itin=itin)

## Prepare for performance evaluation.
perf_fn = parse_perf(task_name)
perf_names = perf_fn(model=None, algo=None,
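## The variables above (w_star, cov_X, and a noise generator) suggest a noisy
## linear model y = X w* + epsilon. A minimal sketch of generating such data
## under that assumption is given below; standard normal noise stands in for
## gen_epsilon, and the sample size and dimension are placeholders.

import numpy as np

rg = np.random.default_rng(seed=0)
n, d = 100, 5
w_star = np.ones(d).reshape((d, 1))
cov_X = np.eye(d)
X = rg.multivariate_normal(mean=np.zeros(d), cov=cov_X, size=n)
epsilon = rg.normal(loc=0.0, scale=1.0, size=(n, 1))
y = X.dot(w_star) + epsilon
print(X.shape, y.shape)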