Example #1
def raw_to_h5():
    '''
    Transform the raw dataset into one of HDF5 type.
    '''

    X_raw = np.zeros((n_all, num_features), dtype=dtype_X)
    y_raw = np.zeros((n_all, num_labels), dtype=dtype_y)

    print("Preparation: {}".format(data_name))

    ## Read in the raw data.
    with open(toread, newline="") as f_table:

        print("Read {}.".format(toread))

        f_reader = csv.reader(f_table, delimiter=" ")

        ## Populate the placeholder numpy arrays.
        idx = 0
        for line in f_reader:

            if len(line) == 0:
                continue  # do nothing for blank lines.

            ## Numpy arrays for individual instance.
            x, y = parse_line(x=line[0:-1], y=line[-1])

            if x is None:
                continue  # skip instances with missing values.
            else:
                X_raw[idx, :] = x
                y_raw[idx, 0] = y

            ## Update the index (also counts the clean data points).
            idx += 1

        ## Check that number of *clean* instances is as expected.
        print("Number of clean guys: {}. Note n_all = {}".format(idx, n_all))

    ## Create and populate the HDF5 file.
    makedir_safe(newdir)
    with tables.open_file(towrite, mode="w", title=title) as myh5:
        myh5.create_array(where=myh5.root,
                          name="X",
                          obj=X_raw,
                          atom=atom_X,
                          title=title_X)
        myh5.create_array(where=myh5.root,
                          name="y",
                          obj=y_raw,
                          atom=atom_y,
                          title=title_y)
        print(myh5)

    print("Wrote {}.".format(towrite))

    ## Exit all context managers before returning.
    print("Done ({}).".format(data_name))
    return None
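
Example #1 relies on a parse_line helper that is not shown. A minimal sketch of what it could look like, assuming numeric string fields and a "?" marker for missing values (the dtype and the missing-value convention are assumptions, not part of the original snippet):

import numpy as np

def parse_line(x, y):
    '''
    Convert one row of string fields into numpy-ready values.
    Returns (None, None) when any feature is missing, assuming
    "?" marks a missing value (an assumption for this sketch).
    '''
    if "?" in x:
        return None, None
    x_arr = np.array(x, dtype=np.float64)  # assumed dtype.
    y_val = np.float64(y)
    return x_arr, y_val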
Example #2
def raw_to_h5():
    '''
    Transform the raw dataset into one of HDF5 type.
    '''

    X_raw = np.zeros((n_all, num_features), dtype=dtype_X)
    y_raw = np.zeros((n_all, num_labels), dtype=dtype_y)

    print("Preparation: {}".format(data_name))

    ## Read in the raw data.
    with open(toread, newline="") as f_table:

        print("Read {}.".format(toread))

        f_reader = csv.reader(f_table, delimiter=",")

        ## Populate the placeholder numpy arrays.
        i = 0
        for line in f_reader:
            if len(line) > 0:
                X_raw[i, :] = np.array(line[0:-1], dtype=X_raw.dtype)
                y_raw[i, 0] = np.array(line[-1],
                                       dtype=y_raw.dtype) - 1  # shift labels down by one.
                i += 1  # only count non-empty rows.

    ## Create and populate the HDF5 file.
    makedir_safe(newdir)
    with tables.open_file(towrite, mode="w", title=title) as myh5:
        myh5.create_array(where=myh5.root,
                          name="X",
                          obj=X_raw,
                          atom=atom_X,
                          title=title_X)
        myh5.create_array(where=myh5.root,
                          name="y",
                          obj=y_raw,
                          atom=atom_y,
                          title=title_y)
        print(myh5)

    print("Wrote {}.".format(towrite))

    ## Exit all context managers before returning.
    print("Done ({}).".format(data_name))
    return None
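
For reference, the arrays written by raw_to_h5 can be read back with the standard PyTables API. A minimal sketch, assuming the same towrite path used above:

import tables

with tables.open_file(towrite, mode="r") as myh5:
    X = myh5.root.X.read()  # full feature matrix as a numpy array.
    y = myh5.root.y.read()  # label column.
    print(X.shape, y.shape)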
Example #3
'''A simple helper which picks up the master directory for saving data.'''

## External modules.
import os
import sys

## Internal modules.
from mml.config import dir_data_toread
from mml.utils import makedir_safe

###############################################################################

if __name__ == "__main__":

    try:
        newdir = os.path.join(dir_data_toread, sys.argv[1])
        makedir_safe(newdir)
        print(newdir)
    except IndexError:
        print("Please pass some value to this script")

###############################################################################
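
The makedir_safe utility imported from mml.utils is used throughout these examples but never shown. A plausible minimal version (an assumption about the implementation, not the library's actual code):

import os

def makedir_safe(dirname):
    '''
    Create the directory if it does not already exist,
    without raising an error when it does.
    '''
    os.makedirs(dirname, exist_ok=True)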
Example #4
def raw_to_h5():
    '''
    Transform the raw dataset into one of HDF5 type.
    '''
    
    X_raw_tr = np.zeros((n_tr,num_features), dtype=dtype_X)
    X_raw_te = np.zeros((n_te,num_features), dtype=dtype_X)
    y_raw_tr = np.zeros((n_tr,num_labels), dtype=dtype_y)
    y_raw_te = np.zeros((n_te,num_labels), dtype=dtype_y)
    
    i_tr = 0
    
    print("Preparation: {}".format(data_name))
    
    ## Loop over batches, and populate *_raw_tr.
    for num_batch in range(num_batches):
        
        toread = get_toread_tr(num=num_batch)
        
        with open(toread, mode="rb") as f_bin:
        
            print("Read {}.".format(toread))
            
            for i in range(n_tr_perbatch):
                
                if i_tr % 5000 == 0:
                    print("(tr) Working... image {}.".format(i_tr))
                
                y_raw_tr[i_tr,0] = int.from_bytes(f_bin.read(1),
                                                  byteorder="big",
                                                  signed=False)
                for j in range(num_features):
                    X_raw_tr[i_tr,j] = int.from_bytes(f_bin.read(1),
                                                      byteorder="big",
                                                      signed=False)
                i_tr += 1

    ## Populate *_raw_te.
    with open(toread_te, mode="rb") as f_bin:
        
        print("Read {}.".format(toread_te))
        
        for i in range(n_te):
            
            if i % 1000 == 0:
                print("(te) Working... image {}.".format(i))
            
            y_raw_te[i,0] = int.from_bytes(f_bin.read(1),
                                           byteorder="big",
                                           signed=False)
            for j in range(num_features):
                X_raw_te[i,j] = int.from_bytes(f_bin.read(1),
                                               byteorder="big",
                                               signed=False)
    
    ## Concatenate.
    X_raw = np.vstack((X_raw_tr, X_raw_te))
    y_raw = np.vstack((y_raw_tr, y_raw_te))
    
    ## Create and populate the HDF5 file.
    makedir_safe(newdir)
    with tables.open_file(towrite, mode="w", title=title) as myh5:
        myh5.create_array(where=myh5.root,
                          name="X",
                          obj=X_raw,
                          atom=atom_X,
                          title=title_X)
        myh5.create_array(where=myh5.root,
                          name="y",
                          obj=y_raw,
                          atom=atom_y,
                          title=title_y)
        print(myh5)
    
    print("Wrote {}.".format(towrite))
    
    ## Exit all context managers before returning.
    print("Done ({}).".format(data_name))
    return None
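
The byte-by-byte int.from_bytes loops above are correct but slow for large files. A sketch of an equivalent vectorized read using np.frombuffer, assuming the same record layout as the snippet (one unsigned label byte followed by num_features unsigned pixel bytes per image, as in the CIFAR binary format); the function name is illustrative:

import numpy as np

def read_batch_vectorized(path, n_images, num_features):
    '''
    Read a binary batch where each record is one label byte
    followed by num_features pixel bytes, all unsigned.
    '''
    record_len = 1 + num_features
    with open(path, mode="rb") as f_bin:
        raw = np.frombuffer(f_bin.read(), dtype=np.uint8)
    raw = raw.reshape((n_images, record_len))
    y = raw[:, 0].astype(np.int64)     # labels.
    X = raw[:, 1:].astype(np.float64)  # pixel values.
    return X, y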
Example #5
rg = np.random.default_rng(seed=ss)

## Parse the arguments passed via command line.
args = parser.parse_args()
if args.data is None:
    raise TypeError("Given --data=None, should be a string.")

## Name to be used identifying the results etc. of this experiment.
towrite_name = args.task_name + "-" + "_".join([args.model, args.algo])

## The model class must be initialized here so that all sub-processes get access to it.
Model_class, paras_todo = get_model(model_class=args.model)

## Prepare a directory to save results.
towrite_dir = os.path.join(results_dir, "torch", args.data)
makedir_safe(towrite_dir)

## Main process.

if __name__ == "__main__":

    ## Device settings.
    use_cuda = args.cuda and torch.cuda.is_available()
    dev = torch.device("cuda" if use_cuda else "cpu")
    print("cuda.is_available():", torch.cuda.is_available())
    print("args.cuda:", args.cuda)
    print("use_cuda:", use_cuda, "\n")
    
    ## Arguments for the data loaders.
    dl_kwargs = {"batch_size": args.batch_size,
                 "shuffle": True}
Example #6
    w_star = np.ones(d).reshape((d, 1))
    w_init = np.copy(w_star)
    w_init += rg.uniform(low=-init_range, high=init_range, size=d).reshape(
        (d, 1))

    ## Data setup is somewhat specialized here.
    cov_X = np.eye(d)  # covariance matrix of the inputs.
    _var_noise, _mean_noise, _gen = subtask["get_gen"](level=dist_level,
                                                       nval=n)
    var_noise = _var_noise[subtask_name]
    mean_noise = _mean_noise[subtask_name]
    gen_epsilon = _gen[subtask_name]

    ## Prepare a directory to write the results.
    towrite = os.path.join("results", task_name, subtask_name)
    makedir_safe(towrite)

    ## Prepare the method itinerary.
    itin = parse_itin(task_name)
    method_keys = itin.methods.keys()

    ## Print the experiment information, and write it to disk.
    write_expinfo(task_name=task_name,
                  subtask_name=subtask_name,
                  details=subtask,
                  itin=itin)

    ## Prepare for performance evaluation.
    perf_fn = parse_perf(task_name)
    perf_names = perf_fn(model=None,
                         algo=None,
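
Example #6 breaks off mid-call, but its setup implies a standard linear model: inputs with identity covariance (cov_X = np.eye(d)) and additive noise drawn from gen_epsilon. A minimal sketch of data generation under that model, with a unit Gaussian standing in for gen_epsilon (the function name and noise choice are illustrative assumptions):

import numpy as np

def generate_linear_data(n, d, w_star, rg):
    '''
    Draw n inputs from N(0, I_d) and responses
    y = X @ w_star + epsilon, with Gaussian noise
    standing in for the task's gen_epsilon.
    '''
    X = rg.normal(loc=0.0, scale=1.0, size=(n, d))
    epsilon = rg.normal(loc=0.0, scale=1.0, size=(n, 1))
    y = X.dot(w_star) + epsilon
    return X, y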