Example #1
     
 vprint( verbose,  "Using input_dir: " + input_dir)
 vprint( verbose,  "Using output_dir: " + output_dir)
     
 # Move old results and create a new output directory 
 if not(running_on_codalab) and save_previous_results:
     data_io.mvdir(output_dir, output_dir+'_'+the_date) 
 data_io.mkdir(output_dir) 
 
 #### INVENTORY DATA (and sort dataset names alphabetically)
 datanames = data_io.inventory_data(input_dir)
 # Overwrite the "natural" order
 
 #### DEBUG MODE: Show dataset list and STOP
 if debug_mode>=3:
     data_io.show_io(input_dir, output_dir)
     print('\n****** Sample code version ' + str(version) + ' ******\n\n' + '========== DATASETS ==========\n')        	
     data_io.write_list(datanames)      
     datanames = [] # Do not proceed with learning and testing
     
 # ==================== @RESULT SUBMISSION (KEEP THIS) =====================
 # Always keep this code to enable result submission of pre-calculated results
 # deposited in the res/ subdirectory.
 if len(datanames)>0:
     vprint( verbose,  "************************************************************************")
     vprint( verbose,  "****** Attempting to copy files (from res/) for RESULT submission ******")
     vprint( verbose,  "************************************************************************")
     datanames = data_io.copy_results(datanames, res_dir, output_dir, verbose) # DO NOT REMOVE!
     if not datanames: 
         vprint( verbose,  "[+] Results copied to output directory, no model trained/tested")
     else:
Example #2
sys.path.append("libs")

default_input_dir="C:\\Users\\vmkocheg\\Documents\\MLContest\\Phase2\\input"
default_output_dir="C:\\Users\\vmkocheg\\Documents\\MLContest\\Phase2\\output"
if len(argv)==1: # Use the default input and output directories if no arguments are provided
    input_dir = default_input_dir
    output_dir = default_output_dir
else:
    input_dir = argv[1]
    output_dir = os.path.abspath(argv[2])

#### INVENTORY DATA (and sort dataset names alphabetically)
datanames = data_io.inventory_data(input_dir)
#### DEBUG MODE: Show dataset list and STOP
if debug_mode>=3:
    data_io.show_io(input_dir, output_dir)
    data_io.write_list(datanames)
    datanames = [] # Do not proceed with learning and testing


for basename in datanames: # Loop over datasets
    if basename not in ["robert"]:
        continue

    vprint( verbose,  "************************************************")
    vprint( verbose,  "******** Processing dataset " + basename.capitalize() + " ********")
    vprint( verbose,  "************************************************")

    # ======== Learning on a time budget:
    # Keep track of time so as not to exceed your time budget. Time spent inventorying the data is neglected.
    start = time.time()
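    # A minimal, illustrative sketch (not part of the original script) of how the
    # remaining budget could be checked inside this loop; it assumes a
    # `time_budget` variable is defined earlier in the script:
    #   time_spent = time.time() - start
    #   time_left_over = time_budget - time_spent
    #   if time_left_over <= 0:
    #       vprint(verbose, "[-] Time budget exceeded, skipping " + basename)
    #       continue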
Example #3
def predictSpatioTemporal(step_num, input_dir, output_dir, code_dir,
                          ext='.h5', verbose=True, debug_mode=0,
                          time_budget=300, max_samples=0,
                          AR_order=1, I_order=0, MA_order=0,
                          num_predicted_frames=8,
                          save_model=False, cache_data=False,
                          cache_dir="",
                          version=0.1):
    ''' Main spatio-temporal prediction function.
    step_num
        Current file number n being processed; the corresponding input file is Xn.h5.
    input_dir
        Input directory in which the training/adaptation data are found
        in two subdirectories train/ and adapt/
    output_dir
        Output directory in which we expect Yn+1.h5 predictions to be deposited.
        The next num_predicted_frames frames must be predicted.
    code_dir
        The directory to which the participant submissions are unzipped.
    ext
        The file extensions of input and output data
    verbose
        if True, debug messages are printed
    debug_mode
        0: run the code normally, using the time budget of the task
        1: run the code normally, but limit the time to max_time
        2: run everything, but do not train, use persistence
        3: just list the directories and program version
    time_budget
        Maximum total running time in seconds.
        The code should keep track of time spent and NOT exceed the time limit.
    max_samples
        Maximum number of training samples loaded.
        Allows you to limit the number of training samples read for speed-up.
    Model order
        The order of an ARIMA model.
        Your training algorithm may be slow, so you may want to limit
        the window of past frames used.
        AR_order = 1 # Persistence is order 1
        I_order = 0
        MA_order = 0
    num_predicted_frames
        Number of frames to be predicted in the future.
    save_model
        Models can optionally be pre-trained, saved, and re-loaded.
    cache_data
        Data that were loaded in the past can be cached in some
        binary format for faster reload.
    cache_dir
        A directory in which to cache data.
    version
        This code's version.
    '''
    #### Check whether everything went well (no time exceeded)
    execution_success = True
    start_time = time.time()         # <== Mark starting time
    if not cache_dir: cache_dir = code_dir # For the moment it is the code directory

    path.append(code_dir)
    path.append(os.path.join(code_dir, 'sample_code'))
    import data_io
    from data_io import vprint
    from data_manager import DataManager # load/save data and get info about them
    from model import Model              # example model implementing persistence

    vprint( verbose,  "\n====> STEP: " + str(step_num))
    vprint( verbose,  "Using input_dir: " + input_dir)
    vprint( verbose,  "Using output_dir: " + output_dir)
    vprint( verbose,  "Using code_dir: " + code_dir)
    vprint( verbose,  "Using cache_dir: " + cache_dir)

    # Make a result directory and cache_dir if they do not exist
    data_io.mkdir(output_dir)
    data_io.mkdir(cache_dir)

    # List various directories
    if debug_mode >= 3:
        vprint( verbose,  "This code version is " + str(version))
        data_io.show_version()
        data_io.show_dir(os.getcwd()) # Run directory
        data_io.show_io(input_dir, output_dir)
        data_io.show_dir(output_dir)

    # Our libraries
    path.append(code_dir)

    #### START WORKING ####  ####  ####  ####  ####  ####  ####  ####  ####
    vprint( verbose,  "************************************************")
    vprint( verbose,  "******** Processing data chunk number " + str(step_num) + " ********")
    vprint( verbose,  "************************************************")

    # Instantiate data and model objects
    if cache_data:
        cache_file = os.path.join(cache_dir, "Din.pickle")
    else:
        cache_file = ""
    Din = DataManager(datatype="input", verbose=verbose, cache_file=cache_file)
    Dout = DataManager(datatype="output", verbose=verbose)
    M = Model(hyper_param=(AR_order, I_order, MA_order), path=code_dir, verbose=verbose)

    # Read data training frames and train
    if step_num == 0:
        # First time we read the training data.
        train_data_dir = os.path.join(input_dir, "train")
        Din.loadTrainData(train_data_dir, max_samples=max_samples)
        # Train the model
        M.train(Din.X, Din.t) # The X matrix is the time series; the t vector holds the (optional) time indices
    else:
        # Reload the already trained model and data (warm start)
        if save_model:
            M.load(path=cache_dir)
        if cache_data:
            Din.reloadData('Din', data_dir=cache_dir, format='pickle')

    # Read additional frames and append them.
    adapt_data_dir = os.path.join(input_dir, "adapt")
    Din.appendSamples(step_num, adapt_data_dir)

    # Save data for future re-use (we do not forget anything at the moment,
    # but this may be wasteful in time and memory). We especially may not need
    # the training data.
    if cache_data:
        Din.saveData('Din', data_dir=cache_dir, format='pickle')

    # Adapt the model. We pass all the data we have, the model is supposed to
    # know how to use a window of data in the past.
    M.adapt(Din.X, Din.t)
    # To save the effort of re-computing the predictions made by the old model,
    # we could re-load the past predictions (still available in the output
    # directory) and correct them. For simplicity, we do not do that here.
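    # Purely illustrative, commented-out sketch of that idea; `Dprev` and the
    # M.correct() helper are hypothetical names, not part of this code base:
    #   Dprev = DataManager(datatype="output", verbose=verbose)
    #   Dprev.reloadData('Y' + str(step_num - 1), data_dir=output_dir, format='h5')
    #   M.correct(Dprev.X)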

    # Optionally, save the model for future re-use (warm start)
    if save_model:
        M.save(path=cache_dir)

    # Make predictions
    Dout.X = M.predict(Din.X, num_predicted_frames=num_predicted_frames)
    Dout.t = np.array(range(1, Dout.X.shape[0]+1))

    # Save predictions
    Dout.saveData('Y' + str(step_num), data_dir=output_dir, format="h5")

    time_spent = time.time() - start_time
    time_left_over = time_budget - time_spent
    if time_left_over>0:
        vprint( verbose,  "[+] Done")
        vprint( verbose,  "[+] Time spent %5.2f sec " % time_spent + "::  Time budget %5.2f sec" % time_budget)
    else:
        execution_success = False
        vprint( verbose,  "[-] Time exceeded")
        vprint( verbose,  "[-] Time spent %5.2f sec " % time_spent + " > Time budget %5.2f sec" % time_budget)

    return execution_success
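
# A minimal usage sketch (not part of the original code): call
# predictSpatioTemporal() for a few consecutive steps with the default
# persistence model. The directory names below are placeholders only.
if __name__ == "__main__":
    for step in range(3):
        success = predictSpatioTemporal(step, "sample_input", "sample_output",
                                        "sample_code_dir", time_budget=300)
        if not success:
            break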
Example #4
        except Exception:
            pass

    # Move old results and create a new output directory 
    if not(running_on_codalab) and save_previous_results:
        data_io.mvdir(res_dir, res_dir+'_'+the_date) 
    data_io.mkdir(res_dir) 
    
    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(data_dir)
    # Overwrite the "natural" order
    
    #### DEBUG MODE: Show dataset list and STOP
    if debug_mode>=3:
        data_io.show_io(data_dir, res_dir)
        logger.info('****** Sample code version ' + str(version) + ' ******')
        logger.info('========== DATASETS ==========')
        data_io.write_list(datanames)      
        datanames = [] # Do not proceed with learning and testing
        

    # =================== NEW AutoSKLEARN challenge REMOVED @RESULT SUBMISSION (KEEP THIS) ==================

    # ================ @CODE SUBMISSION (SUBSTITUTE YOUR CODE) =================
    time_left_over = 0
    for basename in datanames: # Loop over datasets

        filename_valid = basename + '_valid.predict'
        filename_test = basename + '_test.predict'
        try:
Example #5
if __name__ == "__main__":
    print("****** The ingestion program generates the result  ******\n")
    datanames = data_io.inventory_data(input_directory)

    if len(datanames) == 0:
        print("****** No data found ******")

    # Loop over datasets
    for basename in datanames:
        print("****** Processing " + basename.capitalize() + " ******")
        # Here the ingestion program should process the participant submission (but we don't bother)
        X = data_io.data(
            path.join(input_directory, basename, basename + '_valid.data'))
        Yvalid = random.rand(X.shape[0])
        X = data_io.data(
            path.join(input_directory, basename, basename + '_test.data'))
        # This is where we would call the participant submission and use their code:
        Ytest = random.rand(X.shape[0])
        # Write results to files
        data_io.write(path.join(output_directory, basename + '_valid.predict'),
                      Yvalid)
        data_io.write(path.join(output_directory, basename + '_test.predict'),
                      Ytest)

    # Lots of debug code...
    data_io.show_io(input_directory, output_directory)
    data_io.show_version()

    exit(0)
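
# Purely illustrative, commented-out sketch of how a participant model might
# replace the random baseline above; the `model` module, its Model.predict()
# method, and the X_valid/X_test names are hypothetical placeholders:
#   from model import Model
#   M = Model()
#   Yvalid = M.predict(X_valid)
#   Ytest = M.predict(X_test)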