Example #1
def load_dataset(input_dir, name):
    """ Load a TFRecords dataset (AutoDL format).
    """
    input_dir = os.path.join(input_dir, name)
    test_labels_file = os.path.join(input_dir, name + '.solution')
    test_labels = np.array(pd.read_csv(test_labels_file, header=None, sep=' '))
    data_dir = name + '.data'
    train = AutoDLDataset(os.path.join(input_dir, data_dir, 'train'))
    test = AutoDLDataset(os.path.join(input_dir, data_dir, 'test'))
    return name, train, test, test_labels
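A minimal usage sketch, assuming the standard AutoDL layout where input_dir/<name> contains <name>.data/{train,test} and <name>.solution (the path and dataset name below are illustrative):

# Hypothetical paths, for illustration only.
name, d_train, d_test, y_test = load_dataset('/path/to/autodl_datasets', 'adult')
print(y_test.shape)  # (num_test_examples, output_dim)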
Example #2
    def read_data(self):
        """Given a dataset directory, read and return training/test set data as
        `AutoDLDataset` objects, along with other information.

        Args:
          dataset_dir: a string indicating the absolute or relative path of a
            formatted AutoDL dataset.
        Returns:
          d_train, d_test: 2 `AutoDLDataset` objects, containing training/test data.
          other_info: a dict containing some additional info on the dataset, e.g.
            the metadata on the column names and class names (contained in
            `label_to_index_map`).
        """
        dataset_dir = self.dataset_dir
        files = os.listdir(dataset_dir)
        data_files = [x for x in files if x.endswith('.data')]
        assert len(data_files) == 1
        dataset_name = data_files[0][:-5]
        solution_files = [x for x in files if x.endswith('.solution')]
        with_solution = None  # With or without solution (i.e. training or test)
        if len(solution_files) == 1:
            solution_dataset_name = solution_files[0][:-9]
            if solution_dataset_name == dataset_name:
                with_solution = True
            else:
                raise ValueError("Wrong dataset name. Should be {} but got {}."\
                                 .format(dataset_name, solution_dataset_name))
        elif not solution_files:
            with_solution = False
        else:
            raise ValueError("Multiple solution files found:" +
                             " {}".format(solution_files))
        d_train = AutoDLDataset(
            os.path.join(dataset_dir, dataset_name + '.data', "train"))
        d_test = AutoDLDataset(
            os.path.join(dataset_dir, dataset_name + '.data', "test"))
        other_info = {}
        other_info['dataset_name'] = dataset_name
        other_info['with_solution'] = with_solution
        label_to_index_map = d_train.get_metadata().get_label_to_index_map()
        if label_to_index_map:
            classes_list = [None] * len(label_to_index_map)
            for label in label_to_index_map:
                index = label_to_index_map[label]
                classes_list[index] = label
            other_info['classes_list'] = classes_list
        else:
            tf.logging.info(
                "No label_to_index_map found in metadata. Labels will "
                "only be represented by integers.")
        self.d_train, self.d_test, self.other_info = d_train, d_test, other_info
        if with_solution:
            solution_path = os.path.join(dataset_dir, solution_files[0])
            self.other_info['Y_test'] = np.loadtxt(solution_path)
        return d_train, d_test, other_info
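A short call sketch; the owning class is not shown in this example, so DatasetReader below is a hypothetical stand-in for any class that sets self.dataset_dir before calling read_data():

# DatasetReader is hypothetical; only the dataset_dir attribute is assumed.
reader = DatasetReader(dataset_dir='/path/to/adult')
d_train, d_test, other_info = reader.read_data()
print(other_info['dataset_name'], other_info['with_solution'])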
Example #3
def get_train_and_test_data(input_dir, dataset_name, repeat=False):
    """
  Returns:
    D_train, D_test: 2 AutoDLDataset objects (defined in `dataset.py`)
  """
    train_path = os.path.join(input_dir, dataset_name, dataset_name + '.data',
                              'train')
    test_path = os.path.join(input_dir, dataset_name, dataset_name + '.data',
                             'test')
    D_train = AutoDLDataset(train_path)
    D_train.init(repeat=repeat)
    D_test = AutoDLDataset(test_path)
    D_test.init(repeat=repeat)
    return D_train, D_test
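For reference, a brief call sketch (paths and dataset name are illustrative):

D_train, D_test = get_train_and_test_data('/path/to/autodl_datasets', 'adult',
                                          repeat=True)
metadata = D_train.get_metadata()  # sample count, output size, etc.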
Example #4
def news_baseline_test():

    dataset = AutoDLDataset('./tweet.data/train/')
    dataset.init()
    iterator = dataset.get_dataset().make_one_shot_iterator()
    next_element = iterator.get_next()

    # features, labels = next_element
    # features = features.eval()
    # labels = labels.eval()
    # print(next_element)
    data = []
    sess = tf.Session()
    for idx in range(10):
        print("Example " + str(idx))
        data.append(sess.run(next_element))

    for each_data in data:
        print(each_data)
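Note that tf.Session and make_one_shot_iterator are TensorFlow 1.x APIs. Assuming get_dataset() still returns a tf.data.Dataset, an eager-mode (TensorFlow 2) sketch of the same loop would be:

# Sketch only: assumes get_dataset() returns a tf.data.Dataset and eager mode.
dataset = AutoDLDataset('./tweet.data/train/')
dataset.init()
for idx, example in enumerate(dataset.get_dataset().take(10)):
    print("Example " + str(idx))
    print(example)  # a (features, labels) pair of tensors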
Example #5
    if len(datanames) != 1:
      raise ValueError("{} datasets found in dataset_dir={}!\n"\
                       .format(len(datanames), dataset_dir) +
                       "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]

    logger.info("************************************************")
    logger.info("******** Processing dataset " + basename[:-5].capitalize() +
                 " ********")
    logger.info("************************************************")
    logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
    D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D_test.get_metadata().size()
    output_dim = D_test.get_metadata().get_output_size()
    correct_prediction_shape = (num_examples_test, output_dim)

    # 20 min for participants to initialize and install other packages
    try:
      init_time_budget = 20 * 60 # time budget for initialization.
      timer = Timer()
      timer.set(init_time_budget)
      with timer.time_limit("Initialization"):
        ##### Begin creating model #####
                         "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]

    logger.info("************************************************")
    logger.info("******** Processing dataset " + basename[:-5].capitalize() +
                " ********")
    logger.info("************************************************")
    logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

    # Write solution files in output_dir
    copy_solution(dataset_dir, output_dir)

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
    D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
    D_predict_data = None

    if os.path.exists(os.path.join(dataset_dir, basename, "unlabelled")):
        logger.info("Found some data to predict...")
        D_predict = AutoDLDataset(
            os.path.join(dataset_dir, basename, "unlabelled"))
        num_examples_predict = D_predict.get_metadata().size()
        D_predict_data = D_predict.get_dataset()

    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D_test.get_metadata().size()
    output_dim = D_test.get_metadata().get_output_size()
Example #7
def ingestion_fn(dataset_dir,
                 code_dir,
                 time_budget,
                 time_budget_approx,
                 output_dir,
                 score_dir,
                 model_config_name=None,
                 model_config=None):
    #### Check whether everything went well
    ingestion_success = True

    # Parse directories
    root_dir = _HERE(os.pardir)
    ingestion_program_dir = join(root_dir, "ingestion_program")

    if dataset_dir.endswith("run/input") and code_dir.endswith("run/program"):
        logger.debug(
            "Since dataset_dir ends with 'run/input' and code_dir "
            "ends with 'run/program', assume running on the " +
            "CodaLab platform. Modifying dataset_dir to 'run/input_data' "
            "and code_dir to 'run/submission'. " +
            "Directory parsing should be more flexible in the code of " +
            "the compute worker: we need explicit directories for " +
            "dataset_dir and code_dir.")
        dataset_dir = dataset_dir.replace("run/input", "run/input_data")
        code_dir = code_dir.replace("run/program", "run/submission")

    # Show directories for debugging
    logger.debug("sys.argv = " + str(sys.argv))
    logger.debug("Using dataset_dir: " + dataset_dir)
    logger.debug("Using output_dir: " + output_dir)
    logger.debug("Using ingestion_program_dir: " + ingestion_program_dir)
    logger.debug("Using code_dir: " + code_dir)

    # Our libraries
    path.append(ingestion_program_dir)
    path.append(code_dir)
    # IG: to allow submitting the starting kit as sample submission
    path.append(code_dir + "/sample_code_submission")
    import data_io
    from dataset import AutoDLDataset  # THE class of AutoDL datasets

    data_io.mkdir(output_dir)

    #### INVENTORY DATA (and sort dataset names alphabetically)
    datanames = data_io.inventory_data(dataset_dir)
    #### Delete zip files and metadata file
    datanames = [x for x in datanames if x.endswith(".data")]

    if len(datanames) != 1:
        raise ValueError("{} datasets found in dataset_dir={}!\n".format(
            len(datanames), dataset_dir) +
                         "Please put only ONE dataset under dataset_dir.")

    basename = datanames[0]

    logger.info("************************************************")
    logger.info("******** Processing dataset " + basename[:-5].capitalize() +
                " ********")
    logger.info("************************************************")
    logger.debug("Version: {}. Description: {}".format(VERSION, DESCRIPTION))

    ##### Begin creating training set and test set #####
    logger.info("Reading training set and test set...")
    D_train = AutoDLDataset(os.path.join(dataset_dir, basename, "train"))
    D_test = AutoDLDataset(os.path.join(dataset_dir, basename, "test"))
    ##### End creating training set and test set #####

    ## Get correct prediction shape
    num_examples_test = D_test.get_metadata().size()
    output_dim = D_test.get_metadata().get_output_size()
    correct_prediction_shape = (num_examples_test, output_dim)

    # 20 min for participants to initialize and install other packages
    # try:
    #     init_time_budget = 20 * 60  # time budget for initialization.
    #     timer = Timer()
    #     timer.set(init_time_budget)
    #     with timer.time_limit("Initialization"):

    ##### Begin creating model #####
    logger.info("Creating model...this process should not exceed 20min.")
    from model import Model  # in participants' model.py

    # The metadata of D_train and D_test only differ in sample_count
    M = Model(D_train.get_metadata(),
              model_config_name=model_config_name,
              model_config=model_config)
    ###### End creating model ######

    # except TimeoutException as e:
    #     logger.info("[-] Initialization phase exceeded time budget. Move to train/predict phase")
    # except Exception as e:
    #     logger.error("Failed to initialize model.")
    #     logger.error("Encountered exception:\n" + str(e), exc_info=True)
    #

    # Mark starting time of ingestion
    start = time.time()
    logger.info("=" * 5 + " Start core part of ingestion program. " +
                "Version: {} ".format(VERSION) + "=" * 5)

    write_start_file(output_dir,
                     start_time=start,
                     time_budget=time_budget,
                     task_name=basename.split(".")[0])

    try:
        # Check if the model has methods `train` and `test`.
        for attr in ["train", "test"]:
            if not hasattr(M, attr):
                raise ModelApiError(
                    "Your model object doesn't have the method " +
                    "`{}`. Please implement it in model.py.".format(attr))

        # Check if model.py uses new done_training API instead of marking
        # stopping by returning None
        use_done_training_api = hasattr(M, "done_training")
        if not use_done_training_api:
            logger.warning(
                "Your model object doesn't have an attribute " +
                "`done_training`. But this is necessary for the ingestion " +
                "program to know whether the model has finished training " +
                "and to decide whether to proceed with more training. " +
                "Please add this attribute to your model.")

        # Keeping track of how many predictions are made
        prediction_order_number = 0

        # Start the CORE PART: train/predict process
        while not (use_done_training_api and M.done_training):
            remaining_time_budget = start + time_budget - time.time()
            # Train the model
            logger.info("Begin training the model...")
            M.train(D_train.get_dataset(),
                    remaining_time_budget=remaining_time_budget)
            logger.info("Finished training the model.")
            # Make predictions using the trained model
            logger.info("Begin testing the model by making predictions " +
                        "on test set...")
            remaining_time_budget = start + time_budget - time.time()
            Y_pred = M.test(D_test.get_dataset(),
                            remaining_time_budget=remaining_time_budget)
            logger.info("Finished making predictions.")
            if Y_pred is None:  # Stop train/predict process if Y_pred is None
                logger.info("The method model.test returned `None`. " +
                            "Stop train/predict process.")
                break
            else:  # Check if the prediction has good shape
                prediction_shape = tuple(Y_pred.shape)
                if prediction_shape != correct_prediction_shape:
                    raise BadPredictionShapeError(
                        "Bad prediction shape! Expected {} but got {}.".format(
                            correct_prediction_shape, prediction_shape))
            remaining_time_budget = start + time_budget_approx - time.time()
            if remaining_time_budget < 0:
                break
            # Write timestamp to 'start.txt'
            write_timestamp(output_dir,
                            predict_idx=prediction_order_number,
                            timestamp=time.time())
            # Prediction files: adult.predict_0, adult.predict_1, ...
            filename_test = basename[:-5] + ".predict_" + str(
                prediction_order_number)
            # Write predictions to output_dir
            data_io.write(os.path.join(output_dir, filename_test), Y_pred)
            prediction_order_number += 1
            logger.info(
                "[+] {0:d} predictions made, time spent so far {1:.2f} sec".
                format(prediction_order_number,
                       time.time() - start))
            remaining_time_budget = start + time_budget_approx - time.time()
            logger.info(
                "[+] Time left {0:.2f} sec".format(remaining_time_budget))

    except Exception as e:
        ingestion_success = False
        logger.info("Failed to run ingestion.")
        logger.error("Encountered exception:\n" + str(e), exc_info=True)

    # Finishing ingestion program
    end_time = time.time()
    overall_time_spent = end_time - start

    # Write overall_time_spent to a end.txt file
    end_filename = "end.txt"
    with open(os.path.join(output_dir, end_filename), "w") as f:
        f.write("ingestion_duration: " + str(overall_time_spent) + "\n")
        f.write("ingestion_success: " + str(int(ingestion_success)) + "\n")
        f.write("end_time: " + str(end_time) + "\n")
        logger.info("Wrote the file {} marking the end of ingestion.".format(
            end_filename))
        if ingestion_success:
            logger.info("[+] Done. Ingestion program successfully terminated.")
            logger.info("[+] Overall time spent %5.2f sec " %
                        overall_time_spent)
        else:
            logger.info(
                "[-] Done, but encountered some errors during ingestion.")
            logger.info("[-] Overall time spent %5.2f sec " %
                        overall_time_spent)

    # Copy all files in output_dir to score_dir
    os.system("cp -R {} {}".format(os.path.join(output_dir, "*"), score_dir))
    logger.debug("Copied all ingestion output to scoring output directory.")

    logger.info("[Ingestion terminated]")
Example #8
    for i, basename in enumerate(datanames):

        print_log("========== Ingestion program version " + str(version) + " ==========")
        print_log("************************************************")
        print_log("******** Processing dataset " + basename[:-5].capitalize() + " ********")
        print_log("************************************************")

        # ======== Learning on a time budget:
        # Keep track of time so as not to exceed the time budget. Time spent
        # inventorying data is neglected.
        start = time.time()

        # ======== Creating a data object with data and information about it
        print_log("Reading training set and test set...")

        ##### Begin creating training set and test set #####
        D_train = AutoDLDataset(os.path.join(input_dir, basename, "train"))
        D_test = AutoDLDataset(os.path.join(input_dir, basename, "test"))
        ##### End creating training set and test set #####

        # ======== Keep track of time
        if debug_mode < 1:
            time_budget = get_time_budget(D_train)  # <== HERE IS THE TIME BUDGET!
        else:
            time_budget = max_time

        # ========= Creating a model
        print_log("Creating model...")
        ##### Begin creating model #####
        M = Model(D_train.get_metadata()) # The metadata of D_train and D_test only differ in sample_count
        ###### End creating model ######
Example #9
    def get_autodl_dataset(self, subset='train'):
        subset_path = self.get_path_to_subset(subset)
        return AutoDLDataset(subset_path)
Example #10
def get_dataset_handle(path, train_test="train"):
    # basename = Hammer.data
    basename = [
        x for x in os.listdir(path) if ".data" in x and "zip" not in x
    ][0]
    return AutoDLDataset(os.path.join(path, basename, train_test))
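A brief call sketch (directory names are illustrative; path is expected to contain e.g. Hammer.data/train and Hammer.data/test):

d_train = get_dataset_handle('/path/to/Hammer', train_test='train')
d_test = get_dataset_handle('/path/to/Hammer', train_test='test')
print(d_test.get_metadata().size())  # number of test examples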