Code example #1
# Constructor of the Dataset class (the full class is shown in the next
# examples); Saver and Loader are persistence helpers from the project.
def __init__(self, path):
    self.path = os.path.abspath(path)
    self.train_path = os.path.join(self.path, 'data-train.csv')
    self.test_path = os.path.join(self.path, 'data-test.csv')
    self.valid_path = os.path.join(self.path, 'data-validation.csv')
    self.saver = Saver(self)
    self.loader = Loader(self)
    self.n_labels = 0
    # raw dataframes for the three splits
    self.train = None
    self.test = None
    self.validation = None
    # feature/label pairs derived from each split
    self.X_train = None
    self.Y_train = None
    self.X_test = None
    self.Y_test = None
    self.X_val = None
    self.Y_val = None
    # used when the dataset is sourced as a single file, without splits
    self.X = None
    self.Y = None
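
A minimal construction sketch (the directory name is illustrative): the constructor only resolves the absolute paths of the three split files.

ds = Dataset('data')
print(ds.train_path)  # <absolute path of data>/data-train.csv
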
Code example #2
import os

from keras.utils import to_categorical  # assumption: Keras' one-hot encoder

# Saver, Loader, clean_if_exist, optimize_dataset and log are helpers
# provided elsewhere in the surrounding project.

class Dataset(object):
    @staticmethod
    def clean(path):
        # remove previously generated split files, if any
        clean_if_exist(
            path, ('data-train.csv', 'data-test.csv', 'data-validation.csv'))

    @staticmethod
    def optimize(path, reuse=0.15, output=None):
        optimize_dataset(path, reuse, output)

    @staticmethod
    def split_row(row, n_labels):
        # first column holds the label, the remaining columns the features
        x = row.values[:, 1:]
        # one-hot encode the labels, e.g. with n_labels=3:
        #   [0, 2, 1] -> [[1, 0, 0], [0, 0, 1], [0, 1, 0]]
        y = to_categorical(row.values[:, 0], n_labels)
        return x, y

    def __init__(self, path):
        self.path = os.path.abspath(path)
        self.train_path = os.path.join(self.path, 'data-train.csv')
        self.test_path = os.path.join(self.path, 'data-test.csv')
        self.valid_path = os.path.join(self.path, 'data-validation.csv')
        self.saver = Saver(self)
        self.loader = Loader(self)
        self.n_labels = 0
        self.train = None
        self.test = None
        self.validation = None
        self.X_train = None
        self.Y_train = None
        self.X_test = None
        self.Y_test = None
        self.X_val = None
        self.Y_val = None
        self.X = None
        self.Y = None

    def exists(self):
        return os.path.exists(self.train_path) and \
               os.path.exists(self.test_path) and \
               os.path.exists(self.valid_path)

    def _set_xys(self, for_training=True):
        if for_training:
            self.X_train, self.Y_train = Dataset.split_row(
                self.train, self.n_labels)
            self.X_test, self.Y_test = Dataset.split_row(
                self.test, self.n_labels)
            self.X_val, self.Y_val = Dataset.split_row(self.validation,
                                                       self.n_labels)
        else:
            self.X, self.Y = Dataset.split_row(self.train, self.n_labels)

    def load(self):
        self.loader.load()
        self._set_xys()

    def source(self, data, p_test=0.0, p_val=0.0):
        # shuffle the rows and reset the index, just in case
        dataset = data.sample(frac=1).reset_index(drop=True)
        # count unique labels on first column
        self.n_labels = len(dataset.iloc[:, 0].unique())
        # if either percentage is zero we're just loading a single file,
        # otherwise we generate temporary train/test/validation datasets.
        for_training = p_test > 0.0 and p_val > 0.0
        if for_training:
            log.info(
                "generating train, test and validation datasets (test=%f validation=%f) ...",
                p_test, p_val)

            n_tot = len(dataset)
            n_train = int(n_tot * (1 - p_test - p_val))
            n_test = int(n_tot * p_test)
            n_val = int(n_tot * p_val)

            # take contiguous, non-overlapping slices of the shuffled data
            self.train = dataset.head(n_train)
            self.test = dataset.head(n_train + n_test).tail(n_test)
            self.validation = dataset.tail(n_val)

            self.saver.save()
        else:
            self.train = dataset

        self._set_xys(for_training)
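
A minimal usage sketch for this version, assuming `data` is a pandas DataFrame whose first column holds the labels (the file name and the 80/10/10 split are illustrative):

import pandas as pd

ds = Dataset('data')
data = pd.read_csv('data/all.csv')      # hypothetical source file
ds.source(data, p_test=0.1, p_val=0.1)  # 80% train, 10% test, 10% validation
print(ds.X_train.shape, ds.Y_train.shape)
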
Code example #3
import os

import numpy as np
from keras.utils import to_categorical  # assumption: Keras' one-hot encoder

# Saver, Loader and log are helpers provided elsewhere in the project.

class Dataset(object):
    @staticmethod
    def split_row(row, n_labels, flat):
        # first column holds the label, the remaining columns the features
        x = row.iloc[:, 1:].copy()
        if not flat:
            if len(row) == 1:
                # this check prevents the list comprehension below from
                # failing; it shouldn't happen in production but can on tests
                log.error("Dataset size must be greater than 1")
                quit()
            # non-flat: every cell is itself a vector, so build one
            # numpy array per feature column instead of a single matrix
            x = [np.squeeze(np.array([x[i][:]]), axis=0) for i in x.columns]
        # one-hot encode the labels
        y = to_categorical(row.values[:, 0], n_labels)
        return x, y
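
    # To see what the non-flat branch above produces, consider a
    # hypothetical frame whose cells hold vectors:
    #   df = pd.DataFrame({'label': [0, 1],
    #                      'a': [np.zeros(3), np.ones(3)],
    #                      'b': [np.ones(3), np.zeros(3)]})
    #   x, y = Dataset.split_row(df, 2, False)
    # x is then a list with one array per feature column (two here),
    # each holding one entry per row.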

    def __init__(self, path):
        self.path = os.path.abspath(path)
        self.train_path = os.path.join(self.path, 'data-train.csv')
        self.test_path = os.path.join(self.path, 'data-test.csv')
        self.valid_path = os.path.join(self.path, 'data-validation.csv')
        self.saver = Saver(self)
        self.loader = Loader(self)
        self.do_save = True
        self.is_flat = True
        self.n_labels = 0
        self.train = None
        self.test = None
        self.validation = None
        self.X_train = None
        self.Y_train = None
        self.X_test = None
        self.Y_test = None
        self.X_val = None
        self.Y_val = None
        self.X = None
        self.Y = None

    def has_train(self):
        return os.path.exists(self.train_path) or os.path.exists(
            self.train_path.replace('.csv', '.pkl'))

    def has_test(self):
        return os.path.exists(self.test_path) or os.path.exists(
            self.test_path.replace('.csv', '.pkl'))

    def has_validation(self):
        return os.path.exists(self.valid_path) or os.path.exists(
            self.valid_path.replace('.csv', '.pkl'))

    def exists(self):
        return self.has_train() and \
               self.has_test() and \
               self.has_validation()

    def _set_xys(self, for_training=True):
        if for_training:
            self.X_train, self.Y_train = Dataset.split_row(
                self.train, self.n_labels, self.is_flat)
            self.X_test, self.Y_test = Dataset.split_row(
                self.test, self.n_labels, self.is_flat)
            self.X_val, self.Y_val = Dataset.split_row(self.validation,
                                                       self.n_labels,
                                                       self.is_flat)
        else:
            self.X, self.Y = Dataset.split_row(self.train, self.n_labels,
                                               self.is_flat)

    def _set_xys_test(self):
        self.X_test, self.Y_test = Dataset.split_row(self.test, self.n_labels,
                                                     self.is_flat)

    def _check_encoding(self):
        # a pickle-encoded dataset marks non-flat inputs, whose
        # vector cells don't fit a plain CSV file
        pkl_test = self.train_path.replace('.csv', '.pkl')
        if os.path.exists(pkl_test):
            log.info("detected pickle encoded dataset")
            self.is_flat = False
            self.train_path = self.train_path.replace('.csv', '.pkl')
            self.test_path = self.test_path.replace('.csv', '.pkl')
            self.valid_path = self.valid_path.replace('.csv', '.pkl')

    def load_test(self):
        self._check_encoding()
        self.loader.load_test()
        self._set_xys_test()

    def load(self):
        self._check_encoding()
        self.loader.load()
        self._set_xys()

    def _is_scalar_value(self, v):
        # values without a length are treated as scalars, e.g.
        # _is_scalar_value(3) -> True, _is_scalar_value(np.zeros(4)) -> False
        try:
            len(v)
            return False
        except TypeError:
            # TypeError: object of type 'X' has no len()
            return True

    def source(self, data, p_test=0.0, p_val=0.0, shuffle=True):
        if shuffle:
            # shuffle the rows and reset the index, just in case
            dataset = data.sample(frac=1).reset_index(drop=True)
        else:
            dataset = data

        # check if the input vectors are made of scalars or other vectors
        self.is_flat = True
        for x in dataset.iloc[0, :]:
            if not self._is_scalar_value(x):
                log.info("detected non scalar input: %s", x.shape)
                self.is_flat = False
                break

        # count unique labels on first column
        self.n_labels = len(dataset.iloc[:, 0].unique())
        # if either percentage is zero we're just loading a single file,
        # otherwise we generate temporary train/test/validation datasets.
        for_training = p_test > 0.0 and p_val > 0.0
        if for_training:
            log.info(
                "generating train, test and validation datasets (test=%f validation=%f) ...",
                p_test, p_val)

            n_tot = len(dataset)
            n_train = int(n_tot * (1 - p_test - p_val))
            n_test = int(n_tot * p_test)
            n_val = int(n_tot * p_val)

            self.train = dataset.head(n_train)
            self.test = dataset.head(n_train + n_test).tail(n_test)
            self.validation = dataset.tail(n_val)

            if self.do_save:
                self.saver.save()
        else:
            self.train = dataset

        self._set_xys(for_training)

    def subsample(self, ratio):
        X = self.X.values if self.is_flat else self.X
        y = self.Y
        if ratio < 1.0:
            log.info("selecting a randomized sample of %d%% ...", ratio * 100)

            # the row count depends on the layout: a single matrix when
            # flat, a list of per-column arrays otherwise
            tot_rows = X.shape[0] if self.is_flat else X[0].shape[0]
            num = int(tot_rows * ratio)
            # pick unique random row indexes
            indexes = np.random.choice(tot_rows, num, replace=False)

            X = X[indexes] if self.is_flat else [i[indexes] for i in X]
            y = y[indexes]

        return X, y
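
A hedged usage sketch for this version (the path and the 25% ratio are illustrative): sourcing without split percentages keeps the whole frame as X/Y, from which subsample then draws a random fraction of rows.

import pandas as pd

ds = Dataset('data')
ds.source(pd.read_csv('data/all.csv'))  # single-file mode: fills ds.X / ds.Y
X_small, y_small = ds.subsample(0.25)   # random 25% of the rows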