Example #1
def load_data(image_key: str = "x",
              label_key: str = "y",
              label_mode: str = "fine") -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the CIFAR100 dataset.

    Please consider using the ciFAIR100 dataset instead. CIFAR100 contains duplicates between its train and test sets.

    Args:
        image_key: The key for image.
        label_key: The key for label.
        label_mode: Either "fine" for 100 classes or "coarse" for 20 classes.

    Returns:
        (train_data, eval_data)

    Raises:
        ValueError: If the label_mode is invalid.
    """
    print("\033[93m {}\033[00m".format("FastEstimator-Warn: Consider using the ciFAIR100 dataset instead."))
    if label_mode not in ['fine', 'coarse']:
        raise ValueError("label_mode must be one of either 'fine' or 'coarse'.")
    (x_train, y_train), (x_eval, y_eval) = tf.keras.datasets.cifar100.load_data(label_mode=label_mode)
    train_data = NumpyDataset({image_key: x_train, label_key: y_train})
    eval_data = NumpyDataset({image_key: x_eval, label_key: y_eval})
    return train_data, eval_data
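A minimal usage sketch (assuming fastestimator is installed and imported as fe; the Pipeline arguments mirror those shown in Example #8):

train_data, eval_data = load_data(label_mode="fine")
pipeline = fe.Pipeline(train_data=train_data, eval_data=eval_data, batch_size=32)
print(len(train_data))  # CIFAR100 provides 50,000 training images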
Example #2
def load_data(
        root_dir: Optional[str] = None) -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the SVHN Cropped digits dataset.

    For more information about this dataset, please visit http://ufldl.stanford.edu/housenumbers/. Here, we are using
    Format 2 to get MNIST-like 32-by-32 images.

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.

    Returns:
        (train_data, test_data)
    """
    home = str(Path.home())

    if root_dir is None:
        root_dir = os.path.join(home, 'fastestimator_data', 'SVHN_Cropped')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'SVHN_Cropped')
    os.makedirs(root_dir, exist_ok=True)

    # download the data files to disk if not already present
    train_path = os.path.join(root_dir, "train_32x32.mat")
    test_path = os.path.join(root_dir, "test_32x32.mat")

    if not os.path.exists(train_path):
        print("Downloading train data to {}".format(root_dir))
        wget.download('http://ufldl.stanford.edu/housenumbers/train_32x32.mat',
                      root_dir,
                      bar=bar_custom)

    if not os.path.exists(test_path):
        print("Downloading test data to {}".format(root_dir))
        wget.download('http://ufldl.stanford.edu/housenumbers/test_32x32.mat',
                      root_dir,
                      bar=bar_custom)

    xy_train = loadmat(train_path)
    xy_test = loadmat(test_path)

    # the '0' digit is labeled 10 in SVHN; remap it to 0
    xy_train['y'][xy_train['y'] == 10] = 0
    xy_test['y'][xy_test['y'] == 10] = 0

    # make datasets
    train_data = NumpyDataset({
        "x": np.transpose(xy_train['X'], (3, 0, 1, 2)),
        "y": xy_train['y']
    })
    test_data = NumpyDataset({
        "x": np.transpose(xy_test['X'], (3, 0, 1, 2)),
        "y": xy_test['y']
    })

    return train_data, test_data
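As a rough sketch of inspecting the result (it assumes that indexing a NumpyDataset returns a per-sample dictionary keyed by "x" and "y"):

train_data, test_data = load_data()
sample = train_data[0]
print(sample["x"].shape)  # expected (32, 32, 3) after the transpose above
print(sample["y"])        # digit label in [0, 9] after the relabeling above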
Example #3
def load_data(image_key: str = "x",
              label_key: str = "y",
              label_mode: str = "fine") -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the ciFAIR100 dataset.

    This is the CIFAR100 dataset, but with the test-set duplicates removed and replaced. See
    https://arxiv.org/pdf/1902.00423.pdf or https://cvjena.github.io/cifair/ for details. Cite the paper if you use the
    dataset.

    Args:
        image_key: The key for image.
        label_key: The key for label.
        label_mode: Either "fine" for 100 classes or "coarse" for 20 classes.

    Returns:
        (train_data, test_data)

    Raises:
        ValueError: If the label_mode is invalid.
    """
    if label_mode not in ['fine', 'coarse']:
        raise ValueError(
            "label_mode must be one of either 'fine' or 'coarse'.")

    dirname = 'ciFAIR-100'
    archive_name = 'ciFAIR-100.zip'
    origin = 'https://github.com/cvjena/cifair/releases/download/v1.0/ciFAIR-100.zip'
    md5_hash = 'ddc236ab4b12eeb8b20b952614861a33'

    path = get_file(archive_name,
                    origin=origin,
                    file_hash=md5_hash,
                    hash_algorithm='md5',
                    extract=True,
                    archive_format='zip')
    path = os.path.join(os.path.dirname(path), dirname)

    fpath = os.path.join(path, 'train')
    x_train, y_train = _load_batch(fpath, label_key=label_mode + '_labels')

    fpath = os.path.join(path, 'test')
    x_test, y_test = _load_batch(fpath, label_key=label_mode + '_labels')

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    x_train = x_train.transpose((0, 2, 3, 1))
    x_test = x_test.transpose((0, 2, 3, 1))

    x_test = x_test.astype(x_train.dtype)
    y_test = y_test.astype(y_train.dtype)

    train_data = NumpyDataset({image_key: x_train, label_key: y_train})
    test_data = NumpyDataset({image_key: x_test, label_key: y_test})
    return train_data, test_data
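A brief sketch of choosing the label granularity (the keys default to "x" and "y" as above):

train_fine, test_fine = load_data(label_mode="fine")        # 100 fine-grained classes
train_coarse, test_coarse = load_data(label_mode="coarse")  # 20 superclasses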
Example #4
def load_data(image_key: str = "x",
              label_key: str = "y") -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the ciFAIR10 dataset.

    This is the CIFAR10 dataset, but with the test-set duplicates removed and replaced. See
    https://arxiv.org/pdf/1902.00423.pdf or https://cvjena.github.io/cifair/ for details. Cite the paper if you use the
    dataset.

    Args:
        image_key: The key for image.
        label_key: The key for label.

    Returns:
        (train_data, test_data)
    """
    dirname = 'ciFAIR-10'
    archive_name = 'ciFAIR-10.zip'
    origin = 'https://github.com/cvjena/cifair/releases/download/v1.0/ciFAIR-10.zip'
    md5_hash = 'ca08fd390f0839693d3fc45c4e49585f'

    path = get_file(archive_name,
                    origin=origin,
                    file_hash=md5_hash,
                    hash_algorithm='md5',
                    extract=True,
                    archive_format='zip')
    path = os.path.join(os.path.dirname(path), dirname)

    num_train_samples = 50000

    x_train = np.empty((num_train_samples, 3, 32, 32), dtype='uint8')
    y_train = np.empty((num_train_samples, ), dtype='uint8')

    for i in range(1, 6):
        fpath = os.path.join(path, f'data_batch_{i}')
        (x_train[(i - 1) * 10000:i * 10000, :, :, :],
         y_train[(i - 1) * 10000:i * 10000]) = _load_batch(fpath)

    fpath = os.path.join(path, 'test_batch')
    x_test, y_test = _load_batch(fpath)

    y_train = np.reshape(y_train, (len(y_train), 1))
    y_test = np.reshape(y_test, (len(y_test), 1))

    x_train = x_train.transpose((0, 2, 3, 1))
    x_test = x_test.transpose((0, 2, 3, 1))

    x_test = x_test.astype(x_train.dtype)
    y_test = y_test.astype(y_train.dtype)

    train_data = NumpyDataset({image_key: x_train, label_key: y_train})
    test_data = NumpyDataset({image_key: x_test, label_key: y_test})
    return train_data, test_data
Example #5
def load_data(root_dir: Optional[str] = None,
              seq_length: int = 64) -> Tuple[NumpyDataset, NumpyDataset, NumpyDataset, List[str]]:
    """Load and return the Penn TreeBank dataset.

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.
        seq_length: Length of data sequence.

    Returns:
        (train_data, eval_data, test_data, vocab)
    """
    home = str(Path.home())

    if root_dir is None:
        root_dir = os.path.join(home, 'fastestimator_data', 'PennTreeBank')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'PennTreeBank')
    os.makedirs(root_dir, exist_ok=True)

    train_data_path = os.path.join(root_dir, 'ptb.train.txt')
    eval_data_path = os.path.join(root_dir, 'ptb.valid.txt')
    test_data_path = os.path.join(root_dir, 'ptb.test.txt')

    files = [(train_data_path, 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt'),
             (eval_data_path, 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'),
             (test_data_path, 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt')]

    texts = []
    for data_path, download_link in files:
        if not os.path.exists(data_path):
            # Download
            print("Downloading data: {}".format(data_path))
            wget.download(download_link, data_path, bar=bar_custom)

        text = []
        with open(data_path, 'r') as f:
            for line in f:
                text.extend(line.split() + ['<eos>'])
        texts.append(text)

    # Build dictionary from training data
    vocab = sorted(set(texts[0]))
    word2idx = {u: i for i, u in enumerate(vocab)}

    # Convert words to indices, split into fixed-length sequences, and discard the last incomplete sequence
    data = [[word2idx[word] for word in text[:-(len(text) % seq_length)]] for text in texts]
    x_train, x_eval, x_test = [np.array(d).reshape(-1, seq_length) for d in data]

    train_data = NumpyDataset(data={"x": x_train})
    eval_data = NumpyDataset(data={"x": x_eval})
    test_data = NumpyDataset(data={"x": x_test})
    return train_data, eval_data, test_data, vocab
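Since vocab is the sorted word list used to build word2idx, the index sequences can be mapped back to text. A sketch, assuming that indexing the dataset returns a dictionary with key "x":

train_data, eval_data, test_data, vocab = load_data(seq_length=64)
idx2word = {idx: word for idx, word in enumerate(vocab)}
first_sequence = train_data[0]["x"]  # array of 64 word indices
print(" ".join(idx2word[int(idx)] for idx in first_sequence))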
Example #6
def load_data(root_dir: Optional[str] = None) -> Tuple[NumpyDataset, NumpyDataset, Set[str], Set[str]]:
    """Load and return the GermEval dataset.

    The GermEval 2014 dataset contains 31,000 sentences corresponding to over 590,000 tokens from German Wikipedia
    and news corpora. Each sentence is encoded as one token per line, with information provided in tab-separated columns.
    Sourced from https://sites.google.com/site/germeval2014ner/data

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.

    Returns:
        (train_data, eval_data, train_vocab, label_vocab)
    """
    url = 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1'
    home = str(Path.home())

    if root_dir is None:
        root_dir = os.path.join(home, 'fastestimator_data', 'GermEval')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'GermEval')
    os.makedirs(root_dir, exist_ok=True)

    data_path = os.path.join(root_dir, 'de_ner.tsv')
    data_folder_path = os.path.join(root_dir, 'germeval')

    if not os.path.exists(data_folder_path):
        # download
        if not os.path.exists(data_path):
            print("Downloading data to {}".format(root_dir))
            stream = requests.get(url, stream=True)  # python wget does not work
            total_size = int(stream.headers.get('content-length', 0))
            block_size = 128  # read the response in 128-byte chunks
            progress = tqdm(total=total_size, unit='B', unit_scale=True)
            with open(data_path, 'wb') as outfile:
                for data in stream.iter_content(block_size):
                    progress.update(len(data))
                    outfile.write(data)
            progress.close()

    x, y, x_vocab, y_vocab = get_sentences_and_labels(data_path)

    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train = np.array(x_train)
    x_eval = np.array(x_eval)
    y_train = np.array(y_train)
    y_eval = np.array(y_eval)
    train_data = NumpyDataset({"x": x_train, "y": y_train})
    eval_data = NumpyDataset({"x": x_eval, "y": y_eval})
    return train_data, eval_data, x_vocab, y_vocab
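The returned vocabularies are plain sets, so downstream code would typically build index mappings from them before training; a sketch under that assumption:

train_data, eval_data, x_vocab, y_vocab = load_data()
word2idx = {word: idx for idx, word in enumerate(sorted(x_vocab))}
tag2idx = {tag: idx for idx, tag in enumerate(sorted(y_vocab))}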
Example #7
def load_data(
    root_dir: Optional[str] = None
) -> Tuple[NumpyDataset, NumpyDataset, Set[str], Set[str]]:
    """Load and return the MIT Movie dataset.

    The MIT Movie dataset is a semantically tagged training and test corpus in BIO format. Each sentence is encoded as
    one token per line, with information provided in tab-separated columns.
    Sourced from https://groups.csail.mit.edu/sls/downloads/movie/

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.

    Returns:
        (train_data, eval_data, train_vocab, label_vocab)
    """
    home = str(Path.home())

    if root_dir is None:
        root_dir = os.path.join(home, 'fastestimator_data', 'MITMovie')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'MITMovie')
    os.makedirs(root_dir, exist_ok=True)

    train_data_path = os.path.join(root_dir, 'engtrain.bio')
    test_data_path = os.path.join(root_dir, 'engtest.bio')
    files = [(train_data_path,
              'https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio'),
             (test_data_path,
              'https://groups.csail.mit.edu/sls/downloads/movie/engtest.bio')]

    for data_path, download_link in files:
        if not os.path.exists(data_path):
            # Download
            print("Downloading data: {}".format(data_path))
            wget.download(download_link, data_path, bar=bar_custom)

    x_train, y_train, x_vocab, y_vocab = get_sentences_and_labels(
        train_data_path)
    x_eval, y_eval, x_eval_vocab, y_eval_vocab = get_sentences_and_labels(
        test_data_path)
    x_vocab |= x_eval_vocab
    y_vocab |= y_eval_vocab
    x_train = np.array(x_train)
    x_eval = np.array(x_eval)
    y_train = np.array(y_train)
    y_eval = np.array(y_eval)
    train_data = NumpyDataset({"x": x_train, "y": y_train})
    eval_data = NumpyDataset({"x": x_eval, "y": y_eval})
    return train_data, eval_data, x_vocab, y_vocab
Example #8
def sample_system_object_torch():
    x_train = np.random.rand(3, 28, 28, 3)
    y_train = np.random.randint(10, size=(3, ))
    x_eval = np.random.rand(2, 28, 28, 3)
    y_eval = np.random.randint(10, size=(2, ))

    train_data = NumpyDataset({'x': x_train, 'y': y_train})
    eval_data = NumpyDataset({'x': x_eval, 'y': y_eval})
    test_data = eval_data.split(0.5)
    model = fe.build(model_fn=fe.architecture.pytorch.LeNet, optimizer_fn='adam', model_name='torch')
    pipeline = fe.Pipeline(train_data=train_data, eval_data=eval_data, test_data=test_data, batch_size=1)
    network = fe.Network(ops=[ModelOp(model=model, inputs="x_out", outputs="y_pred")])
    system = System(network=network, pipeline=pipeline, traces=[], total_epochs=10, mode='train')
    return system
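Judging by its use above, eval_data.split(0.5) removes a fraction of the samples from the original dataset and returns them as a new dataset, which is how the test split is carved out here. A standalone sketch (assuming split accepts a fraction, as used above):

data = NumpyDataset({'x': np.random.rand(10, 4), 'y': np.random.randint(2, size=(10, ))})
holdout = data.split(0.5)
print(len(data), len(holdout))  # each dataset is expected to hold 5 samples afterwards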
Example #9
def load_data(image_key: str = "x", label_key: str = "y") -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the MNIST dataset.

    Args:
        image_key: The key for image.
        label_key: The key for label.

    Returns:
        (train_data, eval_data)
    """
    (x_train, y_train), (x_eval, y_eval) = tf.keras.datasets.mnist.load_data()
    train_data = NumpyDataset({image_key: x_train, label_key: y_train})
    eval_data = NumpyDataset({image_key: x_eval, label_key: y_eval})
    return train_data, eval_data
Example #10
def load_data() -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the UCI ML Breast Cancer Wisconsin (Diagnostic) dataset.

    For more information about this dataset and the meaning of the features it contains, see the sklearn documentation.

    Returns:
        (train_data, eval_data)
    """
    (x, y) = load_breast_cancer(return_X_y=True)
    x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.2, random_state=42)
    x_train, x_eval = np.float32(x_train), np.float32(x_eval)
    train_data = NumpyDataset({"x": x_train, "y": y_train})
    eval_data = NumpyDataset({"x": x_eval, "y": y_eval})
    return train_data, eval_data
Example #11
def load_data(image_key: str = "x",
              label_key: str = "y") -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the CIFAR10 dataset.

    Please consider using the ciFAIR10 dataset instead. CIFAR10 contains duplicates between its train and test sets.

    Args:
        image_key: The key for image.
        label_key: The key for label.

    Returns:
        (train_data, eval_data)
    """
    print("\033[93m {}\033[00m".format(
        "FastEstimator-Warn: Consider using the ciFAIR10 dataset instead."))
    (x_train, y_train), (x_eval,
                         y_eval) = tf.keras.datasets.cifar10.load_data()
    train_data = NumpyDataset({image_key: x_train, label_key: y_train})
    eval_data = NumpyDataset({image_key: x_eval, label_key: y_eval})
    return train_data, eval_data
Example #12
def _create_dataset(data_path: str, translate_option: str,
                    extension: str) -> NumpyDataset:
    source, target = translate_option.split("_to_")
    source = source.replace("_", "-")
    if extension != "train":
        source = source.split("-")[0]
    source_data = _read_data(os.path.join(data_path, source + "." + extension))
    target_data = _read_data(os.path.join(data_path, target + "." + extension))
    assert len(target_data) == len(
        source_data), "Sizes do not match for {} ({} mode)".format(
            translate_option, extension)
    dataset = NumpyDataset({"source": source_data, "target": target_data})
    return dataset
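A hypothetical invocation (the directory and language pair below are illustrative only and assume files named "pt-br.train" and "en.train" exist under the given path); note how the "_to_" separator is split and any remaining underscores in the source name become hyphens:

# hypothetical paths and language pair, for illustration only
dataset = _create_dataset("/data/translation", "pt_br_to_en", "train")
print(len(dataset))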
Example #13
def load_data(max_len: int,
              vocab_size: int) -> Tuple[NumpyDataset, NumpyDataset]:
    """Load and return the IMDB Movie review dataset.

    This dataset contains 25,000 reviews, each labeled by sentiment (positive or negative).

    Args:
        max_len: Maximum desired length of an input sequence.
        vocab_size: Vocabulary size to learn word embeddings.

    Returns:
        (train_data, eval_data)
    """
    (x_train, y_train), (x_eval, y_eval) = tf.keras.datasets.imdb.load_data(
        maxlen=max_len, num_words=vocab_size)
    # pad the sequences to max length
    x_train = np.array([pad(x, max_len, 0) for x in x_train])
    x_eval = np.array([pad(x, max_len, 0) for x in x_eval])

    train_data = NumpyDataset({"x": x_train, "y": y_train})
    eval_data = NumpyDataset({"x": x_eval, "y": y_eval})
    return train_data, eval_data
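The pad helper referenced above is not shown here; a minimal sketch of what such a helper might look like (an assumption, not the library's actual implementation):

def pad(sequence, max_len, value):
    # Hypothetical stand-in: right-pad (or truncate) a token-id list to exactly max_len entries.
    sequence = list(sequence)[:max_len]
    return sequence + [value] * (max_len - len(sequence))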
Example #14
def load_data(image_key: str = "x", label_key: str = "y") -> NumpyDataset:
    """Load and return the Sklearn digits dataset.

    Args:
        image_key: The key for image.
        label_key: The key for label.

    Returns:
        A NumpyDataset containing the digit images and their labels.
    """
    ds = datasets.load_digits()
    images = ds.images
    targets = ds.target
    return NumpyDataset({image_key: images, label_key: targets})
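The sklearn digits images are 8x8 grayscale arrays with no channel dimension; if a convolutional model expects one, it could be added before constructing the dataset (a sketch, not part of the loader above):

raw = datasets.load_digits()
images = np.expand_dims(raw.images, axis=-1)  # (1797, 8, 8) -> (1797, 8, 8, 1)
digits_data = NumpyDataset({"x": images, "y": raw.target})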
Example #15
def load_data(root_dir: Optional[str] = None,
              seq_length: int = 100) -> Tuple[NumpyDataset, List[str]]:
    """Load and return the Shakespeare dataset.

    The Shakespeare dataset is a collection of texts written by Shakespeare.
    Sourced from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt

    Args:
        root_dir: The path to store the downloaded data. When `root_dir` is not provided, the data will be saved into
            `fastestimator_data` under the user's home directory.
        seq_length: Length of data sequence.

    Returns:
        (train_data, vocab)
    """
    home = str(Path.home())

    if root_dir is None:
        root_dir = os.path.join(home, 'fastestimator_data', 'Shakespeare')
    else:
        root_dir = os.path.join(os.path.abspath(root_dir), 'Shakespeare')
    os.makedirs(root_dir, exist_ok=True)

    file_path = os.path.join(root_dir, 'shakespeare.txt')
    download_link = 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt'

    if not os.path.exists(file_path):
        # Download
        print("Downloading data: {}".format(file_path))
        wget.download(download_link, file_path, bar=bar_custom)

    with open(file_path, 'rb') as f:
        text_data = f.read().decode(encoding='utf-8')

    # Build dictionary from training data
    vocab = sorted(set(text_data))
    # Creating a mapping from unique characters to indices
    char2idx = {u: i for i, u in enumerate(vocab)}
    text_data = [char2idx[c] for c in text_data
                 ] + [0] * (seq_length - len(text_data) % seq_length)
    text_data = np.array(text_data).reshape(-1, seq_length)
    train_data = NumpyDataset(data={"x": text_data})
    return train_data, vocab
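To map the encoded sequences back to text, the same vocab list can be reversed; a sketch, assuming that indexing the dataset returns a dictionary keyed by "x":

train_data, vocab = load_data(seq_length=100)
idx2char = np.array(vocab)
first_sequence = train_data[0]["x"]  # array of 100 character indices
print("".join(idx2char[first_sequence]))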