Ejemplo n.º 1
0
def load(dataset_url, out_train_dataset_path, out_val_dataset_path,
         out_meta_tsv_path, validation_split):
    '''
        Downloads a dataset in the format of the Penn Treebank sample
        (see http://www.nltk.org/nltk_data/) and converts it to the dataset
        type `CORPUS` for the task `POS_TAGGING`.

        Does nothing if all three output files already exist locally.

        :param str dataset_url: URL to download the dataset stored in the format similar to the Penn Treebank sample
        :param str out_train_dataset_path: Path to save the output train dataset file
        :param str out_val_dataset_path: Path to save the output validation dataset file
        :param str out_meta_tsv_path: Path to save the output dataset metadata .TSV file
        :param float validation_split: Proportion (0-1) to carve out validation dataset from the original dataset
    '''
    out_paths = (out_train_dataset_path, out_val_dataset_path,
                 out_meta_tsv_path)
    if all(os.path.exists(path) for path in out_paths):
        print('Dataset already loaded in local filesystem - skipping...')
        return

    dataset_path = download_dataset_from_url(dataset_url)

    print('Loading dataset and writing to output dataset files...')
    _convert_dataset(dataset_path, out_meta_tsv_path, out_train_dataset_path,
                     out_val_dataset_path, validation_split)

    print('Dataset metadata file is saved at {}'.format(out_meta_tsv_path))
    print(
        'Train dataset file is saved at {}. This should be submitted as `train_dataset` of a train job.'
        .format(out_train_dataset_path))
    print(
        'Validation dataset file is saved at {}. This should be submitted as `val_dataset` of a train job.'
        .format(out_val_dataset_path))
Ejemplo n.º 2
0
def load(dataset_url,
         label_to_name,
         out_train_dataset_path,
         out_val_dataset_path,
         out_test_dataset_path,
         out_meta_csv_path,
         validation_split,
         limit=None):
    '''
        Loads and converts an image dataset of the CIFAR format for IMAGE_CLASSIFICATION.
        Refer to https://www.cs.toronto.edu/~kriz/cifar.html for the CIFAR dataset format.

        :param str dataset_url: URL to download the Python version of the dataset
        :param dict[int, str] label_to_name: Dictionary mapping label index to label name
        :param str out_train_dataset_path: Path to save the output train dataset file
        :param str out_val_dataset_path: Path to save the output validation dataset file
        :param str out_test_dataset_path: Path to save the output test dataset file
        :param str out_meta_csv_path: Path to save the output dataset metadata .CSV file
        :param float validation_split: Proportion (0-1) to carve out validation dataset from the original train dataset
        :param int limit: Maximum number of samples for each dataset (for purposes of development)
    '''
    # Skip all work only if *every* output file already exists.
    # FIX: `out_test_dataset_path` is now included in this check — it was
    # previously omitted, so a missing test dataset file was never regenerated.
    out_paths = [
        out_train_dataset_path, out_val_dataset_path, out_test_dataset_path,
        out_meta_csv_path
    ]
    if all(os.path.exists(x) for x in out_paths):
        print('Dataset already loaded in local filesystem - skipping...')
        return

    print('Downloading dataset archive...')
    dataset_zip_file_path = download_dataset_from_url(dataset_url)

    print('Loading datasets into memory...')
    (train_images, train_labels, test_images,
     test_labels) = _load_dataset_from_zip_file(dataset_zip_file_path)
    # Carve the validation set out of the original train set
    (train_images, train_labels, val_images,
     val_labels) = _split_train_dataset(train_images, train_labels,
                                        validation_split)

    print('Converting and writing datasets...')

    # Build the label index over every label seen in train + test
    label_to_index = _write_meta_csv(chain(train_labels, test_labels),
                                     label_to_name, out_meta_csv_path)
    print('Dataset metadata file is saved at {}'.format(out_meta_csv_path))

    _write_dataset(train_images, train_labels, label_to_index,
                   out_train_dataset_path, limit)
    print(
        'Train dataset file is saved at {}. This should be submitted as `train_dataset` of a train job.'
        .format(out_train_dataset_path))

    _write_dataset(val_images, val_labels, label_to_index,
                   out_val_dataset_path, limit)
    print(
        'Validation dataset file is saved at {}. This should be submitted as `val_dataset` of a train job.'
        .format(out_val_dataset_path))

    _write_dataset(test_images, test_labels, label_to_index,
                   out_test_dataset_path, limit)
    print('Test dataset file is saved at {}'.format(out_test_dataset_path))
Ejemplo n.º 3
0
def maybe_download(archive_name, target_dir, archive_url):
    '''
        Returns a local path to the dataset archive, downloading it only if
        no copy already exists at `target_dir/archive_name`.

        :param str archive_name: File name the archive is expected to have within `target_dir`
        :param str target_dir: Directory expected to contain the archive (created if missing)
        :param str archive_url: URL to download the archive from when it is missing
        :returns: Path to the local archive file
    '''
    archive_path = os.path.join(target_dir, archive_name)

    if not os.path.exists(target_dir):
        print('No path "%s" - creating ...' % target_dir)
        # FIX: exist_ok=True avoids a FileExistsError if the directory is
        # created concurrently between the exists() check and makedirs().
        os.makedirs(target_dir, exist_ok=True)

    if not os.path.exists(archive_path):
        print('No archive "%s" - downloading...' % archive_path)
        # NOTE(review): download_dataset_from_url chooses its own save
        # location, so the returned path may differ from
        # `target_dir/archive_name` — confirm against the helper.
        archive_path = download_dataset_from_url(archive_url)
    else:
        print('Found archive "%s" - not downloading.' % archive_path)
    return archive_path
Ejemplo n.º 4
0
def load(train_images_url, train_labels_url, test_images_url, test_labels_url,
         label_to_name, out_train_dataset_path, out_val_dataset_path,
         out_test_dataset_path, out_meta_csv_path, validation_split,
         limit=None):
    '''
        Loads and converts an image dataset of the MNIST format for IMAGE_CLASSIFICATION.
        Refer to http://yann.lecun.com/exdb/mnist/ for the MNIST dataset format.

        :param str train_images_url: URL to download the training set images stored in the MNIST format
        :param str train_labels_url: URL to download the training set labels stored in the MNIST format
        :param str test_images_url: URL to download the test set images stored in the MNIST format
        :param str test_labels_url: URL to download the test set labels stored in the MNIST format
        :param dict[int, str] label_to_name: Dictionary mapping label index to label name
        :param str out_train_dataset_path: Path to save the output train dataset file
        :param str out_val_dataset_path: Path to save the output validation dataset file
        :param str out_test_dataset_path: Path to save the output test dataset file
        :param str out_meta_csv_path: Path to save the output dataset metadata .CSV file
        :param float validation_split: Proportion (0-1) to carve out validation dataset from the original train dataset
        :param int limit: Maximum number of samples for each dataset (for purposes of development)
    '''
    # Skip all work only if *every* output file already exists.
    # FIX: `out_test_dataset_path` is now included in this check — it was
    # previously omitted, so a missing test dataset file was never regenerated.
    out_paths = [
        out_train_dataset_path, out_val_dataset_path, out_test_dataset_path,
        out_meta_csv_path
    ]
    if all(os.path.exists(x) for x in out_paths):
        print('Dataset already loaded in local filesystem - skipping...')
        return

    train_images_file_path = download_dataset_from_url(train_images_url)
    train_labels_file_path = download_dataset_from_url(train_labels_url)
    test_images_file_path = download_dataset_from_url(test_images_url)
    test_labels_file_path = download_dataset_from_url(test_labels_url)

    print('Loading datasets into memory...')
    (train_images,
     train_labels) = _load_dataset_from_files(train_images_file_path,
                                              train_labels_file_path)
    (test_images,
     test_labels) = _load_dataset_from_files(test_images_file_path,
                                             test_labels_file_path)
    # Carve the validation set out of the original train set
    (train_images, train_labels, val_images,
     val_labels) = _split_train_dataset(train_images, train_labels,
                                        validation_split)

    print('Converting and writing datasets...')

    # Build the label index over every label seen in train + test
    label_to_index = _write_meta_csv(chain(train_labels, test_labels),
                                     label_to_name, out_meta_csv_path)
    print('Dataset metadata file is saved at {}'.format(out_meta_csv_path))

    _write_dataset(train_images, train_labels, label_to_index,
                   out_train_dataset_path, limit)
    print(
        'Train dataset file is saved at {}. This should be submitted as `train_dataset` of a train job.'
        .format(out_train_dataset_path))

    _write_dataset(val_images, val_labels, label_to_index, out_val_dataset_path,
                   limit)
    print(
        'Validation dataset file is saved at {}. This should be submitted as `val_dataset` of a train job.'
        .format(out_val_dataset_path))

    _write_dataset(test_images, test_labels, label_to_index,
                   out_test_dataset_path, limit)
    print('Test dataset file is saved at {}'.format(out_test_dataset_path))