Example #1
# Doctest setup helper in the style of scikit-learn's conftest.py;
# IS_PYPY, check_skip_network, _pkl_filepath, get_data_home and
# CACHE_NAME come from scikit-learn internals imported at module scope.
def setup_working_with_text_data():
    # Skip on PyPy CI runs, in offline environments, and when the
    # cached dataset pickle has not been downloaded yet.
    if IS_PYPY and os.environ.get('CI', None):
        raise SkipTest('Skipping too slow test with PyPy on CI')
    check_skip_network()
    cache_path = _pkl_filepath(get_data_home(), CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
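Helpers like this are dispatched from a pytest hook in scikit-learn's conftest.py. A minimal sketch of that wiring, assuming the standard pytest_runtest_setup hook and a hypothetical doctest file name:

def pytest_runtest_setup(item):
    # Dispatch per collected file; a SkipTest raised inside the helper
    # skips all doctests of that file.
    fname = item.fspath.basename
    if fname == 'working_with_text_data.rst':  # hypothetical file name
        setup_working_with_text_data()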
Example #2
def fetch_maps_piano_dataset(*, data_origin: Optional[str] = None,
                             data_home: Optional[str] = None,
                             preprocessor: Optional[BaseEstimator] = None,
                             force_preprocessing: bool = False,
                             label_type: Literal["pitch", "onset",
                                                 "offset"] = "pitch") \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the MAPS piano dataset from Telecom Paris (classification).

    =================   =====================
    Classes                              TODO
    Samples total                        TODO
    Dimensionality                       TODO
    Features                             TODO
    =================   =====================

    Parameters
    ----------
    data_origin : Optional[str], default=None
        Specify where the original dataset can be found, i.e., the
        directory that contains the raw audio and label files.
    data_home : Optional[str], default=None
        Specify another download and cache folder for the datasets. By
        default, all pyrcn data is stored in '~/pyrcn_data' and all
        scikit-learn data in '~/scikit_learn_data' subfolders.
    preprocessor : Optional[BaseEstimator], default=None
        Estimator for preprocessing the dataset (create features and targets
        from audio and label files).
    force_preprocessing : bool, default=False
        Force preprocessing (label computation and feature extraction).
    label_type : Literal["pitch", "onset", "offset"], default="pitch"
        Type of labels to return. Possible values are pitch labels, or onset
        and offset labels for each pitch. Note that only "pitch" is
        implemented below; other values raise a TypeError.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : Tuple[np.ndarray, np.ndarray,
        np.ndarray, np.ndarray]
        Training and test sequences together with their label sequences.
    """
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'maps.pkz')
    if not exists(filepath) or force_preprocessing:
        print('preprocessing MAPS dataset from %s to %s' %
              (data_origin, data_home))
        train_files = np.loadtxt(join(
            data_origin, Path("mapsSplits/sigtia-conf3-splits/train")),
                                 dtype=object)
        test_files = np.loadtxt(join(
            data_origin, Path("mapsSplits/sigtia-conf3-splits/test")),
                                dtype=object)

        X_train = np.empty(shape=(len(train_files), ), dtype=object)
        X_test = np.empty(shape=(len(test_files), ), dtype=object)
        y_train = np.empty(shape=(len(train_files), ), dtype=object)
        y_test = np.empty(shape=(len(test_files), ), dtype=object)

        for k, f in enumerate(train_files):
            X_train[k] = preprocessor.transform(
                join(data_origin, Path(f + ".wav")))
            y_train[k] = pd.read_csv(join(data_origin, Path(f + ".txt")),
                                     sep="\t")

        for k, f in enumerate(test_files):
            X_test[k] = preprocessor.transform(
                join(data_origin, Path(f + ".wav")))
            y_test[k] = pd.read_csv(join(data_origin, Path(f + ".txt")),
                                    sep="\t")

        joblib.dump([X_train, X_test, y_train, y_test], filepath, compress=6)
    else:
        X_train, X_test, y_train, y_test = joblib.load(filepath)

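    # All sequences must share one common dimension (the feature axis).
    # If the shared dimension sits on axis 0, transpose every sequence
    # so that it ends up as (n_frames, n_features).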
    x_shape_zero = np.unique([X.shape[0]
                              for X in X_train] + [X.shape[0] for X in X_test])
    x_shape_one = np.unique([X.shape[1]
                             for X in X_train] + [X.shape[1] for X in X_test])
    if len(x_shape_zero) == 1 and len(x_shape_one) > 1:
        for k in range(len(X_train)):
            X_train[k] = X_train[k].T
        for k in range(len(X_test)):
            X_test[k] = X_test[k].T
    elif len(x_shape_zero) > 1 and len(x_shape_one) == 1:
        pass
    else:
        raise TypeError("Invalid data format. Expected all sequences to "
                        "share at least one common dimension.")

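    # Only pitch labels are implemented below; "onset" and "offset" are
    # accepted by the signature but currently raise a TypeError.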
    for k in range(len(X_train)):
        if label_type == "pitch":
            y_train[k] = _get_pitch_labels(X_train[k], y_train[k])
        else:
            raise TypeError("Invalid label type.")

    for k in range(len(X_test)):
        if label_type == "pitch":
            y_test[k] = _get_pitch_labels(X_test[k], y_test[k])
        else:
            raise TypeError("Invalid label type.")

    return X_train, X_test, y_train, y_test
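A usage sketch for this fetcher, assuming a hypothetical feature_extractor object that implements transform(path) -> np.ndarray, as the loops above require:

X_train, X_test, y_train, y_test = fetch_maps_piano_dataset(
    data_origin='/path/to/MAPS',       # directory with the raw MAPS data
    data_home=None,                    # cache under the default data home
    preprocessor=feature_extractor,    # hypothetical transformer
    force_preprocessing=False,         # reuse a cached 'maps.pkz' if present
    label_type='pitch',                # the only label type implemented
)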
Example #3
def setup_twenty_newsgroups():
    # Skip the doctests when the cached 20 newsgroups pickle is absent.
    data_home = get_data_home()
    cache_path = _pkl_filepath(data_home, CACHE_NAME)
    if not exists(cache_path):
        raise SkipTest("Skipping dataset loading doctests")
Example #4
def fetch_ptdb_tug_dataset(*, data_origin: Union[str, bytes],
                           data_home: Optional[Union[str, bytes]] = None,
                           preprocessor: Optional[BaseEstimator] = None,
                           augment: Union[int, np.integer] = 0,
                           force_preprocessing: bool = False) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the PTDB-TUG: Pitch Tracking Database from
    Graz University of Technology.

    (classification and regression)

    =================   =====================
    Outputs                                 2
    Samples total                        TODO
    Dimensionality                       TODO
    Features                             TODO
    =================   =====================

    Parameters
    ----------
    data_origin : Union[str, bytes]
        Specify where the original dataset can be found, i.e., the
        directory that contains the raw audio and reference pitch files.
    data_home : Optional[Union[str, bytes]], default=None
        Specify another download and cache folder for the datasets. By
        default, all pyrcn data is stored in '~/pyrcn_data' and all
        scikit-learn data in '~/scikit_learn_data' subfolders.
    preprocessor : Optional[BaseEstimator], default=None
        Estimator for preprocessing the dataset (create features and targets
        from audio and label files).
    augment : Union[int, np.integer], default=0
        Semitone range used for data augmentation.
    force_preprocessing : bool, default=False
        Force preprocessing (label computation and feature extraction).

    Returns
    -------
    (X_train, X_test, y_train, y_test) : Tuple[np.ndarray, np.ndarray,
        np.ndarray, np.ndarray]
        Training and test sequences together with their label sequences.
    """
    data_home = get_data_home(data_home=data_home)
    if not exists(data_home):
        makedirs(data_home)
    filepath = _pkl_filepath(data_home, 'ptdb_tug.pkz')
    if not exists(filepath) or force_preprocessing:
        print('preprocessing PTDB-TUG database from {0} to {1}'.format(
            data_origin, data_home))
        all_training_files = []
        all_test_files = []
        for root, dirs, files in walk(data_origin):
            for f in files:
                if (f.endswith(".wav") and f.startswith("mic")
                        and not re.search(r'\_[0-9]\.wav$', f)
                        and not re.search(r'\_\-[0-9]\.wav$', f)):
                    if "F09" in f or "F10" in f or "M09" in f or "M10" in f:
                        all_test_files.append(join(root, f))
                    else:
                        all_training_files.append(join(root, f))

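        # Expand augment=n into the semitone shifts [-n, ..., -1, 1, ..., n];
        # augment=0 collapses to [0], i.e. no augmentation.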
        if augment > 0:
            augment = list(range(-augment, augment + 1))
            augment.remove(0)
        else:
            augment = [0]
        if len(augment) == 1:
            X_train = np.empty(shape=(len(all_training_files), ), dtype=object)
            y_train = np.empty(shape=(len(all_training_files), ), dtype=object)
        else:
            X_train = np.empty(shape=((1 + len(augment)) *
                                      len(all_training_files), ),
                               dtype=object)
            y_train = np.empty(shape=((1 + len(augment)) *
                                      len(all_training_files), ),
                               dtype=object)
        X_test = np.empty(shape=(len(all_test_files), ), dtype=object)
        y_test = np.empty(shape=(len(all_test_files), ), dtype=object)

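        # Pitch-shifted variants are expected on disk with a "_<shift>" file
        # suffix; their reference F0 values are rescaled by 2 ** (shift / 12)
        # to match the shifted audio.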
        if len(augment) > 1:
            for k, f in enumerate(all_training_files):
                X_train[k] = preprocessor.transform(f)
                y_train[k] = pd.read_csv(f.replace("MIC", "REF").replace(
                    "mic", "ref").replace(".wav", ".f0"),
                                         sep=" ",
                                         header=None)
            for m, st in enumerate(augment):
                for k, f in enumerate(all_training_files):
                    X_train[k + int((m+1) * len(all_training_files))] = \
                        preprocessor.transform(
                            f.replace(".wav", "_" + str(st) + ".wav"))
                    df = pd.read_csv(f.replace("MIC", "REF").replace(
                        "mic", "ref").replace(".wav", ".f0"),
                                     sep=" ",
                                     header=None)
                    df[[0]] = df[[0]] * 2**(st / 12)
                    y_train[k + int((m + 1) * len(all_training_files))] = df
        else:
            for k, f in enumerate(all_training_files):
                X_train[k] = preprocessor.transform(f)
                y_train[k] = pd.read_csv(f.replace("MIC", "REF").replace(
                    "mic", "ref").replace(".wav", ".f0"),
                                         sep=" ",
                                         header=None)
        for k, f in enumerate(all_test_files):
            X_test[k] = preprocessor.transform(f)
            y_test[k] = pd.read_csv(f.replace("MIC", "REF").replace(
                "mic", "ref").replace(".wav", ".f0"),
                                    sep=" ",
                                    header=None)
        joblib.dump([X_train, X_test, y_train, y_test], filepath, compress=6)
    else:
        X_train, X_test, y_train, y_test = joblib.load(filepath)

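    # As in the MAPS fetcher above: bring every sequence into a common
    # orientation, then compute frame-level labels aligned with the features.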
    x_shape_zero = np.unique([x.shape[0]
                              for x in X_train] + [x.shape[0] for x in X_test])
    x_shape_one = np.unique([x.shape[1]
                             for x in X_train] + [x.shape[1] for x in X_test])
    if len(x_shape_zero) == 1 and len(x_shape_one) > 1:
        for k in range(len(X_train)):
            X_train[k] = X_train[k].T
            y_train[k] = _get_labels(X_train[k], y_train[k])
        for k in range(len(X_test)):
            X_test[k] = X_test[k].T
            y_test[k] = _get_labels(X_test[k], y_test[k])
    elif len(x_shape_zero) > 1 and len(x_shape_one) == 1:
        for k in range(len(X_train)):
            y_train[k] = _get_labels(X_train[k], y_train[k])
        for k in range(len(X_test)):
            y_test[k] = _get_labels(X_test[k], y_test[k])
    else:
        raise TypeError("Invalid data format. Expected all sequences to "
                        "share at least one common dimension.")

    return X_train, X_test, y_train, y_test
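A similar sketch for PTDB-TUG with two semitones of augmentation. Note that the fetcher only reads the pitch-shifted files (suffix '_<shift>.wav'); it assumes they already exist next to the originals:

X_train, X_test, y_train, y_test = fetch_ptdb_tug_dataset(
    data_origin='/path/to/PTDB-TUG',   # root directory scanned by walk()
    data_home=None,                    # cache under the default data home
    preprocessor=feature_extractor,    # hypothetical transformer
    augment=2,                         # semitone shifts -2, -1, 1, 2
    force_preprocessing=True,          # rebuild 'ptdb_tug.pkz' from scratch
)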