Ejemplo n.º 1
0
def load_plaid(split=None, return_X_y=True):
    """Load the PLAID (Plug Load Appliance Identification Dataset) problem.

    A univariate classification problem whose series have unequal length. It has
    11 classes and n cases, where n = 1074 (if split is None) or 537 (if split is
    "train"/"test"). The series length m varies from case to case.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, features and target are returned separately, otherwise a single
        dataframe with columns for the features and the target is returned.

    Returns
    -------
    X : pandas DataFrame with n rows and 1 column
        Each cell is a pd.Series containing one time series.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     univariate
    Series length:      Variable
    Train cases:        537
    Test cases:         537
    Number of classes:  11
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(name="PLAID", split=split, return_X_y=return_X_y)
Ejemplo n.º 2
0
def load_basic_motions(split=None, return_X_y=True, return_type=None):
    """Load the BasicMotions time series classification problem.

    An equal length multivariate problem with 4 classes and n cases, where
    n = 80 (if split is None) or 40 (if split is "train"/"test"). Every series
    has length m = 100.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, the time series and the target are returned separately as X
        and y, otherwise a single data structure is returned.
    return_type : None or str{"numpy3d", "nested_univ"}, optional (default=None)
        Controls the returned data structure. The 2D options "numpy2d" and
        "numpyflat" are rejected because the problem is multivariate.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy3d" a 3D numpy array is returned; if it is
        "nested_univ" or None a nested pandas DataFrame is returned, where each
        cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Raises
    ------
    ValueError
        If return_type is "numpy2d" or "numpyflat".

    Notes
    -----
    Dimensionality:     multivariate, 6
    Series length:      100
    Train cases:        40
    Test cases:         40
    Number of classes:  4

    The data was generated as part of a student project where four students
    performed four activities whilst wearing a smart watch. The watch collects
    3D accelerometer and 3D gyroscope data. The four classes are walking,
    resting, running and badminton. Participants were required to record each
    motion a total of five times, and the data is sampled once every tenth of a
    second, for a ten second period.

    Dataset details: http://www.timeseriesclassification.com/description.php?Dataset
    =BasicMotions
    """
    name = "BasicMotions"
    # Guard clause: a flat 2D array cannot hold a multivariate problem, so both
    # 2D return types are refused before any loading is attempted.
    if return_type in ("numpy2d", "numpyflat"):
        raise ValueError(
            f"{name} loader: Error, attempting to load into a numpy2d "
            f"array, but cannot because it is a multivariate problem. Use "
            f"numpy3d instead")
    return _load_provided_dataset(
        name=name, split=split, return_X_y=return_X_y, return_type=return_type
    )
Ejemplo n.º 3
0
def load_acsf1(split=None, return_X_y=True, return_type=None):
    """Load the ACSF1 dataset on power consumption of typical appliances.

    An equal length univariate time series classification problem with 10
    classes and n cases, where n = 200 (if split is None) or 100 (if split is
    "train"/"test"). Every series has length m = 1460.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, the time series and the target are returned separately as X
        and y, otherwise a single data structure is returned.
    return_type : None or str{"numpy2d", "numpyflat", "numpy3d", "nested_univ"},
        optional (default=None). Controls the returned data structure.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy2d"/"numpyflat" a 2D numpy array of shape (n,m)
        is returned, if "numpy3d" a 3D numpy array of shape (n,1,m), and if
        "nested_univ" or None a nested pandas DataFrame of shape (n,1), where
        each cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     univariate
    Series length:      1460
    Train cases:        100
    Test cases:         100
    Number of classes:  10

    The dataset contains the power consumption of typical appliances. The
    recordings are characterized by long idle periods and some high bursts of
    energy consumption when the appliance is active. The classes correspond to
    10 categories of home appliances: mobile phones (via chargers), coffee
    machines, computer stations (including monitor), fridges and freezers,
    Hi-Fi systems (CD players), lamp (CFL), laptops (via chargers), microwave
    ovens, printers, and televisions (LCD or LED).

    Dataset details:
    http://www.timeseriesclassification.com/description.php?Dataset=ACSF1
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="ACSF1", split=split, return_X_y=return_X_y, return_type=return_type
    )
Ejemplo n.º 4
0
def load_arrow_head(split=None, return_X_y=True, return_type=None):
    """Load the ArrowHead dataset of arrow head shape outlines.

    An equal length univariate time series classification problem with three
    classes and n cases, where n = 211 (if split is None) or 36/175 (if split is
    "train"/"test"). Every series has length m = 251.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, the time series and the target are returned separately as X
        and y, otherwise a single data structure is returned.
    return_type : None or str{"numpy2d", "numpyflat", "numpy3d", "nested_univ"},
        optional (default=None). Controls the returned data structure.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy2d"/"numpyflat" a 2D numpy array of shape (n,m)
        is returned, if "numpy3d" a 3D numpy array of shape (n,1,m), and if
        "nested_univ" or None a nested pandas DataFrame of shape (n,1), where
        each cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     univariate
    Series length:      251
    Train cases:        36
    Test cases:         175
    Number of classes:  3

    The arrowhead data consists of outlines of the images of arrowheads. The
    shapes of the projectile points are converted into a time series using the
    angle based method. The classification of projectile points is an important
    topic in anthropology. The classes are based on shape distinctions such as
    the presence and location of a notch in the arrow. The problem in the
    repository is a length normalised version of the one used in the original
    publication. The three classes are called "Avonlea", "Clovis" and "Mix".

    Dataset:
    http://timeseriesclassification.com/description.php?Dataset=ArrowHead
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="ArrowHead", split=split, return_X_y=return_X_y, return_type=return_type
    )
Ejemplo n.º 5
0
def load_gunpoint(split=None, return_X_y=True, return_type=None):
    """Load the GunPoint data of two actors making a motion with their hand.

    An equal length univariate time series classification problem with two
    classes and n cases, where n = 200 (if split is None) or 50/150 (if split is
    "train"/"test"). Every series has length m = 150.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, features and target are returned separately, otherwise a
        concatenated data structure is returned.
    return_type : None or str{"numpy2d", "numpyflat", "numpy3d", "nested_univ"},
        optional (default=None). Controls the returned data structure.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy2d"/"numpyflat" a 2D numpy array of shape (n,m)
        is returned, if "numpy3d" a 3D numpy array of shape (n,1,m), and if
        "nested_univ" or None a nested pandas DataFrame of shape (n,1), where
        each cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     univariate
    Series length:      150
    Train cases:        50
    Test cases:         150
    Number of classes:  2

    This dataset involves one female actor and one male actor making a motion
    with their hand. The two classes are Gun-Draw and Point. For Gun-Draw the
    actors have their hands by their sides. They draw a replicate gun from a
    hip-mounted holster, point it at a target for approximately one second, then
    return the gun to the holster, and their hands to their sides. For Point the
    actors have their gun by their sides. They point with their index fingers to
    a target for approximately one second, and then return their hands to their
    sides. For both classes, the centroid of the actor's right hand was tracked
    in both X- and Y-axes, which appear to be highly correlated. The data in the
    archive is just the X-axis.

    Dataset details:
    http://timeseriesclassification.com/description.php?Dataset=GunPoint
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="GunPoint", split=split, return_X_y=return_X_y, return_type=return_type
    )
Ejemplo n.º 6
0
def load_osuleaf(split=None, return_X_y=True, return_type=None):
    """Load the OSULeaf data set of one dimensional leaf outlines.

    An equal length univariate time series classification problem with six
    classes and n cases, where n = 442 (if split is None) or 200/242 (if split
    is "train"/"test"). Every series has length m = 427.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, features and target are returned separately, otherwise a
        concatenated data structure is returned.
    return_type : None or str{"numpy2d", "numpyflat", "numpy3d", "nested_univ"},
        optional (default=None). Controls the returned data structure.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy2d"/"numpyflat" a 2D numpy array of shape (n,m)
        is returned, if "numpy3d" a 3D numpy array of shape (n,1,m), and if
        "nested_univ" or None a nested pandas DataFrame of shape (n,1), where
        each cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     univariate
    Series length:      427
    Train cases:        200
    Test cases:         242
    Number of classes:  6

    The OSULeaf data set consists of one dimensional outlines of leaves. The
    series were obtained by color image segmentation and boundary extraction (in
    the anti-clockwise direction) from digitized leaf images of six classes:
    Acer Circinatum, Acer Glabrum, Acer Macrophyllum, Acer Negundo, Quercus
    Garryana and Quercus Kelloggii, for the MSc thesis "Content-Based Image
    Retrieval: Plant Species Identification" by A Grandhi.

    Dataset details:
    http://www.timeseriesclassification.com/description.php?Dataset=OSULeaf
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="OSULeaf", split=split, return_X_y=return_X_y, return_type=return_type
    )
Ejemplo n.º 7
0
def load_japanese_vowels(split=None, return_X_y=True):
    """Load the JapaneseVowels audio problem.

    A multivariate, unequal length time series classification problem with nine
    classes and n cases, where n = 640 (if split is None) or 270/370 (if split
    is "train"/"test"). The series length m varies from case to case.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, features and target are returned separately, otherwise a single
        dataframe with columns for the features and the target is returned.

    Returns
    -------
    X : pandas DataFrame with n rows and c columns
        The time series data for the problem with n cases and c dimensions.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     multivariate, 12
    Series length:      7-29
    Train cases:        270
    Test cases:         370
    Number of classes:  9

    A UCI Archive dataset. 9 Japanese-male speakers were recorded saying the
    vowels 'a' and 'e'. A '12-degree linear prediction analysis' is applied to
    the raw recordings to obtain time-series with 12 dimensions and series
    lengths between 7 and 29. The classification task is to predict the speaker.
    Therefore, each instance is a transformed utterance, 12*29 values with a
    single class label attached, [1...9]. The given training set is comprised of
    30 utterances for each speaker, however the test set has a varied
    distribution based on external factors of timing and experimental
    availability, between 24 and 88 instances per speaker.

    Reference: M. Kudo, J. Toyama and M. Shimbo. (1999). "Multidimensional
    Curve Classification Using Passing-Through Regions". Pattern Recognition
    Letters, Vol. 20, No. 11--13, pages 1103--1111.

    Dataset details:
    http://timeseriesclassification.com/description.php?Dataset=JapaneseVowels
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="JapaneseVowels", split=split, return_X_y=return_X_y
    )
Ejemplo n.º 8
0
def load_unit_test(split=None, return_X_y=True, return_type=None):
    """Load the UnitTest data.

    An equal length univariate time series classification problem. It is a
    stripped down version of the ChinaTown problem that is used in correctness
    tests for classification. It has two classes and n cases, where n = 42 (if
    split is None) or 20/22 (if split is "train"/"test"). Every series has
    length m = 24.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, features and target are returned separately, otherwise a
        concatenated data structure is returned.
    return_type : None or str{"numpy2d", "numpyflat", "numpy3d", "nested_univ"},
        optional (default=None). Controls the returned data structure.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy2d"/"numpyflat" a 2D numpy array of shape (n,m)
        is returned, if "numpy3d" a 3D numpy array of shape (n,1,m), and if
        "nested_univ" or None a nested pandas DataFrame of shape (n,1), where
        each cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    This is the Chinatown problem with a smaller test set, useful for rapid
    tests.

    Dimensionality:     univariate
    Series length:      24
    Train cases:        20
    Test cases:         22 (full dataset has 345)
    Number of classes:  2

    See
    http://timeseriesclassification.com/description.php?Dataset=Chinatown
    for the full dataset.
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="UnitTest", split=split, return_X_y=return_X_y, return_type=return_type
    )
Ejemplo n.º 9
0
def load_italy_power_demand(split=None, return_X_y=True, return_type=None):
    """Load 12 monthly electrical power demand time series from Italy.

    An equal length univariate time series classification problem with two
    classes and n cases, where n = 1096 (if split is None) or 67/1029 (if split
    is "train"/"test"). Every series has length m = 24.

    Parameters
    ----------
    split : None or str{"train", "test"}, optional (default=None)
        Which partition of the problem to load. By default both the train and
        the test partition are loaded.
    return_X_y : bool, optional (default=True)
        If True, features and target are returned separately, otherwise a
        concatenated data structure is returned.
    return_type : None or str{"numpy2d", "numpyflat", "numpy3d", "nested_univ"},
        optional (default=None). Controls the returned data structure.

    Returns
    -------
    X : The time series data for the problem.
        If return_type is "numpy2d"/"numpyflat" a 2D numpy array of shape (n,m)
        is returned, if "numpy3d" a 3D numpy array of shape (n,1,m), and if
        "nested_univ" or None a nested pandas DataFrame of shape (n,1), where
        each cell is a pd.Series of length m.
    y : (optional) numpy array shape (n,1)
        The class labels for each case in X. If return_X_y is False, y is
        appended to X.

    Notes
    -----
    Dimensionality:     univariate
    Series length (m):  24
    Train cases:        67
    Test cases:         1029
    Number of classes:  2

    The data was first used in the paper "Intelligent Icons: Integrating
    Lite-Weight Data Mining and Visualization into GUI Operating Systems". The
    classification task is to distinguish days from Oct to March (inclusive)
    from April to September.

    Dataset details:
    http://timeseriesclassification.com/description.php?Dataset=ItalyPowerDemand
    """
    # The loading itself is delegated to the shared helper for bundled datasets.
    return _load_provided_dataset(
        name="ItalyPowerDemand",
        split=split,
        return_X_y=return_X_y,
        return_type=return_type,
    )