Example #1
def get_dataset_testset_double_than_trainset(data_dir_root):
    """

    :param data_dir_root: 
    :return:  
    (2018-12-15, when data_dir_root=UCR_TS_Archive_2015)
        Number of data set: 45
        These data set are:
        ['ArrowHead', 'CBF', 'ChlorineConcentration', 'CinC_ECG_torso', 'DiatomSizeReduction', 
         'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 
         'ECG5000', 'ECGFiveDays', 'Earthquakes', 'FaceAll', 'FaceFour', 'FacesUCR', 'FordA', 'FordB', 
         'Gun_Point', 'HandOutlines', 'InlineSkate', 'InsectWingbeatSound', 'ItalyPowerDemand', 'MALLAT', 
         'MiddlePhalanxOutlineAgeGroup', 'MiddlePhalanxOutlineCorrect', 'MiddlePhalanxTW', 'MoteStrain', 
         'Phoneme', 'ShapeletSim', 'SonyAIBORobotSurface', 'SonyAIBORobotSurfaceII', 'StarLightCurves', 
         'Symbols', 'ToeSegmentation1', 'ToeSegmentation2', 'TwoLeadECG', 'Two_Patterns', 
         'UWaveGestureLibraryAll', 'WordsSynonyms', 'Worms', 'WormsTwoClass', 'uWaveGestureLibrary_X', 
         'uWaveGestureLibrary_Y', 'uWaveGestureLibrary_Z', 'wafer', 'yoga']
    """
    data_name_list = get_data_name_list(data_dir_root)
    res = []
    for fname in data_name_list:
        datasets = ucr.load_ucr(fname, data_dir_root)
        if datasets.test.X.shape[0] >= 2 * datasets.train.X.shape[0]:
            res.append(fname)
    return res
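A minimal usage sketch; the archive path is the one used in Example #11 below and is otherwise an assumption:

if __name__ == '__main__':
    # Hypothetical driver: print the data sets whose test split is at
    # least twice the size of the train split.
    names = get_dataset_testset_double_than_trainset(
        '../../dataset/UCR_TS_Archive_2015')
    print(len(names), names)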
Example #2
def get_dataset_testset_double_than_trainset_for_each_class(data_dir_root):
    """

    :param data_dir_root: 
    :return: 
    (2018-12-15, when data_dir_root=UCR_TS_Archive_2015)
        Number of data set: 33
        These data set are:
        ['ArrowHead', 'CBF', 'ChlorineConcentration', 'CinC_ECG_torso', 'DiatomSizeReduction', 'ECG5000', 
         'ECGFiveDays', 'FacesUCR', 'FordA', 'FordB', 'Gun_Point', 'HandOutlines', 'InlineSkate', 
         'InsectWingbeatSound', 'ItalyPowerDemand', 'MALLAT', 'MoteStrain', 'ShapeletSim', 
         'SonyAIBORobotSurface', 'SonyAIBORobotSurfaceII', 'StarLightCurves', 'Symbols', 
         'ToeSegmentation1', 'TwoLeadECG', 'Two_Patterns', 'UWaveGestureLibraryAll', 'Worms', 
         'WormsTwoClass', 'uWaveGestureLibrary_X', 'uWaveGestureLibrary_Y', 'uWaveGestureLibrary_Z', 
         'wafer', 'yoga']
    """
    data_name_list = get_data_name_list(data_dir_root)
    res = []
    for fname in data_name_list:
        datasets = ucr.load_ucr(fname, data_dir_root)
        distr_train = utils.distribute_y(datasets.train.y)
        distr_test = utils.distribute_y(datasets.test.y)
        is_pass = True
        for key_tr in distr_train.keys():
            num_tr = distr_train[key_tr]
            # .get returns None when the class is absent from the test set,
            # so the check below fires instead of raising a KeyError
            num_te = distr_test.get(key_tr)
            if num_te is None or num_te < 2 * num_tr:
                is_pass = False
                break
        if is_pass:
            res.append(fname)
    return res
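The filter above assumes utils.distribute_y maps each class label to its sample count; a minimal sketch of a helper with that assumed behavior (the real utils implementation may differ):

import numpy as np

def distribute_y(y):
    # Count how many samples carry each class label.
    labels, counts = np.unique(y, return_counts=True)
    return dict(zip(labels, counts))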
Example #3
def get_dataset_testset_larger_than_trainset(data_dir_root):
    """

    :param data_dir_root: 
    :return: 
    (2018-12-15, when data_dir_root=UCR_TS_Archive_2015)
        Number of data set: 79
        These data set are: 
        ['50words', 'Adiac', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'CBF', 'Car', 
         'ChlorineConcentration', 'CinC_ECG_torso', 'Coffee', 'Computers', 'Cricket_X', 
         'Cricket_Y', 'Cricket_Z', 'DiatomSizeReduction', 'DistalPhalanxOutlineAgeGroup', 
         'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW', 'ECG200', 'ECG5000', 'ECGFiveDays', 
         'Earthquakes', 'FISH', 'FaceAll', 'FaceFour', 'FacesUCR', 'FordA', 'FordB', 'Gun_Point', 
         'HandOutlines', 'Haptics', 'Herring', 'InlineSkate', 'InsectWingbeatSound', 'ItalyPowerDemand', 
         'LargeKitchenAppliances', 'Lighting2', 'Lighting7', 'MALLAT', 'Meat', 'MedicalImages', 
         'MiddlePhalanxOutlineAgeGroup', 'MiddlePhalanxOutlineCorrect', 'MiddlePhalanxTW', 
         'MoteStrain', 'NonInvasiveFatalECG_Thorax1', 'NonInvasiveFatalECG_Thorax2', 
         'OSULeaf', 'OliveOil', 'Phoneme', 'Plane', 'ProximalPhalanxTW', 'RefrigerationDevices', 
         'ScreenType', 'ShapeletSim', 'ShapesAll', 'SmallKitchenAppliances', 'SonyAIBORobotSurface', 
         'SonyAIBORobotSurfaceII', 'StarLightCurves', 'Strawberry', 'SwedishLeaf', 'Symbols', 
         'ToeSegmentation1', 'ToeSegmentation2', 'Trace', 'TwoLeadECG', 'Two_Patterns', 
         'UWaveGestureLibraryAll', 'WordsSynonyms', 'Worms', 'WormsTwoClass', 'synthetic_control', 
         'uWaveGestureLibrary_X', 'uWaveGestureLibrary_Y', 'uWaveGestureLibrary_Z', 'wafer', 'yoga']
    """
    data_name_list = get_data_name_list(data_dir_root)
    res = []
    for fname in data_name_list:
        datasets = ucr.load_ucr(fname, data_dir_root)
        if datasets.test.X.shape[0] >= datasets.train.X.shape[0]:
            res.append(fname)
    return res
Example #4
def category_dataset_exact_length(dir_data):
    data_categories_length = {}
    fname_list = ucr.get_data_name_list(dir_data)
    for fname in fname_list:
        data = ucr.load_ucr(fname, dir_data)
        length = data.train.X.shape[1]
        if length not in data_categories_length:
            data_categories_length[length] = []
        data_categories_length[length].append(fname)
    for key in sorted(data_categories_length.keys()):
        print(key, data_categories_length[key])
    return data_categories_length
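The membership test above can also be avoided with collections.defaultdict; a hypothetical equivalent, assuming the same ucr module:

from collections import defaultdict

def category_dataset_exact_length_alt(dir_data):
    # defaultdict creates the empty list on first access, so no
    # explicit "key in dict" check is needed; behavior is otherwise
    # identical to category_dataset_exact_length above.
    data_categories_length = defaultdict(list)
    for fname in ucr.get_data_name_list(dir_data):
        data = ucr.load_ucr(fname, dir_data)
        data_categories_length[data.train.X.shape[1]].append(fname)
    for key in sorted(data_categories_length):
        print(key, data_categories_length[key])
    return dict(data_categories_length)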
Example #5
def get_data_info(data_dir_root,
                  data_name_list=None,
                  out_csv='./dataset_info.csv'):
    """

    :param data_dir_root:   
    :param data_name_list: 
    :param out_csv: 
    :return: 
    """
    if data_name_list is None:
        data_name_list = get_data_name_list(data_dir_root)

    res_col = [
        'DataSet', 'NClass', 'SequenceLength', 'SizeAll', 'SizeTrain',
        'SizeTest', 'SizeValid', 'DistributionAll', 'DistributionTrain',
        'DistributionTest', 'DistributionValid'
    ]
    res_df = pd.DataFrame(columns=res_col)
    for i, fname in enumerate(data_name_list):
        print("preprocessing dataset: {}".format(fname))
        # load data
        data = ucr.load_ucr(fname, data_dir_root)
        if data.valid is None:
            X_all = np.concatenate([data.train.X, data.test.X], axis=0)
            y_all = np.concatenate([data.train.y, data.test.y])
        else:
            X_all = np.concatenate([data.train.X, data.test.X, data.valid.X],
                                   axis=0)
            y_all = np.concatenate([data.train.y, data.test.y, data.valid.y])
        # get the information of specific data set
        res_df.loc[i, 'DataSet'] = fname
        res_df.loc[i, 'NClass'] = len(np.unique(y_all))
        res_df.loc[i, 'SequenceLength'] = X_all.shape[1]
        res_df.loc[i, 'SizeAll'] = X_all.shape[0]
        res_df.loc[i, 'SizeTrain'] = data.train.X.shape[0]
        res_df.loc[i, 'SizeTest'] = data.test.X.shape[0]
        if data.valid is not None:
            res_df.loc[i, 'SizeValid'] = data.valid.X.shape[0]
        res_df.loc[i, 'DistributionAll'] = utils.distribute_y_json(y_all)
        res_df.loc[i, 'DistributionTrain'] = utils.distribute_y_json(
            data.train.y)
        res_df.loc[i, 'DistributionTest'] = utils.distribute_y_json(
            data.test.y)
        if data.valid is not None:
            res_df.loc[i, 'DistributionValid'] = utils.distribute_y_json(
                data.valid.y)

    res_df.to_csv(out_csv)
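get_data_info serializes each label distribution with utils.distribute_y_json; a plausible sketch of that helper, assuming it returns a JSON string (the actual utils code may differ):

import json
import numpy as np

def distribute_y_json(y):
    # Encode the per-class counts as a JSON string so the whole
    # distribution fits into a single CSV cell.
    labels, counts = np.unique(y, return_counts=True)
    return json.dumps({str(k): int(v) for k, v in zip(labels, counts)})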
Example #6
def category_dataset_nclass(data_dir_root):
    name_list = get_data_name_list(data_dir_root)

    names_0_to_10 = []
    names_11_to_30 = []
    names_greater_30 = []
    for fname in name_list:
        dataset = ucr.load_ucr(fname, data_dir_root)
        n_class = dataset.nclass
        if n_class <= 10:
            names_0_to_10.append(fname)
        elif n_class <= 30:
            names_11_to_30.append(fname)
        else:
            names_greater_30.append(fname)
    print("===== Category dataset by the number of class: ")
    print("from 0 to 10: ", len(names_0_to_10), names_0_to_10)
    print("from 11 to 30: ", len(names_11_to_30), names_11_to_30)
    print("greater than 30: ", len(names_greater_30), names_greater_30)
    print()
Example #7
def category_dataset_length(data_dir_root):
    name_list = get_data_name_list(data_dir_root)

    names_0_to_300 = []
    names_301_to_700 = []
    names_greater_700 = []
    for fname in name_list:
        dataset = ucr.load_ucr(fname, data_dir_root)
        length = dataset.train.X.shape[1]
        if length <= 300:
            names_0_to_300.append(fname)
        elif length <= 700:
            names_301_to_700.append(fname)
        else:
            names_greater_700.append(fname)
    print("===== Category dataset by length: ")
    print("from 0 to 300: ", len(names_0_to_300), names_0_to_300)
    print("from 301 to 700: ", len(names_301_to_700), names_301_to_700)
    print("greater than 700: ", len(names_greater_700), names_greater_700)
    print()
Example #8
def category_dataset_testsize(data_dir_root):
    name_list = get_data_name_list(data_dir_root)

    names_0_to_300 = []
    names_301_to_1000 = []
    names_greater_1000 = []
    for fname in name_list:
        dataset = ucr.load_ucr(fname, data_dir_root)
        n = dataset.test.X.shape[0]
        if n <= 300:
            names_0_to_300.append(fname)
        elif n <= 1000:
            names_301_to_1000.append(fname)
        else:
            names_greater_1000.append(fname)
    print("===== Category dataset by test size: ")
    print("from 0 to 300: ", len(names_0_to_300), names_0_to_300)
    print("from 301 to 1000: ", len(names_301_to_1000), names_301_to_1000)
    print("greater than 1000: ", len(names_greater_1000), names_greater_1000)
    print()
Example #9
def z_normalize(in_data_dir_root, out_data_dir_root, data_name_list=None):
    """
    
    :param in_data_dir_root: 
    :param out_data_dir_root: 
    :param data_name_list: 
    :return: 
    """
    if data_name_list is None:
        data_name_list = get_data_name_list(in_data_dir_root)

    for fname in data_name_list:
        # load_ucr returns a datasets object throughout this file, so
        # unpack the splits from its attributes
        datasets = ucr.load_ucr(fname, in_data_dir_root)
        X_train, y_train = datasets.train.X, datasets.train.y
        X_test, y_test = datasets.test.X, datasets.test.y

        X_train_norm = utils.z_normalize(X_train)
        X_test_norm = utils.z_normalize(X_test)

        data_train = np.hstack([y_train[:, np.newaxis], X_train_norm])
        data_test = np.hstack([y_test[:, np.newaxis], X_test_norm])

        out_path = os.path.join(out_data_dir_root, fname)
        if os.path.exists(out_path):
            shutil.rmtree(out_path)
        os.makedirs(out_path)
        np.savetxt(os.path.join(out_path, "{}_TRAIN".format(fname)),
                   data_train,
                   delimiter=',',
                   newline="\n")
        np.savetxt(os.path.join(out_path, "{}_TEST".format(fname)),
                   data_test,
                   delimiter=',',
                   newline="\n")

        print("Finish to process dataset {}".format(fname), data_train.shape,
              data_test.shape)
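utils.z_normalize is assumed to standardize each series to zero mean and unit variance; a minimal sketch under that assumption:

import numpy as np

def z_normalize(X, eps=1e-8):
    # Standardize each row (one time series) independently. The eps
    # term guards against division by zero on constant series; it is
    # an assumption, not necessarily part of the original helper.
    mean = X.mean(axis=1, keepdims=True)
    std = X.std(axis=1, keepdims=True)
    return (X - mean) / (std + eps)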
Example #10
def category_dataset_trainsize(data_dir_root):
    name_list = get_data_name_list(data_dir_root)

    names_0_to_50 = []
    names_51_to_100 = []
    names_101_to_500 = []
    names_greater_500 = []
    for fname in name_list:
        dataset = ucr.load_ucr(fname, data_dir_root)
        n = dataset.train.X.shape[0]
        if n <= 50:
            names_0_to_50.append(fname)
        elif n <= 100:
            names_51_to_100.append(fname)
        elif n <= 500:
            names_101_to_500.append(fname)
        else:
            names_greater_500.append(fname)
    print("===== Category dataset by train size: ")
    print("form 0 to 50: ", len(names_0_to_50), names_0_to_50)
    print("from 51 to 100: ", len(names_51_to_100), names_51_to_100)
    print("from 101 to 500: ", len(names_101_to_500), names_101_to_500)
    print("greater than 500: ", len(names_greater_500), names_greater_500)
    print()
Example #11
from data import utils
from data import ucr

if __name__ == '__main__':
    DATA_ROOT = '../../dataset/UCR_TS_Archive_2015'
    filename = '50words'
    datasets = ucr.load_ucr(filename, DATA_ROOT, one_hot=False)
    X_train = datasets.train.X
    y_train = datasets.train.y
    X_test = datasets.test.X
    y_test = datasets.test.y

    distr = utils.distribute_dataset(X_train, y_train)
    print(distr.keys())
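utils.distribute_dataset is assumed to group the samples by class label, so distr.keys() lists the labels present in the training split; a sketch of that assumed behavior:

import numpy as np

def distribute_dataset(X, y):
    # Map each class label to the rows of X belonging to that class.
    # Sketch only; the real utils.distribute_dataset may differ.
    return {label: X[y == label] for label in np.unique(y)}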