Example #1
0
def test_loading(univariate_dir="E:/tsc_ts/", multivariate_dir="E:/mtsc_ts/",
                 multivariate_start=16):
    """Smoke-test ``load_ts`` over the known dataset lists.

    Loads every univariate dataset, then the multivariate datasets from
    position ``multivariate_start`` onwards, printing train/test X/Y shapes
    for each. Relies on the module-level ``univariate_datasets`` and
    ``multivariate_datasets`` name lists and the ``load_ts`` loader.

    :param univariate_dir: root directory holding the univariate .ts files
    :param multivariate_dir: root directory holding the multivariate .ts files
    :param multivariate_start: index to resume the multivariate sweep from
        (the original hard-coded 16 is kept as the default)
    """
    # Test univariate datasets.
    for i, dataset in enumerate(univariate_datasets):
        trainX, trainY = load_ts(univariate_dir + dataset + '/' + dataset +
                                 '_TRAIN.ts')
        testX, testY = load_ts(univariate_dir + dataset + '/' + dataset + '_TEST.ts')
        print("Loaded " + dataset + " in position " + str(i))
        print("Train X shape :")
        print(trainX.shape)
        print("Train Y shape :")
        print(trainY.shape)
        print("Test X shape :")
        print(testX.shape)
        print("Test Y shape :")
        print(testY.shape)
    # Test multivariate datasets, resuming from multivariate_start.
    for i in range(multivariate_start, len(multivariate_datasets)):
        dataset = multivariate_datasets[i]
        print("Loading " + dataset + " in position " + str(i) + ".......")
        trainX, trainY = load_ts(multivariate_dir + dataset + '/' + dataset +
                                 '_TRAIN.ts')
        testX, testY = load_ts(multivariate_dir + dataset + '/' + dataset + '_TEST.ts')
        print("Loaded " + dataset)
        print("Train X shape :")
        print(trainX.shape)
        print("Train Y shape :")
        print(trainY.shape)
        print("Test X shape :")
        print(testX.shape)
        print("Test Y shape :")
        print(testY.shape)
Example #2
0
def shapelet_extraction(timeseries_dir, data_dir, data_name, shp_type, seed, l_norm):
    """Fit a contracted shapelet transform on one dataset and write results.

    Fits either the standard or the ordinal contracted shapelet transform on
    the training split, saves the extracted shapelets to CSV, then writes the
    shapelet-transformed train/test sets using the top 90% and 100% of the
    extracted shapelets.

    :param timeseries_dir: root directory holding <data_name>/<data_name>_TRAIN.ts / _TEST.ts
    :param data_dir: root output directory for shapelets and transforms
    :param data_name: dataset name
    :param shp_type: "Standard" selects ContractedShapeletTransform; any other
        value is passed as ``quality`` to ContractedOrdinalShapeletTransform
    :param seed: random_state for the transform
    :param l_norm: l_norm parameter for the ordinal transform
    """
    # BUG FIX: build paths with os.path.join instead of manual '/'
    # concatenation, which produced doubled separators (e.g. "dir//name").
    trainX, trainY = load_ts(os.path.join(timeseries_dir, data_name, data_name + '_TRAIN.ts'))
    testX, testY = load_ts(os.path.join(timeseries_dir, data_name, data_name + '_TEST.ts'))

    # Encode labels as 1..num_classes (LabelEncoder yields 0-based codes).
    le = LabelEncoder()
    trainY = le.fit_transform(trainY) + 1
    testY = le.transform(testY) + 1

    if shp_type == "Standard":
        shp = ContractedShapeletTransform(time_limit_in_mins=0.1, random_state=seed)
    else:
        shp = ContractedOrdinalShapeletTransform(time_limit_in_mins=0.1, quality=shp_type,
                                                 random_state=seed, l_norm=l_norm)
    shp.fit(trainX, trainY)

    shapelets = shp.get_shapelets()
    writeShapeletsToCSV(shapelets, 60,
                        os.path.join(data_dir, data_name, data_name + '_shapelets.csv'))

    # Transform with the top 90% and then 100% of the extracted shapelets.
    for pct in range(90, 110, 10):
        directory = os.path.join(data_dir, data_name, 'transform_' + str(pct))
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Truncate the fitted shapelet list to the requested percentage.
        shp.shapelets = shapelets[:int((len(shapelets) * pct) / 100)]
        train_transform = shp.transform(trainX)
        test_transform = shp.transform(testX)
        train_transform['label'] = trainY
        test_transform['label'] = testY

        train_transform.to_csv(os.path.join(directory, data_name + '_train.0'),
                               header=None, index=None, sep=' ')
        test_transform.to_csv(os.path.join(directory, data_name + '_test.0'),
                              header=None, index=None, sep=' ')
def test_loading():
    """Smoke-test loading of a single univariate dataset (Gunpoint).

    Relies on the module-level ``load_ts`` loader and prints nothing about
    shapes; it only confirms the files load without error.
    """
    data_dir = "E:/tsc_ts/"
    dataset = "Gunpoint"
    trainX, trainY = load_ts(data_dir + dataset + '/' + dataset + '_TRAIN.ts')
    testX, testY = load_ts(data_dir + dataset + '/' + dataset + '_TEST.ts')
    # BUG FIX: the original printed str(i), but no ``i`` exists in this
    # scope, so the function always raised NameError after loading.
    print("Loaded " + dataset)
Example #4
0
                    # presumably the portion of the next interval already
                    # consumed by the overlap — TODO confirm against the
                    # full loop (start of loop is outside this view)
                    current_frame_size = (1 - remaining)

            # if the last frame was lost due to double imprecision
            # (floating-point accumulation can leave the final frame
            # un-emitted), append its mean explicitly
            if current_frame == self.num_intervals - 1:
                frames.append(frame_sum / frame_length)

            # one Series of frame means per input series
            data.append(pd.Series(frames))

        # NOTE(review): only dimension 0 is replaced — this transform
        # appears to assume univariate input; confirm with the full method.
        dims[0] = data

        return dims


if __name__ == "__main__":
    # Manual correctness check for the PAA (Piecewise Aggregate
    # Approximation) transform using the Chinatown dataset.
    # NOTE(review): continues past this view; only this prefix is shown.
    testPath = "C:\\Users\\ajb\\Dropbox\\Data\\TSCProblems\\Chinatown\\Chinatown_TRAIN.ts"
    train_x, train_y = load_ts(testPath)

    print("Correctness testing for PAA using Chinatown")
    #    print("First case used for testing")
    #    print(train_x.iloc[0,0])
    p = PAA()
    # With a single interval, each series should collapse to its mean.
    print("Test 1: num intervals =1, result should be series mean")
    p.set_num_intervals(1)
    x2 = p.transform(train_x)
    print("Correct mean case 1: =  561.875")
    print("Transform mean case 1: =")
    print(x2.iloc[0, 0])
    # With num_intervals equal to the series length (24 here), the
    # transform should leave each series unchanged.
    print("Test 2: num intervals = series length, series should be unchanged")
    p.set_num_intervals(24)
    x2 = p.transform(train_x)
    print("Before transform: =")
Example #5
0
def run_experiment(problem_path,
                   results_path,
                   cls_name,
                   dataset,
                   classifier=None,
                   resampleID=0,
                   overwrite=False,
                   format=".ts",
                   train_file=False):
    """
    Method to run a basic experiment and write the results to files called testFold<resampleID>.csv and, if required,
    trainFold<resampleID>.csv.
    :param problem_path: Location of problem files, full path.
    :param results_path: Location of where to write results. Any required directories will be created
    :param cls_name: determines which classifier to use, as defined in set_classifier. This assumes predict_proba is
    implemented, to avoid predicting twice. May break some classifiers though
    :param dataset: Name of problem. Files must be  <problem_path>/<dataset>/<dataset>+"_TRAIN"+format, same for "_TEST"
    :param classifier: pre-built classifier to use; if None, one is created via set_classifier(cls_name, resampleID)
    :param resampleID: Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name.
    :param overwrite: if set to False, this will only build results if there is not a result file already present. If
    True, it will overwrite anything already there
    :param format: Valid formats are ".ts", ".arff" and ".long". For more info on format, see
    https://github.com/alan-turing-institute/sktime/blob/master/examples/Loading%20Data%20Examples.ipynb
    (name shadows the builtin ``format``; kept for caller compatibility)
    :param train_file: whether to generate train files or not. If true, it performs a 10xCV on the train and saves
    :return: None
    """

    build_test = True
    if not overwrite:
        # Skip whichever of test/train result files already exist on disk.
        full_path = str(results_path) + "/" + str(
            cls_name) + "/Predictions/" + str(dataset) + "/testFold" + str(
                resampleID) + ".csv"
        if os.path.exists(full_path):
            print(
                full_path +
                " Already exists and overwrite set to false, not building Test"
            )
            build_test = False
        if train_file:
            full_path = str(results_path) + "/" + str(
                cls_name) + "/Predictions/" + str(
                    dataset) + "/trainFold" + str(resampleID) + ".csv"
            if os.path.exists(full_path):
                print(
                    full_path +
                    " Already exists and overwrite set to false, not building Train"
                )
                train_file = False
        # Nothing left to build.
        if not train_file and not build_test:
            return

    # TO DO: Automatically differentiate between problem types, currently only works with .ts
    trainX, trainY = load_ts(problem_path + dataset + '/' + dataset +
                             '_TRAIN' + format)
    testX, testY = load_ts(problem_path + dataset + '/' + dataset + '_TEST' +
                           format)

    trainX = _normalise_X(trainX)
    testX = _normalise_X(testX)

    if resampleID != 0:
        # Deterministic, stratified resample of the default split, seeded
        # by resampleID.
        trainX, trainY, testX, testY = stratified_resample(
            trainX, trainY, testX, testY, resampleID)

    # Encode class labels to 0..n_classes-1 using the training labels only.
    le = preprocessing.LabelEncoder()
    le.fit(trainY)
    trainY = le.transform(trainY)
    testY = le.transform(testY)
    if classifier is None:
        classifier = set_classifier(cls_name, resampleID)
    print(cls_name + " on " + dataset + " resample number " + str(resampleID))
    if build_test:
        # TO DO : use sklearn CV
        start = int(round(time.time() * 1000))
        classifier.fit(trainX, trainY)
        build_time = int(round(time.time() * 1000)) - start
        start = int(round(time.time() * 1000))
        probs = classifier.predict_proba(testX)
        preds = classifier.classes_[np.argmax(probs, axis=1)]
        test_time = int(round(time.time() * 1000)) - start
        ac = accuracy_score(testY, preds)
        print(cls_name + " on " + dataset + " resample number " +
              str(resampleID) + ' test acc: ' + str(ac) + ' time: ' +
              str(test_time))
        #        print(str(classifier.findEnsembleTrainAcc(trainX, trainY)))
        if "Composite" in cls_name:
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
        # BUG FIX: str.replace returns a new string; the original discarded
        # the result, so newlines were never actually stripped.
        second = second.replace('\n', ' ').replace('\r', ' ')

        print(second)

        third = str(ac) + "," + str(build_time) + "," + str(
            test_time) + ",-1,-1," + str(len(classifier.classes_))
        write_results_to_uea_format(second_line=second,
                                    third_line=third,
                                    output_path=results_path,
                                    classifier_name=cls_name,
                                    resample_seed=resampleID,
                                    predicted_class_vals=preds,
                                    actual_probas=probs,
                                    dataset_name=dataset,
                                    actual_class_vals=testY,
                                    split='TEST')
    if train_file:
        start = int(round(time.time() * 1000))
        if build_test and hasattr(
                classifier, "_get_train_probs"
        ):  # Normally Can only do this if test has been built ... well not necessarily true, but will do for now
            train_probs = classifier._get_train_probs(trainX)
        else:
            # Estimate train probabilities via 10-fold cross-validation.
            train_probs = cross_val_predict(classifier,
                                            X=trainX,
                                            y=trainY,
                                            cv=10,
                                            method='predict_proba')
        train_time = int(round(time.time() * 1000)) - start
        train_preds = classifier.classes_[np.argmax(train_probs, axis=1)]
        train_acc = accuracy_score(trainY, train_preds)
        print(cls_name + " on " + dataset + " resample number " +
              str(resampleID) + ' train acc: ' + str(train_acc) + ' time: ' +
              str(train_time))
        if "Composite" in cls_name:
            second = "Para info too long!"
        else:
            second = str(classifier.get_params())
        # BUG FIX: assign the replace result (same defect as in the test
        # branch of this function).
        second = second.replace('\n', ' ').replace('\r', ' ')
        third = str(train_acc) + "," + str(train_time) + ",-1,-1,-1," + str(
            len(classifier.classes_))
        write_results_to_uea_format(second_line=second,
                                    third_line=third,
                                    output_path=results_path,
                                    classifier_name=cls_name,
                                    resample_seed=resampleID,
                                    predicted_class_vals=train_preds,
                                    actual_probas=train_probs,
                                    dataset_name=dataset,
                                    actual_class_vals=trainY,
                                    split='TRAIN')
Example #6
0
     run_experiment(problem_path=data_dir,
                    results_path=results_dir,
                    cls_name=classifier,
                    dataset=dataset,
                    resampleID=resample,
                    train_file=tf)
 else:  #Local run
     # Hard-coded local paths for debugging runs; alternative locations are
     # kept commented out below.
     #        data_dir = "/scratch/univariate_datasets/"
     #        results_dir = "/scratch/results"
     data_dir = "/bench/datasets/Univariate2018/"
     results_dir = "C:/Users/ajb/Dropbox/Turing Project/Results/"
     # data_dir = "Z:/ArchiveData/Univariate_ts/"
     # results_dir = "E:/Temp/"
     #        results_dir = "Z:/Results/sktime Bakeoff/"
     dataset = "ItalyPowerDemand"
     # NOTE(review): trainX/trainY/testX/testY loaded here appear unused —
     # run_experiment reloads the data from problem_path itself; confirm.
     trainX, trainY = load_ts(data_dir + dataset + '/' + dataset +
                              '_TRAIN.ts')
     testX, testY = load_ts(data_dir + dataset + '/' + dataset + '_TEST.ts')
     classifier = "TSF"
     resample = 1
     #         for i in range(0, len(univariate_datasets)):
     #             dataset = univariate_datasets[i]
     # #            print(i)
     # #            print(" problem = "+dataset)
     tf = False
     # Single-dataset experiment run; no train-fold file is produced.
     run_experiment(overwrite=True,
                    problem_path=data_dir,
                    results_path=results_dir,
                    cls_name=classifier,
                    dataset=dataset,
                    resampleID=resample,
                    train_file=tf)
Example #7
0
    """
    def predict(self, X, input_checks=True):
        # Accept either a 2d numpy array or a single-column pandas DataFrame
        # whose cells are pd.Series (nested format); multivariate input
        # (more than one column) is rejected.
        if isinstance(X, pd.DataFrame):
            if X.shape[1] > 1:
                raise TypeError("ShapeDTW cannot handle multivariate problems yet")
            elif isinstance(X.iloc[0,0], pd.Series):
                # Flatten the nested DataFrame into a 2d array of raw values.
                X = np.asarray([a.values for a in X.iloc[:,0]])
            else:
                raise TypeError("Input should either be a 2d numpy array, or a pandas dataframe with a single column of Series objects (ShapeDTW cannot yet handle multivariate problems")
                
        # Record the series length; the actual prediction logic is not
        # implemented yet — this method is a placeholder.
        n_samps, self.series_length = X.shape
        pass 
        
if __name__ == "__main__":
    # Ad-hoc exploration script: load Chinatown and flatten the nested
    # DataFrame into a 2d numpy array of raw series values.
    # NOTE(review): continues past this view; only this prefix is shown.
    testPath="C:\\Users\\Vince\\Documents\\Dissertation Repositories\\datasets\\Univariate2018_ts\\Chinatown\\Chinatown_TRAIN.ts"
    trainData,trainDataClass =  load_ts(testPath)

    num_atts = trainData.shape[1]
    num_insts = trainData.shape[0]
    
    # Same conversion logic as ShapeDTW.predict: accept a single-column
    # nested DataFrame, reject multivariate input.
    if isinstance(trainData, pd.DataFrame):
            if trainData.shape[1] > 1:
                raise TypeError("ShapeDTW cannot handle multivariate problems yet")
            elif isinstance(trainData.iloc[0,0], pd.Series):
                trainData = np.asarray([a.values for a in trainData.iloc[:,0]])
            else:
                raise TypeError("Input should either be a 2d numpy array, or a pandas dataframe with a single column of Series objects (ShapeDTW cannot yet handle multivariate problems")


    # First two training series as plain 1d arrays.
    first = trainData[0,:]
    second = trainData[1,:]
Example #8
0
         results_path=results_dir,
         cls_name=classifier,
         dataset=dataset,
         resampleID=resample,
         train_file=tf,
     )
 else:  # Local run
     # Hard-coded local paths for debugging runs; alternative locations are
     # kept commented out below.
     #        data_dir = "/scratch/univariate_datasets/"
     #        results_dir = "/scratch/results"
     #         data_dir = "/bench/datasets/Univariate2018/"
     #         results_dir = "C:/Users/ajb/Dropbox/Turing Project/Results/"
     data_dir = "Z:/ArchiveData/Univariate_ts/"
     results_dir = "E:/Temp/"
     #        results_dir = "Z:/Results/sktime Bakeoff/"
     dataset = "GunPoint"
     # NOTE(review): trainX/trainY/testX/testY loaded here appear unused —
     # run_experiment reloads the data from problem_path itself; confirm.
     trainX, trainY = load_ts(data_dir + dataset + "/" + dataset +
                              "_TRAIN.ts")
     testX, testY = load_ts(data_dir + dataset + "/" + dataset + "_TEST.ts")
     classifier = "TDE"
     resample = 0
     #         for i in range(0, len(univariate_datasets)):
     #             dataset = univariate_datasets[i]
     # #            print(i)
     # #            print(" problem = "+dataset)
     tf = False
     # Single-dataset experiment run (call continues past this view).
     run_experiment(
         overwrite=True,
         problem_path=data_dir,
         results_path=results_dir,
         cls_name=classifier,
         dataset=dataset,
         resampleID=resample,