Code example #1
    def execute(self, training, op_exec):
        """
        Read the training data and dispatch the requested operation of the preprocessing tree.
        """
        data_op = prep.read_from_file(training)
        print(data_op)
        ts = prep.impute_missing_data(data_op)

        return_value = None
        if op_exec == "denoise":
            return_value = prep.impute_missing_data(ts)
        elif op_exec == "impute_missing_data":
            return_value = prep.impute_missing_data(ts)
        elif op_exec == "impute_outliers":
            return_value = prep.impute_outliers(ts)
        elif op_exec == "longest_continuous_run":
            return_value = prep.longest_continuous_run(ts)
        elif op_exec == "clip":
            return_value = prep.denoise(ts, self.starting_date,
                                        self.starting_date, self.final_date)
        elif op_exec == "assign_time":
            return_value = prep.assign_time(ts, self.starting_date,
                                            self.increment)
        elif op_exec == "difference":
            return_value = prep.difference(ts)
        elif op_exec == "scaling":
            return_value = prep.scaling(ts)
        elif op_exec == "standardize":
            return_value = prep.standardize(ts)
        elif op_exec == "logarithm":
            return_value = prep.logarithm(ts)
        elif op_exec == "cubic_roots":
            return_value = prep.cubic_roots(ts)
        return return_value
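
As a side note, the if/elif chain above can be collapsed into a lookup table. A minimal sketch, assuming the same prep module and the same instance attributes as in the example (the clip entry mirrors the hedged fix above):

# Dispatch table equivalent of the if/elif chain (sketch; assumes the `prep` module used above).
OPS = {
    "denoise":                lambda self, ts: prep.denoise(ts),
    "impute_missing_data":    lambda self, ts: prep.impute_missing_data(ts),
    "impute_outliers":        lambda self, ts: prep.impute_outliers(ts),
    "longest_continuous_run": lambda self, ts: prep.longest_continuous_run(ts),
    "clip":                   lambda self, ts: prep.clip(ts, self.starting_date, self.final_date),
    "assign_time":            lambda self, ts: prep.assign_time(ts, self.starting_date, self.increment),
    "difference":             lambda self, ts: prep.difference(ts),
    "scaling":                lambda self, ts: prep.scaling(ts),
    "standardize":            lambda self, ts: prep.standardize(ts),
    "logarithm":              lambda self, ts: prep.logarithm(ts),
    "cubic_roots":            lambda self, ts: prep.cubic_roots(ts),
}

def execute_op(self, ts, op_exec):
    # Look up the handler for the requested operation; return None for unknown names.
    handler = OPS.get(op_exec)
    return handler(self, ts) if handler is not None else None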
Code example #2
File: classification.py  Project: pechom/Diplomka
def check_selections():
    sys.stdout = open(classification_selected_dir + 'classification_selected.txt', 'w')
    files = glob.glob(selected_dir + '*')  # list the files inside the directory, mirroring the standardized branch below
    labels = np.loadtxt(labels_path, delimiter=',', skiprows=1, dtype=np.uint8)
    for file in files:
        data = np.loadtxt(file, delimiter=',', skiprows=1, dtype=np.uint64)
        print(os.path.basename(file)[:-4] + ": " + str(len(data[0])))
        tree_methods(data, labels)
        print("------------------------------------------------------------")
        print('\n')
    preprocessing.standardize(selected_dir, standard_selected_dir, True)  # also saves the scaler for each selection
    files = glob.glob(standard_selected_dir + '*')
    for file in files:
        standard_data = np.loadtxt(file, delimiter=',', skiprows=1, dtype=np.float64)
        print(os.path.basename(file)[:-4] + ": " + str(len(standard_data[0])))
        svm_methods(standard_data, labels)
        print("------------------------------------------------------------")
        print('\n')
    sys.stdout.close()
Code example #3
def load(window_size):
    X_train = fio.load_file(X_train_dataset)
    Y_train = fio.load_file(Y_train_dataset)
    X_test = fio.load_file(X_test_dataset)
    Y_test = fio.load_file(Y_test_dataset)
    train_sample = fio.load_sample_file(train_sample_dataset)
    valid_sample = fio.load_sample_file(valid_sample_dataset)

    stat = pc.get_feat_stat(X_train)

    X_train = pc.standardize(X_train, stat)
    X_test = pc.standardize(X_test, stat)

    X_train = pc.expand(X_train, window_size)
    X_test = pc.expand(X_test, window_size)

    Y_train = pc.classify(Y_train)
    Y_test = pc.classify(Y_test)

    testing_sample = [
        np.indices((x.shape[0], x.shape[1])).reshape((2, -1)).T for x in X_test
    ]

    return X_train, Y_train, X_test, Y_test, train_sample, valid_sample, testing_sample
Code example #4
def S2I_ensemble(ERSP_all,
                 tmp_all,
                 freqs,
                 indices,
                 num_time,
                 n_split,
                 index_exp=0):
    '''
    Standardize data, then generate topo for ensemble methods

    Parameters
    ----------
    ERSP_all : numpy 4d array
        Event related spectral perturbations
    tmp_all : numpy 1d or 2d array
        Periods of time or solution latency
    freqs : numpy 1d array
        Frequency steps
    indices : dict
        Indices of training and testing data
    num_time : int
        Number of frames for each trial
    n_split : int
        Number of split clusters
    index_exp : int
        Index of experiment for K-fold cross validation
    

    Returns
    -------
    None.

    '''
    assert isinstance(ERSP_all, np.ndarray) and ERSP_all.ndim == 4
    assert isinstance(tmp_all, np.ndarray) and (tmp_all.ndim == 1
                                                or tmp_all.ndim == 2)
    assert isinstance(freqs, np.ndarray) and freqs.ndim == 1
    assert isinstance(indices, dict)
    assert isinstance(num_time, int)
    assert isinstance(n_split, int) and n_split > 1
    assert isinstance(index_exp, int) and index_exp >= 0

    # Create folder for exp and train, test
    if not os.path.exists('./images/exp%d' % (index_exp)):
        os.makedirs('./images/exp%d' % (index_exp))
        for i in range(n_split):
            os.makedirs('./images/exp%d/train%d' % (index_exp, i))
        os.makedirs('./images/exp%d/test' % (index_exp))

    # Standardize the data
    ERSP_all, SLs = preprocessing.standardize(ERSP_all,
                                              tmp_all,
                                              num_time,
                                              train_indices=indices['train'],
                                              threshold=0.0)

    ERSP_dict = {
        kind: ERSP_all[indices[kind], :]
        for kind in ['train', 'test']
    }
    SLs_dict = {kind: SLs[indices[kind]] for kind in ['train', 'test']}

    ERSP_list, SLs_list = preprocessing.stratified_split(ERSP_dict['train'],
                                                         SLs_dict['train'],
                                                         n_split=n_split,
                                                         mode=args.split_mode)

    start_time = time.time()
    print('[%.1f] Signal to image (Ensemble)' % (time.time() - start_time))

    # Generate topoplot for training data in each split
    for index_split in range(n_split):
        print('--- Split %d ---' % (index_split))
        # Data augmentation
        if args.mode == 'add_noise':
            ERSP_split, SLs_split = data_augmentation.aug(
                ERSP_list[index_split], SLs_list[index_split], 'add_noise',
                (5, 1))
        elif args.mode == 'SMOTER':
            ERSP_split, SLs_split = data_augmentation.aug(
                ERSP_list[index_split], SLs_list[index_split], 'SMOTER')
        else:
            ERSP_split, SLs_split = ERSP_list[index_split], SLs_list[
                index_split]

        fileNames = generate_topo(ERSP_split,
                                  freqs,
                                  num_time,
                                  index_exp=index_exp,
                                  index_split=index_split)
        generate_csv(fileNames, SLs_split, index_exp, index_split)

        if index_split == 0:
            fileNames_train = fileNames
            SLs_train = SLs_split
        else:
            fileNames_train = np.concatenate((fileNames_train, fileNames))
            SLs_train = np.concatenate((SLs_train, SLs_split))  # keep targets aligned with the (possibly augmented) fileNames

    # Generate topoplot for all training data
    generate_csv(fileNames_train, SLs_train, index_exp, 100)

    # Generate topo for testing data
    print('--- Split test ---')
    fileNames = generate_topo(ERSP_dict['test'],
                              freqs,
                              num_time,
                              index_exp=index_exp,
                              index_split=-1)
    generate_csv(fileNames, SLs_dict['test'], index_exp, -1)

    print('[%.1f] Finish S2I' % (time.time() - start_time))
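
A hedged usage sketch for S2I_ensemble: the array shapes and the KFold-built indices dict are illustrative assumptions, and the call relies on the module above (its preprocessing, generate_topo and generate_csv helpers plus the args namespace) being importable.

import numpy as np
from sklearn.model_selection import KFold

# Illustrative inputs only; real arrays come from the ERSP pipeline.
ERSP_all = np.random.rand(100, 21, 30, 5)    # 4-D array; (trials, channels, freqs, frames) is an assumed layout
tmp_all = np.random.rand(100, 3)             # periods of time / solution latency
freqs = np.linspace(1.0, 30.0, 30)

train_idx, test_idx = next(iter(KFold(n_splits=5, shuffle=True, random_state=23).split(ERSP_all)))
indices = {'train': train_idx, 'test': test_idx}   # the dict layout the function indexes with

S2I_ensemble(ERSP_all, tmp_all, freqs, indices, num_time=5, n_split=3, index_exp=0)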
Code example #5
def S2I_main(ERSP_all,
             tmp_all,
             freqs,
             indices,
             mode,
             num_time,
             index_exp=0,
             sub_ID=None):
    '''
    Standardize data, then generate topo

    Parameters
    ----------
    ERSP_all : numpy 4d array
        Event related spectral perturbations
    tmp_all : numpy 1d or 2d array
        Periods of time or solution latency
    freqs : numpy 1d array
        Frequency steps
    indices : dict
        Indices of training and testing data
    mode : string
        Augmentation mode: 'normal', 'SMOTER' or 'add_noise'
    num_time : int
        Number of frames for each trial
    index_exp : int
        Index of experiment for K-fold cross validation

    Returns
    -------
    None.

    '''
    assert isinstance(ERSP_all, np.ndarray) and ERSP_all.ndim == 4
    assert isinstance(tmp_all, np.ndarray) and (tmp_all.ndim == 1
                                                or tmp_all.ndim == 2)
    assert isinstance(freqs, np.ndarray) and freqs.ndim == 1
    assert isinstance(indices, dict)
    assert isinstance(mode, str)
    assert isinstance(num_time, int)
    assert isinstance(index_exp, int) and index_exp >= 0

    # Create folder for exp and train, test
    if not os.path.exists('./images/exp%d' % (index_exp)):
        os.makedirs('./images/exp%d' % (index_exp))
        os.makedirs('./images/exp%d/train0' % (index_exp))
        os.makedirs('./images/exp%d/test' % (index_exp))

    # Standardize the data
    ERSP_all, SLs = preprocessing.standardize(ERSP_all,
                                              tmp_all,
                                              num_time,
                                              train_indices=indices['train'],
                                              threshold=0.0)

    # Normalize subjects
    if args.normal_sub:
        ERSP_all = preprocessing.normalize_subject(ERSP_all, sub_ID,
                                                   indices['train'])

    ERSP_dict = {
        kind: ERSP_all[indices[kind], :]
        for kind in ['train', 'test']
    }
    SLs_dict = {kind: SLs[indices[kind]] for kind in ['train', 'test']}

    # Data augmentation
    if mode == 'SMOTER':
        ERSP_dict['train'], SLs_dict['train'] = data_augmentation.aug(
            ERSP_dict['train'], SLs_dict['train'], 'SMOTER')
    elif mode == 'add_noise':
        ERSP_dict['train'], SLs_dict['train'] = data_augmentation.aug(
            ERSP_dict['train'], SLs_dict['train'], 'add_noise', (10, 1))

    # Concatenate training and testing data
    ERSP_concat = np.concatenate((ERSP_dict['train'], ERSP_dict['test']),
                                 axis=0)
    SLs_concat = np.concatenate((SLs_dict['train'], SLs_dict['test']), axis=0)

    start_time = time.time()
    print('[%.1f] Signal to image (%s)' % (time.time() - start_time, mode))

    fileNames = generate_topo(ERSP_concat, freqs, num_time,
                              np.arange(ERSP_dict['train'].shape[0]),
                              index_exp)
    split(fileNames,
          SLs_concat,
          len(SLs_dict['test']),
          random=False,
          index_exp=index_exp)

    print('[%.1f] Finished S2I' % (time.time() - start_time))

    if mode == 'normal':
        if args.data_cate == 2:
            print('Generate conditional entropy of exp%d' % (index_exp))
            with open(
                    './raw_data/CE_sub%d_channel%d.data' %
                (args.subject_ID, args.num_channels), 'rb') as fp:
                CE_all = pickle.load(fp)

            for model_mode in ['train', 'test']:
                with open(
                        './raw_data/CE_sub%d_channel%d_exp%d_%s.data' %
                    (args.subject_ID, args.num_channels, index_exp,
                     model_mode), 'wb') as fp:
                    pickle.dump(CE_all[indices[model_mode], :], fp)
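
Similarly, a hedged sketch of a single call to S2I_main, reusing the illustrative ERSP_all, tmp_all, freqs and indices from the previous sketch; sub_ID is only needed when args.normal_sub is set.

# 'normal' runs without augmentation; 'SMOTER' / 'add_noise' trigger data_augmentation.aug.
S2I_main(ERSP_all, tmp_all, freqs, indices,
         mode='normal',
         num_time=5,
         index_exp=0,
         sub_ID=None)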
Code example #6
def main(index_exp, index_split):
    
    faulthandler.enable()
    torch.cuda.empty_cache()
    
    best_error = 100
    lr_step = [40, 70, 120]
    multiframe = ['convlstm', 'convfc']
    dirName = '%s_data%d_%s_%s_%s'%(args.model_name, args.data_cate, args.augmentation, args.loss_type, args.file_name)
    fileName = '%s_split%d_exp%d'%(dirName, index_split, index_exp)
    
    # Create folder for results of this model
    if not os.path.exists('./results/%s'%(dirName)):
        os.makedirs('./results/%s'%(dirName))
    
    # ------------- Wrap up dataloader -----------------
    if args.input_type == 'signal':
        X, Y_reg, C = raw_dataloader.read_data([1,2,3], list(range(11)), channel_limit=21, rm_baseline=True)
        num_channel = X.shape[1]
        num_feature = X.shape[2]     # Number of time samples
        
        # Remove trials
        X, Y_reg = preprocessing.remove_trials(X, Y_reg, threshold=60)
        
        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(X, Y_reg, test_size=0.1, random_state=23)
            # Random state 15: training error becomes lower, testing error becomes higher
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                if i == index_exp:
                    train_data, train_target = X[train_index, :], Y_reg[train_index]
                    test_data, test_target = X[test_index, :], Y_reg[test_index]
                    
        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(train_data)):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], train_target[split_index]
                '''
        # Normalize the data
        if args.normalize:
            train_data, test_data = preprocessing.normalize(train_data, test_data)
        
                    
        # Data augmentation
        if args.augmentation == 'overlapping':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation,
                                                             (256, 64, 128))
            test_data, test_target = data_augmentation.aug(test_data, test_target, args.augmentation,
                                                             (256, 64, 128))
        elif args.augmentation == 'add_noise':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation,
                                                             (30, 1))
        elif args.augmentation == 'add_noise_minority':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation,
                                                             (30, 1))
        elif args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation)
            
        # scale data
        if args.scale_data:
            train_data, test_data = train_data.reshape((train_data.shape[0],-1)), test_data.reshape((test_data.shape[0],-1))
            train_data, test_data = preprocessing.scale(train_data, test_data)
            train_data = train_data.reshape((train_data.shape[0],num_channel, -1))
            test_data = test_data.reshape((test_data.shape[0],num_channel, -1))
            
        if args.model_name in ['eegnet', 'eegnet_trans_signal']:
            # (sample, channel, time) -> (sample, channel_NN, channel_EEG, time)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_feature)) \
                                       for X in [train_data, test_data]]
        
        
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
                torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset,test_dataset] = map(\
                Data.TensorDataset, [train_dataTS.float(),test_dataTS.float()], [train_targetTS.float(),test_targetTS.float()])

        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        
        model_param = [train_data.shape]
        
    elif args.input_type == 'power':
        if args.data_cate == 1:
            ERSP_all, tmp_all, freqs = dataloader.load_data()
        elif args.data_cate == 2:
            data_file = './raw_data/ERSP_from_raw_%d_channel21.data'%(args.index_sub)
            with open(data_file, 'rb') as fp:
                dict_ERSP = pickle.load(fp)
            ERSP_all, tmp_all = dict_ERSP['ERSP'], dict_ERSP['SLs']
        num_channel = ERSP_all.shape[1]
        num_freq = ERSP_all.shape[2]
            
        # Remove trials
        ERSP_all, tmp_all = preprocessing.remove_trials(ERSP_all, tmp_all, threshold=60)
        
        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(ERSP_all, tmp_all[:,2], test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(ERSP_all)):
                if i == index_exp:
                    train_data, test_data = ERSP_all[train_index, :], ERSP_all[test_index, :]
                    if args.data_cate == 2:
                        train_target, test_target = tmp_all[train_index], tmp_all[test_index]
                    else:
                        train_target, test_target = tmp_all[train_index, 2], tmp_all[test_index, 2]
                        
                    if args.add_CE:
                        assert args.data_cate == 2
                        with open('./raw_data/CE_sub%d'%(args.index_sub), 'rb') as fp:
                            CE = pickle.load(fp)
                        CE_train, CE_test = CE[train_index,:], CE[test_index,:]
                        # PCA for CE
                        pca = PCA(n_components=10)
                        pca.fit(CE_train)
                        CE_train, CE_test = pca.transform(CE_train), pca.transform(CE_test)
                        
                    
        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(np.arange(len(train_data)))):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], train_target[split_index]
                '''
                    
        # Concatenate train and test for standardizing
        data = np.concatenate((train_data, test_data), axis=0)
        target = np.concatenate((train_target, test_target))
                    
        # Standardize data
        num_train = len(train_data)
        data, target = preprocessing.standardize(data, target, train_indices = np.arange(num_train), threshold=0.0)
        data = data.reshape((data.shape[0], -1))
        
        # Scale target between 0 and 1
        if args.post_scale:
            print('Scale the target between 0-1')
            target = target/60
        
        # Split data
        train_data, test_data = data[:num_train, :], data[num_train:, :]
        train_target, test_target = target[:num_train], target[num_train:]
        
        # Data augmentation
        if args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation)
        
        # center data
        if args.center_flag:
            train_data, test_data = preprocessing.center(train_data, test_data)
            
        # scale data
        if args.scale_data:
            train_data, test_data = preprocessing.scale(train_data, test_data)
            
        # Add conditional entropy
        if args.add_CE:
            train_data = np.concatenate((train_data, CE_train), axis=1)
            test_data = np.concatenate((test_data, CE_test), axis=1)  # test features need the test-set conditional entropy
            
        if args.model_name == 'eegnet_trans_power':
            # (sample, channel, freq) -> (sample, channel_NN, channel_EEG, freq)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_freq)) \
                                       for X in [train_data, test_data]]
        
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
                torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset,test_dataset] = map(\
                Data.TensorDataset, [train_dataTS.float(),test_dataTS.float()], [train_targetTS.float(),test_targetTS.float()])

        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        
        model_param = [train_data.shape]
        
    elif args.input_type == 'image':
        
        if args.ensemble:
            input_model_name = args.pre_model_name
        else:
            input_model_name = args.model_name
        
        assert (input_model_name in multiframe) == (args.num_time>1)
        
        # Let input size be 224x224 if the model is vgg16
        if input_model_name in ['vgg16', 'resnet50']:
            input_size = 224
        else:
            input_size = 64
            
        # Load Data
        data_transforms = {
                'train': transforms.Compose([
                        ndl.Rescale(input_size, args.num_time),
                        ndl.ToTensor(args.num_time)]), 
                'test': transforms.Compose([
                        ndl.Rescale(input_size, args.num_time),
                        ndl.ToTensor(args.num_time)])
                }

        print("Initializing Datasets and Dataloaders...")

        # Create training and testing datasets
        # image_datasets = {x: ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
        #                 scale=args.scale_image, index_exp=index_exp, index_split=index_split) for x in ['train', 'test']}
        [train_dataset,test_dataset] = [ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
                        scale=args.scale_image, index_exp=index_exp, index_split=index_split) for x in ['train', 'test']]

        # Create training and testing dataloaders
        # if not args.str_sampling:
        #     train_loader = Data.DataLoader(image_datasets['train'], batch_size=args.batch_size, shuffle=True, num_workers=4)
        # test_loader = Data.DataLoader(image_datasets['test'], batch_size=args.batch_size, shuffle=False, num_workers=4)
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4)
        model_param = [input_size]
        
    elif args.input_type == 'EEGLearn_img':
        
        # Load data
        with open('./EEGLearn_imgs/data1.data', 'rb') as fp:
            dict_data = pickle.load(fp)
        data, target = dict_data['data'], dict_data['target']
        input_size = data.shape[2]
        
        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.1, random_state=23)
            # Random state 15: training error becomes lower, testing error becomes higher
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(data)):
                if i == index_exp:
                    train_data, train_target = data[train_index, :], target[train_index]
                    test_data, test_target = data[test_index, :], target[test_index]
        
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
                torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset,test_dataset] = map(\
                Data.TensorDataset, [train_dataTS.float(),test_dataTS.float()], [train_targetTS.float(),test_targetTS.float()])

        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        
        
    # ------------ Create model ---------------
    if args.input_type in ['image','EEGLearn_img']:
        model_param = [input_size]
    else:
        model_param = [train_data.shape]
    
    if not args.ensemble:
        model = read_model(args.model_name, model_param)
    else:
        pre_models = []
        for i in range(args.num_split):
            pre_model = read_model(args.pre_model_name, model_param)
            pre_model.load_state_dict( torch.load('%s/last_model_exp%d_split%d.pt'%(args.ensemble, index_exp, i)) )
            set_parameter_requires_grad(pre_model, True)
            pre_models.append(pre_model)
            
        model = models.__dict__[args.model_name](pre_models)
        
    print('Use model %s'%(args.model_name))
        
    # Run on GPU
    model = model.to(device=device)
    
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    if args.loss_type == 'L2':
        criterion = nn.MSELoss().to(device=device)
    elif args.loss_type == 'L1':
        criterion = nn.L1Loss().to(device=device)
    elif args.loss_type == 'L4':
        criterion = L4Loss
    elif args.loss_type == 'MyLoss':
        criterion = MyLoss
    print('Use %s loss'%(args.loss_type))
    
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_rate,momentum=0.9)
    #optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_rate)
    
    # Record loss and accuracy of each epoch
    dict_error = {'train_std': list(range(args.num_epoch)), 'test_std': list(range(args.num_epoch)),
                  'train_mape': list(range(args.num_epoch)), 'test_mape': list(range(args.num_epoch))}
    
    # optionally evaluate the trained model
    if args.evaluate:
        if args.resume:
            if os.path.isfile(args.resume):
                model.load_state_dict(torch.load(args.resume))
        
        _, target, pred, _, _ = validate(test_loader, model, criterion)
        plot_scatter(target, pred, dirName, fileName)
        return 0
    
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_error = checkpoint['best_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            dict_error['train_std'][:args.start_epoch] = checkpoint['dict_error']['train_std']
            dict_error['test_std'][:args.start_epoch] = checkpoint['dict_error']['test_std']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    
    # ------------- Train model ------------------

    for epoch in range(args.start_epoch, args.num_epoch):
        # Create dataloader if using stratified sampler
        if args.str_sampling:
            sampler = SubsetRandomSampler(get_indices_RSS(train_target, int(0.5*len(train_target))))
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, \
                                           sampler=sampler, num_workers=4)
            
        # Learning rate decay
        if epoch in lr_step:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
        
        # train for one epoch
        _, dict_error['train_std'][epoch], dict_error['train_mape'][epoch] = \
            train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        _, _, _, std_error, dict_error['test_mape'][epoch] = validate(test_loader, model, criterion)
        dict_error['test_std'][epoch] = std_error

        # remember best standard error and save checkpoint
        is_best = std_error < best_error
        best_error = min(std_error, best_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_error': best_error,
            'optimizer': optimizer.state_dict(),
            'dict_error': dict_error
        }, is_best)
        
        # Save best model
        if is_best:
            torch.save(model.state_dict(), './results/%s/best_model_exp%d_split%d.pt'%(dirName, index_exp, index_split))
        if epoch == args.num_epoch-1:
            torch.save(model.state_dict(), './results/%s/last_model_exp%d_split%d.pt'%(dirName, index_exp, index_split))
    # Plot error curve
    plot_error(dict_error, dirName, fileName)
    
    # Plot scatter plots
    _, target, pred, _, _ = validate(test_loader, model, criterion)
    plot_scatter(target, pred, dirName, fileName)
    dict_error['target'], dict_error['pred'] = target, pred
    
    # Plot histogram
    import matplotlib.pyplot as plt
    plt.hist(target, label = 'True')
    plt.hist(pred, label = 'Pred')
    plt.legend(loc='upper right')
    plt.savefig('./results/hist.png')
    
    # Save error over epochs
    with open('./results/%s/%s.data'%(dirName, fileName), 'wb') as fp:
        pickle.dump(dict_error, fp)
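
The stratified-sampling branch above depends on the project-specific get_indices_RSS helper. Purely as a hedged illustration (not the project's implementation), a regression-target stratified index sampler could be written like this:

import numpy as np
from torch.utils.data import SubsetRandomSampler

def stratified_indices(targets, n_samples, n_bins=10, seed=None):
    # Pick roughly n_samples indices spread evenly across quantile bins of the target (sketch).
    rng = np.random.default_rng(seed)
    targets = np.asarray(targets)
    edges = np.quantile(targets, np.linspace(0.0, 1.0, n_bins + 1))
    bin_ids = np.clip(np.digitize(targets, edges[1:-1]), 0, n_bins - 1)
    per_bin = max(1, n_samples // n_bins)
    picked = []
    for b in range(n_bins):
        members = np.where(bin_ids == b)[0]
        if len(members) > 0:
            picked.append(rng.choice(members, size=min(per_bin, len(members)), replace=False))
    return np.concatenate(picked)

# e.g. sampler = SubsetRandomSampler(stratified_indices(train_target, int(0.5 * len(train_target))))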
Code example #7
    if not classical:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        print(device)

    # Load data
    if args.input_type == 'signal':
        X, Y, _, S, D = raw_dataloader.read_data([1, 2, 3],
                                                 range(11),
                                                 channel_limit=21,
                                                 rm_baseline=True)
    elif args.input_type == 'ERSP':
        with open('./raw_data/ERSP_from_raw_100_channel21.data', 'rb') as fp:
            dict_ERSP = pickle.load(fp)
        ERSP, Y, S, D = dict_ERSP['ERSP'], dict_ERSP['SLs'], dict_ERSP[
            'Sub_ID'], dict_ERSP['D']
        X, Y = preprocessing.standardize(ERSP, Y, threshold=0.0)
    elif args.input_type == 'bp_ratio':
        X, Y, _, S, D = raw_dataloader.read_data([1, 2, 3],
                                                 range(11),
                                                 channel_limit=21,
                                                 rm_baseline=True)
        low, high = [4, 7, 13], [7, 13, 30]
        X = bandpower.get_bandpower(X, low=low, high=high)
        X = add_features.get_bandpower_ratio(X)

    # Create folder for results of this model
    if not os.path.exists('./results/%s' % (args.dirName)):
        os.makedirs('./results/%s' % (args.dirName))

    # LOSO or CV
    if args.cv_mode == 'LOSO':
Code example #8
    if not os.path.exists('./results/%s' % (args.dirName)):
        os.makedirs('./results/%s' % (args.dirName))

    # Load data
    if args.input_type == 'signal':
        data, SLs, _, S, D = raw_dataloader.read_data([1, 2, 3],
                                                      range(11),
                                                      channel_limit=21,
                                                      rm_baseline=True)
        #data = np.random.rand(data.shape[0], data.shape[1], data.shape[2])
    elif args.input_type == 'ERSP':
        with open('./raw_data/ERSP_from_raw_100_channel21.data', 'rb') as fp:
            dict_ERSP = pickle.load(fp)
        data, SLs, S, D = dict_ERSP['ERSP'], dict_ERSP['SLs'], dict_ERSP[
            'Sub_ID'], dict_ERSP['D']
        data, SLs = preprocessing.standardize(data, SLs, threshold=0.0)
    elif args.input_type == 'bp_ratio':
        data, SLs, _, S, D = raw_dataloader.read_data([1, 2, 3],
                                                      range(11),
                                                      channel_limit=21,
                                                      rm_baseline=True)
        low, high = [4, 7, 13], [7, 13, 30]
        data = bandpower.get_bandpower(data, low=low, high=high)
        data = add_features.get_bandpower_ratio(data)
    elif args.input_type == 'bandpower':
        data, SLs, _, S, D = raw_dataloader.read_data([1, 2, 3],
                                                      range(11),
                                                      channel_limit=21,
                                                      rm_baseline=True)
        low, high = [4, 7, 13], [7, 13, 30]
        data = bandpower.get_bandpower(data, low=low, high=high)
Code example #9
[[150], [100], [0, 1, 10], [1], [1e-1], [1e-1], [5], [10]]

# Set seed for reproducibility
np.random.seed(98)

# Iterate over each subset and build a model
# The predictions of every single model are combined
for i in range(num_subsets):
    # Extract the train/test subsets
    y_train_subset, X_train_subset, ids_train_subset = train_subsets[i]
    y_test_subset, X_test_subset, ids_test_subset = test_subsets[i]

    # Map the categorical output labels into [0, 1]
    y_train_subset = map_0_1(y_train_subset)
    # Standardize the data
    X_train_subset, X_test_subset = standardize(X_train_subset, X_test_subset)
    print(
        f"Train shape before feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}"
    )
    # Build the polynomial features and expand the data
    X_train_subset, X_test_subset = build_poly(X_train_subset,
                                               max_degree[i]), build_poly(
                                                   X_test_subset,
                                                   max_degree[i])
    print(
        f"Train shape after  feature expansion: {str(X_train_subset.shape):>12}   Test shape: {str(X_test_subset.shape):>12}"
    )

    # Set n_best_features to X_train_subset.shape[1] if you don't want feature selection
    n_best_features = round(fs_perc[i] * X_train_subset.shape[1])
    D = n_best_features
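
The standardize(X_train_subset, X_test_subset) call above returns both splits scaled with statistics fitted on the training split only. A minimal sketch of such a helper, offered as an assumption about its behaviour rather than the project's code:

import numpy as np

def standardize_pair(x_train, x_test, eps=1e-12):
    # Zero-mean, unit-variance scaling using training-set statistics only (illustrative sketch).
    mean = x_train.mean(axis=0)
    std = x_train.std(axis=0)
    std = np.where(std < eps, 1.0, std)   # guard against constant features
    return (x_train - mean) / std, (x_test - mean) / std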
Code example #10
def main():
    # Loading the data
    # Training dataset
    DATA_TRAIN_PATH = '../data/train.csv'
    y, X, ids = load_csv_data(DATA_TRAIN_PATH)
    # Testing dataset
    DATA_TEST_PATH = '../data/test.csv'
    y_t, X_t, ids_t = load_csv_data(DATA_TEST_PATH)

    # Separate training and testing sets into 4 different categories depending
    # on the PRI_jet_num feature with index -8
    feature = -8
    X_cat = preproc.get_categories(X, feature=feature)
    X_t_cat = preproc.get_categories(X_t, feature=feature)

    # Loop over every v in range 4 to obtain the 4 predictions,
    # then concatenate and create the submission file
    y_pred_all = []

    # Best hyperparameters (the degree and the corresponding lambda) for each category,
    # found using cross-validation
    degrees = [10, 10, 9, 9]
    lambdas = [0.00047508101621, 7.05480231072e-07, 0.000343046928631, 5.72236765935e-05]

    for v in range(4):
        # Extract category (test, train and labels)
        Xv = X[X_cat[v]]
        Xv_t = X_t[X_t_cat[v]]
        y_v = y[X_cat[v]]

        # Concatenate the train and testing set
        all_Xv = np.concatenate((Xv, Xv_t), axis=0)

        # Find features (bad_features) with a unique value
        bad_features = []
        for i in range(len(all_Xv.T)):
            if len(np.unique(all_Xv.T[i])) == 1:
                bad_features.append(i)

        # Delete bad_features and fill missing values
        all_Xv_c = np.delete(all_Xv, bad_features, axis=1)
        all_Xv_filled = preproc.fill_missing_values(all_Xv_c, tresh=1)

        # Separate train and test
        Xv_f = all_Xv_filled[:len(Xv)]
        Xv_t_f = all_Xv_filled[len(Xv):]

        # Standardize the dataset
        tXv, mean_x, std_x = preproc.standardize(Xv_f)
        tXv_t, mean_x, std_x = preproc.standardize(Xv_t_f)

        ### Generate model

        final_degree = degrees[v]
        best_lambda = lambdas[v]

        # Build the polynomial basis, perform ridge regression
        final_X = impl.build_poly(tXv, final_degree)
        final_Xt = impl.build_poly(tXv_t, final_degree)

        # Generate the model (using ridge regression)
        final_w, loss_ = impl.ridge_regression(y_v, final_X, best_lambda)

        # Generate predictions for this category
        y_predv = predict_labels(final_w, final_Xt)
        y_pred_all.append(y_predv)
        p = len(X_cat[v]) / len(X)

    ### Concatenate all predictions and sort them by indices
    Xt_cat_all = [idx for sublist in X_t_cat for idx in sublist]
    y_pred = [yi for sublist in y_pred_all for yi in sublist]
    final_ypred = np.asarray(y_pred)[np.argsort(Xt_cat_all)]

    # Create the submission file
    OUTPUT_PATH = '../submissions/results__4categories_fillByCat_run.csv'

    create_csv_submission(ids_t, final_ypred, OUTPUT_PATH)
    print('Congratulations ........ Submission file created ::: ', OUTPUT_PATH)
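
preproc.get_categories is project code; purely as a hedged illustration of what splitting rows by the integer PRI_jet_num column (index -8) could look like, and not the project's actual implementation:

import numpy as np

def get_categories_sketch(X, feature=-8, n_categories=4):
    # Return, for each jet-number value 0..n_categories-1, the row indices in that category (sketch).
    values = X[:, feature].astype(int)
    return [np.where(values == v)[0] for v in range(n_categories)]

# e.g. X_cat = get_categories_sketch(X); Xv = X[X_cat[0]] selects the category-0 rows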
Code example #11
File: EEGLearn_S2I.py  Project: hundredball/Math24
        for s, sample in enumerate(data):
            augData[s, :] = sample + (components * coeffs.reshape(
                (n_components, -1))).sum(axis=0)
    else:
        # Add Gaussian noise with std determined by weighted std of each feature
        for f, feat in enumerate(data.transpose()):
            augData[:,
                    f] = feat + np.random.normal(scale=stdMult * np.std(feat),
                                                 size=feat.size)
    return augData


if __name__ == '__main__':

    ERSP_all, tmp_all, freqs = dataloader.load_data()
    ERSP_all, SLs = preprocessing.standardize(ERSP_all, tmp_all, threshold=0.0)
    num_channels = ERSP_all.shape[1]

    ERSP_all, SLs = preprocessing.remove_trials(ERSP_all, SLs, 60.0)
    num_example = ERSP_all.shape[0]

    # Concatenate theta, alpha, beta
    low, high = [4, 7, 13], [7, 13, 30]
    for i in range(len(low)):
        bp_i = preprocessing.bandpower(ERSP_all, freqs, [low[i]],
                                       [high[i]]).reshape((num_example, -1))
        if i == 0:
            bp_all = bp_i
        else:
            bp_all = np.concatenate((bp_all, bp_i), axis=1)
Code example #12
def run_benchmark(
        learners_obj,
        learners_name,  # For generating reports
        n_runs=30,
        n_folds=5,
        noise_level=0.0,
        preprocessing='none',
        report_name="benchmark.pdf",
        verbose=False):
    """Run benchmark tests to evaluate a method and generate a latex-compatible report.
    
    Parameters:
    -----------
    learners_obj: list or tuple of objects
        Objects of the methods to be tested
    
    learners_name: list or tuple of strings
        Names of the methods, used for generating reports
    
    n_runs: int, default 30
        Runs of cross-validation
    
    n_folds: int, default 5
        Number of folds. Must be at least 2. Typically 5 or 10
    
    noise_level: number, default 0.0
        Percentage of samples whose labels are flipped
    
    preprocessing: str, default 'none'
        Preprocessing method. Must be 'none', 'normalize' or 'standardize'
    
    report_name: string, default "benchmark.pdf"
        Name of the report that will be generated
    
    verbose: bool, default False
        If true, display progress
    """

    # Check parameters

    # Run benchmark tests
    # ----------------------------------------------------------------------------#
    #             Result format                                                   #
    #                                                                             #
    #    ROW 0 : ds_name, run,  cv_fold, method_index, train_error, test_error    #
    #    ROW 1 : ds_name, run,  cv_fold, method_index, train_error, test_error    #
    #                                                                             #
    #    ROW k : ds_name, run,  cv_fold, method_index, train_error, test_error    #
    #                                                                             #
    # ----------------------------------------------------------------------------#

    benchmark_results = []
    ds_count = -1
    for ds in g_benchmark_datasets:
        ds_count += 1
        X, y, n_samples, n_features = _load_data(ds, noise_level)
        for irun in range(n_runs):
            # Cross-validation
            kf = KFold(n_samples, n_folds=n_folds, shuffle=True)
            fold_count = -1

            for train_idx, test_idx in kf:
                fold_count += 1
                X_train = X[train_idx, :]
                y_train = y[train_idx]
                X_test = X[test_idx, :]
                y_test = y[test_idx]

                if preprocessing == 'standardize':
                    X_train = standardize(X_train)
                    X_test = standardize(X_test)

                if preprocessing == 'normalize':
                    X_train = normalize(X_train)
                    X_test = normalize(X_test)

                # Call learners
                method_count = -1
                for learner in learners_obj:
                    method_count += 1
                    lcopy = copy.deepcopy(learner)
                    lcopy.fit(X_train, y_train)
                    train_err = 1.0 - lcopy.score(X_train, y_train)
                    test_err = 1.0 - lcopy.score(X_test, y_test)

                    result = [
                        ds_count, irun, fold_count, method_count, train_err,
                        test_err
                    ]
                    benchmark_results.append(result)

                    if verbose:
                        print('dataset: {0}  run: {1}  cv_fold: {2}  '
                              'method: {3}  train_error: {4}  test_error: {5}'
                              .format(ds, irun, fold_count,
                                      method_count, train_err, test_err))
                        #print result

    # Save results
    print('Saving results ... ')
    global g_result_top_dir
    g_result_top_dir = os.path.join(os.getcwd(), "benchmark_results")
    if not os.path.exists(g_result_top_dir):
        os.mkdir(g_result_top_dir)

    if not (".pdf" in report_name.lower()):
        report_name += '.pdf'
    dir_name = report_name.replace('.pdf', '')
    if not os.path.exists(os.path.join(g_result_top_dir, dir_name)):
        os.mkdir(os.path.join(g_result_top_dir, dir_name))

    datetime = time.strftime("%H%M%S%d_%m_%Y")
    result_file_name = os.path.join(os.path.join(g_result_top_dir, dir_name),
                                    "benchmark_results_" + datetime)
    np.save(result_file_name, benchmark_results)

    # Save meta information: names of datasets and methods for generating a report.
    np.save(result_file_name + '_methods_name', learners_name)
    np.save(result_file_name + '_datasets_name', g_benchmark_datasets)

    # Generate report
    print('Generating reports ... ')
    _generate_report(report_name=report_name,
                     result_file=result_file_name,
                     methods_name=learners_name,
                     n_runs=n_runs,
                     n_folds=n_folds)
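
A hedged usage sketch for run_benchmark; the scikit-learn learners are illustrative stand-ins (anything exposing fit and score works), and the call assumes the module above, its g_benchmark_datasets list and its _load_data/_generate_report helpers are available.

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

learners = [DecisionTreeClassifier(max_depth=5), LogisticRegression(max_iter=1000)]
names = ['DecisionTree(depth=5)', 'LogisticRegression']

run_benchmark(learners, names,
              n_runs=5,
              n_folds=5,
              noise_level=0.0,
              preprocessing='standardize',
              report_name='quick_benchmark.pdf',
              verbose=True)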
Code example #13
def get_data(use_preexisting=True,
             save_preprocessed=True,
             z_outlier=False,
             feature_expansion=False,
             correlation_analysis=False,
             class_equalizer=False,
             M=4,
             z_value=3.0):
    """
    Data supplying function.

    This function has the purpose of loading data and applying preprocessing.
    It includes many features such as downloading the data from the github
    repository, saving the data (for fast reuse), applying different
    preprocessing algorithms, etc...

    Args:
        use_preexisting (bool): if existent, enabling this parameter will allow
                                the function to use previously preprocessed and
                                saved data files
        save_preprocessed (bool): enabling this parameter will allow the
                                    function to save the preprocessed data
        z_outlier (Union[int, bool]): enabling this parameter will allow the function to
                            perform z outlier detection
        feature_expansion (bool): enabling this parameter will allow the
                                    function to perform exponential feature
                                    expansion
        correlation_analysis (Union[int, bool]): enabling this parameter will allow the
                                        function to perform correlation analysis
                                        and remove highly correlated features
        class_equalizer (Union[int, bool]): enabling this parameter will allow the function to
                            perform class balancing
        M (Union[int, list]): feature expansion parameter per group
        z_value (Union[float, list]): outlier detection threshold per group

    Returns:
        list: groups of training samples
        list: corresponding groups of training labels
        list: corresponding indexes of affiliated training rows
        list: groups of test samples
        list: corresponding groups of test labels
        list: corresponding indexes of affiliated test rows
        list: list of indexes of testing (for creating submissions)

    """

    if os.path.isdir(config.DATA_PATH) and os.path.isdir(
            config.PREPROCESSED_PATH) and use_preexisting:
        print("[*] Using previously preprocessed Data")
        groups_tr_X = np.load(config.PREPROCESSED_X_TR_GROUPS_NPY,
                              allow_pickle=True)
        groups_tr_Y = np.load(config.PREPROCESSED_Y_TR_GROUPS_NPY,
                              allow_pickle=True)
        indc_list_tr = np.load(config.PREPROCESSED_GROUP_INDEX_TR_NPY,
                               allow_pickle=True)
        groups_te_X = np.load(config.PREPROCESSED_X_TE_GROUPS_NPY,
                              allow_pickle=True)
        groups_te_Y = np.load(config.PREPROCESSED_Y_TE_GROUPS_NPY,
                              allow_pickle=True)
        indc_list_te = np.load(config.PREPROCESSED_GROUP_INDEX_TE_NPY,
                               allow_pickle=True)
        ids_te = np.load(config.PREPROCESSED_IDS_TE_GROUPS_NPY,
                         allow_pickle=True)

    else:
        if not (os.path.isdir(config.DATA_PATH)
                and os.path.isfile(config.TRAIN_DATA_CSV_PATH)
                and os.path.isfile(config.TEST_DATA_CSV_PATH)):
            Path(config.DATA_PATH).mkdir(exist_ok=True)
            download_url(config.TRAIN_URL, config.TRAIN_DATA_CSV_PATH)
            download_url(config.TEST_URL, config.TEST_DATA_CSV_PATH)

        print("[*] Creating preprocessed Data")

        # load data from csv files
        Y_tr, X_tr, ids_tr = load_csv_data(config.TRAIN_DATA_CSV_PATH)
        Y_te, X_te, ids_te = load_csv_data(config.TEST_DATA_CSV_PATH)

        groups_tr_Y, groups_tr_X, indc_list_tr = split_groups(Y_tr, X_tr)
        groups_te_Y, groups_te_X, indc_list_te = split_groups(Y_te, X_te)

        nr_groups_tr = len(indc_list_tr)

        # make to lists
        z_outlier = make_to_list(z_outlier)
        class_equalizer = make_to_list(class_equalizer)
        correlation_analysis = make_to_list(correlation_analysis)
        M = make_to_list(M)

        for indx in range(nr_groups_tr):
            # perform z outlier detection
            if z_outlier[indx]:
                groups_tr_X[indx] = z_score_outlier_detection(
                    groups_tr_X[indx], thresh=z_value)
                groups_te_X[indx] = z_score_outlier_detection(
                    groups_te_X[indx], thresh=z_value)

            # perform correlation analysis
            if correlation_analysis[indx]:
                groups_tr_X[indx], columns_to_keep = corr_filter(
                    groups_tr_X[indx], threshold=0.95)
                groups_te_X[indx] = groups_te_X[indx][:, columns_to_keep]

            # perform class equalization
            if class_equalizer[indx]:
                groups_tr_X[indx], groups_tr_Y[
                    indx] = class_imbalance_equalizer(groups_tr_X[indx],
                                                      groups_tr_Y[indx])

            # perform feature expansion
            if feature_expansion:
                groups_tr_X[indx] = augment_features_polynomial(
                    groups_tr_X[indx], M=M[indx])
                groups_te_X[indx] = augment_features_polynomial(
                    groups_te_X[indx], M=M[indx])

            # standardize features
            groups_tr_X[indx] = standardize(groups_tr_X[indx])
            groups_te_X[indx] = standardize(groups_te_X[indx])

            # add bias
            groups_tr_X[indx] = add_bias(groups_tr_X[indx])
            groups_te_X[indx] = add_bias(groups_te_X[indx])

            print(f"\t [+]Group {indx + 1} finished!")

        if save_preprocessed:
            Path(config.PREPROCESSED_PATH).mkdir(exist_ok=True)

            np.save(config.PREPROCESSED_X_TR_GROUPS_NPY,
                    groups_tr_X,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TR_GROUPS_NPY,
                    groups_tr_Y,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_X_TE_GROUPS_NPY,
                    groups_te_X,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_Y_TE_GROUPS_NPY,
                    groups_te_Y,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TR_NPY,
                    indc_list_tr,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_GROUP_INDEX_TE_NPY,
                    indc_list_te,
                    allow_pickle=True)
            np.save(config.PREPROCESSED_IDS_TE_GROUPS_NPY,
                    ids_te,
                    allow_pickle=True)
            print("[+] Saved Preprocessed Data")

    return groups_tr_X, groups_tr_Y, indc_list_tr, groups_te_X, groups_te_Y, indc_list_te, ids_te
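
A hedged usage sketch of get_data; mostly default arguments are used, and the inspection loop only relies on the grouped return values documented above.

(groups_tr_X, groups_tr_Y, indc_list_tr,
 groups_te_X, groups_te_Y, indc_list_te, ids_te) = get_data(use_preexisting=True,
                                                            save_preprocessed=True,
                                                            feature_expansion=True,
                                                            M=4,
                                                            z_value=3.0)

# Inspect the per-group shapes produced by split_groups
for g, (X_g, Y_g) in enumerate(zip(groups_tr_X, groups_tr_Y)):
    print("group", g, "X:", X_g.shape, "Y:", Y_g.shape)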
Code example #14
    print("Before:\n", data, "\n")
    assigned_time = prep.assign_time(data, "1/10/2019", 1)
    print("With Assigned Time:\n")
    print(assigned_time)
    # ---------------------
    print("--------Preprocessing Test--------")
    print("          -difference()-          ")
    test_data = prep.read_from_file("../TestData/AtmPres2005NovMin.csv")
    difference_ts = prep.difference(test_data)
    print(difference_ts)
    # ---------------------
    print("--------Preprocessing Test--------")
    print("            -scaling()-           ")
    scaled = prep.scaling(assigned_time)
    print(scaled)
    # ----------------
    print("--------Preprocessing Test--------")
    print("         -standardize()-          ")
    standardized = prep.standardize(test_data)
    print(standardized, "\n")
    # ----------------
    print("--------Preprocessing Test--------")
    print("          -logarithm()-           ")
    log = prep.logarithm(test_data)
    print(log, "\n")
    # ----------------
    print("--------Preprocessing Test--------")
    print("            -cubic()-             ")
    cubed = prep.cubic_roots(test_data)
    print(cubed)