def get_dataset_files_fs(dataset_path):
    '''
    Scales ALL data for feature selection.
    Scaled files are saved in the normalized/ subfolders of the normal and anomalous folders.
    '''
    print('\t[...] Retrieving datasets')

    # Get paths to data
    path_normal = dataset_path + '/normal/not_normalized/'
    path_anomalous = dataset_path + '/anomalous/not_normalized/'

    dest_normal = dataset_path + '/normal/normalized/'
    dest_anomalous = dataset_path + '/anomalous/normalized/'

    # Remove any previously normalized files
    clean_dir(dest_normal)
    clean_dir(dest_anomalous)

    # Get list of files
    normal_files = [path_normal + x for x in os.listdir(path_normal)]
    anomal_files = [path_anomalous + x for x in os.listdir(path_anomalous)]
    all_files = normal_files + anomal_files

    # Scale data
    print('\t[...] Fitting scaler')
    scaler = utils.fit_scaler(all_files)

    # Apply std and save files @ dest_...
    print('\t[...] Scaling data')
    utils.standardize(normal_files, scaler, dest_normal)
    utils.standardize(anomal_files, scaler, dest_anomalous)
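
# The utils.fit_scaler / utils.standardize helpers used above are not shown in this
# example. A minimal sketch of what they could look like, assuming the feature files
# are plain CSV matrices and a scikit-learn StandardScaler (an assumption for
# illustration, not this project's actual implementation):
import os
import numpy as np
from sklearn.preprocessing import StandardScaler

def fit_scaler(files):
    """Fit a single StandardScaler incrementally over every file."""
    scaler = StandardScaler()
    for path in files:
        scaler.partial_fit(np.loadtxt(path, delimiter=','))
    return scaler

def standardize(files, scaler, dest_dir):
    """Apply the fitted scaler to each file and save the result under dest_dir."""
    for path in files:
        scaled = scaler.transform(np.loadtxt(path, delimiter=','))
        np.savetxt(os.path.join(dest_dir, os.path.basename(path)), scaled, delimiter=',')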
Example #2
def dimensionality_reduction(data, labels, method=1):
    '''
    Reduce the dimensionality of the data with PCA or LDA.

    :param data: feature matrix to reduce
    :param labels: class labels (used by LDA)
    :param method: 1 = PCA, 2 = LDA
    :return: standardized DataFrame of the reduced data with a 'labels' column
    '''
    # PCA
    if method == 1:

        reductedPCAData = mpl.mlab.PCA(data)
        # Build a DataFrame pairing the reduced data with its labels.
        dataFramePCA = pds.DataFrame(reductedPCAData.Y)
        dataFramePCA['labels'] = pds.DataFrame(labels)
        dataFramePCA = ut.standardize(dataFramePCA)
        reduction = dataFramePCA
    # LDA
    else:
        clf = skl.LDA(n_components=3)

        # clf.fit_transform(data, labels)
        # Build a DataFrame pairing the reduced data with its labels.
        dataFrameLDA = pds.DataFrame(clf.fit_transform(data, labels))
        dataFrameLDA['labels'] = pds.DataFrame(labels)
        dataFrameLDA = ut.standardize(dataFrameLDA)
        reduction = dataFrameLDA
    return reduction
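
# Note: matplotlib.mlab.PCA has been removed from Matplotlib, and the old sklearn LDA
# class now lives at sklearn.discriminant_analysis.LinearDiscriminantAnalysis. A rough
# sketch of an equivalent written against current scikit-learn APIs (not a drop-in
# replacement for the function above, and it skips the extra standardize step):
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def dimensionality_reduction_sklearn(data, labels, method=1):
    if method == 1:
        reduced = PCA().fit_transform(data)  # unsupervised projection
    else:
        reduced = LinearDiscriminantAnalysis(n_components=3).fit_transform(data, labels)
    frame = pd.DataFrame(reduced)
    frame['labels'] = pd.Series(labels)
    return frame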
def preprocess(no_wells_marmousi, no_wells_seam):
    """Function initializes data, performs standardization, and train test split
    
    Parameters
    ----------
    no_wells_marmousi : int
        number of evenly spaced wells and seismic samples to be evenly sampled
        from the Marmousi section.
        
    no_wells_seam : int
        number of evenly spaced wells and seismic samples to be evenly sampled from SEAM
        
    Returns
    -------
    seismic_marmousi : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section for marmousi
        
    seismic_seam : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section for SEAM
        
    model_marmousi : array_like, shape(num_wells, depth samples)
        2-D array containing model section from marmousi 2
        
    model_seam : array_like, shape(num_wells, depth samples)
        2-D array containing model section from SEAM
    
    """

    # get project root directory
    project_root = os.getcwd()

    # if the data directory does not exist, extract it
    if not os.path.isdir('data'):
        extract('data.zip', project_root)

    # Load data
    seismic_marmousi = np.load(join(
        'data', 'marmousi_synthetic_seismic.npy')).squeeze()
    seismic_seam = np.load(join('data',
                                'poststack_seam_seismic.npy')).squeeze()[:,
                                                                         50:]
    seismic_seam = seismic_seam[::2, :]

    # Load targets and standardize data
    model_marmousi = np.load(join('data',
                                  'marmousi_Ip_model.npy')).squeeze()[::5, ::4]
    model_seam = np.load(join('data',
                              'seam_elastic_model.npy'))[::3, :, ::2][:, :,
                                                                      50:]
    model_seam = model_seam[:, 0, :] * model_seam[:, 2, :]

    # standardize
    seismic_marmousi, model_marmousi = standardize(seismic_marmousi,
                                                   model_marmousi,
                                                   no_wells_marmousi)
    seismic_seam, model_seam = standardize(seismic_seam, model_seam,
                                           no_wells_seam)

    return seismic_marmousi, seismic_seam, model_marmousi, model_seam
Example #4
def resolve(query, kb):
    threshold = kb.size
    tbu = indexed_kb()
    query = cleanup_query(query)
    query[0]["truth"] = not query[0]["truth"]
    tbu.add(query, occur_check=True)
    iter = 0
    #print query
    start = time.time()
    while not tbu.empty():
        iter += 1
        x, parent = tbu.pop()
        if not sanity_check(parent, kb, threshold):
            continue
        #print "Popped: ", x
        for x_pred in x:
            if not x_pred["truth"]:
                indices = get_indices(kb.true, x_pred["name"])
            else:
                indices = get_indices(kb.false, x_pred["name"])
            #print "X_pred is ", x_pred, indices
            for index in indices:
                y = kb.all[index]
                for y_pred in y:
                    sub = unify(x_pred, y_pred)
                    if sub is not None:
                        resolved_sentence = get_resolved_sentence(
                            copy.deepcopy(x), copy.deepcopy(y),
                            copy.deepcopy(x_pred), copy.deepcopy(y_pred), sub)
                        if resolved_sentence == []:
                            return True
                        if isTrue(resolved_sentence):
                            return False
                        resolved_sentence = standardize(resolved_sentence)
                        new_parent = copy.deepcopy(parent)
                        if index not in new_parent:
                            new_parent[index] = 0
                        new_parent[index] += 1
                        tbu.add(resolved_sentence,
                                new_parent,
                                occur_check=True,
                                verbose=False)
                        print(tbu.size, iter)
                        if len(resolved_sentence) > 10000:
                            print "x: ", x
                            print "y: ", y
                            print "sub: ", sub
                            print "Resolved: ", resolved_sentence
                            print tbu.size
                            print '\n'
                            xxx = input()
        end = time.time()
        if (end - start) > 10:
            print "Breaking out in 10 seconds"
            break
        x = standardize(x)
        kb.add(x, occur_check=True)
    return False
Example #5
    def fit(self, path, print_after=1, plot=False):
        """Wrapper method for training and saving the model"""
        X_train, X_test, Y_train, Y_test = self._load(path)

        Y_train = Y_train.reshape((1, -1))
        Y_test = Y_test.reshape((1, -1))

        X_train = standardize(X_train)
        X_test = standardize(X_test)

        _, n_feature = X_train.shape

        accuracy_to_plot = []
        error_to_plot = []

        curr_best = -1
        for iter_ in range(self.n_init):
            print("Running Model {}".format(iter_ + 1))
            self._init_weight(n_feature)
            cost, accuracy, error = self._train(X_train, Y_train, print_after,
                                                plot)

            if iter_ == 0 or cost < curr_best:
                self._save()
                curr_best = cost
                accuracy_to_plot = accuracy
                error_to_plot = error

        print("Loading the best model ...")
        dict_ = self.load_state_dict()
        self.w = dict_['w']
        self.b = dict_['b']

        if plot:
            plt.figure(1)
            plt.plot(range(self.n_epoch + 1), error_to_plot, c='b')
            plt.xlabel('Number of Epochs')
            plt.ylabel('Logistic Loss')
            plt.title('Loss Function vs Epochs')
            plt.savefig('./regr_error_plot.png')

            plt.figure(2)
            plt.plot(range(self.n_epoch + 1), accuracy_to_plot, c='r')
            plt.xlabel('Number of Epochs')
            plt.ylabel('Accuracy %')
            plt.title('Accuracy vs Epochs')
            plt.savefig('./regr_accuracy_plot.png')

        Y_pred = self.classify(X_test)
        dict_ = metrics(Y_test.reshape(-1), Y_pred.reshape(-1))
        print("Validation Accuracy: {:4}".format(dict_['accuracy']))
        print("F-Score: {:4}".format(100 * dict_['f1-score']))
Example #6
def load_data(data_name):
    timer = utils.timer(name='main')
    data_path = './data/' + data_name
    user_pref_file = data_path + '/U_BPR.npy'
    item_pref_file = data_path + '/V_BPR.npy'
    item_content_file = data_path + '/item_features.txt'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'
    dat = {}

    # load preference data
    timer.tic()
    dat['u_pref'] = np.load(user_pref_file)
    dat['v_pref'] = np.load(item_pref_file)
    timer.toc('loaded U:%s,V:%s' %
              (str(dat['u_pref'].shape), str(dat['v_pref'].shape))).tic()

    # pre-process preference data
    _, dat['u_pref'] = utils.standardize(dat['u_pref'])
    _, dat['v_pref'] = utils.standardize_2(dat['v_pref'])
    timer.toc('standardized U,V').tic()

    # load item(article) content data
    # load_svmlight_file(file): reads a data file in svmlight format, stored as
    # <label> <feature-id>:<feature-value> <feature-id>:<feature-value> ...
    # If the zero_based option is False, all indices are decremented by 1.
    # Returns (X, y), where X is a scipy.sparse matrix and y is a numpy.ndarray.
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    # TF-IDF text features
    item_content = tfidf(item_content)
    # dimensionality reduction via randomized SVD
    u, s, _ = randomized_svd(item_content, n_components=300, n_iter=5)
    item_content = u * s
    # standardize the features
    _, item_content = utils.standardize(item_content)
    dat['item_content'] = item_content
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    timer.toc('read train triplets %s' % str(train.shape))

    dat['test_eval'] = data.load_eval_data(test_file)
    dat['vali_eval'] = data.load_eval_data(vali_file)
    return dat
Example #7
 def fit(self, X, y):
     """Transforms dataset using computed S_w & S_b matrices to form new discriminants"""
     
     # if number of discriminants is not specified, make it equal to # of columns in dataset
     if self.n_discriminants is None:
         self.n_discriminants  = X.shape[1]
         
     # standardize data if specified    
     if self.centered:
         X_fit = standardize(X)
     else:
         X_fit = X
         
     # calculate S_w and S_b, to be used for eigen decomposition    
     S_b    = self.between_class_matrix(X_fit, y)
     S_w    = self.within_class_matrix(X_fit, y)
     inv_Sw = np.linalg.inv(S_w)
     
     # get eigen values and eigen vectors to be used for data transformation
     eigen_vals, eigen_vecs = np.linalg.eig(inv_Sw @ S_b)
     
     # pair each eigen value with its eigen vector
     eigen_pairs           = [(eigen_vals[i], eigen_vecs[:, i]) for i in range(len(eigen_vals))]
     # sort from high to low
     sorted_pairs          = sorted(eigen_pairs, key=lambda x: x[0], reverse=True)
     # stack discriminants in appropriate order
     self.discriminants_   = np.hstack([sorted_pairs[i][1][:, np.newaxis].real for i in range(self.n_discriminants)])
     # calculated total explained variance for included discriminants
     self.variance_ratios_ = [np.abs(pair[0].real)/np.sum(eigen_vals.real) for pair in sorted_pairs[:self.n_discriminants]]
     
     return self
Example #8
    def fit(self, X):
        """
        Determine the eigenvalues and eigenvectors of the feature matrix.
        Returns itself to be chained w/ the fit_transform() method
        """

        # if no value for n_components is specified, create one for each column in dataset
        if self.n_components is None:
            self.n_components = X.shape[1]

        # standardize dataset, if specified
        if self.centered:
            X = standardize(X)

        # create covariance matrix, perform eigen decomposition
        # return the eigenvalues and eigen vectors from decomposition
        cov_mat = np.cov(X.T)
        eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)

        # pair each eigen value with its eigen vector
        eigen_pairs = [(eigen_vals[i], eigen_vecs[:, i])
                       for i in range(len(eigen_vals))]
        # sort by eigenvalue from high to low
        sorted_pairs = sorted(eigen_pairs, key=lambda pair: pair[0], reverse=True)
        # stack components in appropriate order
        self.components_ = np.hstack([sorted_pairs[i][1][:, np.newaxis]
                                      for i in range(self.n_components)])

        self.variance_ratios_ = [
            np.abs(pair[0].real) / np.sum(eigen_vals.real)
            for pair in sorted_pairs[:self.n_components]
        ]

        return self
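
# A brief usage sketch for the class above. The class and constructor keyword names are
# assumed from the attributes referenced in fit(); the projection simply multiplies the
# standardized data by the stacked components_ matrix:
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 4))

pca = PCA(n_components=2, centered=True)  # hypothetical constructor signature
pca.fit(X_demo)
X_proj = standardize(X_demo) @ pca.components_  # shape: (100, 2)
print(X_proj.shape, sum(pca.variance_ratios_))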
Example #9
def load_data(files, vnet, batch_size, num_gpus, norm):
    """Loads and preprocesses data."""

    # Optionally standardizes data.
    if norm:
        arr = [standardize(np.load(file)) for file in files]
    else:
        arr = [np.load(file) for file in files]

    if len(arr) == 1:
        arr = arr[0]
    # If all the same shape, concat.
    elif len(set([sub_arr.shape for sub_arr in arr])) == 1:
        arr = np.concatenate(arr)

    # 2D case with different shapes not implemented
    else:
        raise NotImplementedError()

    # Ensure dimensionality is correct.
    if arr.ndim == 4 and arr.shape[3] == 2:
        arr = arr[:, :, :, 1]
    elif arr.ndim == 4:
        arr = arr[:, :, :, 0]
    arr = np.expand_dims(arr, axis=3)

    # coords and orig_shape are not computed in this variant
    return arr, None, None
 def fit(self, X, y):
     """
     Determine statistical relationship between columns in X and target variable y
     """
     # standardize feature matrix if needed
     X_fit = np.zeros(X.shape)
     if self.centered:
         X_fit = standardize(X)
     else:
         X_fit = X
         
     # if not using gradient descent, solve with the closed-form solution
     if not self.gd:
         # add bias unit
         X_fit      = np.c_[np.ones(len(X_fit)), X_fit]
         self.coef_ = np.linalg.inv(X_fit.T @ X_fit + self.alpha * np.eye(X_fit.shape[1])) @ X_fit.T @ y
         
     # otherwise, use gradient descent
     else:
         # initialize weights, adding an extra for the intercept
         self.coef_  = np.random.normal(loc=0, scale=0.1, size=X.shape[1] + 1)
         self.cost_  = []
         
         for i in range(self.n_iter):
             l2_grad        = self.alpha * self.coef_[1:]             # update l2 gradient
             l2_penalty     = self.alpha * np.sum(self.coef_[1:]**2)  # update l2 loss term
             output         = self.predict(X_fit)                     # make prediction - linear output
             errors         = y - output                              # get error column
             gradient       = (X_fit.T @ errors + l2_grad) * 1/len(X) # get error wrt to each column, add l2, scale by 1/m
             self.coef_[1:] += gradient * self.eta                    # update the weights by gradients * learning rate
             self.coef_[0]  += errors.sum() * self.eta  * 1/len(X)    # update intercept by error column * learning rate * 1/m
             cost           = (np.sum(errors**2) + l2_penalty) / 2    # compute the cost
             self.cost_.append(cost)                                  # log it
Example #11
    def fit(self, X, y):
        """
        Determine statistical relationship between columns in X and target variable y
        """
        # standardize feature matrix if needed
        if self.centered:
            X_fit = standardize(X)
        else:
            X_fit = X

        # if not using gradient descent, solve with the closed-form solution
        if not self.gd:
            # add bias unit
            X_fit = np.c_[np.ones(len(X_fit)), X_fit]
            self.coef_ = np.linalg.inv(X_fit.T @ X_fit) @ X_fit.T @ y

        # otherwise, use gradient descent
        else:
            rgen = np.random.RandomState()
            # initialize weights, adding an extra for the intercept
            self.coef_ = rgen.normal(loc=0, scale=0.1, size=X_fit.shape[1] + 1)
            self.cost_ = []

            for i in range(self.n_iter):
                output = self.predict(X_fit)  # create prediction
                errors = y - output  # get errors
                gradient = X_fit.T @ errors * 1 / len(
                    X
                )  # get gradient w.r.t. each column, scale by # of samples
                self.coef_[1:] += gradient * self.eta  # update weights
                self.coef_[0] += errors.sum() * self.eta * 1 / len(
                    X)  # update intercept -- no regularization
                cost = np.sum(errors**2) / 2  # calculate cost
                self.cost_.append(cost)  # log it
Example #12
	def _corrupt(self, data, corruption):
		
		if type(corruption) == float:
			cdata = np.random.binomial(size=data.shape, n=1, p=1.-corruption) * data
		elif np.shape(np.asarray(corruption).T) == np.shape(data):
			cdata = corruption.T
		else:
			if self.layers[0].data_std is not None and self.layers[0].data_norm is not None:
				scales = np.random.uniform(low=corruption[0], high=corruption[1], size=data.shape[1])
				
				data = u.unnormalise(data, self.layers[0].data_norm[0], self.layers[0].data_norm[1])
				data = u.unstandardize(data, self.layers[0].data_std[0], self.layers[0].data_std[1])
				
				p = np.random.binomial
				noise_maps = [np.random.normal(scale=sig, size=data.shape[0]) for sig in scales] #* p(1, 0.5) 
				noise_maps = np.asarray(noise_maps)
				cdata = data + noise_maps.T
				
				cdata, _, _ = u.standardize(cdata, self.layers[0].data_std[0], self.layers[0].data_std[1])
				cdata, _, _ = u.normalise(cdata, self.layers[0].data_norm[0], self.layers[0].data_norm[1])
				
				# Just making sure we're not out of bounds:
				min_thr = 1e-6
				max_thr = 0.99999
				
				#if ((cdata < min_thr).sum() > 0 or (cdata > max_thr).sum() > 0) and False:
				#	print np.amin(data), np.amax(data), np.mean(data), np.std(data)
				#	print 'N/C:', (cdata < min_thr).sum(), (cdata > max_thr).sum()
				#	print np.amin(cdata), np.amax(cdata), np.mean(cdata), np.std(cdata)
				#	print 
				cdata[cdata < min_thr] = min_thr
				cdata[cdata > max_thr] = max_thr
			else:
				raise RuntimeError("Can't corrupt the data (%s, %s). You must provide the normalisation and standardisation values." % (self.layers[0].data_std, self.layers[0].data_norm))

		return cdata
Example #13
def predict(model, x, x_params_list, y_params, is_many=True):
    """
    支持单样本和多样本预测
    当为多样本时,x = [zz_inputs, xx_inputs, ly_inputs, xc_inputs, decoder_inputs],其中.._inputs = (simples, units, features)
    当为多样本时,x = [zz_input, xx_input, ly_input, xc_input, decoder_input],其中.._input = (units, features)
    :param model: AQPredict模型
    :param x:
    :param x_params_list:
    :param y_params:
    :param is_many: 是否为多样本
    :return: y_pred = (simples, features) or (1, features)
    """
    if is_many:
        x_stded = standardize(x, x_params_list, return_params_list=False)
        return _predict_many(model, x_stded, y_params)
    else:
        x_stded = standardize(x, x_params_list, return_params_list=False)
        return _predict_one(model, x_stded, y_params)
Example #14
def main(args):
    proj_path = os.getcwd()
    data_path = 'data'
    test_path = data_path + '/test/preprocessed'
    model_save_path = 'model'

    save_freq = 10
    max_epoch = 5000
    max_patience = 30
    window_size = 7
    num_features = 264
    batch_size = 16

    net = torch.load(args[1])

    test_x_list, test_y_list = utils.data_load('data/final/preprocessed')

    train_piece_lens = []
    test_piece_lens = []

    for i in range(len(test_x_list)):
        # Add 1 to the data so the log transform is well defined.
        # This can be inverted in the post-processing phase.
        test_x_list[i] = utils.standardize(test_x_list[i] + 1, log=True).T
        test_y_list[i] = test_y_list[i].T
        test_piece_lens.append(test_x_list[i].shape[0])

        print('test loaded {}/{}'.format(i + 1, len(test_x_list)))

    test_x = np.vstack(test_x_list)
    del test_x_list
    test_y = np.vstack(test_y_list)
    del test_y_list

    # For GPU computing.
    dtype = torch.cuda.FloatTensor
    test_x = Variable(torch.Tensor(test_x).type(dtype))
    test_x.volatile = True
    test_y = Variable(torch.Tensor(test_y).type(dtype))
    test_y.volatile = True

    min_valid_loss = float('inf')
    patience = 0

    # criterion = nn.BCEWithLogitsLoss()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters())

    print('Preprocessing Completed.')

    # Train and calculate loss value.
    prec, recall, acc = run_test(net, test_x, test_y, criterion,
                                 test_piece_lens, batch_size, window_size)
    f_score = 2 * prec * recall / (prec + recall)

    print('Precision: {}\tRecall: {}\tAccuracy: {}'.format(prec, recall, acc))
    print('F-score: {}'.format(f_score))
Example #15
def load_data(files, vnet, batch_size, num_gpus, norm):
    """Loads and preprocesses data."""

    # Optionally standardizes data.
    if norm:
        arr = [standardize(np.load(file)) for file in files]
    else:
        arr = [np.load(file) for file in files]

    if len(arr) == 1:
        arr = arr[0]
    # If all the same shape, concat.
    elif len(set([sub_arr.shape for sub_arr in arr])) == 1:
        arr = np.concatenate(arr)
    # If different shapes and 3D, chunk then concat.
    elif vnet:
        # TODO: Somehow save coords and orig_shape for each sub_arr.
        # Low priority because this block only used for training data right now.
        if arr[0].ndim == 4 and arr[0].shape[3] == 2:
            arr = [sub_arr[:, :, :, 1] for sub_arr in arr]
        elif arr[0].ndim == 4:
            arr = [sub_arr[:, :, :, 0] for sub_arr in arr]
        arr = [np.expand_dims(sub_arr, axis=3) for sub_arr in arr]

        chunked = [chunks(sub_arr, trim=False) for sub_arr in arr]
        arr = np.concatenate([chunk[0] for chunk in chunked])

        # Avoids https://github.com/keras-team/keras/issues/11434
        last_batch_gpus = (arr.shape[0] % batch_size) % num_gpus
        if last_batch_gpus != 0:
            arr = arr[:-last_batch_gpus, :, :, :, :]

        return arr, None, None

    # 2D case with different shapes not implemented
    else:
        raise NotImplementedError()

    # Ensure dimensionality is correct.
    if arr.ndim == 4 and arr.shape[3] == 2:
        arr = arr[:, :, :, 1]
    elif arr.ndim == 4:
        arr = arr[:, :, :, 0]
    arr = np.expand_dims(arr, axis=3)

    # Chunks data if necessary.
    if vnet:
        arr, coords, orig_shape = chunks(arr)
    else:
        # Avoids https://github.com/keras-team/keras/issues/11434
        last_batch_gpus = (arr.shape[0] % batch_size) % num_gpus
        if last_batch_gpus != 0:
            arr = arr[:-last_batch_gpus, :, :, :]
        # coords and orig_shape must be defined for the return below
        coords = None
        orig_shape = arr.shape

    return arr, coords, orig_shape
Example #16
    def _compute_spatial_kernels(self, train_paths, test_paths):
        for fn_train, fn_test in zip(train_paths, test_paths):
            # Process train set.
            ss = np.fromfile(fn_train, dtype=np.float32)

            xx = self.spatial_sstats_to_spatial_features(ss, self.gmm)
            xx, mu, sigma = standardize(xx)
            xx = power_normalize(xx, 0.5)
            self.Zx += compute_L2_normalization(xx)

            self.Kxx += dot(xx, xx.T)

            # Process test set.
            ss = np.fromfile(fn_test, dtype=np.float32)

            yy = self.spatial_sstats_to_spatial_features(ss, self.gmm)
            yy = standardize(yy, mu, sigma)[0]
            yy = power_normalize(yy, 0.5)
            self.Zy += compute_L2_normalization(yy)

            self.Kyx += dot(yy, xx.T)
Example #17
    def _compute_kernels(self, train_paths, test_paths):
        for fn_train, fn_test in zip(train_paths, test_paths):
            # Process train set.
            ss = np.fromfile(fn_train, dtype=np.float32)

            xx = self.sstats_to_features(ss, self.gmm)
            xx, mu, sigma = standardize(xx)
            xx = power_normalize(xx, 0.5)
            self.Zx += compute_L2_normalization(xx)

            self.Kxx += dot(xx, xx.T)

            # Process test set.
            ss = np.fromfile(fn_test, dtype=np.float32)

            yy = self.sstats_to_features(ss, self.gmm)
            yy = standardize(yy, mu, sigma)[0]
            yy = power_normalize(yy, 0.5)
            self.Zy += compute_L2_normalization(yy)

            self.Kyx += dot(yy, xx.T)
Example #18
    def read_data(self, index):
        """This function is used to read the data with the index

        :param index: the index of the data you want to get.
        """

        # if this is for training, just load the data from the training list
        if self.training:
            x1 = self.train_images[index]  # the first list of images (ADC)
            x2 = self.train_images[index]  # the second list of images (T2WI)
            y = self.train_labels[index]  # the list of labels
        else:  # if this is for testing, just load the data from the testing list
            x1 = self.test_images[index]  # the first list of images (ADC)
            x2 = self.test_images[index]  # the second list of images (T2WI)
            y = self.test_labels[index]  # the list of labels

        height, width = x1.shape  # get the size of the image
        x1 = normalize(
            x1.reshape(height, width,
                       1))  # apply the normalization (norm to range [0, 1])
        x1 = standardize(x1)  # apply the standardization (reshape the data)

        x2 = normalize(
            x2.reshape(height, width,
                       1))  # apply the normalization (norm to range [0, 1])
        x2 = standardize(x2)  # apply the standardization (reshape the data)

        # apply data augmentation
        augmented_data = data_augmentation(np.concatenate([x1, x2], axis=2),
                                           use_rigid=self.use_rigid,
                                           use_non_rigid=self.use_non_rigid)

        # NOTE: because the data I used has multiple classes, I had to modify it a bit. Remove the following line (just one line).
        y = (y != 1).astype(np.uint8)  # remove this
        return augmented_data[:, :, :, :
                              3], augmented_data[:, :, :,
                                                 3:], tf.keras.utils.to_categorical(
                                                     y,
                                                     num_classes=2,
                                                     dtype='float32')
Example #19
def _load_data(dataset, is_training=False):
    """Load input data, target values and file names for a dataset.

    The input data is assumed to be a dataset of feature vectors. These
    feature vectors are standardized using a scaler that is either
    loaded from disk (if it exists) or computed on-the-fly. The latter
    is only possible if the input data is training data, which is
    indicated by the `is_training` parameter.

    Target values and file names are read from the metadata file.

    Args:
        dataset: Structure encapsulating dataset information.
        is_training (bool): Whether the input data is training data.

    Returns:
        x (np.ndarray): The input data.
        y (np.ndarray): The target values.
        names (list): The associated file names.
    """
    import data_augmentation as aug
    import features

    features_path = os.path.join(cfg.extraction_path, dataset.name + '.h5')
    x = utils.timeit(lambda: features.load_features(features_path),
                     'Loaded features of %s dataset' % dataset.name)

    # Clip dynamic range to 90 dB
    x = np.maximum(x, x.max() - 90.0)

    # Load scaler from file if cached, or else compute it.
    scaler_path = cfg.scaler_path
    if os.path.exists(scaler_path) or not is_training:
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
    else:
        scaler = utils.timeit(lambda: utils.compute_scaler(x),
                              'Computed standard scaler')
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)

    x = utils.timeit(lambda: utils.standardize(x, scaler),
                     'Standardized %s features' % dataset.name)

    names, y = utils.timeit(lambda: utils.read_metadata(dataset.metadata_path),
                            'Loaded %s metadata' % dataset.name)
    if dataset == cfg.training_set and cfg.enable_augmentation:
        names, y = aug.expand_metadata((names, y))

    return x, y, names
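
# utils.compute_scaler and utils.standardize are not shown here; a plausible minimal
# pairing built on scikit-learn's StandardScaler (an assumption about this project's
# helpers, kept for illustration only):
from sklearn.preprocessing import StandardScaler

def compute_scaler(x):
    """Fit a standard scaler on an array of feature vectors."""
    return StandardScaler().fit(x.reshape(-1, x.shape[-1]))

def standardize(x, scaler):
    """Apply a previously fitted scaler, preserving the original array shape."""
    return scaler.transform(x.reshape(-1, x.shape[-1])).reshape(x.shape)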
Example #20
def load_data(data_name):
    timer = utils.timer(name='main').tic()
    data_path = './data/' + data_name
    u_file = data_path + '/U_BPR.npy'
    v_file = data_path + '/V_BPR.npy'
    user_content_file = data_path + '/user_content.npz'
    train_file = data_path + '/train.csv'
    test_file = data_path + '/test.csv'
    vali_file = data_path + '/vali.csv'

    dat = {}
    # load preference data
    timer.tic()
    u_pref = np.load(u_file)
    v_pref = np.load(v_file)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref'] = utils.standardize_2(u_pref)
    _, dat['v_pref'] = utils.standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content = scipy.sparse.load_npz(user_content_file)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' %
              (str(user_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(train_file, dtype=np.int32)
    dat['user_list'] = train['uid'].values
    dat['item_list'] = train['iid'].values
    dat['warm_item'] = np.unique(train['iid'].values)
    timer.toc('read train triplets %s' % str(train.shape)).tic()

    dat['vali_eval'] = data.load_eval_data(vali_file,
                                           cold_user=True,
                                           test_item_ids=dat['warm_item'])
    dat['test_eval'] = data.load_eval_data(test_file,
                                           cold_user=True,
                                           test_item_ids=dat['warm_item'])
    return dat
Example #21
    def predict(file):
        data = pd.read_csv(file)
        data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[])

        # pre-process data
        try:
            modify = pd.read_csv(file.replace('.csv', '_modified.csv'))
        except FileNotFoundError:
            modify = RacingPredictor.pre_process(file, persistent=True)

        # perform standardization
        modify = standardize(modify)

        # slice data
        x_test, y_test = slice_classification_data(modify)

        # prediction
        clf = lgb.Booster(model_file='lgb_classifier.txt')

        winprob = clf.predict(x_test)
        data['winprob'] = 0

        i = 0
        groups = data.groupby(['rdate', 'rid'])
        for name, group in groups:
            total = np.sum(winprob[i, 0:len(group)])

            j = 0
            for index, row in group.iterrows():
                row['winprob'] = winprob[i, j] / total
                data.iloc[index] = row
                j += 1
            i += 1

        data['plaprob'] = WinP2PlaP(data, wpcol='winprob')

        fixratio = 1 / 10000
        mthresh = 9
        print("Getting win stake...")
        data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] >
                                       mthresh)
        print("Getting place stake...")
        data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] >
                                       mthresh)

        data.to_csv('test_result.csv')
Example #22
    def train(self):
        # pre-process data
        try:
            modify = pd.read_csv(self.file.replace('.csv', '_modified.csv'))
        except FileNotFoundError:
            modify = RacingPredictor.pre_process(self.file, persistent=True)

        # shuffle among groups
        groups = [
            df.transform(np.random.permutation)
            for _, df in modify.groupby(['rdate', 'rid'])
        ]
        modify = pd.concat(groups).reset_index(drop=True)
        # drop outdated data
        # modify = modify[:][[val > '2017' for val in modify['rdate']]]
        # perform standardization
        modify = standardize(modify)

        # slice data
        x_train, y_train = slice_classification_data(modify)

        # convert training data into LightGBM dataset format
        d_train = lgb.Dataset(x_train, label=y_train)

        params = dict()
        params['learning_rate'] = 3e-4
        params['boosting_type'] = 'rf'
        params['objective'] = 'multiclass'
        params['metric'] = 'multi_logloss'
        params['num_class'] = 16

        params['bagging_freq'] = 1
        params['bagging_fraction'] = 0.8
        # params['lambda_l1'] = 10
        # params['lambda_l2'] = 1
        # params['max_depth'] = 10
        # params['cat_smooth'] = 10
        # params['feature_fraction'] = 0.8
        # params['num_leaves'] = 128
        # params['min_data_in_leaf'] = 32

        self.lgb_model = lgb.train(params, d_train, 400)

        self.lgb_model.save_model('lgb_classifier.txt',
                                  num_iteration=self.lgb_model.best_iteration)
Example #23
def joint_scores(query_features,
                 query_cams,
                 query_frames,
                 gallery_features,
                 gallery_cams,
                 gallery_frames,
                 distribution,
                 alpha=5,
                 interval=100):

    query_features, gallery_features = standardize(query_features,
                                                   gallery_features)

    scores = torch.Tensor()

    for feature, cam, frame in zip(query_features, query_cams, query_frames):
        # n: Number of Gallery instances
        # (n, 1228*6) * 2048*6  -> n
        # Visual Feature Stream
        feature_score = torch.matmul(gallery_features, feature)

        # Size: n
        gallery_frames = gallery_frames
        gallery_cams = gallery_cams

        diff = torch.abs(gallery_frames - frame)
        hist_ = (diff / interval).type(torch.int16)
        # Size: n
        st_score = distribution[cam.type(torch.int16).tolist() -
                                1][(gallery_cams -
                                    1).type(torch.int16).tolist(),
                                   hist_.tolist()]
        st_score = torch.tensor(st_score)

        # score -> probabilities; This must be a formula from the paper!
        # Size: n
        score = 1/(1+torch.exp(-alpha*feature_score)) * \
            1/(1+2*torch.exp(-alpha*st_score))

        scores = torch.cat([scores, torch.unsqueeze(score,
                                                    dim=0)])  # all_scores

    # Size: k * n; k -> Num. of Query Instances
    return scores
Example #24
def test_regression(model):

    Regression = models[model]

    print ("-- Regression Tree --")

    # Load temperature data
    data = pd.read_csv('data/TempLinkoping2016.txt', sep="\t")

    time = np.atleast_2d(data["time"].values).T
    temp = np.atleast_2d(data["temp"].values).T

    X = standardize(time)        # Time. Fraction of the year [0, 1]
    y = temp[:, 0]  # Temperature. Reduce to one-dim
    print (X.shape, y.shape)

    X_train, y_train, X_test, y_test = split_train_test(X, y)

    model = Regression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)

    print ("Mean Squared Error:", mse)

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right')
    plt.show()
def main():
    # Load temperature data
    data = pd.read_csv('./TempLinkoping2016.txt', sep="\t")

    #[[0.00273224] [0.00546448] [0.00819672]......]
    time = np.atleast_2d(data["time"].values).T
    #[[  0.1] [ -4.5] [ -6.3]...]
    temp = np.atleast_2d(data["temp"].values).T

    #X:[[-1.72732488], [-1.71786008],....[-1.72732488]]
    X = standardize(time)  # Time. Fraction of the year [0, 1], standardized
    #[[  0.1] [ -4.5] [ -6.3]...]---------->[0.1,-4.5,-6.3........]
    y = temp[:, 0]  # Temperature. Reduce to one-dim

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        shuffle=True)

    model = RegressionTree()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model.print_tree(indent=' ')
    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"),
               loc='lower right')
    plt.show()
Example #26
def init_data(nsamples, dx, dy):
    Xold = np.linspace(0, 1000, nsamples * dx).reshape([nsamples, dx])
    X = utils.standardize(Xold)

    invertible = False
    while not invertible:
        W = np.random.randint(1, 10, size=(dy, dx))
        if linalg.cond(W) < 1 / sys.float_info.epsilon:
            invertible = True
            print('W invertible')

    Y = W.dot(X.T)  # target

    # for i in range(Y.shape[1]):
    #     Y[:, i] = utils.add_noise(Y[:, i])
    print('shapes Y = {}, X: {}, W: {}'.format(Y.shape, X.shape, W.shape))
    x = Variable(torch.from_numpy(X), requires_grad=True).type(torch.FloatTensor)
    y = Variable(torch.from_numpy(Y), requires_grad=True).type(torch.FloatTensor)
    w = Variable(torch.from_numpy(W), requires_grad=True).type(torch.FloatTensor)
    return x, y, w
def preprocess(no_wells):
    """Function initializes data, performs standardization, and train test split
    
    Parameters
    ----------
    no_wells : int
        number of evenly spaced wells and seismic samples to be evenly sampled
        from the seismic section.

        
    Returns
    -------
    seismic : array_like, shape(num_traces, depth samples)
        2-D array containing seismic section 
        
    model : array_like, shape(num_wells, depth samples)
        2-D array containing model section 

    """

    # get project root directory
    project_root = os.getcwd()

    # if the data directory does not exist, extract it
    if not os.path.isdir('data'):
        extract('data.zip', project_root)

    # Load data
    seismic = np.load(join('data',
                           'poststack_seam_seismic.npy')).squeeze()[:, 50:]
    seismic = seismic[::2, :]

    # Load targets and standardize data
    model = np.load(join('data', 'seam_elastic_model.npy'))[::3, :, ::2][:, :,
                                                                         50:]
    model = model[:, 0, :] * model[:, 2, :]

    # standardize
    seismic, model = standardize(seismic, model, no_wells)

    return seismic, model
 def __init__(self, sess, state_dim, action_dim, learning_rate):
     self.sess = sess
     self.s_dim = state_dim
     self.a_dim = action_dim
     self.learning_rate = learning_rate
     
     # Actor Network
     self.inputs, self.out = self.create_actor_network()
     
     # These returns will be provided by the discounted reward
     self.returns = tf.placeholder("float", [None,1], name='returns')
     self.actions = tf.placeholder("float", [None,self.a_dim], name='actions')
     
     # tf reward processing
     self.tf_discounted_epr = self.tf_discount_rewards(self.returns)
     self.tf_discounted_epr = utils.standardize(self.tf_discounted_epr)
     
     self.loss = tf.nn.l2_loss(self.actions-self.out)
     optimizer = tf.train.AdamOptimizer(self.learning_rate)
     grads = optimizer.compute_gradients(self.loss, var_list=tf.trainable_variables(), grad_loss=self.tf_discounted_epr)
     self.optimize = optimizer.apply_gradients(grads)      
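
# utils.standardize here operates on a TensorFlow tensor of discounted returns. A
# typical TF1-style implementation (an assumption, not this project's code):
import tensorflow as tf

def standardize(x, eps=1e-8):
    """Scale a tensor to zero mean and unit variance over all of its elements."""
    mean, var = tf.nn.moments(x, axes=list(range(x.shape.ndims)))
    return (x - mean) / (tf.sqrt(var) + eps)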
Example #29
    def fit(self):
        episode_length = len(self.states)

        # These targets are used for optimization step.
        discounted_rewards = self.discount_rewards(self.rewards)
        # Standardized discounted rewards
        discounted_rewards = standardize(discounted_rewards)
        advantages = np.zeros((episode_length, self.action_size))

        # Create inputs for our model (not crucial but it helps
        # to keep track of input dimension)
        update_input = np.zeros(((episode_length, ) + self.state_size))

        for i in range(episode_length):
            update_input[i, :] = self.states[i]

        # We predict on batch using list of states
        values = self.critic.predict(update_input)

        for i in range(episode_length):
            advantages[i][self.actions[i]] = discounted_rewards[i] - values[i]

        # Refer to "https://medium.freecodecamp.org/an-intro-to-advantage-actor-critic-methods-lets-play-sonic-the-hedgehog-86d6240171d"

        # The actor is trained with cross-entropy against the advantages (from the critic's values)
        actor_loss = self.actor.fit(update_input,
                                    advantages,
                                    batch_size=self.batch_size,
                                    epochs=1,
                                    verbose=0)
        # The critic is trained with MSE on its predicted values
        critic_loss = self.critic.fit(update_input,
                                      discounted_rewards,
                                      batch_size=self.batch_size,
                                      epochs=1,
                                      verbose=0)

        self.states, self.actions, self.rewards = [], [], []

        return values, actor_loss.history['loss'], critic_loss.history['loss']
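
# discount_rewards() and standardize() are defined elsewhere in this project; a common
# minimal implementation of the pair (an assumption shown for illustration):
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    """Discounted returns G_t = r_t + gamma * G_{t+1}, computed by iterating backwards."""
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted

def standardize(x):
    """Zero-mean, unit-variance scaling; the epsilon guards against a zero std."""
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + 1e-8)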
def main():

    print ("-- Regression Tree --")

    # Load temperature data
    data = pd.read_csv('../TempLinkoping2016.txt', sep="\t")

    time = np.atleast_2d(data["time"].values).T
    temp = np.atleast_2d(data["temp"].values).T

    X = standardize(time)        # Time. Fraction of the year [0, 1]
    y = temp[:, 0]  # Temperature. Reduce to one-dim

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    model = RegressionTree()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    y_pred_line = model.predict(X)

    # Color map
    cmap = plt.get_cmap('viridis')

    mse = mean_squared_error(y_test, y_pred)

    print ("Mean Squared Error:", mse)

    # Plot the results
    m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10)
    m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10)
    m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10)
    plt.suptitle("Regression Tree")
    plt.title("MSE: %.2f" % mse, fontsize=10)
    plt.xlabel('Day')
    plt.ylabel('Temperature in Celsius')
    plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right')
    plt.show()
Example #31
	def _corrupt(self, data):
		
		if type(self.corruption) == float:
			cdata = np.random.binomial(size=data.shape, n=1, p=1.-self.corruption) * data
		elif np.shape(np.asarray(self.corruption).T) == np.shape(data):
			cdata = self.corruption.T
		else:
			
			if self.data_std is not None and self.data_norm is not None:
				scales = np.random.uniform(low=self.corruption[0], high=self.corruption[1], size=data.shape[1])
				
				data = u.unnormalise(data, self.data_norm[0], self.data_norm[1])
				data = u.unstandardize(data, self.data_std[0], self.data_std[1])
				
				p = np.random.binomial
				noise_maps = [np.random.normal(scale=sig, size=data.shape[0]) for sig in scales] # * p(1, 0.5)
				noise_maps = np.asarray(noise_maps)
				
				cdata = data + noise_maps.T
				
				cdata, _, _ = u.standardize(cdata, self.data_std[0], self.data_std[1])
				cdata, _, _ = u.normalise(cdata, self.data_norm[0], self.data_norm[1])
				
				# Just making sure we're not out of bounds:
				min_thr = 1e-6
				max_thr = 0.99999
				
				#if ((cdata < min_thr).sum() > 0 or (cdata > max_thr).sum() > 0) and False:
				#	print np.amin(data), np.amax(data), np.mean(data), np.std(data)
				#	print 'N/C:', (cdata < min_thr).sum(), (cdata > max_thr).sum()
				cdata[cdata < min_thr] = min_thr
				cdata[cdata > max_thr] = max_thr
				
				#print np.amin(cdata), np.amax(cdata), np.mean(cdata), np.std(cdata)
			else:
				raise RuntimeError("Can't normalise the data (%s, %s). You must provide the normalisation and standardisation values. Giving up." % (self.data_std, self.data_norm))
		#print np.amin(data), np.amax(data)
		#print np.amin(cdata), np.amax(cdata)
		return cdata
Example #32
    def predict(file):
        data = pd.read_csv(file)
        data = cleanse_sample(data, keys=['rdate', 'rid', 'hid'], indices=[])

        # pre-process data
        try:
            modify = pd.read_csv(file.replace('.csv', '_modified.csv'))
        except FileNotFoundError:
            modify = RacingPredictor.pre_process(file, persistent=True)

        # perform standardization
        modify = standardize(modify)

        # slice data
        x_test, y_test = slice_naive_data(modify)

        # prediction
        clf = lgb.Booster(model_file='lgb_classifier.txt')

        winprob = clf.predict(x_test)

        data['winprob'] = winprob[:, 1]
        data['plaprob'] = winprob[:, 1] + winprob[:, 2] + winprob[:, 3]

        fixratio = 5e-3
        mthresh = 1.6
        print("Getting win stake...")
        data['winstake'] = fixratio * (data['winprob'] * data['win_t5'] >
                                       mthresh)
        print("Getting place stake...")
        data['plastake'] = fixratio * (data['plaprob'] * data['place_t5'] >
                                       mthresh)

        data.to_csv('test_result.csv')

        return data
Example #33
do_regularize = False

y_, song_id, nb_of_songs = load_y(DATADIR)
X_ = load_X(DATADIR, song_id)

# Now let's mix everything so that we can take test_set and train_set independently
# We need to separate PER SONG
X_train, y_train, X_test, y_test, song_id_tst = mix(X_, y_, PURCENT,
                                                    NUM_FRAMES, song_id,
                                                    nb_of_songs)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# print X_train[0:3,0:3]

# standardize data
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)

X_train = X_train[:, [
    10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103, 140, 142, 146, 148,
    212, 214, 218, 220
]]
X_test = X_test[:, [
    10, 12, 13, 17, 19, 82, 83, 84, 85, 89, 90, 91, 103, 140, 142, 146, 148,
    212, 214, 218, 220
]]
# X_train = X_train[:,[13,85,103,142,214]]
# X_test = X_test[:,[13,85,103,142,214]]

# one dimension at a time
# 0: arousal, 1: valence
Example #34
def main(POWER, INPUT_NOISE, C, REG, ATTENTION_HIDDEN, HIDDEN_SIZE, N_LAYERS):
    directory = sys.argv[1]#'data/mat/fox_100x100_matlab.mat'
    D = io.loadmat(directory)
    features0 = D['features'].todense()
## remove identical features
    uniid = []
    for i in range(features0.shape[1]):
        if len(np.unique(np.array(features0[:,i]))) == 1:
            uniid.append(i)
    features = np.delete(features0,uniid,axis = 1)
## standardize all data (maybe flawed)
    all_mean,all_std = utils.standardize(features)
    features = (features - all_mean)/all_std

    from sklearn.decomposition import PCA
    pca = PCA()
    pca.fit(features)
    loading = pca.explained_variance_ratio_

    n_components = len(loading)
    for p in range(len(loading)):
        if sum(loading[:p])>POWER:
            n_components = p
            break
    features = pca.transform(features)[:, :n_components]
#pdb.set_trace()

#features = features0
#pdb.set_trace()
    labels = np.array(D['labels'].todense())[0]
    bag_ids = D['bag_ids'][0]

    MAX_LENGTH = max([list(bag_ids).count(iBag) for iBag in set(bag_ids)])
    N_FEATURE_DIM = features.shape[1]

    X = np.zeros((len(set(bag_ids)),MAX_LENGTH,N_FEATURE_DIM))
    Y = np.zeros((len(set(bag_ids)),))
    M = np.zeros((len(set(bag_ids)),MAX_LENGTH))

    for iBag in set(bag_ids):
        instance_index = np.where(bag_ids == iBag)[0]
#    print instance_index[0]
#    print np.concatenate((features[instance_index],np.zeros((MAX_LENGTH-len(instance_index[0]),N_FEATURE_DIM))),axis = 0).shape
#    break
        X[iBag-1] = np.concatenate((features[instance_index],np.zeros((MAX_LENGTH-len(instance_index),N_FEATURE_DIM))),axis = 0).astype(theano.config.floatX)
        assert(len(set(labels[instance_index])) == 1)
        Y[iBag -1] = labels[instance_index[0]].astype(theano.config.floatX)
        Y[Y == -1] = 0
        M[iBag-1] = np.concatenate((np.ones(len(instance_index))
                                    ,np.zeros((MAX_LENGTH-len(instance_index)))),axis = 0).astype(theano.config.floatX)

    import csv
### train val test set
    DROPOUT_RATIO = 0
    learning_rate = 0.001
    R = 3
    s = '%s'%REG
    s = s.split()[1] ## s is the name of regularization
    expDir = os.path.join('exp_spearmint/',os.path.basename(directory),'PCA%.1f_innoise_%f_%snorm_%f_attention_%d_hidden_%d_layers_%d'%(POWER, INPUT_NOISE,  s, C, ATTENTION_HIDDEN, HIDDEN_SIZE, N_LAYERS)+os.path.sep)
    if not os.path.isdir(expDir):
        os.makedirs(expDir)
        with open(os.path.join(expDir,'README'),'w') as fid:
            fid.write('learning rate = %f\n'%learning_rate)
            fid.write('dropout ratio = %f\n'%DROPOUT_RATIO)
            fid.write('penalty factor = %f\n'%C)
    result = np.zeros(shape=(3,10))
    for r in range(R):
        k=0
        kf = KFold(X.shape[0],10,True)
        for train_index,test_index in kf:
            input_shape = (None, MAX_LENGTH, N_FEATURE_DIM)
            # Construct network
            layer = lasagne.layers.InputLayer(shape=input_shape, name='Input')
            n_batch, n_seq, n_features = layer.input_var.shape
            # Store a dictionary which conveniently maps names to layers we will
            # need to access later
            layers = {'in': layer}
            # Add dense input layer
            layer = lasagne.layers.GaussianNoiseLayer(layer,INPUT_NOISE)
            layer = lasagne.layers.ReshapeLayer(
                layer, (n_batch*n_seq, input_shape[-1]), name='Reshape 1')
            layer = lasagne.layers.DenseLayer(
                layer, HIDDEN_SIZE, W=lasagne.init.HeNormal(), name='Input dense',
                nonlinearity=lasagne.nonlinearities.leaky_rectify)
            layer = lasagne.layers.ReshapeLayer(
                layer, (n_batch, n_seq, HIDDEN_SIZE), name='Reshape 2')
            # Add the layer to aggregate over time steps
            # We must force He initialization because Lasagne doesn't like
            # 1-dim shapes in He and Glorot initializers
            layer = utils.AttentionLayer(
                    layer,ATTENTION_HIDDEN,
                    W=lasagne.init.Normal(1./np.sqrt(layer.output_shape[-1])),
                    name='Attention')
            for _ in range(N_LAYERS):
                layer = lasagne.layers.DenseLayer(
                    layer, HIDDEN_SIZE, W=lasagne.init.HeNormal(), name='Out dense 1',
                    nonlinearity=lasagne.nonlinearities.leaky_rectify)
                layer = lasagne.layers.DropoutLayer(layer, p=DROPOUT_RATIO)
            # Add final dense layer, whose bias is initialized to the target mean
            layer = lasagne.layers.DenseLayer(
                layer, 1, W=lasagne.init.HeNormal(), name='Out dense 3',
                nonlinearity=lasagne.nonlinearities.sigmoid)
            layer = lasagne.layers.ReshapeLayer(
                layer, (-1,))
            # Keep track of the final layer
            layers['out'] = layer
            #l_norm = regularize_layer_params(layer,l1)
            l_norm = regularize_layer_params(lasagne.layers.get_all_layers(layers['out']),REG)

            # Symbolic variable for target values
            target = T.vector('target')
            # Retrive the symbolic expression for the network
            network_output = lasagne.layers.get_output(layers['out'],deterministic=True)
            # Create a symbolic function for the network cost
            cost = T.mean(lasagne.objectives.binary_crossentropy(network_output,target))
            # try Hinge loss
            #cost = T.mean(lasagne.objectives.binary_hinge_loss(network_output,target))
            cost = cost + C*l_norm
            #cost = T.mean((network_output - target)**2)
            # Retrieve all network parameters
            all_params = lasagne.layers.get_all_params(layers['out'])
            # Compute updates
            updates = lasagne.updates.rmsprop(cost, all_params, learning_rate )
            # Compile training function
            train = theano.function([layers['in'].input_var, target],
                                    cost, updates=updates)

            # Accuracy is defined as binary accuracy
            compute_cost = theano.function([layers['in'].input_var, target],
                                    cost,)
            accuracy = T.sum(lasagne.objectives.binary_accuracy(network_output, target))
            compute_accuracy = theano.function(
                [layers['in'].input_var, target], accuracy)
            #print 'Model built.'


            X_train_all, X_test = X[train_index], X[test_index]
            y_train_all, y_test = Y[train_index], Y[test_index]
            m_train_all, m_test = M[train_index], M[test_index]
            kf_val = KFold(X_train_all.shape[0],10,True)
            for train_ind,val_ind in kf_val:
                X_train, X_val = X_train_all[train_ind], X_train_all[val_ind]
                y_train, y_val = y_train_all[train_ind], y_train_all[val_ind]
                m_train, m_val = m_train_all[train_ind], m_train_all[val_ind]
                break
            ## standardize three sets
#        x_tr_mean,x_tr_std = utils.standardize(X_train)
#
#        X_train = (X_train-x_tr_mean)/x_tr_std
#        X_val = (X_val-x_tr_mean)/x_tr_std
#        X_test = (X_test-x_tr_mean)/x_tr_std
#        print X_train
#        pdb.set_trace()

            MAX_EPOCH = 500
            NO_BEST = 10
            train_acc = np.array([])
            train_cost = np.array([])
            test_acc = np.array([])
            test_cost = np.array([])
            val_acc = np.array([])
            val_cost = np.array([])
            early_stop = False

            for iEpoch in range(MAX_EPOCH):
                b = batch_generator(X_train,y_train,m_train)
                trac = 0
                trco = 0
                for x_b,y_b in b:
                    #print x_b.shape,y_b.shape
                    train(x_b,y_b)
                    b_cost = compute_cost(x_b,y_b)
                    trco += b_cost
                    trac +=  compute_accuracy(x_b,y_b)
                if any([not np.isfinite(b_cost),
                            any([not np.all(np.isfinite(p.get_value()))
                            for p in all_params])]):
        #                logger.info('####### Non-finite values found, aborting')
                    print('####### Non-finite values found, aborting')
                    break
                train_acc = np.append(train_acc, trac/X_train.shape[0]) #compute_accuracy(x_b,y_b)
                train_cost = np.append(train_cost,trco/X_train.shape[0])

                vaco = 0
                vaac = 0
                bv = batch_generator(X_val,y_val,m_val)
                for xv,yv in bv:
                    vaco += compute_cost(xv,yv)
                    vaac += compute_accuracy(xv,yv)
                val_cost = np.append(val_cost,vaco)
                val_acc = np.append(val_acc,vaac)

                teac = 0
                teco = 0
                bt = batch_generator(X_test,y_test,m_test)
                for xt,yt in bt:
                    teac+= compute_accuracy(xt,yt)
                    teco+= compute_cost(xt,yt)
                test_acc = np.append(test_acc,teac)
                test_cost = np.append(test_cost,teco)

                if iEpoch > NO_BEST:
                    # Stop early only if the validation cost has been strictly
                    # increasing over each of the last NO_BEST epochs
                    early_stop = True
                    last_val = val_cost[-NO_BEST:]
                    for i in range(len(last_val) - 1):
                        if last_val[i] >= last_val[i+1]:
                            early_stop = False
                            break
                if early_stop:
                    #print "early stopping, last %s validation costs are: "%NO_BEST + ','.join([str(tmp) for tmp in last_val])
                    break
            best_model = np.argmin(val_cost)
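            # best_model indexes the epoch with the lowest validation cost; its
            # test accuracy (normalized by the test-set size) is reported below.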

            #print train_acc
            #print train_cost
            #print val_cost
            #print test_acc
            #print 'Reached minimal validation cost at iteration %d'%best_model
            #print 'train_cost = %f'%train_cost[best_model]
            #print 'val_cost = %f'%val_cost[best_model]
            #print 'test_acc = %f'%test_acc[best_model]
            result[r][k] = test_acc[best_model]/X_test.shape[0]
            print "%d times, %d folder finished, test acc is %f"%(r,k,test_acc[best_model]/X_test.shape[0])
            #pdb.set_trace()
            with open(os.path.join( expDir, 'val_cost_r%d_k%d.csv'%(r,k) ),'w') as fid:
                writer = csv.writer(fid)
                writer.writerows([val_cost])
            with open(os.path.join( expDir, 'val_acc_r%d_k%d.csv'%(r,k) ),'w') as fid:
                writer = csv.writer(fid)
                writer.writerows([val_acc])
            with open(os.path.join( expDir, 'test_cost_r%d_k%d.csv'%(r,k) ),'w') as fid:
                writer = csv.writer(fid)
                writer.writerows([test_cost])
            with open(os.path.join( expDir, 'test_acc_r%d_k%d.csv'%(r,k) ),'w') as fid:
                writer = csv.writer(fid)
                writer.writerows([test_acc])
            k=k+1

    print np.mean(result[:])
    with open(os.path.join( expDir, 'result_%f_%f.csv'%(np.mean(result[:]),np.std(result[:])) ),'w') as fid:
        writer = csv.writer(fid)
        writer.writerows(result)
Beispiel #35
0
MODEL_PATH = 'best_model'
BASE_DATA_PATH = 'data'

if __name__ == '__main__':
    # Load in training files
    X_train = []
    Y_train = []
    for filename in glob.glob(os.path.join(BASE_DATA_PATH, 'train', '*.npz')):
        data = np.load(filename)
        # Convert to floatX with correct column order
        X_train.append(np.array(
            data['X'], dtype=theano.config.floatX, order='C'))
        Y_train.append(np.array(
            data['Y'], dtype=theano.config.floatX, order='C'))
    # Stack to compute training mean and std
    X_mean, X_std = utils.standardize(np.concatenate(X_train, axis=0))
    Y_mean, Y_std = utils.standardize(np.concatenate(Y_train, axis=0))
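    # In this project utils.standardize apparently returns the per-feature mean
    # and std of the stacked data rather than the standardized array itself; the
    # statistics can then be applied to the individual variable-length sequences.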
    # Compute max length as median of lengths
    max_length_X = int(np.median([len(X) for X in X_train]))
    max_length_Y = int(np.median([len(Y) for Y in Y_train]))
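    # Taking the median as the maximum length presumably bounds padding and
    # memory at the cost of truncating the longer half of the sequences.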

    # Retrieve the hyperparameters which achieved the lowest objective
    best_params, _ = train_best_network.get_best_trial(RESULTS_PATH)
    # Convert parameters to layer specifications
    (conv_layer_specs,
     dense_layer_specs) = train_network.layer_specs_from_params(best_params)
    # Build networks
    layers = {
        'X': utils.build_network(
            (None, None, X_train[0].shape[-1]), conv_layer_specs,
            dense_layer_specs),
# We need to separate PER SONG
X_train, y_train, X_test, y_test, song_id_tst = mix(X_, y_, PURCENT, NUM_FRAMES, song_id, nb_of_songs)
print X_train.shape, y_train.shape, X_test.shape, y_test.shape
# print X_train[0:3,0:3]
# print np.mean(X_train[:,0:3], axis=0), np.std(X_train[:,0:3], axis=0)
# print np.mean(X_test[:,0:3], axis=0), np.std(X_test[:,0:3], axis=0)

# with(open('train_dummy.txt', mode='w')) as infile:
#     for i in range(X_train.shape[0]):
#         s=''
#         for feat in range(3):
#             s = s + '%g '%X_train[i,feat]
#         infile.write('%s\n'%s)

# standardize data
X_train, scaler = standardize(X_train)
X_test, _ = standardize(X_test, scaler)
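# A minimal sketch of what a two-return standardize helper like this one might
# look like (assumption -- the real implementation is not shown here):
#
#   from sklearn.preprocessing import StandardScaler
#
#   def standardize(data, scaler=None):
#       if scaler is None:
#           scaler = StandardScaler().fit(data)   # fit on training data only
#       return scaler.transform(data), scaler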

# print np.mean(X_train[:,0:3], axis=0), np.std(X_train[:,0:3], axis=0)
# print np.mean(X_test[:,0:3], axis=0), np.std(X_test[:,0:3], axis=0)

# with(open('train_dummy_normed.txt', mode='w')) as infile:
#     for i in range(X_train.shape[0]):
#         s=''
#         for feat in range(3):
#             s = s + '%g '%X_train[i,feat]
#         infile.write('%s\n'%s)

# one dimension at a time
y_train = y_train[:, 0]
y_test = y_test[:, 0]
Beispiel #37
0
yptr = yp_tr.drop(yp_tr.columns.difference(['GPPp', 'ETp', 'SWp']), axis=1)
ypte = yp_te.drop(yp_te.columns.difference(['GPPp', 'ETp', 'SWp']), axis=1)

#yp = yptr.merge(ypte, how="outer")

#print(len(yptr), len(ypte))
#print(yptr, ypte)
#yp = pd.concat([yptr, ypte])
#print(yp)


n = [1,1]
x_tr, n = utils.add_history(yptr, n, 1)
x_te, n = utils.add_history(ypte, n, 1)
x_tr = utils.standardize(x_tr)
x_te = utils.standardize(x_te)
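# Note: train and test are standardized independently here, so the test set is
# scaled with its own statistics rather than those of the training set; whether
# that is intended depends on how utils.standardize is implemented.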

y = y.to_frame()
train_x = x_tr[~x_tr.index.year.isin([2007,2008])]
train_y = y[~y.index.year.isin([2007,2008])]
splits = len(train_x.index.year.unique())

test_x = x_te[x_te.index.year == 2008]
test_y = y[y.index.year == 2008][1:]

print(train_x, train_y)


#print(len(x), len(y))
splits = len(train_x.index.year.unique())
Beispiel #38
0
import scipy.io.wavfile as wavfile
import numpy as np
from numpy import inf
import utils
import matplotlib.pyplot as plt
import pdb

sr, wav = wavfile.read('example2.wav')
wav = np.mean(wav, axis=1)

cqt = utils.cqt(wav)
print(cqt.min())
std_cqt = utils.standardize(cqt)
log_std_cqt = utils.standardize(cqt + 1, log=True)
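# Adding 1 before the log-standardization presumably guards against log(0)
# producing -inf values (cf. the commented-out fix on the next line).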
#log_std_cqt[log_std_cqt == -inf] = 0
pdb.set_trace()
plt.pcolormesh(std_cqt, cmap='jet')
plt.show()
plt.pcolormesh(log_std_cqt, cmap='jet')
plt.show()
Beispiel #39
0
if concatenate:
    df = pd.concat([data_red, data_white])
else:
    if red:
        df = data_red
    else:
        df = data_white


# Filter Nans
df = ut.filter_nans(df)


# Normalize or standardize
df = ut.standardize(df)
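# This variant of standardize apparently returns the scaled DataFrame directly;
# note that the label column (renamed to 'labels' below) is still attached here.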

df_no_pca = df
label_name = df_no_pca.columns[-1]
df_no_pca = df_no_pca.rename(columns={label_name: 'labels'})  # Rename the last column to 'labels'

###################################################################################################################

df_rfe = ut.reorder_by_RFE(df_no_pca)
pd.options.display.mpl_style = 'default'
utils_plots.plot_principal_component_2D(df_rfe, display=True)
pd.options.display.mpl_style = 'default'
utils_plots.plot_principal_component_3D(df_rfe, display=True)

# with PCA
# Separate data from labels
Beispiel #40
0
    T = tetrahedra.shape[0]
    print 'Reading INRIA .mesh file',meshfile
    print '\tFound', V, 'vertices'
    print '\tFound', T, 'tetrahedra'

    bbox = np.empty((3,2))
    for i in xrange(3):
        bbox[i,0] = np.min(vertices[:,i])
        bbox[i,1] = np.max(vertices[:,i])
        
    kernel = stats.gaussian_kde(vertices.T,0.1)

    G = 40
    grids = [np.linspace(bbox[i,0],bbox[i,1],G) for i in xrange(3)]
    P = make_points(grids)
    P += 0.1*np.random.randn(*P.shape)

    Z = kernel(P.T)
    stdZ = standardize(Z)
    cmap = plt.get_cmap('spectral')
    C = cmap(stdZ)
    C[:,3] = (stdZ)**1.5
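    # Use the standardized density itself as the per-point alpha channel,
    # raised to 1.5 so that low-density points fade out more strongly.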

    mask = (C[:,3] > 0.025)
    
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(P[mask,0],P[mask,1],P[mask,2],c=C[mask,:],
               s=125,lw=0)
    plt.show()
def rnn_cv( folds, n_hidden=10, n_epochs=50, lr=0.001, lrd = 0.999, reg_coef= 0.01, doSmoothing=False, useEssentia=False):

    doSaveModel = False

    if doSmoothing:
        dir_name = 'nfeat%d_nh%d_ne%d_lr%g_reg%g_smoothed'%(nb_features, n_hidden, n_epochs, lr, reg_coef)
    else:
        dir_name = 'nfeat%d_nh%d_ne%d_lr%g_reg%g'%(nb_features, n_hidden, n_epochs, lr, reg_coef)
    MODELDIR = 'rnn/' + dir_name + '/'
    LOGDIR = MODELDIR

    if not path.exists(MODELDIR):
        makedirs(MODELDIR)

    print '... output dir: %s'%(MODELDIR)

    # smoothing params
    taille = 12
    wts = np.ones(taille-1)*1./taille
    wts = np.hstack((np.array([1./(2*taille)]), wts, np.array([1./(2*taille)])))
    delay = (wts.shape[0]-1) / 2
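    # Moving-average kernel of length taille+1 with half-weight endpoints
    # (weights sum to 1); delay is the half-window used to restore the borders
    # after the 'same'-mode convolution below.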

    # # initialize global logger variable
    # print '... initializing global logger variable'
    # logger = logging.getLogger(__name__)
    # withFile = False
    # logger = settings.init(MODELDIR + 'train.log', withFile)

    # perf_file_name = LOGDIR + 'rnn_nh%d_ne%d_lr%g_reg%g.log'%(n_hidden, n_epochs, lr, reg_coef)
    perf_file_name = LOGDIR + 'performance.log'
    log_f = open(perf_file_name, 'w')

    all_fold_pred = list()
    all_fold_y_test = list()
    all_fold_id_test = list()

    for fold_id in range(10):
        # fold_id = 0
        fold = folds[fold_id]
        t0 = time.time()

        # print '... loading FOLD %d'%fold_id
        # if useEssentia:
            # fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed_essentia.pkl'%(fold_id), "rb" ) )

        if useEssentia:
            X_train = fold['train']['X']
            y_train = fold['train']['y']
            id_train = fold['train']['song_id']

            X_test = fold['test']['X']
            y_test = fold['test']['y']
            id_test = fold['test']['song_id']

        else:
            fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed.pkl'%(fold_id), "rb" ) )
            X_train, y_train, id_train = load_X_from_fold_to_3dtensor(fold, 'train', NUM_OUTPUT)
            X_test, y_test, id_test = load_X_from_fold_to_3dtensor(fold, 'test', NUM_OUTPUT)


        print X_train.shape, y_train.shape, X_test.shape, y_test.shape

        if useMelodyFeatures:
            # first feature = slope, other two = mean, std
            melody_train, melody_test = subset_features(all_song_melody_features, id_train, id_test)
            # melody_train = melody_train[:,:,1:]
            # melody_test = melody_test[:,:,1:]

            # standardize train data
            melody_concat_train = np.reshape(melody_train, (melody_train.shape[0]*melody_train.shape[1], melody_train.shape[2]), order='C')
            melody_concat_train_normed, scaler = standardize(melody_concat_train)
            # print concat_train_normed.shape
            melody_train_normed = np.reshape(melody_concat_train_normed, (melody_train.shape[0], melody_train.shape[1], melody_train.shape[2]), order='C')
            del melody_concat_train, melody_concat_train_normed
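            # The (songs, frames, features) tensor is flattened to 2-D so the
            # scaler statistics are computed per feature over all frames, then
            # reshaped back to 3-D.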

            # standardize test data
            melody_concat_test = np.reshape(melody_test, (melody_test.shape[0]*melody_test.shape[1], melody_test.shape[2]), order='C')
            melody_concat_test_normed, _ = standardize(melody_concat_test, scaler)
            # print concat_test_normed.shape
            melody_test_normed = np.reshape(melody_concat_test_normed, (melody_test.shape[0], melody_test.shape[1], melody_test.shape[2]), order='C')
            del melody_concat_test, melody_concat_test_normed

            # concat with the other features
            X_train = np.concatenate((X_train, melody_train_normed), axis=2)
            X_test = np.concatenate((X_test, melody_test_normed), axis=2)

        if useTempoFeatures:
            tempo_train, tempo_test = subset_features(all_song_tempo_features, id_train, id_test)
            # standardize train data
            tempo_concat_train = np.reshape(tempo_train, (tempo_train.shape[0]*tempo_train.shape[1], tempo_train.shape[2]), order='C')
            tempo_concat_train_normed, scaler = standardize(tempo_concat_train)
            # print concat_train_normed.shape
            tempo_train_normed = np.reshape(tempo_concat_train_normed, (tempo_train.shape[0], tempo_train.shape[1], tempo_train.shape[2]), order='C')
            del tempo_concat_train, tempo_concat_train_normed

            # standardize test data
            tempo_concat_test = np.reshape(tempo_test, (tempo_test.shape[0]*tempo_test.shape[1], tempo_test.shape[2]), order='C')
            tempo_concat_test_normed, _ = standardize(tempo_concat_test, scaler)
            # print concat_test_normed.shape
            tempo_test_normed = np.reshape(tempo_concat_test_normed, (tempo_test.shape[0], tempo_test.shape[1], tempo_test.shape[2]), order='C')
            del tempo_concat_test, tempo_concat_test_normed

            # concat with the other features
            X_train = np.concatenate((X_train, tempo_train_normed), axis=2)
            X_test = np.concatenate((X_test, tempo_test_normed), axis=2)

        # print id_test.shape

        # X_train = X_train[0:100,:,:]
        # y_train = y_train[0:100,:,:]

        # X_train = X_train[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
        # X_test = X_test[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
        # X_train = X_train[:,[13,85,103,142,214]]
        # X_test = X_test[:,[13,85,103,142,214]]

        # X_test = X_train[119:119+y_test.shape[0],:]
        # y_test = y_train[119:119+y_test.shape[0]]


        print X_train.shape, y_train.shape, X_test.shape, y_test.shape
        nb_seq_train, nb_frames_train, nb_features_train = X_train.shape
        nb_seq_test, nb_frames_test, nb_features_test = X_test.shape

        assert nb_frames_train == nb_frames_test, 'ERROR: nb of frames differ from TRAIN to TEST'
        assert nb_features_train == nb_features_test, 'ERROR: nb of features differ from TRAIN to TEST'

        dim_output_train = y_train.shape[2]
        dim_output_test = y_test.shape[2]

        assert dim_output_test == dim_output_train, 'ERROR: nb of targets differ from TRAIN to TEST'


        n_in = nb_features_train
        n_out = dim_output_train
        n_steps = nb_frames_train

        validation_frequency = nb_seq_train * 2 # for logging during training: every 2 epochs

        model = rnn_model.MetaRNN(n_in=n_in, n_hidden=n_hidden, n_out=n_out,
                        learning_rate=lr, learning_rate_decay=lrd,
                        L1_reg=reg_coef, L2_reg=reg_coef,
                        n_epochs=n_epochs, activation='tanh')

        model.fit(X_train, y_train, validation_frequency=validation_frequency)

        if doSaveModel:
            # model_name = MODELDIR + 'rnn_fold%d_nh%d_nepochs%d_lr%g_reg%g.pkl'%(fold_id, n_hidden, n_epochs, lr, reg_coef)
            model_name = MODELDIR + 'model_fold%d.pkl'%(fold_id)
            model.save(fpath=model_name)

        pred = list()
        for ind_seq_test in xrange(nb_seq_test):
            pred.append(model.predict(X_test[ind_seq_test]))

        y_hat = np.array(pred, dtype=float)
        print y_hat.shape

        if doSmoothing:
            # smooooooth
            y_hat_smooth = np.zeros_like(y_hat, dtype=float)
            for i in xrange(y_hat.shape[0]):
                y_hat_smooth[i, :, 0] = np.convolve(y_hat[i, :, 0], wts, mode='same')
                y_hat_smooth[i, :delay, 0] = y_hat[i, :delay, 0]
                y_hat_smooth[i, -delay:, 0] = y_hat[i, -delay:, 0]
                y_hat_smooth[i, :, 1] = np.convolve(y_hat[i, :, 1], wts, mode='same')
                y_hat_smooth[i, :delay, 1] = y_hat[i, :delay, 1]
                y_hat_smooth[i, -delay:, 1] = y_hat[i, -delay:, 1]
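                # The first and last `delay` frames are copied from the raw
                # predictions to avoid edge artifacts from the convolution.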


        # save predictions on the test subset, before reshaping to 2-d arrays (I need 3d arrays)
        if doSmoothing:
            # fold_pred = [item for sublist in fold_pred for item in sublist]
            # fold_pred = np.array(fold_pred, dtype=float)
            pred_file = LOGDIR + 'fold%d_test_predictions.pkl'%(fold_id)
            pickle.dump( y_hat_smooth, open( pred_file, "wb" ) )
            print ' ... predictions y_hat_smooth saved in: %s'%(pred_file)
        else:
            # fold_pred = [item for sublist in fold_pred for item in sublist]
            # fold_pred = np.array(fold_pred, dtype=float)
            pred_file = LOGDIR + 'fold%d_test_predictions.pkl'%(fold_id)
            pickle.dump( y_hat, open( pred_file, "wb" ) )
            print ' ... predictions y_hat saved in: %s'%(pred_file)


        if doSmoothing:
            y_hat_smooth = np.reshape(y_hat_smooth, (y_hat_smooth.shape[0]*y_hat_smooth.shape[1], y_hat_smooth.shape[2]))
        y_hat = np.reshape(y_hat, (y_hat.shape[0]*y_hat.shape[1], y_hat.shape[2]))
        y_test_concat = np.reshape(y_test, (y_test.shape[0]*y_test.shape[1], y_test.shape[2]))

        print y_hat.shape, y_test_concat.shape

        assert y_hat.shape == y_test_concat.shape, 'ERROR: pred and ref shapes are different!'

        # concat hyp labels:
        if doSmoothing:
            all_fold_pred.append(y_hat_smooth.tolist())
        else:
            all_fold_pred.append(y_hat.tolist())

        # concat ref labels:
        all_fold_y_test.append(y_test_concat.tolist())

        if doSmoothing:
            RMSE, pcorr, error_per_song, mean_per_song = evaluate(y_test_concat, y_hat_smooth, id_test.shape[0])
        else:
            RMSE, pcorr, error_per_song, mean_per_song = evaluate(y_test_concat, y_hat, id_test.shape[0])

        s = (
                'fold: %d valence: %.4f %.4f arousal: %.4f %.4f\n'
              % (fold_id, RMSE[0], pcorr[0][0], RMSE[1], pcorr[1][0])
        )
        print s
        log_f.write(s)



        # predict on the train set and save predictions (useful to train rnn2)
        if doSmoothing:
            pred = list()
            for ind_seq_train in xrange(nb_seq_train):
                pred.append(model.predict(X_train[ind_seq_train]))

            train_y_hat = np.array(pred, dtype=float)
            print train_y_hat.shape

            train_y_hat_smooth = np.zeros_like(train_y_hat, dtype=float)
            for i in xrange(train_y_hat.shape[0]):
                train_y_hat_smooth[i, :, 0] = np.convolve(train_y_hat[i, :, 0], wts, mode='same')
                train_y_hat_smooth[i, :delay, 0] = train_y_hat[i, :delay, 0]
                train_y_hat_smooth[i, -delay:, 0] = train_y_hat[i, -delay:, 0]
                train_y_hat_smooth[i, :, 1] = np.convolve(train_y_hat[i, :, 1], wts, mode='same')
                train_y_hat_smooth[i, :delay, 1] = train_y_hat[i, :delay, 1]
                train_y_hat_smooth[i, -delay:, 1] = train_y_hat[i, -delay:, 1]

            # no reshape, I need 3d arrays
            # train_y_hat_smooth = np.reshape(train_y_hat_smooth, (train_y_hat_smooth.shape[0]*train_y_hat_smooth.shape[1], train_y_hat_smooth.shape[2]))

            pred_file = LOGDIR + 'fold%d_train_predictions.pkl'%(fold_id)
            pickle.dump( train_y_hat_smooth, open( pred_file, "wb" ) )
            print ' ... predictions y_hat_smooth saved in: %s'%(pred_file)
        else:
            pred = list()
            for ind_seq_train in xrange(nb_seq_train):
                pred.append(model.predict(X_train[ind_seq_train]))

            train_y_hat = np.array(pred, dtype=float)
            pred_file = LOGDIR + 'fold%d_train_predictions.pkl'%(fold_id)
            pickle.dump( train_y_hat, open( pred_file, "wb" ) )
            print ' ... predictions y_hat saved in: %s'%(pred_file)


        doPlot = False
        if doPlot:
            fig, ax = plt.subplots()
            x1 = np.linspace(1, y_test_concat.shape[0], y_test_concat.shape[0])
            if EMO == 'valence':
                ax.plot(x1, y_test_concat[:, 0], 'o', label="Data")
                # ax.plot(x1, y_hat[:,0], 'r-', label="OLS prediction")
                ax.plot(x1, y_hat[:,0], 'ro', label="OLS prediction")
            else:
                ax.plot(x1, y_test_concat[:, 1], 'o', label="Data")
                ax.plot(x1, y_hat[:,1], 'ro', label="OLS prediction")

            plt.title(EMO + ' on Test subset')
            ax.legend(loc="best")
            plt.show()
            # plt.savefig('figures/rnn_%s_fold%d.png'%(EMO, fold_id), format='png')


        doPlotTrain = False
        if doPlotTrain:
            # plt.close('all')
            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(X_train[0])
            ax1.set_title('input')

            ax2 = plt.subplot(212)
            true_targets = plt.plot(y_train[0])

            guess = model.predict(X_train[0])
            guessed_targets = plt.plot(guess, linestyle='--')
            for i, x in enumerate(guessed_targets):
                x.set_color(true_targets[i].get_color())
            ax2.set_title('solid: true output, dashed: model output')
            plt.show()

        doPlotTest = False
        if doPlotTest:
            # plt.close('all')
            fig = plt.figure()
            ax1 = plt.subplot(211)
            plt.plot(X_test[0])
            ax1.set_title('input')

            ax2 = plt.subplot(212)
            true_targets = plt.plot(y_test[0])

            # guess = model.predict(X_test[0])
            guess = y_hat[0]

            guessed_targets = plt.plot(guess, linestyle='--')
            for i, x in enumerate(guessed_targets):
                x.set_color(true_targets[i].get_color())
            ax2.set_title('solid: true output, dashed: model output')
            plt.show()

        print "... Elapsed time: %f" % (time.time() - t0)

    all_fold_pred = [item for sublist in all_fold_pred for item in sublist]
    all_fold_y_test = [item for sublist in all_fold_y_test for item in sublist]

    all_fold_pred = np.array(all_fold_pred, dtype=float)
    all_fold_y_test = np.array(all_fold_y_test, dtype=float)

    print all_fold_pred.shape, all_fold_y_test.shape

    # save predictions
    pred_file = LOGDIR + 'all_predictions.pkl'
    pickle.dump( all_fold_pred, open( pred_file, "wb" ) )
    print ' ... all predictions saved in: %s'%(pred_file)
    # ref_file = 'rnn/all_groundtruth.pkl'
    # pickle.dump( all_fold_y_test, open( ref_file, "wb" ) )

    # compute t-test p-values with baseline predictions
    baseline_prediction_file = 'rnn/all_baseline_predictions_260feat.pkl'
    baseline_preds = pickle.load(open( baseline_prediction_file, 'r' ))

    pvalue_val = stats.ttest_ind(baseline_preds[:,0], all_fold_pred[:,0])[1]
    pvalue_ar = stats.ttest_ind(baseline_preds[:,1], all_fold_pred[:,1])[1]
    pvalues = (pvalue_val, pvalue_ar)
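    # Two-sample t-tests comparing the baseline and RNN prediction
    # distributions for valence and arousal, respectively.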
    RMSE, pcorr, error_per_song, mean_per_song = evaluate(all_fold_y_test, all_fold_pred, 0)

    # print(
    #         'sklearn --> valence: %.4f, arousal: %.4f\n'
    #         'Pearson Corr --> valence: %.4f, arousal: %.4f \n'
    #         # % (RMSE[0], -1. , pcorr[0][0], -1)
    #       % (RMSE[0],RMSE[1],pcorr[0][0], pcorr[1][0])
    # )

    s = (
            'allfolds valence: %.4f %.4f arousal: %.4f %.4f p-values: %.4f, %.4f\n'
          % (RMSE[0], pcorr[0][0], RMSE[1], pcorr[1][0], pvalue_val, pvalue_ar)
    )

    print s
    log_f.write(s)
    log_f.close()
    return RMSE, pcorr, pvalues
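    # NOTE: the code below is unreachable (it follows the return above); it
    # repeats the single-fold loading and feature-standardization steps.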
    fold_id = 0
    t0 = time.time()

    print '... loading FOLD %d'%fold_id
    fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed.pkl'%(fold_id), "rb" ) )

    X_train, y_train, id_train = load_X_from_fold_to_3dtensor(fold, 'train', NUM_OUTPUT)
    X_test, y_test, id_test = load_X_from_fold_to_3dtensor(fold, 'test', NUM_OUTPUT)

    if useMelodyFeatures:
        # first feature = slope, other two = mean, std
        melody_train, melody_test = subset_features(all_song_melody_features, id_train, id_test)

        # standardize train data
        melody_concat_train = np.reshape(melody_train, (melody_train.shape[0]*melody_train.shape[1], melody_train.shape[2]), order='C')
        melody_concat_train_normed, scaler = standardize(melody_concat_train)
        # print concat_train_normed.shape
        melody_train_normed = np.reshape(melody_concat_train_normed, (melody_train.shape[0], melody_train.shape[1], melody_train.shape[2]), order='C')
        del melody_concat_train, melody_concat_train_normed

        # standardize test data
        melody_concat_test = np.reshape(melody_test, (melody_test.shape[0]*melody_test.shape[1], melody_test.shape[2]), order='C')
        melody_concat_test_normed, _ = standardize(melody_concat_test, scaler)
        # print concat_test_normed.shape
        melody_test_normed = np.reshape(melody_concat_test_normed, (melody_test.shape[0], melody_test.shape[1], melody_test.shape[2]), order='C')
        del melody_concat_test, melody_concat_test_normed

        # concat with the other features
        X_train = np.concatenate((X_train, melody_train_normed), axis=2)
        X_test = np.concatenate((X_test, melody_test_normed), axis=2)
Beispiel #43
0
def plot_mesh(F,vertices,edges,triangles,tetrahedra,**kwargs):
    no_function = (F is None)
    if not no_function:      
        std_F = standardize(F) # Between 0 and 1
        print 'function size',F.shape
        print 'vertices',vertices.shape
        assert F.size == vertices.shape[0]
    else:
        F = 'k'
    V = vertices.shape[0]

    cmap = plt.get_cmap(kwargs.get('cmap','jet'))
    no_nodes = kwargs.get('no_nodes',False)
    no_mesh = kwargs.get('no_mesh',False)
    alpha_fn = kwargs.get('alpha_fn',lambda x : 0.1)
    
    # Plot points
    fig = plt.gcf()
    ax = plt.gca()
    p = ax.scatter(vertices[:,0],
                   vertices[:,1],
                   vertices[:,2],
                   s=25,
                   c = F,
                   alpha=0.25,
                   lw=0,
                   cmap=cmap)
    if not no_function:
        fig.colorbar(p)

    # Build line collection
    if not no_mesh:
        segs = []
        seg_set = set()
        obj_groups = [np.array(x,dtype=np.integer)\
                      for x in [edges,triangles,tetrahedra]]
        for objs in obj_groups:
            if 0 == objs.size:
                continue
            (N,D) = objs.shape
            for i in xrange(N):
                for verts in itertools.combinations(objs[i,:],2):
                    verts = [int(v) - 1 for v in verts]
                    for v in verts:
                        assert 0 <= v < V
                    key = tuple(verts)
                    if key in seg_set:
                        continue
                    seg_set.add(key)
                    segs.append([vertices[x,:] for x in verts])
        S = len(segs)
        linecolors = [0.5,0.5,0.5,0.1] # Translucent mid-gray
        print 'Plotting {0} line segments'.format(S)
        seg_collection = Line3DCollection(segs,colors=linecolors)
        ax.add_collection3d(seg_collection)

    # Build a poly collection of faces
    # This makes for a "stained glass" look
    if not no_function:
        poly = []
        poly_set = set()
        obj_groups = [x.astype(np.integer) for x in [triangles,tetrahedra]]
        facecolors = []
        for (I,objs) in enumerate(obj_groups):
            if objs is None or no_function:
                continue
            (N,D) = objs.shape
            for i in xrange(N):
                for verts in itertools.combinations(objs[i,:],3):
                    verts = [int(v) - 1 for v in verts]
                    for v in verts:
                        assert 0 <= v < V
                    key = tuple(verts)
                    if key in poly_set:
                        continue
                    poly_set.add(key)

                    if np.any(np.isnan(std_F[verts])):
                        continue
                    
                    mean_F = np.mean(std_F[verts])
                    alpha = alpha_fn(mean_F)
                    
                    if alpha < 0.025:
                        # Skip faces that would be rendered
                        # essentially transparent
                        continue
                    triangle = [vertices[x,:] for x in verts]
                    poly.append(triangle)
                    # Color with the mean vertex color
                    color = list(cmap(mean_F))
                    color[3] = alpha
                    facecolors.append(color)
        P = len(poly)
        print 'Plotting {0} triangles'.format(P)
        edgecolors = np.zeros((P,4))
        poly_collection = Poly3DCollection(poly,
                                           facecolors=facecolors,
                                           edgecolors=edgecolors)
        ax.add_collection3d(poly_collection)
Beispiel #44
0
            return self.get_batch()


## data formating
# directory = 'data/mat/fox_100x100_matlab.mat'
directory = sys.argv[1]  #'data/mat/fox_100x100_matlab.mat'
D = io.loadmat(directory)
features0 = D["features"].todense()
## remove identical features
uniid = []
for i in range(features0.shape[1]):
    if len(np.unique(np.array(features0[:, i]))) == 1:
        uniid.append(i)
features = np.delete(features0, uniid, axis=1)
## standardize all data (maybe flawed)
all_mean, all_std = utils.standardize(features)
features = (features - all_mean) / all_std
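# Fitting the scaler on all instances (train and test together) leaks test
# statistics into training, which is presumably why the comment above says
# "maybe flawed".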

from sklearn.decomposition import PCA

pca = PCA()
features = pca.fit_transform(features)
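# PCA() without n_components keeps every component, so this step decorrelates
# the features (a rotation) rather than reducing their dimensionality.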


# features = features0
# pdb.set_trace()
labels = np.array(D["labels"].todense())[0]
bag_ids = D["bag_ids"][0]

MAX_LENGTH = max([list(bag_ids).count(iBag) for iBag in set(bag_ids)])
N_FEATURE_DIM = features.shape[1]
Beispiel #45
0
def plot_mesh_slice(f,bound,meshfile,**kwargs):
    G = kwargs.get('grid_points',64)
    flat = kwargs.get('flat',True)
    
    assert((3,2) == bound.shape)
    idx = np.where(bound[:,0] == bound[:,1])[0]
    nidx = np.where(bound[:,0] != bound[:,1])[0]
    if 2 != nidx.size:
        print "Check slice bounds, need exactly 2 non-trivial dimensions"
    assert 1 == idx.size

    bound = np.hstack([bound,G*np.ones((3,1))])
    bound[idx,2] = 1
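    # The fixed dimension gets a single grid point, so the interpolation points
    # form a 2-D G x G slice through the mesh.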
    
    grids = [np.linspace(*list(bound[i,:])) for i in xrange(3)]
    (points,meshes) = make_points(grids,True)

    timestamp = str(time.time())
    point_file = "/tmp/points." + timestamp
    value_file = "/tmp/value." + timestamp
    out_file = "/tmp/out." + timestamp
    arch = Archiver(points=points)
    arch.write(point_file)
    arch.clear()
    arch.add(values=f)
    arch.write(value_file)

    (base,ext) = os.path.splitext(meshfile)
    assert '.mesh' == ext
    
    
    cmd = ['cdiscrete/tet_interp',
           '--mesh',base + '.ctri',
           '--points',point_file,
           '--values',value_file,
           '--out',out_file]
    cmd = ' '.join(cmd)
    print cmd
    try:
        subprocess.check_call(cmd,shell=True)
    except Exception:
        print "Interpolation failed; check .ctri file?"
        quit()
    unarch = Unarchiver(out_file)
    F = np.reshape(unarch.interp,(G,G))
    Fm = np.ma.masked_where(np.isnan(F),F)

    if flat:
        plt.gcf()
        [X,Y] = [meshes[i].squeeze() for i in nidx]
        plt.pcolormesh(X,Y,Fm)
    else:
        Fm = standardize(Fm)
        [X,Y,Z] = [mesh.squeeze() for mesh in meshes]
        fig = plt.gcf()
        ax = fig.gca(projection='3d')
        cmap = plt.get_cmap('jet')
        colors = cmap(Fm)
        colors[...,3]= 0.25*(1-Fm)**1.5
        p = ax.plot_surface(X,Y,Z,
                            rstride=1,cstride=1,
                            facecolors=colors,
                            shade=False)