Example #1
def do_RandomizedPCA(armadillo):
    #
    # TODO: Write code to import the libraries required for
    # RandomizedPCA. Then, train your RandomizedPCA on the armadillo
    # dataframe. Finally, drop one dimension (reduce it down to 2D)
    # and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: Be sure to RETURN your projected armadillo!
    # (This projection is actually stored in a NumPy NDArray and
    # not a Pandas dataframe, which is something Pandas does for
    # you automatically. =)
    #
    # NOTE: SKLearn deprecated the RandomizedPCA method, but still
    # has instructions on how to use the randomized (truncated) method
    # as the SVD solver. To find out how to use it, check out the
    # full docs here:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    #
    # .. your code here ..
    from sklearn.decomposition import RandomizedPCA
    rpca = RandomizedPCA(n_components=2)
    rpca.fit(armadillo)
    return rpca.transform(armadillo)
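The NOTE above points out that scikit-learn deprecated (and later removed) RandomizedPCA. A minimal sketch of the equivalent call with the current API, assuming the same armadillo dataframe; PCA with svd_solver='randomized' uses the same truncated, randomized SVD:

def do_RandomizedPCA_modern(armadillo):
    # Equivalent using the current scikit-learn API: PCA with the randomized SVD solver.
    from sklearn.decomposition import PCA
    pca = PCA(n_components=2, svd_solver='randomized')
    pca.fit(armadillo)
    return pca.transform(armadillo)  # NumPy ndarray of shape (n_samples, 2)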
def pca2(Xtrain, Xtest):
    newTrain = []
    pca = RandomizedPCA(n_components=len(Xtrain[0]) - 30)
    pca.fit(Xtrain)
    newTrain = pca.transform(Xtrain)
    newTest = pca.transform(Xtest)
    return newTrain, newTest
def pca_knn(train, test):
    y = []
    Xtrain, ytrain, Xtest, ytest = loadData(train, test)

    #PCA, fit and transform
    pca = RandomizedPCA(n_components=200)
    pca.fit(Xtrain)
    Xtrain = pca.transform(Xtrain)
    new_Xtest = pca.transform(Xtest)

    #Make classifier
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(Xtrain, ytrain)
    y = clf.predict(new_Xtest)

    #y1 = clf.predict(Xtrain)
    #terror = test_error(ytrain, y1)
    #print "training error for KNN, k=3:"
    #print terror

    error = test_error(ytest, y)
    print "test error for KNN, k=3:"
    print error
    print "\\\\\\\\\\\\\\\\"

    return y
Example #4
def compute_pca(reception_stats, n_components=5):
    reception_mean = reception_stats.mean(axis=0)
    pca = RandomizedPCA(n_components - 1)
    pca.fit(reception_stats)
    pca_components = np.vstack([reception_mean, pca.components_])

    return pca, pca_components
def train_data():
    n_components = 256    
    pca = RandomizedPCA(n_components=n_components, whiten=True)
    clf=svm.SVC(kernel='rbf',C=5., gamma=0.001)
    
    train_directory = 'dataset/real_train'
    
    
    images, labels = prepare_dataset(train_directory)
    
    training_data=[]
    
    for i in range(len(images)):
        training_data.append(images[i].flatten())
    
    print("% shape of traing data => ",np.array(training_data).shape)
    
    print('labels =>',np.array(labels).shape)
   
    
    pca.fit(np.array(training_data))
    transformed = pca.transform(np.array(training_data))
    
    filename = 'models/pca_model.sav'
    pickle.dump(pca, open(filename, 'wb'))
    
    print("% shape of transformed data => ",transformed.shape)
    
    clf.fit(transformed,np.array(labels))
    
    filename = 'models/svm_model.sav'
    pickle.dump(clf, open(filename, 'wb'))
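The training routine above pickles both the PCA and the SVM. A hedged sketch of the matching inference step, assuming a new image new_image with the same dimensions as the training images (the variable and function names here are hypothetical, not from the original project):

def predict_image(new_image):
    # Hypothetical counterpart: reload the models saved above and classify one image.
    import pickle
    pca = pickle.load(open('models/pca_model.sav', 'rb'))
    clf = pickle.load(open('models/svm_model.sav', 'rb'))
    features = pca.transform([new_image.flatten()])
    return clf.predict(features)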
Example #6
def open_img():
    x = filedialog.askopenfilenames(
        parent=root,
        initialdir='/',
        initialfile='tmp',
        filetypes=[
            ("All files", "*")])
    
    img = Image.open(x[0])
    img = img.resize((250, 250), Image.ANTIALIAS)
    img = ImageTk.PhotoImage(img)
    panel = tk.Label(root, image=img)
    panel.image = img
    panel.grid(row=70, column=1)
            
    image = cv2.imread(x[0])
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imwrite("grey.jpeg", gray)
    gray.shape
    img = mpimg.imread("grey.jpeg")

    f=compo()
    ipca = RandomizedPCA(f)
    ipca.fit(img)
    img_c = ipca.transform(img)
    print(img_c.shape)
    temp = ipca.inverse_transform(img_c)
    
    print(temp.shape)
    cv2.imwrite("pca1.jpg", temp)
    print(np.sum(ipca.explained_variance_ratio_))
    plt.plot(np.cumsum(ipca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance');
    plt.savefig("graph.jpg")
Example #7
def main():
    X_train, X_test, y_train, y_test, y_encoder = get_binary_encoded_xy_split(5000)
    # intended to reduce the 1000 x 1024 dimensions to 11 (the number of X columns before
    # label binarization in the table); note that n_components is left at its default below
    X_train_randPCA = RandomizedPCA()
    X_train_randPCA.fit(X_train)
    print("pca fit")

    X_train_reduced = X_train_randPCA.transform(X_train)
    X_test_reduced = X_train_randPCA.transform(X_test)

    print("Reduced components")
    print("Begin classifier")
    clf = GradientBoostingClassifier(n_estimators=200, max_depth=4, learning_rate=0.1, random_state=1)
    print(y_train.shape, y_test.shape)
    print(y_encoder.classes_)
    print(y_encoder.transform(["Accident"]))
    print(np.where(y_encoder.classes_ == "Accident"))
    clf.fit(X_train_reduced, y_train[:, np.where(y_encoder.classes_=="Accident")[0]])
    print("Fitted")
    print("_" * 80)
    feature_vals = y_encoder.transform(y_encoder.classes_)
    feature_labels = y_encoder.classes_
    print(feature_vals)
    print(feature_labels)
    fig, axs = plot_partial_dependence(clf, X_train,[0,1], n_jobs=4, grid_resolution=100)
    plt.show()
Example #8
    def pca(self, y):

        # select a random subset of Y dimensions (possibly gives robustness as well as speed)
        rand_dims = np.sort(
            np.random.choice(y.shape[1],
                             np.minimum(self.tree_params['num_dims_for_pca'],
                                        y.shape[1]),
                             replace=False))
        y_dim_subset = y.take(rand_dims, 1)

        pca = RandomizedPCA(n_components=1)  # keep only the leading principal component

        # optional: select a subset of exs (not so important if PCA is fast)
        if self.tree_params['sub_sample_exs_pca']:
            rand_exs = np.sort(
                np.random.choice(y.shape[0],
                                 np.minimum(
                                     self.tree_params['num_exs_for_pca'],
                                     y.shape[0]),
                                 replace=False))
            pca.fit(y_dim_subset.take(rand_exs, 0))
            return pca.transform(y_dim_subset)

        else:
            # perform PCA
            return pca.fit_transform(y_dim_subset)
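The method above reads several keys from self.tree_params that are not shown in the snippet. A hypothetical configuration illustrating those keys (names taken from the code, values invented purely for illustration):

tree_params = {
    'num_dims_for_pca': 8,        # cap on the number of Y dimensions sampled before PCA
    'sub_sample_exs_pca': True,   # whether to fit the PCA on a subset of examples
    'num_exs_for_pca': 1000,      # cap on the number of examples when subsampling
}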
Example #9
def rpca(train_X, test_X, n):
	start_time = time.time()
	pca = RandomizedPCA(n_components=n)
	pca.fit(train_X.toarray())
	train_X_pca = pca.transform(train_X.toarray())
	test_X_pca = pca.transform(test_X.toarray())
	print("--- %s seconds ---" % (time.time() - start_time))
	return pca, train_X_pca, test_X_pca
Example #10
    def compute_PCA(n_components=5):
        spec_mean = spectra.mean(axis=0)
        print spec_mean.shape

        #Randomized PCA is faster (according to astroML):
        pca = RandomizedPCA(n_components - 1)
        pca.fit(spectra)
        pca_components = np.vstack([spec_mean, pca.components_])

        return pca_components
Example #11
def RPCA(model_data, components=None, transform_data=None):
    t0 = time()
    rpca = RandomizedPCA(n_components=components)
    if transform_data is None:
        projection = rpca.fit_transform(model_data)
    else:
        rpca.fit(model_data)
        projection = rpca.transform(transform_data)
    print "Randomized PCA Explained Variance: ", rpca.explained_variance_ratio_
    print "Randomized PCA Time: %0.3f" % (time() - t0)
    return projection
Example #12
def pca_knn(train, test):
    y = []
    Xtrain, ytrain, Xtest, ytest = load_data(train, test)
    dim_red = RandomizedPCA(n_components=43)
    dim_red.fit(Xtrain)
    Rtrain = dim_red.transform(Xtrain)
    Rtest = dim_red.transform(Xtest)
    clf = KNeighborsClassifier(n_neighbors=knn_para[2], weights='distance')
    clf.fit(X=Rtrain, y=ytrain)
    y = clf.predict(X=Rtest)
    #print(1 - clf.score(X=Rtest, y=ytest))
    return y
Example #13
def pca_test(img_kind):
	import pylab as pl
	from mpl_toolkits.mplot3d import Axes3D

	subdir = "data/"

	classes = []
	data = []

	the_ones = glob.glob(subdir + "f_" + img_kind + "*.jpg")
	all_of_them = glob.glob(subdir + "f_*_*.jpg")
	the_others = []

	for x in all_of_them:
		if the_ones.count(x) < 1:
			the_others.append(x)
	
	for x in the_ones:
		classes.append(1)
		data.append(get_image_features(cv.LoadImageM(x)))
	
	for x in the_others:
		classes.append(-1)
		data.append(get_image_features(cv.LoadImageM(x)))
	
	pca = PCA(46, whiten=True)
	print 'fitting'
	pca.fit(data)
	print 'transforming'
	X_r = pca.transform(data)
	print '----'

	print X_r.shape

	x0 = [x[0] for x in X_r]
	x1 = [x[1] for x in X_r]

	pl.figure()

	for i in xrange(0,len(x0)):
		if classes[i] == 1:
			pl.scatter(x0[i], x1[i], c = 'r')
		else:
			pl.scatter(x0[i], x1[i], c = 'b')
	

	
	# for c, i, target_name in zip("rg", [1, -1], target_names):
	#     pl.scatter(X_r[classes == i, 0], X_r[classes == i, 1], c=c, label=target_name)
	pl.legend()
	pl.title('PCA of dataset')

	pl.show()
Example #14
def pca_svm(train, test):
    y = []
    Xtrain, ytrain, Xtest, ytest = load_data(train, test)
    dim_red = RandomizedPCA(n_components=50)
    dim_red.fit(Xtrain)
    Rtrain = dim_red.transform(Xtrain)
    Rtest = dim_red.transform(Xtest)
    clf = SVC(kernel='poly', C=1, gamma=0.02)
    clf.fit(X=Rtrain, y=ytrain)
    y = clf.predict(X=Rtest)
    #print(1 - clf.score(X=Rtest, y=ytest));
    return y
Example #15
    def bootstrap_pc(seed):
        np.random.seed(seed)

        b = np.copy(zscored)
        nrows, ncols = b.shape
        for i in range(ncols):
            b[:, i] = b[:, i][np.random.permutation(nrows)]

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            pca = RandomizedPCA(n_components=1)
        pca.fit(b)
        return pca.explained_variance_[0]
Example #16
def transform_PCA(k, train_X, test_X):
    pca = RandomizedPCA(n_components=k)
    pca.fit(train_X)

    # Transform test data with principal components:
    X_reduced = pca.transform(test_X)

    # Reconstruct:
    X_rec = np.dot(X_reduced, pca.components_)

    # Restore mean:
    X_rec += pca.mean_
    return X_rec
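The manual reconstruction above (project, multiply by the components, add the mean back) is what scikit-learn's own inverse_transform does for a non-whitened PCA; a minimal sketch of the same step using that method, under the same assumptions about train_X and test_X:

def transform_PCA_builtin(k, train_X, test_X):
    pca = RandomizedPCA(n_components=k)
    pca.fit(train_X)
    # inverse_transform(Z) computes Z @ components_ + mean_ when whitening is off
    return pca.inverse_transform(pca.transform(test_X))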
Example #17
def pca_test(X):  # returns the best n: the smallest number of components whose cumulative explained variance (the evaluation criterion from Ng's lectures) reaches 99%
    pca = RandomizedPCA()
    pca.fit(X)
    n_components = X.shape[1]
    for n in range(10, X.shape[1], 5):
        s = sum(pca.explained_variance_ratio_[:n])
        if (s >= 0.99):
            n_components = n
            print n
            #print "%d is best for pca" %n_components
            break
    #pca.set_params(n_components=n_components)
    return n_components
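The loop above scans explained_variance_ratio_ in steps of 5; the same 99% cutoff can be located in one step with a cumulative sum. A minimal sketch, assuming the same X passed to pca_test:

import numpy as np

def pca_n_for_variance(X, threshold=0.99):
    pca = RandomizedPCA()
    pca.fit(X)
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    # first component count whose cumulative explained variance reaches the threshold
    n = int(np.searchsorted(cum_var, threshold)) + 1
    return min(n, len(cum_var))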
def pca_knn(train, test):
    fid = open(train)
    tid = open(test)
    for line in fid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        train_label.append(m[0])
        train_data.append(m[1:])

    for line in tid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        test_real_label.append(m[0])
        test_data.append(m[1:])


    pca = RandomizedPCA(n_components=5)
    pca.fit(train_data)

    train_data_5 = pca.transform(train_data)
    test_data_5 = pca.transform(test_data)
    count = 0

    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(train_data_5, train_label)
    y1 = neigh.predict(test_data_5)

    for i in range(2007):
        if int(float(y1[i])) == test_real_label[i]:
            count += 1
    acc1 = count * 1.0 / 2007

    pca = RandomizedPCA(n_components=20)
    pca.fit(train_data)

    train_data_20 = pca.transform(train_data)
    test_data_20 = pca.transform(test_data)
    count = 0

    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(train_data_20, train_label)
    y2 = neigh.predict(test_data_20)

    for i in range(2007):
        if int(float(y2[i])) == test_real_label[i]:
            count += 1
    acc2 = count * 1.0 / 2007

    y = y2
    #acc1 = 0.7777      acc2 = 0.9337
    return y
Example #19
def run_stuff ():
    dataset = refactor_labels(get_data("C:\\Users\\user\\PycharmProjects\\AnxietyClassifier(2)\\Alls_data_NO_specific_vars_corr.xlsx", "Sheet1"),"group")
    dataset = imputing_avarage(dataset)
    features_df = dataset.drop(['Age','group','PHQ9','Subject_Number'],1)
    X = features_df.values
    X = StandardScaler().fit_transform(X)

    #X = array[:,3:116]
    pca = RandomizedPCA(50)
    pca.fit(X)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance');
    plt.show()
Example #20
 def _pca(hosts, dim=2):
     """
     Principal component analysis
     Reduce the numpy-matrix hosts to a dim-dimensional vector space
     """
     pca = RandomizedPCA(n_components=dim)
     pca.fit(hosts)
     # Return most discriminating values by axis.
     pca_indexes = []
     for ind in xrange(dim):
         vect = pca.components_[ind]
         pca_indexes.append([(idx, vect[idx])
                             for idx in (-abs(vect)).argsort()])
     return (pca, pca_indexes)
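A short usage sketch for _pca, assuming hosts is an (n_hosts x n_features) NumPy matrix as the docstring describes:

pca, pca_indexes = _pca(hosts, dim=2)
# pca_indexes[0] holds (feature_index, loading) pairs for the first axis,
# ordered from most to least discriminating by absolute loading.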
Example #21
def do_RandomizedPCA(armadillo):

    # For importing the libraries required for RandomizedPCA.
    from sklearn.decomposition import RandomizedPCA

    # Training the RandomizedPCA on the armadillo dataframe, then
    # dropping one dimension and projecting the armadillo
    # down to the 2D principal component feature space.
    rpca = RandomizedPCA(n_components=2)
    rpca.fit(armadillo)

    RRarmadillo = rpca.transform(armadillo)

    return RRarmadillo
def pca_svm(train, test):
    fid = open(train)
    tid = open(test)
    for line in fid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        train_label.append(m[0])
        train_data.append(m[1:])

    for line in tid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        test_real_label.append(m[0])
        test_data.append(m[1:])

    pca = RandomizedPCA(n_components=5)
    pca.fit(train_data)

    train_data_5 = pca.transform(train_data)
    test_data_5 = pca.transform(test_data)
    trained_model = SVC(C=100, kernel='rbf', degree=3,
                        gamma=0.01)
    trained_model.fit(train_data_5, train_label)
    count = 0
    y1 = trained_model.predict(test_data_5)
    for i in range(2007):
        if int(float(y1[i])) == test_real_label[i]:
            count += 1
    acc1 = count * 1.0 / 2007

    pca = RandomizedPCA(n_components=20)
    pca.fit(train_data)

    train_data_20 = pca.transform(train_data)
    test_data_20 = pca.transform(test_data)
    trained_model = SVC(C=100, kernel='rbf', degree=3,
                        gamma=0.01)
    trained_model.fit(train_data_20, train_label)
    count = 0
    y2 = trained_model.predict(test_data_20)
    for i in range(2007):
        if int(float(y2[i])) == test_real_label[i]:
            count += 1
    acc2 = count * 1.0 / 2007

    y = y2
    #acc1 = 0.7997      acc2 = 0.9417
    return y
Example #23
def train(directory):
    images, labels = prepare_dataset(directory)
    n_components = 10
    pca = RandomizedPCA(n_components=n_components, whiten=True)

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
    }
    clf = GridSearchCV(
        SVC(kernel='rbf', class_weight='auto', probability=True), param_grid)

    testing_data = []
    for i in range(len(images)):
        print images[i].flatten().shape
        testing_data.append(images[i].flatten())

    pca = pca.fit(testing_data)

    transformed = pca.transform(testing_data)
    clf.fit(transformed, labels)
    scores = cross_val_score(clf, transformed, labels, cv=5)
    print("Mean cross-validation accuracy")
    print(sum(scores) / 5)
    joblib.dump(clf, "svm.pkl")
    joblib.dump(pca, "pca.pkl")
Example #24
def do_RandomizedPCA(armadillo):
    #
    # TODO: Write code to import the libraries required for RandomizedPCA. Then, train your RandomizedPCA on the armadillo
    # dataframe. Finally, drop one dimension (reduce it down to 2D) and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: Be sure to RETURN your projected armadillo!
    # (This projection is actually stored in a NumPy NDArray and not a Pandas dataframe, which is something Pandas does for
    # you automatically. =)
    #
    # .. your code here ..
    from sklearn.decomposition import RandomizedPCA
    rpca = RandomizedPCA(n_components=2)
    rpca.fit(armadillo)
    R = rpca.transform(armadillo)
    return R
Example #25
def gap_statistic(x, random_datasets=64):
    """
    Returns the gap statistic of the data set. Keeps increasing the number of clusters
    until the gap statistic falls below a configured fraction of the best value seen so far,
    or until no improvement is seen for a configured number of additional clusters.
    http://blog.echen.me/2011/03/19/counting-clusters/
    """
    assert isinstance(x, np.ndarray)
    assert len(x.shape) == 2

    if x.shape[1] > SETTINGS.GAP_STATISTIC.RANDOMIZED_PCA_THRESHOLD:
        pca = RandomizedPCA(SETTINGS.GAP_STATISTIC.RANDOMIZED_PCA_THRESHOLD)
    else:
        pca = PCA()

    pca.fit(x)
    transformed = pca.transform(x)

    reference_datasets = [
        pca.inverse_transform(generate_random_dataset(transformed))
        for _ in range(random_datasets)
    ]

    max_gap_statistic = -1
    best_num_clusters = 1

    for num_clusters in range(1, x.shape[0] + 1):
        kmeans = MiniBatchKMeans(num_clusters)
        kmeans.fit(x)

        trained_dispersion = dispersion(kmeans, x)

        random_dispersions = [
            dispersion(kmeans, data) for data in reference_datasets
        ]

        gap_statistic = np.log(sum(random_dispersions) /
                               random_datasets) - np.log(trained_dispersion)

        if gap_statistic > max_gap_statistic:
            max_gap_statistic = gap_statistic
            best_num_clusters = num_clusters

        if gap_statistic < max_gap_statistic * SETTINGS.GAP_STATISTIC.MAXIMUM_DECLINE:
            break
        if num_clusters > best_num_clusters + SETTINGS.GAP_STATISTIC.NUM_CLUSTERS_WITHOUT_IMPROVEMENT:
            break

    return best_num_clusters
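gap_statistic relies on two helpers that are not shown here, generate_random_dataset and dispersion. A hedged sketch of what the dispersion helper could look like, assuming it computes the within-cluster dispersion W_k described in the linked write-up (this is an assumption, not the original implementation):

def dispersion(kmeans, x):
    # Hypothetical helper: sum of squared distances from each point to its assigned centroid.
    labels = kmeans.predict(x)
    centers = kmeans.cluster_centers_
    return float(np.sum((x - centers[labels]) ** 2))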
Example #26
def perform_weighted_PCA(data, weights, max_components=200):
    """
    Performs Weighted PCA on the data

    Parameters
    ----------
    data : (Num_Features x Num_Samples) numpy.ndarray (or subclass)
        Matrix containing data to project onto its principal components

    weights : (Num_Features x Num_Samples) numpy.ndarray (or subclass)
        Matrix containing weights to use for each coordinate in data

    max_components: int
        Maximum number of components to calculate

    Returns
    -------
    wpca_data : (Num_Components x Num_Samples) numpy.ndarray
        Data transformed using weighted PCA

    e_val : (Num_Components,) numpy.ndarray
        Per-component variance as a fraction of the total variance

    e_vec : (Num_Features x Num_Components) numpy.ndarray
        Principal component vectors (one per column)

    """
    np.random.seed(RANDOM_SEED)

    proj_data = data

    #Weighted means
    wmean = np.sum(proj_data * weights, axis=1) / np.sum(weights, axis=1)
    wmean = wmean.reshape((wmean.size, 1))

    data_centered = proj_data - wmean
    weighted_data_centered = data_centered * weights

    wcov = np.dot(weighted_data_centered, weighted_data_centered.T) / np.dot(
        weights, weights.T)
    wcov[np.isnan(wcov)] = 0.0
    # Need this when weight dot product is zero
    model = RandomizedPCA(n_components=min(proj_data.shape[0],
                                           proj_data.shape[1], max_components))
    model.fit(wcov)
    e_vec = model.components_

    wpca_data = np.dot(e_vec, data_centered)
    e_val = np.var(wpca_data, axis=1)
    total_var = np.sum(np.var(proj_data, axis=1))
    e_val /= total_var

    return wpca_data, e_val, e_vec.T
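A brief usage sketch for perform_weighted_PCA, assuming feature-by-sample arrays as described in the docstring and the module-level RANDOM_SEED and imports from the original file (the data here is random and purely illustrative):

import numpy as np

data = np.random.randn(50, 200)        # 50 features x 200 samples
weights = np.ones_like(data)           # uniform weights as a simple smoke test
wpca_data, e_val, e_vec = perform_weighted_PCA(data, weights, max_components=10)
print(wpca_data.shape)                 # (10, 200): components x samples
print(e_val.shape)                     # (10,): per-component share of total variance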
def callRandomizedPCA(X, n, type):
    # type = 1 for Energy data to avoid 1D plot, 2 for others
    rpca = RandomizedPCA(n_components=n)
    rpca.fit(X)
    transformed = rpca.transform(X)
    print("original shape:   ", X.shape)
    print("transformed shape after Randomized PCA:", transformed.shape)
    X_recons = rpca.inverse_transform(transformed)
    print("reconstruct shape after Randomized PCA:", X_recons.shape)

    if type == 2:  # Gstore data
        myplot(transformed[:, 0:2], np.transpose(rpca.components_[0:2, :]))
        plt.show()
        myplot(X_recons[:, 0:2], np.transpose(rpca.components_[0:2, :]))
        plt.show()

    return transformed
Example #28
class RandomizedPCAReduction(AbstractReduction):
    """
    Use Randomized PCA to reduce dimensionality

    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.RandomizedPCA.html
    """
    def __init__(self, n_components, **kwargs):
        self.pca = RandomizedPCA(n_components=n_components, **kwargs)

    def n_components(self):
        return self.pca.n_components

    def fit(self, X):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
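A short usage sketch for the wrapper class above, assuming a 2-D feature array X_train_features (the name is hypothetical):

reducer = RandomizedPCAReduction(n_components=16, whiten=True)
reducer.fit(X_train_features)
X_reduced = reducer.transform(X_train_features)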
def pca_knn(train, test):
	y = []
	xTrain, yTrain = loadData(train)
	xTest, yTest = loadData(test)
	for i in [32, 64, 128] :
		print "n_components", i
		pca = RandomizedPCA(n_components = i, random_state = 1)
		pca.fit(xTrain)
		reducedXTrain = pca.transform(xTrain)
		reducedXTest = pca.transform(xTest)
		kNN = KNeighborsClassifier(n_neighbors = 4, weights = 'distance')
		kNN.fit(reducedXTrain, yTrain)
		y = kNN.predict(reducedXTest)
		testError = 1 - kNN.score(reducedXTest, yTest)
		print 'Test error: ' , testError
		print "sum of explained_variance_ratio_", pca.explained_variance_ratio_.sum()
	return y
Example #30
def fit_and_save_pca(np_array, savepath):

    if parameters['pca']['subsample_length'] < np_array.shape[0]:
        idxs = np.random.choice(np_array.shape[0],
                                parameters['pca']['subsample_length'],
                                replace=False)
        np_array = np_array[idxs]

    # fit the pca model
    # NOTE that by setting copy=False, we overwrite the input data in fitting.
    # This helps on memory but could cause issues if this function is reused elsewhere.
    pca = RandomizedPCA(n_components=parameters['pca']['number_dims'],
                        copy=False)
    pca.fit(np_array)

    with open(savepath, 'wb') as f:
        pickle.dump(pca, f, pickle.HIGHEST_PROTOCOL)
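fit_and_save_pca reads its settings from a parameters dict that is not shown in the snippet. A hypothetical configuration with the two keys the function uses (values invented for illustration):

parameters = {
    'pca': {
        'subsample_length': 100000,   # max rows used to fit the PCA
        'number_dims': 64,            # n_components for RandomizedPCA
    },
}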