Example #1
def dim_redux():
    directions = ['left', 'right', 'up', 'down']

    df = pandas.read_csv(FILE_RECORD_MOVES, sep='|', header=None)
    df = df.iloc[:20, :]
    columns = df.columns.tolist()
    index_direction = columns[-1]
    # df = df[columns[:len(columns) // 2] + [index_direction]]

    x = df[columns[:len(columns) // 2]]
    y = df[index_direction]

    # Set 1 column for each direction {0, 1}
    for direction in directions:
        df[direction] = df[index_direction].map(
            lambda s: 1 if s == direction else 0)

    vectors_to_keep = []
    for direction in directions:
        x_train = x[y == direction]

        pca = PCA(n_components=2)
        pca.fit(x_train)

        eigenval = pca.explained_variance_ratio_
        eigenvect = pca.components_

        vectors_to_keep.append(eigenvect[0])
        if eigenval[1] > 0.1:
            vectors_to_keep.append(eigenvect[1])

    vectors_to_keep = reduce_space_to_base(vectors_to_keep)
    print("Base :")
    print(vectors_to_keep)
Example #2
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components = tx[1].size/2)
    compressor.fit(tx, y=ty)
    #eigenvalues = compressor.explained_variance_
    print "PCA"
    # for eigenvalue, eigenvector in zip(eigenvalues, compressor.components_):    
    #     print(eigenvalue)
    # variance = compressor.explained_variance_ratio_ #calculate variance ratios
    # var = np.cumsum(np.round(compressor.explained_variance_ratio_, decimals=3)*100)
    # print var
    #print compressor.explained_variance_
    #print compressor.explained_variance_ratio_
    print compressor.explained_variance_ratio_.cumsum()
    print compressor.singular_values_
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    #em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    #km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    # var=np.cumsum(np.round(compressor.explained_variance_ratio_, decimals=3)*100)
    # print var
    # plt.ylabel('% Variance Explained')
    # plt.xlabel('# of Features')
    # plt.title('PCA Analysis')
    # plt.ylim(30,100.5)
    # plt.style.context('seaborn-whitegrid')
    # plt.plot(var)
    # plt.savefig('PCA.png')
    # plt.show()
    nn(newtx, ty, newrx, ry, add="wPCA")    
Example #3
class PCACCLayer(Layer):
    def __init__(self, n_out):
        self.pca = PCA(n_components=n_out)

    def get_train_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalization
        # inputX = norm4d(inputX)
        # inputX, self.P1 = whiten4d(inputX)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4d')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.fit_transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose(
            (0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pca')
        return outputX

    def get_test_output_for(self, inputX):
        batches, n_in, rows, cols = inputX.shape
        # normalization
        # inputX = norm4d(inputX)
        # inputX = whiten4d(inputX, self.P1)
        myUtils.visual.save_map(inputX[[10, 100, 1000]], dir_name, 'norm4dte')
        inputX = inputX.transpose((0, 2, 3, 1)).reshape((-1, n_in))
        outputX = self.pca.transform(inputX)
        outputX = outputX.reshape((batches, rows, cols, -1)).transpose(
            (0, 3, 1, 2))
        myUtils.visual.save_map(outputX[[10, 100, 1000]], dir_name, 'pcate')
        return outputX
Example #4
def calc_pca(bnd, npc=None, preaverage=False, use_unbiased=False, \
    method='mdp'):
    '''
    Parameters
    ----------
    bnd : BinnedData
      binned data
    npc : int or None, optional
      number of PCs to calculate, defaults to None
    preaverage : bool
      average across repeats?
      
    Returns
    -------
    score : ndarray
      (npc, nobs)
    weight : ndarray
      (npc, nvar)
    '''
    assert method in ['mdp', 'skl']
    data = format_for_fa(bnd, preaverage=preaverage,
                         use_unbiased=use_unbiased)
    if method == 'mdp':    
        pca_node = mdp.nodes.PCANode(output_dim=npc)
        score = pca_node.execute(data)
        weight = pca_node.get_projmatrix()
    elif method == 'skl':
        pca_obj = PCA(n_components=npc)
        score = pca_obj.fit(data).transform(data)
        weight = pca_obj.components_.T
    return score.T, weight.T
Example #5
def recommender_system_using_svd_pca(user_id, user_movies, movie_tag_vector,
                                     genome_tags, model):
    movies_watched = list(user_movies[user_id])

    movies_watched_tags = {}
    for movie in movies_watched:
        movie_tags = movie_tag_vector[movie]
        for tag in list(movie_tags.keys()):
            movies_watched_tags[tag] = 1
    tags = list(movies_watched_tags.keys())

    # tags = list(genome_tags.keys())
    all_movies = list(movie_tag_vector.keys())
    all_movies, movie_tag_matrix = build_movie_tag_matrix(
        all_movies, tags, movie_tag_vector)

    if model == 'PCA':
        pca = PCA(n_components=min(10, len(tags)))
        U = pca.fit_transform(movie_tag_matrix)
    else:
        U, S, Vt = np.linalg.svd(movie_tag_matrix, full_matrices=False)

    watched_indexed, U_watched, rest_indexed, U_rest \
        = split_output(U, movies_watched, all_movies)

    similarity_mapping = get_similarity_mapping(watched_indexed, U_watched,
                                                rest_indexed, U_rest)

    weighted_similarities = weigh_similarities(user_id, similarity_mapping)

    return weighted_similarities
Example #6
class PCADecomposition(AbstractPreProcessor):
    pca = None
    no_components = 2

    def fit(self, data, y=None):
        self.pca = PCA(n_components=self.no_components)
        self.pca.fit(data)

    def fit_transform(self, data, y=None):
        self.fit(data, y)
        return self.transform(data, y)

    def transform(self, data, y=None):
        data = self._check_input(data)
        output = self.pca.transform(data)
        output = self._check_output(data, output)
        return output

    def _check_output(self, data, output):
        if isinstance(data, pd.DataFrame):
            columns = [
                'Component ' + str(x + 1) for x in range(self.no_components)
            ]
            output = pd.DataFrame(data=output,
                                  columns=columns,
                                  index=data.index)
        return output
Example #7
def main():
    print('Reading data file')
    data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv',
                       usecols=['Sentiment', 'SentimentText'], error_bad_lines=False)

    print('Preprocess')
    corpus = data['SentimentText']
    vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode',
                                 stop_words='english', tokenizer=tokenize)
    X = vectorizer.fit_transform(corpus.values)
    y = data['Sentiment'].values

    print('Train sentiment classification')
    classifier = MultinomialNB()
    classifier.fit(X, y)

    print('Word2Vec')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4)
    word2vec.init_sims(replace=True)

    print('Fitting 2 PCA')
    #word_vectors = [word2vec[word] for word in word2vec.vocab]  # gensim pre-1.0.0
    word_vectors = [word2vec[word] for word in word2vec.wv.vocab]  # gensim 1.0.0+ uses the wv attribute

    pca = PCA(n_components=2)
    pca.fit(word_vectors)
Example #8
def build_images_KMeans(spectra,
                        spectrum_columns,
                        spectra_distances,
                        TSNE_learning_rate=500,
                        TSNE_n_iter=1500,
                        TSNE_learning_rate2=300):
    colors = ['red', 'black']
    c = spectra['marked'].apply(lambda x: colors[x])

    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2,
                random_state=42,
                learning_rate=TSNE_learning_rate,
                n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2,
                random_state=42,
                metric="precomputed",
                learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)
    plt.show()

    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 12))
    plt.subplot(3, 3, 1)

    colors = ['red', 'black']
    c = spectra['marked'].apply(lambda x: colors[x])
    plt.title("true labels")
    plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    colors = [
        'tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
        'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan'
    ]

    for n in range(2, 10):
        kmeans = cluster.KMeans(n_clusters=n, random_state=42)
        cluster_labels = kmeans.fit_predict(spectra_distances)

        plt.subplot(3, 3, n)
        c = [colors[l] for l in cluster_labels]
        plt.title("cluster labels ({} clusters)".format(n))
        plt.scatter(spectra_2D[:, 0], spectra_2D[:, 1], c=c, alpha=0.5)

    plt.show()
Example #9
def pca_plot(fp_list, clusters):
    np_fps = []
    for fp in fp_list:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p1 = figure(x_axis_label="PC1",
                y_axis_label="PC2",
                title="PCA clustering of PAINS")
    p2 = figure(x_axis_label="PC2",
                y_axis_label="PC3",
                title="PCA clustering of PAINS")
    color_vector = ["blue", "red", "green", "orange", "pink", "cyan", "magenta",
                    "brown", "purple"]
    print len(set(clusters))
    for clust_num in set(clusters):
        print clust_num
        local_cluster = []
        for i in xrange(len(clusters)):
            if clusters[i] == clust_num:
                local_cluster.append(np_fps_r[i])
        print len(local_cluster)
        p1.scatter(np_fps_r[:,0], np_fps_r[:,1],
                   color=color_vector[clust_num])
        p2.scatter(np_fps_r[:,1], np_fps_r[:,2],
                   color=color_vector[clust_num])
    return HBox(p1, p2)
Example #10
 def init_from_linear_case(self, Y, d_):
     """ Solve the equation min ||(Y-\hat{Y}) - M(Y-\hat{Y})||2_2
     Here we take PCA on Y, which compute the eigen-decomposition on 
         YY^{T} = USU^{T}
     and M = U_{d_} * U_{d_}^{T}, where U_{d_} are the first d_ eignvectors
     and b = \hat{y} - M\hat{y}
     @Parameters:
         Y: ndarray with shape (d, num_imags * H' * W' * sample_ratio)
         d_: the number of eigenvectors to remain
     @Returns:
         M: d * d_
         b = d * 1
     """
     logger.debug("Init M, b from linear-case...")
     pca = PCA(n_components=d_)
     # pca = PCA()
     # with shape d_, * d
     pca.fit(Y.transpose())
     # d_ * d
     U = pca.components_
     # d * d
     M = U.transpose().dot(U)
     mean_Y = np.average(Y, axis=1)
     mean_Y = mean_Y.reshape(mean_Y.shape[0], 1)
     b = mean_Y - M.dot(mean_Y)
     Err = (Y - mean_Y) - M.dot(Y - mean_Y)
     logger.debug("Linear-case loss:{:.3f}".format(np.linalg.norm(Err)))
     logger.debug("Linear-case: M.max:{:.2f}, M.min:{:.2f}, b.max:{:.2f},"
                  " b.min:{:.2f}".format(M.max(), M.min(), b.max(),
                                         b.min()))
     return M, U.transpose(), U, b
Example #11
 def cross_validate(self, train_x, train_y, test_x, test_y, **params):
     if not params:
         params = {"dummy": [0]}
     keys, values = list(zip(*list(params.items())))
     for param_list in itertools.product(*values):
         cv_params = list(self.params.items()) + list(zip(keys, param_list))
         for use_pca in (False, True):
             if self.have_tested(cv_params, use_pca):
                 continue
             if use_pca:
                 pca = PCA(n_components=0.99)
                 proc_train_x = pca.fit_transform(train_x)
                 proc_test_x = pca.transform(test_x)
             else:
                 proc_train_x = train_x
                 proc_test_x = test_x
             if "dummy" in params:
                 model = self.func().fit(proc_train_x, train_y)
             else:
                 model = self.func(**dict(cv_params)).fit(
                     proc_train_x, train_y)
             predictions = model.predict_proba(proc_test_x)
             if len(predictions.shape) == 2:
                 predictions = predictions[:, 1]
             num_right = (test_y == predictions.round()).sum()
             self.json["tests"].append({})
             test_data = self.json["tests"][-1]
             test_data["use_pca"] = use_pca
             test_data["pct_right"] = 100 * num_right / float(len(test_y))
             test_data["loss"] = log_loss(test_y, predictions)
             test_data["num_right"] = num_right
             test_data["num_tests"] = len(test_y)
             test_data["params"] = dict(cv_params)
             self._write()
             print((self.print_test(test_data)))
Example #12
class PCAImpl():
    def __init__(self,
                 n_components=None,
                 copy=True,
                 whiten=False,
                 svd_solver='auto',
                 tol=0.0,
                 iterated_power='auto',
                 random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'copy': copy,
            'whiten': whiten,
            'svd_solver': svd_solver,
            'tol': tol,
            'iterated_power': iterated_power,
            'random_state': random_state
        }
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Example #13
def main():
    print('Reading in data file...')
    data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv',
                       usecols=['Sentiment', 'SentimentText'], error_bad_lines=False)

    print('Pre-processing tweet text...')
    corpus = data['SentimentText']
    vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode',
                                 stop_words='english', tokenizer=tokenize)
    X = vectorizer.fit_transform(corpus.values)
    y = data['Sentiment'].values

    print('Training sentiment classification model...')
    classifier = MultinomialNB()
    classifier.fit(X, y)

    print('Training word2vec model...')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4)
    word2vec.init_sims(replace=True)

    print('Fitting PCA transform...')
    word_vectors = [word2vec[word] for word in word2vec.vocab]
    pca = PCA(n_components=2)
    pca.fit(word_vectors)

    print('Saving artifacts to disk...')
    joblib.dump(vectorizer, path + 'vectorizer.pkl')
    joblib.dump(classifier, path + 'classifier.pkl')
    joblib.dump(pca, path + 'pca.pkl')
    word2vec.save(path + 'word2vec.pkl')

    print('Process complete.')
Example #14
def pca(target, control, title, name_one, name_two):
    np_fps = []
    for fp in target + control:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    ys_fit = [1] * len(target) + [0] * len(control)
    names = ["PAINS", "Control"]
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p1 = figure(x_axis_label="PC1",
                y_axis_label="PC2",
                title=title)
    p1.scatter(np_fps_r[:len(target), 0], np_fps_r[:len(target), 1],
               color="blue", legend=name_one)
    p1.scatter(np_fps_r[len(target):, 0], np_fps_r[len(target):, 1],
               color="red", legend=name_two)
    p2 = figure(x_axis_label="PC2",
                y_axis_label="PC3",
                title=title)
    p2.scatter(np_fps_r[:len(target), 1], np_fps_r[:len(target), 2],
               color="blue", legend=name_one)
    p2.scatter(np_fps_r[len(target):, 1], np_fps_r[len(target):, 2],
               color="red", legend=name_two)
    return HBox(p1, p2)
Example #15
def run(ARGS, data=None, model=None, is_test=False):
    data = data or get_regression_data(ARGS.dataset, split=ARGS.split)
    model = model or get_regression_model(ARGS.model)(is_test=is_test, seed=ARGS.seed)

    model.fit(data.X_train, data.Y_train)

    res = {}

    samples = model.sample(data.X_test, ARGS.num_samples)
    data_tiled = np.tile(data.X_test[None, :, :], [ARGS.num_samples, 1, 1])
    shape =  [ARGS.num_samples * data.X_test.shape[0], data.X_test.shape[1] + data.Y_test.shape[1]]
    A = np.reshape(np.concatenate([data_tiled, samples], -1), shape)
    B = np.concatenate([data.X_test, data.Y_test], -1)


    if ARGS.pca_dim > 0:
        AB = np.concatenate([A, B], 0)
        pca = PCA(n_components=ARGS.pca_dim).fit(AB)
        A = pca.transform(A)
        B = pca.transform(B)

    # import matplotlib.pyplot as plt
    # plt.scatter(A[:, 0], A[:, 1], color='b')
    # plt.scatter(B[:, 0], B[:, 1], color='r')
    # plt.show()

    kernel = gpflow.kernels.RBF(A.shape[-1])
    res['mmd'] = mmd(A, B, kernel)

    print(res)

    res.update(ARGS.__dict__)
    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('mmd', res)
Example #16
class LogisticClassifier(object):
    def __init__(self, learning_rate=0.01, reg=0., momentum=0.5):
        self.classifier = LogisticRegression(learning_rate, reg, momentum)
        self.pca = None
        self.scaler = None

    def sgd_optimize(self, data, n_epochs, mini_batch_size):
        data = self._preprocess_data(data)
        sgd_optimization(data, self.classifier, n_epochs, mini_batch_size)

    def _preprocess_data(self, data):
        # center data and scale to unit std
        if self.scaler is None:
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
        else:
            data = self.scaler.transform(data)

        if self.pca is None:
            # use Minka's MLE to guess the appropriate dimension
            self.pca = PCA(n_components='mle')
            data = self.pca.fit_transform(data)
        else:
            data = self.pca.transform(data)

        return data
Example #17
 def fit_data(self, X_train, Y_train, n_components=None, pca_only=False):
     if n_components is None:
         n_components = X_train.shape[1]
     X_train = scale(X_train, axis=0)
     # then pca
     print "training with PCA transform..."
     self.pca_transformer = PCA(n_components=n_components).fit(X_train)
     print "variance explained " + str(
         np.sum(self.pca_transformer.explained_variance_ratio_))
     if pca_only:
         # return pca transformer only
         return
     # then lda
     print "training with LDA transform..."
     self.lda_transformer = LDA(n_components=n_components).fit(
         X_train, Y_train)
     print "variance explained " + str(
         np.sum(self.lda_transformer.explained_variance_ratio_))
     # then nmf
     print "training with NMF transform..."
     norm_traindata = normalize(X_train - np.min(X_train))
     self.nmf_transformer = NMF(
         n_components=n_components).fit(norm_traindata)
     print "reconstruction error " + str(
         np.sum(self.nmf_transformer.reconstruction_err_))
     # then ssnmf
     print "training with SSNMF transform..."
     G, W, self.ssnmf_H, rec_err = self.ssnmf_fit(norm_traindata,
                                                  Y_train,
                                                  npc=n_components)
     print "reconstruction error " + str(rec_err)
Example #18
    def write_predictions(self, model):
        if not os.path.exists(self.pred_dir):
            os.mkdir(self.pred_dir)

        raw_train_x, train_y = features_labels(self.season + 1)
        scaler = StandardScaler()

        train_x = scaler.fit_transform(raw_train_x)
        pca = PCA()
        if model.json.get("use_pca", False):
            train_x = pca.fit_transform(train_x)

        clf = model.func(**model.best_params()["params"]).fit(train_x, train_y)

        features, ids = self.get_features_and_ids()

        features = scaler.transform(features)
        if model.json.get("use_pca", False):
            features = pca.transform(features)

        predictions = clf.predict_proba(features)
        if len(predictions.shape) == 2:
            predictions = predictions[:, 1]

        with open(self.pred_path, 'w') as buff:
            buff.write("id,pred\n")
            for (label, pred) in zip(ids, predictions):
                buff.write("{:s},{:s}\n".format(label, str(pred)))
Example #19
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components = tx[1].size/2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wPCAr")
Example #20
def pca(tx, ty, rx, ry):
    compressor = PCA(n_components = tx[1].size/2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    km(newtx, ty, newrx, ry, add="wPCAtr", times=10)
    nn(newtx, ty, newrx, ry, add="wPCAtr")
Example #21
class EnsembleModel:
    def __init__(self, models, **params):
        self.models = models.values()
        self.model_funcs = [j.model for j in models.values()]
        self.params = params
        self._pca = PCA(n_components=0.99)
        self._clf = None

    def fit(self, x, y):
        train_x, test_x, train_y, test_y, = train_test_split(x, y, test_size=0.2)
        pca_train_x = self._pca.fit_transform(train_x)
        pca_test_x = self._pca.transform(test_x)
        for model, model_func in zip(self.models, self.model_funcs):
            if model.json.get("use_pca", False):
                train_x = pca_train_x
                test_x = pca_test_x
            else:
                pass
            model_func.fit(train_x, train_y)
        self._fit_meta_estimator(test_x, test_y)
        return self

    def _fit_meta_estimator(self, x, y):
        predictions = self._predictions(x).T
        y = numpy.atleast_2d(y).T
        labels = numpy.argmin(abs(predictions - y * numpy.ones((1, predictions.shape[1]))), 1)
        self._clf = GaussianNB().fit(x, labels)

    def _predictions(self, x):
        pca_x = self._pca.transform(x)
        predictions = []
        weights = []

        for model, model_func in zip(self.models, self.model_funcs):
            if model.json.get("use_pca", False):
                test_x = pca_x
            else:
                test_x = x
            predictions.append(model_func.predict_proba(test_x)[:, 1])
            weights.append(model.best_params()["loss"])
        return numpy.array(predictions)

    def predict_proba(self, x):
        blend = self.params.get("blend", "mean")
        predictions = self._predictions(x)
        if blend == "median":
            return numpy.median(predictions, 0)
        if blend == "meta":
            probs = self._clf.predict_proba(x)
            preds = []
            for row, prob in zip(predictions.T, probs):
                if max(prob) > 0.99:
                    preds.append(row[numpy.argmax(prob)])
                else:
                    preds.append(numpy.median(row))
            return numpy.array(preds)

        return predictions.mean(0)
Example #22
    def initialize(self, override_data_dir=None):
        assert self._real_activations is None

        data_dir = override_data_dir if override_data_dir else \
            (self._config.target_data_dir if self._config.target_data_dir else self._config.data_dir)
        activations_file = os.path.join(
            "data", data_dir,
            "activations_{}.npz".format(self._config.extractor_name))
        if os.path.exists(activations_file):
            tf.logging.info(
                "Loading activations from {}".format(activations_file))
            with np.load(activations_file) as activations:
                self._real_activations = [
                    tf.convert_to_tensor(activations[f])
                    for f in sorted(activations.files)
                ]
        else:
            tf.logging.warning(
                "Computing activations for real images in '{}'".format(
                    data_dir))
            self._real_activations = self._get_activations_from_images(
                load_image_names(data_dir))
            tf.logging.info(
                "Saving activations to {}".format(activations_file))
            np.savez(
                activations_file, **{
                    "block_{}".format(i): act.numpy()
                    for i, act in enumerate(self._real_activations)
                })

        tf.logging.debug("Fitting PCA")
        self._pca = PCA(n_components=2)
        low_dimensional_real_activations = self._pca.fit_transform(
            self._real_activations[-1])
        tf.logging.debug("Explained variance: {} ({:.5f})".format(
            self._pca.explained_variance_ratio_,
            np.sum(self._pca.explained_variance_ratio_)))

        high_dimensional_clusters = 7
        tf.logging.debug(
            "Clustering high-dimensional activations with {} clusters".format(
                high_dimensional_clusters))
        self._high_dimensional_kmeans = KMeans(
            n_clusters=high_dimensional_clusters)
        self._high_dimensional_kmeans.fit(self._real_activations[-1])
        tf.logging.debug("Inertia: {:.1f}".format(
            self._high_dimensional_kmeans.inertia_))

        low_dimensional_clusters = 4
        tf.logging.debug(
            "Clustering low-dimensional activations with {} clusters".format(
                low_dimensional_clusters))
        self._low_dimensional_kmeans = KMeans(
            n_clusters=low_dimensional_clusters)
        self._low_dimensional_kmeans.fit(low_dimensional_real_activations)
        tf.logging.debug("Inertia: {:.1f}".format(
            self._low_dimensional_kmeans.inertia_))
Example #23
def plot_pca_2d(data, ax=None, fpath='', show_fig=False):
    ax = _get_ax(ax)

    pca_2d = PCA(n_components=2)
    data_hat = pca_2d.fit_transform(data)
    ax.scatter(data_hat[:, 0], data_hat[:, 1])
    _save_show_fig(fpath, show_fig)

    return pca_2d
Example #24
def pca(tx, ty, rx, ry):
    print "pca"
    compressor = PCA(n_components = tx[1].size/2)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wPCAtr")  
    km(newtx, ty, newrx, ry, add="wPCAtr")
    nn(newtx, ty, newrx, ry, add="wPCAtr")
    print "pca done"
Example #25
def test_pass_pca_corr_pca_out():
    X, y = iris_data()
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    eigen = pca.explained_variance_

    plot_pca_correlation_graph(X,
                               variables_names=['1', '2', '3', '4'],
                               X_pca=X_pca,
                               explained_variance=eigen)
Example #26
	def PCA佮SVM模型(self, 問題, 答案):
		sample_weight_constant = np.ones(len(問題))
		clf = svm.SVC(C=1)
		pca = PCA(n_components=100)
# 		clf = svm.NuSVC()
		print('訓練PCA')
		pca.fit(問題)
		print('訓練SVM')
		clf.fit(pca.transform(問題), 答案, sample_weight=sample_weight_constant)
		print('訓練了')
		return lambda 問:clf.predict(pca.transform(問))
Example #27
def get_diversity_fom(ndim, data, return_pca=False):

    pca = PCA(n_components=ndim)
    pca.fit(data)

    if return_pca:
        return pca

    vec = pca.explained_variance_ratio_ + 1e-15
    div = (-vec * np.log(vec)).sum(-1) * pca.explained_variance_.sum(-1)
    div /= ndim
    return div
Example #28
def pca(data, whiten_bool, components):
    # Set PCA parameters
    pca = PCA(n_components=components, whiten=whiten_bool, svd_solver="full")
    # Fit PCA to data
    pca.fit(data)
    np.set_printoptions(suppress=True)
    print("PCA Components Explained Variance Ratio: " +
          str(np.around(pca.explained_variance_ratio_ * 100, 2)))
    # Calculate loading matrix
    loadings_matrix = (pca.components_.T * np.sqrt(pca.explained_variance_)).T
    # Transform data
    data_transformed = pca.transform(data)
    return data_transformed
Example #29
def pca_scatter2d(data: np.ndarray,
                  labels: Optional[np.ndarray] = None,
                  label_mapping: Optional[Mapping[int, str]] = None,
                  ax=None,
                  fpath='',
                  show_fig=False,
                  title=None,
                  **kwargs):

    pca_2d = PCA(n_components=2)
    data_hat = pca_2d.fit_transform(data)
    scatter2d(data_hat, labels, label_mapping, ax, fpath, show_fig, title,
              **kwargs)
Example #30
def build_images_KMeans(spectra, spectrum_columns, spectra_distances, colors, TSNE_learning_rate=500, TSNE_n_iter=1500, TSNE_learning_rate2=300):
    colors_m = ['red','black']
    
    cols = spectra['marked'].apply(lambda x: colors[x])
    col = spectra['marked']

    plt.subplots(figsize=(18, 6))
    plt.subplot(131)
    plt.title("PCA")
    pca = PCA(n_components=2, random_state=42)
    spectra_2D = pca.fit_transform(spectra[spectrum_columns])
    for i in range(len(spectra_2D)):
        #print(i)
        #print(spectra_2D[i, 0])
        #print(spectra_2D[i, 1])
        #print(cols[i])
        #print(col[i])
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])

    plt.subplot(132)
    plt.title("TSNE, Euclidean distance")
    tsne = TSNE(n_components=2, random_state=42, learning_rate=TSNE_learning_rate, n_iter=TSNE_n_iter)
    spectra_2D = tsne.fit_transform(spectra[spectrum_columns])
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])

    plt.subplot(133)
    plt.title("TSNE, Chosen distance")
    tsne = TSNE(n_components=2, random_state=42, metric="precomputed", learning_rate=TSNE_learning_rate2)
    spectra_2D = tsne.fit_transform(spectra_distances)
    for i in range(len(col)):
        plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]],s=markersizes[col[i]])
    
    # visualization - tsne with chosen distance
    print('Clustering')
    plt.subplots(figsize=(18, 12))

    for n in range(2, 10):
        kmeans = cluster.KMeans(n_clusters=n, random_state=42)
        cluster_labels = kmeans.fit_predict(spectra_distances)

        plt.subplot(3, 3, n-1)
        cols = [colors[l] for l in cluster_labels]
        plt.title("cluster labels ({} clusters)".format(n))
        for i in range(len(col)):
            plt.scatter(spectra_2D[i, 0], spectra_2D[i, 1], c=cols[i], alpha=0.5, marker=markers[col[i]], s=markersizes[col[i]])
    
    plt.show()

    return spectra_2D
Example #31
def apply_pca(X_train, X_test, pca_thresh):
    """ apply principal component analysis to reduce dimensionality of feature
    vectors"""
    feature_labels = X_train.columns
    pca = PCA(n_components=pca_thresh)
    shape_orig = X_train.shape
    X_train = pca.fit_transform(X_train)
    shape_reduced = X_train.shape
    X_test = pca.transform(X_test)
    logging.info("reduced dimensionality from {} to {}".format(
        shape_orig, shape_reduced))
    rows = ["PC-{}".format(i) for i in range(len(pca.components_))]
    pcs = pd.DataFrame(pca.components_, columns=feature_labels, index=rows)
    return X_train, X_test, pcs
Example #32
def test_X_PCA_but_no_explained_variance():
    with pytest.raises(
            ValueError,
            match='If `X_pca` is not None, the `explained variance` '
            'values should not be `None`.'):

        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=None)
Example #33
    def compute_pca(self, model):
        X = model.wv.vectors
        #self.plot_matrix(X, 'Before')
        scaler = preprocessing.StandardScaler()
        X_scaled = scaler.fit_transform(X)
        #self.plot_matrix(X_scaled, 'After')

        pca = PCA(n_components=2)
        principalComponents = pca.fit_transform(X_scaled)

        print(pca.explained_variance_ratio_)
        self.scatterplot(principalComponents)

        return principalComponents
Example #34
def test_no_X_PCA_but_explained_variance():
    with pytest.raises(ValueError,
                       match='If `explained variance` is not None, the '
                       '`X_pca` values should not be `None`.'):

        X, y = iris_data()
        pca = PCA(n_components=2)
        pca.fit(X)
        eigen = pca.explained_variance_

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=None,
                                   explained_variance=eigen)
Example #35
def find_distributions(query_areas, query_energies, binned_image,
                       n_classes=2,
                       bimages_path='/home/fin/Documents/Timepix/particle-tk/datagen/images.pkl',
                       segments_path='/home/fin/Documents/Timepix/particle-tk/datagen/segments.pkl'):

    # Load binned images and segments
    b_im = pkl.load(open(bimages_path, 'rb'))
    segments = pkl.load(open(segments_path, 'rb'))
    reductor = PCA(n_components=3)
    b_im = reductor.fit_transform(b_im)
    queried_binned_image = reductor.transform(binned_image.reshape(1,-1))

    areas = [[] for i in range(0,n_classes)]
    pixel_energies = [[] for i in range(0,n_classes)]
    binned_images = [[] for i in range(0,n_classes)]
    binned_images_energies = [[] for i in range(0,n_classes)]

    for segment in segments:
        for lbl in range(1,n_classes+1):
            if segment.get_metadata('label') == lbl:
                areas[lbl-1].append(area(segment.get_bitmap()))
                nonzeroE = segment.get_bitmap().flatten()[segment.get_bitmap().flatten() > 0]
                for e in nonzeroE:
                    pixel_energies[lbl-1].append(e)
                    binned_images_energies[lbl-1].append(b_im[segment.get_metadata('parent_im_id')])
                binned_images[lbl-1].append(b_im[segment.get_metadata('parent_im_id')])
                break

    # Estimation of size density given image
    sizes = list() # for each particle type one array of size
    sizes.append(np.linspace(0,20,100))
    sizes.append(np.linspace(0,10,100))
    energies = list()
    energies.append(np.linspace(0,400,100))
    energies.append(np.linspace(0,400,100))
    p_SgX = list()
    p_EgX = list()

    for lbl in range(1,n_classes+1):
        print(areas[lbl-1])
        estimator_P_SgX = estimate_P_SgX(areas[lbl-1], binned_images[lbl-1])
        estimator_P_EgX = estimate_P_SgX(pixel_energies[lbl-1], binned_images_energies[lbl-1])
        p_SgX.append(estimator_P_SgX.score_samples(query_areas[lbl-1,:],
                                                   np.repeat(np.atleast_2d(queried_binned_image),
                                                             query_areas[lbl-1,:].shape[0], axis=0)))
        p_EgX.append(estimator_P_EgX.score_samples(query_energies[lbl-1,:],
                                                   np.repeat(np.atleast_2d(queried_binned_image),
                                                             query_energies[lbl-1,:].shape[0], axis=0)))
    return np.array(p_SgX), np.array(p_EgX)
Example #36
def train_pca(pains_fps, num_components=3):
    '''
    Dimensional reduction of fps bit vectors to principal components
    :param pains_fps:
    :return: pca reduced fingerprints bit vectors
    '''
    np_fps = []
    for fp in pains_fps:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=num_components)
    pca.fit(np_fps)
    fps_reduced = pca.transform(np_fps)
    return fps_reduced
Example #37
def test_not_enough_components():
    s = (
        'Number of principal components must match the number of eigenvalues. Got 2 != 1'
    )
    with pytest.raises(ValueError, match=s):

        X, y = iris_data()
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X)
        eigen = pca.explained_variance_

        plot_pca_correlation_graph(X,
                                   variables_names=['1', '2', '3', '4'],
                                   X_pca=X_pca,
                                   explained_variance=eigen[:-1])
Example #38
def classify_for_benchmark(data_set_df, user_info_df, features, label='gender', classifier=None, num=None):
    instance_num = len(data_set_df.columns)
    x = data_set_df.loc[features]
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)

    imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
    x_replaced = x.replace([np.inf, -np.inf], np.nan)
    x_imp = imp.transform(x_replaced)

    y = user_info_df.get(label)
    y_filtered = y[(map(int, x.columns.values))]

    clf = nb.BernoulliNB() if classifier is None else classifier
    cv_num = min(len(y_filtered), 10)
    if cv_num <= 1 or len(y_filtered.unique()) <= 1:
        return 0.0, 100.0
    else:
        final_score = 0.0
        for i in range(100):
            score = 0.0
            cnt = 0
            skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
            for tr_index, te_index in skf:
                x_train, x_test = x_imp.T[tr_index], x_imp.T[te_index]
                y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
                min_num = min(len(x_train), len(x_train.T), len(x_test), len(x_test.T), num)
                pca = PCA(min_num)
                x_train = pca.fit_transform(x_train)
                x_test = pca.transform(x_test)

                try:
                    clf.fit(x_train, y_train)
                    score += clf.score(x_test, y_test)
                    cnt += 1
                    # cv_score = cross_validation.cross_val_score(clf, x_imp.T, y_filtered, cv=cv_num)
                except ValueError:
                    traceback.print_exc()
                    print i, "why error? skip!"
            if cnt > 0:
                score /= cnt
                print i, score
            else:
                return 0.0, (float(instance_num - len(y_filtered)) / instance_num)
            final_score += score
        final_score /= 100
        miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
        return final_score, miss_clf_rate
Example #39
    def reduce_dims_train(self, X, y, method='PCA', **kwargs):

        pickle_file = './data/train/reduced_dims_' + method

        data = self.unpickle_data(pickle_file)

        if len(data) > 0:
            transformed, l_kwargs, transform_model_type, reduction_model, = data
            if transform_model_type == self.transform_model and l_kwargs == kwargs:
                self.reduction_model = reduction_model
                return transformed

        if method == 'PCA':
            self.reduction_model = PCA(**kwargs)

        elif method == 'LDA':
            self.reduction_model = LDA(n_components=50)
            dct = {k: i for (i, k) in enumerate(set(y))}
            y = [dct[i] for i in y]

        else:
            raise Exception("Wrong Method")

        # Fit the reduction model to our data
        self.reduction_model.fit(X, y)

        # Perform dimensionality reduction
        transformed = self.reduction_model.transform(X)

        self.pickle_data(
            pickle_file,
            (transformed, kwargs, self.transform_model, self.reduction_model))

        return transformed
Example #40
def reduction(data, params):

    # parse parameters

    for item in params:
        if isinstance(params[item], str):
            exec(item+'='+'"'+params[item]+'"')
        else:
            exec(item+'='+str(params[item]))

    # apply PCA

    pca = PCA(n_components=n_components)
    pca.fit(data)
    X = pca.transform(data)

    return X
Example #41
def pca_no_labels(target, title="PCA clustering of PAINS", color="blue"):
    np_fps = []
    for fp in target:
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)
    pca = PCA(n_components=3)
    pca.fit(np_fps)
    np_fps_r = pca.transform(np_fps)
    p3 = figure(x_axis_label="PC1",
                y_axis_label="PC2",
                title=title)
    p3.scatter(np_fps_r[:, 0], np_fps_r[:, 1], color=color)
    p4 = figure(x_axis_label="PC2",
                y_axis_label="PC3",
                title=title)
    p4.scatter(np_fps_r[:, 1], np_fps_r[:, 2], color=color)
    return HBox(p3, p4)
Example #42
def airline_pca():
    X = np.array(pca_data)
    pca = PCA(n_components=3)
    pca.fit(X)
    Y=pca.transform(normalize(X))
    
    fig = plt.figure(1, figsize=(8, 6))
    ax = Axes3D(fig, elev=-150, azim=110)
    colordict = {carrier:i for i,carrier in enumerate(major_carriers)}
    pointcolors  = [colordict[carrier] for carrier in target_carrier]
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], c=pointcolors)
    ax.set_title("First three PCA directions")
    ax.set_xlabel("1st eigenvector")
    ax.w_xaxis.set_ticklabels([])
    ax.set_ylabel("2nd eigenvector")
    ax.w_yaxis.set_ticklabels([])
    ax.set_zlabel("3rd eigenvector")
    ax.w_zaxis.set_ticklabels([])
Example #43
def test_pipeline_transform():
    # Test whether pipeline works with a transformer at the end.
    # Also test pipeline.transform and pipeline.inverse_transform
    iris = load_iris()
    X = iris.data
    pca = PCA(n_components=2)
    pipeline = Pipeline([('pca', pca)])

    # test transform and fit_transform:
    X_trans = pipeline.fit(X).transform(X)
    X_trans2 = pipeline.fit_transform(X)
    X_trans3 = pca.fit_transform(X)
    assert_array_almost_equal(X_trans, X_trans2)
    assert_array_almost_equal(X_trans, X_trans3)

    X_back = pipeline.inverse_transform(X_trans)
    X_back2 = pca.inverse_transform(X_trans)
    assert_array_almost_equal(X_back, X_back2)
Example #44
def do_train_with_freq():
    tf_mix = TrainFiles(train_path = train_path_mix, labels_file = labels_file, test_size = 0.)
    tf_freq = TrainFiles(train_path = train_path_freq, labels_file = labels_file, test_size = 0.)

    X_m, Y_m, _, _ = tf_mix.prepare_inputs()
    X_f, Y_f, _, _ = tf_freq.prepare_inputs()

    X = np.c_[X_m, X_f]
    Y = Y_f

    X, Xt, Y, Yt = train_test_split(X, Y, test_size = 0.1)
    sl = SKSupervisedLearning(SVC, X, Y, Xt, Yt)
    sl.fit_standard_scaler()

    pca = PCA(250)
    pca.fit(np.r_[sl.X_train_scaled, sl.X_test_scaled])
    X_pca = pca.transform(sl.X_train_scaled)
    X_pca_test = pca.transform(sl.X_test_scaled)

    #sl.train_params = {'C': 100, 'gamma': 0.0001, 'probability' : True}
    #print "Start SVM: ", time_now_str()
    #sl_ll_trn, sl_ll_tst = sl.fit_and_validate()
    #print "Finish Svm: ", time_now_str()

    ##construct a dataset for RBM
    #X_rbm = X[:, 257:]
    #Xt_rbm = X[:, 257:]

    #rng = np.random.RandomState(123)
    #rbm = RBM(X_rbm, n_visible=X_rbm.shape[1], n_hidden=X_rbm.shape[1]/4, numpy_rng=rng)

    #pretrain_lr = 0.1
    #k = 2
    #pretraining_epochs = 200
    #for epoch in xrange(pretraining_epochs):
    #    rbm.contrastive_divergence(lr=pretrain_lr, k=k)
    #    cost = rbm.get_reconstruction_cross_entropy()
    #    print >> sys.stderr, 'Training epoch %d, cost is ' % epoch, cost


    trndata, tstdata = createDataSets(X_pca, Y, X_pca_test, Yt)
    fnn = train(trndata, tstdata, epochs = 1000, test_error = 0.025, momentum = 0.2, weight_decay = 0.0001)
Example #45
def showDataTable():
    title = "Descriptive statistics"
    df = frame[cols]
    data_dsc = df.describe().transpose()
    # dsc = df.describe()

    pca = PCA(n_components=5)
    pca.fit(df)
    pc = pca.explained_variance_ratio_

    data_corr = df.corr()
    eigenValues, eigenVectors = LA.eig(data_corr)
    idx = eigenValues.argsort()[::-1]
    # print sorted(eigenValues, key=int, reverse=True)
    print  eigenValues.argsort()[::-1]
    print  eigenValues.argsort()
    eigenValues = pd.DataFrame(eigenValues[idx]).transpose()
    eigenVectors = pd.DataFrame(eigenVectors[:, idx])

    return render_template("showDataTable.html", title=title, data=df, data_dsc=data_dsc, pca=pd.DataFrame(pc).transpose(),data_corr=data_corr, w=eigenValues, v=eigenVectors)
Example #46
def pca_prefit(weights, xs):
    """
    SOMの初期値を計算するための前処理.
    線形変換によって重みベクトル列の主成分とその固有値を入力ベクトル列のものと一致させる.
    :param weights: 初期重みベクトル列
    :param xs: 入力ベクトル列
    :return: 前処理した重みベクトル列
    """
    n = np.shape(xs)[1]
    pca_w = PCA(n_components=n)
    pca_w.fit(weights)
    pca_x = PCA(n_components=n)
    pca_x.fit(xs)

    mean_w = np.mean(weights, axis=0)
    mean_x = np.mean(xs, axis=0)
    com_w = pca_w.components_
    com_x = pca_x.components_
    var_w = pca_w.explained_variance_
    var_x = pca_x.explained_variance_

    var_w[var_w == 0] = np.max(var_w) * 1e-6
    new_w = (weights - mean_w).dot(com_w.T) / np.sqrt(var_w)
    new_w = (new_w * np.sqrt(var_x)).dot(com_x) + mean_x

    return new_w
Example #47
def plot_similarity_clusters(desc1, desc2, files, plot = None):
	"""
	find similar sounds using Affinity Propagation clusters

	:param desc1: first descriptor values
	:param desc2: second descriptor values
	:returns:
	  - euclidean_labels: labels of clusters
	""" 

	if plot == True:
		print((Fore.MAGENTA + "Clustering"))
	else:
		pass
         
	min_max = preprocessing.scale(np.vstack((desc1,desc2)).T, with_mean=False, with_std=False)          
	pca = PCA(n_components=2, whiten=True)
	y = pca.fit(min_max).transform(min_max)
	    
	euclidean = AffinityPropagation(convergence_iter=1800, affinity='euclidean')                           
	euclidean_labels= euclidean.fit_predict(y)

	if plot == True:

		time.sleep(5)  

		print((Fore.WHITE + "Each number represents the cluster to which the sound belongs as an exemplar of other(s). Cluster '0' is colored blue, cluster '1' is colored red, cluster '2' is colored yellow. Look at the plot to see which sounds are exemplars of others"))
		print(np.vstack((euclidean_labels,files)).T)

		time.sleep(6)

		plt.scatter(y[euclidean_labels==0,0], y[euclidean_labels==0,1], c='b')
		plt.scatter(y[euclidean_labels==1,0], y[euclidean_labels==1,1], c='r')
		plt.scatter(y[euclidean_labels==2,0], y[euclidean_labels==2,1], c='y')
		plt.scatter(y[euclidean_labels==3,0], y[euclidean_labels==3,1], c='g')
		plt.show()
	else:
		pass

	return euclidean_labels
Example #48
def calc_pcs_variance_explained(bnd, preaverage=False, 
    use_unbiased=False, method='skl'):
    '''
    Parameters
    ----------
    bnd : BinnedData
      binned data
    preaverage : bool
      average across repeats?
    use_unbiased : bool
      use the unbiased spike rates calculated using Rob Kass's
      spike rate method
    '''
    assert type(method) == str
    
    data = format_for_fa(bnd, preaverage=preaverage,
                     use_unbiased=use_unbiased)
    
    if method == 'skl':
        pca_obj = PCA()
        score = pca_obj.fit(data)
        return pca_obj.explained_variance_ratio_
    else:
        raise ValueError('method %s not implemented' % method)
Example #49
    def _preprocess_data(self, data):
        # center data and scale to unit std
        if self.scaler is None:
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
        else:
            data = self.scaler.transform(data)

        if self.pca is None:
            # use Minka's MLE to guess the appropriate dimension
            self.pca = PCA(n_components='mle')
            data = self.pca.fit_transform(data)
        else:
            data = self.pca.transform(data)

        return data
Example #50
def dimensional(tx, ty, rx, ry, add=None):
    print "pca"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = PCA(n_components = i)
        t0 = time()
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        runtime=time() - t0
        V = compressor.components_
        print runtime, V.shape, compressor.score(tx)
        distances = np.linalg.norm(tx-compressor.inverse_transform(newtx))
        print distances
    print "pca done"
    print "ica"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = ICA(whiten=True)
        t0 = time()
        compressor.fit(tx, y=ty)
        newtx = compressor.transform(tx)
        runtime=time() - t0
        print newtx.shape, runtime
        distances = np.linalg.norm(tx-compressor.inverse_transform(newtx))
        print distances
    print "ica done"
    print "RP"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = RandomProjection(n_components=i)
        t0 = time()
        compressor.fit(tx, y=ty)    
        newtx = compressor.transform(tx)
        runtime=time() - t0
        shape = newtx.shape
        print runtime, shape
    print "RP done"
    print "K-best"
    for j in range(tx[1].size):
        i = j + 1
        print "===" + str(i)
        compressor = best(add, k=i)
        t0 = time()
        compressor.fit(tx, y=ty.ravel())
        newtx = compressor.transform(tx)
        runtime=time() - t0
        shape = newtx.shape
        print runtime, shape
    print "K-best done"
Example #51
import numpy as np
import pandas as pd
from sklearn.decomposition.pca import PCA
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
from sklearn.mixture import GMM
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt

X_test = pd.read_csv('Data/test.csv', header=None).as_matrix()
y = pd.read_csv('Data/trainLabels.csv', header=None)[0].as_matrix()
X = pd.read_csv('Data/train.csv', header=None).as_matrix()

pca2 = PCA(n_components=2, whiten=True)
pca2.fit(np.r_[X, X_test])
X_pca = pca2.transform(X)
i0 = np.argwhere(y == 0)[:, 0]
i1 = np.argwhere(y == 1)[:, 0]
X0 = X_pca[i0, :]
X1 = X_pca[i1, :]
plt.plot(X0[:, 0], X0[:, 1], 'ro')
plt.plot(X1[:, 0], X1[:, 1], 'b*')

pca = PCA(whiten=True)
X_all = pca.fit_transform(np.r_[X, X_test])
print (pca.explained_variance_ratio_)

def kde_plot(x):
        from scipy.stats.kde import gaussian_kde
        kde = gaussian_kde(x)
        positions = np.linspace(x.min(), x.max())
Example #52
 def _calc_factors(data, npc=None):
     pca_obj = PCA(n_components=npc)
     score = pca_obj.fit(data).transform(data)
     # transpose here makes the output match with mdp
     weight = pca_obj.components_.T
     return score.T, weight.T
Example #53
import numpy as np
from sklearn import tree
from sklearn.decomposition.pca import PCA
import mnist_loader as loader
import mnist_writer as writer

print('Reading data...')
train_data, train_labels = loader.load_train_data()
test_data = loader.load_test_data()

# convert to numpy arrays
train_data = np.array(train_data)
train_labels = np.array(train_labels)
test_data = np.array(test_data)

print('PCA analysis...')
pca = PCA(n_components=35, whiten=True)
pca.fit(train_data)
train_data = pca.transform(train_data)
test_data = pca.transform(test_data)

print('Fitting decision tree...')
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_labels)

print('Making predictions...')
predict = clf.predict(test_data)

print('Writing results...')
writer.write_predictions(predict, '/Users/clint/Development/data/mnist/predict_tree.csv')
Example #54
        inertia = extra.sum()
        print it, inertia
        if ((old_extra - extra)**2).sum() < tol:
            print "finished at iteration %d" % it
            break
        old_extra = extra.copy()

    return labels


if __name__ == "__main__":
    X, Y = data.libras_movement()
    labels = kernel_k_means(X, k=15)

    # Pour representer les donnees, prendre le PCA
    pca = PCA(n_components=2)
    pca.fit(X)
    Xt = pca.transform(X)

    fig = pl.figure()

    colors = ['#334433',
              '#6699aa',
              '#88aaaa',
              '#aacccc',
              '#447799',
              '#225533',
              '#44bbcc',
              '#88dddd',
              '#bbeeff',
              '#0055bb',
Example #55
import os

import numpy
import matplotlib.pyplot as plt
from operator import itemgetter
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

data_dir = '../../data/'

n_pca_components = 10
eps_range = numpy.arange(0.01,20,0.1)
min_samples_range = [2,3,5,10]
allowed_noise_ratio = 0.2

# data
derivatives = numpy.loadtxt(os.path.join(data_dir, 'derivatives.dat'))

# PCA
pca = PCA(n_components=n_pca_components)
pca.fit(derivatives)
X = pca.transform(derivatives)
X = StandardScaler().fit_transform(X)

results = []

for eps in eps_range:
    for minsamp in min_samples_range:

        model = DBSCAN(eps=eps, min_samples=minsamp, algorithm='kd_tree')
        model.fit(X)

        labels = model.labels_
        noise_ratio = float(sum(labels==-1)) / len(labels)
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
Example #56
# eng = matlab.engine.start_matlab()
X=[]
for i in xrange(1,30):
    file=open('J_Left/'+`i`)
    mylist=[]
    x=0
    for line in file:
        line=line[:-1]
        temp=line.split(' ')
        for i in range(len(temp)-1):
#             print temp[i]
            mylist.append(float(temp[i]))
    mylist=mylist+[0]*(5000*9-len(mylist))
    X.append(mylist)
print 'len of X',len(X)
pca=PCA(n_components=4)
t=pca.fit_transform(X)
l=[]
for v in t:
    arr=[]
    for e in v:
        f=float(e)
        arr.append(f)
    l.append(arr)
# ret = eng.moh_pca(l)

print l
print type(l)
print 'len of t',len(t)

Example #57
"""
http://stats.stackexchange.com/questions/82050/principal-component-analysis-and-regression-in-python
"""

import pandas as pd
from sklearn.decomposition.pca import PCA

source = pd.read_csv('../files/multicollinearity.csv')
frame = pd.DataFrame(source)
cols = [col for col in frame.columns if col not in ['response']]
frame2 = frame[cols]

pca = PCA(n_components=5)
pca.fit(frame2)

# The amount of variance that each PC explains?
print pca.explained_variance_ratio_

# What are these? Eigenvectors?
print pca.components_

# Are these the eigenvalues?
print pca.explained_variance_

# it looks like sklearn won't operate directly on a pandas dataframe.
# Let's say that I convert it to a numpy array:

npa = frame2.values
npa
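
To answer the questions posed in the snippet above: in scikit-learn, pca.components_ holds the eigenvectors of the data's covariance matrix (one per row) and pca.explained_variance_ holds the corresponding eigenvalues; explained_variance_ratio_ is those eigenvalues normalized so that, with all components kept, they sum to 1. A minimal sketch on toy data (the data and variable names below are illustrative, not part of the original post) that checks this numerically:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(200, 5).dot(rng.randn(5, 5))  # toy data with correlated columns

pca = PCA(n_components=5).fit(X)

# Eigen-decomposition of the sample covariance matrix (ddof=1, matching sklearn)
cov = np.cov(X, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)
order = np.argsort(eigvals)[::-1]  # eigh returns eigenvalues in ascending order

print(np.allclose(eigvals[order], pca.explained_variance_))               # True
print(np.allclose(np.abs(eigvecs[:, order].T), np.abs(pca.components_)))  # True (up to sign)

As for the last comment: recent scikit-learn versions accept a pandas DataFrame directly, so converting via frame2.values is only needed for very old releases.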
Example #58
def predict():
    tf = TrainFiles('/kaggle/malware/train/mix_lbp', val_path = '/kaggle/malware/test/mix_lbp', labels_file = "/kaggle/malware/trainLabels.csv")

    X_train, Y_train, X_test, Y_test = tf.prepare_inputs()

    sl_svm = SKSupervisedLearning(SVC, X_train, Y_train, X_test, Y_test)
    sl_svm.fit_standard_scaler()
    sl_svm.train_params = {'C': 100, 'gamma': 0.01, 'probability': True}

    print "Starting SVM: ", time_now_str()
    _, ll_svm = sl_svm.fit_and_validate()

    print "SVM score: {0:.4f}".format(ll_svm if not prediction else _)
    print "Finished training SVM: ", time_now_str()

    # neural net
    print "Starting NN: ", time_now_str()

    trndata = _createDataSet(sl_svm.X_train_scaled, Y_train, one_based = True)
    tstdata = _createUnsupervisedDataSet(sl_svm.X_test_scaled)
    fnn = predict_nn(trndata)
    proba_nn = fnn.activateOnDataset(tstdata)

    print "Finished training NN: ", time_now_str()

    # no validation labels on actual prediction
    if doTrees:
        # random forest
        sl_ccrf = SKSupervisedLearning(CalibratedClassifierCV, X_train, Y_train, X_test, Y_test)
        sl_ccrf.train_params = \
            {'base_estimator': RandomForestClassifier(**{'n_estimators' : 7500, 'max_depth' : 200}), 'cv': 10}
        sl_ccrf.fit_standard_scaler()

        print "Starting on RF: ", time_now_str()
        ll_ccrf_trn, ll_ccrf_tst = sl_ccrf.fit_and_validate()

        print "RF score: {0:.4f}".format(ll_ccrf_tst if not prediction else ll_ccrf_trn)
        sl_ccrf.proba_test.tofile("/temp/sl_ccrf.prob")
        sl_svm.proba_test.tofile("/temp/sl_svm.prob")
        proba_nn.tofile("/temp/nn.prob")

        print "Finished training RF: ", time_now_str()

    if prediction:
        proba = vote([sl_svm.proba_test, sl_ccrf.proba_test, proba_nn], [2./3., 1./6., 1./3.])

        out_labels = "/kaggle/malware/submission33.csv"
        task_labels = "/kaggle/malware/testLabels.csv"
        labels = [path.splitext(t)[0] for t in tf.get_val_inputs()]
        out = write_to_csv(task_labels, labels, proba, out_labels)

    else:
        # visualize the decision surface, projected down to the first
        # two principal components of the dataset
        pca = PCA(n_components=2).fit(sl_svm.X_train_scaled)

        X = pca.transform(sl_svm.X_train_scaled)

        x = np.arange(X[:, 0].min() - 1, X[:, 0].max() + 1, 1)
        y = np.arange(X[:, 1].min() - 1, X[:, 1].max() + 1, 1)

        xx, yy = np.meshgrid(x, y)

        # title for the plots
        titles = ['SVC with rbf kernel',
                  'Random Forest \n'
                  'n_components=7500',
                  'Decision Trees \n'
                  'n_components=7500']

        #plt.tight_layout()
        plt.figure(figsize=(12, 5))

        # predict and plot
        for i, clf in enumerate((sl_svm.clf, sl_rfc.clf, sl_trees.clf)):
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            plt.subplot(1, 3, i + 1)
            clf.fit(X, Y_train)
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)
            plt.axis('off')

            # Plot also the training points
            plt.scatter(X[:, 0], X[:, 1], c=Y_train, cmap=plt.cm.Paired)

            plt.title(titles[i])
        plt.tight_layout()
        plt.show()
Example #59
colorbar()
plt.title("correlation matrix")
plt.savefig("correlation_matrix.png")
show()

### center the variables before performing PCA
# center to the mean, but DO NOT component wise scale to unit variance
# by centering the variables, principal components remain the same,
# by standardizing the variables, principal components change
X_train = pp.scale(X_train, with_mean=True, with_std=False)
X_test = pp.scale(X_test, with_mean=True, with_std=False)

### dimensionality reduction using PCA
# since data is uncorrelated and with variance almost equal to 1,
# whitening is not necessary
pca40 = PCA(n_components=40, whiten=False) 
pca40.fit(X_train)
print(pca40.explained_variance_ratio_)

# plot all the principal components with their relative explained variance
features = [x for x in range(1,41)]
plt.figure(3)
# percentage of variance explained by each of the selected components.
# The sum of explained variances is equal to 1.0
plt.plot(features, pca40.explained_variance_ratio_, 'g--', marker='o')
plt.axis([1, 40, 0, 0.3])
plt.grid(True)
plt.xlabel("principal components"), plt.ylabel("variance explained")
plt.title("scree plot")
plt.savefig("scree_plot.png")