Example no. 1
import pandas as pd
from sklearn.decomposition import TruncatedSVD


def svd(feature_matrix, k):
    # Project the features onto the top k singular components
    svd = TruncatedSVD(n_components=k)
    components = svd.fit_transform(feature_matrix)
    print(svd.explained_variance_ratio_)
    principalDf = pd.DataFrame(data=components)
    return principalDf
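A minimal usage sketch (not from the original source; the random matrix is a hypothetical stand-in):

import numpy as np

X = np.random.rand(100, 20)  # hypothetical 100 x 20 feature matrix
df = svd(X, k=5)             # prints per-component explained variance
print(df.shape)              # (100, 5)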
Example no. 2
    def svd(self, factors=5, top_questions=10):
        '''
        Input: instance variable, integer, integer
        Output: array, array, array, array

        Performs truncated SVD on self.answers_clean, calls the describe
        function to report the results, and returns the decomposed matrices
        W and H along with the explained variance ratios and variances.
        '''

        svd = TruncatedSVD(n_components=factors)
        W = svd.fit_transform(self.answers_clean)
        H = svd.components_
        var_ratio = svd.explained_variance_ratio_
        var = svd.explained_variance_
        self.describe_svd_results(W, H, top_questions)
        return W, H, var_ratio, var
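A hedged aside (not in the original snippet): fit_transform returns U*Sigma and components_ holds V^T, so the product W @ H is the rank-`factors` approximation of the input:

# Rank-`factors` reconstruction of self.answers_clean
X_hat = W @ H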
Example no. 3
 def sk_lsi_bi(self, num_vec):
     """
     Performs LSA/LSI (i.e. truncated SVD) on the tf-matrix using sklearn.
     Requires scipy.sparse.csr_matrix, sklearn's TruncatedSVD and pandas.
     :param num_vec: number of singular vectors (components) to keep
     :return: None; stores the smoothed index in self.index_bi
     """
     index_arr = self.ini_index_bi.to_numpy()[:, :-1]
     index_arr_spa = csr_matrix(index_arr)
     svd = TruncatedSVD(n_components=num_vec, n_iter=50)
     B1 = svd.fit_transform(index_arr_spa)
     B = svd.inverse_transform(B1)
     b_df = pd.DataFrame(data=B,
                         index=list(self.ini_index_bi.index),
                         columns=self.docIDs)
     # b_df['n_i'] = self.ini_index['n_i']
     b_df['idf'] = self.ini_index_bi['idf']
     self.index_bi = b_df
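A hedged aside (not in the original snippet): inverse_transform maps the reduced matrix back through the components, so B above is the rank-num_vec approximation of the tf-matrix; the equivalent manual computation is:

# svd.inverse_transform(B1) is equivalent to B1 @ svd.components_
B_manual = B1.dot(svd.components_)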
Example no. 4
# Load required libraries
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD

# Load data
iris = load_iris()

# Store data
iris_X = iris.data
y = iris.target

# 1. Run singular value decomposition ------------------------------------------------------------

# Create an instance
svd = TruncatedSVD()
vars(svd)

# Fit the model
iris_svd = svd.fit_transform(iris_X)
iris_svd[:5]
vars(svd)

# Explained variance ratio
svd.explained_variance_ratio_

# 2. Compare with principal component analysis ---------------------------------------------------

# Create an instance
pca = PCA(n_components=2)
vars(pca)

# Fit the model
iris_pca = pca.fit_transform(iris_X)
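A hedged comparison sketch (not in the original): PCA centers the data before decomposing, so on mean-centered input TruncatedSVD matches PCA up to per-column sign flips:

import numpy as np

Xc = iris_X - iris_X.mean(axis=0)  # center each feature
iris_svd_c = TruncatedSVD(n_components=2, algorithm='arpack').fit_transform(Xc)
print(np.allclose(np.abs(iris_svd_c), np.abs(iris_pca)))  # True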
Example no. 5
import numpy as np
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD

d = np.array([[4, 2, 1], [10, 8, 3], [3, 9, 4]])
u, s, v = svd(d)
r = 1
new_s = s[0:r]
new_u = u[:, 0:r]
new_v = v[0:r, :]
print(new_v)
print(new_u.dot(np.diag(new_s)))

svd = TruncatedSVD(n_components=r, random_state=42)
p = svd.fit_transform(d)
print(p)
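A hedged sanity check (not in the original): the TruncatedSVD scores should equal the rank-1 scores U_r.dot(diag(s_r)) from the full SVD, up to a sign flip:

print(np.allclose(np.abs(p), np.abs(new_u.dot(np.diag(new_s)))))  # True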
Example no. 6

import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


def tSNE(fileName):
    cnx = sqlite3.connect('data/10Feature.db')
    dforigtrain = pd.read_csv(fileName)
    print(dforigtrain.shape)
    print(dforigtrain.head())
    dforigtrain.rename(
        columns=lambda x: '_'.join([x.strip() for x in x.lower().split()]),
        inplace=True)
    df = dforigtrain[[
        c for c in dforigtrain.columns.values.tolist() if c != 'orig_set'
    ]]
    print(df.shape)

    # Write to DB to allow easier loading later
    df.to_sql('df_clean', cnx, if_exists='replace', index=False)

    df = pd.read_sql('select * from df_clean', cnx)

    print(df.shape)
    scaler = StandardScaler().fit(df.iloc[:, 2:])

    dfs = pd.DataFrame(scaler.transform(df.iloc[:, 2:]),
                       index=df.index,
                       columns=df.columns[2:])

    # Commented part helps in creating SVD
    '''u, s, vt = svd(dfs)

    ax = pd.Series(s).plot(figsize=(10,3), logy=True)

    print('{} SVs are NaN'.format(np.isnan(s).sum()))
    print('{} SVs less than 1e-12'.format(len(s[s < 1e-12])))
    
    plt.show()'''

    # From here the TruncatedSVD. This is mostly helpful for image data sets,
    # where reducing dimensions is usually possible. In our case every feature
    # was contributing, hence little truncation is possible.

    ncomps = 19

    svd = TruncatedSVD(algorithm='randomized', n_components=ncomps)

    # fit_transform both fits the model and projects the data in one pass
    Y = svd.fit_transform(dfs)

    ax = pd.Series(svd.explained_variance_ratio_.cumsum()).plot(
        kind='line', figsize=(10, 3))

    print(
        'Variance preserved by first ' + str(ncomps) +
        ' components == {:.2%}'.format(
            svd.explained_variance_ratio_.cumsum()[-1]))

    plt.show()
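    # A hedged aside (not in the original): the smallest rank preserving, say,
    # 95% of the variance can be read off the cumulative curve directly.
    cum = svd.explained_variance_ratio_.cumsum()
    n95 = int(np.searchsorted(cum, 0.95)) + 1
    print('components needed for 95% variance:', min(n95, ncomps))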

    dfsvd = pd.DataFrame(Y,
                         columns=['c{}'.format(c) for c in range(ncomps)],
                         index=df.index)

    dfsvd.to_sql('df_svd', cnx, if_exists='replace', index=False)

    dfsvd = pd.read_sql('select * from df_svd', cnx)

    print(dfsvd.shape)

    svdcols = [c for c in dfsvd.columns if c[0] == 'c']

    df = pd.read_sql('select * from df_clean', cnx)

    print(dfsvd.shape)

    print(dfsvd.head())

    plotdims = 8
    ploteorows = 1
    dfsvdplot = dfsvd[svdcols].iloc[:, :plotdims]
    dfsvdplot['class'] = df['class']
    #interactive(plot_3d_scatter, A=fixed(dfsvd), elevation=30, azimuth=120)

    ax = sns.pairplot(dfsvdplot.iloc[::ploteorows, :], hue='class', height=1.8)

    plt.show()

    #rowsubset = [10,20,40,80,160,320,640, 1280, 1900]
    tsne = TSNE(n_components=2, random_state=0)
    '''runs = np.empty((len(rowsubset),1))

    for i, rows in enumerate(rowsubset):
        t0 = time()
        Z = tsne.fit_transform(dfsvd.iloc[:rows,:][svdcols])
        runs[i] = time() - t0

    ax = pd.DataFrame(runs, index=rowsubset).plot(kind='bar', logy=False, figsize=(10,4))
    plt.show()
    
    '''
    Z = tsne.fit_transform(dfsvd[svdcols])
    dftsne = pd.DataFrame(Z, columns=['x', 'y'], index=dfsvd.index)
    ax = sns.lmplot(data=dftsne,
                    x='x',
                    y='y',
                    fit_reg=False,
                    height=8,
                    scatter_kws={
                        'alpha': 0.7,
                        's': 60
                    })
    ax.axes.flat[0].set_title(
        'Scatterplot of a 50D dataset reduced to 2D- Unsupervised')

    #plt.show()

    dftsne['class'] = df['class']

    g = sns.lmplot(data=dftsne,
                   x='x',
                   y='y',
                   hue='class',
                   fit_reg=False,
                   height=8,
                   scatter_kws={
                       'alpha': 0.7,
                       's': 60
                   })

    g.axes.flat[0].set_title(
        'Scatterplot of a 50D dataset reduced to 2D -Supervised')

    plt.show()
Example no. 7
from sklearn.decomposition import TruncatedSVD


def svd(data, dim=2):
    # Reduce the input matrix to `dim` components
    svd = TruncatedSVD(n_components=dim, n_iter=7, random_state=42)
    return svd.fit_transform(data)
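A minimal usage sketch (names hypothetical, not from the original source):

import numpy as np

dtm = np.random.rand(50, 300)  # hypothetical document-term matrix
embedding = svd(dtm, dim=2)    # shape (50, 2)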
Example no. 8
File: p.py Project: chengxwcq/ee219
import numpy as np
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD

d = np.array([[4, 2, 1], [10, 8, 3], [3, 9, 4]])
u, s, v = svd(d)
r = 1
new_s = s[0:r]
new_u = u[:, 0:r]
new_v = v[0:r, :]
print(new_v)
print(new_u.dot(np.diag(new_s)))

svd = TruncatedSVD(n_components=r, random_state=42)
p = svd.fit_transform(d)
print(p)



Example no. 9
import random
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import scipy.io as scio
import scipy.ndimage
import sklearn.decomposition as skd
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC

# shuffle_in_unison, project_data, train_svm and train_mlp are helper
# functions defined elsewhere in this project.


def main():
    millis_start = int(round(time.time() * 1000))
    eta = 0.8
    c = 0.5
    epochs = 1
    figure_count = 0

    print("Start", millis_start)
    # load data
    #file_path="/home/fahad/code/ML/MNIST_digit_data.mat";
    file_path = "MNIST_digit_data.mat"
    #file_path=input("Enter MNIST file path: ");
    if not Path(file_path).exists():
        print("File does not exist in current directory")
        file_path = input("Enter MNIST file path: ")

    matdata = scio.loadmat(file_path)
    labels_test = np.array(matdata['labels_test'])
    labels_train = np.array(matdata['labels_train'])
    images_test = np.array(matdata['images_test'])
    images_train = np.array(matdata['images_train'])

    #These are the data sets to be used for everything
    shuff_train_images_1000, shuff_train_labels_1000 = shuffle_in_unison(
        images_train, labels_train, 1000)

    #3. Train SVM
    svm = LinearSVC()
    svm.fit(shuff_train_images_1000, np.ravel(shuff_train_labels_1000))
    #weights=svm.coef_;
    predicted = svm.predict(images_test)
    print("predicted", predicted.shape)
    #print("actual labels",labels_test);
    conf_matrix = np.zeros((10, 10))
    accuracy = 0
    for i in range(predicted.shape[0]):
        conf_matrix[labels_test[i][0]][predicted[i]] += 1
    #cnf_matrix=confusion_matrix(labels_test[0:4], predicted);
    for j in range(10):
        accuracy += conf_matrix[j][j]

    print(np.sum(conf_matrix), " Accuracy:", accuracy / (np.sum(conf_matrix)))

    #4. Access m to get weight values
    #print("weight",svm.coef_.shape);
    confusion_matrix = np.zeros((10, 10))
    #print("c matrix",confusion_matrix.shape);
    idx = 0
    count_wts = 0
    misclassify = 0
    misclassify_records = []
    for row in images_test:
        #print("looping test",idx);
        count_wts = 0
        flag = True
        #weights_missclassified=[];
        prev_wt = 0
        pred_idx = 0
        for weights in svm.coef_:
            #print("looping",count_wts);
            #if(labels_test[idx]==count_wts):

            if (np.dot(weights, row) > prev_wt):
                prev_wt = np.dot(weights, row)
                pred_idx = count_wts
            #weights_missclassified.append(np.dot(weights,row));
            if (np.dot(weights, row) >= 1 and flag):
                #print("labels",labels_test[idx][0]);
                confusion_matrix[labels_test[idx][0]][count_wts] += 1
                flag = False
                #else:
                # confusion_matrix[labels_test[idx][0]][count_wts]+=1;
            if (flag and count_wts == 9):
                #record is misclassified by all
                misclassify += 1
                confusion_matrix[labels_test[idx][0]][pred_idx] += 1
                #misclassify_records.append(row);
            count_wts += 1
        idx += 1
    #print(confusion_matrix);

    #print("total sum",sum(sum(confusion_matrix)));
    accuracy = 0
    for j in range(10):
        accuracy += confusion_matrix[j][j]
    #print("Accuracy:",accuracy/labels_test.shape[0]);

    #5. PCA on training data
    pca = PCA(n_components=50)
    pca.fit(images_train)
    images_train_reduced = pca.transform(images_train)
    #print("shape of PCA",images_train_reduced.shape);

    #manually reduce the dimension using SVD
    mean = np.mean(images_train, axis=1)
    print("mean shape", mean.shape)
    mean_vec = mean.reshape((mean.shape[0], 1))
    #N=images_train.shape[0];

    norm_images_train = images_train - mean_vec

    mean_test = np.mean(images_test, axis=1)
    mean_vec_test = mean_test.reshape((mean_test.shape[0], 1))
    norm_images_test = images_test - mean_vec_test
    print("shape of norm data", norm_images_train.shape)

    svd = skd.TruncatedSVD(n_components=50)
    reduced_images_train = svd.fit_transform(norm_images_train)

    #reduced_images_train=svd.transform(norm_images_train);
    print("reduced", reduced_images_train.shape)
    retrans_images_train = svd.inverse_transform(reduced_images_train)
    print("unreduced", retrans_images_train.shape)
    #Now compare to original data -- How??
    #pick random idxes for 20 images and plot alongside
    indexes = random.sample(range(0, norm_images_train.shape[0]), 20)
    indexes = sorted(indexes)
    #print("indexes",indexes);
    fig_num = 1
    figure_count += 1
    plt.figure(figure_count)
    for i in range(0, 2):
        #figure_count+=1;
        for idxs in indexes:
            plt.subplot(8, 5, fig_num)
            plt.axis('off')
            if (i == 0):
                #print("which idx",idxs);
                imag = norm_images_train[idxs, :]
            else:
                #print("trans",idxs);
                imag = retrans_images_train[idxs, :]
            rot_img = np.fliplr(np.array(imag).reshape((28, 28)))
            final_rot_img = scipy.ndimage.rotate(rot_img, 90)
            plt.imshow(final_rot_img, cmap='gray', interpolation='nearest')
            #plt.tight_layout();
            fig_num += 1
        #plt.text(0,0,actual_string,fontsize=7);

    plt.savefig("comparison-20.png")

    vecs_eig, list_mse, list_dim = project_data(norm_images_train, 500)
    #plot mses
    figure_count += 1
    plt.figure(figure_count)
    plt.ylabel("Mean Square Errors")
    plt.xlabel("Dimensions")
    plt.title("MSE across reduced dimensions")
    plt.grid(True)
    #plt.ylim((0,1));
    plt.plot(list_dim, list_mse, 'xb-')
    plt.savefig("hw4-5-mse.png")

    #Display eigs
    fig_num = 1
    figure_count += 1
    plt.figure(figure_count)
    for i in range(vecs_eig.shape[0]):
        row = vecs_eig[i, :]
        plt.subplot(2, 5, fig_num)
        plt.axis('off')
        rot_img = np.fliplr(vecs_eig[i, :].reshape((28, 28)))
        final_rot_img = scipy.ndimage.rotate(rot_img, 90)
        plt.imshow(final_rot_img, cmap='gray', interpolation='nearest')
        fig_num += 1
    plt.savefig("hw4-5-eigs.png")
    #return;

    #6. project and train - should i be using normed data?
    dimension_list = [
        2, 5, 10, 20, 30, 50, 70, 100, 150, 200, 250, 300, 400, 500, 784
    ]
    #dimension_list=[400,500,748];
    #print("dim length", len(dimension_list));
    dim_acc_svm = np.empty((len(dimension_list), 2))
    dim_acc_mlp = np.empty((len(dimension_list), 2))
    counter = 0
    for dims in dimension_list:
        accu = train_svm(norm_images_train[:10000], labels_train[:10000],
                         norm_images_test[:3000], labels_test[:3000], dims)
        accu_mlp = train_mlp(norm_images_train[:10000], labels_train[:10000],
                             norm_images_test[:3000], labels_test[:3000], dims)
        dim_acc_svm[counter, 0] = dims
        dim_acc_svm[counter, 1] = accu
        dim_acc_mlp[counter, 0] = dims
        dim_acc_mlp[counter, 1] = accu_mlp
        counter += 1

    #print("dimension",dim_acc_svm);
    #print("dim mlp",dim_acc_mlp);
    figure_count += 1
    plt.figure(figure_count)
    plt.ylabel("Accuracy")
    plt.xlabel("Dimensions")
    plt.title("Distribution of accuracy across dimensions")
    plt.grid(True)
    plt.ylim((0, 1))
    plt.plot(dim_acc_svm[:, 0], dim_acc_svm[:, 1], 'xb-')
    plt.savefig("hw4-7.png")
    #return;
    #8. implementaion of neural network
    figure_count += 1
    plt.figure(figure_count)
    plt.ylabel("Accuracy")
    plt.xlabel("Dimensions")
    plt.title("Distribution of accuracy across dimensions")
    plt.grid(True)
    plt.ylim((0, 1))
    plt.plot(dim_acc_mlp[:, 0], dim_acc_mlp[:, 1], 'xb-')
    plt.savefig("hw4-8.png")