Code Example #1
def nmf_filter(field, nmodes, return_filter=False, **kwargs_nmf):
    """
    Apply a Non-Negative Matrix Factorisation (NMF) filter to a field. This 
    finds two non-negative matrices whose product approximates the (strictly 
    non-negative) input signal. 

    Uses `sklearn.decomposition.NMF`. For more details, see:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

    Parameters:
        field (array_like):
            3D array containing the field that the filter will be applied to. 
            NOTE: This assumes that the 3rd axis of the array is frequency.

        nmodes (int):
            Number of NMF modes (components) to fit and subtract.

        return_filter (bool, optional):
            Whether to also return the fitted NMF transformer object.

        **kwargs_nmf (dict, optional):
            Keyword arguments for the `sklearn.decomposition.NMF`

    Returns:
        cleaned_field (array_like), transformer (sklearn.decomposition.NMF instance, optional): Foreground-filtered field and NMF filter object.
        
        - ``cleaned_field (array_like)``:
            Foreground-cleaned field.

        - ``transformer (sklearn.decomposition.NMF instance, optional)``:
            Contains the NMF filter. Only returned if `return_filter=True`. 
            To get the foreground model, you can do the following: 
                ```
                x = field - mean_field # shape (Npix, Nfreq)
                x_trans = transformer.fit_transform(x.T) # mode amplitudes per pixel
                x_fg = transformer.inverse_transform(x_trans).T # foreground model
                ```
    """
    # Reshape the field into a 2D (Nfreqs, Npix) data matrix
    d = field.reshape((-1, field.shape[-1])).T  # (Nfreqs, Nxpix * Nypix)

    # Calculate average spectrum (avg. over pixels, as a function of frequency)
    d_mean = np.mean(d, axis=-1)[:, np.newaxis]
    x = d  # NOTE: the mean spectrum is not subtracted here, since NMF requires non-negative input

    # Build NMF model and get amplitudes for each mode per pixel
    transformer = NMF(n_components=nmodes, **kwargs_nmf)
    x_trans = transformer.fit_transform(x.T)

    # Construct the foreground model
    x_fg = transformer.inverse_transform(x_trans).T

    # Subtract the foreground model and restore the original field shape
    x_clean = (x - x_fg).T.reshape(field.shape)

    # Return FG-subtracted data (and, optionally, the NMF filter instance)
    if return_filter:
        return x_clean, transformer
    else:
        return x_clean
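A minimal usage sketch for the `nmf_filter` function above. The synthetic field, its shape, and the keyword arguments are made up for illustration, and `numpy` / `sklearn.decomposition.NMF` are assumed to be importable:

```python
import numpy as np
from sklearn.decomposition import NMF  # nmf_filter expects NMF to be in scope

# Hypothetical non-negative field: 16 x 16 pixels, 32 frequency channels (3rd axis = frequency)
rng = np.random.default_rng(0)
field = np.abs(rng.normal(loc=10.0, scale=1.0, size=(16, 16, 32)))

# Remove 3 NMF modes and also return the fitted transformer
cleaned, transformer = nmf_filter(field, nmodes=3, return_filter=True, max_iter=500)

print(cleaned.shape)                  # (16, 16, 32), same shape as the input
print(transformer.components_.shape)  # (3, 32): one spectral template per mode
```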
Code Example #2
def test_perfect_separation_of_latents():
    latents = data()
    W, H, mapping = generate_w_h(latents, n_users=100)
    nmf = NMF(solver='mu', init='custom', n_components=3)
    nmf.components_ = H
    nmf.n_components_ = H.shape[0]
    X = nmf.inverse_transform(W)
Code Example #3
def nmf(data, n_components, norm=True, plot=False):
    """
    Computes Non-Negative Matrix Factorization.
    :param data:
    :param n_components:
        Number of components.
    :param norm:
        If normalize by MinMaxScaler returned components
    :param plot:
        If plot. Default False.
    :return:
    """
    if norm:
        data = MinMaxScaler().fit_transform(data)

    nmf = NMF(n_components=n_components)
    c = nmf.fit_transform(data)

    predictions = nmf.inverse_transform(c)
    explained_variance = explained_variance_score(data, predictions)

    if norm:
        c = MinMaxScaler().fit_transform(c)

    if plot:
        for i in range(0, c.shape[1]):
            plt.plot(c[:, i], label='%s_%s' % ("NMF", i))

    return nmf, c, explained_variance
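A short usage sketch for the `nmf` helper above; the input matrix and component count are hypothetical, and the helper's own imports (`NMF`, `MinMaxScaler`, `explained_variance_score`, `plt`) are assumed to be present in its module:

```python
import numpy as np

# Hypothetical input: 200 samples x 40 features (MinMaxScaler brings them into [0, 1])
rng = np.random.default_rng(42)
data = np.abs(rng.normal(size=(200, 40)))

model, components, ev = nmf(data, n_components=5, norm=True, plot=False)

print(components.shape)  # (200, 5): per-sample activations of the 5 components
print(ev)                # explained variance of the rank-5 reconstruction
```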
Code Example #4
File: NMF.py Project: shabiouyang/ALF
def main():
    train_data, train_length = get_train_data()
    test_data, test_length, width, height = get_test_data()

    model = NMF(n_components=5, init='random', random_state=0)
    W = model.fit_transform(train_data)
    H = model.components_
    compressed_images = model.transform(test_data)
    output_images = model.inverse_transform(compressed_images)

    output_length = len(output_images)
    rgb_length = int(output_length / 3)
    reconstruct_subimages = np.zeros([height * width, 25, 3], dtype=np.float32)
    for channels in range(3):
        reconstruct_subimages[:, :, int(channels)] = output_images[(
            rgb_length * channels):(rgb_length * (channels + 1)), :]
    all_image_rec = np.zeros([25, height, width, 3], dtype=np.float32)
    for x in range(width):
        for y in range(height):
            all_image_rec[:, y,
                          x, :] = reconstruct_subimages[y * width + x, :] * 255
    for numbers in range(25):
        all_image_rec[numbers, :, :, :] = cv2.cvtColor(
            all_image_rec[numbers, :, :, :], cv2.COLOR_BGR2RGB)
        cv2.imwrite("./Reconstruct/" + str(numbers + 1) + "_" + "NMF" + ".png",
                    all_image_rec[numbers, :, :, :])
Code Example #5
def nmf_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          setting='eval'):
    """
    nmf_ratings Predicates
    """

    nmf_model = NMF(n_components=50)
    observed_user_item_matrix = observed_ratings_df.loc[:, 'rating'].unstack(
        fill_value=0.5)
    truth_user_item_matrix = truth_ratings_df.loc[:, 'rating'].unstack()

    transformed_matrix = nmf_model.fit_transform(observed_user_item_matrix)
    predictions = pd.DataFrame(nmf_model.inverse_transform(transformed_matrix),
                               index=observed_user_item_matrix.index,
                               columns=observed_user_item_matrix.columns)

    # make predictions for the user item pairs in the truth frame
    predictions = predictions.reindex(truth_user_item_matrix.index,
                                      columns=truth_user_item_matrix.columns,
                                      fill_value=0.5).stack()

    predictions = predictions.clip(0, 1)

    write(predictions, 'nmf_rating_obs', fold, setting)
Code Example #6
    def test(cls, csv, K=3, dr='PCA'):
        '''
        csv - A csv file without header.
        '''

        from sklearn.decomposition import PCA, NMF
        from sklearn.random_projection import GaussianRandomProjection
        from sklearn.manifold import MDS, TSNE
        from sklearn.cluster import KMeans
        from sklearn.preprocessing import OneHotEncoder

        X = pd.read_csv(csv, header=None).values
        Z = None
        Xr = None

        if (dr == 'PCA'):
            pca = PCA(n_components=K)  # keep the first K components
            pca.fit(X)
            Z = pca.transform(X)
            Xr = pca.inverse_transform(Z)
        elif (dr == 'NMF'):
            # make sure X is non-negative
            Xmin = np.min(X)
            if (Xmin < 0):
                X = X - Xmin

            nmf = NMF(n_components=K)  # keep the first K components
            nmf.fit(X)
            Z = nmf.transform(X)
            Xr = nmf.inverse_transform(Z)

            if (Xmin < 0):
                Xr = Xr + Xmin
        elif (dr == 'RP'):
            grp = GaussianRandomProjection(
                n_components=K)  # keep the first K components
            Z = grp.fit_transform(X)
        elif (dr == 'VQ'):
            kmeans = KMeans(n_clusters=K).fit(X)
            Xvq = kmeans.predict(X)
            H = kmeans.cluster_centers_
            ohe = OneHotEncoder()
            Z = ohe.fit_transform(Xvq.reshape(-1, 1)).A
            Xr = Z @ H
        elif (dr == 'MDS'):
            mds = MDS(n_components=K)  # keep the first K components
            Z = mds.fit_transform(X)
        elif (dr == 'TSNE'):
            tsne = TSNE(n_components=K)  # keep the first K components
            Z = tsne.fit_transform(X)
        elif (dr == 'IDENTITY'):
            # for this case, k is not used.
            Z = X
            Xr = X
        else:
            raise Exception("Invalid DR name")

        return cls(X, Z, Xr)
Code Example #7
File: test_nmf.py Project: kjacks21/scikit-learn
def test_nmf_inverse_transform(solver):
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(solver=solver, n_components=4, init='random', random_state=0,
            max_iter=1000)
    ft = m.fit_transform(A)
    A_new = m.inverse_transform(ft)
    assert_array_almost_equal(A, A_new, decimal=2)
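These tests work because, for `sklearn.decomposition.NMF`, `inverse_transform(W)` is simply the product of the transformed data with the learned dictionary. A small sketch of that equivalence (random data, not tied to any of the projects above):

```python
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.RandomState(0)
A = np.abs(rng.randn(6, 4))

m = NMF(n_components=4, init='random', random_state=0, max_iter=1000)
W = m.fit_transform(A)

# inverse_transform(W) reconstructs the data as W @ H, where H = m.components_
assert np.allclose(m.inverse_transform(W), W @ m.components_)
```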
Code Example #8
File: test_nmf.py Project: AnAnteup/icp4
def test_nmf_inverse_transform(solver):
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(solver=solver, n_components=4, init='random', random_state=0,
            max_iter=1000)
    ft = m.fit_transform(A)
    A_new = m.inverse_transform(ft)
    assert_array_almost_equal(A, A_new, decimal=2)
Code Example #9
File: test_nmf.py Project: jnothman/scikit-learn
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    m = NMF(n_components=4, init="random", random_state=0)
    m.fit_transform(A)
    t = m.transform(A)
    A_new = m.inverse_transform(t)
    assert_array_almost_equal(A, A_new, decimal=2)
Code Example #10
def test_nmf_inverse_transform():
    # Test that NMF.inverse_transform returns close values
    random_state = np.random.RandomState(0)
    A = np.abs(random_state.randn(6, 4))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='random', random_state=0)
        m.fit_transform(A)
        t = m.transform(A)
        A_new = m.inverse_transform(t)
        assert_array_almost_equal(A, A_new, decimal=2)
Code Example #11
def test_nmf_on_masked_ratings():
    _matrix_1 = nan_masked_from_ratings(
        [Rating(0, 3, 1.0),
         Rating(5, 3, 0.5),
         Rating(5, 8, 1.0)],
        rows=10,
        columns=10)
    _nmf = NMF(solver='mu', init='random', n_components=2)
    W = _nmf.fit_transform(_matrix_1)
    X = _nmf.inverse_transform(W)
    assert np.shape(X) == np.shape(_matrix_1)
    recommended = np.argsort(X[0])
    assert recommended[-1] == 8
    assert recommended[-2] == 3
Code Example #12
def generate_array_data_file():
    latents = data()
    # W = np.zeros(shape=(n_users + 1, n_components))
    # H = np.zeros(shape=(n_components, n_items))
    n_users = 100
    W, H, mapping = generate_w_h(latents,
                                 n_users=n_users,
                                 use_random=True,
                                 sigma=.5,
                                 mean=-1.5)
    nmf = NMF(solver='mu', init='custom', n_components=3)
    nmf.components_ = H
    nmf.n_components_ = H.shape[0]
    X = nmf.inverse_transform(W)
    # returns a shape of n_users+1. Wipe these ratings since they are not yet determined.
    X[n_users, :] = np.nan
    file_contents = dedent('''
namespace MonsterMatch.CollaborativeFiltering
{
    public static class MonsterMatchArrayData
    {
        // @formatter:off
        public const int UserCount = %d;
        public const int ItemCount = %d;
        public const int PlayerUserId = %d;
        public const int FactorCount = %d;
        public static readonly int[] ForProfiles = %s;
        public static readonly double[,] Data = %s;
        public static readonly double[,] Pu = %s;
        public static readonly double[,] Qi = %s;
        // @formatter:on
    }
}
    ''')
    file_contents = file_contents % (
        n_users + 1, len(latents), n_users, nmf.n_components_, 'new [] {' +
        ','.join([str(profile.index)
                  for profile in latents]) + '}', csharp_repr_ndarray(X),
        csharp_repr_ndarray(W), csharp_repr_ndarray(H.T))
    print(file_contents)
Code Example #14
def compute_scores(X):
    pca = PCA(svd_solver='auto')
    kpca = KernelPCA(fit_inverse_transform=True)
    ica = FastICA()
    nmf = NMF(init='nndsvda')
    pca_scores, ica_scores, nmf_scores, kpca_scores = [], [], [], []
    for n in n_components:
        pca.n_components = n
        ica.n_components = n
        nmf.n_components = n
        kpca.n_components = n
        print(n)

        Xpca = pca.inverse_transform(pca.fit_transform(Xs))
        pca_scores.append(explained_variance_score(Xs, Xpca))
        Xica = ica.inverse_transform(ica.fit_transform(Xs))
        ica_scores.append(explained_variance_score(Xs, Xica))
        Xkpca = kpca.inverse_transform(kpca.fit_transform(Xs))
        kpca_scores.append(explained_variance_score(Xs, Xkpca))

        Xnmf = nmf.inverse_transform(nmf.fit_transform(X))
        nmf_scores.append(explained_variance_score(X, Xnmf))

    return pca_scores, ica_scores, nmf_scores, kpca_scores
Code Example #15
File: noise_filter_all_runs.py Project: dermen/loki
            new_cpsi = np.linspace(np.max( (exp_cpsi2.min(),sim_cpsi2.min()) )+0.05,
                                  np.min((exp_cpsi2.max(), sim_cpsi2.max()))-0.05,
                                  interp_num_phi,endpoint=False )
            interp_cpsi[qidx] = new_cpsi

            interp_X = interp_shots(norm_X2, interp_num_phi, sim_cpsi2, new_cpsi)
            interp_pro = interp_shots(norm_GDPpro2, interp_num_phi, exp_cpsi2, new_cpsi)
            interp_buf = interp_shots(norm_buf2, interp_num_phi, exp_cpsi2, new_cpsi)

            # transform and inverse transform
            model = NMF(n_components=10,solver='cd')
            W=model.fit_transform(interp_X)
            H=model.components_

            new_buf = model.transform(interp_buf)
            new_pro = model.transform(interp_pro)
    
            inverse_diff = model.inverse_transform(new_pro-new_buf)

            # average and error estimate

            pro[qidx] = inverse_diff.mean(0)
            err[qidx] = inverse_diff.std(0)/np.sqrt(inverse_diff.shape[0])

        grp.create_dataset('ave_cor',data=pro)
        grp.create_dataset('err',data=err)
        grp.create_dataset('num_shots',data=inverse_diff.shape[0])
        grp.create_dataset('interp_cpsi', data = interp_cpsi)
        grp.create_dataset('nnmf_n_components', data = model.n_components)

Code Example #16
# W = samples x n_components (component mapping for each sample)

fig = plt.figure(figsize=(8, 8))
rows = min(int(np.sqrt(train_samples)), 10)
columns = min(int(train_samples / rows), 10)
for i in range(0, columns * rows):
    img = W[i].reshape(
        (int(np.sqrt(nmf_components)), int(np.sqrt(nmf_components))))
    fig.add_subplot(rows, columns, i + 1)
    plt.axis('off')
    plt.imshow(img)
plt.show()

# ===============================================
# Reconstruction
# ===============================================

output = nmf.inverse_transform(W)

fig = plt.figure(figsize=(8, 8))
rows = min(int(np.sqrt(train_samples)), 10)
columns = min(int(train_samples / rows), 10)
for i in range(0, columns * rows):
    img = output[i].reshape((28, 28))
    fig.add_subplot(rows, columns, i + 1)
    plt.axis('off')
    plt.imshow(img)
plt.show()

# ===============================================
Code Example #17
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
import onnx
from skl2onnx.algebra.onnx_ops import (OnnxArrayFeatureExtractor, OnnxMul,
                                       OnnxReduceSum)
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession

mat = np.array(
    [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]],
    dtype=np.float64)
mat[:mat.shape[1], :] += np.identity(mat.shape[1])

mod = NMF(n_components=2)
W = mod.fit_transform(mat)
H = mod.components_
pred = mod.inverse_transform(W)

print("original predictions")
exp = []
for i in range(mat.shape[0]):
    for j in range(mat.shape[1]):
        exp.append((i, j, pred[i, j]))

print(exp)

#######################
# Let's rewrite the prediction in a way it is closer
# to the function we need to convert into ONNX.


def predict(W, H, row_index, col_index):
    return np.dot(W[row_index, :], H[:, col_index])
Code Example #18
class Operation:
    contentStrengthFramework = ''  #used to store Pandas Framework corresponding to each content popularity
    contentStrengthRegionAndCountryWise = ''  #used to store Pandas Framework corresponding to each content popularity specific to their region
    personCountryAndRegionDict = {}
    ratingsDf = ''
    contentBasedSimilarity = {}
    itemList = []
    nmfmodel = ''

    def __init__(self):
        pass

    def setItemList(self, itemList):
        self.itemList = itemList

    def setContentBasedSimilarity(self, contentBasedSimilarity):
        self.contentBasedSimilarity = contentBasedSimilarity

    def performanceMatrixFormation(self, userContentInteractionFramework):
        self.contentStrengthFramework = userContentInteractionFramework.groupby(
            'contentId')['eventStrength'].sum().sort_values(
                ascending=False).reset_index()
        self.contentStrengthRegionAndCountryWise = userContentInteractionFramework.groupby(
            ['userCountry', 'userRegion',
             'contentId'])['eventStrength'].sum().sort_values(
                 ascending=False).reset_index()

    def personDictFormation(self, userContentInteractionFramework):
        for i in range(len(userContentInteractionFramework)):
            try:
                personId = userContentInteractionFramework.loc[i]['personId']
                # print(personId)
                if personId not in self.personCountryAndRegionDict:
                    # print(personId,"first condition")
                    self.personCountryAndRegionDict[personId] = {}
                    self.personCountryAndRegionDict[personId]['Country'] = ''
                    self.personCountryAndRegionDict[personId]['Region'] = ''

                if type(userContentInteractionFramework.loc[i]
                        ['userCountry']) != float:
                    # print(personId,"second condition")
                    self.personCountryAndRegionDict[personId][
                        'Country'] = userContentInteractionFramework.loc[i][
                            'userCountry']

                if type(userContentInteractionFramework.loc[i]
                        ['userRegion']) != float:
                    # print(personId,"third condition")
                    self.personCountryAndRegionDict[personId][
                        'Region'] = userContentInteractionFramework.loc[i][
                            'userRegion']
            except:
                print("error occured at i th iteration ", i)

    def matrixFactorization(self, userContentInteractionFrameworkTrain,
                            numOfFactor):
        matrixDF = userContentInteractionFrameworkTrain.pivot(
            index='personId', columns='contentId',
            values='eventStrength').fillna(0)
        matrix = matrixDF.as_matrix()
        u, sigma, vT = svds(matrix, k=numOfFactor)
        sigma = numpy.diag(sigma)
        predictedMatrix = numpy.dot(numpy.dot(u, sigma), vT)
        self.ratingsDf = pd.DataFrame(predictedMatrix,
                                      columns=matrixDF.columns,
                                      index=list(matrixDF.index)).transpose()

    def matrixFactorizationCluster(self,
                                   userContentInteractionFrameworkTrain,
                                   numOfFactor,
                                   k=2):
        matrixDF = userContentInteractionFrameworkTrain.pivot(
            index='personId', columns='contentId',
            values='eventStrength').fillna(0)
        matrix = matrixDF.as_matrix()
        a = []
        b = []
        kmeans = KMeans(n_clusters=k, random_state=0).fit(matrix)
        for labels in kmeans.labels_:
            if labels == 1:
                b.append(labels)
            else:
                a.append(labels)
        u, sigma, vT = svds(matrix[a], k=numOfFactor)
        sigma = numpy.diag(sigma)
        predictedMatrixA = numpy.dot(numpy.dot(u, sigma), vT)
        u, sigma, vT = svds(matrix[b], k=numOfFactor)
        sigma = numpy.diag(sigma)
        predictedMatrixB = numpy.dot(numpy.dot(u, sigma), vT)
        predictedMatrix = numpy.zeros(shape=(matrix.shape))
        predictedMatrix[a] = predictedMatrixA
        predictedMatrix[b] = predictedMatrixB
        self.ratingsDfCluster = pd.DataFrame(predictedMatrix,
                                             columns=matrixDF.columns,
                                             index=list(
                                                 matrixDF.index)).transpose()

    def matrixFactorizationNMF(self, userContentInteractionFrameworkTrain,
                               numOfFactor):
        matrixDF = userContentInteractionFrameworkTrain.pivot(
            index='personId', columns='contentId',
            values='eventStrength').fillna(0)
        matrix = matrixDF.as_matrix()
        self.nmfmodel = NMF(n_components=numOfFactor)
        W = self.nmfmodel.fit_transform(matrix)
        self.matrixnmf = self.nmfmodel.inverse_transform(W)
        self.matrixnmf = pd.DataFrame(self.matrixnmf,
                                      columns=matrixDF.columns,
                                      index=list(matrixDF.index)).transpose()

    def recommendation(self,
                       personId,
                       userContentInteractionFrameworkTrain,
                       topk,
                       isRegionMatter=True):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        # print(topk)
        # personId = int(personId)
        if personId in self.personCountryAndRegionDict and isRegionMatter:
            if self.personCountryAndRegionDict[personId]['Region'] != '':
                contentStrengthRegionAndCountryWise = self.contentStrengthRegionAndCountryWise[
                    -self.contentStrengthRegionAndCountryWise.contentId.
                    isin(contentList)]
                recommendedRegionWise = contentStrengthRegionAndCountryWise[
                    contentStrengthRegionAndCountryWise['userRegion'] ==
                    self.personCountryAndRegionDict[personId]
                    ['Region']]['contentId'].head(curk).tolist()
                recommended.extend(list(set(recommendedRegionWise)))
                curk = topk - len(recommended)
                # print('first condition',len(recommended),len(list(set(recommendedRegionWise))),curk,topk)
            if curk <= 0:
                return recommended
            contentList.extend(recommended)

            if self.personCountryAndRegionDict[personId]['Country'] != '':
                contentStrengthRegionAndCountryWise = self.contentStrengthRegionAndCountryWise[
                    -self.contentStrengthRegionAndCountryWise.contentId.
                    isin(contentList)]
                recommendedCountryWise = contentStrengthRegionAndCountryWise[
                    contentStrengthRegionAndCountryWise['userCountry'] ==
                    self.personCountryAndRegionDict[personId]
                    ['Country']]['contentId'].head(curk).tolist()
                recommendedCountryWise = list(set(recommendedCountryWise))
                recommended.extend(recommendedCountryWise)
                contentList.extend(recommendedCountryWise)
                # print('second condition',len(recommended),len(list(set(recommendedCountryWise))),curk)
                curk = topk - len(recommended)
            if curk <= 0:
                return recommended
        # else:
        #     print(personId,"not in dict")
        while (curk > 0):
            recommendedBasedOnPopularity = self.contentStrengthFramework[
                -self.contentStrengthFramework.contentId.
                isin(contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnPopularity = list(
                set(recommendedBasedOnPopularity))
            contentList.extend(recommendedBasedOnPopularity)
            recommended.extend(recommendedBasedOnPopularity)
            curk -= len(recommendedBasedOnPopularity)
            # print('third condition',len(recommended),len(list(set(recommendedBasedOnPopularity))),curk)
        return recommended

    def cfRecommendation(self, personId, userContentInteractionFrameworkTrain,
                         topk):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        # print(topk)
        # personId = int(personId)
        while (curk > 0):
            ratings = self.ratingsDf[personId].sort_values(
                ascending=False).reset_index()
            recommendedBasedOnMatrix = ratings[-ratings.contentId.isin(
                contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnMatrix = list(set(recommendedBasedOnMatrix))
            contentList.extend(recommendedBasedOnMatrix)
            recommended.extend(recommendedBasedOnMatrix)
            curk -= len(recommendedBasedOnMatrix)
            # print('third condition',len(recommended),len(list(set(recommendedBasedOnPopularity))),curk)
        return recommended

    def cfRecommendationCluster(self, personId,
                                userContentInteractionFrameworkTrain, topk):

        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        # print(topk)
        # personId = int(personId)
        while (curk > 0):
            ratings = self.ratingsDfCluster[personId].sort_values(
                ascending=False).reset_index()
            recommendedBasedOnMatrix = ratings[-ratings.contentId.isin(
                contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnMatrix = list(set(recommendedBasedOnMatrix))
            contentList.extend(recommendedBasedOnMatrix)
            recommended.extend(recommendedBasedOnMatrix)
            curk -= len(recommendedBasedOnMatrix)
            # print('third condition',len(recommended),len(list(set(recommendedBasedOnPopularity))),curk)
        return recommended

    def cfRecommendationNMF(self, personId,
                            userContentInteractionFrameworkTrain, topk):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        # print(topk)
        # personId = int(personId)
        while (curk > 0):
            ratings = self.matrixnmf[personId].sort_values(
                ascending=False).reset_index()
            recommendedBasedOnMatrix = ratings[-ratings.contentId.isin(
                contentList)]['contentId'].head(curk).tolist()
            recommendedBasedOnMatrix = list(set(recommendedBasedOnMatrix))
            contentList.extend(recommendedBasedOnMatrix)
            recommended.extend(recommendedBasedOnMatrix)
            curk -= len(recommendedBasedOnMatrix)
            # print('third condition',len(recommended),len(list(set(recommendedBasedOnPopularity))),curk)
        return recommended

    def evaluation(self,
                   userContentInteractionFrameworkTrain,
                   userContentInteractionFrameworkTest,
                   topk,
                   isRegionMatter=True,
                   recommendartionType=1):
        toreturn = {}
        count = 0
        for idx, i in enumerate(
                list(userContentInteractionFrameworkTest.index.unique().values)
        ):
            personId = userContentInteractionFrameworkTest.loc[i]['personId']
            if recommendartionType == 1:
                predicted = self.recommendation(
                    personId, userContentInteractionFrameworkTrain, topk,
                    isRegionMatter)
            elif recommendartionType == 2:
                predicted = self.cfRecommendationNMF(
                    personId, userContentInteractionFrameworkTrain, topk)
            elif recommendartionType == 3:
                predicted = self.cfRecommendationCluster(
                    personId, userContentInteractionFrameworkTrain, topk)
            else:
                predicted = self.cfRecommendation(
                    personId, userContentInteractionFrameworkTrain, topk)
            actual = userContentInteractionFrameworkTest[
                userContentInteractionFrameworkTest['personId'] ==
                personId]['contentId'].tolist()
            actual = list(set(actual))
            toreturn[personId] = {}
            toreturn[personId]['predicted'] = predicted
            toreturn[personId]['actual'] = actual
            denom = topk
            if len(actual) < topk:
                denom = len(actual)
            numer = len(list(set(actual).intersection(predicted)))
            recall = numer / float(denom)
            toreturn[personId]['recall'] = recall
            toreturn[personId]['numerator'] = numer
            toreturn[personId]['denominator'] = denom
            # print(personId,recall)
            if personId not in self.personCountryAndRegionDict:
                count += 1
            # break
        print(
            count,
            len(list(
                userContentInteractionFrameworkTest.index.unique().values)))
        return toreturn

    def contentBasedRecommendation(self, personId,
                                   userContentInteractionFrameworkTrain, topk):
        contentList = userContentInteractionFrameworkTrain[
            userContentInteractionFrameworkTrain['personId'] ==
            personId]['contentId'].tolist()
        recommended = []
        curk = topk
        # print(topk)
        # personId = int(personId)
        ratings = self.contentBasedSimilarity[personId]
        recommended = [
            x for _, x in sorted(zip(ratings, self.itemList), reverse=True)
        ]
        # print('third condition',len(recommended),len(list(set(recommendedBasedOnPopularity))),curk)
        return recommended[:topk]

    def intersectionAmongList(self, listOfLists):
        toreturn = listOfLists[0]
        for lol in listOfLists:
            toreturn = list(set(toreturn).intersection(lol))
        return toreturn

    # weights = [CFbased, contentBased, Country And Region wise popularityBased, popularityBased]
    def hybridModelEvaluation(self,
                              userContentInteractionFrameworkTrain,
                              userContentInteractionFrameworkTest,
                              topk,
                              weights=[5, 2, 3, 0]):
        toreturn = {}
        count = 0
        for idx, i in enumerate(
                list(userContentInteractionFrameworkTest.index.unique().values)
        ):
            personId = userContentInteractionFrameworkTest.loc[i]['personId']
            predicted3 = self.recommendation(
                personId, userContentInteractionFrameworkTrain, topk, True)
            predicted4 = self.recommendation(
                personId, userContentInteractionFrameworkTrain, topk, False)
            predicted1 = self.cfRecommendation(
                personId, userContentInteractionFrameworkTrain, topk)
            try:
                predicted2 = self.contentBasedRecommendation(
                    personId, userContentInteractionFrameworkTrain, topk)
            except:
                predicted2 = []
            if len(predicted2) == 0:
                predicted = self.intersectionAmongList(
                    [predicted1, predicted3, predicted4])
            else:
                predicted = self.intersectionAmongList(
                    [predicted1, predicted2, predicted3, predicted4])
            predicted1 = list(set(predicted1) - set(predicted))
            predicted2 = list(set(predicted2) - set(predicted))
            predicted3 = list(set(predicted3) - set(predicted))
            predicted4 = list(set(predicted4) - set(predicted))

            if len(predicted) < topk:
                if len(predicted2) == 0:
                    predicted.extend(
                        self.intersectionAmongList([predicted1, predicted3]))
                else:
                    predicted.extend(
                        self.intersectionAmongList([predicted1, predicted3]))
                predicted1 = list(set(predicted1) - set(predicted))
                predicted2 = list(set(predicted2) - set(predicted))
                predicted3 = list(set(predicted3) - set(predicted))
                predicted4 = list(set(predicted4) - set(predicted))
                if len(predicted) < topk:
                    remain = topk - len(predicted)
                    for w, lol in zip(
                            weights,
                        [predicted1, predicted2, predicted3, predicted4]):
                        if remain <= 0:
                            predicted = predicted[:topk]
                        else:
                            res = math.ceil(0.1 * w * len(lol))
                            predicted.extend(lol[:res])
                            remain = topk - len(predicted)
                else:
                    predicted = predicted[:topk]
            else:
                predicted = predicted[:topk]

            actual = userContentInteractionFrameworkTest[
                userContentInteractionFrameworkTest['personId'] ==
                personId]['contentId'].tolist()
            actual = list(set(actual))
            toreturn[personId] = {}
            toreturn[personId]['predicted'] = predicted
            toreturn[personId]['actual'] = actual
            denom = topk
            if len(actual) < topk:
                denom = len(actual)
            numer = len(list(set(actual).intersection(predicted)))
            recall = numer / float(denom)
            toreturn[personId]['recall'] = recall
            toreturn[personId]['numerator'] = numer
            toreturn[personId]['denominator'] = denom
            # print(personId,recall)
            if personId not in self.personCountryAndRegionDict:
                count += 1
            # break
        print(
            count,
            len(list(
                userContentInteractionFrameworkTest.index.unique().values)))
        return toreturn

    # weights = [CFbased, contentBased, Country And Region wise popularityBased, popularityBased]
    def latestHybridModelEvaluation(self,
                                    userContentInteractionFrameworkTrain,
                                    userContentInteractionFrameworkTest,
                                    topk,
                                    weights=[5, 5]):
        toreturn = {}
        count = 0
        for idx, i in enumerate(
                list(userContentInteractionFrameworkTest.index.unique().values)
        ):
            personId = userContentInteractionFrameworkTest.loc[i]['personId']
            # predicted2 = self.recommendation(personId, userContentInteractionFrameworkTrain, topk*2, True)
            predicted1 = self.cfRecommendation(
                personId, userContentInteractionFrameworkTrain, topk * 2)
            predicted2 = self.cfRecommendationNMF(
                personId, userContentInteractionFrameworkTrain, topk * 2)
            predicted = self.intersectionAmongList([predicted1, predicted2])
            predicted1 = list(set(predicted1) - set(predicted))
            predicted2 = list(set(predicted2) - set(predicted))

            if len(predicted) < topk:
                for w, lol in zip(weights, [predicted1, predicted2]):
                    res = math.ceil(0.1 * w * len(lol))
                    predicted.extend(lol[:res])
                    remain = topk - len(predicted)
                    if remain <= 0:
                        predicted = predicted[:topk]
            else:
                predicted = predicted[:topk]

            actual = userContentInteractionFrameworkTest[
                userContentInteractionFrameworkTest['personId'] ==
                personId]['contentId'].tolist()
            actual = list(set(actual))
            toreturn[personId] = {}
            toreturn[personId]['predicted'] = predicted
            toreturn[personId]['actual'] = actual
            denom = topk
            if len(actual) < topk:
                denom = len(actual)
            numer = len(list(set(actual).intersection(predicted)))
            recall = numer / float(denom)
            toreturn[personId]['recall'] = recall
            toreturn[personId]['numerator'] = numer
            toreturn[personId]['denominator'] = denom
            # print(personId,recall)
            #         print(count, len(list(userContentInteractionFrameworkTest.index.unique().values)))
        return toreturn

    def globalRecallCalc(self, recallDict, comp=0.1):
        numer = 0.0
        denom = 0.0
        recallList = []
        count = 0
        total = 0
        for key in recallDict:
            recallList.append(recallDict[key]['recall'])
            numer += recallDict[key]['numerator']
            denom += recallDict[key]['denominator']
            total += 1
            if recallDict[key]['recall'] >= comp:
                count += 1
        try:
            val = numer / denom
        except:
            val = 0
        return val, numpy.mean(recallList), numpy.median(
            recallList), count / float(total)

    def plotBar(self, xlabel, ylabel, label, scores, title):
        plt.bar(numpy.arange(len(label)), scores)
        plt.xlabel(xlabel, fontsize=5)
        plt.ylabel(ylabel, fontsize=5)
        plt.xticks(numpy.arange(len(label)), label, fontsize=5, rotation=45)
        plt.title(title)
        plt.savefig('project/' + title + '.jpg')
        plt.show()

    def conversionIntoGlobalFormatDictionary(self, actualDict, predictedDict,
                                             topk):
        toreturn = {}
        for personId in actualDict:
            try:
                predicted = predictedDict[personId]
                actual = actualDict[personId]
                toreturn[personId] = {}
                toreturn[personId]['predicted'] = predictedDict[personId]
                toreturn[personId]['actual'] = actualDict[personId]
                denom = topk
                if len(actual) < topk:
                    denom = len(actual)
                numer = len(list(set(actual).intersection(predicted)))
                recall = numer / float(denom)
                toreturn[personId]['recall'] = recall
                toreturn[personId]['numerator'] = numer
                toreturn[personId]['denominator'] = denom
            except:
                pass
        return toreturn
Code Example #19
print(tficfVectorsTrain.shape)
print("test dataset")
print(tficfVectorsTest.shape)
print("")
print("Reduce Dimension:LSI")
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
lsiVectorsTrain = svd.fit_transform(tficfVectorsTrain)
print(lsiVectorsTrain)
print(lsiVectorsTrain.shape)
b = svd.inverse_transform(lsiVectorsTrain)
print("error:")
print(np.linalg.norm(tficfVectorsTrain - b, ord='fro'))
print("Apply on test dataset")
print(tficfVectorsTest.shape)
lsiVectorsTest = svd.transform(tficfVectorsTest)
print(lsiVectorsTest)
print(lsiVectorsTest.shape)
print("")
print("Reduce Dimension:NMF")
nmf = NMF(n_components=50, init='random', random_state=0)
nmfVectorsTrain = nmf.fit_transform(tficfVectorsTrain)
print(nmfVectorsTrain)
print(nmfVectorsTrain.shape)
print("error:")
b = nmf.inverse_transform(nmfVectorsTrain)
print(b.shape)
print(np.linalg.norm(tficfVectorsTrain - b, ord='fro'))
print("Apply on test dataset")
nmfVectorsTest = nmf.transform(tficfVectorsTest)
print(nmfVectorsTest)
print(nmfVectorsTest.shape)
Code Example #20
                np.min((exp_cpsi2.max(), sim_cpsi2.max())) - 0.05,
                interp_num_phi,
                endpoint=False)

            interp_X = interp_shots(X, interp_num_phi, sim_cpsi2, new_cpsi)
            interp_pro = interp_shots(shots, interp_num_phi, exp_cpsi2,
                                      new_cpsi)

            print('constructing filter...')
            # transform and inverse transform
            model = NMF(n_components=n_comp, solver='cd')
            W = model.fit_transform(interp_X)
            H = model.components_

            new_pro = model.transform(interp_pro)
            inverse_pro = model.inverse_transform(new_pro)

            # average and error estimate

            pro[qidx] = inverse_pro.mean(0)
            err[qidx] = inverse_pro.std(0) / np.sqrt(inverse_pro.shape[0])
            interp_cpsi[qidx] = new_cpsi
            grp.create_dataset('all_filtered_cor', data=inverse_pro)
            grp.create_dataset('W_matrix', data=new_pro)
            grp.create_dataset('H_matrix', data=H)

        f_out.create_dataset('ave_cor', data=pro)
        f_out.create_dataset('err', data=err)

        f_out.create_dataset('unfiltered_ave_cor', data=original_pro)
        f_out.create_dataset('unfiltered_err', data=original_err)
Code Example #21
ix = np.argsort(X)
X = X[ix]
emis = emis[ix, :]

OD = -np.log(1 - emis)
pcaOD = PCA(whiten=True, n_components=48)

ica = FastICA(n_components=36, max_iter=5000)
ODIR = ica.fit_transform(OD)  # Estimate the source signals
OD2 = ica.inverse_transform(ODIR)
emis2 = 1 - np.exp(-OD2)  # Reconstruct signals
A_ = ica.mixing_  # Get estimated mixing matrix

nmf = NMF(n_components=48)
ODNR = nmf.fit_transform(OD)
OD2 = nmf.inverse_transform(ODNR)
emis2 = 1 - np.exp(-OD2)

N = 48
knots = np.linspace(X.min(), X.max(), N)[1:-1]
tck = splrep(X, -np.log(emis[:, 350]), t=knots)

t = tck[0]
c = np.zeros((emis.shape[-1], tck[1].size))
k = tck[2]
for ii in range(emis.shape[-1]):
    tck = splrep(X, -np.log(emis[:, ii]), t=knots)
    c[ii, :] = tck[1]


def emisFcn(X, tck):
Code Example #22
print()

# Print information
print("Clustering sparse data with k-means with k = 2...")
print()

# K-Means clustering with k = kvalue = 2
t0 = time()
km = KMeans(n_clusters=kvalue,
            init='k-means++',
            max_iter=100,
            n_init=1,
            verbose=False)
#km = MiniBatchKMeans(n_clusters=2, init = 'k-means++', n_init=1, init_size = 1000, batch_size = 1000, verbose = False)
km.fit(X_nmf)
original_space_centroids = svd1.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
cm = metrics.confusion_matrix(labels, km.labels_)

# Print information
print(
    "-------------------------Processing Finished 3---------------------------")
print("Cluster sparse data  done with k-means with k = 2 in %fs" %
      (time() - t0))
print(
    "This k-means clustering uses dimensionality reduction with NMF (without non-linear transformation)"
)
print("Top 10 terms per cluster:")
for i in range(kvalue):
    print("Cluster %d:" % i, end='')
Code Example #23
from sklearn.datasets import load_iris
# Load the data
X, _ = load_iris(True)
# The most important parameters are n_components, alpha, l1_ratio and solver
nmf = NMF(
    n_components=2,  # k value; by default all features are kept
    init=None,  # initialization of W and H: 'random' | 'nndsvd' (default) | 'nndsvda' | 'nndsvdar' | 'custom'
    solver='cd',  # 'cd' | 'mu'
    # {'frobenius', 'kullback-leibler', 'itakura-saito'}; the default is usually fine
    beta_loss='frobenius',
    tol=1e-4,  # stopping tolerance for the iterations
    max_iter=200,  # maximum number of iterations
    random_state=None,
    alpha=0.,  # regularization parameter
    l1_ratio=0.,  # regularization parameter
    verbose=0,  # verbosity
    shuffle=False  # only used by the 'cd' solver
)
# ----------------- Methods ------------------------
print('params:', nmf.get_params())  # constructor parameter values; also available as attributes (nmf.<attr>), so those are omitted below
# The four calls below are simple and the most important ones; see the example
nmf.fit(X)
W = nmf.fit_transform(X)
W = nmf.transform(X)
nmf.inverse_transform(W)
# ----------------- Attributes ------------------------
H = nmf.components_  # the H matrix
print('reconstruction_err_', nmf.reconstruction_err_)  # value of the loss function
print('n_iter_', nmf.n_iter_)  # actual number of iterations
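For the default `beta_loss='frobenius'`, the `reconstruction_err_` printed above is the Frobenius norm of the residual `X - W @ H`. A quick check of that relationship (a sketch added here, not part of the original snippet; the values agree closely rather than exactly because `W` was recomputed by `transform`):

```python
import numpy as np

residual = np.linalg.norm(X - W @ H)  # Frobenius norm of X - WH
print(residual, nmf.reconstruction_err_)
```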
Code Example #24
    def test_custom_nmf(self):

        mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                        [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64)
        mat[:mat.shape[1], :] += np.identity(mat.shape[1])

        mod = NMF(n_components=2)
        W = mod.fit_transform(mat)
        H = mod.components_

        def predict(W, H, row_index, col_index):
            return np.dot(W[row_index, :], H[:, col_index])

        pred = mod.inverse_transform(W)

        exp = []
        got = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                exp.append((i, j, pred[i, j]))
                got.append((i, j, predict(W, H, i, j)))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
        assert max_diff <= 1e-5

        def nmf_to_onnx(W, H):
            """
            The function converts a NMF described by matrices
            *W*, *H* (*WH* approximate training data *M*).
            into a function which takes two indices *(i, j)*
            and returns the predictions for it. It assumes
            these indices applies on the training data.
            """
            col = OnnxArrayFeatureExtractor(H, 'col')
            row = OnnxArrayFeatureExtractor(W.T, 'row')
            dot = OnnxMul(col, row)
            res = OnnxReduceSum(dot, output_names="rec")
            indices_type = np.array([0], dtype=np.int64)
            onx = res.to_onnx(inputs={'col': indices_type,
                                      'row': indices_type},
                              outputs=[('rec', FloatTensorType((None, 1)))])
            return onx

        model_onnx = nmf_to_onnx(W, H)
        sess = InferenceSession(model_onnx.SerializeToString())

        def predict_onnx(sess, row_indices, col_indices):
            res = sess.run(None,
                           {'col': col_indices,
                            'row': row_indices})
            return res

        onnx_preds = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                row_indices = np.array([i], dtype=np.int64)
                col_indices = np.array([j], dtype=np.int64)
                pred = predict_onnx(sess, row_indices, col_indices)[0]
                onnx_preds.append((i, j, pred[0, 0]))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
        assert max_diff <= 1e-5
Code Example #25
def optimize_NMF_rank_fuv(data,
                          n_samples,
                          plot_output_dir,
                          train_size=0.8,
                          k_min_max=[2, 30]):
    k_range = range(k_min_max[0], k_min_max[1])
    k_fuv_dict = {}
    k_fuv_dict['rep'] = []
    k_fuv_dict['k'] = []
    k_fuv_dict['fuv_vals'] = []
    k_fuv_dict['error_variance'] = []
    k_fuv_dict['non_zero_ratio'] = []
    # k_fuv_dict['sparsity_var_ratio'] = []
    k_fuv_dict['SS_err'] = []

    group_dict = {}
    group_dict['rep_err_var'] = []
    group_dict['reconstruct_X_test'] = []
    group_dict['X_test_flat'] = []
    group_dict['k'] = []

    n_repeats = 15

    for k in k_range:
        # group_dict['X_test_flat'] = []
        # group_dict['reconstruct_X_test'] = []
        for rep in range(n_repeats):

            # Generate test and train data
            model_indexes = list(range(n_samples))
            train_indexes = np.random.choice(model_indexes,
                                             size=int(n_samples * train_size),
                                             replace=False)
            test_indexs = [i for i in model_indexes if i not in train_indexes]

            X_test = np.copy(data[test_indexs])
            X_train = np.copy(data[train_indexes])

            # perturb_mat = np.random.normal(0.0, scale=10, size=np.shape(X_train))

            # X_train = np.random.normal(0.0, scale=10, size=np.shape(X_train))
            X_train = abs(X_train)

            # Apply speckled mask
            mask = np.random.choice([0, 1], size=X_train.shape,
                                    p=[0.2, 0.8]).astype(bool)
            # mask = np.random.randint(0,2,size=X_train.shape, weights=[0.2, 0.8]).astype(np.bool)
            # print(mask)
            r = np.zeros(X_train.shape)

            X_train[mask] = r[mask]

            model = NMF(n_components=k,
                        init='nndsvda',
                        verbose=0,
                        max_iter=100,
                        tol=4e-18,
                        l1_ratio=1).fit(X_train)

            # Transform test set and reconstruct
            W_test = model.transform(X_test)
            reconstruct_X_test = model.inverse_transform(W_test).reshape(1, -1)

            # reconstruct_X_test = np.round(reconstruct_X_test, decimals=0)

            # Flatten elements
            X_test_flat = np.copy(X_test).reshape(1, -1)
            X_test_mean = np.mean(X_test_flat)

            SS_err = np.sum((X_test_flat - reconstruct_X_test)**2)
            SS_tot = np.sum((X_test_flat - X_test_mean)**2)
            fuv = SS_err / SS_tot

            error_variance = np.mean(SS_err)

            sparsity = measure_sparseness(model.components_)

            k_fuv_dict['rep'].append(rep)
            k_fuv_dict['k'].append(k)
            k_fuv_dict['fuv_vals'].append(fuv)
            k_fuv_dict['error_variance'].append(error_variance)
            k_fuv_dict['SS_err'].append(SS_err)
            k_fuv_dict['non_zero_ratio'].append(sparsity)
            # k_fuv_dict['sparsity_var_ratio'].append( error_variance / sparsity)

            # group_dict['X_test_flat'].extend(X_test_flat)
            # group_dict['reconstruct_X_test'].extend(reconstruct_X_test)

        # X_test_mean = np.mean(group_dict['X_test_flat'])
        # X_test_flat = np.array(group_dict['X_test_flat']).reshape(1, -1)
        # reconstruct_X_test = np.array(group_dict['reconstruct_X_test']).reshape(1, -1)

        # SS_err = np.sum((X_test_flat - reconstruct_X_test)**2)
        # group_dict['rep_err_var'].append(SS_err)
        # group_dict['k'].append(k)
        print(k)

    df = pd.DataFrame(k_fuv_dict)

    group_dict.pop('X_test_flat', None)
    group_dict.pop('reconstruct_X_test', None)
    group_df = pd.DataFrame(group_dict)

    df.to_csv('fuv_vals.csv')

    width_inches = 200 / 25.4
    height_inches = 150 / 25.4
    fig, ax = plt.subplots(figsize=(width_inches, height_inches))
    sns.lineplot(x='k', y='SS_err', data=df)
    # sns.lineplot(x='k', y='error_variance', data=df)
    plt.savefig(plot_output_dir + 'NMF_optim_SS_err.pdf',
                dpi=500,
                bbox_inches='tight')
    plt.close()

    fig, ax = plt.subplots(figsize=(width_inches, height_inches))
    sns.lineplot(x='k', y='error_variance', data=df)
    # sns.lineplot(x='k', y='error_variance', data=df)
    plt.savefig(plot_output_dir + 'NMF_optim_err_var.pdf',
                dpi=500,
                bbox_inches='tight')
    plt.close()

    fig, ax = plt.subplots(figsize=(width_inches, height_inches))
    sns.lineplot(x='k', y='non_zero_ratio', data=df)
    # sns.lineplot(x='k', y='error_variance', data=df)
    plt.savefig(plot_output_dir + 'NMF_sparsity.pdf',
                dpi=500,
                bbox_inches='tight')
    plt.close()
Code Example #26
import numpy as np

from sklearn.datasets import load_iris
from sklearn.decomposition import NMF

# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load iris dataset
    iris = load_iris()
    print('Iris dataset shape')
    print(iris.data.shape)

    # Perform a non-negative matrix factorization
    nmf = NMF(n_components=3, init='random', l1_ratio=0.1)
    Xt = nmf.fit_transform(iris.data)

    print('Reconstruction error')
    print(nmf.reconstruction_err_)

    print('Original Iris sample')
    print(iris.data[0])

    print('Compressed Iris sample (via Non-Negative Matrix Factorization)')
    print(Xt[0])

    print('Rebuilt sample')
    print(nmf.inverse_transform(Xt[0]))
Code Example #27
print(f'NearestNeighbor -- time: {end}')

# implements nearest neighbor using SVD
start = time.time()
SVD = TruncatedSVD(n_components=12, random_state=42)
matrix = SVD.fit_transform(df)
corr = np.corrcoef(matrix)
for index, _ in df[:-1].iterrows():
    res = corr[index].argsort()[-20:][::-1]
recons_matrix = SVD.inverse_transform(matrix)
err = mean_squared_error(df, recons_matrix)
end = time.time() - start
print(f'SVD -- err: {err}, time: {end}')

# non-negative matrix factorization; just looking at time and error 
start = time.time()
nmf = NMF(n_components=12, init='random', random_state=42)
matrix = nmf.fit_transform(df)
recons_matrix = nmf.inverse_transform(matrix)
err = mean_squared_error(df, recons_matrix)
end = time.time() - start
print(f'NMF -- err: {err}, time: {end}')








Code Example #28
class TensorDecomp:
    """
    A class to represent a tensor object with decomposition and reconstruction methods.

    
    Attributes
    ----------
    tensor      : numpy.ndarray
        The tensor that is given to the class.
    memSize     : int
        The size of the tensor in the memory before decomposition.
    decMemSize  : int
        The size of the tensor in the memory after decomposition.
    decomp_time : float
        The time elapsed to decompose the tensor.
    decomp_type : str
        The __name__ of the provided func argument.
    memChange   : float
        The relative change of memory requirement of the tensor after decomposition.


    Methods
    -------
    decompose(func, *args, **kwargs):
        Decomposes the given tensor with the 'func' decomposition and computes the size in memory after decomposition.
    
    reconstruct(self):
        Reconstructs the decomposed tensor.
    
    error(func, x, y):
        Calculates the error between x and y with the given 'func' error handle.

    """
    def __init__(self, tensor):

        self.tensor = tensor
        self.decMemSize = 0

        if isinstance(self.tensor, sparse._coo.core.COO):

            print("Sparse!!!!")
            #self.memSize = tensor.data.nbytes + tensor.row.nbytes + tensor.col.nbytes
            self.memSize = tensor.nbytes

        else:
            self.memSize = tensor.nbytes

    def decompose(self, func, *args, **kwargs):
        """
        Decomposes the tensor with the func argument decomposition type.
        Assigns the objects after decomposition to self.decomposed.
        Computes the decomposition time and assigns to self.decomp_time.
        Assigns the func argument as the decomposition type to self.decomp_type.

        Parameters
        ----------
        self : object of class TensorDecomp type.
            
        Returns
        -------
        None            
        
        """

        if func.__name__ not in decomp_list:
            print(f'Error! Given decomposition --> {func.__name__}')
            return

        elif func.__name__ == 'svd':
            ts = timer()
            self.decomposed = func(self.tensor)
            te = timer()
            self.decomp_time = te - ts
            self.decomp_type = func.__name__

        elif func.__name__ == 'NMF':
            self.nmf_obj = NMF()
            ts = timer()
            self.decomposed = []
            self.decomposed.append(self.nmf_obj.fit_transform(self.tensor))
            self.decomposed.append(self.nmf_obj.components_)
            te = timer()
            self.decomp_time = te - ts
            self.decomp_type = func.__name__

        elif args:
            ts = timer()
            self.decomposed = func(self.tensor, args[0])
            te = timer()
            self.decomp_type = func.__name__
            self.decomp_time = te - ts

        else:
            ts = timer()
            self.decomposed = func(self.tensor, **kwargs)
            te = timer()
            self.decomp_type = func.__name__
            self.decomp_time = te - ts

        # Sum the memory footprint of every ndarray in the decomposition,
        # descending one level into factor lists (e.g. tucker/parafac return a
        # core/weights object together with a list of factor matrices).
        for item in self.decomposed:
            if isinstance(item, np.ndarray):
                self.decMemSize += item.nbytes
            elif isinstance(item, (list, tuple)):
                for factor in item:
                    if isinstance(factor, np.ndarray):
                        self.decMemSize += factor.nbytes

        # relative change of the tensor's memory footprint after decomposition
        self.memChange = (self.decMemSize - self.memSize) / self.memSize

    def reconstruct(self):
        """
        Reconstructs the decomposed TensorDecomp object.
        Assigns the reconstructed tensor to self.recons attribute.

        Parameters
        ----------
        self : object of class TensorDecomp type.
            
        Returns
        -------
        None            
        
        """

        if self.decomp_type == 'svd':
            self.recons = self.decomposed[0] @ (
                np.diag(self.decomposed[1]) @ self.decomposed[2])

        elif self.decomp_type == 'NMF':
            self.recons = self.nmf_obj.inverse_transform(self.decomposed[0])

        elif self.decomp_type == 'tucker':
            from tensorly import tucker_tensor as tt
            self.recons = tt.tucker_to_tensor(self.decomposed)

        elif self.decomp_type == 'parafac':
            from tensorly import cp_tensor as ct
            self.recons = ct.cp_to_tensor(self.decomposed)

        elif self.decomp_type == 'matrix_product_state':
            from tensorly import tt_tensor as tt
            self.recons = tt.tt_to_tensor(self.decomposed)
        elif self.decomp_type == 'clarkson_woodruff_transform':
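            # scipy's Clarkson-Woodruff transform is a dimensionality-reducing
            # sketch and is not invertible, so the sketched matrix itself is
            # kept as the "reconstruction" here.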
            self.recons = self.decomposed

    def error(self, func, x, y):
        """
        Computes the relative error between the original and reconstructed tensor, as (func(x) - func(y)) / func(x), using the given norm-like error function.

        Parameters
        ----------
        func    : function object for error calculation. Example: np.linalg.norm
        x       : the original tensor
        y       : the reconstructed tensor
            
        Returns
        -------
        float
            the error between the original and the reconstructed tensor.            
        
        """
        if isinstance(x, sparse.COO):
            # convert the sparse tensor to a dense array for the error calculation
            x = x.todense()

        return (func(x) - func(y)) / func(x)
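

# Minimal usage sketch for TensorDecomp (an illustration, not part of the
# original module). It assumes the module-level `decomp_list` contains 'svd'
# and that numpy is imported as np, as elsewhere in this example. A square
# matrix is used so the full-matrices SVD reconstruction U @ diag(S) @ Vt lines up.
if __name__ == '__main__':
    X = np.random.rand(40, 40)
    td = TensorDecomp(X)
    td.decompose(np.linalg.svd)     # stores (U, S, Vt) plus timing and memory stats
    td.reconstruct()                # td.recons should closely match X
    rel_err = td.error(np.linalg.norm, X, td.recons)
    print(td.decomp_type, td.decomp_time, td.memChange, rel_err)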
コード例 #29
    image = imzmlio.normalize(image)
else:
    image = np.uint8(image)

image_shape = image.shape[:-1]
image_norm = fusion.flatten(image, is_spectral=True)
M = image_norm.T
print(M.shape)

if is_nmf:
    nmf = NMF(n_components=n, init='nndsvda', solver='cd', random_state=0)
    eigenvalues = nmf.fit_transform(M)       # W: per-sample component weights
    eigenvectors = nmf.components_           # H: spectral components
    inverse_transform = nmf.inverse_transform(eigenvalues)
    eigenvectors_transposed = eigenvalues.T
else:
    # p, n = M.shape
    pca = PCA(n)
    fit_pca = pca.fit(M)
    eigenvectors = fit_pca.components_
    eigenvalues = fit_pca.transform(M)
    inverse_transform = pca.inverse_transform(eigenvalues)
    eigenvectors_transposed = eigenvalues.T

mse = mean_squared_error(M, inverse_transform, multioutput='raw_values')
outlier_indices = sorted(range(len(mse)), key=lambda i: mse[i], reverse=True)

number_outliers = 10
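
# The fragment ends here; a plausible (hypothetical) continuation would keep
# the indices with the largest reconstruction error, e.g.:
#   outliers = outlier_indices[:number_outliers]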
コード例 #30
def fit_nmf(train_max,
            heldout_max=None,
            vocab=None,
            k=10,
            alpha_regularization=0.0):
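    # NOTE: `alpha` is the pre-1.0 scikit-learn API for NMF regularization;
    # newer releases split it into `alpha_W` and `alpha_H`.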
    nmf = NMF(k, alpha=alpha_regularization, verbose=False)

    train_nmf = nmf.fit_transform(train_max)

    if heldout_max is not None:
        heldout_nmf = nmf.transform(heldout_max)
    else:
        heldout_nmf = None

    batch_size = 100

    prop_train_reconst_errs = []
    prop_heldout_reconst_errs = []

    for iteration in range(20):
        train_idxes = np.random.choice(train_nmf.shape[0],
                                       batch_size,
                                       replace=True)

        reconst_train_max = nmf.inverse_transform(train_nmf[train_idxes])

        prop_train_reconst_err = np.linalg.norm(reconst_train_max - train_max[train_idxes]) / \
                                 scipy.sparse.linalg.norm(train_max[train_idxes])
        prop_train_reconst_errs.append(prop_train_reconst_err)

        if heldout_nmf is not None:
            heldout_idxes = np.random.choice(heldout_nmf.shape[0],
                                             batch_size,
                                             replace=True)

            reconst_heldout_max = nmf.inverse_transform(
                heldout_nmf[heldout_idxes])
            prop_heldout_reconst_err = np.linalg.norm(reconst_heldout_max - heldout_max[heldout_idxes]) / \
                                       scipy.sparse.linalg.norm(heldout_max[heldout_idxes])
            prop_heldout_reconst_errs.append(prop_heldout_reconst_err)
        else:
            prop_heldout_reconst_errs.append(-1.0)

    print('Train reconstruction error: {}'.format(
        np.mean(prop_train_reconst_errs)))
    print('Heldout reconstruction error: {}'.format(
        np.mean(prop_heldout_reconst_errs)))

    top_words_per_topic = get_top_words(nmf,
                                        vocab,
                                        n=20,
                                        subtract_off_mean=False,
                                        verbose=False)
    top_words_per_topic_womean = get_top_words(nmf,
                                               vocab,
                                               n=20,
                                               subtract_off_mean=True,
                                               verbose=False)

    topic_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.topics.txt'.format(k,
                                                       alpha_regularization))
    topic_womean_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.topics_without_mean.txt'.format(
            k, alpha_regularization))
    topic_dist_path = os.path.join(
        TOPIC_DIR, 'nmf-k{}-alpha{}.topic_distribution_per_tweet.txt'.format(
            k, alpha_regularization))
    model_path = os.path.join(
        TOPIC_DIR,
        'nmf-k{}-alpha{}.model.pickle'.format(k, alpha_regularization))

    # save top words per topic
    with open(topic_path, 'wt', encoding='utf8') as topic_file:
        for topic_idx, words in enumerate(top_words_per_topic):
            topic_file.write('Topic #{}:'.format(topic_idx))
            for w in words:
                topic_file.write(' ')
                topic_file.write(w)
            topic_file.write('\n')

    with open(topic_womean_path, 'wt', encoding='utf8') as topic_file:
        for topic_idx, words in enumerate(top_words_per_topic_womean):
            topic_file.write('Topic #{}:'.format(topic_idx))
            for w in words:
                topic_file.write(' ')
                topic_file.write(w)
            topic_file.write('\n')

    # save NMF model
    with open(model_path, 'wb') as model_file:
        pickle.dump(nmf, model_file)

    # save topic activation for each tweet, compressed numpy format
    if heldout_nmf is not None:
        all_nmf = np.concatenate((train_nmf, heldout_nmf), axis=0)
    else:
        all_nmf = train_nmf

    np.savez_compressed(topic_dist_path, topics_per_tweet=all_nmf)
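
# A hypothetical call, for orientation (the names below are assumptions, not
# taken from this fragment): given a non-negative sparse document-term matrix
# `train_matrix` and its vocabulary list `vocab`, e.g. from CountVectorizer,
#   fit_nmf(train_matrix, heldout_max=None, vocab=vocab, k=10, alpha_regularization=0.0)
# writes the topic lists, per-tweet topic distribution and pickled model under TOPIC_DIR.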