Example #1
def recover_term_topic_matrix(Q, anchors, tol=TOL, beta_loss=1):
    """
    Compute C such that C @ Q_anchors ≈ Q_bar, minimizing the Kullback-Leibler divergence.
    Every row of Q_bar and Q_anchors sums to 1 by construction; it follows that the rows of C do too.
    Params:
        Q: numpy float array, word co-occurrence matrix
        anchors: list of indices of anchor words
        tol: tolerance for nmf
        beta_loss: 1 for Kullback-Leibler (more precise), 2 for L2 loss (faster)
    Returns:
        A: term x topic matrix
        C: intermediate result
        n_iter: number of iterations till convergence in computation of C
    """
    n_topics = len(anchors)
    P_w = Q.sum(axis=1)
    Q_bar = normalize(Q, axis=1, norm='l1')
    Q_anchors = Q_bar[anchors, :]

    # solve  C @ Q_anchors ≈ Q_bar  with Q_anchors held fixed (update_H=False)
    C, _, n_iter = non_negative_factorization(Q_bar, W=None, H=Q_anchors, n_components=n_topics,
                                              update_H=False, solver='mu', beta_loss=beta_loss,
                                              tol=tol)

    A_prime = np.multiply(P_w.reshape(-1, 1), C)
    A = normalize(A_prime, axis=0, norm='l1')

    return A, C, n_iter
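A minimal usage sketch for the snippet above. The toy matrix, the anchor indices, and the TOL value are illustrative assumptions; `normalize` and `non_negative_factorization` are the sklearn helpers the function already relies on:

import numpy as np
from sklearn.preprocessing import normalize
from sklearn.decomposition import non_negative_factorization

TOL = 1e-4  # must exist before the def above; the value is an assumption

rng = np.random.RandomState(0)
Q = np.abs(rng.rand(6, 6))      # toy 6x6 word co-occurrence matrix
A, C, n_iter = recover_term_topic_matrix(Q, anchors=[0, 3])
print(A.shape)                  # (6, 2): one topic column per anchor word
print(A.sum(axis=0))            # each topic column sums to 1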
Example #2
    def _nmf(self, X, nmf_kwargs, topic_labels=None):
        """
        Parameters
        ----------
        X : pandas.DataFrame
            Normalized counts DataFrame to be factorized.

        nmf_kwargs : dict
            Arguments to be passed to ``non_negative_factorization``.

        topic_labels : list, optional
            Labels for the topics; if None, topics are renamed 1..n_components
            after sorting.

        """
        (W, H, niter) = non_negative_factorization(X.values, **nmf_kwargs)

        usages = pd.DataFrame(W, index=X.index, columns=topic_labels)
        spectra = pd.DataFrame(H, columns=X.columns, index=topic_labels)

        # Sort by overall usage, and rename topics with 1-based indexing.
        topic_order = spectra.sum(axis=1).sort_values(ascending=False).index

        spectra = spectra.loc[topic_order, :]
        usages = usages.loc[:, topic_order]

        if topic_labels is None:
            spectra.index = np.arange(1, nmf_kwargs['n_components'] + 1)
            usages.columns = np.arange(1, nmf_kwargs['n_components'] + 1)

        return spectra, usages
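For reference, a plausible `nmf_kwargs` for the method above; the specific values are illustrative assumptions, not taken from the original project:

nmf_kwargs = dict(
    n_components=10,                  # number of topics; read back for renaming
    init='random',
    solver='mu',
    beta_loss='kullback-leibler',
    tol=1e-4,
    max_iter=400,
    random_state=0,
)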
Example #3
# assumed context from the original sklearn test module (the seed is an assumption):
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.decomposition import nmf

random_state = np.random.RandomState(42)


def test_non_negative_factorization_consistency():
    # Test that the function is called in the same way, either directly
    # or through the NMF class
    A = np.abs(random_state.randn(10, 10))
    A[:, 2 * np.arange(5)] = 0

    # note: the 'pg' solver exists only in older scikit-learn releases
    for solver in ('pg', 'cd'):
        W_nmf, H, _ = nmf.non_negative_factorization(
            A, solver=solver, random_state=1, tol=1e-2)
        W_nmf_2, _, _ = nmf.non_negative_factorization(
            A, H=H, update_H=False, solver=solver, random_state=1, tol=1e-2)

        model_class = nmf.NMF(solver=solver, random_state=1, tol=1e-2)
        W_cls = model_class.fit_transform(A)
        W_cls_2 = model_class.transform(A)
        assert_array_almost_equal(W_nmf, W_cls, decimal=10)
        assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
Example #4
def extractActivation(y, W, w=d_w, h=d_h):
    """
        important : in sklearn, H is "dictionary", while W is "activation".
        but in our case, W is "dictionary". So we have to pass W as H into sklearn
    """
    S = librosa.core.stft(y, n_fft=w, hop_length=h)
    # note: the `beta` parameter exists only in older scikit-learn releases
    activation, components, n_iter = nmf.non_negative_factorization(
        X=np.abs(S.T), H=W.T, update_H=False, n_components=W.shape[1],
        beta=beta, max_iter=max_iter)
    return activation.T
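A minimal sketch of calling the helper above. The snippet references module-level defaults d_w, d_h, beta, and max_iter without defining them, so the values below are assumptions, and the dictionary W is random for illustration:

import numpy as np
import librosa
from sklearn.decomposition import nmf

d_w, d_h = 2048, 512        # assumed STFT window and hop length; must predate the def above
beta, max_iter = 1e-4, 200  # assumed solver settings (old sklearn API)

sr = 22050
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)  # 1 s test tone
W = np.abs(np.random.rand(1 + d_w // 2, 4))  # 4-component spectral dictionary
H = extractActivation(y, W)                  # activations, shape (4, n_frames)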
Example #5
    def _nmf(self, X, nmf_kwargs):
        """
        Parameters
        ----------
        X : pandas.DataFrame
            Normalized counts DataFrame to be factorized.

        nmf_kwargs : dict
            Arguments to be passed to ``non_negative_factorization``.

        """
        (usages, spectra, niter) = non_negative_factorization(X, **nmf_kwargs)

        return (spectra, usages)
Example #6
    def update_reviews_with_topics(self, records):

        corpora = \
            [' '.join(record[Constants.BOW_FIELD]) for record in records]
        document_term_matrix = \
            self.tfidf_vectorizer.transform(corpora)
        document_topic_matrix, _, _ = nmf.non_negative_factorization(
            document_term_matrix, H=self.topic_term_matrix, init='nndsvd',
            n_components=self.num_topics, regularization='both',
            max_iter=Constants.TOPIC_MODEL_ITERATIONS, update_H=False)

        for record_index, record in enumerate(records):
            record[Constants.TOPICS_FIELD] = \
                [(i, document_topic_matrix[record_index][i])
                 for i in range(self.num_topics)]
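The non_negative_factorization call above uses the fixed-components transform pattern: `update_H=False` freezes the topic-term matrix, so only the document-topic weights are solved for. A standalone sketch of that pattern, with purely illustrative names and shapes:

import numpy as np
from sklearn.decomposition import non_negative_factorization

X_new = np.abs(np.random.rand(5, 30))    # 5 new documents x 30 terms
H_fixed = np.abs(np.random.rand(4, 30))  # previously learned 4 x 30 topic-term matrix
W_new, _, _ = non_negative_factorization(
    X_new, H=H_fixed, n_components=4, update_H=False)
# W_new: 5 x 4 document-topic weights for the new documents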
Example #7
    def _nmf(self, X, nmf_kwargs):
        """
        Parameters
        ----------
        X : pandas.DataFrame
            Normalized counts DataFrame to be factorized.

        nmf_kwargs : dict
            Arguments to be passed to ``non_negative_factorization``.

        """
        # drop a parameter that non_negative_factorization does not accept
        nmf_kwargs.pop('cell_sampling_fraction', None)

        (usages, spectra, niter) = non_negative_factorization(X, **nmf_kwargs)

        return (spectra, usages)
Example #8
    def transform(self, doc_term_mat, tol=TOL, beta_loss=1):
        """
        Params:
            doc_term_mat: scipy.sparse matrix as from CountVectorizer
            tol: tolerance for nmf
            beta_loss: 1 for Kullback-Leibler (more precise), 2 for L2 loss (faster)
        Returns:
            W_T: document x topic matrix (transpose of the topic x document weight matrix)
        """
        M = doc_term_mat.T

        # solve  W.T @ A.T ≈ M.T  with A.T held fixed (update_H=False);
        # the product M.T is the document x term matrix
        W_T, _, self.n_iter_transform = non_negative_factorization(
            M.T, W=None, H=self.A.T, n_components=self.n_topics,
            update_H=False, solver='mu', beta_loss=beta_loss, tol=tol)

        return W_T
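A sketch of the call pattern for this method; the instance `model` and its fitted attributes `A` and `n_topics` are assumptions, following Example #1:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["apple banana apple fruit",
        "car bike car road",
        "banana fruit smoothie"]
doc_term_mat = CountVectorizer().fit_transform(docs)  # documents x terms, sparse
doc_topic = model.transform(doc_term_mat)             # documents x topics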
Example #9
# assumed module-level context (the snippet relies on these globals and imports):
# os, numpy as np, operator.itemgetter, geopy.distance.great_circle,
# sklearn.decomposition's nmf module, clus_k (the cluster count), and
# trans_hr / clus_order / clus_time initialized as empty lists
def estTransOrder(points, users, cluster_centers):
    t_hr_file = 'data\\trans_hr.txt'
    c_order_file = 'data\\clus_order.txt'
    c_time_file = 'data\\clus_time.txt'

    if os.path.isfile(t_hr_file) and os.path.isfile(
            c_order_file) and os.path.isfile(c_time_file):
        global trans_hr
        global clus_order
        global clus_time

        trans_hr = np.loadtxt(t_hr_file)
        clus_order = np.loadtxt(c_order_file)
        clus_time = np.loadtxt(c_time_file)
        print('load trans/clus time, order score')
    else:
        transLen = 0.0
        transTimeLen = 0.0

        global trans_hr
        global clus_order
        for i in range(clus_k):
            trans_hr.append(np.zeros(clus_k))
            clus_order.append(np.zeros(clus_k) * 50)
        trans_hr = np.array(trans_hr)
        # number of transitions between each pair of clusters (undirected);
        # this must be a separate array, not an alias of trans_hr
        transTimes = np.zeros_like(trans_hr)
        clus_order = np.array(clus_order)

        global clus_time
        for i in range(clus_k):
            clus_time.append(np.zeros(24) * 50)
        clus_time = np.array(clus_time)

        allusers = np.unique(points[:, 1])

        for user in users:
            user_points = points[:, 1] == user

            #for all points of the user, sort by time(index 2-7)
            tsorted = np.array(
                sorted(points[user_points], key=itemgetter(2, 3, 4, 5, 6, 7)))

            if len(tsorted) > 0:

                # dates of all posts of the user (duplicate dates removed)
                datesToClus = np.unique(tsorted[:, 2:5], axis=0)
                for date in datesToClus:

                    # locations the user visited on this date (boolean mask)
                    same_date = (tsorted[:, 2:5] == date).all(axis=1)

                    # posts on this date
                    day = tsorted[same_date]
                    for i in range(len(day) - 1):

                        # consecutive posts assigned to different clusters
                        thisClus = int(day[i][-2])
                        nextClus = int(day[i + 1][-2])

                        if thisClus < clus_k:
                            clus_time[thisClus][int(day[i][5])] += 1

                            if thisClus != nextClus and nextClus < clus_k:
                                # post times in fractional hours
                                hr1 = day[i][5] + day[i][6] / 60.0
                                hr2 = day[i + 1][5] + day[i + 1][6] / 60.0

                                thisLoc = (day[i][10], day[i][9])
                                nextLoc = (day[i + 1][10], day[i + 1][9])
                                transLen += great_circle(thisLoc, nextLoc).meters
                                transTimeLen += (hr2 - hr1)

                                trans_hr[thisClus][nextClus] += (hr2 - hr1)
                                trans_hr[nextClus][thisClus] += (hr2 - hr1)

                                transTimes[thisClus][nextClus] += 1
                                transTimes[nextClus][thisClus] += 1

                                clus_order[thisClus][nextClus] += 1

        #speed unit: meters/hr
        avgSpeed = transLen / transTimeLen
        trans_hr = trans_hr / transTimes

        for i, row in enumerate(trans_hr):
            for j, timeLen in enumerate(row):
                if j > i:
                    if np.isnan(timeLen):
                        # no observed transition: estimate the travel time from
                        # the distance between cluster centers and the average speed
                        thisLoc = (cluster_centers[i][1], cluster_centers[i][0])
                        nextLoc = (cluster_centers[j][1], cluster_centers[j][0])
                        timeLen = great_circle(thisLoc, nextLoc).meters / avgSpeed

                    # round to the nearest half hour
                    trans_hr[i][j] = trans_hr[j][i] = round(timeLen * 2) / 2.0

        print('transition time:')
        print(trans_hr)
        np.savetxt(t_hr_file, np.array(trans_hr))

        clus_order = clus_order / np.max(clus_order)
        W, H, n_iter = nmf.non_negative_factorization(clus_order,
                                                      n_components=10,
                                                      random_state=2)
        clus_order = np.dot(W, H)  # low-rank (rank-10) smoothing
        print('condition probability:')
        print(clus_order)
        np.savetxt(c_order_file, np.array(clus_order))

        clus_time = clus_time / np.amax(clus_time, axis=1)[:, None]
        W, H, n_iter = nmf.non_negative_factorization(clus_time,
                                                      n_components=10,
                                                      random_state=2)
        clus_time = np.dot(W, H)  # low-rank (rank-10) smoothing
        print('cluster time:')
        print(clus_time)
        np.savetxt(c_time_file, np.array(clus_time))
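The two non_negative_factorization calls at the end use NMF purely as a low-rank smoother: each matrix is factorized and immediately reconstructed as W @ H, which fills weakly observed entries from the dominant structure. A minimal self-contained sketch of the same idea:

import numpy as np
from sklearn.decomposition import non_negative_factorization

rng = np.random.RandomState(2)
M = np.abs(rng.rand(20, 24))          # e.g. cluster x hour-of-day scores
M = M / M.max(axis=1, keepdims=True)  # row-normalize as in the snippet
W, H, _ = non_negative_factorization(M, n_components=10, random_state=2)
M_smooth = W @ H                      # rank-10 reconstruction of M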