Code example #1
File: NMF_medium_code.py Project: domdomdidom/NMF
def get_items_associated(historical_purchase_matrix, feature_df, product_df, n_topics=5, max_iters=350, n_churniest_topics=3, n_churniest_items=25):
    '''
    Get the products most associated with a particular customer group.

    Parameters
    ----------
    historical_purchase_matrix : customers x items purchase matrix
    feature_df : DataFrame whose index (an Int64Index slice) selects the group
    product_df : DataFrame with a 'Name' column of item names
    n_topics, max_iters : NMF hyperparameters
    n_churniest_topics, n_churniest_items : how many topics and items to keep

    Returns
    -------
    The "most associated" items as (name, count) pairs. I often use this
    for churn.
    '''
    group_idx = feature_df.index.astype(int)  # rows of the target customer group

    nmf = NMF_sklearn(n_components=n_topics, max_iter=max_iters, alpha=0.0)
    W = nmf.fit_transform(historical_purchase_matrix) # how much each customer belongs to each "topic"
    H = nmf.components_ # how much each item belongs to each "topic"
    
    sums = W[group_idx].sum(axis=0)  # total topic weight within the group
    churniest_topics = sums.argsort()[-n_churniest_topics:] 
    
    c = Counter()
    
    for topic in churniest_topics:
        indices = H[topic].argsort()[-50:]  # top 50 items for this topic

        for product in product_df['Name'][indices]:
            c[product] += 1
            
    return c.most_common(n_churniest_items)
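A minimal usage sketch for the function above, with toy stand-in data (the names below are hypothetical). The imports are the ones the excerpt relies on; note that the alpha argument to scikit-learn's NMF was removed in scikit-learn 1.2, so this assumes an older release:

from collections import Counter

import numpy as np
import pandas as pd
from sklearn.decomposition import NMF as NMF_sklearn

# Toy stand-ins: 100 customers x 40 items, plus item names to report.
historical_purchase_matrix = np.random.rand(100, 40)
product_df = pd.DataFrame({'Name': ['item_%d' % i for i in range(40)]})
feature_df = pd.DataFrame(index=range(20))  # e.g. the churned customers

top_items = get_items_associated(historical_purchase_matrix,
                                 feature_df, product_df)
print(top_items[:5])  # [(name, count), ...]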
Code example #2
def do_nmf(X):
    nmf = NMF_sklearn(n_components=30,
                      max_iter=100,
                      random_state=34,
                      alpha=0.0,
                      verbose=True)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    return W, H
Code example #3
def do_nmf_loop(V):
    rec_err_lst = []
    for c in range(7, 8):  # NOTE: this range covers only c=7; widen it to scan more component counts
        nmf = NMF_sklearn(n_components=c,
                          max_iter=100,
                          random_state=34,
                          alpha=.01,
                          verbose=True)
        W = nmf.fit_transform(V)
        H = nmf.components_
        print('reconstruction error:', nmf.reconstruction_err_)
        rec_err_lst.append(nmf.reconstruction_err_)
    return W, H, rec_err_lst  # W and H come from the last value of c tried
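As written, the loop above tries only c=7. If the intent was to scan several component counts and look for an elbow in the reconstruction error, a variant along these lines would do it (a sketch: the function name and range are hypothetical, and matplotlib is assumed):

import matplotlib.pyplot as plt

def scan_nmf_components(V, c_range=range(2, 15)):
    # Fit one NMF per candidate component count and record its error.
    errs = []
    for c in c_range:
        nmf = NMF_sklearn(n_components=c, max_iter=100, random_state=34)
        nmf.fit(V)
        errs.append(nmf.reconstruction_err_)
    plt.plot(list(c_range), errs, marker='o')
    plt.xlabel('n_components')
    plt.ylabel('reconstruction error')
    plt.show()
    return errs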
Code example #4
def main():
    '''
    Run the unsupervised analysis of the NYT corpus, using NMF to find latent
    topics. The user will be prompted to label each latent topic, then a few
    articles will be analyzed to see which topics they contain.
    '''
    # Load the corpus.
    df = pd.read_pickle("data/articles.pkl")
    contents = df.content
    web_urls = df.web_url

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # Find latent topics using our NMF model.
    factorizer = NMF(k=7, max_iters=35, alpha=0.5)
    W, H = factorizer.fit(X, verbose=True)

    # Label topics and analyze a few NYT articles.
    # Btw, if you haven't modified anything, the seven topics which should
    # pop out are:  (you should type these as the labels when prompted)
    #  1. "football",
    #  2. "arts",
    #  3. "baseball",
    #  4. "world news (middle eastern?)",
    #  5. "politics",
    #  6. "world news (war?)",
    #  7. "economics"
    hand_labels = hand_label_topics(H, vocabulary)
    rand_articles = np.random.choice(list(range(len(W))), 15)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)

    # Do it all again, this time using scikit-learn.
    nmf = NMF_sklearn(n_components=7,
                      max_iter=100,
                      random_state=12345,
                      alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_labels = hand_label_topics(H, vocabulary)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)
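Several of these examples call a build_text_vectorizer helper that isn't shown. A minimal sketch of what such a helper plausibly does, built on scikit-learn's TfidfVectorizer/CountVectorizer (a reconstruction under stated assumptions, not the original; the stemming branch is omitted):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_text_vectorizer(contents, use_tfidf=True, use_stemmer=False,
                          max_features=None):
    # Returns a callable that maps raw texts to a document-term matrix,
    # plus the fitted vocabulary. use_stemmer is ignored in this sketch.
    Vectorizer = TfidfVectorizer if use_tfidf else CountVectorizer
    vec = Vectorizer(stop_words='english', max_features=max_features)
    vec.fit(contents)
    vocabulary = vec.get_feature_names_out()
    return (lambda texts: vec.transform(texts).toarray()), vocabulary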
Code example #5
File: NMF_medium_code.py Project: domdomdidom/NMF
def do_NMF(historical_purchase_matrix, product_df, get_top_products=True):
        
    nmf = NMF_sklearn(n_components=5, max_iter=450)
    W = nmf.fit_transform(historical_purchase_matrix) # how much each customer belongs to each "topic"
    H = nmf.components_ # how much each item belongs to each "topic"

    if get_top_products:

        print("Here are the top products for %s topics" % 5)
        for topic in range(0, 5):
            indices = H[topic].argsort()[-25:]  # top 25 items for this topic
            print("\n")
            print(product_df['Name'][indices])

    return W, H
Code example #6
def main():
    '''
    Run the unsupervised analysis of the corpus, using NMF to find latent
    topics. The user will be prompted to label each latent topic, then a few
    documents will be analyzed to see which topics they contain.
    '''
    # Load the corpus.
    df = pd.read_pickle("data/articles.pkl")
    contents = df.content
    web_urls = df.web_url

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # Find latent topics using our NMF model.
    factorizer = NMF(k=7, max_iters=35, alpha=0.5)
    W, H = factorizer.fit(X, verbose=True)

    hand_labels = hand_label_topics(H, vocabulary)
    rand_articles = np.random.choice(list(range(len(W))), 15)
    for i in rand_articles:
        analyze_recipe(i, contents, W, hand_labels)

    # Do it all again, this time using scikit-learn.
    nmf = NMF_sklearn(n_components=7,
                      max_iter=100,
                      random_state=12345,
                      alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_labels = hand_label_topics(H, vocabulary)
    for i in rand_articles:
        analyze_recipe(i, contents, W, hand_labels)
Code example #7
    def train_text_features(self, series, n_components):
        series = series.fillna('unknown')
        vectorizer, vocabulary = self.build_text_vectorizer(series,
                                                            use_tfidf=True,
                                                            use_stemmer=True,
                                                            max_features=None)
        self.vectorizer = vectorizer
        self.vocabulary = vocabulary
        X = vectorizer(series)
        nmf = NMF_sklearn(n_components=n_components, max_iter=100, alpha=0.0)
        nmf_model = nmf.fit(X)
        self.nmf = nmf_model
        W = nmf.transform(X)
        H = nmf.components_
        self.W_train = W
        self.H_train = H
        prob_w = self.softmax(W, temperature=0.01)
        self.prob_w = prob_w
        hand_labels = self.hand_label_topics(H, vocabulary, prob_w, series)
        self.hand_labels = hand_labels
        return pd.DataFrame(W, columns=hand_labels)
Code example #8
File: NMF.py Project: MouseLand/kesa-et-al-2019
    def __init__(self, nr_neurons, nr_timepoints, nr_components,
                 initialization, X):
        super(NMF, self).__init__()
        if initialization == 'random':
            # Random initialization of both factors.
            # zeros_for_U = np.random.choice([0, 1], nr_neurons*nr_components, p=[1-0.01, 0.01]).reshape((nr_neurons, nr_components))
            self.U = nn.Parameter(
                0.1 *
                torch.randn(nr_neurons, nr_components, requires_grad=True))
            self.V = nn.Parameter(
                torch.randn(nr_components, nr_timepoints, requires_grad=True))
        if initialization == 'NMF':
            # Seed U and V from a scikit-learn NMF fit; negatives are
            # clipped first because NMF requires non-negative input.
            X[X < 0] = 0
            X = X.T
            model = NMF_sklearn(n_components=nr_components,
                                init='nndsvd',
                                random_state=7)
            self.V = nn.Parameter(
                torch.tensor(model.fit_transform(X).T,
                             requires_grad=True,
                             dtype=torch.float32))
            print(self.V.size())
            self.U = nn.Parameter(
                torch.tensor(model.components_.T,
                             requires_grad=True,
                             dtype=torch.float32))
            print(self.U.size())
        if initialization == 'PCA':
            # Seed U and V from a PCA fit instead (components may be negative).
            X = X.T
            model = PCA(n_components=nr_components)
            self.V = nn.Parameter(
                torch.tensor(model.fit_transform(X).T,
                             requires_grad=True,
                             dtype=torch.float32))
            print(self.V.size())
            self.U = nn.Parameter(
                torch.tensor(model.components_.T,
                             requires_grad=True,
                             dtype=torch.float32))
            print(self.U.size())
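A minimal construction sketch for this module, assuming the surrounding class is NMF(nn.Module) with the imports the excerpt implies (torch, torch.nn as nn, NMF_sklearn and PCA from sklearn.decomposition); the data layout follows the (nr_neurons, nr_timepoints) convention the __init__ expects:

import numpy as np

# Hypothetical driver: X is neurons x timepoints.
X = np.random.randn(100, 500).astype(np.float32)
model = NMF(nr_neurons=100, nr_timepoints=500, nr_components=10,
            initialization='NMF', X=X.copy())  # copy: the 'NMF' branch mutates X
recon = model.U @ model.V  # U: (100, 10), V: (10, 500) -> (100, 500)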
Code example #9
def main():
    '''
    Run the unsupervised analysis of the corpus, using NMF to find latent
    topics. The user will be prompted to label each latent topic, then a few
    articles will be analyzed to see which topics they contain.
    '''
    # Load the corpus.
    df = load_file()
    contents = df.content
    web_urls = df.web_url

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # Find latent topics using our NMF model.
    factorizer = NMF(k=7, max_iters=35, alpha=0.5)
    W, H = factorizer.fit(X, verbose=True)

    hand_labels = hand_label_topics(H, vocabulary)
    rand_articles = np.random.choice(list(range(len(W))), 15)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)

    # Do it all again, this time using scikit-learn.
    nmf = NMF_sklearn(n_components=7,
                      max_iter=100,
                      random_state=12345,
                      alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_labels = hand_label_topics(H, vocabulary)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)
Code example #10
    def do_NMF(self,
               historical_purchase_matrix,
               product_df,
               get_top_products=False):

        nmf = NMF_sklearn(n_components=5, max_iter=450)
        W = nmf.fit_transform(
            historical_purchase_matrix
        )  # how much each customer belongs to each "topic"
        H = nmf.components_  # how much each item belongs to each "topic"

        W_df = pd.DataFrame(W)  # weights matrix only

        merged_df = pd.concat([
            self.feature_df.reset_index(drop=False),
            W_df.reset_index(drop=True)
        ],
                              axis=1)  # combine weights matrix with feature_df
        merged_df = merged_df.rename(
            columns={
                0: 'Consumer Rehab/Single Rolls',
                1: 'Education/Movement Professionals',
                2: 'Consumer Fitness',
                3: 'Marketing & Promo',
                4: 'CrossFit'
            })
        merged_df = merged_df.set_index('Customer ID')
        self.feature_df = merged_df

        if get_top_products:

            print("Here are the top products for %s topics" % 5)
            for topic in range(0, 5):
                indices = H[topic].argsort()[-25:]  # top 25 items for this topic
                print("\n")
                print(product_df['Name'][indices])

        return self.feature_df
Code example #11
def topic_model(df, cluster_num):
    content = df['clean_text']
    vectorizer = TfidfVectorizer(stop_words='english',
                                 max_features=5000,
                                 min_df=10)
    X = vectorizer.fit_transform(content)
    features = vectorizer.get_feature_names()
    nmf = NMF_sklearn(n_components=10,
                      max_iter=100,
                      random_state=12345,
                      alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_

    avg_sent = df['sent'].mean()
    avg_trump_sent = df[df['trump_mentioned_cnt'] > 0]['sent'].mean()
    avg_biden_sent = df[df['biden_mentioned_cnt'] > 0]['sent'].mean()
    print(
        f'Sentiment for Cluster {cluster_num} is: \n Average Sentiment: {avg_sent} \n Biden Sentiment: {avg_biden_sent} \n Trump Sentiment: {avg_trump_sent}'
    )
    for i, row in enumerate(H):
        top_ten = np.argsort(row)[::-1][:10]
        print(np.array(features)[top_ten])
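A portability note: TfidfVectorizer.get_feature_names() and NMF's alpha argument, both used above, were deprecated in scikit-learn 1.0 and removed in 1.2. On recent releases the equivalents are:

features = vectorizer.get_feature_names_out()  # replaces get_feature_names()
nmf = NMF_sklearn(n_components=10, max_iter=100, random_state=12345)
# regularization, if wanted, is now set via alpha_W / alpha_H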