from collections import Counter

# NMF_sklearn is assumed throughout these snippets to be sklearn's NMF imported
# under an alias. Note: the `alpha` keyword was removed from sklearn's NMF in
# version 1.2, so these snippets assume an older release.
from sklearn.decomposition import NMF as NMF_sklearn


def get_items_associated(historical_purchase_matrix, feature_df, product_df,
                         n_topics=5, max_iters=350,
                         n_churniest_topics=3, n_churniest_items=25):
    '''
    Get the products most associated with a particular customer group.
    I often use this for churn.

    Parameters
    ----------
    historical_purchase_matrix : array-like
        Customer-by-item purchase matrix to factorize.
    feature_df : pandas.DataFrame
        Slice of the customer dataframe whose integer index identifies the
        group of interest (e.g. churned customers).
    product_df : pandas.DataFrame
        Product lookup table with a 'Name' column.

    Returns
    -------
    List of (product name, count) tuples for the "most associated" items.
    '''
    group_idx = feature_df.index.astype(int)
    nmf = NMF_sklearn(n_components=n_topics, max_iter=max_iters, alpha=0.0)
    W = nmf.fit_transform(historical_purchase_matrix)  # how much each customer belongs to each "topic"
    H = nmf.components_  # how much each item belongs to each "topic"
    sums = W[group_idx].sum(axis=0)
    churniest_topics = sums.argsort()[-n_churniest_topics:]
    c = Counter()
    for topic in churniest_topics:
        indices = H[topic].argsort()[-50:]  # top 50 items for this topic
        for product in product_df['Name'][indices]:
            c[product] += 1
    return c.most_common(n_churniest_items)
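# --- Usage sketch (illustrative, not from the original source) ---
# Builds small hypothetical inputs: a customer-by-item purchase matrix,
# a `products_df` lookup, and a slice of customers treated as the churned group.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
purchases = rng.poisson(0.3, size=(200, 40))            # 200 customers x 40 items
products_df = pd.DataFrame({'Name': [f'item_{i}' for i in range(40)]})
customers_df = pd.DataFrame(index=range(200))           # stand-in customer table
churned_slice = customers_df.iloc[:50]                  # hypothetical churned customers

top_items = get_items_associated(purchases, churned_slice, products_df,
                                 n_topics=5, n_churniest_items=10)
print(top_items)  # [('item_...', count), ...]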
def do_nmf(X):
    nmf = NMF_sklearn(n_components=30, max_iter=100, random_state=34,
                      alpha=0.0, verbose=True)
    W = nmf.fit_transform(X)  # row (document/customer) loadings on each latent topic
    H = nmf.components_       # column (term/item) loadings on each latent topic
    print('reconstruction error:', nmf.reconstruction_err_)
    return W, H
def do_nmf_loop(V):
    rec_err_lst = []
    # NOTE: range(7, 8) only tries a single value of n_components; widen the
    # range to actually sweep over candidate topic counts.
    for c in range(7, 8):
        nmf = NMF_sklearn(n_components=c, max_iter=100, random_state=34,
                          alpha=.01, verbose=True)
        W = nmf.fit_transform(V)
        H = nmf.components_
        print('reconstruction error:', nmf.reconstruction_err_)
        rec_err_lst.append(nmf.reconstruction_err_)
    # W and H from the last iteration are returned along with all errors.
    return W, H, rec_err_lst
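# --- Sketch (illustrative): sweeping n_components to pick a topic count ---
# The loop above only tries a single value; this widens it and plots the
# reconstruction-error curve so an elbow can be read off. `V` is any
# non-negative matrix (e.g. a TF-IDF or purchase matrix).
import matplotlib.pyplot as plt

def sweep_components(V, k_values=range(2, 15)):
    errors = []
    for k in k_values:
        model = NMF_sklearn(n_components=k, max_iter=100, random_state=34)
        model.fit(V)
        errors.append(model.reconstruction_err_)
    plt.plot(list(k_values), errors, marker='o')
    plt.xlabel('n_components')
    plt.ylabel('reconstruction error')
    plt.show()
    return errors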
def main():
    '''
    Run the unsupervised analysis of the NYT corpus, using NMF to find latent
    topics. The user will be prompted to label each latent topic, then a few
    articles will be analyzed to see which topics they contain.
    '''
    # Load the corpus.
    df = pd.read_pickle("data/articles.pkl")
    contents = df.content
    web_urls = df.web_url

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # Find latent topics using our NMF model.
    factorizer = NMF(k=7, max_iters=35, alpha=0.5)
    W, H = factorizer.fit(X, verbose=True)

    # Label topics and analyze a few NYT articles.
    # Btw, if you haven't modified anything, the seven topics which should
    # pop out are (you should type these as the labels when prompted):
    # 1. "football",
    # 2. "arts",
    # 3. "baseball",
    # 4. "world news (middle eastern?)",
    # 5. "politics",
    # 6. "world news (war?)",
    # 7. "economics"
    hand_labels = hand_label_topics(H, vocabulary)
    rand_articles = np.random.choice(list(range(len(W))), 15)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)

    # Do it all again, this time using scikit-learn.
    nmf = NMF_sklearn(n_components=7, max_iter=100, random_state=12345, alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_labels = hand_label_topics(H, vocabulary)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)
def do_NMF(historical_purchase_matrix, product_df, get_top_products=True):
    nmf = NMF_sklearn(n_components=5, max_iter=450)
    W = nmf.fit_transform(historical_purchase_matrix)  # how much each customer belongs to each "topic"
    H = nmf.components_  # how much each item belongs to each "topic"
    if get_top_products:
        print("Here are the top products for %s topics" % 5)
        for topic in range(5):
            indices = H[topic].argsort()[-25:]  # 25 highest-weighted products
            print("\n")
            print(product_df['Name'][indices])
    return W, H
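# --- Sketch (illustrative): reading the factors returned above ---
# Reuses the hypothetical `purchases` and `products_df` from the earlier usage
# sketch; the variable names below are for illustration only.
W, H = do_NMF(purchases, products_df, get_top_products=False)
dominant_topic = W.argmax(axis=1)  # strongest topic for each customer
topic_strength = W.max(axis=1)     # how strongly each customer loads on it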
def main():
    '''
    Run the unsupervised analysis of the corpus, using NMF to find latent
    topics. The user will be prompted to label each latent topic, then a few
    documents will be analyzed to see which topics they contain.
    '''
    # Load the corpus.
    df = pd.read_pickle("data/articles.pkl")
    contents = df.content
    web_urls = df.web_url

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # Find latent topics using our NMF model.
    factorizer = NMF(k=7, max_iters=35, alpha=0.5)
    W, H = factorizer.fit(X, verbose=True)
    hand_labels = hand_label_topics(H, vocabulary)
    rand_articles = np.random.choice(list(range(len(W))), 15)
    for i in rand_articles:
        analyze_recipe(i, contents, W, hand_labels)

    # Do it all again, this time using scikit-learn.
    nmf = NMF_sklearn(n_components=7, max_iter=100, random_state=12345, alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_labels = hand_label_topics(H, vocabulary)
    for i in rand_articles:
        analyze_recipe(i, contents, W, hand_labels)
def train_text_features(self, series, n_components):
    series = series.fillna('unknown')
    vectorizer, vocabulary = self.build_text_vectorizer(series,
                                                        use_tfidf=True,
                                                        use_stemmer=True,
                                                        max_features=None)
    self.vectorizer = vectorizer
    self.vocabulary = vocabulary
    X = vectorizer(series)
    nmf = NMF_sklearn(n_components=n_components, max_iter=100, alpha=0.0)
    nmf_model = nmf.fit(X)
    self.nmf = nmf_model
    W = nmf.transform(X)   # document loadings on each topic
    H = nmf.components_    # term loadings on each topic
    self.W_train = W
    self.H_train = H
    prob_w = self.softmax(W, temperature=0.01)
    self.prob_w = prob_w
    hand_labels = self.hand_label_topics(H, vocabulary, prob_w, series)
    self.hand_labels = hand_labels
    return pd.DataFrame(W, columns=hand_labels)
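# --- Usage sketch (illustrative) ---
# Assumes the enclosing class (called `TopicModeler` here, a hypothetical name)
# also provides build_text_vectorizer, softmax, and hand_label_topics, and that
# `df['description']` is a free-text column.
modeler = TopicModeler()
topic_features = modeler.train_text_features(df['description'], n_components=8)
print(topic_features.head())  # one column per hand-labeled topic, one row per document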
def __init__(self, nr_neurons, nr_timepoints, nr_components, initialization, X):
    super(NMF, self).__init__()
    # X is factorized as U @ V, with U: (nr_neurons, nr_components) and
    # V: (nr_components, nr_timepoints).
    if initialization == 'random':
        # zeros_for_U = np.random.choice([0, 1], nr_neurons * nr_components,
        #                                p=[1 - 0.01, 0.01]).reshape((nr_neurons, nr_components))
        self.U = nn.Parameter(
            0.1 * torch.randn(nr_neurons, nr_components, requires_grad=True))
        self.V = nn.Parameter(
            torch.randn(nr_components, nr_timepoints, requires_grad=True))
    if initialization == 'NMF':
        X[X < 0] = 0  # sklearn's NMF requires non-negative input
        X = X.T
        model = NMF_sklearn(n_components=nr_components, init='nndsvd', random_state=7)
        self.V = nn.Parameter(
            torch.tensor(model.fit_transform(X).T, requires_grad=True, dtype=torch.float32))
        print(self.V.size())
        self.U = nn.Parameter(
            torch.tensor(model.components_.T, requires_grad=True, dtype=torch.float32))
        print(self.U.size())
    if initialization == 'PCA':
        X = X.T
        model = PCA(n_components=nr_components)
        self.V = nn.Parameter(
            torch.tensor(model.fit_transform(X).T, requires_grad=True, dtype=torch.float32))
        print(self.V.size())
        self.U = nn.Parameter(
            torch.tensor(model.components_.T, requires_grad=True, dtype=torch.float32))
        print(self.U.size())
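# --- Training sketch (illustrative; the module's forward/constraint code is not shown above) ---
# Fits U and V by gradient descent on the reconstruction error and clamps both
# factors to stay non-negative after each step. Assumes X has shape
# (nr_neurons, nr_timepoints), matching U @ V.
import torch

def fit_nmf_module(model, X, n_steps=500, lr=1e-2):
    X = torch.as_tensor(X, dtype=torch.float32)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for _ in range(n_steps):
        optimizer.zero_grad()
        loss = torch.mean((X - model.U @ model.V) ** 2)  # squared reconstruction error
        loss.backward()
        optimizer.step()
        with torch.no_grad():  # project back onto the non-negative orthant
            model.U.clamp_(min=0)
            model.V.clamp_(min=0)
    return model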
def main():
    '''
    Run the unsupervised NMF topic analysis on the loaded corpus: vectorize the
    articles, fit both the custom NMF and scikit-learn's NMF, hand-label the
    topics, and analyze a sample of articles.
    '''
    # Load the corpus.
    df = load_file()
    contents = df.content
    web_urls = df.web_url

    # Build our text-to-vector vectorizer, then vectorize our corpus.
    vectorizer, vocabulary = build_text_vectorizer(contents,
                                                   use_tfidf=True,
                                                   use_stemmer=False,
                                                   max_features=5000)
    X = vectorizer(contents)

    # We'd like to see consistent results, so set the seed.
    np.random.seed(12345)

    # Find latent topics using our NMF model.
    factorizer = NMF(k=7, max_iters=35, alpha=0.5)
    W, H = factorizer.fit(X, verbose=True)
    hand_labels = hand_label_topics(H, vocabulary)
    rand_articles = np.random.choice(list(range(len(W))), 15)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)

    # Do it all again, this time using scikit-learn.
    nmf = NMF_sklearn(n_components=7, max_iter=100, random_state=12345, alpha=0.0)
    W = nmf.fit_transform(X)
    H = nmf.components_
    print('reconstruction error:', nmf.reconstruction_err_)
    hand_labels = hand_label_topics(H, vocabulary)
    for i in rand_articles:
        analyze_article(i, contents, web_urls, W, hand_labels)
def do_NMF(self, historical_purchase_matrix, product_df, get_top_products=False):
    nmf = NMF_sklearn(n_components=5, max_iter=450)
    W = nmf.fit_transform(historical_purchase_matrix)  # how much each customer belongs to each "topic"
    H = nmf.components_  # how much each item belongs to each "topic"

    W_df = pd.DataFrame(W)  # weights matrix only
    # Combine the weights matrix with feature_df.
    merged_df = pd.concat([self.feature_df.reset_index(drop=False),
                           W_df.reset_index(drop=True)], axis=1)
    merged_df = merged_df.rename(columns={0: 'Consumer Rehab/Single Rolls',
                                          1: 'Education/Movement Professionals',
                                          2: 'Consumer Fitness',
                                          3: 'Marketing & Promo',
                                          4: 'CrossFit'})
    merged_df = merged_df.set_index('Customer ID')
    self.feature_df = merged_df

    if get_top_products:
        print("Here are the top products for %s topics" % 5)
        for topic in range(5):
            indices = H[topic].argsort()[-25:]  # 25 highest-weighted products
            print("\n")
            print(product_df['Name'][indices])
    return self.feature_df
def topic_model(df, cluster_num):
    content = df['clean_text']
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, min_df=10)
    X = vectorizer.fit_transform(content)
    features = vectorizer.get_feature_names()

    nmf = NMF_sklearn(n_components=10, max_iter=100, random_state=12345, alpha=0.0)
    W = nmf.fit_transform(X)  # document loadings on each topic
    H = nmf.components_       # term loadings on each topic

    avg_sent = df['sent'].mean()
    avg_trump_sent = df[df['trump_mentioned_cnt'] > 0]['sent'].mean()
    avg_biden_sent = df[df['biden_mentioned_cnt'] > 0]['sent'].mean()
    print(f'Sentiment for Cluster {cluster_num} is: \n'
          f' Average Sentiment: {avg_sent} \n'
          f' Biden Sentiment: {avg_biden_sent} \n'
          f' Trump Sentiment: {avg_trump_sent}')

    # Print the ten highest-weighted terms for each topic.
    for i, row in enumerate(H):
        top_ten = np.argsort(row)[::-1][:10]
        print(np.array(features)[top_ten])
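# --- Usage sketch (illustrative): running topic_model per cluster ---
# Assumes `df` carries 'clean_text', 'sent', 'trump_mentioned_cnt',
# 'biden_mentioned_cnt', and a cluster label column (the column name
# 'cluster' is a guess).
for cluster_num in sorted(df['cluster'].unique()):
    topic_model(df[df['cluster'] == cluster_num], cluster_num)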