def lsa_text_summarizer(documents, num_sentences=2, num_topics=2,
                        feature_type='frequency', sv_threshold=0.5):
    """Print an extractive summary of `documents` using Latent Semantic Analysis.

    Args:
        documents: sequence of sentence strings to rank and summarize.
        num_sentences: number of top-ranked sentences to print.
        num_topics: rank of the truncated SVD (number of latent topics).
        feature_type: feature scheme passed to `build_feature_matrix`
            (e.g. 'frequency' or 'tfidf').
        sv_threshold: singular values below this fraction of the largest
            singular value are zeroed out before scoring.

    Side effects: prints the selected sentences in original document order.
    """
    # Build the document-term matrix, then flip to term-document orientation
    # so columns correspond to sentences.
    vec, dt_matrix = build_feature_matrix(documents, feature_type=feature_type)
    td_matrix = dt_matrix.transpose()
    # Keep only strictly-positive weights (zeroes any non-positive entries).
    td_matrix = td_matrix.multiply(td_matrix > 0)
    # Truncated SVD: s holds singular values, rows of vt map topics -> sentences.
    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    # Suppress weak topics: zero singular values below the threshold fraction.
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0
    # Salience per sentence: sqrt(sum_k (sigma_k^2 * v_kj^2)) over topics k.
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    # Take the num_sentences highest-scoring sentences, then sort the indices
    # so the summary preserves the original sentence order.
    top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        # BUG FIX: the original printed `sentences[index]`, but `sentences` is
        # undefined inside this function — the ranked items are `documents`.
        print(documents[index])
def lsa_text_summarizer(documents, num_sentences=2, num_topics=2,
                        feature_type='frequency', sv_threshold=0.5):
    """Print an extractive LSA summary of `documents`.

    Ranks sentences by their weight in the truncated-SVD topic space and
    prints the `num_sentences` most salient ones in original order.

    Args:
        documents: sequence of sentence strings.
        num_sentences: how many sentences to print.
        num_topics: rank used for the low-rank SVD.
        feature_type: scheme forwarded to `build_feature_matrix`.
        sv_threshold: singular values under this fraction of the maximum
            are treated as noise and zeroed.
    """
    # Document-term features, transposed to term-document form
    # (columns = sentences); drop any non-positive weights.
    vec, dt_matrix = build_feature_matrix(documents, feature_type=feature_type)
    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)
    # Rank-`num_topics` SVD; zero out singular values below the threshold
    # so weak topics do not contribute to the salience scores.
    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0
    # Per-sentence salience: L2 norm of the sigma-weighted topic loadings.
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    # Select top sentences, then re-sort indices to keep document order.
    top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        # BUG FIXES: the original used a Python 2 print statement (a syntax
        # error under Python 3) and referenced an undefined global
        # `sentences`; the correct target is the input `documents`.
        print(documents[index])
# In[51]:
# LSI (latent semantic indexing) topic model over the normalized toy corpus.
norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                feature_type='tfidf')

# Work in term-document orientation and clamp out non-positive weights.
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)

total_topics = 2
# Truncated SVD of rank `total_topics`; the term-topic weight matrix is U
# scaled row-wise by the singular values.
u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics)
weights = u.transpose() * s[:, None]

# NOTE(review): `get_feature_names()` was removed from scikit-learn
# vectorizers in 1.2 in favor of `get_feature_names_out()` — confirm which
# API the vectorizer returned by `build_feature_matrix` exposes.
feature_names = vectorizer.get_feature_names()
topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics, total_topics=total_topics,
                 weight_threshold=0.15, display_weights=True)


# ### NMF

# In[55]:
# LSI custom built topic model from utils import build_feature_matrix, low_rank_svd norm_corpus = normalize_corpus(toy_corpus) vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus, feature_type='tfidf') td_matrix = tfidf_matrix.transpose() td_matrix = td_matrix.multiply(td_matrix > 0) total_topics = 2 feature_names = vectorizer.get_feature_names() u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics) weights = u.transpose() * s[:, None] def get_topics_terms_weights(weights, feature_names): feature_names = np.array(feature_names) sorted_indices = np.array([list(row[::-1]) for row in np.argsort(np.abs(weights))]) sorted_weights = np.array([list(wt[index]) for wt, index in zip(weights,sorted_indices)]) sorted_terms = np.array([list(feature_names[row]) for row in sorted_indices]) topics = [np.vstack((terms.T,