def all_summaries():
    ms = MongoStore()
    # Get only the 25 latest summaries for now.
    # Complete with pagination after integrating the categorization model
    # and other code being restored from the hard drive that crashed.
    sums = ms.get_summaries()
    return render_template('all.html', summaries=sums)
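# A minimal sketch of the pagination the TODO above mentions, assuming
# MongoStore wraps a pymongo database with a "summaries" collection and that
# summaries carry the "date" field set by summarize_clusters_lexrank() below.
# The function name and page/per_page parameters are hypothetical additions.
from pymongo import MongoClient, DESCENDING

def get_summaries_page(page=0, per_page=25):
    db = MongoClient().news  # assumed database name
    return list(db.summaries.find()
                .sort("date", DESCENDING)
                .skip(page * per_page)
                .limit(per_page))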
from twisted.internet.threads import deferToThread

from mongo import MongoStore


class SendToDataStorePipeline(object):

    def __init__(self):
        self.db = MongoStore()

    def process_item(self, item, spider):
        # Run the save to the db in a separate thread to prevent it from
        # blocking on single items.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        # Convert the scrapy Item to a plain dict before inserting.
        item_dict = dict(item)
        self.db.insert_article(item_dict)
        return item
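# For the pipeline above to run, it has to be enabled in the project's Scrapy
# settings. A typical settings.py entry looks like this; the module path
# "crawler.pipelines" is an assumption about this project's layout.
ITEM_PIPELINES = {
    'crawler.pipelines.SendToDataStorePipeline': 300,
}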
def summarize_clusters_lexrank():
    ms = MongoStore()
    clusters = ms.get_pending_clusters()
    for cluster in clusters:
        # Only summarize articles close enough to the cluster exemplar.
        retrieve_ids = [aid for aid, sim in cluster["articles"] if sim > .55]
        articles = ms.get_articles_from_ids(retrieve_ids)
        # Normalize curly quotes to plain ASCII quotes before summarizing.
        art_texts = [a["text_content"].replace(u"\u201D", u"\"")
                                      .replace(u"\u201C", u"\"")
                     for a in articles]
        summary = sumpy.lexrank(art_texts)
        # Keep the five highest-ranked sentences.
        sents = []
        for _, row in summary._df.head(5).iterrows():
            s = {"article_id": articles[row["doc id"]]["_id"],
                 "sentence_id": row["sent id"],
                 "text": row["sent text"]}
            sents.append(s)
        summary_map = {"sentences": sents,
                       "cluster_id": cluster["_id"],
                       "summary_type": "lexrank",
                       "date": datetime.now()}
        if ms.insert_summary(summary_map):
            ms.set_summarized_flag(cluster)
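# The MongoStore helpers used above are not shown in this section. A minimal
# sketch of what they might look like with pymongo; the database/collection
# names are assumptions, but the "summarized" flag mirrors the field set on
# clusters in cluster_articles() below.
from datetime import datetime
from pymongo import MongoClient

class MongoStore(object):
    def __init__(self):
        self.db = MongoClient().news  # assumed database name

    def get_pending_clusters(self):
        # Clusters that have not yet been summarized.
        return list(self.db.clusters.find({"summarized": False}))

    def insert_summary(self, summary_map):
        return self.db.summaries.insert_one(summary_map).acknowledged

    def set_summarized_flag(self, cluster):
        self.db.clusters.update_one({"_id": cluster["_id"]},
                                    {"$set": {"summarized": True}})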
def cluster_articles():
    ms = MongoStore()
    articles = [a for a in ms.get_pending_articles()]
    if len(articles) > 0:
        tfidf = TfidfVectorizer(tokenizer=preprocess)
        # Skip articles whose text extraction produced nothing.
        good_articles = [article for article in articles
                         if article["text_content"].strip() != ""]
        texts = [article["text_content"] for article in good_articles]
        X_tfidf = tfidf.fit_transform(texts)
        print X_tfidf
        ap = AffinityPropagation(damping=0.95, max_iter=4000,
                                 convergence_iter=400, copy=True,
                                 preference=-4, affinity='euclidean',
                                 verbose=True)
        C = ap.fit_predict(X_tfidf)
        print X_tfidf.shape, C.shape
        print C
        centers = ap.cluster_centers_indices_
        clusters = []
        for c, center in enumerate(centers):
            # Rank this cluster's members by cosine similarity to its
            # exemplar document.
            members = np.where(C == c)[0]
            K = cosine_similarity(X_tfidf[members], X_tfidf[center])
            member_sims = [(m, float(k)) for m, k in zip(members, K)]
            member_sims.sort(key=lambda x: x[1], reverse=True)
            cluster = {"articles": [],
                       "date": datetime.now(),
                       "summarized": False}
            # Keep only clusters with at least three sufficiently
            # similar members.
            if len([member for member, sim in member_sims if sim > .55]) >= 3:
                print texts[center][:75].replace("\n", " ")
                for member, sim in member_sims:
                    print "\t{:3.3f} ".format(sim),
                    print good_articles[member]["title"][:60].replace("\n", " ")
                    cluster["articles"].append(
                        (good_articles[member]["_id"], sim))
            else:
                continue
            clusters.append(cluster)
        if len(clusters) > 0:
            ms.insert_clusters(clusters)
            ms.set_clustered_flag(articles)
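# A small self-contained illustration of the sklearn API the function above
# relies on: fit_predict returns one cluster label per document, and
# cluster_centers_indices_ holds the index of each cluster's exemplar.
# The toy corpus is made up for illustration only.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

docs = ["stocks fell on wall street today",
        "markets dropped as stocks fell",
        "the home team won the game",
        "a late goal won the game for the home team"]
X = TfidfVectorizer().fit_transform(docs)
ap = AffinityPropagation(affinity='euclidean')
labels = ap.fit_predict(X.toarray())
for c, center in enumerate(ap.cluster_centers_indices_):
    members = np.where(labels == c)[0]
    # Similarity of each member to its exemplar, as in cluster_articles().
    sims = cosine_similarity(X[members], X[center]).ravel()
    print(docs[center])
    for m, s in zip(members.tolist(), sims.tolist()):
        print("  {:.2f} {}".format(s, docs[m]))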
import os
import sys

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Import local modules
module_path = os.path.dirname(os.path.realpath(__file__))
datastore_module = os.path.join(module_path, '..', 'datastore')
sys.path.append(datastore_module)
from mongo import MongoStore

ms = MongoStore()
articles = [a for a in ms.get_pending_articles()]
if len(articles) > 0:
    stemmer = PorterStemmer()
    stop = stopwords.words('english')

    def preprocess(text):
        # Lowercase, tokenize, drop stopwords, then Porter-stem.
        tokens = [tok for tok in word_tokenize(text.lower())
                  if tok not in stop]
        tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
        return tokens_stemmed

    tfidf = TfidfVectorizer(tokenizer=preprocess)
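# Example of what the preprocess tokenizer produces (stopwords removed,
# Porter-stemmed tokens), assuming the nltk 'punkt' and 'stopwords' data
# are installed and the branch above has run:
print(preprocess("The markets are falling sharply today"))
# -> ['market', 'fall', 'sharpli', 'today']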
import os
import sys

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import sumpy

# Import local modules
module_path = os.path.dirname(os.path.realpath(__file__))
datastore_module = os.path.join(module_path, '..', 'datastore')
sys.path.append(datastore_module)
from mongo import MongoStore

ms = MongoStore()
clusters = ms.get_pending_clusters()
for cluster in clusters:
    # Only summarize articles close enough to the cluster exemplar.
    retrieve_ids = [aid for aid, sim in cluster["articles"] if sim > .55]
    articles = ms.get_articles_from_ids(retrieve_ids)
    # Normalize curly quotes to plain ASCII quotes before summarizing.
    art_texts = [a["text_content"].replace(u"\u201D", u"\"")
                                  .replace(u"\u201C", u"\"")
                 for a in articles]
    summary = sumpy.lexrank(art_texts)
    # Keep the five highest-ranked sentences.
    sents = []
    for _, row in summary._df.head(5).iterrows():
        s = {"article_id": articles[row["doc id"]]["_id"],
             "sentence_id": row["sent id"],
             "text": row["sent text"]}
        sents.append(s)
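# get_articles_from_ids is assumed to fetch stored articles by _id. A minimal
# pymongo sketch of such a method for the MongoStore class above; it preserves
# the order of the requested ids, since a plain $in query does not, and the
# lexrank "doc id" column must line up with the articles list.
def get_articles_from_ids(self, ids):
    docs = {d["_id"]: d
            for d in self.db.articles.find({"_id": {"$in": ids}})}
    return [docs[i] for i in ids if i in docs]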