Example no. 1
def all_summaries():
    ms = MongoStore()
    # Get only the 25 latest summaries for now. Add pagination after
    # integrating the categorization model and other code being restored
    # from the hard drive that crashed.
    sums = ms.get_summaries()
    return render_template('all.html', summaries=sums)
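This reads as a Flask view function; a minimal sketch of the wiring it assumes is below. Only render_template and MongoStore appear in the original snippet, so the app object, the '/all' route path, and the location of the mongo module are assumptions.

# Hypothetical wiring for the view above; the Flask app object and the
# '/all' route are assumptions, not taken from the original snippet.
from flask import Flask, render_template
from mongo import MongoStore  # project-local datastore wrapper

app = Flask(__name__)

@app.route('/all')
def all_summaries():
    ms = MongoStore()
    sums = ms.get_summaries()
    return render_template('all.html', summaries=sums)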
Example no. 2
class SendToDataStorePipeline(object):
    def __init__(self):
        self.db = MongoStore()

    def process_item(self, item, spider):
        # Run the database save in a separate thread so it does not block
        # the crawl on single items.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):

        item_dict = dict(item)
        self.db.insert_article(item_dict)
        return item
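For context, the pipeline above returns the Deferred produced by Twisted's deferToThread from process_item, which lets the crawl continue while the save runs, and the pipeline itself has to be enabled in the Scrapy settings. A minimal sketch follows; the dotted module path is an assumption.

# Imports the pipeline snippet assumes; MongoStore is the project's local
# datastore wrapper shown in the later examples.
from twisted.internet.threads import deferToThread
from mongo import MongoStore

# settings.py -- register the pipeline; the 'newscrawler.pipelines' path is
# hypothetical, use the real module path of SendToDataStorePipeline.
ITEM_PIPELINES = {
    'newscrawler.pipelines.SendToDataStorePipeline': 300,
}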
Example no. 3
class SendToDataStorePipeline(object):

    def __init__(self):
        self.db = MongoStore()

    def process_item(self, item, spider):
        # Run the database save in a separate thread so it does not block
        # the crawl on single items.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):

        item_dict = dict(item)
        self.db.insert_article(item_dict)
        return item
Example no. 4
def summarize_clusters_lexrank():

    ms = MongoStore()

    clusters = ms.get_pending_clusters()
    for cluster in clusters:
        # Only pull articles that were close enough to the cluster exemplar.
        retrieve_ids = [aid for aid, sim in cluster["articles"] if sim > .55]
        articles = ms.get_articles_from_ids(retrieve_ids)
        # Normalize curly quotes to plain quotes before summarizing.
        art_texts = [
            a["text_content"].replace(u"\u201D",
                                      u"\"").replace(u"\u201C", u"\"")
            for a in articles
        ]

        summary = sumpy.lexrank(art_texts)
        sents = []

        # Keep the five top-ranked sentences from the LexRank output.
        for x, row in summary._df.head(5).iterrows():
            s = {
                "article_id": articles[row["doc id"]]["_id"],
                "sentence_id": row["sent id"],
                "text": row["sent text"]
            }
            sents.append(s)

        summary_map = {
            "sentences": sents,
            "cluster_id": cluster["_id"],
            "summary_type": "lexrank",
            "date": datetime.now()
        }

        if ms.insert_summary(summary_map):
            ms.set_summarized_flag(cluster)
Example no. 5
def summarize_clusters_lexrank():

    ms = MongoStore()

    clusters = ms.get_pending_clusters()
    for cluster in clusters:
        retrieve_ids = [aid for aid, sim in cluster["articles"] if sim > .55]
        articles = ms.get_articles_from_ids(retrieve_ids)
        art_texts = [a["text_content"].replace(u"\u201D", u"\"").replace(
            u"\u201C", u"\"") for a in articles]

        summary = sumpy.lexrank(art_texts)
        sents = []

        for x, row in summary._df.head(5).iterrows():
            s = {"article_id": articles[row["doc id"]]["_id"],
                 "sentence_id": row["sent id"],
                 "text": row["sent text"]}
            sents.append(s)

        summary_map = {"sentences": sents, "cluster_id": cluster["_id"],
                       "summary_type": "lexrank",
                       "date": datetime.now()}

        if ms.insert_summary(summary_map):
            ms.set_summarized_flag(cluster)
Example no. 6
def cluster_articles():
    ms = MongoStore()
    articles = [a for a in ms.get_pending_articles()]

    if len(articles) > 0:

        tfidf = TfidfVectorizer(tokenizer=preprocess)

        good_articles = [article for article in articles
                         if article["text_content"].strip() != ""]

        texts = [article["text_content"] for article in good_articles]

        X_tfidf = tfidf.fit_transform(texts)

        print X_tfidf

        ap = AffinityPropagation(damping=0.95, max_iter=4000,
                                 convergence_iter=400, copy=True, preference=-4,
                                 affinity='euclidean', verbose=True)

        C = ap.fit_predict(X_tfidf)
        print X_tfidf.shape, C.shape
        print C
        centers = ap.cluster_centers_indices_
        clusters = []
        for c, center in enumerate(centers):

            members = np.where(C == c)[0]
            K = cosine_similarity(X_tfidf[members], X_tfidf[center])
            member_sims = [(m, float(k)) for m, k in zip(members, K)]
            member_sims.sort(key=lambda x: x[1], reverse=True)

            cluster = {"articles": [], "date": datetime.now(), "summarized": False}

            if len([member for member, sim in member_sims if sim > .55]) >= 3:
                print texts[center][:75].replace("\n", " ")

                for member, sim in member_sims:

                    print "\t{:3.3f} ".format(sim),
                    print good_articles[member]["title"][:60].replace("\n", " ")
                    cluster["articles"].append((good_articles[member]["_id"], sim))
            else:
                continue

            clusters.append(cluster)

        if len(clusters) > 0:
            ms.insert_clusters(clusters)

    ms.set_clustered_flag(articles)
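As a quick sanity check of the AffinityPropagation calls used above, here is a toy run on made-up dense vectors; the data and parameter values are illustrative, only the scikit-learn API is the same.

# Toy example: fit_predict returns one cluster label per row, and
# cluster_centers_indices_ holds the row indices chosen as exemplars.
import numpy as np
from sklearn.cluster import AffinityPropagation

X = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
ap = AffinityPropagation(damping=0.9, preference=-1, affinity='euclidean')
labels = ap.fit_predict(X)
centers = ap.cluster_centers_indices_
print labels, centers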
Example no. 7
import os
import sys

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Import local modules
module_path = os.path.dirname(os.path.realpath(__file__))
datastore_module = os.path.join(module_path, '..', 'datastore')
sys.path.append(datastore_module)
from mongo import MongoStore

ms = MongoStore()
articles = [a for a in ms.get_pending_articles()]

if len(articles) > 0:

    stemmer = PorterStemmer()
    stop = stopwords.words('english')

    def preprocess(text):
        tokens = [
            tok for tok in word_tokenize(text.lower()) if tok not in stop
        ]
        tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
        return tokens_stemmed

    tfidf = TfidfVectorizer(tokenizer=preprocess)
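The preprocess tokenizer above needs the NLTK 'punkt' and 'stopwords' data to be available. Below is a small self-contained check of the tokenizer plus vectorizer wiring; the sample sentences are made up, everything else uses the same APIs.

# Assumes `import nltk; nltk.download('punkt'); nltk.download('stopwords')`
# has been run once. Sample texts are illustrative only.
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = PorterStemmer()
stop = stopwords.words('english')

def preprocess(text):
    # Lowercase, drop stopwords, then stem each remaining token.
    tokens = [tok for tok in word_tokenize(text.lower()) if tok not in stop]
    return [stemmer.stem(tok) for tok in tokens]

texts = ["Markets rallied after the announcement.",
         "The market rally continued on Tuesday."]
tfidf = TfidfVectorizer(tokenizer=preprocess)
X_tfidf = tfidf.fit_transform(texts)
print X_tfidf.shape  # (2 documents, number of distinct stemmed terms)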
Example no. 8
def cluster_articles():
    ms = MongoStore()
    articles = [a for a in ms.get_pending_articles()]

    if len(articles) > 0:

        tfidf = TfidfVectorizer(tokenizer=preprocess)

        good_articles = [
            article for article in articles
            if article["text_content"].strip() != ""
        ]

        texts = [article["text_content"] for article in good_articles]

        X_tfidf = tfidf.fit_transform(texts)

        print X_tfidf

        ap = AffinityPropagation(damping=0.95,
                                 max_iter=4000,
                                 convergence_iter=400,
                                 copy=True,
                                 preference=-4,
                                 affinity='euclidean',
                                 verbose=True)

        C = ap.fit_predict(X_tfidf)
        print X_tfidf.shape, C.shape
        print C
        centers = ap.cluster_centers_indices_
        clusters = []
        for c, center in enumerate(centers):

            # Articles assigned to this exemplar, with their cosine similarity
            # to the exemplar article's tf-idf vector, most similar first.
            members = np.where(C == c)[0]
            K = cosine_similarity(X_tfidf[members], X_tfidf[center])
            member_sims = [(m, float(k)) for m, k in zip(members, K)]
            member_sims.sort(key=lambda x: x[1], reverse=True)

            cluster = {
                "articles": [],
                "date": datetime.now(),
                "summarized": False
            }

            # Only keep clusters with at least three sufficiently similar members.
            if len([member for member, sim in member_sims if sim > .55]) >= 3:
                print texts[center][:75].replace("\n", " ")

                for member, sim in member_sims:

                    print "\t{:3.3f} ".format(sim),
                    print good_articles[member]["title"][:60].replace(
                        "\n", " ")
                    cluster["articles"].append(
                        (good_articles[member]["_id"], sim))
            else:
                continue

            clusters.append(cluster)

        if len(clusters) > 0:
            ms.insert_clusters(clusters)

        ms.set_clustered_flag(articles)
Example no. 9
import os
import sys

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Import local modules
module_path = os.path.dirname(os.path.realpath(__file__))
datastore_module = os.path.join(module_path, '..', 'datastore')
sys.path.append(datastore_module)
from mongo import MongoStore

ms = MongoStore()
articles = [a for a in ms.get_pending_articles()]

if len(articles) > 0:

    stemmer = PorterStemmer()
    stop = stopwords.words('english')

    def preprocess(text):
        tokens = [tok for tok in word_tokenize(text.lower())
                  if tok not in stop]
        tokens_stemmed = [stemmer.stem(tok) for tok in tokens]
        return tokens_stemmed    

    tfidf = TfidfVectorizer(tokenizer=preprocess)

Example no. 10
import os
import sys

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import sumpy

# Import local modules
module_path = os.path.dirname(os.path.realpath(__file__))
datastore_module = os.path.join(module_path, '..', 'datastore')
sys.path.append(datastore_module)
from mongo import MongoStore

ms = MongoStore()

clusters = ms.get_pending_clusters()
for cluster in clusters:
    retrieve_ids = [aid for aid, sim in cluster["articles"] if sim > .55]
    articles = ms.get_articles_from_ids(retrieve_ids)
    art_texts = [a["text_content"].replace(u"\u201D", u"\"").replace(
        u"\u201C", u"\"") for a in articles]

    summary = sumpy.lexrank(art_texts)
    sents = []
    for x, row in summary._df.head(5).iterrows():
        s = {"article_id": articles[row["doc id"]]["_id"],
             "sentence_id": row["sent id"],
             "text": row["sent text"]}
        sents.append(s)
Example no. 11
    def __init__(self):
        self.db = MongoStore()
Example no. 12
    def __init__(self):
        self.db = MongoStore()