Beispiel #1
0
def summarize_clusters_lexrank():
    """Create and store a LexRank summary for every pending cluster.

    For each cluster returned by ``MongoStore.get_pending_clusters()``:

    * the articles whose similarity score is above .55 are fetched,
    * curly double quotes in their text are replaced by plain ``"``,
    * ``sumpy.lexrank`` is run over the texts and the first five rows of
      the resulting frame become the summary sentences,
    * the summary is inserted, and only when the insert reports success is
      the cluster flagged as summarized.
    """
    store = MongoStore()

    for cluster in store.get_pending_clusters():
        # Each entry of cluster["articles"] is an (article id, similarity)
        # pair; keep only the ids that clear the similarity threshold.
        wanted_ids = []
        for article_id, similarity in cluster["articles"]:
            if similarity > .55:
                wanted_ids.append(article_id)
        articles = store.get_articles_from_ids(wanted_ids)

        # Normalize right/left curly double quotes to a straight quote.
        texts = []
        for article in articles:
            text = article["text_content"]
            text = text.replace(u"\u201D", u"\"")
            text = text.replace(u"\u201C", u"\"")
            texts.append(text)

        summary = sumpy.lexrank(texts)

        # "doc id" is used as a positional index back into `articles`, so
        # `articles` has to support integer indexing.
        top_rows = summary._df.head(5)
        sentences = [
            {
                "article_id": articles[row["doc id"]]["_id"],
                "sentence_id": row["sent id"],
                "text": row["sent text"]
            }
            for _, row in top_rows.iterrows()
        ]

        summary_map = {
            "sentences": sentences,
            "cluster_id": cluster["_id"],
            "summary_type": "lexrank",
            "date": datetime.now()
        }

        stored = store.insert_summary(summary_map)
        if stored:
            store.set_summarized_flag(cluster)
Beispiel #2
0
def all_summaries():
    """Render the ``all.html`` template with the summaries from the store."""
    # TODO: add pagination once the categorization model and the other code
    # being restored from the crashed hard drive are integrated. The original
    # note says only the 25 latest summaries are fetched for now; that limit
    # is not visible here, so presumably get_summaries() applies it - confirm.
    store = MongoStore()
    return render_template('all.html', summaries=store.get_summaries())
Beispiel #3
0
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Import local modules: the datastore package lives in a directory next to
# this file's directory, so it must be on sys.path before `mongo` is imported.
module_path = os.path.dirname(os.path.realpath(__file__))
datastore_module = os.path.join(module_path, '..', 'datastore')
sys.path.append(datastore_module)
from mongo import MongoStore

ms = MongoStore()
articles = list(ms.get_pending_articles())

if articles:

    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    # Membership is tested once per token, so use a set copy of the stop-word
    # list for O(1) lookups; `stop` itself is kept as the original list.
    stop_set = frozenset(stop)

    def preprocess(text):
        """Return the Porter stems of the non-stop-word tokens of *text*.

        The text is lower-cased and tokenized with ``word_tokenize``; tokens
        found in the English stop-word list are dropped before stemming.
        """
        return [
            stemmer.stem(tok)
            for tok in word_tokenize(text.lower())
            if tok not in stop_set
        ]

    tfidf = TfidfVectorizer(tokenizer=preprocess)
Beispiel #4
0
def cluster_articles():
    ms = MongoStore()
    articles = [a for a in ms.get_pending_articles()]

    if len(articles) > 0:

        tfidf = TfidfVectorizer(tokenizer=preprocess)

        good_articles = [
            article for article in articles
            if article["text_content"].strip() != ""
        ]

        texts = [article["text_content"] for article in good_articles]

        X_tfidf = tfidf.fit_transform(texts)

        print X_tfidf

        ap = AffinityPropagation(damping=0.95,
                                 max_iter=4000,
                                 convergence_iter=400,
                                 copy=True,
                                 preference=-4,
                                 affinity='euclidean',
                                 verbose=True)

        C = ap.fit_predict(X_tfidf)
        print X_tfidf.shape, C.shape
        print C
        centers = ap.cluster_centers_indices_
        clusters = []
        for c, center in enumerate(centers):

            members = np.where(C == c)[0]
            K = cosine_similarity(X_tfidf[members], X_tfidf[center])
            member_sims = [(m, float(k)) for m, k in zip(members, K)]
            member_sims.sort(key=lambda x: x[1], reverse=True)

            cluster = {
                "articles": [],
                "date": datetime.now(),
                "summarized": False
            }

            if len([member for member, sim in member_sims if sim > .55]) >= 3:
                print texts[center][:75].replace("\n", " ")

                for member, sim in member_sims:

                    print "\t{:3.3f} ".format(sim),
                    print good_articles[member]["title"][:60].replace(
                        "\n", " ")
                    cluster["articles"].append(
                        (good_articles[member]["_id"], sim))
            else:
                continue

            clusters.append(cluster)

        if len(clusters) > 0:
            ms.insert_clusters(clusters)

        ms.set_clustered_flag(articles)
Beispiel #5
0
 def __init__(self):
     """Create the MongoStore handle and keep it on ``self.db``."""
     store = MongoStore()
     self.db = store