Esempio n. 1
0
def cluster_articles():
    """Cluster the pending articles and store every sufficiently tight cluster.

    Pending articles are fetched from a ``MongoStore``.  Those with
    non-blank ``text_content`` are clustered; any resulting clusters are
    passed to ``insert_clusters``.  Every fetched article (blank or not) is
    then passed to ``set_clustered_flag``.

    Nothing is done, and nothing is flagged, when no article is pending.
    """
    ms = MongoStore()
    articles = list(ms.get_pending_articles())

    if not articles:
        return

    good_articles = [article for article in articles
                     if article["text_content"].strip() != ""]

    # With no usable text there is nothing to vectorize; skipping the
    # clustering step (instead of letting the vectorizer raise on an empty
    # corpus) ensures the batch is still flagged below.
    if good_articles:
        clusters = _find_article_clusters(good_articles)
        if clusters:
            ms.insert_clusters(clusters)

    ms.set_clustered_flag(articles)


def _find_article_clusters(good_articles, min_similarity=.55, min_similar=3):
    """Return cluster dicts for ``good_articles`` (all with non-blank text).

    The texts are TF-IDF vectorized and grouped with affinity propagation.
    A group is kept only when at least ``min_similar`` of its members have a
    cosine similarity to the group's center above ``min_similarity``.

    Each returned dict has the keys ``"articles"`` (a list of
    ``(article["_id"], similarity)`` tuples, most similar first),
    ``"date"`` (``datetime.now()``) and ``"summarized"`` (``False``).
    """
    texts = [article["text_content"] for article in good_articles]

    tfidf = TfidfVectorizer(tokenizer=preprocess)
    X_tfidf = tfidf.fit_transform(texts)

    # Debug output.  Each print call takes exactly one argument so that it
    # behaves the same as a Python 2 print statement and as the Python 3
    # print function.
    print(X_tfidf)

    ap = AffinityPropagation(damping=0.95, max_iter=4000,
                             convergence_iter=400, copy=True, preference=-4,
                             affinity='euclidean', verbose=True)

    C = ap.fit_predict(X_tfidf)
    print("{} {}".format(X_tfidf.shape, C.shape))
    print(C)

    clusters = []
    for c, center in enumerate(ap.cluster_centers_indices_):
        members = np.where(C == c)[0]

        # cosine_similarity yields one row per member with a single column;
        # flatten it so float() receives scalars rather than 1-element
        # arrays.
        K = cosine_similarity(X_tfidf[members], X_tfidf[center]).ravel()
        member_sims = sorted(((m, float(k)) for m, k in zip(members, K)),
                             key=lambda pair: pair[1], reverse=True)

        strong = sum(1 for _, sim in member_sims if sim > min_similarity)
        if strong < min_similar:
            continue

        print(texts[center][:75].replace("\n", " "))

        cluster = {"articles": [], "date": datetime.now(), "summarized": False}
        for member, sim in member_sims:
            title = good_articles[member]["title"][:60].replace("\n", " ")
            # One line per member: similarity score, then the title.
            print("\t{:3.3f}  ".format(sim) + title)
            cluster["articles"].append((good_articles[member]["_id"], sim))

        clusters.append(cluster)

    return clusters
Esempio n. 2
0
    # NOTE(review): this excerpt starts mid-function.  X_tfidf, C, ap, texts,
    # good_articles, ms and articles are defined above the visible lines;
    # presumably X_tfidf is a TF-IDF matrix of `texts`, C the per-row cluster
    # labels and ap the fitted clustering model -- confirm against the
    # missing part.  The print statements are Python 2 syntax.
    print X_tfidf.shape, C.shape
    print C
    centers = ap.cluster_centers_indices_
    clusters = []
    # c is the position of the center in `centers`; it is compared against
    # the values in C to pick that cluster's rows.
    for c, center in enumerate(centers):

        # Row indices whose label equals c.
        members = np.where(C == c)[0]
        # Similarity of every member row to the center row.
        K = cosine_similarity(X_tfidf[members], X_tfidf[center])
        member_sims = [(m, float(k)) for m, k in zip(members, K)]
        # Most similar members first.
        member_sims.sort(key=lambda x: x[1], reverse=True)

        cluster = {"articles": [], "date": datetime.now(), "summarized": False}

        # Keep the cluster only when at least 3 members have a similarity
        # above .55 to the center.
        if len([member for member, sim in member_sims if sim > .55]) >= 3:
            # Debug output: start of the center's text, then one line per
            # member (the trailing comma keeps score and title on one line).
            print texts[center][:75].replace("\n", " ")

            for member, sim in member_sims:

                print "\t{:3.3f} ".format(sim),
                print good_articles[member]["title"][:60].replace("\n", " ")
                cluster["articles"].append((good_articles[member]["_id"], sim))
        else:
            # Rejected cluster: skip the append below.
            continue

        clusters.append(cluster)

    # Only call the store when there is something to insert.
    if len(clusters) > 0:
        ms.insert_clusters(clusters)

    # Called with `articles`, not `good_articles`.
    ms.set_clustered_flag(articles)
Esempio n. 3
0
def cluster_articles():
    """Group pending articles into clusters and persist the tight ones.

    Steps, as performed below:

    1. Fetch the pending articles from a ``MongoStore``; return immediately
       if there are none.
    2. TF-IDF vectorize the ``text_content`` of the articles whose text is
       not blank and run affinity propagation on the result.
    3. For each cluster center, rank the cluster's members by cosine
       similarity to the center and keep the cluster only if at least
       three members score above 0.55.
    4. Pass the kept clusters (if any) to ``insert_clusters`` and pass all
       fetched articles to ``set_clustered_flag``.

    Each stored cluster is a dict with ``"articles"`` (list of
    ``(article["_id"], similarity)`` tuples, most similar first), ``"date"``
    (``datetime.now()``) and ``"summarized"`` (``False``).
    """
    similarity_floor = .55   # a member counts as "close" above this score
    required_close = 3       # minimum number of close members per cluster

    store = MongoStore()
    pending = list(store.get_pending_articles())
    if not pending:
        return

    with_text = [
        article for article in pending
        if article["text_content"].strip() != ""
    ]

    clusters = []

    # Skip vectorizing when every article is blank: fitting on an empty
    # corpus would raise before the batch gets flagged at the end of this
    # function.
    if with_text:
        texts = [article["text_content"] for article in with_text]

        vectorizer = TfidfVectorizer(tokenizer=preprocess)
        X_tfidf = vectorizer.fit_transform(texts)

        # Debug output; every print call gets a single argument so the
        # text is the same whether print is a statement or a function.
        print(X_tfidf)

        ap = AffinityPropagation(damping=0.95,
                                 max_iter=4000,
                                 convergence_iter=400,
                                 copy=True,
                                 preference=-4,
                                 affinity='euclidean',
                                 verbose=True)

        labels = ap.fit_predict(X_tfidf)
        print("{} {}".format(X_tfidf.shape, labels.shape))
        print(labels)

        for label, center in enumerate(ap.cluster_centers_indices_):
            members = np.where(labels == label)[0]

            # The similarity matrix has a single column; take that column
            # so each entry converts to float as a scalar instead of as a
            # 1-element array.
            sims = cosine_similarity(X_tfidf[members], X_tfidf[center])[:, 0]
            ranked = [(m, float(s)) for m, s in zip(members, sims)]
            ranked.sort(key=lambda pair: pair[1], reverse=True)

            close = sum(1 for _, sim in ranked if sim > similarity_floor)
            if close < required_close:
                continue

            print(texts[center][:75].replace("\n", " "))

            entries = []
            for member, sim in ranked:
                article = with_text[member]
                title = article["title"][:60].replace("\n", " ")
                print("\t{:3.3f}  ".format(sim) + title)
                entries.append((article["_id"], sim))

            clusters.append({
                "articles": entries,
                "date": datetime.now(),
                "summarized": False
            })

    if clusters:
        store.insert_clusters(clusters)

    # Flag every fetched article, including the blank ones that were
    # excluded from clustering.
    store.set_clustered_flag(pending)
Esempio n. 4
0
    # NOTE(review): this excerpt starts mid-function.  centers, C, X_tfidf,
    # texts, good_articles, ms and articles are defined above the visible
    # lines; presumably C holds per-row cluster labels and centers the row
    # indices of the cluster centers -- confirm against the missing part.
    # The print statements are Python 2 syntax.
    clusters = []
    # c is the position of the center in `centers`; it is compared against
    # the values in C to pick that cluster's rows.
    for c, center in enumerate(centers):

        
        # Row indices whose label equals c.
        members = np.where(C == c)[0]
        # Similarity of every member row to the center row.
        K = cosine_similarity(X_tfidf[members], X_tfidf[center])
        member_sims = [(m, float(k)) for m, k in zip(members, K)]
        # Most similar members first.
        member_sims.sort(key=lambda x: x[1], reverse=True)

        cluster = {"articles": [], "date": datetime.now(), "summarized": False}

        # Keep the cluster only when at least 3 members have a similarity
        # above .55 to the center.
        if len([member for member, sim in member_sims if sim > .55]) >= 3:
            # Debug output: start of the center's text, then one line per
            # member (the trailing comma keeps score and title on one line).
            print texts[center][:75].replace("\n", " ")

            for member, sim in member_sims:

                print "\t{:3.3f} ".format(sim), 
                print good_articles[member]["title"][:60].replace("\n", " ")
                cluster["articles"].append((good_articles[member]["_id"], sim))
        else:
            # Rejected cluster: skip the append below.
            continue
        
        clusters.append(cluster)

    # Only call the store when there is something to insert.
    if len(clusters) > 0:
        ms.insert_clusters(clusters)

    # Called with `articles`, not `good_articles`.
    ms.set_clustered_flag(articles)