def cluster_articles(): ms = MongoStore() articles = [a for a in ms.get_pending_articles()] if len(articles) > 0: tfidf = TfidfVectorizer(tokenizer=preprocess) good_articles = [article for article in articles if article["text_content"].strip() != ""] texts = [article["text_content"] for article in good_articles] X_tfidf = tfidf.fit_transform(texts) print X_tfidf ap = AffinityPropagation(damping=0.95, max_iter=4000, convergence_iter=400, copy=True, preference=-4, affinity='euclidean', verbose=True) C = ap.fit_predict(X_tfidf) print X_tfidf.shape, C.shape print C centers = ap.cluster_centers_indices_ clusters = [] for c, center in enumerate(centers): members = np.where(C == c)[0] K = cosine_similarity(X_tfidf[members], X_tfidf[center]) member_sims = [(m, float(k)) for m, k in zip(members, K)] member_sims.sort(key=lambda x: x[1], reverse=True) cluster = {"articles": [], "date": datetime.now(), "summarized": False} if len([member for member, sim in member_sims if sim > .55]) >= 3: print texts[center][:75].replace("\n", " ") for member, sim in member_sims: print "\t{:3.3f} ".format(sim), print good_articles[member]["title"][:60].replace("\n", " ") cluster["articles"].append((good_articles[member]["_id"], sim)) else: continue clusters.append(cluster) if len(clusters) > 0: ms.insert_clusters(clusters) ms.set_clustered_flag(articles)
print X_tfidf.shape, C.shape print C centers = ap.cluster_centers_indices_ clusters = [] for c, center in enumerate(centers): members = np.where(C == c)[0] K = cosine_similarity(X_tfidf[members], X_tfidf[center]) member_sims = [(m, float(k)) for m, k in zip(members, K)] member_sims.sort(key=lambda x: x[1], reverse=True) cluster = {"articles": [], "date": datetime.now(), "summarized": False} if len([member for member, sim in member_sims if sim > .55]) >= 3: print texts[center][:75].replace("\n", " ") for member, sim in member_sims: print "\t{:3.3f} ".format(sim), print good_articles[member]["title"][:60].replace("\n", " ") cluster["articles"].append((good_articles[member]["_id"], sim)) else: continue clusters.append(cluster) if len(clusters) > 0: ms.insert_clusters(clusters) ms.set_clustered_flag(articles)
def cluster_articles(): ms = MongoStore() articles = [a for a in ms.get_pending_articles()] if len(articles) > 0: tfidf = TfidfVectorizer(tokenizer=preprocess) good_articles = [ article for article in articles if article["text_content"].strip() != "" ] texts = [article["text_content"] for article in good_articles] X_tfidf = tfidf.fit_transform(texts) print X_tfidf ap = AffinityPropagation(damping=0.95, max_iter=4000, convergence_iter=400, copy=True, preference=-4, affinity='euclidean', verbose=True) C = ap.fit_predict(X_tfidf) print X_tfidf.shape, C.shape print C centers = ap.cluster_centers_indices_ clusters = [] for c, center in enumerate(centers): members = np.where(C == c)[0] K = cosine_similarity(X_tfidf[members], X_tfidf[center]) member_sims = [(m, float(k)) for m, k in zip(members, K)] member_sims.sort(key=lambda x: x[1], reverse=True) cluster = { "articles": [], "date": datetime.now(), "summarized": False } if len([member for member, sim in member_sims if sim > .55]) >= 3: print texts[center][:75].replace("\n", " ") for member, sim in member_sims: print "\t{:3.3f} ".format(sim), print good_articles[member]["title"][:60].replace( "\n", " ") cluster["articles"].append( (good_articles[member]["_id"], sim)) else: continue clusters.append(cluster) if len(clusters) > 0: ms.insert_clusters(clusters) ms.set_clustered_flag(articles)
clusters = [] for c, center in enumerate(centers): members = np.where(C == c)[0] K = cosine_similarity(X_tfidf[members], X_tfidf[center]) member_sims = [(m, float(k)) for m, k in zip(members, K)] member_sims.sort(key=lambda x: x[1], reverse=True) cluster = {"articles": [], "date": datetime.now(), "summarized": False} if len([member for member, sim in member_sims if sim > .55]) >= 3: print texts[center][:75].replace("\n", " ") for member, sim in member_sims: print "\t{:3.3f} ".format(sim), print good_articles[member]["title"][:60].replace("\n", " ") cluster["articles"].append((good_articles[member]["_id"], sim)) else: continue clusters.append(cluster) if len(clusters) > 0: ms.insert_clusters(clusters) ms.set_clustered_flag(articles)