Beispiel #1
0
                    '$sum': 1
                }
            }
        }]))
    urls = [source['_id'] for source in unq_sources]
    counts = [source['count'] for source in unq_sources]
    flagged_sources = []
    for url in urls:
        articles = list(
            db.articles.find({'sourceUrl': url}, {
                'keywords': 1,
                '_id': 0
            }))
        keywords = [a['keywords'] for a in articles]
        if len(articles) > 10:
            a_dist = ArticleDistance(keywords)
            dists = a_dist.article_pdist()
            n = dists.shape[0]
            dists[range(n), range(n)] = 0
            d = distance.squareform(dists)
            cluster = Clustering(d)
            c = cluster.get_clusters_at(1)
            cluster_count = len(np.unique(c))
            if cluster_count / n < 0.5:
                flagged_sources.append(url)
    db.sources.update_many({'url': {
        '$in': flagged_sources
    }}, {'$set': {
        'flagged': 1
    }})