def test_elastic_hash(self):
    """Can we reproduce a hash from elastic data alone?

    Creates an article with deliberately awkward content (non-ASCII
    section, control characters \x0C and \x08, None fields), indexes it,
    and checks that the hash stored in elastic matches both the hash
    computed at index time and a hash recomputed purely from the
    elastic document.
    """
    article = Article(**{
        "date": datetime.date(2015, 1, 1),
        "section": "\u6f22\u5b57",
        "pagenr": 1928390,
        "headline": "Headline hier.",
        "byline": "byline..",
        "length": 1928,
        "metastring": "Even more strange characters.. \x0C ..",
        "url": "https://example.com",
        "externalid": None,
        "author": None,
        "addressee": "Hmm",
        "text": "Contains invalid char \x08 woo",
        "medium": create_test_medium(name="abc."),
        "project": create_test_project()
    })
    article.save()

    es = ES()
    es.add_articles([article.id])
    # Renamed from `hash` to avoid shadowing the builtin of the same name.
    expected_hash = get_article_dict(article)["hash"]
    es.flush()

    es_articles = es.query_all(filters={"ids": [article.id]},
                               fields=HASH_FIELDS + ["hash"])
    # Take the first (and only) hit without materializing the whole result.
    es_article = next(iter(es_articles))

    self.assertEqual(article.id, es_article.id)
    self.assertEqual(expected_hash, es_article.hash)
    # The hash must be reproducible from the ES document alone.
    self.assertEqual(_get_hash(es_article.to_dict()), expected_hash)
def handle(self, *args, **options):
    """Recompute and store the elastic `hash` field for every article.

    Iterates over all article ids in groups of GROUP_SIZE, re-fetches the
    hash-relevant fields from elastic, and bulk-updates each document's
    hash, printing progress and throughput along the way.
    """
    es = amcates.ES()

    print("Counting articles..", end=" ")
    sys.stdout.flush()
    narticles = es.count(query="*", filters={})
    print(narticles)

    then, now = datetime.datetime.now(), datetime.datetime.now()
    for i, article_ids in enumerate(grouper(es.query_ids(), n=GROUP_SIZE)):
        # max(..., 1) guards against division by zero: count() and
        # query_ids() can disagree if articles change concurrently.
        progress = (float(i * GROUP_SIZE) / float(max(narticles, 1))) * 100
        print("{} of {} ({:.2f}%)".format(i * GROUP_SIZE, narticles, progress))

        # Re-fetch only the fields that feed into the hash, then push the
        # recomputed hashes back in a single bulk call.
        es_articles = es.query_all(filters={"ids": article_ids}, fields=HASH_FIELDS)
        es.bulk_update_values(
            {a.id: {"hash": _get_hash(a.to_dict())} for a in es_articles})

        then, now = now, datetime.datetime.now()
        elapsed = (now - then).total_seconds()
        print("Articles per second: ", end="")
        # A batch can complete within clock resolution; avoid ZeroDivisionError.
        print(int(GROUP_SIZE / elapsed) if elapsed else "inf")

    print("Done.")
def handle(self, *args, **options):
    """Backfill the elastic `hash` field for all indexed articles.

    Walks every article id in batches of GROUP_SIZE, pulls the
    hash-relevant fields back out of elastic, and writes a freshly
    computed hash for each document, reporting progress and throughput
    as it goes.
    """
    es = amcates.ES()

    print("Counting articles..", end=" ")
    sys.stdout.flush()
    total = es.count(query="*", filters={})
    print(total)

    previous_tick = datetime.datetime.now()
    for batch_nr, id_batch in enumerate(grouper(es.query_ids(), n=GROUP_SIZE)):
        done = batch_nr * GROUP_SIZE
        print("{} of {} ({:.2f}%)".format(done, total, (float(done) / float(total)) * 100))

        # Fetch only the hash-relevant fields, then push all recomputed
        # hashes back in one bulk call.
        fetched = es.query_all(filters={"ids": id_batch}, fields=HASH_FIELDS)
        updates = {}
        for doc in fetched:
            updates[doc.id] = {"hash": _get_hash(doc.to_dict())}
        es.bulk_update_values(updates)

        current_tick = datetime.datetime.now()
        seconds = (current_tick - previous_tick).total_seconds()
        previous_tick = current_tick
        print("Articles per second: ", end="")
        print(int(GROUP_SIZE / seconds))

    print("Done.")