async def test_clear_scroll(self, async_client):
    bulk = []
    for x in range(4):
        bulk.append({"index": {"_index": "test_index"}})
        bulk.append({"value": x})
    await async_client.bulk(bulk, refresh=True)

    with patch.object(
        async_client, "clear_scroll", wraps=async_client.clear_scroll
    ) as spy:
        _ = [
            x
            async for x in helpers.async_scan(
                async_client, index="test_index", size=2
            )
        ]
        spy.assert_called_once()

        spy.reset_mock()
        _ = [
            x
            async for x in helpers.async_scan(
                async_client, index="test_index", size=2, clear_scroll=True
            )
        ]
        spy.assert_called_once()

        spy.reset_mock()
        _ = [
            x
            async for x in helpers.async_scan(
                async_client, index="test_index", size=2, clear_scroll=False
            )
        ]
        spy.assert_not_called()
async def test_scroll_error(self, async_client):
    bulk = []
    for x in range(4):
        bulk.append({"index": {"_index": "test_index"}})
        bulk.append({"value": x})
    await async_client.bulk(bulk, refresh=True)

    with patch.object(async_client, "scroll", MockScroll()):
        data = [
            x
            async for x in helpers.async_scan(
                async_client,
                index="test_index",
                size=2,
                raise_on_error=False,
                clear_scroll=False,
            )
        ]
        assert len(data) == 3
        assert data[-1] == {"scroll_data": 42}

    with patch.object(async_client, "scroll", MockScroll()):
        with pytest.raises(ScanError):
            data = [
                x
                async for x in helpers.async_scan(
                    async_client,
                    index="test_index",
                    size=2,
                    raise_on_error=True,
                    clear_scroll=False,
                )
            ]
        assert len(data) == 3
        assert data[-1] == {"scroll_data": 42}
async def test_initial_search_error(self, async_client):
    with patch.object(async_client, "clear_scroll", new_callable=AsyncMock):
        with patch.object(
            async_client,
            "search",
            MockResponse(
                {
                    "_scroll_id": "dummy_id",
                    "_shards": {"successful": 4, "total": 5, "skipped": 0},
                    "hits": {"hits": [{"search_data": 1}]},
                }
            ),
        ):
            with patch.object(async_client, "scroll", MockScroll()):
                data = [
                    x
                    async for x in helpers.async_scan(
                        async_client,
                        index="test_index",
                        size=2,
                        raise_on_error=False,
                    )
                ]
                assert data == [{"search_data": 1}, {"scroll_data": 42}]

        with patch.object(
            async_client,
            "search",
            MockResponse(
                {
                    "_scroll_id": "dummy_id",
                    "_shards": {"successful": 4, "total": 5, "skipped": 0},
                    "hits": {"hits": [{"search_data": 1}]},
                }
            ),
        ):
            with patch.object(async_client, "scroll", MockScroll()) as mock_scroll:
                with pytest.raises(ScanError):
                    data = [
                        x
                        async for x in helpers.async_scan(
                            async_client,
                            index="test_index",
                            size=2,
                            raise_on_error=True,
                        )
                    ]
                    assert data == [{"search_data": 1}]
                    assert mock_scroll.calls == []
async def async_scan_types() -> None:
    async for _ in async_scan(
        es,
        query={"query": {"match_all": {}}},
        request_timeout=10,
        clear_scroll=True,
        scroll_kwargs={"request_timeout": 10},
    ):
        pass
    async for _ in async_scan(
        es,
        raise_on_error=False,
        preserve_order=False,
        scroll="10m",
        size=10,
        request_timeout=10.0,
    ):
        pass
async def scan(self, index, query):
    _generator = helpers.async_scan(
        self._client,
        query=query,
        index=index,
    )
    async for doc in _generator:
        yield doc
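# A minimal, self-contained sketch of how a re-yielding wrapper like the "scan" method
# above might be consumed. The "Searcher" class name, the index name, and the query are
# assumptions for illustration only and are not part of the original snippet.
import asyncio

from elasticsearch import AsyncElasticsearch, helpers


class Searcher:
    def __init__(self, client):
        self._client = client

    # The wrapper above, repeated here so the sketch runs on its own.
    async def scan(self, index, query):
        async for doc in helpers.async_scan(self._client, query=query, index=index):
            yield doc


async def demo():
    client = AsyncElasticsearch(["http://localhost:9200"])  # assumed local cluster
    try:
        hits = [
            doc
            async for doc in Searcher(client).scan("my-index", {"query": {"match_all": {}}})
        ]
        print("fetched", len(hits), "documents")
    finally:
        await client.close()


asyncio.run(demo())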
async def main():
    async for doc in async_scan(
        client=es,
        query={"query": {"match": {"title": "python"}}},
        index="orders-*",
    ):
        print(doc)
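# A hedged sketch of the boilerplate the coroutine above assumes: an AsyncElasticsearch
# client bound to the name "es", the async_scan import, and an event loop to drive
# main(). The host URL is an assumption for illustration.
import asyncio

from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_scan

es = AsyncElasticsearch(["http://localhost:9200"])  # assumed local cluster


async def run():
    try:
        await main()  # the coroutine defined above
    finally:
        await es.close()  # always release the client's connections


if __name__ == "__main__":
    asyncio.run(run())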
async def test_logger(self, logger_mock, async_client):
    bulk = []
    for x in range(4):
        bulk.append({"index": {"_index": "test_index"}})
        bulk.append({"value": x})
    await async_client.bulk(bulk, refresh=True)

    with patch.object(async_client, "scroll", MockScroll()):
        _ = [
            x
            async for x in helpers.async_scan(
                async_client,
                index="test_index",
                size=2,
                raise_on_error=False,
                clear_scroll=False,
            )
        ]
        logger_mock.warning.assert_called()

    with patch.object(async_client, "scroll", MockScroll()):
        try:
            _ = [
                x
                async for x in helpers.async_scan(
                    async_client,
                    index="test_index",
                    size=2,
                    raise_on_error=True,
                    clear_scroll=False,
                )
            ]
        except ScanError:
            pass
        logger_mock.warning.assert_called_with(
            "Scroll request has only succeeded on %d (+%d skipped) shards out of %d.",
            4,
            0,
            5,
        )
async def test_no_scroll_id_fast_route(self, async_client, scan_teardown):
    with patch.object(async_client, "search", MockResponse({"no": "_scroll_id"})):
        with patch.object(async_client, "scroll") as scroll_mock:
            with patch.object(async_client, "clear_scroll") as clear_mock:
                data = [
                    x
                    async for x in helpers.async_scan(async_client, index="test_index")
                ]
                assert data == []
                scroll_mock.assert_not_called()
                clear_mock.assert_not_called()
async def test_all_documents_are_read(self, async_client):
    bulk = []
    for x in range(100):
        bulk.append({"index": {"_index": "test_index", "_id": x}})
        bulk.append({"answer": x, "correct": x == 42})
    await async_client.bulk(bulk, refresh=True)

    docs = [
        x
        async for x in helpers.async_scan(async_client, index="test_index", size=2)
    ]

    assert 100 == len(docs)
    assert set(map(str, range(100))) == set(d["_id"] for d in docs)
    assert set(range(100)) == set(d["_source"]["answer"] for d in docs)
async def keywordSearch(keywords, myindex):
    """Query all documents in an index."""
    mysearch = {
        "query": {
            "match_all": {},
            # "constant_score": {"filter": {"term": {"cityid": keywords}}}
        },
        "size": 10000,
    }

    # Direct query (without the scan helper):
    # res = await es.search(index=myindex, body=mysearch)
    # total = res["hits"]["total"]["value"]
    # resc = [item for item in res["hits"]["hits"]]
    # searchres = []
    # for item in resc:
    #     tmp = item["_source"]
    #     searchres.append((tmp["hospitalid"], tmp["hospitalname"]))
    # print("Found %d documents in total" % total)
    # print(f"Results: {searchres} ({len(searchres)})")

    # Query via helpers.async_scan
    res = []
    async for doc in helpers.async_scan(
        client=es,
        query=mysearch,
        scroll="5m",  # keep each scroll context alive in ES for 5 minutes before it is discarded
        index=myindex,
        timeout="10m",
    ):
        res.append(doc)

    searchres = []
    for item in res:
        tmp = item["_source"]
        searchres.append((tmp["hospitalid"], tmp["hospitalname"]))
    print("Found %d documents in total" % len(res))
    print(searchres)
async def test_order_can_be_preserved(self, async_client, scan_teardown):
    bulk = []
    for x in range(100):
        bulk.append({"index": {"_index": "test_index", "_id": x}})
        bulk.append({"answer": x, "correct": x == 42})
    await async_client.bulk(bulk, refresh=True)

    docs = [
        doc
        async for doc in helpers.async_scan(
            async_client,
            index="test_index",
            query={"sort": "answer"},
            preserve_order=True,
        )
    ]

    assert 100 == len(docs)
    assert list(map(str, range(100))) == list(d["_id"] for d in docs)
    assert list(range(100)) == list(d["_source"]["answer"] for d in docs)
async def query(
    session: plugins.session.SessionObject,
    query_defuzzed,
    query_limit=10000,
    shorten=False,
):
    """Advanced query and grab for stats.py"""
    docs = []
    hits = 0
    assert session.database, "Database not connected!"
    async for hit in async_scan(
        client=session.database.client,
        query={
            "query": {"bool": query_defuzzed},
            "sort": [{"epoch": {"order": "desc"}}],
        },
    ):
        doc = hit["_source"]
        doc["id"] = doc["mid"]
        if plugins.aaa.can_access_email(session, doc):
            if not session.credentials:
                doc = anonymize(doc)
            if shorten:
                doc["body"] = (doc["body"] or "")[:200]
            trim_email(doc)
            docs.append(doc)
            hits += 1
            if hits > query_limit:
                break
    return docs
async def main():
    print("Welcome to the Apache Pony Mail -> Foal migrator.")
    print("This will copy your old database, adjust the structure, and insert the emails into your new foal database.")
    print("------------------------------------")
    old_es_url = input("Enter the full URL (including http/https) of your old ES server: ") or "http://localhost:9200/"
    new_es_url = input("Enter the full URL (including http/https) of your NEW ES server: ") or "http://localhost:9200/"
    if old_es_url == new_es_url:
        print("Old and new DB should not be the same, assuming error in input and exiting!")
        return
    old_es = AsyncElasticsearch([old_es_url])
    new_es = AsyncElasticsearch([new_es_url])

    old_dbname = input("What is the database name for the old Pony Mail emails? [ponymail]: ") or "ponymail"
    new_dbprefix = input("What is the database prefix for the new Pony Mail emails? [ponymail]: ") or "ponymail"

    do_dkim = True
    dkim_txt = input(
        "Do you wish to perform DKIM re-indexing of all emails? This will still preserve old permalinks "
        "(y/n) [y]: "
    ) or "y"
    if dkim_txt.lower() == "n":
        do_dkim = False

    # Define index names for new ES
    dbname_mbox = new_dbprefix + "-mbox"
    dbname_source = new_dbprefix + "-source"
    dbname_attachment = new_dbprefix + "-attachment"

    # Let's get started..!
    start_time = time.time()
    now = start_time
    processed = 0
    count = await old_es.count(index=old_dbname, doc_type="mbox")
    no_emails = count["count"]
    print("------------------------------------")
    print("Starting migration of %u emails, this may take quite a while..." % no_emails)

    bulk_array = []

    async for doc in async_scan(
        client=old_es,
        query={"query": {"match_all": {}}},
        doc_type="mbox",
        index=old_dbname,
    ):
        list_id = doc["_source"]["list_raw"].strip("<>")
        try:
            source = await old_es.get(index=old_dbname, doc_type="mbox_source", id=doc["_id"])
        # If we hit a 404 on a source, we have to fake an empty document, as we don't know the source.
        except:
            print("Source for %s was not found, faking it..." % doc["_id"])
            source = {"_source": {"source": ""}}
        source_text: str = source["_source"]["source"]
        if ":" not in source_text:  # Base64
            source_text = base64.b64decode(source_text)
        else:  # bytify
            source_text = source_text.encode("utf-8", "ignore")
        if do_dkim:
            dkim_id = generators.dkimid(None, None, list_id, None, source_text)
            old_id = doc["_id"]
            doc["_source"]["mid"] = dkim_id
            doc["_source"]["permalinks"] = [dkim_id, old_id]
        else:
            doc["_source"]["permalinks"] = [doc["_id"]]

        source["_source"]["permalinks"] = doc["_source"]["permalinks"]
        doc["_source"]["dbid"] = hashlib.sha3_256(source_text).hexdigest()

        # Append migration details to notes field in doc
        notes = doc["_source"].get("_notes", [])
        # We want a list, not a single string
        if isinstance(notes, str):
            notes = [notes]
        notes.append(
            "MIGRATE: Document migrated from Pony Mail to Pony Mail Foal at %u, "
            "using foal migrator v/%s" % (now, MIGRATION_MAGIC_NUMBER)
        )
        # If we re-indexed the document, make a note of that as well.
        if do_dkim:
            notes.append(
                "REINDEX: Document re-indexed with DKIM_ID at %u, "
                "from %s to %s" % (now, dkim_id, old_id)
            )
        doc["_source"]["_notes"] = notes

        # Copy to new DB
        bulk_array.append({"index": dbname_mbox, "id": doc["_id"], "body": doc["_source"]})
        bulk_array.append({"index": dbname_source, "id": doc["_source"]["dbid"], "body": source["_source"]})

        if len(bulk_array) > 100:
            await bulk_push(bulk_array, new_es)
            bulk_array[:] = []

        processed += 1
        if processed % 500 == 0:
            now = time.time()
            time_spent = now - start_time
            docs_per_second = processed / time_spent
            time_left = (no_emails - processed) / docs_per_second

            # stringify time left
            time_left_str = "%u seconds" % time_left
            if time_left > 60:
                time_left_str = "%u minute(s), %u second(s)" % (int(time_left / 60), time_left % 60)
            if time_left > 3600:
                time_left_str = "%u hour(s), %u minute(s), %u second(s)" % (
                    int(time_left / 3600),
                    int(time_left % 3600 / 60),
                    time_left % 60,
                )

            print(
                "Processed %u emails, %u remain. ETA: %s (at %u emails per second)"
                % (processed, (no_emails - processed), time_left_str, docs_per_second)
            )

    # There may be some docs left over to push
    if bulk_array:
        await bulk_push(bulk_array, new_es)

    start_time = time.time()
    processed = 0
    count = await old_es.count(index=old_dbname, doc_type="attachment")
    no_att = count["count"]
    print("Transferring %u attachments..." % no_att)
    async for doc in async_scan(
        client=old_es,
        query={"query": {"match_all": {}}},
        doc_type="attachment",
        index=old_dbname,
    ):
        # Copy to new DB
        await new_es.index(index=dbname_attachment, doc_type="_doc", id=doc["_id"], body=doc["_source"])
        processed += 1
        if processed % 500 == 0:
            now = time.time()
            time_spent = now - start_time
            docs_per_second = processed / time_spent
            time_left = (no_att - processed) / docs_per_second

            # stringify time left
            time_left_str = "%u seconds" % time_left
            if time_left > 60:
                time_left_str = "%u minute(s), %u second(s)" % (int(time_left / 60), time_left % 60)
            if time_left > 3600:
                time_left_str = "%u hour(s), %u minute(s), %u second(s)" % (
                    int(time_left / 3600),
                    int(time_left % 3600 / 60),
                    time_left % 60,
                )

            print(
                "Processed %u emails, %u remain. ETA: %s (at %u attachments per second)"
                % (processed, (no_att - processed), time_left_str, docs_per_second)
            )

    await old_es.close()
    await new_es.close()
    print("All done, enjoy!")
async def get_public_activity(database: plugins.configuration.DBConfig) -> dict:
    """
    :param database: a PyPony database configuration
    :return: A dictionary with activity stats
    """
    client = AsyncElasticsearch(
        [
            {
                "host": database.hostname,
                "port": database.port,
                "url_prefix": database.url_prefix or "",
                "use_ssl": database.secure,
            },
        ]
    )

    # Fetch aggregations of all public emails
    s = (
        Search(using=client, index=database.db_prefix + "-mbox")
        .query("match", private=False)
        .filter("range", date={"lt": "now+1d", "gt": "now-14d"})
    )
    s.aggs.bucket("number_of_lists", "cardinality", field="list_raw")
    s.aggs.bucket("number_of_senders", "cardinality", field="from_raw")
    s.aggs.bucket("daily_emails", "date_histogram", field="date", calendar_interval="1d")

    res = await client.search(index=database.db_prefix + "-mbox", body=s.to_dict(), size=0)

    no_emails = res["hits"]["total"]["value"]
    no_lists = res["aggregations"]["number_of_lists"]["value"]
    no_senders = res["aggregations"]["number_of_senders"]["value"]
    daily_emails = []
    for entry in res["aggregations"]["daily_emails"]["buckets"]:
        daily_emails.append((entry["key"], entry["doc_count"]))

    # Now the nitty gritty thread count
    seen_emails = {}
    seen_topics = []
    thread_count = 0
    s = (
        Search(using=client, index=database.db_prefix + "-mbox")
        .query("match", private=False)
        .filter("range", date={"lt": "now+1d", "gt": "now-14d"})
    )
    async for doc in async_scan(
        index=database.db_prefix + "-mbox",
        client=client,
        query=s.to_dict(),
        _source_includes=[
            "message-id",
            "in-reply-to",
            "subject",
            "references",
            "epoch",
            "list_raw",
        ],
    ):
        found = False
        message_id = doc["_source"].get("message-id")
        irt = doc["_source"].get("in-reply-to")
        references = doc["_source"].get("references")
        list_raw = doc["_source"].get("list_raw", "_")
        subject = doc["_source"].get("subject", "_")
        if irt and irt in seen_emails:
            seen_emails[message_id] = irt
            found = True
        elif references:
            for refid in re.split(r"\s+", references):
                if refid in seen_emails:
                    seen_emails[message_id] = refid
                    found = True
        if not found:
            subject = PYPONY_RE_PREFIX.sub("", subject)
            subject += list_raw
            if subject in seen_topics:
                seen_emails[message_id] = subject
            else:
                seen_topics.append(subject)
                thread_count += 1

    await client.close()

    activity = {
        "hits": no_emails,
        "no_threads": thread_count,
        "no_active_lists": no_lists,
        "participants": no_senders,
        "activity": daily_emails,
    }
    return activity
async def main(args):
    no_jobs = args.jobs
    graceful = args.graceful

    print("Welcome to the Apache Pony Mail -> Foal migrator.")
    print("This will copy your old database, adjust the structure, and insert the emails into your new foal database.")
    print("We will be utilizing %u cores for this operation." % no_jobs)
    print("------------------------------------")

    old_es_url = args.old_url or input("Enter the full URL (including http/https) of your old ES server: ") or "http://localhost:9200/"
    new_es_url = args.new_url or input("Enter the full URL (including http/https) of your NEW ES server: ") or "http://localhost:9200/"
    if old_es_url == new_es_url:
        print("Old and new DB should not be the same, assuming error in input and exiting!")
        return
    ols_es_async = AsyncElasticsearch([old_es_url])

    old_dbname = args.old_name or input("What is the database name for the old Pony Mail emails? [ponymail]: ") or "ponymail"
    new_dbprefix = args.new_prefix or input("What is the database prefix for the new Pony Mail emails? [ponymail]: ") or "ponymail"

    do_dkim = True
    dkim_txt = (
        input(
            "Do you wish to perform DKIM re-indexing of all emails? This will NOT preserve all old permalinks currently "
            "(y/n) [y]: "
        )
        or "y"
    )
    if dkim_txt.lower() == "n":
        do_dkim = False

    # Define index names for new ES
    dbname_mbox = new_dbprefix + "-mbox"
    dbname_source = new_dbprefix + "-source"
    dbname_attachment = new_dbprefix + "-attachment"

    # Let's get started..!
    # start_time = time.time()
    count = await ols_es_async.count(index=old_dbname, doc_type="mbox")
    no_emails = count["count"]
    print("------------------------------------")
    print("Starting migration of %u emails, this may take quite a while..." % no_emails)

    processes = MultiDocProcessor(old_es_url, new_es_url, process_document, no_jobs)

    docs_read = 0
    async for doc in async_scan(
        client=ols_es_async,
        query={"query": {"match_all": {}}},
        doc_type="mbox",
        index=old_dbname,
    ):
        docs_read += 1
        processes.feed(doc, old_dbname, dbname_source, dbname_mbox, do_dkim)

        # Don't speed too far ahead of processing...
        processed = processes.processed.value
        while docs_read - processed > 100 * no_jobs:
            await asyncio.sleep(0.01)
            processed = processes.processed.value + 0
        processes.status(no_emails)

    # There may be some docs left over to push
    processes.sighup()
    while processed < no_emails:  # Wait for all documents to have been processed.
        await asyncio.sleep(1)
        print(f"Waiting for bulk push to complete ({processed} out of {no_emails} done...)")
        processed = processes.processed.value
    processes.stop()

    # Process attachments
    # start_time = time.time()
    processes = MultiDocProcessor(old_es_url, new_es_url, process_attachment, no_jobs, graceful)
    docs_read = 0
    count = await ols_es_async.count(index=old_dbname, doc_type="attachment")
    no_att = count["count"]
    print("Transferring %u attachments..." % no_att)
    async for doc in async_scan(
        client=ols_es_async,
        query={"query": {"match_all": {}}},
        doc_type="attachment",
        index=old_dbname,
    ):
        processes.feed(doc, dbname_attachment)
        docs_read += 1

        # Don't speed ahead
        processed = processes.processed.value + 0
        while docs_read - processed > 10 * no_jobs:
            await asyncio.sleep(0.01)
            processed = processes.processed.value + 0
        processes.status(no_att)

    # There may be some docs left over to push
    processes.sighup()
    while processed < no_att:  # Wait for all attachments to have been processed.
        await asyncio.sleep(1)
        print(f"Waiting for bulk push to complete ({processed} out of {no_att} done...)")
        processed = processes.processed.value
    processes.stop()

    await ols_es_async.close()
    print("All done, enjoy!")
async def scroll(self, model: Type[Model], **kwargs):
    async for i in async_scan(self._client, index=model.index, **kwargs):
        yield model(_id=i["_id"], _typecheck=False, **i["_source"])
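# A minimal sketch of how the model-hydrating "scroll" wrapper above might be consumed.
# The "User" model, its "index" attribute, the "Repository" class, and the query are
# assumptions for illustration, not part of the original snippet.
import asyncio
from typing import Any, Dict

from elasticsearch import AsyncElasticsearch
from elasticsearch.helpers import async_scan


class User:
    index = "users"  # the index this hypothetical model maps to

    def __init__(self, _id: str, _typecheck: bool = True, **fields: Any) -> None:
        self._id = _id
        self.fields: Dict[str, Any] = fields


class Repository:
    def __init__(self, client: AsyncElasticsearch) -> None:
        self._client = client

    # Same shape as the wrapper above: hydrate one model instance per hit.
    async def scroll(self, model, **kwargs):
        async for i in async_scan(self._client, index=model.index, **kwargs):
            yield model(_id=i["_id"], _typecheck=False, **i["_source"])


async def demo() -> None:
    client = AsyncElasticsearch(["http://localhost:9200"])  # assumed local cluster
    try:
        repo = Repository(client)
        async for user in repo.scroll(User, query={"query": {"match_all": {}}}):
            print(user._id, user.fields)
    finally:
        await client.close()


asyncio.run(demo())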