def first_pass(elastic: Elastic) -> None:
    """Backfill the "forum" and "size" fields on any mbox doc missing either."""
    # Select docs where "forum" or "size" is absent -- the hand-written
    # equivalent of elasticsearch_dsl's (~Q(...)) | (~Q(...)).
    missing_clauses = [
        {"bool": {"must_not": [{"exists": {"field": field}}]}}
        for field in ("forum", "size")
    ]
    query = {"query": {"bool": {"should": missing_clauses}}}
    for doc in scan(client=elastic.es, index=elastic.db_mbox, query=query):
        doc_id = doc["_id"]
        body = doc["_source"]
        # Derive forum from the list-id, e.g. "<dev.example.org>" -> "dev@example.org"
        body["forum"] = body.get("list", "").strip("<>").replace(".", "@", 1)
        raw = elastic.es.get(elastic.db_source, body["dbid"], _source="source")
        body["size"] = len(raw["_source"]["source"])
        elastic.index(index=elastic.db_mbox, id=doc_id, body=body)
def third_pass(elastic: Elastic) -> None:
    """Resolve the thread id for every doc whose "thread" field is still unset.

    Each unthreaded message is walked up its parent chain until a message with
    a resolved thread id (or no parent) is reached; that thread id is then
    written back onto every message collected along the way.
    """
    hits = scan(client=elastic.es, index=elastic.db_mbox, query={})
    for hit in hits:
        pid = hit["_id"]
        ojson = hit["_source"]
        if ojson["thread"] != "":
            # Already threaded (possibly as an ancestor in an earlier walk).
            continue
        if ojson["top"] is True:
            # Thread starter: it is its own thread root.
            ojson["previous"] = archiver.get_previous_mid(elastic, ojson)
            ojson["thread"] = pid
            elastic.index(index=elastic.db_mbox, id=pid, body=ojson)
        else:
            # Climb towards the root, remembering every unthreaded ancestor.
            tree = []
            while ojson["thread"] == "":
                tree.append(ojson)
                ojson_parent = archiver.get_parent_info(elastic, ojson)
                if ojson_parent is None:
                    # Orphan chain: no parent found. NOTE(review): the thread
                    # id stays "" in this case, so the chain is re-indexed
                    # unresolved -- confirm that is the intended behaviour.
                    ojson["previous"] = None
                    print("Error:", ojson["mid"], "has no parent")
                    break
                ojson["previous"] = ojson_parent["mid"]
                ojson = ojson_parent
            # Propagate the resolved thread id (from the last ojson reached)
            # to the whole collected chain.
            for info in tree:
                info["thread"] = ojson["thread"]
                elastic.index(index=elastic.db_mbox, id=info["mid"], body=info)
def second_pass(elastic: Elastic) -> None:
    """Seed the threading fields (top / previous / thread) on every mbox doc."""
    scroll = scan(
        client=elastic.es,
        index=elastic.db_mbox,
        query={"sort": {"epoch": "asc"}},
    )
    for doc in scroll:
        doc_id = doc["_id"]
        body = doc["_source"]
        parent = archiver.get_parent_info(elastic, body)
        if parent is None:
            # No parent: this email starts its own thread.
            body["top"] = True
            body["thread"] = doc_id
        else:
            # Reply: thread id is resolved in a later pass.
            body["top"] = False
            body["thread"] = ""
        body["previous"] = ""
        elastic.index(index=elastic.db_mbox, id=doc_id, body=body)
def first_pass(elastic: Elastic) -> None:
    """Combined pass: backfill forum/size and seed the threading fields."""
    scroll = scan(
        client=elastic.es,
        index=elastic.db_mbox,
        query={"sort": {"epoch": "asc"}},
    )
    for doc in scroll:
        doc_id = doc["_id"]
        body = doc["_source"]
        parent = archiver.get_parent_info(elastic, body)
        is_top = parent is None
        body["top"] = is_top
        # Derive forum from the list-id, e.g. "<dev.example.org>" -> "dev@example.org"
        body["forum"] = body.get("list", "").strip("<>").replace(".", "@", 1)
        raw = elastic.es.get(elastic.db_source, body["dbid"], _source="source")
        body["size"] = len(raw["_source"]["source"])
        body["previous"] = ""
        # Thread starters are their own thread root; replies get resolved later.
        body["thread"] = doc_id if is_top else ""
        elastic.index(index=elastic.db_mbox, id=doc_id, body=body)
# ** INITIAL VERSION, liable to change ** import argparse import sys import yaml from plugins.elastic import Elastic # Needs 3.4 or higher to work if sys.version_info <= (3, 3): print("This script requires Python 3.4 or higher in order to work!") sys.exit(-1) # the desired mappings mapping_file = yaml.safe_load(open("mappings.yaml", "r")) elastic = Elastic() major = elastic.engineMajor() if major != 7: print("This script requires ElasticSearch 7 API in order to work!") sys.exit(-1) parser = argparse.ArgumentParser(description="Command line options.") parser.add_argument( "--create", dest="create", action="store_true", help="Create the missing mapping(s)", ) parser.add_argument( "--shards", dest="shards",
def archive_message(self, mlist, msg, raw_message=None, dry=False,
                    dump=None, defaultepoch=None, digest=False):
    """Send the message to the archiver.

    :param mlist: The IMailingList object.
    :param msg: The message object.
    :param raw_message: Raw message bytes
    :param dry: Whether or not to actually run
    :param dump: Optional path for dump on fail
    :param defaultepoch: Fallback epoch handling passed to compute_updates
    :param digest: If True, stop after computing the document (no indexing)
    :return (lid, mid)
    """
    lid = textlib.normalize_lid(mlist.list_id, strict=True)
    if lid is None:
        raise ValueError(f"Invalid list id {lid}")
    # Work out list privacy: an explicit archive_public flag wins,
    # otherwise fall back to archive_policy.
    private = False
    if hasattr(mlist, "archive_public") and mlist.archive_public is True:
        private = False
    elif hasattr(mlist, "archive_public") and mlist.archive_public is False:
        private = True
    elif (hasattr(mlist, "archive_policy")
          and mlist.archive_policy is not ArchivePolicy.public):
        private = True
    if raw_message is None:
        raw_message = msg.as_bytes()
    # Build the mbox document, attachment contents and metadata.
    ojson, contents, msg_metadata, irt, skipit = self.compute_updates(
        lid, private, msg, raw_message, defaultepoch)
    if not ojson:
        _id = msg.get("message-id") or msg.get("Subject") or msg.get(
            "Date")
        raise Exception("Could not parse message %s for %s" % (_id, lid))
    if skipit:
        print(
            "Skipping archiving of email due to invalid date and default date set to skip"
        )
        return lid, "(skipped)"
    # Digest and dry-run modes both stop before touching the database.
    if digest:
        return lid, ojson["mid"]
    if dry:
        print("**** Dry run, not saving message to database *****")
        return lid, ojson["mid"]
    if dump:
        try:
            elastic = Elastic()
        except elasticsearch.exceptions.ElasticsearchException as e:
            print(e)
            print(
                "ES connection failed, but dumponfail specified, dumping to %s"
                % dump)
            # NOTE(review): `elastic` stays unbound on this path; the
            # indexing try/except below then fails and takes the dump
            # branch -- confirm this fall-through is intentional.
    else:
        elastic = Elastic()
    # Optionally enrich the document with thread info (best effort).
    if config.get("archiver", "threadinfo"):
        try:
            timeout = int(config.get("archiver", "threadtimeout") or 5)
            timeout = str(timeout) + "s"
            limit = int(config.get("archiver", "threadparents") or 10)
            ojson = add_thread_properties(elastic, ojson, timeout, limit)
        except Exception as err:
            print("Could not add thread info", err)
            if logger:
                logger.info("Could not add thread info %s" % (err, ))
        else:
            print("Added thread info successfully", ojson["mid"])
            if logger:
                logger.info("Added thread info successfully %s" %
                            (ojson["mid"], ))
    try:
        # Index attachments, the mbox document, and the raw source.
        if contents:
            for key in contents:
                elastic.index(
                    index=elastic.db_attachment,
                    id=key,
                    body={"source": contents[key]},
                )
        elastic.index(
            index=elastic.db_mbox,
            id=ojson["mid"],
            body=ojson,
        )
        elastic.index(
            index=elastic.db_source,
            id=ojson["dbid"],
            body={
                "message-id": msg_metadata["message-id"],
                "source": mbox_source(raw_message),
            },
        )
        # Write to audit log
        try:
            auditlog_exists = elastic.indices.exists(
                index=elastic.db_auditlog)
        except elasticsearch.exceptions.AuthorizationException:
            # Not allowed to check for the audit log index; skip it.
            auditlog_exists = False
        if auditlog_exists:
            elastic.index(
                index=elastic.db_auditlog,
                body={
                    "date": time.strftime("%Y/%m/%d %H:%M:%S",
                                          time.gmtime(time.time())),
                    "action": "index",
                    "remote": "internal",
                    "author": "archiver.py",
                    "target": ojson["mid"],
                    "lid": lid,
                    "log": f"Indexed email {ojson['message-id']} for {lid} as {ojson['mid']}",
                })
    # If we have a dump dir and ES failed, push to dump dir instead as a JSON object
    # We'll leave it to another process to pick up the slack.
    except Exception as err:
        print(err)
        if dump:
            print(
                "Pushing to ES failed, but dumponfail specified, dumping JSON docs"
            )
            uid = uuid.uuid4()
            mbox_path = os.path.join(dump, "%s.json" % uid)
            with open(mbox_path, "w") as f:
                json.dump(
                    {
                        "id": ojson["mid"],
                        "mbox": ojson,
                        "mbox_source": {
                            "id": ojson["dbid"],
                            "permalink": ojson["mid"],
                            "message-id": msg_metadata["message-id"],
                            "source": mbox_source(raw_message),
                        },
                        "attachments": contents,
                    },
                    f,
                    indent=2,
                )
                # Redundant: the with-block closes f on exit anyway.
                f.close()
            sys.exit(
                0)  # We're exiting here, the rest can't be done without ES
        # otherwise fail as before
        raise err
    if logger:
        logger.info("Pony Mail archived message %s successfully",
                    ojson["mid"])
    oldrefs = []
    # Is this a direct reply to a pony mail email?
    if irt != "":
        dm = re.search(r"pony-([a-f0-9]+)-([a-f0-9]+)@", irt)
        if dm:
            cid = dm.group(1)
            mid = dm.group(2)
            if elastic.exists(index=elastic.db_account, id=cid):
                doc = elastic.get(index=elastic.db_account, id=cid)
                if doc:
                    oldrefs.append(cid)
                    # N.B. no index is supplied, so ES will generate one
                    elastic.index(
                        index=elastic.db_notification,
                        body={
                            "type": "direct",
                            "recipient": cid,
                            "list": lid,
                            "private": private,
                            "date": ojson["date"],
                            "from": msg_metadata["from"],
                            "to": msg_metadata["to"],
                            "subject": msg_metadata["subject"],
                            "message-id": msg_metadata["message-id"],
                            "in-reply-to": irt,
                            "epoch": ojson["epoch"],
                            "mid": mid,
                            "seen": 0,
                        },
                    )
                    if logger:
                        logger.info("Notification sent to %s for %s", cid,
                                    mid)
    # Are there indirect replies to pony emails?
    if msg_metadata.get("references"):
        for im in re.finditer(r"pony-([a-f0-9]+)-([a-f0-9]+)@",
                              msg_metadata.get("references")):
            cid = im.group(1)
            mid = im.group(2)
            # TODO: Fix this to work with pibbles
            if elastic.exists(index=elastic.db_mbox, id=cid):
                doc = elastic.get(index=elastic.db_mbox, id=cid)
                # does the user want to be notified of indirect replies?
                if (doc and "preferences" in doc["_source"]
                        and doc["_source"]["preferences"].get("notifications")
                        == "indirect" and cid not in oldrefs):
                    oldrefs.append(cid)
                    # N.B. no index mapping is supplied, so ES will generate one
                    elastic.index(
                        index=elastic.db_notification,
                        body={
                            "type": "indirect",
                            "recipient": cid,
                            "list": lid,
                            "private": private,
                            "date": ojson["date"],
                            "from": msg_metadata["from"],
                            "to": msg_metadata["to"],
                            "subject": msg_metadata["subject"],
                            "message-id": msg_metadata["message-id"],
                            "in-reply-to": irt,
                            "epoch": ojson["epoch"],
                            "mid": mid,
                            "seen": 0,
                        },
                    )
                    if logger:
                        logger.info("Notification sent to %s for %s", cid,
                                    mid)
    return lid, ojson["mid"]
# elasticsearch logs lots of warnings on retries/connection failure logging.getLogger("elasticsearch").setLevel(logging.ERROR) verbose_logger = None if args.verbose: verbose_logger = logging.getLogger("verbose") verbose_logger.setLevel(logging.INFO) # The default handler is set to WARN level verbose_logger.addHandler(logging.StreamHandler(sys.stdout)) archiver.logger = verbose_logger if args.dry: print("Dry-run; continuing to check input data") else: # Fetch config and set up ES es = Elastic() # We need the index name for bulk actions dbname = es.getdbname() # No point continuing if the index does not exist print("Checking that the database index %s exists ... " % dbname) # Need to check the index before starting bulk operations try: if not es.indices.exists(index=es.db_mbox): print("Error: the index '%s' does not exist!" % (es.db_mbox)) sys.exit(1) print("Database exists OK") except Exception as err: print("Error: unable to check if the index %s exists!: %s" % (es.db_mbox, err))
# Script section: set up verbose logging, then either open the dump file
# (when --dump is given) or connect to ES and verify the mbox index.
verbose_logger = logging.getLogger("verbose")
verbose_logger.setLevel(logging.INFO)
# The default handler is set to WARN level
verbose_logger.addHandler(logging.StreamHandler(sys.stdout))
# Hand the logger to the archiver module so it logs progress too.
archiver.logger = verbose_logger
if args.dry:
    print("Dry-run; continuing to check input data")
if args.dump:
    # Dump mode: write a JSON array to file instead of indexing into ES.
    print("Writing mbox output to %s" % args.dump[0])
    dumpfile = open(args.dump[0], 'w')
    dumpfile.write("[\n")
else:
    # Fetch config and set up ES
    es = Elastic(
        logger_level=args.logger_level[0] if args.logger_level else None,
        trace_level=args.trace_level[0] if args.trace_level else None)
    # No point continuing if the index does not exist
    print("Checking that the database index %s exists ... " % es.db_mbox)
    # Need to check the index before starting bulk operations
    try:
        if not es.indices.exists(index=es.db_mbox):
            print("Error: the index '%s' does not exist!" % (es.db_mbox))
            sys.exit(1)
        print("Database exists OK")
    except Exception as err:
        print("Error: unable to check if the index %s exists!: %s" %
              (es.db_mbox, err))
        sys.exit(1)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utility for retrying docs that we failed to index earlier. """ import argparse import json import os if not __package__: from plugins.elastic import Elastic else: from .plugins.elastic import Elastic elastic = Elastic() parser = argparse.ArgumentParser(description="Command line options.") parser.add_argument( "--source", dest="dumpdir", help= "Path to the directory containing the JSON documents that failed to index") args = parser.parse_args() dumpDir = args.dumpdir if args.dumpdir else "." print("Looking for *.json files in %s" % dumpDir) files = [
def main() -> None:
    """Run the three migration passes in order over one Elastic connection."""
    conn: Elastic = Elastic()
    for migration_pass in (first_pass, second_pass, third_pass):
        migration_pass(conn)