def run_once(self, path_outputs=None):
    """
    This is the main test function. It runs the testing procedure for
    every PPD file. Tests are run simultaneously in many threads.

    @param path_outputs: if it is not None, raw outputs sent to printers
            are dumped here; if the directory already exists, it is
            deleted and recreated

    @raises error.TestFail if at least one of the tests failed

    """
    # Set directory for output documents
    self._path_output_directory = self._calculate_full_path(path_outputs)
    if self._path_output_directory is not None:
        # Delete the whole directory if it already exists
        file_utils.rm_dir_if_exists(self._path_output_directory)
        # Create archivers
        self._archivers = dict()
        for doc_name in self._docs:
            path_for_archiver = os.path.join(self._path_output_directory,
                    doc_name)
            self._archivers[doc_name] = archiver.Archiver(path_for_archiver,
                    self._ppds, 50)
        # A place for new digests
        self._new_digests = dict()
        for doc_name in self._docs:
            self._new_digests[doc_name] = dict()

    # Run tests for all PPD files (in parallel)
    outputs = self._processor.run(self._thread_test_PPD, len(self._ppds))

    # Analyse the tests' outputs, print a summary report and build a list
    # of PPD filenames that failed
    failures = []
    for i, output in enumerate(outputs):
        ppd_file = self._ppds[i]
        if output != True:
            failures.append(ppd_file)
        else:
            output = 'OK'
        line = "%s: %s" % (ppd_file, output)
        logging.info(line)

    # Calculate digests files for output documents (if dumped)
    if self._path_output_directory is not None:
        for doc_name in self._docs:
            path = os.path.join(self._path_output_directory,
                    doc_name + '.digests')
            helpers.save_digests_file(path, self._new_digests[doc_name],
                    failures)

    # Raise an exception if at least one test failed
    if len(failures) > 0:
        failures.sort()
        raise error.TestFail('Test failed for %d PPD files: %s'
                % (len(failures), ', '.join(failures)))
def extractMedia(self, arname, artype=None, excludeMimetypes=None,
                 clfilter=None, timefilter=None):
    if not self.smsXML:
        raise Exception('cannot extract media from call file')
    archive = archiver.Archiver(arname, type_=artype)
    usedNames = set()
    for node in self.genMmsMedia(excludeMimetypes, clFilter=clfilter,
                                 timeFilter=timefilter):
        if 'name' in node:
            name = node['name']
            if '.' in name:
                spl = name.split('.')
                # Base name is everything before the last dot; the
                # extension is the last component.
                fname = '.'.join(spl[:-1])
                ext = '.' + spl[-1]
            else:
                fname = name
                ext = ''
            incr = 1
            while name in usedNames:
                name = '%s_%d%s' % (fname, incr, ext)
                incr += 1
        else:
            # Unnamed part: derive a name from the parent MMS metadata
            # and the guessed extension for the content type.
            mmsparent = node.parent.parent
            ext = mimetype.guessExtension(node['ct'])
            if ext is not None and len(ext) != 0:
                ext = '.' + ext
            else:
                ext = ''
            name = '%s-%s%s' % (mmsparent['date'], mmsparent['contact_name'], ext)
            incr = 1
            while name in usedNames:
                name = '%s-%s_%d%s' % (mmsparent['date'],
                                       mmsparent['contact_name'], incr, ext)
                incr += 1
        data = base64.b64decode(node.attrs['data'])
        try:
            archive.addFile(name, data)
            usedNames.add(name)
        except Exception:
            traceback.print_exc()
    archive.close()
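For orientation, a possible call to the method above might look like the following sketch; the MmsBackup class and the file names are illustrative assumptions, not part of the original source — only extractMedia() and its keyword arguments come from the example itself.

# Hypothetical usage sketch: MmsBackup and 'sms_backup.xml' are assumed.
backup = MmsBackup('sms_backup.xml')             # assumed loader class
backup.extractMedia('media.zip', artype='zip',   # dump attachments to a zip
                    excludeMimetypes=['text/plain'])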
def destalinate_job():
    print("Destalinating")
    if "SB_TOKEN" not in os.environ or "API_TOKEN" not in os.environ:
        print("ERR: Missing at least one Slack environment variable.")
    else:
        scheduled_warner = warner.Warner()
        scheduled_archiver = archiver.Archiver()
        scheduled_announcer = announcer.Announcer()
        scheduled_flagger = flagger.Flagger()
        print("Warning")
        scheduled_warner.warn()
        print("Archiving")
        scheduled_archiver.archive()
        print("Announcing")
        scheduled_announcer.announce()
        print("Flagging")
        scheduled_flagger.flag()
        print("OK: destalinated")
    print("END: destalinate_job")
def destalinate_job():
    logging.info("Destalinating")
    if not _config.sb_token or not _config.api_token:
        logging.error(
            "Missing at least one required Slack environment variable.\n"
            "Make sure to set DESTALINATOR_SB_TOKEN and DESTALINATOR_API_TOKEN."
        )
    else:
        try:
            archiver.Archiver().archive()
            warner.Warner().warn()
            announcer.Announcer().announce()
            flagger.Flagger().flag()
            logging.info("OK: destalinated")
        except Exception as e:  # pylint: disable=W0703
            raven_client.captureException()
            if not _config.sentry_dsn:
                raise e
    logging.info("END: destalinate_job")
def destalinate_job():
    print("Destalinating")
    if "SB_TOKEN" not in os.environ or "API_TOKEN" not in os.environ:
        print("ERR: Missing at least one Slack environment variable.")
    else:
        try:
            scheduled_warner = warner.Warner()
            scheduled_archiver = archiver.Archiver()
            scheduled_announcer = announcer.Announcer()
            scheduled_flagger = flagger.Flagger()
            print("Warning")
            scheduled_warner.warn()
            print("Archiving")
            scheduled_archiver.archive()
            print("Announcing")
            scheduled_announcer.announce()
            print("Flagging")
            scheduled_flagger.flag()
            print("OK: destalinated")
        except Exception as e:  # pylint: disable=W0703
            raven_client.captureException()
            if not os.getenv('SENTRY_DSN'):
                raise e
    print("END: destalinate_job")
def test_tar_guess_archive(self):
    self.create_compare(archiver.Archiver('tmp.tar'), MD5SUM_TAR)
def test_zip_archive(self):
    self.create_compare(archiver.Archiver('tmp.zip', type_='zip'), MD5SUM_ZIP)
def test_tgz_archive(self):
    self.create_compare(archiver.Archiver('tmp.tgz', type_='tgz'), MD5SUM_TGZ)
def test_tar_archive(self):
    self.create_compare(archiver.Archiver('tmp.tar', type_='tar'), MD5SUM_TAR)
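The archive tests in this listing (tar/zip/tgz, with and without type guessing) all delegate to a create_compare helper that is not shown. A minimal sketch of what such a helper could look like, assuming the addFile/close Archiver API used elsewhere in these examples and a 'name' attribute holding the output path (both assumptions, not taken from the original test suite):

import hashlib

def create_compare(self, archive, expected_md5):
    # Write a fixed payload, close the archive, and compare the result's
    # MD5 against the known-good checksum for that archive type.
    # 'archive.name' as the output path is an assumption.
    archive.addFile('hello.txt', b'hello world\n')
    archive.close()
    with open(archive.name, 'rb') as f:
        self.assertEqual(hashlib.md5(f.read()).hexdigest(), expected_md5)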
def run(self):
    global block, y, es, lists, baddies, config, resendTo, timeout
    ja = []
    jas = []
    print("Thread started")
    mla = None
    ml = ""
    mboxfile = ""
    filename = ""
    xlist_override = None

    foo = archiver.Archiver(parseHTML=parseHTML)

    while len(lists) > 0:
        print("%u elements left to slurp" % len(lists))

        block.acquire()
        try:
            mla = lists.pop(0)
        except Exception as err:
            print("Could not pop list: %s" % err)
            block.release()
            return
        if not mla:
            print("Nothing more to do here")
            block.release()
            return
        block.release()

        y += 1
        EY = 1980
        EM = 1
        stime = time.time()
        dFile = False
        if maildir:
            messages = mailbox.Maildir(tmpname)
        elif imap:
            y -= 1  # TODO don't understand the increment above
            imap4 = mla[2]

            def mailgen(list):
                for uid in list:
                    msgbytes = imap4.uid('fetch', uid, '(RFC822)')[1][0][1]
                    yield email.message_from_bytes(msgbytes)

            messages = mailgen(mla[0])
            xlist_override = mla[1]
        elif filebased:
            tmpname = mla[0]
            filename = mla[0]
            xlist_override = mla[1]
            if filename.find(".gz") != -1:
                print("Decompressing %s..." % filename)
                try:
                    with open(filename, "rb") as bf:
                        bmd = bf.read()
                        bf.close()
                    bmd = gzip.decompress(bmd)
                    tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False)
                    tmpfile.write(bmd)
                    tmpfile.flush()
                    tmpfile.close()
                    tmpname = tmpfile.name
                    filename = tmpname
                    dFile = True  # Slated for deletion upon having been read
                    print("%s -> %u bytes" % (tmpname, len(bmd)))
                except Exception as err:
                    print("This wasn't a gzip file: %s" % err)
            print("Slurping %s" % filename)
            messages = mailbox.mbox(tmpname)
        else:
            ml = mla[0]
            mboxfile = mla[1]
            xlist_override = list_override
            print("Slurping %s/%s" % (ml, mboxfile))
            m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
            EY = 1997
            EM = 1
            if m:
                EY = int(m.group(1))
                EM = int(m.group(2))
            ctx = urlopen("%s%s/%s" % (source, ml, mboxfile))
            inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')

            tmpname = hashlib.sha224(("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8')).hexdigest()
            with open(tmpname, "w") as f:
                f.write(inp)
                f.close()
            messages = mailbox.mbox(tmpname)

        count = 0
        LEY = EY

        for message in messages:
            if resendTo:
                print("Delivering message %s via MTA" % message['message-id'] if 'message-id' in message else '??')
                s = SMTP('localhost')
                try:
                    if list_override:
                        message.replace_header('List-ID', list_override)
                    message.replace_header('To', resendTo)
                except:
                    if list_override:
                        message['List-ID'] = list_override
                message['cc'] = None
                s.send_message(message, from_addr=None, to_addrs=(resendTo))
                continue
            if (time.time() - stime > timeout):  # break out after N seconds, it shouldn't take this long..!
print("Whoa, this is taking way too long, ignoring %s for now" % tmpname) break json, contents = foo.compute_updates(list_override, private, message) if json: json_source = { 'mid': json['mid'], 'message-id': json['message-id'], 'source': message.as_bytes().decode('utf-8', errors='replace') } count += 1 ja.append(json) jas.append(json_source) if contents: iname = config.get("elasticsearch", "dbname") if not args.dry: for key in contents: es.index( index=iname, doc_type="attachment", id=key, body = { 'source': contents[key] } ) if len(ja) >= 40: if not args.dry: bulk = BulkThread() bulk.assign(ja, es, 'mbox') bulk.insert() ja = [] if not args.dry: bulks = BulkThread() bulks.assign(jas, es, 'mbox_source') bulks.insert() jas = [] else: baddies += 1 if filebased: print("Parsed %u records from %s" % (count, filename)) if dFile: os.unlink(tmpname) elif imap: print("Parsed %u records from imap" % count) else: print("Parsed %s/%s: %u records from %s" % (ml, mboxfile, count, tmpname)) os.unlink(tmpname) y += count if not args.dry: bulk = BulkThread() bulk.assign(ja, es) bulk.insert() ja = [] if not args.dry: bulks = BulkThread() bulks.assign(jas, es, 'mbox_source') bulks.insert() jas = [] print("Done, %u elements left to slurp" % len(lists))
def run(self):
    global block, y, es, lists, baddies, config, resendTo, timeout, dedupped, dedup
    self.name = Thread.getName(self)
    ja = []
    jas = []
    self.printid("Thread started")
    mla = None
    ml = ""
    mboxfile = ""
    filename = ""
    archie = archiver.Archiver(parseHTML=parseHTML)

    while len(lists) > 0:
        self.printid("%u elements left to slurp" % len(lists))

        block.acquire()
        try:
            mla = lists.pop(0)
            if not mla:
                self.printid("Nothing more to do here")
                return
        except Exception as err:
            self.printid("Could not pop list: %s" % err)
            return
        finally:
            block.release()

        EY = 1980
        EM = 1
        stime = time.time()
        dFile = False
        if maildir:
            messages = mailbox.Maildir(tmpname, create=False)
        elif imap:
            imap4 = mla[2]

            def mailgen(list):
                for uid in list:
                    msgbytes = imap4.uid('fetch', uid, '(RFC822)')[1][0][1]
                    yield email.message_from_bytes(msgbytes)

            messages = mailgen(mla[0])
        elif filebased:
            tmpname = mla[0]
            filename = mla[0]
            if filename.find(".gz") != -1:
                self.printid("Decompressing %s..." % filename)
                try:
                    with open(filename, "rb") as bf:
                        bmd = bf.read()
                        bf.close()  # explicit early close
                    bmd = gzip.decompress(bmd)
                    tmpfile = tempfile.NamedTemporaryFile(mode='w+b', buffering=1, delete=False)
                    tmpfile.write(bmd)
                    tmpfile.flush()
                    tmpfile.close()
                    tmpname = tmpfile.name
                    dFile = True  # Slated for deletion upon having been read
                    self.printid("%s -> %u bytes" % (tmpname, len(bmd)))
                except Exception as err:
                    self.printid("This wasn't a gzip file: %s" % err)
            self.printid("Slurping %s" % filename)
            messages = mailbox.mbox(tmpname, None if noMboxo else MboxoFactory, create=False)
        else:
            ml = mla[0]
            mboxfile = mla[1]
            self.printid("Slurping %s/%s" % (ml, mboxfile))
            m = re.match(r"(\d\d\d\d)(\d\d)", mboxfile)
            EY = 1997
            EM = 1
            if m:
                EY = int(m.group(1))
                EM = int(m.group(2))
            ctx = urlopen("%s%s/%s" % (source, ml, mboxfile))
            inp = ctx.read().decode(ctx.headers.get_content_charset() or 'utf-8', errors='ignore')

            tmpname = hashlib.sha224(("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode('utf-8')).hexdigest()
            with open(tmpname, "w") as f:
                f.write(inp)
            messages = mailbox.mbox(tmpname, None if noMboxo else MboxoFactory, create=False)

        count = 0
        bad = 0
        LEY = EY

        for key in messages.iterkeys():
            message = messages.get(key)
            # If --filter is set, discard any messages not matching by continuing to next email
            if fromFilter and 'from' in message and message['from'].find(fromFilter) == -1:
                continue
            if resendTo:
                self.printid("Delivering message %s via MTA" % message['message-id'] if 'message-id' in message else '??')
                s = SMTP('localhost')
                try:
                    if list_override:
                        message.replace_header('List-ID', list_override)
                    message.replace_header('To', resendTo)
                except:
                    if list_override:
                        message['List-ID'] = list_override
                message['cc'] = None
                s.send_message(message, from_addr=None, to_addrs=(resendTo))
                continue
            if (time.time() - stime > timeout):  # break out after N seconds, it shouldn't take this long..!
self.printid("Whoa, this is taking way too long, ignoring %s for now" % tmpname) break # Don't pass message to archiver unless we have a list id if not (list_override or message['list-id']): self.printid("No list id found for %s " % message['message-id']) bad += 1 continue json, contents = archie.compute_updates(list_override, private, message) # Not sure this can ever happen if json and not (json['list'] and json['list_raw']): self.printid("No list id found for %s " % json['message-id']) bad += 1 continue # If --dedup is active, try to filter out any messages that already exist on the list if json and dedup and message.get('message-id', None): res = es.search( index=dbname, doc_type="mbox", size = 1, _source = ['mid'], # so can report the match source body = { 'query': { 'bool': { 'must': [ { 'term': { 'message-id': message.get('message-id', None) } }, { 'term': { 'list_raw': json['list'] } } ] } } } ) if res and res['hits']['total'] > 0: self.printid("Dedupping %s - matched in %s" % (json['message-id'], res['hits']['hits'][0]['_source']['mid'])) dedupped += 1 continue if json: file=messages.get_file(key, True) # If the parsed data is filtered, also need to filter the raw input # so the source agrees with the summary info if message.__class__.__name__ == 'MboxoFactory': file=MboxoReader(file) raw_msg=file.read() file.close() if args.dups: try: duplicates[json['mid']].append(json['message-id'] + " in " + filename) except: duplicates[json['mid']]=[json['message-id'] + " in " + filename] try: # temporary hack to try and find an encoding issue # needs to be replaced by proper exception handling json_source = { 'mid': json['mid'], # needed for bulk-insert only, not needed in database 'message-id': json['message-id'], 'source': archie.mbox_source(raw_msg) } except Exception as e: self.printid("Error '%s' processing id %s msg %s " % (e, json['mid'], json['message-id'])) bad += 1 continue count += 1 ja.append(json) jas.append(json_source) if contents: if not args.dry: for key in contents: es.index( index=dbname, doc_type="attachment", id=key, body = { 'source': contents[key] } ) if len(ja) >= 40: bulk = BulkThread() bulk.assign(self.name, ja, es, 'mbox') bulk.insert() ja = [] bulks = BulkThread() bulks.assign(self.name, jas, es, 'mbox_source') bulks.insert() jas = [] else: self.printid("Failed to parse: Return=%s Message-Id=%s" % (message.get('Return-Path'), message.get('Message-Id'))) bad += 1 if filebased: self.printid("Parsed %u records (failed: %u) from %s" % (count, bad, filename)) if dFile: os.unlink(tmpname) elif imap: self.printid("Parsed %u records (failed: %u) from imap" % (count, bad)) else: self.printid("Parsed %s/%s: %u records (failed: %u) from %s" % (ml, mboxfile, count, bad, tmpname)) os.unlink(tmpname) y += count baddies += bad if len(ja) > 0: bulk = BulkThread() bulk.assign(self.name, ja, es, 'mbox') bulk.insert() ja = [] if len(jas) > 0: bulks = BulkThread() bulks.assign(self.name, jas, es, 'mbox_source') bulks.insert() jas = [] self.printid("Done, %u elements left to slurp" % len(lists))
def run(self):
    global goodies, baddies, dedupped
    self.name = Thread.getName(self)
    ja = []
    jas = []
    self.printid("Thread started")
    mla = None
    ml = ""
    mboxfile = ""
    filename = ""
    archie = archiver.Archiver(
        generator=args.generator,
        parse_html=args.html2text,
        ignore_body=args.ibody,
        verbose=args.verbose,
    )

    while len(lists) > 0:
        self.printid("%u elements left to slurp" % len(lists))

        block.acquire()
        try:
            mla = lists.pop(0)
            if not mla:
                self.printid("Nothing more to do here")
                return
        except Exception as err:
            self.printid("Could not pop list: %s" % err)
            return
        finally:
            block.release()

        stime = time.time()
        delete_file = False
        if imap:
            imap4 = mla[2]

            def mailgen(_list):
                for uid in _list:
                    msgbytes = imap4.uid("fetch", uid, "(RFC822)")[1][0][1]
                    yield email.message_from_bytes(msgbytes)

            messages = mailgen(mla[0])
        elif filebased:
            tmpname = mla[0]
            filename = mla[0]
            if filename.find(".gz") != -1:
                self.printid("Decompressing %s..." % filename)
                try:
                    with open(filename, "rb") as bf:
                        bmd = bf.read()
                        bf.close()  # explicit early close
                    bmd = gzip.decompress(bmd)
                    tmpfile = tempfile.NamedTemporaryFile(mode="w+b", buffering=1, delete=False)
                    tmpfile.write(bmd)
                    tmpfile.flush()
                    tmpfile.close()
                    tmpname = tmpfile.name
                    delete_file = True  # Slated for deletion upon having been read
                    self.printid("%s -> %u bytes" % (tmpname, len(bmd)))
                except Exception as err:
                    self.printid("This wasn't a gzip file: %s" % err)
            self.printid("Slurping %s" % filename)
            if maildir:
                messages = mailbox.Maildir(tmpname, create=False)
            else:
                messages = mailbox.mbox(tmpname, None if noMboxo else MboxoFactory, create=False)
        else:
            ml = mla[0]
            mboxfile = mla[1]
            self.printid("Slurping %s/%s" % (ml, mboxfile))
            ctx = urlopen("%s%s/%s" % (source, ml, mboxfile))
            inp = ctx.read().decode(ctx.headers.get_content_charset() or "utf-8", errors="ignore")

            tmpname = hashlib.sha224(
                ("%f-%f-%s-%s.mbox" % (random.random(), time.time(), ml, mboxfile)).encode("utf-8")).hexdigest()
            with open(tmpname, "w") as f:
                f.write(inp)
            if maildir:
                messages = mailbox.Maildir(tmpname, create=False)
            else:
                messages = mailbox.mbox(tmpname, None if noMboxo else MboxoFactory, create=False)

        count = 0
        bad = 0

        for key in messages.iterkeys():
            message = messages.get(key)
            file = messages.get_file(key, True)
            # If the parsed data is filtered, also need to filter the raw input
            # so the source agrees with the summary info
            if message.__class__.__name__ == "MboxoFactory":
                file = MboxoReader(file)
            message_raw = file.read()
            file.close()
            # If --filter is set, discard any messages not matching by continuing to next email
            if fromFilter and "from" in message and message["from"].find(fromFilter) == -1:
                continue
            if resendTo:
                self.printid("Delivering message %s via MTA" % message["message-id"] if "message-id" in message else "??")
                s = SMTP("localhost")
                try:
                    if list_override:
                        message.replace_header("List-ID", list_override)
                    message.replace_header("To", resendTo)
                except:
                    if list_override:
                        message["List-ID"] = list_override
                message["cc"] = None
                s.send_message(message, from_addr=None, to_addrs=(resendTo))
                continue
            if time.time() - stime > timeout:  # break out after N seconds, it shouldn't take this long..!
                self.printid("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                break

            # Don't pass message to archiver unless we have a list id
            if not (list_override or message["list-id"]):
                self.printid("No list id found for %s " % message["message-id"])
                bad += 1
                continue

            json, contents, _msgdata, _irt = archie.compute_updates(
                list_override, private, message, message_raw)

            # Not sure this can ever happen
            if json and not (json["list"] and json["list_raw"]):
                self.printid("No list id found for %s " % json["message-id"])
                bad += 1
                continue

            # If --dedup is active, try to filter out any messages that already exist on the list
            if json and dedup and message.get("message-id", None):
                res = es.search(
                    index=es.db_mbox,
                    doc_type="_doc",
                    size=1,
                    _source=["mid"],  # so can report the match source
                    body={
                        "query": {
                            "bool": {
                                "must": [
                                    {"term": {"message-id": message.get("message-id", None)}},
                                    {"term": {"list_raw": json["list"]}},
                                ]
                            }
                        }
                    },
                )
                if res and res["hits"]["total"] > 0:
                    self.printid("Dedupping %s - matched in %s" % (
                        json["message-id"],
                        res["hits"]["hits"][0]["_source"]["mid"],
                    ))
                    dedupped += 1
                    continue

            if json:
                if args.dups:
                    try:
                        duplicates[json["mid"]].append(json["message-id"] + " in " + filename)
                    except:
                        duplicates[json["mid"]] = [json["message-id"] + " in " + filename]

                # Mark that we imported this email
                json["_notes"] = [x for x in json["_notes"] if "ARCHIVE:" not in x]  # Pop archiver.py note
                json["_notes"].append([
                    "IMPORT: Email imported as %s at %u" % (json["mid"], time.time())
                ])

                try:
                    # temporary hack to try and find an encoding issue
                    # needs to be replaced by proper exception handling
                    json_source = {
                        "permalinks": json["permalinks"],
                        "mid": json["dbid"],
                        "message-id": json["message-id"],
                        "source": archiver.mbox_source(message_raw),
                    }
                except Exception as e:
                    self.printid("Error '%s' processing id %s msg %s " % (e, json["mid"], json["message-id"]))
                    bad += 1
                    continue

                count += 1
                ja.append(json)
                jas.append(json_source)
                if args.verbose and verbose_logger:
                    # TODO optionally show other fields (e.g. From_ line)
                    verbose_logger.info("MID:%(mid)s MSGID:%(message-id)s", json)
                if contents:
                    if not args.dry:
                        for key in contents:
                            es.index(
                                index=es.db_attachment,
                                doc_type="_doc",
                                id=key,
                                body={"source": contents[key]},
                            )
                if len(ja) >= 40 and not args.dry:
                    bulk_insert(self.name, ja, es, "mbox")
                    ja = []
                    bulk_insert(self.name, jas, es, "source")
                    jas = []
            else:
                self.printid("Failed to parse: Return=%s Message-Id=%s" % (message.get("Return-Path"), message.get("Message-Id")))
                bad += 1

        if filebased:
            self.printid("Parsed %u records (failed: %u) from %s" % (count, bad, filename))
            if delete_file:
                os.unlink(tmpname)
        elif imap:
            self.printid("Parsed %u records (failed: %u) from imap" % (count, bad))
        else:
            self.printid("Parsed %s/%s: %u records (failed: %u) from %s" % (ml, mboxfile, count, bad, tmpname))
            os.unlink(tmpname)

        goodies += count
        baddies += bad

    if len(ja) > 0 and not args.dry:
        bulk_insert(self.name, ja, es, "mbox")
        ja = []
    if len(jas) > 0 and not args.dry:
        bulk_insert(self.name, jas, es, "source")
        jas = []
    self.printid("Done, %u elements left to slurp" % len(lists))
#! /usr/bin/env python

import warner
import archiver
import json

if __name__ == "__main__":
    warn_and_archive_warner = warner.Warner()
    warn_and_archive_archiver = archiver.Archiver()
    warn_and_archive_warner.warn()
    warn_and_archive_archiver.archive()
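A possible variant of the entry point above, adding the credential guard used by the destalinate_job examples earlier in this listing; combining the two is an illustrative sketch, not taken from the original script:

#! /usr/bin/env python
import os
import warner
import archiver

if __name__ == "__main__":
    # Guard against missing Slack credentials, as destalinate_job does.
    if "SB_TOKEN" not in os.environ or "API_TOKEN" not in os.environ:
        raise SystemExit("ERR: Missing at least one Slack environment variable.")
    warner.Warner().warn()
    archiver.Archiver().archive()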
from collections import namedtuple

TOOLS = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "tools")
sys.path.append(TOOLS)

import archiver
ARCHIVER = os.path.join(TOOLS, "archiver.py")
import generators

list_override = None  # could affect id
private = False  # does not affect id generation
parseHTML = False  # can this affect id generation?

GENS = generators.generator_names()

archie = archiver.Archiver(parse_html=parseHTML)
fake_args = namedtuple('fakeargs', ['verbose', 'ibody'])(False, None)
for arg in sys.argv[1:]:
    if arg.endswith('.yml') or arg.endswith('.yaml'):
        errors = 0
        with open(arg, 'r') as stream:
            data = yaml.safe_load(stream)
            for test in data['tests']:
                for file in test:
                    print("Testing with %s" % file)
                    mbox = mailbox.mbox(file, None, create=False)
                    scripts = test[file]
                    msgcnt = len(mbox)
                    scrcnt = len(scripts)
                    if msgcnt != scrcnt:
def test_tgz_guess_archive(self):
    self.create_compare(archiver.Archiver('tmp.tgz'), MD5SUM_TGZ)
def test_zip_guess_archive(self):
    self.create_compare(archiver.Archiver('tmp.zip'), MD5SUM_ZIP)
""" # PYTHONPATH is used to give access to archiver.py # PYTHONPATH=../tools python3 generatortest.py generatortest.yaml import mailbox import archiver import sys import yaml from pprint import pprint list_override = None # could affect id private = False # does not affect id generation parseHTML = False # can this affect id generation? archie = archiver.Archiver(parseHTML=parseHTML) for arg in sys.argv[1:]: if arg.endswith('.yml') or arg.endswith('.yaml'): with open(arg, 'r') as stream: data = yaml.safe_load(stream) for test in data['tests']: for file in test: print("Testing with %s" % file) mbox = mailbox.mbox(file, None, create=False) scripts = test[file] msgcnt = len(mbox) scrcnt = len(scripts) if msgcnt != scrcnt: print( "WARN: mbox contains %d messages, but there are %d unit tests"